@livekit/agents 1.0.46 → 1.0.47
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.cjs +14 -20
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +14 -20
- package/dist/cli.js.map +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs +14 -5
- package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/job_proc_lazy_main.js +14 -5
- package/dist/ipc/job_proc_lazy_main.js.map +1 -1
- package/dist/llm/chat_context.cjs +19 -0
- package/dist/llm/chat_context.cjs.map +1 -1
- package/dist/llm/chat_context.d.cts +4 -0
- package/dist/llm/chat_context.d.ts +4 -0
- package/dist/llm/chat_context.d.ts.map +1 -1
- package/dist/llm/chat_context.js +19 -0
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/provider_format/index.cjs +2 -0
- package/dist/llm/provider_format/index.cjs.map +1 -1
- package/dist/llm/provider_format/index.d.cts +1 -1
- package/dist/llm/provider_format/index.d.ts +1 -1
- package/dist/llm/provider_format/index.d.ts.map +1 -1
- package/dist/llm/provider_format/index.js +6 -1
- package/dist/llm/provider_format/index.js.map +1 -1
- package/dist/llm/provider_format/openai.cjs +82 -2
- package/dist/llm/provider_format/openai.cjs.map +1 -1
- package/dist/llm/provider_format/openai.d.cts +1 -0
- package/dist/llm/provider_format/openai.d.ts +1 -0
- package/dist/llm/provider_format/openai.d.ts.map +1 -1
- package/dist/llm/provider_format/openai.js +80 -1
- package/dist/llm/provider_format/openai.js.map +1 -1
- package/dist/llm/provider_format/openai.test.cjs +326 -0
- package/dist/llm/provider_format/openai.test.cjs.map +1 -1
- package/dist/llm/provider_format/openai.test.js +327 -1
- package/dist/llm/provider_format/openai.test.js.map +1 -1
- package/dist/llm/provider_format/utils.cjs +4 -3
- package/dist/llm/provider_format/utils.cjs.map +1 -1
- package/dist/llm/provider_format/utils.d.ts.map +1 -1
- package/dist/llm/provider_format/utils.js +4 -3
- package/dist/llm/provider_format/utils.js.map +1 -1
- package/dist/llm/realtime.cjs.map +1 -1
- package/dist/llm/realtime.d.cts +1 -0
- package/dist/llm/realtime.d.ts +1 -0
- package/dist/llm/realtime.d.ts.map +1 -1
- package/dist/llm/realtime.js.map +1 -1
- package/dist/log.cjs +5 -2
- package/dist/log.cjs.map +1 -1
- package/dist/log.d.ts.map +1 -1
- package/dist/log.js +5 -2
- package/dist/log.js.map +1 -1
- package/dist/stream/deferred_stream.cjs +15 -6
- package/dist/stream/deferred_stream.cjs.map +1 -1
- package/dist/stream/deferred_stream.d.ts.map +1 -1
- package/dist/stream/deferred_stream.js +15 -6
- package/dist/stream/deferred_stream.js.map +1 -1
- package/dist/utils.cjs +31 -2
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +7 -0
- package/dist/utils.d.ts +7 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +31 -2
- package/dist/utils.js.map +1 -1
- package/dist/utils.test.cjs +71 -0
- package/dist/utils.test.cjs.map +1 -1
- package/dist/utils.test.js +71 -0
- package/dist/utils.test.js.map +1 -1
- package/dist/version.cjs +1 -1
- package/dist/version.cjs.map +1 -1
- package/dist/version.d.cts +1 -1
- package/dist/version.d.ts +1 -1
- package/dist/version.d.ts.map +1 -1
- package/dist/version.js +1 -1
- package/dist/version.js.map +1 -1
- package/dist/voice/agent.cjs +144 -12
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.cts +29 -4
- package/dist/voice/agent.d.ts +29 -4
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +140 -11
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent.test.cjs +120 -0
- package/dist/voice/agent.test.cjs.map +1 -1
- package/dist/voice/agent.test.js +122 -2
- package/dist/voice/agent.test.js.map +1 -1
- package/dist/voice/agent_activity.cjs +383 -298
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +34 -7
- package/dist/voice/agent_activity.d.ts +34 -7
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +383 -293
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +140 -40
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +19 -7
- package/dist/voice/agent_session.d.ts +19 -7
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +137 -37
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +4 -0
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +4 -0
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/generation.cjs +39 -19
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +44 -20
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/index.cjs +2 -0
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +1 -1
- package/dist/voice/index.d.ts +1 -1
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js +2 -1
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/speech_handle.cjs +7 -1
- package/dist/voice/speech_handle.cjs.map +1 -1
- package/dist/voice/speech_handle.d.cts +2 -0
- package/dist/voice/speech_handle.d.ts +2 -0
- package/dist/voice/speech_handle.d.ts.map +1 -1
- package/dist/voice/speech_handle.js +8 -2
- package/dist/voice/speech_handle.js.map +1 -1
- package/dist/voice/testing/run_result.cjs +66 -15
- package/dist/voice/testing/run_result.cjs.map +1 -1
- package/dist/voice/testing/run_result.d.cts +14 -3
- package/dist/voice/testing/run_result.d.ts +14 -3
- package/dist/voice/testing/run_result.d.ts.map +1 -1
- package/dist/voice/testing/run_result.js +66 -15
- package/dist/voice/testing/run_result.js.map +1 -1
- package/package.json +1 -1
- package/src/cli.ts +20 -33
- package/src/ipc/job_proc_lazy_main.ts +16 -5
- package/src/llm/chat_context.ts +35 -0
- package/src/llm/provider_format/index.ts +7 -2
- package/src/llm/provider_format/openai.test.ts +385 -1
- package/src/llm/provider_format/openai.ts +103 -0
- package/src/llm/provider_format/utils.ts +6 -4
- package/src/llm/realtime.ts +1 -0
- package/src/log.ts +5 -2
- package/src/stream/deferred_stream.ts +17 -6
- package/src/utils.test.ts +87 -0
- package/src/utils.ts +36 -2
- package/src/version.ts +1 -1
- package/src/voice/agent.test.ts +140 -2
- package/src/voice/agent.ts +189 -10
- package/src/voice/agent_activity.ts +427 -289
- package/src/voice/agent_session.ts +178 -40
- package/src/voice/audio_recognition.ts +4 -0
- package/src/voice/generation.ts +52 -23
- package/src/voice/index.ts +1 -1
- package/src/voice/speech_handle.ts +9 -2
- package/src/voice/testing/run_result.ts +81 -23
|
@@ -10,14 +10,20 @@ import {
|
|
|
10
10
|
} from "../llm/index.js";
|
|
11
11
|
import { isSameToolChoice, isSameToolContext } from "../llm/tool_context.js";
|
|
12
12
|
import { log } from "../log.js";
|
|
13
|
-
import {
|
|
13
|
+
import { MultiInputStream } from "../stream/multi_input_stream.js";
|
|
14
14
|
import { STT } from "../stt/stt.js";
|
|
15
15
|
import { recordRealtimeMetrics, traceTypes, tracer } from "../telemetry/index.js";
|
|
16
16
|
import { splitWords } from "../tokenize/basic/word.js";
|
|
17
17
|
import { TTS } from "../tts/tts.js";
|
|
18
18
|
import { Future, Task, cancelAndWait, waitFor } from "../utils.js";
|
|
19
19
|
import { VAD } from "../vad.js";
|
|
20
|
-
import {
|
|
20
|
+
import {
|
|
21
|
+
StopResponse,
|
|
22
|
+
_getActivityTaskInfo,
|
|
23
|
+
_setActivityTaskInfo,
|
|
24
|
+
functionCallStorage,
|
|
25
|
+
speechHandleStorage
|
|
26
|
+
} from "./agent.js";
|
|
21
27
|
import {} from "./agent_session.js";
|
|
22
28
|
import {
|
|
23
29
|
AudioRecognition
|
|
@@ -41,8 +47,10 @@ import {
|
|
|
41
47
|
} from "./generation.js";
|
|
42
48
|
import { SpeechHandle } from "./speech_handle.js";
|
|
43
49
|
import { setParticipantSpanAttributes } from "./utils.js";
|
|
44
|
-
const
|
|
50
|
+
const agentActivityStorage = new AsyncLocalStorage();
|
|
45
51
|
class AgentActivity {
|
|
52
|
+
agent;
|
|
53
|
+
agentSession;
|
|
46
54
|
static REPLY_TASK_CANCEL_TIMEOUT = 5e3;
|
|
47
55
|
started = false;
|
|
48
56
|
audioRecognition;
|
|
@@ -51,22 +59,29 @@ class AgentActivity {
|
|
|
51
59
|
// Maps response_id to OTEL span for metrics recording
|
|
52
60
|
turnDetectionMode;
|
|
53
61
|
logger = log();
|
|
54
|
-
|
|
62
|
+
_schedulingPaused = true;
|
|
63
|
+
_drainBlockedTasks = [];
|
|
55
64
|
_currentSpeech;
|
|
56
65
|
speechQueue;
|
|
57
66
|
// [priority, timestamp, speechHandle]
|
|
58
67
|
q_updated;
|
|
59
68
|
speechTasks = /* @__PURE__ */ new Set();
|
|
60
69
|
lock = new Mutex();
|
|
61
|
-
audioStream = new
|
|
70
|
+
audioStream = new MultiInputStream();
|
|
71
|
+
audioStreamId;
|
|
62
72
|
// default to null as None, which maps to the default provider tool choice value
|
|
63
73
|
toolChoice = null;
|
|
64
74
|
_preemptiveGeneration;
|
|
65
|
-
agent;
|
|
66
|
-
agentSession;
|
|
67
75
|
/** @internal */
|
|
68
76
|
_mainTask;
|
|
77
|
+
_onEnterTask;
|
|
78
|
+
_onExitTask;
|
|
69
79
|
_userTurnCompletedTask;
|
|
80
|
+
onRealtimeGenerationCreated = (ev) => this.onGenerationCreated(ev);
|
|
81
|
+
onRealtimeInputSpeechStarted = (ev) => this.onInputSpeechStarted(ev);
|
|
82
|
+
onRealtimeInputSpeechStopped = (ev) => this.onInputSpeechStopped(ev);
|
|
83
|
+
onRealtimeInputAudioTranscriptionCompleted = (ev) => this.onInputAudioTranscriptionCompleted(ev);
|
|
84
|
+
onModelError = (ev) => this.onError(ev);
|
|
70
85
|
constructor(agent, agentSession) {
|
|
71
86
|
this.agent = agent;
|
|
72
87
|
this.agentSession = agentSession;
|
|
@@ -77,7 +92,7 @@ class AgentActivity {
|
|
|
77
92
|
this.turnDetectionMode = typeof this.turnDetection === "string" ? this.turnDetection : void 0;
|
|
78
93
|
if (this.turnDetectionMode === "vad" && this.vad === void 0) {
|
|
79
94
|
this.logger.warn(
|
|
80
|
-
'turnDetection is set to "vad", but no VAD model is provided, ignoring the
|
|
95
|
+
'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDetection setting'
|
|
81
96
|
);
|
|
82
97
|
this.turnDetectionMode = void 0;
|
|
83
98
|
}
|
|
@@ -127,107 +142,121 @@ class AgentActivity {
|
|
|
127
142
|
}
|
|
128
143
|
}
|
|
129
144
|
async start() {
|
|
130
|
-
var _a;
|
|
131
145
|
const unlock = await this.lock.lock();
|
|
132
146
|
try {
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
} else if (this.llm instanceof LLM) {
|
|
173
|
-
try {
|
|
174
|
-
updateInstructions({
|
|
175
|
-
chatCtx: this.agent._chatCtx,
|
|
176
|
-
instructions: this.agent.instructions,
|
|
177
|
-
addIfMissing: true
|
|
178
|
-
});
|
|
179
|
-
} catch (error) {
|
|
180
|
-
this.logger.error("failed to update the instructions", error);
|
|
181
|
-
}
|
|
147
|
+
await this._startSession({ spanName: "start_agent_activity", runOnEnter: true });
|
|
148
|
+
} finally {
|
|
149
|
+
unlock();
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
async resume() {
|
|
153
|
+
const unlock = await this.lock.lock();
|
|
154
|
+
try {
|
|
155
|
+
await this._startSession({ spanName: "resume_agent_activity", runOnEnter: false });
|
|
156
|
+
} finally {
|
|
157
|
+
unlock();
|
|
158
|
+
}
|
|
159
|
+
}
|
|
160
|
+
async _startSession(options) {
|
|
161
|
+
var _a;
|
|
162
|
+
const { spanName, runOnEnter } = options;
|
|
163
|
+
const startSpan = tracer.startSpan({
|
|
164
|
+
name: spanName,
|
|
165
|
+
attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
|
|
166
|
+
context: ROOT_CONTEXT
|
|
167
|
+
});
|
|
168
|
+
this.agent._agentActivity = this;
|
|
169
|
+
if (this.llm instanceof RealtimeModel) {
|
|
170
|
+
this.realtimeSession = this.llm.session();
|
|
171
|
+
this.realtimeSpans = /* @__PURE__ */ new Map();
|
|
172
|
+
this.realtimeSession.on("generation_created", this.onRealtimeGenerationCreated);
|
|
173
|
+
this.realtimeSession.on("input_speech_started", this.onRealtimeInputSpeechStarted);
|
|
174
|
+
this.realtimeSession.on("input_speech_stopped", this.onRealtimeInputSpeechStopped);
|
|
175
|
+
this.realtimeSession.on(
|
|
176
|
+
"input_audio_transcription_completed",
|
|
177
|
+
this.onRealtimeInputAudioTranscriptionCompleted
|
|
178
|
+
);
|
|
179
|
+
this.realtimeSession.on("metrics_collected", this.onMetricsCollected);
|
|
180
|
+
this.realtimeSession.on("error", this.onModelError);
|
|
181
|
+
removeInstructions(this.agent._chatCtx);
|
|
182
|
+
try {
|
|
183
|
+
await this.realtimeSession.updateInstructions(this.agent.instructions);
|
|
184
|
+
} catch (error) {
|
|
185
|
+
this.logger.error(error, "failed to update the instructions");
|
|
182
186
|
}
|
|
183
|
-
|
|
184
|
-
this.
|
|
185
|
-
|
|
187
|
+
try {
|
|
188
|
+
await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
|
|
189
|
+
} catch (error) {
|
|
190
|
+
this.logger.error(error, "failed to update the chat context");
|
|
186
191
|
}
|
|
187
|
-
|
|
188
|
-
this.
|
|
189
|
-
|
|
192
|
+
try {
|
|
193
|
+
await this.realtimeSession.updateTools(this.tools);
|
|
194
|
+
} catch (error) {
|
|
195
|
+
this.logger.error(error, "failed to update the tools");
|
|
190
196
|
}
|
|
191
|
-
if (this.tts
|
|
192
|
-
this.
|
|
193
|
-
|
|
197
|
+
if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
|
|
198
|
+
this.logger.error(
|
|
199
|
+
"audio output is enabled but RealtimeModel has no audio modality and no TTS is set. Either enable audio modality in the RealtimeModel or set a TTS model."
|
|
200
|
+
);
|
|
194
201
|
}
|
|
195
|
-
|
|
196
|
-
|
|
202
|
+
} else if (this.llm instanceof LLM) {
|
|
203
|
+
try {
|
|
204
|
+
updateInstructions({
|
|
205
|
+
chatCtx: this.agent._chatCtx,
|
|
206
|
+
instructions: this.agent.instructions,
|
|
207
|
+
addIfMissing: true
|
|
208
|
+
});
|
|
209
|
+
} catch (error) {
|
|
210
|
+
this.logger.error("failed to update the instructions", error);
|
|
197
211
|
}
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
this
|
|
216
|
-
|
|
217
|
-
this.
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
this.
|
|
224
|
-
|
|
212
|
+
}
|
|
213
|
+
if (this.llm instanceof LLM) {
|
|
214
|
+
this.llm.on("metrics_collected", this.onMetricsCollected);
|
|
215
|
+
this.llm.on("error", this.onModelError);
|
|
216
|
+
}
|
|
217
|
+
if (this.stt instanceof STT) {
|
|
218
|
+
this.stt.on("metrics_collected", this.onMetricsCollected);
|
|
219
|
+
this.stt.on("error", this.onModelError);
|
|
220
|
+
}
|
|
221
|
+
if (this.tts instanceof TTS) {
|
|
222
|
+
this.tts.on("metrics_collected", this.onMetricsCollected);
|
|
223
|
+
this.tts.on("error", this.onModelError);
|
|
224
|
+
}
|
|
225
|
+
if (this.vad instanceof VAD) {
|
|
226
|
+
this.vad.on("metrics_collected", this.onMetricsCollected);
|
|
227
|
+
}
|
|
228
|
+
this.audioRecognition = new AudioRecognition({
|
|
229
|
+
recognitionHooks: this,
|
|
230
|
+
// Disable stt node if stt is not provided
|
|
231
|
+
stt: this.stt ? (...args) => this.agent.sttNode(...args) : void 0,
|
|
232
|
+
vad: this.vad,
|
|
233
|
+
turnDetector: typeof this.turnDetection === "string" ? void 0 : this.turnDetection,
|
|
234
|
+
turnDetectionMode: this.turnDetectionMode,
|
|
235
|
+
minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
|
|
236
|
+
maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
|
|
237
|
+
rootSpanContext: this.agentSession.rootSpanContext,
|
|
238
|
+
sttModel: (_a = this.stt) == null ? void 0 : _a.label,
|
|
239
|
+
sttProvider: this.getSttProvider(),
|
|
240
|
+
getLinkedParticipant: () => {
|
|
241
|
+
var _a2;
|
|
242
|
+
return (_a2 = this.agentSession._roomIO) == null ? void 0 : _a2.linkedParticipant;
|
|
243
|
+
}
|
|
244
|
+
});
|
|
245
|
+
this.audioRecognition.start();
|
|
246
|
+
this.started = true;
|
|
247
|
+
this._resumeSchedulingTask();
|
|
248
|
+
if (runOnEnter) {
|
|
249
|
+
this._onEnterTask = this.createSpeechTask({
|
|
250
|
+
taskFn: () => tracer.startActiveSpan(async () => this.agent.onEnter(), {
|
|
251
|
+
name: "on_enter",
|
|
252
|
+
context: trace.setSpan(ROOT_CONTEXT, startSpan),
|
|
253
|
+
attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
|
|
254
|
+
}),
|
|
255
|
+
inlineTask: true,
|
|
225
256
|
name: "AgentActivity_onEnter"
|
|
226
257
|
});
|
|
227
|
-
startSpan.end();
|
|
228
|
-
} finally {
|
|
229
|
-
unlock();
|
|
230
258
|
}
|
|
259
|
+
startSpan.end();
|
|
231
260
|
}
|
|
232
261
|
get currentSpeech() {
|
|
233
262
|
return this._currentSpeech;
|
|
@@ -256,8 +285,8 @@ class AgentActivity {
|
|
|
256
285
|
get tools() {
|
|
257
286
|
return this.agent.toolCtx;
|
|
258
287
|
}
|
|
259
|
-
get
|
|
260
|
-
return this.
|
|
288
|
+
get schedulingPaused() {
|
|
289
|
+
return this._schedulingPaused;
|
|
261
290
|
}
|
|
262
291
|
get realtimeLLMSession() {
|
|
263
292
|
return this.realtimeSession;
|
|
@@ -297,11 +326,9 @@ class AgentActivity {
|
|
|
297
326
|
}
|
|
298
327
|
}
|
|
299
328
|
attachAudioInput(audioStream) {
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
}
|
|
304
|
-
this.audioStream.setSource(audioStream);
|
|
329
|
+
void this.audioStream.close();
|
|
330
|
+
this.audioStream = new MultiInputStream();
|
|
331
|
+
this.audioStreamId = this.audioStream.addInputStream(audioStream);
|
|
305
332
|
const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
|
|
306
333
|
if (this.realtimeSession) {
|
|
307
334
|
this.realtimeSession.setInputAudioStream(realtimeAudioStream);
|
|
@@ -311,13 +338,21 @@ class AgentActivity {
|
|
|
311
338
|
}
|
|
312
339
|
}
|
|
313
340
|
detachAudioInput() {
|
|
314
|
-
this.
|
|
341
|
+
if (this.audioStreamId === void 0) {
|
|
342
|
+
return;
|
|
343
|
+
}
|
|
344
|
+
void this.audioStream.close();
|
|
345
|
+
this.audioStream = new MultiInputStream();
|
|
346
|
+
this.audioStreamId = void 0;
|
|
315
347
|
}
|
|
316
|
-
commitUserTurn() {
|
|
348
|
+
commitUserTurn(options = {}) {
|
|
349
|
+
const { audioDetached = false, throwIfNotReady = true } = options;
|
|
317
350
|
if (!this.audioRecognition) {
|
|
318
|
-
|
|
351
|
+
if (throwIfNotReady) {
|
|
352
|
+
throw new Error("AudioRecognition is not initialized");
|
|
353
|
+
}
|
|
354
|
+
return;
|
|
319
355
|
}
|
|
320
|
-
const audioDetached = false;
|
|
321
356
|
this.audioRecognition.commitUserTurn(audioDetached);
|
|
322
357
|
}
|
|
323
358
|
clearUserTurn() {
|
|
@@ -353,13 +388,11 @@ class AgentActivity {
|
|
|
353
388
|
})
|
|
354
389
|
);
|
|
355
390
|
const task = this.createSpeechTask({
|
|
356
|
-
|
|
357
|
-
(abortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio)
|
|
358
|
-
),
|
|
391
|
+
taskFn: (abortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio),
|
|
359
392
|
ownedSpeechHandle: handle,
|
|
360
393
|
name: "AgentActivity.say_tts"
|
|
361
394
|
});
|
|
362
|
-
task.finally(() => this.onPipelineReplyDone());
|
|
395
|
+
task.result.finally(() => this.onPipelineReplyDone());
|
|
363
396
|
this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
364
397
|
return handle;
|
|
365
398
|
}
|
|
@@ -449,8 +482,8 @@ class AgentActivity {
|
|
|
449
482
|
if (ev.userInitiated) {
|
|
450
483
|
return;
|
|
451
484
|
}
|
|
452
|
-
if (this.
|
|
453
|
-
this.logger.warn("skipping new realtime generation, the
|
|
485
|
+
if (this.schedulingPaused) {
|
|
486
|
+
this.logger.warn("skipping new realtime generation, the speech scheduling is not running");
|
|
454
487
|
return;
|
|
455
488
|
}
|
|
456
489
|
const handle = SpeechHandle.create({
|
|
@@ -466,9 +499,7 @@ class AgentActivity {
|
|
|
466
499
|
);
|
|
467
500
|
this.logger.info({ speech_id: handle.id }, "Creating speech handle");
|
|
468
501
|
this.createSpeechTask({
|
|
469
|
-
|
|
470
|
-
(abortController) => this.realtimeGenerationTask(handle, ev, {}, abortController)
|
|
471
|
-
),
|
|
502
|
+
taskFn: (abortController) => this.realtimeGenerationTask(handle, ev, {}, abortController),
|
|
472
503
|
ownedSpeechHandle: handle,
|
|
473
504
|
name: "AgentActivity.realtimeGeneration"
|
|
474
505
|
});
|
|
@@ -555,7 +586,7 @@ class AgentActivity {
|
|
|
555
586
|
}
|
|
556
587
|
}
|
|
557
588
|
onPreemptiveGeneration(info) {
|
|
558
|
-
if (!this.agentSession.options.preemptiveGeneration || this.
|
|
589
|
+
if (!this.agentSession.options.preemptiveGeneration || this.schedulingPaused || this._currentSpeech !== void 0 && !this._currentSpeech.interrupted || !(this.llm instanceof LLM)) {
|
|
559
590
|
return;
|
|
560
591
|
}
|
|
561
592
|
this.cancelPreemptiveGeneration();
|
|
@@ -593,7 +624,21 @@ class AgentActivity {
|
|
|
593
624
|
}
|
|
594
625
|
}
|
|
595
626
|
createSpeechTask(options) {
|
|
596
|
-
const {
|
|
627
|
+
const { taskFn, controller, ownedSpeechHandle, inlineTask, name } = options;
|
|
628
|
+
const wrappedFn = (ctrl) => {
|
|
629
|
+
return agentActivityStorage.run(this, () => {
|
|
630
|
+
const currentTask = Task.current();
|
|
631
|
+
if (currentTask) {
|
|
632
|
+
_setActivityTaskInfo(currentTask, { speechHandle: ownedSpeechHandle, inlineTask });
|
|
633
|
+
}
|
|
634
|
+
if (ownedSpeechHandle) {
|
|
635
|
+
return speechHandleStorage.run(ownedSpeechHandle, () => taskFn(ctrl));
|
|
636
|
+
}
|
|
637
|
+
return taskFn(ctrl);
|
|
638
|
+
});
|
|
639
|
+
};
|
|
640
|
+
const task = Task.from(wrappedFn, controller, name);
|
|
641
|
+
_setActivityTaskInfo(task, { speechHandle: ownedSpeechHandle, inlineTask });
|
|
597
642
|
this.speechTasks.add(task);
|
|
598
643
|
task.addDoneCallback(() => {
|
|
599
644
|
this.speechTasks.delete(task);
|
|
@@ -609,12 +654,15 @@ class AgentActivity {
|
|
|
609
654
|
task.addDoneCallback(() => {
|
|
610
655
|
this.wakeupMainTask();
|
|
611
656
|
});
|
|
612
|
-
return task
|
|
657
|
+
return task;
|
|
613
658
|
}
|
|
614
659
|
async onEndOfTurn(info) {
|
|
615
|
-
if (this.
|
|
660
|
+
if (this.schedulingPaused) {
|
|
616
661
|
this.cancelPreemptiveGeneration();
|
|
617
|
-
this.logger.warn(
|
|
662
|
+
this.logger.warn(
|
|
663
|
+
{ user_input: info.newTranscript },
|
|
664
|
+
"skipping user input, speech scheduling is paused"
|
|
665
|
+
);
|
|
618
666
|
return true;
|
|
619
667
|
}
|
|
620
668
|
if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && this.agentSession.options.minInterruptionWords > 0) {
|
|
@@ -633,7 +681,7 @@ class AgentActivity {
|
|
|
633
681
|
}
|
|
634
682
|
const oldTask = this._userTurnCompletedTask;
|
|
635
683
|
this._userTurnCompletedTask = this.createSpeechTask({
|
|
636
|
-
|
|
684
|
+
taskFn: () => this.userTurnCompleted(info, oldTask),
|
|
637
685
|
name: "AgentActivity.userTurnCompleted"
|
|
638
686
|
});
|
|
639
687
|
return true;
|
|
@@ -663,14 +711,41 @@ class AgentActivity {
|
|
|
663
711
|
await speechHandle._waitForGeneration();
|
|
664
712
|
this._currentSpeech = void 0;
|
|
665
713
|
}
|
|
666
|
-
|
|
667
|
-
|
|
714
|
+
const toWait = this.getDrainPendingSpeechTasks();
|
|
715
|
+
if (this._schedulingPaused && toWait.length === 0) {
|
|
716
|
+
this.logger.info("mainTask: scheduling paused and no more speech tasks to wait");
|
|
668
717
|
break;
|
|
669
718
|
}
|
|
670
719
|
this.q_updated = new Future();
|
|
671
720
|
}
|
|
672
721
|
this.logger.info("AgentActivity mainTask: exiting");
|
|
673
722
|
}
|
|
723
|
+
getDrainPendingSpeechTasks() {
|
|
724
|
+
const blockedHandles = [];
|
|
725
|
+
for (const task of this._drainBlockedTasks) {
|
|
726
|
+
const info = _getActivityTaskInfo(task);
|
|
727
|
+
if (!info) {
|
|
728
|
+
this.logger.error("blocked task without activity info; skipping.");
|
|
729
|
+
continue;
|
|
730
|
+
}
|
|
731
|
+
if (!info.speechHandle) {
|
|
732
|
+
continue;
|
|
733
|
+
}
|
|
734
|
+
blockedHandles.push(info.speechHandle);
|
|
735
|
+
}
|
|
736
|
+
const toWait = [];
|
|
737
|
+
for (const task of this.speechTasks) {
|
|
738
|
+
if (this._drainBlockedTasks.includes(task)) {
|
|
739
|
+
continue;
|
|
740
|
+
}
|
|
741
|
+
const info = _getActivityTaskInfo(task);
|
|
742
|
+
if (info && info.speechHandle && blockedHandles.includes(info.speechHandle)) {
|
|
743
|
+
continue;
|
|
744
|
+
}
|
|
745
|
+
toWait.push(task);
|
|
746
|
+
}
|
|
747
|
+
return toWait;
|
|
748
|
+
}
|
|
674
749
|
wakeupMainTask() {
|
|
675
750
|
this.q_updated.resolve();
|
|
676
751
|
}
|
|
@@ -696,7 +771,7 @@ class AgentActivity {
|
|
|
696
771
|
if (this.llm === void 0) {
|
|
697
772
|
throw new Error("trying to generate reply without an LLM model");
|
|
698
773
|
}
|
|
699
|
-
const functionCall = (_a =
|
|
774
|
+
const functionCall = (_a = functionCallStorage.getStore()) == null ? void 0 : _a.functionCall;
|
|
700
775
|
if (toolChoice === void 0 && functionCall !== void 0) {
|
|
701
776
|
toolChoice = "none";
|
|
702
777
|
}
|
|
@@ -714,19 +789,17 @@ class AgentActivity {
|
|
|
714
789
|
this.logger.info({ speech_id: handle.id }, "Creating speech handle");
|
|
715
790
|
if (this.llm instanceof RealtimeModel) {
|
|
716
791
|
this.createSpeechTask({
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
})
|
|
729
|
-
),
|
|
792
|
+
taskFn: (abortController) => this.realtimeReplyTask({
|
|
793
|
+
speechHandle: handle,
|
|
794
|
+
// TODO(brian): support llm.ChatMessage for the realtime model
|
|
795
|
+
userInput: userMessage == null ? void 0 : userMessage.textContent,
|
|
796
|
+
instructions,
|
|
797
|
+
modelSettings: {
|
|
798
|
+
// isGiven(toolChoice) = toolChoice !== undefined
|
|
799
|
+
toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
|
|
800
|
+
},
|
|
801
|
+
abortController
|
|
802
|
+
}),
|
|
730
803
|
ownedSpeechHandle: handle,
|
|
731
804
|
name: "AgentActivity.realtimeReply"
|
|
732
805
|
});
|
|
@@ -736,36 +809,36 @@ class AgentActivity {
|
|
|
736
809
|
${instructions}`;
|
|
737
810
|
}
|
|
738
811
|
const task = this.createSpeechTask({
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
userMessage
|
|
750
|
-
)
|
|
812
|
+
taskFn: (abortController) => this.pipelineReplyTask(
|
|
813
|
+
handle,
|
|
814
|
+
chatCtx ?? this.agent.chatCtx,
|
|
815
|
+
this.agent.toolCtx,
|
|
816
|
+
{
|
|
817
|
+
toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
|
|
818
|
+
},
|
|
819
|
+
abortController,
|
|
820
|
+
instructions,
|
|
821
|
+
userMessage
|
|
751
822
|
),
|
|
752
823
|
ownedSpeechHandle: handle,
|
|
753
824
|
name: "AgentActivity.pipelineReply"
|
|
754
825
|
});
|
|
755
|
-
task.finally(() => this.onPipelineReplyDone());
|
|
826
|
+
task.result.finally(() => this.onPipelineReplyDone());
|
|
756
827
|
}
|
|
757
828
|
if (scheduleSpeech) {
|
|
758
829
|
this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
759
830
|
}
|
|
760
831
|
return handle;
|
|
761
832
|
}
|
|
762
|
-
interrupt() {
|
|
833
|
+
interrupt(options = {}) {
|
|
763
834
|
var _a;
|
|
835
|
+
const { force = false } = options;
|
|
836
|
+
this.cancelPreemptiveGeneration();
|
|
764
837
|
const future = new Future();
|
|
765
838
|
const currentSpeech = this._currentSpeech;
|
|
766
|
-
currentSpeech == null ? void 0 : currentSpeech.interrupt();
|
|
839
|
+
currentSpeech == null ? void 0 : currentSpeech.interrupt(force);
|
|
767
840
|
for (const [_, __, speech] of this.speechQueue) {
|
|
768
|
-
speech.interrupt();
|
|
841
|
+
speech.interrupt(force);
|
|
769
842
|
}
|
|
770
843
|
(_a = this.realtimeSession) == null ? void 0 : _a.interrupt();
|
|
771
844
|
if (currentSpeech === void 0) {
|
|
@@ -786,7 +859,7 @@ ${instructions}`;
|
|
|
786
859
|
async userTurnCompleted(info, oldTask) {
|
|
787
860
|
var _a, _b;
|
|
788
861
|
if (oldTask) {
|
|
789
|
-
await oldTask;
|
|
862
|
+
await oldTask.result;
|
|
790
863
|
}
|
|
791
864
|
if (this.llm instanceof RealtimeModel) {
|
|
792
865
|
if (this.llm.capabilities.turnDetection) {
|
|
@@ -973,7 +1046,7 @@ ${instructions}`;
|
|
|
973
1046
|
toolsMessages,
|
|
974
1047
|
span
|
|
975
1048
|
}) => {
|
|
976
|
-
var _a, _b
|
|
1049
|
+
var _a, _b;
|
|
977
1050
|
speechHandle._agentTurnContext = otelContext.active();
|
|
978
1051
|
span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
|
|
979
1052
|
if (instructions) {
|
|
@@ -1119,11 +1192,11 @@ ${instructions}`;
|
|
|
1119
1192
|
for (const msg of toolsMessages) {
|
|
1120
1193
|
msg.createdAt = replyStartedAt;
|
|
1121
1194
|
}
|
|
1122
|
-
this.agent._chatCtx.insert(toolsMessages);
|
|
1123
1195
|
const toolCallOutputs = toolsMessages.filter(
|
|
1124
1196
|
(m) => m.type === "function_call_output"
|
|
1125
1197
|
);
|
|
1126
1198
|
if (toolCallOutputs.length > 0) {
|
|
1199
|
+
this.agent._chatCtx.insert(toolCallOutputs);
|
|
1127
1200
|
this.agentSession._toolItemsAdded(toolCallOutputs);
|
|
1128
1201
|
}
|
|
1129
1202
|
}
|
|
@@ -1211,45 +1284,15 @@ ${instructions}`;
|
|
|
1211
1284
|
);
|
|
1212
1285
|
return;
|
|
1213
1286
|
}
|
|
1214
|
-
const functionToolsExecutedEvent =
|
|
1215
|
-
functionCalls: [],
|
|
1216
|
-
functionCallOutputs: []
|
|
1217
|
-
});
|
|
1218
|
-
let shouldGenerateToolReply = false;
|
|
1219
|
-
let newAgentTask = null;
|
|
1220
|
-
let ignoreTaskSwitch = false;
|
|
1221
|
-
for (const sanitizedOut of toolOutput.output) {
|
|
1222
|
-
if (sanitizedOut.toolCallOutput !== void 0) {
|
|
1223
|
-
functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
|
|
1224
|
-
functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
|
|
1225
|
-
if (sanitizedOut.replyRequired) {
|
|
1226
|
-
shouldGenerateToolReply = true;
|
|
1227
|
-
}
|
|
1228
|
-
}
|
|
1229
|
-
if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
|
|
1230
|
-
this.logger.error("expected to receive only one agent task from the tool executions");
|
|
1231
|
-
ignoreTaskSwitch = true;
|
|
1232
|
-
}
|
|
1233
|
-
newAgentTask = sanitizedOut.agentTask ?? null;
|
|
1234
|
-
this.logger.debug(
|
|
1235
|
-
{
|
|
1236
|
-
speechId: speechHandle.id,
|
|
1237
|
-
name: (_c = sanitizedOut.toolCall) == null ? void 0 : _c.name,
|
|
1238
|
-
args: sanitizedOut.toolCall.args,
|
|
1239
|
-
output: (_d = sanitizedOut.toolCallOutput) == null ? void 0 : _d.output,
|
|
1240
|
-
isError: (_e = sanitizedOut.toolCallOutput) == null ? void 0 : _e.isError
|
|
1241
|
-
},
|
|
1242
|
-
"Tool call execution finished"
|
|
1243
|
-
);
|
|
1244
|
-
}
|
|
1287
|
+
const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } = this.summarizeToolExecutionOutput(toolOutput, speechHandle);
|
|
1245
1288
|
this.agentSession.emit(
|
|
1246
1289
|
AgentSessionEventTypes.FunctionToolsExecuted,
|
|
1247
1290
|
functionToolsExecutedEvent
|
|
1248
1291
|
);
|
|
1249
|
-
let
|
|
1292
|
+
let schedulingPaused = this.schedulingPaused;
|
|
1250
1293
|
if (!ignoreTaskSwitch && newAgentTask !== null) {
|
|
1251
1294
|
this.agentSession.updateAgent(newAgentTask);
|
|
1252
|
-
|
|
1295
|
+
schedulingPaused = true;
|
|
1253
1296
|
}
|
|
1254
1297
|
const toolMessages = [
|
|
1255
1298
|
...functionToolsExecutedEvent.functionCalls,
|
|
@@ -1258,34 +1301,32 @@ ${instructions}`;
|
|
|
1258
1301
|
if (shouldGenerateToolReply) {
|
|
1259
1302
|
chatCtx.insert(toolMessages);
|
|
1260
1303
|
speechHandle._numSteps += 1;
|
|
1261
|
-
const respondToolChoice =
|
|
1304
|
+
const respondToolChoice = schedulingPaused || modelSettings.toolChoice === "none" ? "none" : "auto";
|
|
1262
1305
|
const toolResponseTask = this.createSpeechTask({
|
|
1263
|
-
|
|
1264
|
-
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
toolMessages
|
|
1273
|
-
)
|
|
1306
|
+
taskFn: () => this.pipelineReplyTask(
|
|
1307
|
+
speechHandle,
|
|
1308
|
+
chatCtx,
|
|
1309
|
+
toolCtx,
|
|
1310
|
+
{ toolChoice: respondToolChoice },
|
|
1311
|
+
replyAbortController,
|
|
1312
|
+
instructions,
|
|
1313
|
+
void 0,
|
|
1314
|
+
toolMessages
|
|
1274
1315
|
),
|
|
1275
1316
|
ownedSpeechHandle: speechHandle,
|
|
1276
1317
|
name: "AgentActivity.pipelineReply"
|
|
1277
1318
|
});
|
|
1278
|
-
toolResponseTask.finally(() => this.onPipelineReplyDone());
|
|
1319
|
+
toolResponseTask.result.finally(() => this.onPipelineReplyDone());
|
|
1279
1320
|
this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
|
|
1280
1321
|
} else if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
|
|
1281
1322
|
for (const msg of toolMessages) {
|
|
1282
1323
|
msg.createdAt = replyStartedAt;
|
|
1283
1324
|
}
|
|
1284
|
-
this.agent._chatCtx.insert(toolMessages);
|
|
1285
1325
|
const toolCallOutputs = toolMessages.filter(
|
|
1286
1326
|
(m) => m.type === "function_call_output"
|
|
1287
1327
|
);
|
|
1288
1328
|
if (toolCallOutputs.length > 0) {
|
|
1329
|
+
this.agent._chatCtx.insert(toolCallOutputs);
|
|
1289
1330
|
this.agentSession._toolItemsAdded(toolCallOutputs);
|
|
1290
1331
|
}
|
|
1291
1332
|
}
|
|
@@ -1329,7 +1370,7 @@ ${instructions}`;
|
|
|
1329
1370
|
replyAbortController,
|
|
1330
1371
|
span
|
|
1331
1372
|
}) {
|
|
1332
|
-
var _a
|
|
1373
|
+
var _a;
|
|
1333
1374
|
speechHandle._agentTurnContext = otelContext.active();
|
|
1334
1375
|
span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
|
|
1335
1376
|
const localParticipant = (_a = this.agentSession._roomIO) == null ? void 0 : _a.localParticipant;
|
|
@@ -1589,44 +1630,15 @@ ${instructions}`;
|
|
|
1589
1630
|
);
|
|
1590
1631
|
return;
|
|
1591
1632
|
}
|
|
1592
|
-
const functionToolsExecutedEvent =
|
|
1593
|
-
functionCalls: [],
|
|
1594
|
-
functionCallOutputs: []
|
|
1595
|
-
});
|
|
1596
|
-
let shouldGenerateToolReply = false;
|
|
1597
|
-
let newAgentTask = null;
|
|
1598
|
-
let ignoreTaskSwitch = false;
|
|
1599
|
-
for (const sanitizedOut of toolOutput.output) {
|
|
1600
|
-
if (sanitizedOut.toolCallOutput !== void 0) {
|
|
1601
|
-
functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
|
|
1602
|
-
if (sanitizedOut.replyRequired) {
|
|
1603
|
-
shouldGenerateToolReply = true;
|
|
1604
|
-
}
|
|
1605
|
-
}
|
|
1606
|
-
if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
|
|
1607
|
-
this.logger.error("expected to receive only one agent task from the tool executions");
|
|
1608
|
-
ignoreTaskSwitch = true;
|
|
1609
|
-
}
|
|
1610
|
-
newAgentTask = sanitizedOut.agentTask ?? null;
|
|
1611
|
-
this.logger.debug(
|
|
1612
|
-
{
|
|
1613
|
-
speechId: speechHandle.id,
|
|
1614
|
-
name: (_b = sanitizedOut.toolCall) == null ? void 0 : _b.name,
|
|
1615
|
-
args: sanitizedOut.toolCall.args,
|
|
1616
|
-
output: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.output,
|
|
1617
|
-
isError: (_d = sanitizedOut.toolCallOutput) == null ? void 0 : _d.isError
|
|
1618
|
-
},
|
|
1619
|
-
"Tool call execution finished"
|
|
1620
|
-
);
|
|
1621
|
-
}
|
|
1633
|
+
const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } = this.summarizeToolExecutionOutput(toolOutput, speechHandle);
|
|
1622
1634
|
this.agentSession.emit(
|
|
1623
1635
|
AgentSessionEventTypes.FunctionToolsExecuted,
|
|
1624
1636
|
functionToolsExecutedEvent
|
|
1625
1637
|
);
|
|
1626
|
-
let
|
|
1638
|
+
let schedulingPaused = this.schedulingPaused;
|
|
1627
1639
|
if (!ignoreTaskSwitch && newAgentTask !== null) {
|
|
1628
1640
|
this.agentSession.updateAgent(newAgentTask);
|
|
1629
|
-
|
|
1641
|
+
schedulingPaused = true;
|
|
1630
1642
|
}
|
|
1631
1643
|
if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
|
|
1632
1644
|
while (this.currentSpeech || this.speechQueue.size() > 0) {
|
|
@@ -1667,20 +1679,58 @@ ${instructions}`;
|
|
|
1667
1679
|
speechHandle: replySpeechHandle
|
|
1668
1680
|
})
|
|
1669
1681
|
);
|
|
1670
|
-
const toolChoice =
|
|
1682
|
+
const toolChoice = schedulingPaused || modelSettings.toolChoice === "none" ? "none" : "auto";
|
|
1671
1683
|
this.createSpeechTask({
|
|
1672
|
-
|
|
1673
|
-
|
|
1674
|
-
|
|
1675
|
-
|
|
1676
|
-
|
|
1677
|
-
})
|
|
1678
|
-
),
|
|
1684
|
+
taskFn: (abortController) => this.realtimeReplyTask({
|
|
1685
|
+
speechHandle: replySpeechHandle,
|
|
1686
|
+
modelSettings: { toolChoice },
|
|
1687
|
+
abortController
|
|
1688
|
+
}),
|
|
1679
1689
|
ownedSpeechHandle: replySpeechHandle,
|
|
1680
1690
|
name: "AgentActivity.realtime_reply"
|
|
1681
1691
|
});
|
|
1682
1692
|
this.scheduleSpeech(replySpeechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
|
|
1683
1693
|
}
|
|
1694
|
+
summarizeToolExecutionOutput(toolOutput, speechHandle) {
|
|
1695
|
+
var _a, _b, _c;
|
|
1696
|
+
const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
|
|
1697
|
+
functionCalls: [],
|
|
1698
|
+
functionCallOutputs: []
|
|
1699
|
+
});
|
|
1700
|
+
let shouldGenerateToolReply = false;
|
|
1701
|
+
let newAgentTask = null;
|
|
1702
|
+
let ignoreTaskSwitch = false;
|
|
1703
|
+
for (const sanitizedOut of toolOutput.output) {
|
|
1704
|
+
if (sanitizedOut.toolCallOutput !== void 0) {
|
|
1705
|
+
functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
|
|
1706
|
+
functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
|
|
1707
|
+
if (sanitizedOut.replyRequired) {
|
|
1708
|
+
shouldGenerateToolReply = true;
|
|
1709
|
+
}
|
|
1710
|
+
}
|
|
1711
|
+
if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
|
|
1712
|
+
this.logger.error("expected to receive only one agent task from the tool executions");
|
|
1713
|
+
ignoreTaskSwitch = true;
|
|
1714
|
+
}
|
|
1715
|
+
newAgentTask = sanitizedOut.agentTask ?? null;
|
|
1716
|
+
this.logger.debug(
|
|
1717
|
+
{
|
|
1718
|
+
speechId: speechHandle.id,
|
|
1719
|
+
name: (_a = sanitizedOut.toolCall) == null ? void 0 : _a.name,
|
|
1720
|
+
args: sanitizedOut.toolCall.args,
|
|
1721
|
+
output: (_b = sanitizedOut.toolCallOutput) == null ? void 0 : _b.output,
|
|
1722
|
+
isError: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.isError
|
|
1723
|
+
},
|
|
1724
|
+
"Tool call execution finished"
|
|
1725
|
+
);
|
|
1726
|
+
}
|
|
1727
|
+
return {
|
|
1728
|
+
functionToolsExecutedEvent,
|
|
1729
|
+
shouldGenerateToolReply,
|
|
1730
|
+
newAgentTask,
|
|
1731
|
+
ignoreTaskSwitch
|
|
1732
|
+
};
|
|
1733
|
+
}
|
|
1684
1734
|
async realtimeReplyTask({
|
|
1685
1735
|
speechHandle,
|
|
1686
1736
|
modelSettings: { toolChoice },
|
|
@@ -1722,13 +1772,45 @@ ${instructions}`;
|
|
|
1722
1772
|
}
|
|
1723
1773
|
}
|
|
1724
1774
|
scheduleSpeech(speechHandle, priority, force = false) {
|
|
1725
|
-
if (this.
|
|
1726
|
-
throw new Error("cannot schedule new speech, the
|
|
1775
|
+
if (this.schedulingPaused && !force) {
|
|
1776
|
+
throw new Error("cannot schedule new speech, the speech scheduling is draining/pausing");
|
|
1727
1777
|
}
|
|
1728
1778
|
this.speechQueue.push([priority, Number(process.hrtime.bigint()), speechHandle]);
|
|
1729
1779
|
speechHandle._markScheduled();
|
|
1730
1780
|
this.wakeupMainTask();
|
|
1731
1781
|
}
|
|
1782
|
+
async _pauseSchedulingTask(blockedTasks) {
|
|
1783
|
+
if (this._schedulingPaused) return;
|
|
1784
|
+
this._schedulingPaused = true;
|
|
1785
|
+
this._drainBlockedTasks = blockedTasks;
|
|
1786
|
+
this.wakeupMainTask();
|
|
1787
|
+
if (this._mainTask) {
|
|
1788
|
+
await this._mainTask.result;
|
|
1789
|
+
}
|
|
1790
|
+
}
|
|
1791
|
+
_resumeSchedulingTask() {
|
|
1792
|
+
if (!this._schedulingPaused) return;
|
|
1793
|
+
this._schedulingPaused = false;
|
|
1794
|
+
this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
|
|
1795
|
+
}
|
|
1796
|
+
async pause(options = {}) {
|
|
1797
|
+
const { blockedTasks = [] } = options;
|
|
1798
|
+
const unlock = await this.lock.lock();
|
|
1799
|
+
try {
|
|
1800
|
+
const span = tracer.startSpan({
|
|
1801
|
+
name: "pause_agent_activity",
|
|
1802
|
+
attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
|
|
1803
|
+
});
|
|
1804
|
+
try {
|
|
1805
|
+
await this._pauseSchedulingTask(blockedTasks);
|
|
1806
|
+
await this._closeSessionResources();
|
|
1807
|
+
} finally {
|
|
1808
|
+
span.end();
|
|
1809
|
+
}
|
|
1810
|
+
} finally {
|
|
1811
|
+
unlock();
|
|
1812
|
+
}
|
|
1813
|
+
}
|
|
1732
1814
|
async drain() {
|
|
1733
1815
|
return tracer.startActiveSpan(async (span) => this._drainImpl(span), {
|
|
1734
1816
|
name: "drain_agent_activity",
|
|
@@ -1736,71 +1818,79 @@ ${instructions}`;
|
|
|
1736
1818
|
});
|
|
1737
1819
|
}
|
|
1738
1820
|
async _drainImpl(span) {
|
|
1739
|
-
var _a;
|
|
1740
1821
|
span.setAttribute(traceTypes.ATTR_AGENT_LABEL, this.agent.id);
|
|
1741
1822
|
const unlock = await this.lock.lock();
|
|
1742
1823
|
try {
|
|
1743
|
-
if (this.
|
|
1744
|
-
this.
|
|
1745
|
-
|
|
1746
|
-
|
|
1747
|
-
|
|
1748
|
-
|
|
1749
|
-
|
|
1750
|
-
task: Task.from(() => onExitTask),
|
|
1824
|
+
if (this._schedulingPaused) return;
|
|
1825
|
+
this._onExitTask = this.createSpeechTask({
|
|
1826
|
+
taskFn: () => tracer.startActiveSpan(async () => this.agent.onExit(), {
|
|
1827
|
+
name: "on_exit",
|
|
1828
|
+
attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
|
|
1829
|
+
}),
|
|
1830
|
+
inlineTask: true,
|
|
1751
1831
|
name: "AgentActivity_onExit"
|
|
1752
1832
|
});
|
|
1753
|
-
this.
|
|
1754
|
-
this.
|
|
1755
|
-
await
|
|
1833
|
+
this.cancelPreemptiveGeneration();
|
|
1834
|
+
await this._onExitTask.result;
|
|
1835
|
+
await this._pauseSchedulingTask([]);
|
|
1756
1836
|
} finally {
|
|
1757
1837
|
unlock();
|
|
1758
1838
|
}
|
|
1759
1839
|
}
|
|
1760
1840
|
async close() {
|
|
1761
|
-
var _a, _b, _c, _d;
|
|
1762
1841
|
const unlock = await this.lock.lock();
|
|
1763
1842
|
try {
|
|
1764
|
-
if (!this._draining) {
|
|
1765
|
-
this.logger.warn("task closing without draining");
|
|
1766
|
-
}
|
|
1767
1843
|
this.cancelPreemptiveGeneration();
|
|
1768
|
-
|
|
1769
|
-
|
|
1770
|
-
|
|
1771
|
-
if (this.realtimeSession) {
|
|
1772
|
-
this.realtimeSession.off("generation_created", this.onGenerationCreated);
|
|
1773
|
-
this.realtimeSession.off("input_speech_started", this.onInputSpeechStarted);
|
|
1774
|
-
this.realtimeSession.off("input_speech_stopped", this.onInputSpeechStopped);
|
|
1775
|
-
this.realtimeSession.off(
|
|
1776
|
-
"input_audio_transcription_completed",
|
|
1777
|
-
this.onInputAudioTranscriptionCompleted
|
|
1778
|
-
);
|
|
1779
|
-
this.realtimeSession.off("metrics_collected", this.onMetricsCollected);
|
|
1780
|
-
}
|
|
1781
|
-
if (this.stt instanceof STT) {
|
|
1782
|
-
this.stt.off("metrics_collected", this.onMetricsCollected);
|
|
1844
|
+
await this._closeSessionResources();
|
|
1845
|
+
if (this._mainTask) {
|
|
1846
|
+
await this._mainTask.cancelAndWait();
|
|
1783
1847
|
}
|
|
1784
|
-
|
|
1785
|
-
this.tts.off("metrics_collected", this.onMetricsCollected);
|
|
1786
|
-
}
|
|
1787
|
-
if (this.vad instanceof VAD) {
|
|
1788
|
-
this.vad.off("metrics_collected", this.onMetricsCollected);
|
|
1789
|
-
}
|
|
1790
|
-
this.detachAudioInput();
|
|
1791
|
-
(_a = this.realtimeSpans) == null ? void 0 : _a.clear();
|
|
1792
|
-
await ((_b = this.realtimeSession) == null ? void 0 : _b.close());
|
|
1793
|
-
await ((_c = this.audioRecognition) == null ? void 0 : _c.close());
|
|
1794
|
-
await ((_d = this._mainTask) == null ? void 0 : _d.cancelAndWait());
|
|
1848
|
+
this.agent._agentActivity = void 0;
|
|
1795
1849
|
} finally {
|
|
1796
1850
|
unlock();
|
|
1797
1851
|
}
|
|
1798
1852
|
}
|
|
1853
|
+
async _closeSessionResources() {
|
|
1854
|
+
var _a, _b, _c;
|
|
1855
|
+
if (this.llm instanceof LLM) {
|
|
1856
|
+
this.llm.off("metrics_collected", this.onMetricsCollected);
|
|
1857
|
+
this.llm.off("error", this.onModelError);
|
|
1858
|
+
}
|
|
1859
|
+
if (this.realtimeSession) {
|
|
1860
|
+
this.realtimeSession.off("generation_created", this.onRealtimeGenerationCreated);
|
|
1861
|
+
this.realtimeSession.off("input_speech_started", this.onRealtimeInputSpeechStarted);
|
|
1862
|
+
this.realtimeSession.off("input_speech_stopped", this.onRealtimeInputSpeechStopped);
|
|
1863
|
+
this.realtimeSession.off(
|
|
1864
|
+
"input_audio_transcription_completed",
|
|
1865
|
+
this.onRealtimeInputAudioTranscriptionCompleted
|
|
1866
|
+
);
|
|
1867
|
+
this.realtimeSession.off("metrics_collected", this.onMetricsCollected);
|
|
1868
|
+
this.realtimeSession.off("error", this.onModelError);
|
|
1869
|
+
}
|
|
1870
|
+
if (this.stt instanceof STT) {
|
|
1871
|
+
this.stt.off("metrics_collected", this.onMetricsCollected);
|
|
1872
|
+
this.stt.off("error", this.onModelError);
|
|
1873
|
+
}
|
|
1874
|
+
if (this.tts instanceof TTS) {
|
|
1875
|
+
this.tts.off("metrics_collected", this.onMetricsCollected);
|
|
1876
|
+
this.tts.off("error", this.onModelError);
|
|
1877
|
+
}
|
|
1878
|
+
if (this.vad instanceof VAD) {
|
|
1879
|
+
this.vad.off("metrics_collected", this.onMetricsCollected);
|
|
1880
|
+
}
|
|
1881
|
+
this.detachAudioInput();
|
|
1882
|
+
(_a = this.realtimeSpans) == null ? void 0 : _a.clear();
|
|
1883
|
+
await ((_b = this.realtimeSession) == null ? void 0 : _b.close());
|
|
1884
|
+
await ((_c = this.audioRecognition) == null ? void 0 : _c.close());
|
|
1885
|
+
this.realtimeSession = void 0;
|
|
1886
|
+
this.audioRecognition = void 0;
|
|
1887
|
+
}
|
|
1799
1888
|
}
|
|
1800
1889
|
function toOaiToolChoice(toolChoice) {
|
|
1801
1890
|
return toolChoice !== null ? toolChoice : void 0;
|
|
1802
1891
|
}
|
|
1803
1892
|
export {
|
|
1804
|
-
AgentActivity
|
|
1893
|
+
AgentActivity,
|
|
1894
|
+
agentActivityStorage
|
|
1805
1895
|
};
|
|
1806
1896
|
//# sourceMappingURL=agent_activity.js.map
|