@livekit/agents 1.0.46 → 1.0.48
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/beta/index.cjs +29 -0
- package/dist/beta/index.cjs.map +1 -0
- package/dist/beta/index.d.cts +2 -0
- package/dist/beta/index.d.ts +2 -0
- package/dist/beta/index.d.ts.map +1 -0
- package/dist/beta/index.js +7 -0
- package/dist/beta/index.js.map +1 -0
- package/dist/beta/workflows/index.cjs +29 -0
- package/dist/beta/workflows/index.cjs.map +1 -0
- package/dist/beta/workflows/index.d.cts +2 -0
- package/dist/beta/workflows/index.d.ts +2 -0
- package/dist/beta/workflows/index.d.ts.map +1 -0
- package/dist/beta/workflows/index.js +7 -0
- package/dist/beta/workflows/index.js.map +1 -0
- package/dist/beta/workflows/task_group.cjs +162 -0
- package/dist/beta/workflows/task_group.cjs.map +1 -0
- package/dist/beta/workflows/task_group.d.cts +32 -0
- package/dist/beta/workflows/task_group.d.ts +32 -0
- package/dist/beta/workflows/task_group.d.ts.map +1 -0
- package/dist/beta/workflows/task_group.js +138 -0
- package/dist/beta/workflows/task_group.js.map +1 -0
- package/dist/cli.cjs +14 -20
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +14 -20
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +3 -0
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +2 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -0
- package/dist/index.js.map +1 -1
- package/dist/inference/api_protos.d.cts +59 -59
- package/dist/inference/api_protos.d.ts +59 -59
- package/dist/ipc/job_proc_lazy_main.cjs +14 -5
- package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/job_proc_lazy_main.js +14 -5
- package/dist/ipc/job_proc_lazy_main.js.map +1 -1
- package/dist/llm/chat_context.cjs +108 -1
- package/dist/llm/chat_context.cjs.map +1 -1
- package/dist/llm/chat_context.d.cts +14 -1
- package/dist/llm/chat_context.d.ts +14 -1
- package/dist/llm/chat_context.d.ts.map +1 -1
- package/dist/llm/chat_context.js +108 -1
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/chat_context.test.cjs +43 -0
- package/dist/llm/chat_context.test.cjs.map +1 -1
- package/dist/llm/chat_context.test.js +43 -0
- package/dist/llm/chat_context.test.js.map +1 -1
- package/dist/llm/index.cjs +2 -0
- package/dist/llm/index.cjs.map +1 -1
- package/dist/llm/index.d.cts +1 -1
- package/dist/llm/index.d.ts +1 -1
- package/dist/llm/index.d.ts.map +1 -1
- package/dist/llm/index.js +3 -1
- package/dist/llm/index.js.map +1 -1
- package/dist/llm/provider_format/index.cjs +2 -0
- package/dist/llm/provider_format/index.cjs.map +1 -1
- package/dist/llm/provider_format/index.d.cts +2 -2
- package/dist/llm/provider_format/index.d.ts +2 -2
- package/dist/llm/provider_format/index.d.ts.map +1 -1
- package/dist/llm/provider_format/index.js +6 -1
- package/dist/llm/provider_format/index.js.map +1 -1
- package/dist/llm/provider_format/openai.cjs +82 -2
- package/dist/llm/provider_format/openai.cjs.map +1 -1
- package/dist/llm/provider_format/openai.d.cts +1 -0
- package/dist/llm/provider_format/openai.d.ts +1 -0
- package/dist/llm/provider_format/openai.d.ts.map +1 -1
- package/dist/llm/provider_format/openai.js +80 -1
- package/dist/llm/provider_format/openai.js.map +1 -1
- package/dist/llm/provider_format/openai.test.cjs +326 -0
- package/dist/llm/provider_format/openai.test.cjs.map +1 -1
- package/dist/llm/provider_format/openai.test.js +327 -1
- package/dist/llm/provider_format/openai.test.js.map +1 -1
- package/dist/llm/provider_format/utils.cjs +4 -3
- package/dist/llm/provider_format/utils.cjs.map +1 -1
- package/dist/llm/provider_format/utils.d.ts.map +1 -1
- package/dist/llm/provider_format/utils.js +4 -3
- package/dist/llm/provider_format/utils.js.map +1 -1
- package/dist/llm/realtime.cjs.map +1 -1
- package/dist/llm/realtime.d.cts +1 -0
- package/dist/llm/realtime.d.ts +1 -0
- package/dist/llm/realtime.d.ts.map +1 -1
- package/dist/llm/realtime.js.map +1 -1
- package/dist/llm/tool_context.cjs +7 -0
- package/dist/llm/tool_context.cjs.map +1 -1
- package/dist/llm/tool_context.d.cts +10 -2
- package/dist/llm/tool_context.d.ts +10 -2
- package/dist/llm/tool_context.d.ts.map +1 -1
- package/dist/llm/tool_context.js +6 -0
- package/dist/llm/tool_context.js.map +1 -1
- package/dist/log.cjs +5 -2
- package/dist/log.cjs.map +1 -1
- package/dist/log.d.ts.map +1 -1
- package/dist/log.js +5 -2
- package/dist/log.js.map +1 -1
- package/dist/stream/deferred_stream.cjs +15 -6
- package/dist/stream/deferred_stream.cjs.map +1 -1
- package/dist/stream/deferred_stream.d.ts.map +1 -1
- package/dist/stream/deferred_stream.js +15 -6
- package/dist/stream/deferred_stream.js.map +1 -1
- package/dist/utils.cjs +32 -2
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +7 -0
- package/dist/utils.d.ts +7 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +32 -2
- package/dist/utils.js.map +1 -1
- package/dist/utils.test.cjs +71 -0
- package/dist/utils.test.cjs.map +1 -1
- package/dist/utils.test.js +71 -0
- package/dist/utils.test.js.map +1 -1
- package/dist/version.cjs +1 -1
- package/dist/version.cjs.map +1 -1
- package/dist/version.d.cts +1 -1
- package/dist/version.d.ts +1 -1
- package/dist/version.d.ts.map +1 -1
- package/dist/version.js +1 -1
- package/dist/version.js.map +1 -1
- package/dist/voice/agent.cjs +153 -12
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.cts +30 -4
- package/dist/voice/agent.d.ts +30 -4
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +149 -11
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent.test.cjs +120 -0
- package/dist/voice/agent.test.cjs.map +1 -1
- package/dist/voice/agent.test.js +122 -2
- package/dist/voice/agent.test.js.map +1 -1
- package/dist/voice/agent_activity.cjs +406 -298
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +41 -7
- package/dist/voice/agent_activity.d.ts +41 -7
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +407 -294
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +140 -40
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +19 -7
- package/dist/voice/agent_session.d.ts +19 -7
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +137 -37
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +4 -0
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +4 -0
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/generation.cjs +39 -19
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +44 -20
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/index.cjs +2 -0
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +1 -1
- package/dist/voice/index.d.ts +1 -1
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js +2 -1
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/room_io/room_io.cjs +11 -2
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js +12 -3
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/speech_handle.cjs +7 -1
- package/dist/voice/speech_handle.cjs.map +1 -1
- package/dist/voice/speech_handle.d.cts +2 -0
- package/dist/voice/speech_handle.d.ts +2 -0
- package/dist/voice/speech_handle.d.ts.map +1 -1
- package/dist/voice/speech_handle.js +8 -2
- package/dist/voice/speech_handle.js.map +1 -1
- package/dist/voice/testing/fake_llm.cjs +127 -0
- package/dist/voice/testing/fake_llm.cjs.map +1 -0
- package/dist/voice/testing/fake_llm.d.cts +30 -0
- package/dist/voice/testing/fake_llm.d.ts +30 -0
- package/dist/voice/testing/fake_llm.d.ts.map +1 -0
- package/dist/voice/testing/fake_llm.js +103 -0
- package/dist/voice/testing/fake_llm.js.map +1 -0
- package/dist/voice/testing/index.cjs +3 -0
- package/dist/voice/testing/index.cjs.map +1 -1
- package/dist/voice/testing/index.d.cts +1 -0
- package/dist/voice/testing/index.d.ts +1 -0
- package/dist/voice/testing/index.d.ts.map +1 -1
- package/dist/voice/testing/index.js +2 -0
- package/dist/voice/testing/index.js.map +1 -1
- package/dist/voice/testing/run_result.cjs +66 -15
- package/dist/voice/testing/run_result.cjs.map +1 -1
- package/dist/voice/testing/run_result.d.cts +14 -3
- package/dist/voice/testing/run_result.d.ts +14 -3
- package/dist/voice/testing/run_result.d.ts.map +1 -1
- package/dist/voice/testing/run_result.js +66 -15
- package/dist/voice/testing/run_result.js.map +1 -1
- package/package.json +1 -1
- package/src/beta/index.ts +9 -0
- package/src/beta/workflows/index.ts +9 -0
- package/src/beta/workflows/task_group.ts +194 -0
- package/src/cli.ts +20 -33
- package/src/index.ts +2 -1
- package/src/ipc/job_proc_lazy_main.ts +16 -5
- package/src/llm/chat_context.test.ts +48 -0
- package/src/llm/chat_context.ts +158 -0
- package/src/llm/index.ts +1 -0
- package/src/llm/provider_format/index.ts +7 -2
- package/src/llm/provider_format/openai.test.ts +385 -1
- package/src/llm/provider_format/openai.ts +103 -0
- package/src/llm/provider_format/utils.ts +6 -4
- package/src/llm/realtime.ts +1 -0
- package/src/llm/tool_context.ts +14 -0
- package/src/log.ts +5 -2
- package/src/stream/deferred_stream.ts +17 -6
- package/src/utils.test.ts +87 -0
- package/src/utils.ts +41 -2
- package/src/version.ts +1 -1
- package/src/voice/agent.test.ts +140 -2
- package/src/voice/agent.ts +200 -10
- package/src/voice/agent_activity.ts +466 -290
- package/src/voice/agent_session.ts +178 -40
- package/src/voice/audio_recognition.ts +4 -0
- package/src/voice/generation.ts +52 -23
- package/src/voice/index.ts +1 -1
- package/src/voice/room_io/room_io.ts +14 -3
- package/src/voice/speech_handle.ts +9 -2
- package/src/voice/testing/fake_llm.ts +138 -0
- package/src/voice/testing/index.ts +2 -0
- package/src/voice/testing/run_result.ts +81 -23
|
@@ -6,18 +6,25 @@ import { ReadableStream } from "node:stream/web";
|
|
|
6
6
|
import { ChatMessage } from "../llm/chat_context.js";
|
|
7
7
|
import {
|
|
8
8
|
LLM,
|
|
9
|
-
RealtimeModel
|
|
9
|
+
RealtimeModel,
|
|
10
|
+
ToolFlag
|
|
10
11
|
} from "../llm/index.js";
|
|
11
12
|
import { isSameToolChoice, isSameToolContext } from "../llm/tool_context.js";
|
|
12
13
|
import { log } from "../log.js";
|
|
13
|
-
import {
|
|
14
|
+
import { MultiInputStream } from "../stream/multi_input_stream.js";
|
|
14
15
|
import { STT } from "../stt/stt.js";
|
|
15
16
|
import { recordRealtimeMetrics, traceTypes, tracer } from "../telemetry/index.js";
|
|
16
17
|
import { splitWords } from "../tokenize/basic/word.js";
|
|
17
18
|
import { TTS } from "../tts/tts.js";
|
|
18
19
|
import { Future, Task, cancelAndWait, waitFor } from "../utils.js";
|
|
19
20
|
import { VAD } from "../vad.js";
|
|
20
|
-
import {
|
|
21
|
+
import {
|
|
22
|
+
StopResponse,
|
|
23
|
+
_getActivityTaskInfo,
|
|
24
|
+
_setActivityTaskInfo,
|
|
25
|
+
functionCallStorage,
|
|
26
|
+
speechHandleStorage
|
|
27
|
+
} from "./agent.js";
|
|
21
28
|
import {} from "./agent_session.js";
|
|
22
29
|
import {
|
|
23
30
|
AudioRecognition
|
|
@@ -41,8 +48,11 @@ import {
|
|
|
41
48
|
} from "./generation.js";
|
|
42
49
|
import { SpeechHandle } from "./speech_handle.js";
|
|
43
50
|
import { setParticipantSpanAttributes } from "./utils.js";
|
|
44
|
-
const
|
|
51
|
+
const agentActivityStorage = new AsyncLocalStorage();
|
|
52
|
+
const onEnterStorage = new AsyncLocalStorage();
|
|
45
53
|
class AgentActivity {
|
|
54
|
+
agent;
|
|
55
|
+
agentSession;
|
|
46
56
|
static REPLY_TASK_CANCEL_TIMEOUT = 5e3;
|
|
47
57
|
started = false;
|
|
48
58
|
audioRecognition;
|
|
@@ -51,22 +61,29 @@ class AgentActivity {
|
|
|
51
61
|
// Maps response_id to OTEL span for metrics recording
|
|
52
62
|
turnDetectionMode;
|
|
53
63
|
logger = log();
|
|
54
|
-
|
|
64
|
+
_schedulingPaused = true;
|
|
65
|
+
_drainBlockedTasks = [];
|
|
55
66
|
_currentSpeech;
|
|
56
67
|
speechQueue;
|
|
57
68
|
// [priority, timestamp, speechHandle]
|
|
58
69
|
q_updated;
|
|
59
70
|
speechTasks = /* @__PURE__ */ new Set();
|
|
60
71
|
lock = new Mutex();
|
|
61
|
-
audioStream = new
|
|
72
|
+
audioStream = new MultiInputStream();
|
|
73
|
+
audioStreamId;
|
|
62
74
|
// default to null as None, which maps to the default provider tool choice value
|
|
63
75
|
toolChoice = null;
|
|
64
76
|
_preemptiveGeneration;
|
|
65
|
-
agent;
|
|
66
|
-
agentSession;
|
|
67
77
|
/** @internal */
|
|
68
78
|
_mainTask;
|
|
79
|
+
_onEnterTask;
|
|
80
|
+
_onExitTask;
|
|
69
81
|
_userTurnCompletedTask;
|
|
82
|
+
onRealtimeGenerationCreated = (ev) => this.onGenerationCreated(ev);
|
|
83
|
+
onRealtimeInputSpeechStarted = (ev) => this.onInputSpeechStarted(ev);
|
|
84
|
+
onRealtimeInputSpeechStopped = (ev) => this.onInputSpeechStopped(ev);
|
|
85
|
+
onRealtimeInputAudioTranscriptionCompleted = (ev) => this.onInputAudioTranscriptionCompleted(ev);
|
|
86
|
+
onModelError = (ev) => this.onError(ev);
|
|
70
87
|
constructor(agent, agentSession) {
|
|
71
88
|
this.agent = agent;
|
|
72
89
|
this.agentSession = agentSession;
|
|
@@ -77,7 +94,7 @@ class AgentActivity {
|
|
|
77
94
|
this.turnDetectionMode = typeof this.turnDetection === "string" ? this.turnDetection : void 0;
|
|
78
95
|
if (this.turnDetectionMode === "vad" && this.vad === void 0) {
|
|
79
96
|
this.logger.warn(
|
|
80
|
-
'turnDetection is set to "vad", but no VAD model is provided, ignoring the
|
|
97
|
+
'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDetection setting'
|
|
81
98
|
);
|
|
82
99
|
this.turnDetectionMode = void 0;
|
|
83
100
|
}
|
|
@@ -127,107 +144,124 @@ class AgentActivity {
|
|
|
127
144
|
}
|
|
128
145
|
}
|
|
129
146
|
async start() {
|
|
130
|
-
var _a;
|
|
131
147
|
const unlock = await this.lock.lock();
|
|
132
148
|
try {
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
} else if (this.llm instanceof LLM) {
|
|
173
|
-
try {
|
|
174
|
-
updateInstructions({
|
|
175
|
-
chatCtx: this.agent._chatCtx,
|
|
176
|
-
instructions: this.agent.instructions,
|
|
177
|
-
addIfMissing: true
|
|
178
|
-
});
|
|
179
|
-
} catch (error) {
|
|
180
|
-
this.logger.error("failed to update the instructions", error);
|
|
181
|
-
}
|
|
149
|
+
await this._startSession({ spanName: "start_agent_activity", runOnEnter: true });
|
|
150
|
+
} finally {
|
|
151
|
+
unlock();
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
async resume() {
|
|
155
|
+
const unlock = await this.lock.lock();
|
|
156
|
+
try {
|
|
157
|
+
await this._startSession({ spanName: "resume_agent_activity", runOnEnter: false });
|
|
158
|
+
} finally {
|
|
159
|
+
unlock();
|
|
160
|
+
}
|
|
161
|
+
}
|
|
162
|
+
async _startSession(options) {
|
|
163
|
+
var _a;
|
|
164
|
+
const { spanName, runOnEnter } = options;
|
|
165
|
+
const startSpan = tracer.startSpan({
|
|
166
|
+
name: spanName,
|
|
167
|
+
attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
|
|
168
|
+
context: ROOT_CONTEXT
|
|
169
|
+
});
|
|
170
|
+
this.agent._agentActivity = this;
|
|
171
|
+
if (this.llm instanceof RealtimeModel) {
|
|
172
|
+
this.realtimeSession = this.llm.session();
|
|
173
|
+
this.realtimeSpans = /* @__PURE__ */ new Map();
|
|
174
|
+
this.realtimeSession.on("generation_created", this.onRealtimeGenerationCreated);
|
|
175
|
+
this.realtimeSession.on("input_speech_started", this.onRealtimeInputSpeechStarted);
|
|
176
|
+
this.realtimeSession.on("input_speech_stopped", this.onRealtimeInputSpeechStopped);
|
|
177
|
+
this.realtimeSession.on(
|
|
178
|
+
"input_audio_transcription_completed",
|
|
179
|
+
this.onRealtimeInputAudioTranscriptionCompleted
|
|
180
|
+
);
|
|
181
|
+
this.realtimeSession.on("metrics_collected", this.onMetricsCollected);
|
|
182
|
+
this.realtimeSession.on("error", this.onModelError);
|
|
183
|
+
removeInstructions(this.agent._chatCtx);
|
|
184
|
+
try {
|
|
185
|
+
await this.realtimeSession.updateInstructions(this.agent.instructions);
|
|
186
|
+
} catch (error) {
|
|
187
|
+
this.logger.error(error, "failed to update the instructions");
|
|
182
188
|
}
|
|
183
|
-
|
|
184
|
-
this.
|
|
185
|
-
|
|
189
|
+
try {
|
|
190
|
+
await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
|
|
191
|
+
} catch (error) {
|
|
192
|
+
this.logger.error(error, "failed to update the chat context");
|
|
186
193
|
}
|
|
187
|
-
|
|
188
|
-
this.
|
|
189
|
-
|
|
194
|
+
try {
|
|
195
|
+
await this.realtimeSession.updateTools(this.tools);
|
|
196
|
+
} catch (error) {
|
|
197
|
+
this.logger.error(error, "failed to update the tools");
|
|
198
|
+
}
|
|
199
|
+
if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
|
|
200
|
+
this.logger.error(
|
|
201
|
+
"audio output is enabled but RealtimeModel has no audio modality and no TTS is set. Either enable audio modality in the RealtimeModel or set a TTS model."
|
|
202
|
+
);
|
|
190
203
|
}
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
204
|
+
} else if (this.llm instanceof LLM) {
|
|
205
|
+
try {
|
|
206
|
+
updateInstructions({
|
|
207
|
+
chatCtx: this.agent._chatCtx,
|
|
208
|
+
instructions: this.agent.instructions,
|
|
209
|
+
addIfMissing: true
|
|
210
|
+
});
|
|
211
|
+
} catch (error) {
|
|
212
|
+
this.logger.error("failed to update the instructions", error);
|
|
194
213
|
}
|
|
195
|
-
|
|
196
|
-
|
|
214
|
+
}
|
|
215
|
+
if (this.llm instanceof LLM) {
|
|
216
|
+
this.llm.on("metrics_collected", this.onMetricsCollected);
|
|
217
|
+
this.llm.on("error", this.onModelError);
|
|
218
|
+
}
|
|
219
|
+
if (this.stt instanceof STT) {
|
|
220
|
+
this.stt.on("metrics_collected", this.onMetricsCollected);
|
|
221
|
+
this.stt.on("error", this.onModelError);
|
|
222
|
+
}
|
|
223
|
+
if (this.tts instanceof TTS) {
|
|
224
|
+
this.tts.on("metrics_collected", this.onMetricsCollected);
|
|
225
|
+
this.tts.on("error", this.onModelError);
|
|
226
|
+
}
|
|
227
|
+
if (this.vad instanceof VAD) {
|
|
228
|
+
this.vad.on("metrics_collected", this.onMetricsCollected);
|
|
229
|
+
}
|
|
230
|
+
this.audioRecognition = new AudioRecognition({
|
|
231
|
+
recognitionHooks: this,
|
|
232
|
+
// Disable stt node if stt is not provided
|
|
233
|
+
stt: this.stt ? (...args) => this.agent.sttNode(...args) : void 0,
|
|
234
|
+
vad: this.vad,
|
|
235
|
+
turnDetector: typeof this.turnDetection === "string" ? void 0 : this.turnDetection,
|
|
236
|
+
turnDetectionMode: this.turnDetectionMode,
|
|
237
|
+
minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
|
|
238
|
+
maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
|
|
239
|
+
rootSpanContext: this.agentSession.rootSpanContext,
|
|
240
|
+
sttModel: (_a = this.stt) == null ? void 0 : _a.label,
|
|
241
|
+
sttProvider: this.getSttProvider(),
|
|
242
|
+
getLinkedParticipant: () => {
|
|
243
|
+
var _a2;
|
|
244
|
+
return (_a2 = this.agentSession._roomIO) == null ? void 0 : _a2.linkedParticipant;
|
|
197
245
|
}
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
}
|
|
214
|
-
});
|
|
215
|
-
this.audioRecognition.start();
|
|
216
|
-
this.started = true;
|
|
217
|
-
this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
|
|
218
|
-
const onEnterTask = tracer.startActiveSpan(async () => this.agent.onEnter(), {
|
|
219
|
-
name: "on_enter",
|
|
220
|
-
context: trace.setSpan(ROOT_CONTEXT, startSpan),
|
|
221
|
-
attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
|
|
222
|
-
});
|
|
223
|
-
this.createSpeechTask({
|
|
224
|
-
task: Task.from(() => onEnterTask),
|
|
246
|
+
});
|
|
247
|
+
this.audioRecognition.start();
|
|
248
|
+
this.started = true;
|
|
249
|
+
this._resumeSchedulingTask();
|
|
250
|
+
if (runOnEnter) {
|
|
251
|
+
this._onEnterTask = this.createSpeechTask({
|
|
252
|
+
taskFn: () => onEnterStorage.run(
|
|
253
|
+
{ session: this.agentSession, agent: this.agent },
|
|
254
|
+
() => tracer.startActiveSpan(async () => this.agent.onEnter(), {
|
|
255
|
+
name: "on_enter",
|
|
256
|
+
context: trace.setSpan(ROOT_CONTEXT, startSpan),
|
|
257
|
+
attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
|
|
258
|
+
})
|
|
259
|
+
),
|
|
260
|
+
inlineTask: true,
|
|
225
261
|
name: "AgentActivity_onEnter"
|
|
226
262
|
});
|
|
227
|
-
startSpan.end();
|
|
228
|
-
} finally {
|
|
229
|
-
unlock();
|
|
230
263
|
}
|
|
264
|
+
startSpan.end();
|
|
231
265
|
}
|
|
232
266
|
get currentSpeech() {
|
|
233
267
|
return this._currentSpeech;
|
|
@@ -256,8 +290,8 @@ class AgentActivity {
|
|
|
256
290
|
get tools() {
|
|
257
291
|
return this.agent.toolCtx;
|
|
258
292
|
}
|
|
259
|
-
get
|
|
260
|
-
return this.
|
|
293
|
+
get schedulingPaused() {
|
|
294
|
+
return this._schedulingPaused;
|
|
261
295
|
}
|
|
262
296
|
get realtimeLLMSession() {
|
|
263
297
|
return this.realtimeSession;
|
|
@@ -288,6 +322,16 @@ class AgentActivity {
|
|
|
288
322
|
});
|
|
289
323
|
}
|
|
290
324
|
}
|
|
325
|
+
// TODO: Add when AgentConfigUpdate is ported to ChatContext.
|
|
326
|
+
async updateTools(tools) {
|
|
327
|
+
this.agent._tools = { ...tools };
|
|
328
|
+
if (this.realtimeSession) {
|
|
329
|
+
await this.realtimeSession.updateTools(tools);
|
|
330
|
+
}
|
|
331
|
+
if (this.llm instanceof LLM) {
|
|
332
|
+
await this.updateChatCtx(this.agent._chatCtx.copy({ toolCtx: tools }));
|
|
333
|
+
}
|
|
334
|
+
}
|
|
291
335
|
updateOptions({ toolChoice }) {
|
|
292
336
|
if (toolChoice !== void 0) {
|
|
293
337
|
this.toolChoice = toolChoice;
|
|
@@ -297,11 +341,9 @@ class AgentActivity {
|
|
|
297
341
|
}
|
|
298
342
|
}
|
|
299
343
|
attachAudioInput(audioStream) {
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
}
|
|
304
|
-
this.audioStream.setSource(audioStream);
|
|
344
|
+
void this.audioStream.close();
|
|
345
|
+
this.audioStream = new MultiInputStream();
|
|
346
|
+
this.audioStreamId = this.audioStream.addInputStream(audioStream);
|
|
305
347
|
const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
|
|
306
348
|
if (this.realtimeSession) {
|
|
307
349
|
this.realtimeSession.setInputAudioStream(realtimeAudioStream);
|
|
@@ -311,13 +353,21 @@ class AgentActivity {
|
|
|
311
353
|
}
|
|
312
354
|
}
|
|
313
355
|
detachAudioInput() {
|
|
314
|
-
this.
|
|
356
|
+
if (this.audioStreamId === void 0) {
|
|
357
|
+
return;
|
|
358
|
+
}
|
|
359
|
+
void this.audioStream.close();
|
|
360
|
+
this.audioStream = new MultiInputStream();
|
|
361
|
+
this.audioStreamId = void 0;
|
|
315
362
|
}
|
|
316
|
-
commitUserTurn() {
|
|
363
|
+
commitUserTurn(options = {}) {
|
|
364
|
+
const { audioDetached = false, throwIfNotReady = true } = options;
|
|
317
365
|
if (!this.audioRecognition) {
|
|
318
|
-
|
|
366
|
+
if (throwIfNotReady) {
|
|
367
|
+
throw new Error("AudioRecognition is not initialized");
|
|
368
|
+
}
|
|
369
|
+
return;
|
|
319
370
|
}
|
|
320
|
-
const audioDetached = false;
|
|
321
371
|
this.audioRecognition.commitUserTurn(audioDetached);
|
|
322
372
|
}
|
|
323
373
|
clearUserTurn() {
|
|
@@ -353,13 +403,11 @@ class AgentActivity {
|
|
|
353
403
|
})
|
|
354
404
|
);
|
|
355
405
|
const task = this.createSpeechTask({
|
|
356
|
-
|
|
357
|
-
(abortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio)
|
|
358
|
-
),
|
|
406
|
+
taskFn: (abortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio),
|
|
359
407
|
ownedSpeechHandle: handle,
|
|
360
408
|
name: "AgentActivity.say_tts"
|
|
361
409
|
});
|
|
362
|
-
task.finally(() => this.onPipelineReplyDone());
|
|
410
|
+
task.result.finally(() => this.onPipelineReplyDone());
|
|
363
411
|
this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
364
412
|
return handle;
|
|
365
413
|
}
|
|
@@ -449,8 +497,8 @@ class AgentActivity {
|
|
|
449
497
|
if (ev.userInitiated) {
|
|
450
498
|
return;
|
|
451
499
|
}
|
|
452
|
-
if (this.
|
|
453
|
-
this.logger.warn("skipping new realtime generation, the
|
|
500
|
+
if (this.schedulingPaused) {
|
|
501
|
+
this.logger.warn("skipping new realtime generation, the speech scheduling is not running");
|
|
454
502
|
return;
|
|
455
503
|
}
|
|
456
504
|
const handle = SpeechHandle.create({
|
|
@@ -466,9 +514,7 @@ class AgentActivity {
|
|
|
466
514
|
);
|
|
467
515
|
this.logger.info({ speech_id: handle.id }, "Creating speech handle");
|
|
468
516
|
this.createSpeechTask({
|
|
469
|
-
|
|
470
|
-
(abortController) => this.realtimeGenerationTask(handle, ev, {}, abortController)
|
|
471
|
-
),
|
|
517
|
+
taskFn: (abortController) => this.realtimeGenerationTask(handle, ev, {}, abortController),
|
|
472
518
|
ownedSpeechHandle: handle,
|
|
473
519
|
name: "AgentActivity.realtimeGeneration"
|
|
474
520
|
});
|
|
@@ -555,7 +601,7 @@ class AgentActivity {
|
|
|
555
601
|
}
|
|
556
602
|
}
|
|
557
603
|
onPreemptiveGeneration(info) {
|
|
558
|
-
if (!this.agentSession.options.preemptiveGeneration || this.
|
|
604
|
+
if (!this.agentSession.options.preemptiveGeneration || this.schedulingPaused || this._currentSpeech !== void 0 && !this._currentSpeech.interrupted || !(this.llm instanceof LLM)) {
|
|
559
605
|
return;
|
|
560
606
|
}
|
|
561
607
|
this.cancelPreemptiveGeneration();
|
|
@@ -593,7 +639,21 @@ class AgentActivity {
|
|
|
593
639
|
}
|
|
594
640
|
}
|
|
595
641
|
createSpeechTask(options) {
|
|
596
|
-
const {
|
|
642
|
+
const { taskFn, controller, ownedSpeechHandle, inlineTask, name } = options;
|
|
643
|
+
const wrappedFn = (ctrl) => {
|
|
644
|
+
return agentActivityStorage.run(this, () => {
|
|
645
|
+
const currentTask = Task.current();
|
|
646
|
+
if (currentTask) {
|
|
647
|
+
_setActivityTaskInfo(currentTask, { speechHandle: ownedSpeechHandle, inlineTask });
|
|
648
|
+
}
|
|
649
|
+
if (ownedSpeechHandle) {
|
|
650
|
+
return speechHandleStorage.run(ownedSpeechHandle, () => taskFn(ctrl));
|
|
651
|
+
}
|
|
652
|
+
return taskFn(ctrl);
|
|
653
|
+
});
|
|
654
|
+
};
|
|
655
|
+
const task = Task.from(wrappedFn, controller, name);
|
|
656
|
+
_setActivityTaskInfo(task, { speechHandle: ownedSpeechHandle, inlineTask });
|
|
597
657
|
this.speechTasks.add(task);
|
|
598
658
|
task.addDoneCallback(() => {
|
|
599
659
|
this.speechTasks.delete(task);
|
|
@@ -609,12 +669,15 @@ class AgentActivity {
|
|
|
609
669
|
task.addDoneCallback(() => {
|
|
610
670
|
this.wakeupMainTask();
|
|
611
671
|
});
|
|
612
|
-
return task
|
|
672
|
+
return task;
|
|
613
673
|
}
|
|
614
674
|
async onEndOfTurn(info) {
|
|
615
|
-
if (this.
|
|
675
|
+
if (this.schedulingPaused) {
|
|
616
676
|
this.cancelPreemptiveGeneration();
|
|
617
|
-
this.logger.warn(
|
|
677
|
+
this.logger.warn(
|
|
678
|
+
{ user_input: info.newTranscript },
|
|
679
|
+
"skipping user input, speech scheduling is paused"
|
|
680
|
+
);
|
|
618
681
|
return true;
|
|
619
682
|
}
|
|
620
683
|
if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && this.agentSession.options.minInterruptionWords > 0) {
|
|
@@ -633,7 +696,7 @@ class AgentActivity {
|
|
|
633
696
|
}
|
|
634
697
|
const oldTask = this._userTurnCompletedTask;
|
|
635
698
|
this._userTurnCompletedTask = this.createSpeechTask({
|
|
636
|
-
|
|
699
|
+
taskFn: () => this.userTurnCompleted(info, oldTask),
|
|
637
700
|
name: "AgentActivity.userTurnCompleted"
|
|
638
701
|
});
|
|
639
702
|
return true;
|
|
@@ -663,14 +726,41 @@ class AgentActivity {
|
|
|
663
726
|
await speechHandle._waitForGeneration();
|
|
664
727
|
this._currentSpeech = void 0;
|
|
665
728
|
}
|
|
666
|
-
|
|
667
|
-
|
|
729
|
+
const toWait = this.getDrainPendingSpeechTasks();
|
|
730
|
+
if (this._schedulingPaused && toWait.length === 0) {
|
|
731
|
+
this.logger.info("mainTask: scheduling paused and no more speech tasks to wait");
|
|
668
732
|
break;
|
|
669
733
|
}
|
|
670
734
|
this.q_updated = new Future();
|
|
671
735
|
}
|
|
672
736
|
this.logger.info("AgentActivity mainTask: exiting");
|
|
673
737
|
}
|
|
738
|
+
getDrainPendingSpeechTasks() {
|
|
739
|
+
const blockedHandles = [];
|
|
740
|
+
for (const task of this._drainBlockedTasks) {
|
|
741
|
+
const info = _getActivityTaskInfo(task);
|
|
742
|
+
if (!info) {
|
|
743
|
+
this.logger.error("blocked task without activity info; skipping.");
|
|
744
|
+
continue;
|
|
745
|
+
}
|
|
746
|
+
if (!info.speechHandle) {
|
|
747
|
+
continue;
|
|
748
|
+
}
|
|
749
|
+
blockedHandles.push(info.speechHandle);
|
|
750
|
+
}
|
|
751
|
+
const toWait = [];
|
|
752
|
+
for (const task of this.speechTasks) {
|
|
753
|
+
if (this._drainBlockedTasks.includes(task)) {
|
|
754
|
+
continue;
|
|
755
|
+
}
|
|
756
|
+
const info = _getActivityTaskInfo(task);
|
|
757
|
+
if (info && info.speechHandle && blockedHandles.includes(info.speechHandle)) {
|
|
758
|
+
continue;
|
|
759
|
+
}
|
|
760
|
+
toWait.push(task);
|
|
761
|
+
}
|
|
762
|
+
return toWait;
|
|
763
|
+
}
|
|
674
764
|
wakeupMainTask() {
|
|
675
765
|
this.q_updated.resolve();
|
|
676
766
|
}
|
|
@@ -696,7 +786,7 @@ class AgentActivity {
|
|
|
696
786
|
if (this.llm === void 0) {
|
|
697
787
|
throw new Error("trying to generate reply without an LLM model");
|
|
698
788
|
}
|
|
699
|
-
const functionCall = (_a =
|
|
789
|
+
const functionCall = (_a = functionCallStorage.getStore()) == null ? void 0 : _a.functionCall;
|
|
700
790
|
if (toolChoice === void 0 && functionCall !== void 0) {
|
|
701
791
|
toolChoice = "none";
|
|
702
792
|
}
|
|
@@ -714,19 +804,17 @@ class AgentActivity {
|
|
|
714
804
|
this.logger.info({ speech_id: handle.id }, "Creating speech handle");
|
|
715
805
|
if (this.llm instanceof RealtimeModel) {
|
|
716
806
|
this.createSpeechTask({
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
})
|
|
729
|
-
),
|
|
807
|
+
taskFn: (abortController) => this.realtimeReplyTask({
|
|
808
|
+
speechHandle: handle,
|
|
809
|
+
// TODO(brian): support llm.ChatMessage for the realtime model
|
|
810
|
+
userInput: userMessage == null ? void 0 : userMessage.textContent,
|
|
811
|
+
instructions,
|
|
812
|
+
modelSettings: {
|
|
813
|
+
// isGiven(toolChoice) = toolChoice !== undefined
|
|
814
|
+
toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
|
|
815
|
+
},
|
|
816
|
+
abortController
|
|
817
|
+
}),
|
|
730
818
|
ownedSpeechHandle: handle,
|
|
731
819
|
name: "AgentActivity.realtimeReply"
|
|
732
820
|
});
|
|
@@ -735,37 +823,44 @@ class AgentActivity {
|
|
|
735
823
|
instructions = `${this.agent.instructions}
|
|
736
824
|
${instructions}`;
|
|
737
825
|
}
|
|
826
|
+
const onEnterData = onEnterStorage.getStore();
|
|
827
|
+
const shouldFilterTools = (onEnterData == null ? void 0 : onEnterData.agent) === this.agent && (onEnterData == null ? void 0 : onEnterData.session) === this.agentSession;
|
|
828
|
+
const tools = shouldFilterTools ? Object.fromEntries(
|
|
829
|
+
Object.entries(this.agent.toolCtx).filter(
|
|
830
|
+
([, fnTool]) => !(fnTool.flags & ToolFlag.IGNORE_ON_ENTER)
|
|
831
|
+
)
|
|
832
|
+
) : this.agent.toolCtx;
|
|
738
833
|
const task = this.createSpeechTask({
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
userMessage
|
|
750
|
-
)
|
|
834
|
+
taskFn: (abortController) => this.pipelineReplyTask(
|
|
835
|
+
handle,
|
|
836
|
+
chatCtx ?? this.agent.chatCtx,
|
|
837
|
+
tools,
|
|
838
|
+
{
|
|
839
|
+
toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
|
|
840
|
+
},
|
|
841
|
+
abortController,
|
|
842
|
+
instructions,
|
|
843
|
+
userMessage
|
|
751
844
|
),
|
|
752
845
|
ownedSpeechHandle: handle,
|
|
753
846
|
name: "AgentActivity.pipelineReply"
|
|
754
847
|
});
|
|
755
|
-
task.finally(() => this.onPipelineReplyDone());
|
|
848
|
+
task.result.finally(() => this.onPipelineReplyDone());
|
|
756
849
|
}
|
|
757
850
|
if (scheduleSpeech) {
|
|
758
851
|
this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
759
852
|
}
|
|
760
853
|
return handle;
|
|
761
854
|
}
|
|
762
|
-
interrupt() {
|
|
855
|
+
interrupt(options = {}) {
|
|
763
856
|
var _a;
|
|
857
|
+
const { force = false } = options;
|
|
858
|
+
this.cancelPreemptiveGeneration();
|
|
764
859
|
const future = new Future();
|
|
765
860
|
const currentSpeech = this._currentSpeech;
|
|
766
|
-
currentSpeech == null ? void 0 : currentSpeech.interrupt();
|
|
861
|
+
currentSpeech == null ? void 0 : currentSpeech.interrupt(force);
|
|
767
862
|
for (const [_, __, speech] of this.speechQueue) {
|
|
768
|
-
speech.interrupt();
|
|
863
|
+
speech.interrupt(force);
|
|
769
864
|
}
|
|
770
865
|
(_a = this.realtimeSession) == null ? void 0 : _a.interrupt();
|
|
771
866
|
if (currentSpeech === void 0) {
|
|
@@ -786,7 +881,7 @@ ${instructions}`;
|
|
|
786
881
|
async userTurnCompleted(info, oldTask) {
|
|
787
882
|
var _a, _b;
|
|
788
883
|
if (oldTask) {
|
|
789
|
-
await oldTask;
|
|
884
|
+
await oldTask.result;
|
|
790
885
|
}
|
|
791
886
|
if (this.llm instanceof RealtimeModel) {
|
|
792
887
|
if (this.llm.capabilities.turnDetection) {
|
|
@@ -973,7 +1068,7 @@ ${instructions}`;
|
|
|
973
1068
|
toolsMessages,
|
|
974
1069
|
span
|
|
975
1070
|
}) => {
|
|
976
|
-
var _a, _b
|
|
1071
|
+
var _a, _b;
|
|
977
1072
|
speechHandle._agentTurnContext = otelContext.active();
|
|
978
1073
|
span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
|
|
979
1074
|
if (instructions) {
|
|
@@ -1119,11 +1214,11 @@ ${instructions}`;
|
|
|
1119
1214
|
for (const msg of toolsMessages) {
|
|
1120
1215
|
msg.createdAt = replyStartedAt;
|
|
1121
1216
|
}
|
|
1122
|
-
this.agent._chatCtx.insert(toolsMessages);
|
|
1123
1217
|
const toolCallOutputs = toolsMessages.filter(
|
|
1124
1218
|
(m) => m.type === "function_call_output"
|
|
1125
1219
|
);
|
|
1126
1220
|
if (toolCallOutputs.length > 0) {
|
|
1221
|
+
this.agent._chatCtx.insert(toolCallOutputs);
|
|
1127
1222
|
this.agentSession._toolItemsAdded(toolCallOutputs);
|
|
1128
1223
|
}
|
|
1129
1224
|
}
|
|
@@ -1211,45 +1306,15 @@ ${instructions}`;
|
|
|
1211
1306
|
);
|
|
1212
1307
|
return;
|
|
1213
1308
|
}
|
|
1214
|
-
const functionToolsExecutedEvent =
|
|
1215
|
-
functionCalls: [],
|
|
1216
|
-
functionCallOutputs: []
|
|
1217
|
-
});
|
|
1218
|
-
let shouldGenerateToolReply = false;
|
|
1219
|
-
let newAgentTask = null;
|
|
1220
|
-
let ignoreTaskSwitch = false;
|
|
1221
|
-
for (const sanitizedOut of toolOutput.output) {
|
|
1222
|
-
if (sanitizedOut.toolCallOutput !== void 0) {
|
|
1223
|
-
functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
|
|
1224
|
-
functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
|
|
1225
|
-
if (sanitizedOut.replyRequired) {
|
|
1226
|
-
shouldGenerateToolReply = true;
|
|
1227
|
-
}
|
|
1228
|
-
}
|
|
1229
|
-
if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
|
|
1230
|
-
this.logger.error("expected to receive only one agent task from the tool executions");
|
|
1231
|
-
ignoreTaskSwitch = true;
|
|
1232
|
-
}
|
|
1233
|
-
newAgentTask = sanitizedOut.agentTask ?? null;
|
|
1234
|
-
this.logger.debug(
|
|
1235
|
-
{
|
|
1236
|
-
speechId: speechHandle.id,
|
|
1237
|
-
name: (_c = sanitizedOut.toolCall) == null ? void 0 : _c.name,
|
|
1238
|
-
args: sanitizedOut.toolCall.args,
|
|
1239
|
-
output: (_d = sanitizedOut.toolCallOutput) == null ? void 0 : _d.output,
|
|
1240
|
-
isError: (_e = sanitizedOut.toolCallOutput) == null ? void 0 : _e.isError
|
|
1241
|
-
},
|
|
1242
|
-
"Tool call execution finished"
|
|
1243
|
-
);
|
|
1244
|
-
}
|
|
1309
|
+
const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } = this.summarizeToolExecutionOutput(toolOutput, speechHandle);
|
|
1245
1310
|
this.agentSession.emit(
|
|
1246
1311
|
AgentSessionEventTypes.FunctionToolsExecuted,
|
|
1247
1312
|
functionToolsExecutedEvent
|
|
1248
1313
|
);
|
|
1249
|
-
let
|
|
1314
|
+
let schedulingPaused = this.schedulingPaused;
|
|
1250
1315
|
if (!ignoreTaskSwitch && newAgentTask !== null) {
|
|
1251
1316
|
this.agentSession.updateAgent(newAgentTask);
|
|
1252
|
-
|
|
1317
|
+
schedulingPaused = true;
|
|
1253
1318
|
}
|
|
1254
1319
|
const toolMessages = [
|
|
1255
1320
|
...functionToolsExecutedEvent.functionCalls,
|
|
@@ -1258,34 +1323,32 @@ ${instructions}`;
|
|
|
1258
1323
|
if (shouldGenerateToolReply) {
|
|
1259
1324
|
chatCtx.insert(toolMessages);
|
|
1260
1325
|
speechHandle._numSteps += 1;
|
|
1261
|
-
const respondToolChoice =
|
|
1326
|
+
const respondToolChoice = schedulingPaused || modelSettings.toolChoice === "none" ? "none" : "auto";
|
|
1262
1327
|
const toolResponseTask = this.createSpeechTask({
|
|
1263
|
-
|
|
1264
|
-
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
toolMessages
|
|
1273
|
-
)
|
|
1328
|
+
taskFn: () => this.pipelineReplyTask(
|
|
1329
|
+
speechHandle,
|
|
1330
|
+
chatCtx,
|
|
1331
|
+
toolCtx,
|
|
1332
|
+
{ toolChoice: respondToolChoice },
|
|
1333
|
+
replyAbortController,
|
|
1334
|
+
instructions,
|
|
1335
|
+
void 0,
|
|
1336
|
+
toolMessages
|
|
1274
1337
|
),
|
|
1275
1338
|
ownedSpeechHandle: speechHandle,
|
|
1276
1339
|
name: "AgentActivity.pipelineReply"
|
|
1277
1340
|
});
|
|
1278
|
-
toolResponseTask.finally(() => this.onPipelineReplyDone());
|
|
1341
|
+
toolResponseTask.result.finally(() => this.onPipelineReplyDone());
|
|
1279
1342
|
this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
|
|
1280
1343
|
} else if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
|
|
1281
1344
|
for (const msg of toolMessages) {
|
|
1282
1345
|
msg.createdAt = replyStartedAt;
|
|
1283
1346
|
}
|
|
1284
|
-
this.agent._chatCtx.insert(toolMessages);
|
|
1285
1347
|
const toolCallOutputs = toolMessages.filter(
|
|
1286
1348
|
(m) => m.type === "function_call_output"
|
|
1287
1349
|
);
|
|
1288
1350
|
if (toolCallOutputs.length > 0) {
|
|
1351
|
+
this.agent._chatCtx.insert(toolCallOutputs);
|
|
1289
1352
|
this.agentSession._toolItemsAdded(toolCallOutputs);
|
|
1290
1353
|
}
|
|
1291
1354
|
}
|
|
@@ -1329,7 +1392,7 @@ ${instructions}`;
|
|
|
1329
1392
|
replyAbortController,
|
|
1330
1393
|
span
|
|
1331
1394
|
}) {
|
|
1332
|
-
var _a
|
|
1395
|
+
var _a;
|
|
1333
1396
|
speechHandle._agentTurnContext = otelContext.active();
|
|
1334
1397
|
span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
|
|
1335
1398
|
const localParticipant = (_a = this.agentSession._roomIO) == null ? void 0 : _a.localParticipant;
|
|
@@ -1589,44 +1652,15 @@ ${instructions}`;
|
|
|
1589
1652
|
);
|
|
1590
1653
|
return;
|
|
1591
1654
|
}
|
|
1592
|
-
const functionToolsExecutedEvent =
|
|
1593
|
-
functionCalls: [],
|
|
1594
|
-
functionCallOutputs: []
|
|
1595
|
-
});
|
|
1596
|
-
let shouldGenerateToolReply = false;
|
|
1597
|
-
let newAgentTask = null;
|
|
1598
|
-
let ignoreTaskSwitch = false;
|
|
1599
|
-
for (const sanitizedOut of toolOutput.output) {
|
|
1600
|
-
if (sanitizedOut.toolCallOutput !== void 0) {
|
|
1601
|
-
functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
|
|
1602
|
-
if (sanitizedOut.replyRequired) {
|
|
1603
|
-
shouldGenerateToolReply = true;
|
|
1604
|
-
}
|
|
1605
|
-
}
|
|
1606
|
-
if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
|
|
1607
|
-
this.logger.error("expected to receive only one agent task from the tool executions");
|
|
1608
|
-
ignoreTaskSwitch = true;
|
|
1609
|
-
}
|
|
1610
|
-
newAgentTask = sanitizedOut.agentTask ?? null;
|
|
1611
|
-
this.logger.debug(
|
|
1612
|
-
{
|
|
1613
|
-
speechId: speechHandle.id,
|
|
1614
|
-
name: (_b = sanitizedOut.toolCall) == null ? void 0 : _b.name,
|
|
1615
|
-
args: sanitizedOut.toolCall.args,
|
|
1616
|
-
output: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.output,
|
|
1617
|
-
isError: (_d = sanitizedOut.toolCallOutput) == null ? void 0 : _d.isError
|
|
1618
|
-
},
|
|
1619
|
-
"Tool call execution finished"
|
|
1620
|
-
);
|
|
1621
|
-
}
|
|
1655
|
+
const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } = this.summarizeToolExecutionOutput(toolOutput, speechHandle);
|
|
1622
1656
|
this.agentSession.emit(
|
|
1623
1657
|
AgentSessionEventTypes.FunctionToolsExecuted,
|
|
1624
1658
|
functionToolsExecutedEvent
|
|
1625
1659
|
);
|
|
1626
|
-
let
|
|
1660
|
+
let schedulingPaused = this.schedulingPaused;
|
|
1627
1661
|
if (!ignoreTaskSwitch && newAgentTask !== null) {
|
|
1628
1662
|
this.agentSession.updateAgent(newAgentTask);
|
|
1629
|
-
|
|
1663
|
+
schedulingPaused = true;
|
|
1630
1664
|
}
|
|
1631
1665
|
if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
|
|
1632
1666
|
while (this.currentSpeech || this.speechQueue.size() > 0) {
|
|
@@ -1667,20 +1701,58 @@ ${instructions}`;
|
|
|
1667
1701
|
speechHandle: replySpeechHandle
|
|
1668
1702
|
})
|
|
1669
1703
|
);
|
|
1670
|
-
const toolChoice =
|
|
1704
|
+
const toolChoice = schedulingPaused || modelSettings.toolChoice === "none" ? "none" : "auto";
|
|
1671
1705
|
this.createSpeechTask({
|
|
1672
|
-
|
|
1673
|
-
|
|
1674
|
-
|
|
1675
|
-
|
|
1676
|
-
|
|
1677
|
-
})
|
|
1678
|
-
),
|
|
1706
|
+
taskFn: (abortController) => this.realtimeReplyTask({
|
|
1707
|
+
speechHandle: replySpeechHandle,
|
|
1708
|
+
modelSettings: { toolChoice },
|
|
1709
|
+
abortController
|
|
1710
|
+
}),
|
|
1679
1711
|
ownedSpeechHandle: replySpeechHandle,
|
|
1680
1712
|
name: "AgentActivity.realtime_reply"
|
|
1681
1713
|
});
|
|
1682
1714
|
this.scheduleSpeech(replySpeechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
|
|
1683
1715
|
}
|
|
1716
|
+
summarizeToolExecutionOutput(toolOutput, speechHandle) {
|
|
1717
|
+
var _a, _b, _c;
|
|
1718
|
+
const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
|
|
1719
|
+
functionCalls: [],
|
|
1720
|
+
functionCallOutputs: []
|
|
1721
|
+
});
|
|
1722
|
+
let shouldGenerateToolReply = false;
|
|
1723
|
+
let newAgentTask = null;
|
|
1724
|
+
let ignoreTaskSwitch = false;
|
|
1725
|
+
for (const sanitizedOut of toolOutput.output) {
|
|
1726
|
+
if (sanitizedOut.toolCallOutput !== void 0) {
|
|
1727
|
+
functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
|
|
1728
|
+
functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
|
|
1729
|
+
if (sanitizedOut.replyRequired) {
|
|
1730
|
+
shouldGenerateToolReply = true;
|
|
1731
|
+
}
|
|
1732
|
+
}
|
|
1733
|
+
if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
|
|
1734
|
+
this.logger.error("expected to receive only one agent task from the tool executions");
|
|
1735
|
+
ignoreTaskSwitch = true;
|
|
1736
|
+
}
|
|
1737
|
+
newAgentTask = sanitizedOut.agentTask ?? null;
|
|
1738
|
+
this.logger.debug(
|
|
1739
|
+
{
|
|
1740
|
+
speechId: speechHandle.id,
|
|
1741
|
+
name: (_a = sanitizedOut.toolCall) == null ? void 0 : _a.name,
|
|
1742
|
+
args: sanitizedOut.toolCall.args,
|
|
1743
|
+
output: (_b = sanitizedOut.toolCallOutput) == null ? void 0 : _b.output,
|
|
1744
|
+
isError: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.isError
|
|
1745
|
+
},
|
|
1746
|
+
"Tool call execution finished"
|
|
1747
|
+
);
|
|
1748
|
+
}
|
|
1749
|
+
return {
|
|
1750
|
+
functionToolsExecutedEvent,
|
|
1751
|
+
shouldGenerateToolReply,
|
|
1752
|
+
newAgentTask,
|
|
1753
|
+
ignoreTaskSwitch
|
|
1754
|
+
};
|
|
1755
|
+
}
|
|
1684
1756
|
async realtimeReplyTask({
|
|
1685
1757
|
speechHandle,
|
|
1686
1758
|
modelSettings: { toolChoice },
|
|
@@ -1722,13 +1794,45 @@ ${instructions}`;
|
|
|
1722
1794
|
}
|
|
1723
1795
|
}
|
|
1724
1796
|
scheduleSpeech(speechHandle, priority, force = false) {
|
|
1725
|
-
if (this.
|
|
1726
|
-
throw new Error("cannot schedule new speech, the
|
|
1797
|
+
if (this.schedulingPaused && !force) {
|
|
1798
|
+
throw new Error("cannot schedule new speech, the speech scheduling is draining/pausing");
|
|
1727
1799
|
}
|
|
1728
1800
|
this.speechQueue.push([priority, Number(process.hrtime.bigint()), speechHandle]);
|
|
1729
1801
|
speechHandle._markScheduled();
|
|
1730
1802
|
this.wakeupMainTask();
|
|
1731
1803
|
}
|
|
1804
|
+
async _pauseSchedulingTask(blockedTasks) {
|
|
1805
|
+
if (this._schedulingPaused) return;
|
|
1806
|
+
this._schedulingPaused = true;
|
|
1807
|
+
this._drainBlockedTasks = blockedTasks;
|
|
1808
|
+
this.wakeupMainTask();
|
|
1809
|
+
if (this._mainTask) {
|
|
1810
|
+
await this._mainTask.result;
|
|
1811
|
+
}
|
|
1812
|
+
}
|
|
1813
|
+
_resumeSchedulingTask() {
|
|
1814
|
+
if (!this._schedulingPaused) return;
|
|
1815
|
+
this._schedulingPaused = false;
|
|
1816
|
+
this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
|
|
1817
|
+
}
|
|
1818
|
+
async pause(options = {}) {
|
|
1819
|
+
const { blockedTasks = [] } = options;
|
|
1820
|
+
const unlock = await this.lock.lock();
|
|
1821
|
+
try {
|
|
1822
|
+
const span = tracer.startSpan({
|
|
1823
|
+
name: "pause_agent_activity",
|
|
1824
|
+
attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
|
|
1825
|
+
});
|
|
1826
|
+
try {
|
|
1827
|
+
await this._pauseSchedulingTask(blockedTasks);
|
|
1828
|
+
await this._closeSessionResources();
|
|
1829
|
+
} finally {
|
|
1830
|
+
span.end();
|
|
1831
|
+
}
|
|
1832
|
+
} finally {
|
|
1833
|
+
unlock();
|
|
1834
|
+
}
|
|
1835
|
+
}
|
|
1732
1836
|
async drain() {
|
|
1733
1837
|
return tracer.startActiveSpan(async (span) => this._drainImpl(span), {
|
|
1734
1838
|
name: "drain_agent_activity",
|
|
@@ -1736,71 +1840,80 @@ ${instructions}`;
|
|
|
1736
1840
|
});
|
|
1737
1841
|
}
|
|
1738
1842
|
async _drainImpl(span) {
|
|
1739
|
-
var _a;
|
|
1740
1843
|
span.setAttribute(traceTypes.ATTR_AGENT_LABEL, this.agent.id);
|
|
1741
1844
|
const unlock = await this.lock.lock();
|
|
1742
1845
|
try {
|
|
1743
|
-
if (this.
|
|
1744
|
-
this.
|
|
1745
|
-
|
|
1746
|
-
|
|
1747
|
-
|
|
1748
|
-
|
|
1749
|
-
|
|
1750
|
-
task: Task.from(() => onExitTask),
|
|
1846
|
+
if (this._schedulingPaused) return;
|
|
1847
|
+
this._onExitTask = this.createSpeechTask({
|
|
1848
|
+
taskFn: () => tracer.startActiveSpan(async () => this.agent.onExit(), {
|
|
1849
|
+
name: "on_exit",
|
|
1850
|
+
attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
|
|
1851
|
+
}),
|
|
1852
|
+
inlineTask: true,
|
|
1751
1853
|
name: "AgentActivity_onExit"
|
|
1752
1854
|
});
|
|
1753
|
-
this.
|
|
1754
|
-
this.
|
|
1755
|
-
await
|
|
1855
|
+
this.cancelPreemptiveGeneration();
|
|
1856
|
+
await this._onExitTask.result;
|
|
1857
|
+
await this._pauseSchedulingTask([]);
|
|
1756
1858
|
} finally {
|
|
1757
1859
|
unlock();
|
|
1758
1860
|
}
|
|
1759
1861
|
}
|
|
1760
1862
|
async close() {
|
|
1761
|
-
var _a, _b, _c, _d;
|
|
1762
1863
|
const unlock = await this.lock.lock();
|
|
1763
1864
|
try {
|
|
1764
|
-
if (!this._draining) {
|
|
1765
|
-
this.logger.warn("task closing without draining");
|
|
1766
|
-
}
|
|
1767
1865
|
this.cancelPreemptiveGeneration();
|
|
1768
|
-
|
|
1769
|
-
|
|
1770
|
-
|
|
1771
|
-
if (this.realtimeSession) {
|
|
1772
|
-
this.realtimeSession.off("generation_created", this.onGenerationCreated);
|
|
1773
|
-
this.realtimeSession.off("input_speech_started", this.onInputSpeechStarted);
|
|
1774
|
-
this.realtimeSession.off("input_speech_stopped", this.onInputSpeechStopped);
|
|
1775
|
-
this.realtimeSession.off(
|
|
1776
|
-
"input_audio_transcription_completed",
|
|
1777
|
-
this.onInputAudioTranscriptionCompleted
|
|
1778
|
-
);
|
|
1779
|
-
this.realtimeSession.off("metrics_collected", this.onMetricsCollected);
|
|
1780
|
-
}
|
|
1781
|
-
if (this.stt instanceof STT) {
|
|
1782
|
-
this.stt.off("metrics_collected", this.onMetricsCollected);
|
|
1866
|
+
await this._closeSessionResources();
|
|
1867
|
+
if (this._mainTask) {
|
|
1868
|
+
await this._mainTask.cancelAndWait();
|
|
1783
1869
|
}
|
|
1784
|
-
|
|
1785
|
-
this.tts.off("metrics_collected", this.onMetricsCollected);
|
|
1786
|
-
}
|
|
1787
|
-
if (this.vad instanceof VAD) {
|
|
1788
|
-
this.vad.off("metrics_collected", this.onMetricsCollected);
|
|
1789
|
-
}
|
|
1790
|
-
this.detachAudioInput();
|
|
1791
|
-
(_a = this.realtimeSpans) == null ? void 0 : _a.clear();
|
|
1792
|
-
await ((_b = this.realtimeSession) == null ? void 0 : _b.close());
|
|
1793
|
-
await ((_c = this.audioRecognition) == null ? void 0 : _c.close());
|
|
1794
|
-
await ((_d = this._mainTask) == null ? void 0 : _d.cancelAndWait());
|
|
1870
|
+
this.agent._agentActivity = void 0;
|
|
1795
1871
|
} finally {
|
|
1796
1872
|
unlock();
|
|
1797
1873
|
}
|
|
1798
1874
|
}
|
|
1875
|
+
async _closeSessionResources() {
|
|
1876
|
+
var _a, _b, _c;
|
|
1877
|
+
if (this.llm instanceof LLM) {
|
|
1878
|
+
this.llm.off("metrics_collected", this.onMetricsCollected);
|
|
1879
|
+
this.llm.off("error", this.onModelError);
|
|
1880
|
+
}
|
|
1881
|
+
if (this.realtimeSession) {
|
|
1882
|
+
this.realtimeSession.off("generation_created", this.onRealtimeGenerationCreated);
|
|
1883
|
+
this.realtimeSession.off("input_speech_started", this.onRealtimeInputSpeechStarted);
|
|
1884
|
+
this.realtimeSession.off("input_speech_stopped", this.onRealtimeInputSpeechStopped);
|
|
1885
|
+
this.realtimeSession.off(
|
|
1886
|
+
"input_audio_transcription_completed",
|
|
1887
|
+
this.onRealtimeInputAudioTranscriptionCompleted
|
|
1888
|
+
);
|
|
1889
|
+
this.realtimeSession.off("metrics_collected", this.onMetricsCollected);
|
|
1890
|
+
this.realtimeSession.off("error", this.onModelError);
|
|
1891
|
+
}
|
|
1892
|
+
if (this.stt instanceof STT) {
|
|
1893
|
+
this.stt.off("metrics_collected", this.onMetricsCollected);
|
|
1894
|
+
this.stt.off("error", this.onModelError);
|
|
1895
|
+
}
|
|
1896
|
+
if (this.tts instanceof TTS) {
|
|
1897
|
+
this.tts.off("metrics_collected", this.onMetricsCollected);
|
|
1898
|
+
this.tts.off("error", this.onModelError);
|
|
1899
|
+
}
|
|
1900
|
+
if (this.vad instanceof VAD) {
|
|
1901
|
+
this.vad.off("metrics_collected", this.onMetricsCollected);
|
|
1902
|
+
}
|
|
1903
|
+
this.detachAudioInput();
|
|
1904
|
+
(_a = this.realtimeSpans) == null ? void 0 : _a.clear();
|
|
1905
|
+
await ((_b = this.realtimeSession) == null ? void 0 : _b.close());
|
|
1906
|
+
await ((_c = this.audioRecognition) == null ? void 0 : _c.close());
|
|
1907
|
+
this.realtimeSession = void 0;
|
|
1908
|
+
this.audioRecognition = void 0;
|
|
1909
|
+
}
|
|
1799
1910
|
}
|
|
1800
1911
|
function toOaiToolChoice(toolChoice) {
|
|
1801
1912
|
return toolChoice !== null ? toolChoice : void 0;
|
|
1802
1913
|
}
|
|
1803
1914
|
export {
|
|
1804
|
-
AgentActivity
|
|
1915
|
+
AgentActivity,
|
|
1916
|
+
agentActivityStorage,
|
|
1917
|
+
onEnterStorage
|
|
1805
1918
|
};
|
|
1806
1919
|
//# sourceMappingURL=agent_activity.js.map
|