@livekit/agents 1.0.45 → 1.0.47
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.cjs +14 -20
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +14 -20
- package/dist/cli.js.map +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs +14 -5
- package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/job_proc_lazy_main.js +14 -5
- package/dist/ipc/job_proc_lazy_main.js.map +1 -1
- package/dist/llm/chat_context.cjs +19 -0
- package/dist/llm/chat_context.cjs.map +1 -1
- package/dist/llm/chat_context.d.cts +4 -0
- package/dist/llm/chat_context.d.ts +4 -0
- package/dist/llm/chat_context.d.ts.map +1 -1
- package/dist/llm/chat_context.js +19 -0
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/provider_format/index.cjs +2 -0
- package/dist/llm/provider_format/index.cjs.map +1 -1
- package/dist/llm/provider_format/index.d.cts +1 -1
- package/dist/llm/provider_format/index.d.ts +1 -1
- package/dist/llm/provider_format/index.d.ts.map +1 -1
- package/dist/llm/provider_format/index.js +6 -1
- package/dist/llm/provider_format/index.js.map +1 -1
- package/dist/llm/provider_format/openai.cjs +82 -2
- package/dist/llm/provider_format/openai.cjs.map +1 -1
- package/dist/llm/provider_format/openai.d.cts +1 -0
- package/dist/llm/provider_format/openai.d.ts +1 -0
- package/dist/llm/provider_format/openai.d.ts.map +1 -1
- package/dist/llm/provider_format/openai.js +80 -1
- package/dist/llm/provider_format/openai.js.map +1 -1
- package/dist/llm/provider_format/openai.test.cjs +326 -0
- package/dist/llm/provider_format/openai.test.cjs.map +1 -1
- package/dist/llm/provider_format/openai.test.js +327 -1
- package/dist/llm/provider_format/openai.test.js.map +1 -1
- package/dist/llm/provider_format/utils.cjs +4 -3
- package/dist/llm/provider_format/utils.cjs.map +1 -1
- package/dist/llm/provider_format/utils.d.ts.map +1 -1
- package/dist/llm/provider_format/utils.js +4 -3
- package/dist/llm/provider_format/utils.js.map +1 -1
- package/dist/llm/realtime.cjs.map +1 -1
- package/dist/llm/realtime.d.cts +1 -0
- package/dist/llm/realtime.d.ts +1 -0
- package/dist/llm/realtime.d.ts.map +1 -1
- package/dist/llm/realtime.js.map +1 -1
- package/dist/log.cjs +5 -2
- package/dist/log.cjs.map +1 -1
- package/dist/log.d.ts.map +1 -1
- package/dist/log.js +5 -2
- package/dist/log.js.map +1 -1
- package/dist/stream/deferred_stream.cjs +15 -6
- package/dist/stream/deferred_stream.cjs.map +1 -1
- package/dist/stream/deferred_stream.d.ts.map +1 -1
- package/dist/stream/deferred_stream.js +15 -6
- package/dist/stream/deferred_stream.js.map +1 -1
- package/dist/stream/index.cjs +3 -0
- package/dist/stream/index.cjs.map +1 -1
- package/dist/stream/index.d.cts +1 -0
- package/dist/stream/index.d.ts +1 -0
- package/dist/stream/index.d.ts.map +1 -1
- package/dist/stream/index.js +2 -0
- package/dist/stream/index.js.map +1 -1
- package/dist/stream/multi_input_stream.cjs +139 -0
- package/dist/stream/multi_input_stream.cjs.map +1 -0
- package/dist/stream/multi_input_stream.d.cts +55 -0
- package/dist/stream/multi_input_stream.d.ts +55 -0
- package/dist/stream/multi_input_stream.d.ts.map +1 -0
- package/dist/stream/multi_input_stream.js +115 -0
- package/dist/stream/multi_input_stream.js.map +1 -0
- package/dist/stream/multi_input_stream.test.cjs +340 -0
- package/dist/stream/multi_input_stream.test.cjs.map +1 -0
- package/dist/stream/multi_input_stream.test.js +339 -0
- package/dist/stream/multi_input_stream.test.js.map +1 -0
- package/dist/telemetry/trace_types.cjs +42 -0
- package/dist/telemetry/trace_types.cjs.map +1 -1
- package/dist/telemetry/trace_types.d.cts +14 -0
- package/dist/telemetry/trace_types.d.ts +14 -0
- package/dist/telemetry/trace_types.d.ts.map +1 -1
- package/dist/telemetry/trace_types.js +28 -0
- package/dist/telemetry/trace_types.js.map +1 -1
- package/dist/utils.cjs +44 -2
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +8 -0
- package/dist/utils.d.ts +8 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +44 -2
- package/dist/utils.js.map +1 -1
- package/dist/utils.test.cjs +71 -0
- package/dist/utils.test.cjs.map +1 -1
- package/dist/utils.test.js +71 -0
- package/dist/utils.test.js.map +1 -1
- package/dist/version.cjs +1 -1
- package/dist/version.cjs.map +1 -1
- package/dist/version.d.cts +1 -1
- package/dist/version.d.ts +1 -1
- package/dist/version.d.ts.map +1 -1
- package/dist/version.js +1 -1
- package/dist/version.js.map +1 -1
- package/dist/voice/agent.cjs +144 -12
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.cts +29 -4
- package/dist/voice/agent.d.ts +29 -4
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +140 -11
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent.test.cjs +120 -0
- package/dist/voice/agent.test.cjs.map +1 -1
- package/dist/voice/agent.test.js +122 -2
- package/dist/voice/agent.test.js.map +1 -1
- package/dist/voice/agent_activity.cjs +402 -292
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +35 -7
- package/dist/voice/agent_activity.d.ts +35 -7
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +402 -287
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +156 -44
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +22 -9
- package/dist/voice/agent_session.d.ts +22 -9
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +156 -44
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +89 -36
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +22 -1
- package/dist/voice/audio_recognition.d.ts +22 -1
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +93 -36
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/audio_recognition_span.test.cjs +233 -0
- package/dist/voice/audio_recognition_span.test.cjs.map +1 -0
- package/dist/voice/audio_recognition_span.test.js +232 -0
- package/dist/voice/audio_recognition_span.test.js.map +1 -0
- package/dist/voice/generation.cjs +39 -19
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +44 -20
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/index.cjs +2 -0
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +1 -1
- package/dist/voice/index.d.ts +1 -1
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js +2 -1
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/io.cjs +6 -3
- package/dist/voice/io.cjs.map +1 -1
- package/dist/voice/io.d.cts +3 -2
- package/dist/voice/io.d.ts +3 -2
- package/dist/voice/io.d.ts.map +1 -1
- package/dist/voice/io.js +6 -3
- package/dist/voice/io.js.map +1 -1
- package/dist/voice/recorder_io/recorder_io.cjs +3 -1
- package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
- package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
- package/dist/voice/recorder_io/recorder_io.js +3 -1
- package/dist/voice/recorder_io/recorder_io.js.map +1 -1
- package/dist/voice/room_io/_input.cjs +17 -17
- package/dist/voice/room_io/_input.cjs.map +1 -1
- package/dist/voice/room_io/_input.d.cts +2 -2
- package/dist/voice/room_io/_input.d.ts +2 -2
- package/dist/voice/room_io/_input.d.ts.map +1 -1
- package/dist/voice/room_io/_input.js +7 -6
- package/dist/voice/room_io/_input.js.map +1 -1
- package/dist/voice/room_io/room_io.cjs +9 -0
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.cts +3 -1
- package/dist/voice/room_io/room_io.d.ts +3 -1
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js +9 -0
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/speech_handle.cjs +7 -1
- package/dist/voice/speech_handle.cjs.map +1 -1
- package/dist/voice/speech_handle.d.cts +2 -0
- package/dist/voice/speech_handle.d.ts +2 -0
- package/dist/voice/speech_handle.d.ts.map +1 -1
- package/dist/voice/speech_handle.js +8 -2
- package/dist/voice/speech_handle.js.map +1 -1
- package/dist/voice/testing/run_result.cjs +66 -15
- package/dist/voice/testing/run_result.cjs.map +1 -1
- package/dist/voice/testing/run_result.d.cts +14 -3
- package/dist/voice/testing/run_result.d.ts +14 -3
- package/dist/voice/testing/run_result.d.ts.map +1 -1
- package/dist/voice/testing/run_result.js +66 -15
- package/dist/voice/testing/run_result.js.map +1 -1
- package/dist/voice/utils.cjs +47 -0
- package/dist/voice/utils.cjs.map +1 -0
- package/dist/voice/utils.d.cts +4 -0
- package/dist/voice/utils.d.ts +4 -0
- package/dist/voice/utils.d.ts.map +1 -0
- package/dist/voice/utils.js +23 -0
- package/dist/voice/utils.js.map +1 -0
- package/package.json +1 -1
- package/src/cli.ts +20 -33
- package/src/ipc/job_proc_lazy_main.ts +16 -5
- package/src/llm/chat_context.ts +35 -0
- package/src/llm/provider_format/index.ts +7 -2
- package/src/llm/provider_format/openai.test.ts +385 -1
- package/src/llm/provider_format/openai.ts +103 -0
- package/src/llm/provider_format/utils.ts +6 -4
- package/src/llm/realtime.ts +1 -0
- package/src/log.ts +5 -2
- package/src/stream/deferred_stream.ts +17 -6
- package/src/stream/index.ts +1 -0
- package/src/stream/multi_input_stream.test.ts +540 -0
- package/src/stream/multi_input_stream.ts +172 -0
- package/src/telemetry/trace_types.ts +18 -0
- package/src/utils.test.ts +87 -0
- package/src/utils.ts +52 -2
- package/src/version.ts +1 -1
- package/src/voice/agent.test.ts +140 -2
- package/src/voice/agent.ts +189 -10
- package/src/voice/agent_activity.ts +449 -286
- package/src/voice/agent_session.ts +195 -51
- package/src/voice/audio_recognition.ts +118 -38
- package/src/voice/audio_recognition_span.test.ts +261 -0
- package/src/voice/generation.ts +52 -23
- package/src/voice/index.ts +1 -1
- package/src/voice/io.ts +7 -4
- package/src/voice/recorder_io/recorder_io.ts +2 -1
- package/src/voice/room_io/_input.ts +11 -7
- package/src/voice/room_io/room_io.ts +12 -0
- package/src/voice/speech_handle.ts +9 -2
- package/src/voice/testing/run_result.ts +81 -23
- package/src/voice/utils.ts +29 -0
|
@@ -10,14 +10,20 @@ import {
|
|
|
10
10
|
} from "../llm/index.js";
|
|
11
11
|
import { isSameToolChoice, isSameToolContext } from "../llm/tool_context.js";
|
|
12
12
|
import { log } from "../log.js";
|
|
13
|
-
import {
|
|
13
|
+
import { MultiInputStream } from "../stream/multi_input_stream.js";
|
|
14
14
|
import { STT } from "../stt/stt.js";
|
|
15
15
|
import { recordRealtimeMetrics, traceTypes, tracer } from "../telemetry/index.js";
|
|
16
16
|
import { splitWords } from "../tokenize/basic/word.js";
|
|
17
17
|
import { TTS } from "../tts/tts.js";
|
|
18
18
|
import { Future, Task, cancelAndWait, waitFor } from "../utils.js";
|
|
19
19
|
import { VAD } from "../vad.js";
|
|
20
|
-
import {
|
|
20
|
+
import {
|
|
21
|
+
StopResponse,
|
|
22
|
+
_getActivityTaskInfo,
|
|
23
|
+
_setActivityTaskInfo,
|
|
24
|
+
functionCallStorage,
|
|
25
|
+
speechHandleStorage
|
|
26
|
+
} from "./agent.js";
|
|
21
27
|
import {} from "./agent_session.js";
|
|
22
28
|
import {
|
|
23
29
|
AudioRecognition
|
|
@@ -40,8 +46,11 @@ import {
|
|
|
40
46
|
updateInstructions
|
|
41
47
|
} from "./generation.js";
|
|
42
48
|
import { SpeechHandle } from "./speech_handle.js";
|
|
43
|
-
|
|
49
|
+
import { setParticipantSpanAttributes } from "./utils.js";
|
|
50
|
+
const agentActivityStorage = new AsyncLocalStorage();
|
|
44
51
|
class AgentActivity {
|
|
52
|
+
agent;
|
|
53
|
+
agentSession;
|
|
45
54
|
static REPLY_TASK_CANCEL_TIMEOUT = 5e3;
|
|
46
55
|
started = false;
|
|
47
56
|
audioRecognition;
|
|
@@ -50,22 +59,29 @@ class AgentActivity {
|
|
|
50
59
|
// Maps response_id to OTEL span for metrics recording
|
|
51
60
|
turnDetectionMode;
|
|
52
61
|
logger = log();
|
|
53
|
-
|
|
62
|
+
_schedulingPaused = true;
|
|
63
|
+
_drainBlockedTasks = [];
|
|
54
64
|
_currentSpeech;
|
|
55
65
|
speechQueue;
|
|
56
66
|
// [priority, timestamp, speechHandle]
|
|
57
67
|
q_updated;
|
|
58
68
|
speechTasks = /* @__PURE__ */ new Set();
|
|
59
69
|
lock = new Mutex();
|
|
60
|
-
audioStream = new
|
|
70
|
+
audioStream = new MultiInputStream();
|
|
71
|
+
audioStreamId;
|
|
61
72
|
// default to null as None, which maps to the default provider tool choice value
|
|
62
73
|
toolChoice = null;
|
|
63
74
|
_preemptiveGeneration;
|
|
64
|
-
agent;
|
|
65
|
-
agentSession;
|
|
66
75
|
/** @internal */
|
|
67
76
|
_mainTask;
|
|
77
|
+
_onEnterTask;
|
|
78
|
+
_onExitTask;
|
|
68
79
|
_userTurnCompletedTask;
|
|
80
|
+
onRealtimeGenerationCreated = (ev) => this.onGenerationCreated(ev);
|
|
81
|
+
onRealtimeInputSpeechStarted = (ev) => this.onInputSpeechStarted(ev);
|
|
82
|
+
onRealtimeInputSpeechStopped = (ev) => this.onInputSpeechStopped(ev);
|
|
83
|
+
onRealtimeInputAudioTranscriptionCompleted = (ev) => this.onInputAudioTranscriptionCompleted(ev);
|
|
84
|
+
onModelError = (ev) => this.onError(ev);
|
|
69
85
|
constructor(agent, agentSession) {
|
|
70
86
|
this.agent = agent;
|
|
71
87
|
this.agentSession = agentSession;
|
|
@@ -76,7 +92,7 @@ class AgentActivity {
|
|
|
76
92
|
this.turnDetectionMode = typeof this.turnDetection === "string" ? this.turnDetection : void 0;
|
|
77
93
|
if (this.turnDetectionMode === "vad" && this.vad === void 0) {
|
|
78
94
|
this.logger.warn(
|
|
79
|
-
'turnDetection is set to "vad", but no VAD model is provided, ignoring the
|
|
95
|
+
'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDetection setting'
|
|
80
96
|
);
|
|
81
97
|
this.turnDetectionMode = void 0;
|
|
82
98
|
}
|
|
@@ -128,98 +144,119 @@ class AgentActivity {
|
|
|
128
144
|
async start() {
|
|
129
145
|
const unlock = await this.lock.lock();
|
|
130
146
|
try {
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
} else if (this.llm instanceof LLM) {
|
|
171
|
-
try {
|
|
172
|
-
updateInstructions({
|
|
173
|
-
chatCtx: this.agent._chatCtx,
|
|
174
|
-
instructions: this.agent.instructions,
|
|
175
|
-
addIfMissing: true
|
|
176
|
-
});
|
|
177
|
-
} catch (error) {
|
|
178
|
-
this.logger.error("failed to update the instructions", error);
|
|
179
|
-
}
|
|
147
|
+
await this._startSession({ spanName: "start_agent_activity", runOnEnter: true });
|
|
148
|
+
} finally {
|
|
149
|
+
unlock();
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
async resume() {
|
|
153
|
+
const unlock = await this.lock.lock();
|
|
154
|
+
try {
|
|
155
|
+
await this._startSession({ spanName: "resume_agent_activity", runOnEnter: false });
|
|
156
|
+
} finally {
|
|
157
|
+
unlock();
|
|
158
|
+
}
|
|
159
|
+
}
|
|
160
|
+
async _startSession(options) {
|
|
161
|
+
var _a;
|
|
162
|
+
const { spanName, runOnEnter } = options;
|
|
163
|
+
const startSpan = tracer.startSpan({
|
|
164
|
+
name: spanName,
|
|
165
|
+
attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
|
|
166
|
+
context: ROOT_CONTEXT
|
|
167
|
+
});
|
|
168
|
+
this.agent._agentActivity = this;
|
|
169
|
+
if (this.llm instanceof RealtimeModel) {
|
|
170
|
+
this.realtimeSession = this.llm.session();
|
|
171
|
+
this.realtimeSpans = /* @__PURE__ */ new Map();
|
|
172
|
+
this.realtimeSession.on("generation_created", this.onRealtimeGenerationCreated);
|
|
173
|
+
this.realtimeSession.on("input_speech_started", this.onRealtimeInputSpeechStarted);
|
|
174
|
+
this.realtimeSession.on("input_speech_stopped", this.onRealtimeInputSpeechStopped);
|
|
175
|
+
this.realtimeSession.on(
|
|
176
|
+
"input_audio_transcription_completed",
|
|
177
|
+
this.onRealtimeInputAudioTranscriptionCompleted
|
|
178
|
+
);
|
|
179
|
+
this.realtimeSession.on("metrics_collected", this.onMetricsCollected);
|
|
180
|
+
this.realtimeSession.on("error", this.onModelError);
|
|
181
|
+
removeInstructions(this.agent._chatCtx);
|
|
182
|
+
try {
|
|
183
|
+
await this.realtimeSession.updateInstructions(this.agent.instructions);
|
|
184
|
+
} catch (error) {
|
|
185
|
+
this.logger.error(error, "failed to update the instructions");
|
|
180
186
|
}
|
|
181
|
-
|
|
182
|
-
this.
|
|
183
|
-
|
|
187
|
+
try {
|
|
188
|
+
await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
|
|
189
|
+
} catch (error) {
|
|
190
|
+
this.logger.error(error, "failed to update the chat context");
|
|
184
191
|
}
|
|
185
|
-
|
|
186
|
-
this.
|
|
187
|
-
|
|
192
|
+
try {
|
|
193
|
+
await this.realtimeSession.updateTools(this.tools);
|
|
194
|
+
} catch (error) {
|
|
195
|
+
this.logger.error(error, "failed to update the tools");
|
|
188
196
|
}
|
|
189
|
-
if (this.tts
|
|
190
|
-
this.
|
|
191
|
-
|
|
197
|
+
if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
|
|
198
|
+
this.logger.error(
|
|
199
|
+
"audio output is enabled but RealtimeModel has no audio modality and no TTS is set. Either enable audio modality in the RealtimeModel or set a TTS model."
|
|
200
|
+
);
|
|
192
201
|
}
|
|
193
|
-
|
|
194
|
-
|
|
202
|
+
} else if (this.llm instanceof LLM) {
|
|
203
|
+
try {
|
|
204
|
+
updateInstructions({
|
|
205
|
+
chatCtx: this.agent._chatCtx,
|
|
206
|
+
instructions: this.agent.instructions,
|
|
207
|
+
addIfMissing: true
|
|
208
|
+
});
|
|
209
|
+
} catch (error) {
|
|
210
|
+
this.logger.error("failed to update the instructions", error);
|
|
195
211
|
}
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
this.
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
this.
|
|
216
|
-
|
|
212
|
+
}
|
|
213
|
+
if (this.llm instanceof LLM) {
|
|
214
|
+
this.llm.on("metrics_collected", this.onMetricsCollected);
|
|
215
|
+
this.llm.on("error", this.onModelError);
|
|
216
|
+
}
|
|
217
|
+
if (this.stt instanceof STT) {
|
|
218
|
+
this.stt.on("metrics_collected", this.onMetricsCollected);
|
|
219
|
+
this.stt.on("error", this.onModelError);
|
|
220
|
+
}
|
|
221
|
+
if (this.tts instanceof TTS) {
|
|
222
|
+
this.tts.on("metrics_collected", this.onMetricsCollected);
|
|
223
|
+
this.tts.on("error", this.onModelError);
|
|
224
|
+
}
|
|
225
|
+
if (this.vad instanceof VAD) {
|
|
226
|
+
this.vad.on("metrics_collected", this.onMetricsCollected);
|
|
227
|
+
}
|
|
228
|
+
this.audioRecognition = new AudioRecognition({
|
|
229
|
+
recognitionHooks: this,
|
|
230
|
+
// Disable stt node if stt is not provided
|
|
231
|
+
stt: this.stt ? (...args) => this.agent.sttNode(...args) : void 0,
|
|
232
|
+
vad: this.vad,
|
|
233
|
+
turnDetector: typeof this.turnDetection === "string" ? void 0 : this.turnDetection,
|
|
234
|
+
turnDetectionMode: this.turnDetectionMode,
|
|
235
|
+
minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
|
|
236
|
+
maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
|
|
237
|
+
rootSpanContext: this.agentSession.rootSpanContext,
|
|
238
|
+
sttModel: (_a = this.stt) == null ? void 0 : _a.label,
|
|
239
|
+
sttProvider: this.getSttProvider(),
|
|
240
|
+
getLinkedParticipant: () => {
|
|
241
|
+
var _a2;
|
|
242
|
+
return (_a2 = this.agentSession._roomIO) == null ? void 0 : _a2.linkedParticipant;
|
|
243
|
+
}
|
|
244
|
+
});
|
|
245
|
+
this.audioRecognition.start();
|
|
246
|
+
this.started = true;
|
|
247
|
+
this._resumeSchedulingTask();
|
|
248
|
+
if (runOnEnter) {
|
|
249
|
+
this._onEnterTask = this.createSpeechTask({
|
|
250
|
+
taskFn: () => tracer.startActiveSpan(async () => this.agent.onEnter(), {
|
|
251
|
+
name: "on_enter",
|
|
252
|
+
context: trace.setSpan(ROOT_CONTEXT, startSpan),
|
|
253
|
+
attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
|
|
254
|
+
}),
|
|
255
|
+
inlineTask: true,
|
|
217
256
|
name: "AgentActivity_onEnter"
|
|
218
257
|
});
|
|
219
|
-
startSpan.end();
|
|
220
|
-
} finally {
|
|
221
|
-
unlock();
|
|
222
258
|
}
|
|
259
|
+
startSpan.end();
|
|
223
260
|
}
|
|
224
261
|
get currentSpeech() {
|
|
225
262
|
return this._currentSpeech;
|
|
@@ -230,6 +267,15 @@ class AgentActivity {
|
|
|
230
267
|
get stt() {
|
|
231
268
|
return this.agent.stt || this.agentSession.stt;
|
|
232
269
|
}
|
|
270
|
+
getSttProvider() {
|
|
271
|
+
var _a;
|
|
272
|
+
const label = (_a = this.stt) == null ? void 0 : _a.label;
|
|
273
|
+
if (!label) {
|
|
274
|
+
return void 0;
|
|
275
|
+
}
|
|
276
|
+
const [provider] = label.split("-", 1);
|
|
277
|
+
return provider || label;
|
|
278
|
+
}
|
|
233
279
|
get llm() {
|
|
234
280
|
return this.agent.llm || this.agentSession.llm;
|
|
235
281
|
}
|
|
@@ -239,8 +285,8 @@ class AgentActivity {
|
|
|
239
285
|
get tools() {
|
|
240
286
|
return this.agent.toolCtx;
|
|
241
287
|
}
|
|
242
|
-
get
|
|
243
|
-
return this.
|
|
288
|
+
get schedulingPaused() {
|
|
289
|
+
return this._schedulingPaused;
|
|
244
290
|
}
|
|
245
291
|
get realtimeLLMSession() {
|
|
246
292
|
return this.realtimeSession;
|
|
@@ -280,11 +326,9 @@ class AgentActivity {
|
|
|
280
326
|
}
|
|
281
327
|
}
|
|
282
328
|
attachAudioInput(audioStream) {
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
}
|
|
287
|
-
this.audioStream.setSource(audioStream);
|
|
329
|
+
void this.audioStream.close();
|
|
330
|
+
this.audioStream = new MultiInputStream();
|
|
331
|
+
this.audioStreamId = this.audioStream.addInputStream(audioStream);
|
|
288
332
|
const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
|
|
289
333
|
if (this.realtimeSession) {
|
|
290
334
|
this.realtimeSession.setInputAudioStream(realtimeAudioStream);
|
|
@@ -294,13 +338,21 @@ class AgentActivity {
|
|
|
294
338
|
}
|
|
295
339
|
}
|
|
296
340
|
detachAudioInput() {
|
|
297
|
-
this.
|
|
341
|
+
if (this.audioStreamId === void 0) {
|
|
342
|
+
return;
|
|
343
|
+
}
|
|
344
|
+
void this.audioStream.close();
|
|
345
|
+
this.audioStream = new MultiInputStream();
|
|
346
|
+
this.audioStreamId = void 0;
|
|
298
347
|
}
|
|
299
|
-
commitUserTurn() {
|
|
348
|
+
commitUserTurn(options = {}) {
|
|
349
|
+
const { audioDetached = false, throwIfNotReady = true } = options;
|
|
300
350
|
if (!this.audioRecognition) {
|
|
301
|
-
|
|
351
|
+
if (throwIfNotReady) {
|
|
352
|
+
throw new Error("AudioRecognition is not initialized");
|
|
353
|
+
}
|
|
354
|
+
return;
|
|
302
355
|
}
|
|
303
|
-
const audioDetached = false;
|
|
304
356
|
this.audioRecognition.commitUserTurn(audioDetached);
|
|
305
357
|
}
|
|
306
358
|
clearUserTurn() {
|
|
@@ -336,13 +388,11 @@ class AgentActivity {
|
|
|
336
388
|
})
|
|
337
389
|
);
|
|
338
390
|
const task = this.createSpeechTask({
|
|
339
|
-
|
|
340
|
-
(abortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio)
|
|
341
|
-
),
|
|
391
|
+
taskFn: (abortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio),
|
|
342
392
|
ownedSpeechHandle: handle,
|
|
343
393
|
name: "AgentActivity.say_tts"
|
|
344
394
|
});
|
|
345
|
-
task.finally(() => this.onPipelineReplyDone());
|
|
395
|
+
task.result.finally(() => this.onPipelineReplyDone());
|
|
346
396
|
this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
347
397
|
return handle;
|
|
348
398
|
}
|
|
@@ -432,8 +482,8 @@ class AgentActivity {
|
|
|
432
482
|
if (ev.userInitiated) {
|
|
433
483
|
return;
|
|
434
484
|
}
|
|
435
|
-
if (this.
|
|
436
|
-
this.logger.warn("skipping new realtime generation, the
|
|
485
|
+
if (this.schedulingPaused) {
|
|
486
|
+
this.logger.warn("skipping new realtime generation, the speech scheduling is not running");
|
|
437
487
|
return;
|
|
438
488
|
}
|
|
439
489
|
const handle = SpeechHandle.create({
|
|
@@ -449,9 +499,7 @@ class AgentActivity {
|
|
|
449
499
|
);
|
|
450
500
|
this.logger.info({ speech_id: handle.id }, "Creating speech handle");
|
|
451
501
|
this.createSpeechTask({
|
|
452
|
-
|
|
453
|
-
(abortController) => this.realtimeGenerationTask(handle, ev, {}, abortController)
|
|
454
|
-
),
|
|
502
|
+
taskFn: (abortController) => this.realtimeGenerationTask(handle, ev, {}, abortController),
|
|
455
503
|
ownedSpeechHandle: handle,
|
|
456
504
|
name: "AgentActivity.realtimeGeneration"
|
|
457
505
|
});
|
|
@@ -538,7 +586,7 @@ class AgentActivity {
|
|
|
538
586
|
}
|
|
539
587
|
}
|
|
540
588
|
onPreemptiveGeneration(info) {
|
|
541
|
-
if (!this.agentSession.options.preemptiveGeneration || this.
|
|
589
|
+
if (!this.agentSession.options.preemptiveGeneration || this.schedulingPaused || this._currentSpeech !== void 0 && !this._currentSpeech.interrupted || !(this.llm instanceof LLM)) {
|
|
542
590
|
return;
|
|
543
591
|
}
|
|
544
592
|
this.cancelPreemptiveGeneration();
|
|
@@ -576,7 +624,21 @@ class AgentActivity {
|
|
|
576
624
|
}
|
|
577
625
|
}
|
|
578
626
|
createSpeechTask(options) {
|
|
579
|
-
const {
|
|
627
|
+
const { taskFn, controller, ownedSpeechHandle, inlineTask, name } = options;
|
|
628
|
+
const wrappedFn = (ctrl) => {
|
|
629
|
+
return agentActivityStorage.run(this, () => {
|
|
630
|
+
const currentTask = Task.current();
|
|
631
|
+
if (currentTask) {
|
|
632
|
+
_setActivityTaskInfo(currentTask, { speechHandle: ownedSpeechHandle, inlineTask });
|
|
633
|
+
}
|
|
634
|
+
if (ownedSpeechHandle) {
|
|
635
|
+
return speechHandleStorage.run(ownedSpeechHandle, () => taskFn(ctrl));
|
|
636
|
+
}
|
|
637
|
+
return taskFn(ctrl);
|
|
638
|
+
});
|
|
639
|
+
};
|
|
640
|
+
const task = Task.from(wrappedFn, controller, name);
|
|
641
|
+
_setActivityTaskInfo(task, { speechHandle: ownedSpeechHandle, inlineTask });
|
|
580
642
|
this.speechTasks.add(task);
|
|
581
643
|
task.addDoneCallback(() => {
|
|
582
644
|
this.speechTasks.delete(task);
|
|
@@ -592,12 +654,15 @@ class AgentActivity {
|
|
|
592
654
|
task.addDoneCallback(() => {
|
|
593
655
|
this.wakeupMainTask();
|
|
594
656
|
});
|
|
595
|
-
return task
|
|
657
|
+
return task;
|
|
596
658
|
}
|
|
597
659
|
async onEndOfTurn(info) {
|
|
598
|
-
if (this.
|
|
660
|
+
if (this.schedulingPaused) {
|
|
599
661
|
this.cancelPreemptiveGeneration();
|
|
600
|
-
this.logger.warn(
|
|
662
|
+
this.logger.warn(
|
|
663
|
+
{ user_input: info.newTranscript },
|
|
664
|
+
"skipping user input, speech scheduling is paused"
|
|
665
|
+
);
|
|
601
666
|
return true;
|
|
602
667
|
}
|
|
603
668
|
if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && this.agentSession.options.minInterruptionWords > 0) {
|
|
@@ -616,7 +681,7 @@ class AgentActivity {
|
|
|
616
681
|
}
|
|
617
682
|
const oldTask = this._userTurnCompletedTask;
|
|
618
683
|
this._userTurnCompletedTask = this.createSpeechTask({
|
|
619
|
-
|
|
684
|
+
taskFn: () => this.userTurnCompleted(info, oldTask),
|
|
620
685
|
name: "AgentActivity.userTurnCompleted"
|
|
621
686
|
});
|
|
622
687
|
return true;
|
|
@@ -646,14 +711,41 @@ class AgentActivity {
|
|
|
646
711
|
await speechHandle._waitForGeneration();
|
|
647
712
|
this._currentSpeech = void 0;
|
|
648
713
|
}
|
|
649
|
-
|
|
650
|
-
|
|
714
|
+
const toWait = this.getDrainPendingSpeechTasks();
|
|
715
|
+
if (this._schedulingPaused && toWait.length === 0) {
|
|
716
|
+
this.logger.info("mainTask: scheduling paused and no more speech tasks to wait");
|
|
651
717
|
break;
|
|
652
718
|
}
|
|
653
719
|
this.q_updated = new Future();
|
|
654
720
|
}
|
|
655
721
|
this.logger.info("AgentActivity mainTask: exiting");
|
|
656
722
|
}
|
|
723
|
+
getDrainPendingSpeechTasks() {
|
|
724
|
+
const blockedHandles = [];
|
|
725
|
+
for (const task of this._drainBlockedTasks) {
|
|
726
|
+
const info = _getActivityTaskInfo(task);
|
|
727
|
+
if (!info) {
|
|
728
|
+
this.logger.error("blocked task without activity info; skipping.");
|
|
729
|
+
continue;
|
|
730
|
+
}
|
|
731
|
+
if (!info.speechHandle) {
|
|
732
|
+
continue;
|
|
733
|
+
}
|
|
734
|
+
blockedHandles.push(info.speechHandle);
|
|
735
|
+
}
|
|
736
|
+
const toWait = [];
|
|
737
|
+
for (const task of this.speechTasks) {
|
|
738
|
+
if (this._drainBlockedTasks.includes(task)) {
|
|
739
|
+
continue;
|
|
740
|
+
}
|
|
741
|
+
const info = _getActivityTaskInfo(task);
|
|
742
|
+
if (info && info.speechHandle && blockedHandles.includes(info.speechHandle)) {
|
|
743
|
+
continue;
|
|
744
|
+
}
|
|
745
|
+
toWait.push(task);
|
|
746
|
+
}
|
|
747
|
+
return toWait;
|
|
748
|
+
}
|
|
657
749
|
wakeupMainTask() {
|
|
658
750
|
this.q_updated.resolve();
|
|
659
751
|
}
|
|
@@ -679,7 +771,7 @@ class AgentActivity {
|
|
|
679
771
|
if (this.llm === void 0) {
|
|
680
772
|
throw new Error("trying to generate reply without an LLM model");
|
|
681
773
|
}
|
|
682
|
-
const functionCall = (_a =
|
|
774
|
+
const functionCall = (_a = functionCallStorage.getStore()) == null ? void 0 : _a.functionCall;
|
|
683
775
|
if (toolChoice === void 0 && functionCall !== void 0) {
|
|
684
776
|
toolChoice = "none";
|
|
685
777
|
}
|
|
@@ -697,19 +789,17 @@ class AgentActivity {
|
|
|
697
789
|
this.logger.info({ speech_id: handle.id }, "Creating speech handle");
|
|
698
790
|
if (this.llm instanceof RealtimeModel) {
|
|
699
791
|
this.createSpeechTask({
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
})
|
|
712
|
-
),
|
|
792
|
+
taskFn: (abortController) => this.realtimeReplyTask({
|
|
793
|
+
speechHandle: handle,
|
|
794
|
+
// TODO(brian): support llm.ChatMessage for the realtime model
|
|
795
|
+
userInput: userMessage == null ? void 0 : userMessage.textContent,
|
|
796
|
+
instructions,
|
|
797
|
+
modelSettings: {
|
|
798
|
+
// isGiven(toolChoice) = toolChoice !== undefined
|
|
799
|
+
toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
|
|
800
|
+
},
|
|
801
|
+
abortController
|
|
802
|
+
}),
|
|
713
803
|
ownedSpeechHandle: handle,
|
|
714
804
|
name: "AgentActivity.realtimeReply"
|
|
715
805
|
});
|
|
@@ -719,36 +809,36 @@ class AgentActivity {
|
|
|
719
809
|
${instructions}`;
|
|
720
810
|
}
|
|
721
811
|
const task = this.createSpeechTask({
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
userMessage
|
|
733
|
-
)
|
|
812
|
+
taskFn: (abortController) => this.pipelineReplyTask(
|
|
813
|
+
handle,
|
|
814
|
+
chatCtx ?? this.agent.chatCtx,
|
|
815
|
+
this.agent.toolCtx,
|
|
816
|
+
{
|
|
817
|
+
toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
|
|
818
|
+
},
|
|
819
|
+
abortController,
|
|
820
|
+
instructions,
|
|
821
|
+
userMessage
|
|
734
822
|
),
|
|
735
823
|
ownedSpeechHandle: handle,
|
|
736
824
|
name: "AgentActivity.pipelineReply"
|
|
737
825
|
});
|
|
738
|
-
task.finally(() => this.onPipelineReplyDone());
|
|
826
|
+
task.result.finally(() => this.onPipelineReplyDone());
|
|
739
827
|
}
|
|
740
828
|
if (scheduleSpeech) {
|
|
741
829
|
this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
742
830
|
}
|
|
743
831
|
return handle;
|
|
744
832
|
}
|
|
745
|
-
interrupt() {
|
|
833
|
+
interrupt(options = {}) {
|
|
746
834
|
var _a;
|
|
835
|
+
const { force = false } = options;
|
|
836
|
+
this.cancelPreemptiveGeneration();
|
|
747
837
|
const future = new Future();
|
|
748
838
|
const currentSpeech = this._currentSpeech;
|
|
749
|
-
currentSpeech == null ? void 0 : currentSpeech.interrupt();
|
|
839
|
+
currentSpeech == null ? void 0 : currentSpeech.interrupt(force);
|
|
750
840
|
for (const [_, __, speech] of this.speechQueue) {
|
|
751
|
-
speech.interrupt();
|
|
841
|
+
speech.interrupt(force);
|
|
752
842
|
}
|
|
753
843
|
(_a = this.realtimeSession) == null ? void 0 : _a.interrupt();
|
|
754
844
|
if (currentSpeech === void 0) {
|
|
@@ -769,7 +859,7 @@ ${instructions}`;
|
|
|
769
859
|
async userTurnCompleted(info, oldTask) {
|
|
770
860
|
var _a, _b;
|
|
771
861
|
if (oldTask) {
|
|
772
|
-
await oldTask;
|
|
862
|
+
await oldTask.result;
|
|
773
863
|
}
|
|
774
864
|
if (this.llm instanceof RealtimeModel) {
|
|
775
865
|
if (this.llm.capabilities.turnDetection) {
|
|
@@ -956,7 +1046,7 @@ ${instructions}`;
|
|
|
956
1046
|
toolsMessages,
|
|
957
1047
|
span
|
|
958
1048
|
}) => {
|
|
959
|
-
var _a, _b
|
|
1049
|
+
var _a, _b;
|
|
960
1050
|
speechHandle._agentTurnContext = otelContext.active();
|
|
961
1051
|
span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
|
|
962
1052
|
if (instructions) {
|
|
@@ -965,6 +1055,10 @@ ${instructions}`;
|
|
|
965
1055
|
if (newMessage) {
|
|
966
1056
|
span.setAttribute(traceTypes.ATTR_USER_INPUT, newMessage.textContent || "");
|
|
967
1057
|
}
|
|
1058
|
+
const localParticipant = (_a = this.agentSession._roomIO) == null ? void 0 : _a.localParticipant;
|
|
1059
|
+
if (localParticipant) {
|
|
1060
|
+
setParticipantSpanAttributes(span, localParticipant);
|
|
1061
|
+
}
|
|
968
1062
|
speechHandleStorage.enterWith(speechHandle);
|
|
969
1063
|
const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
|
|
970
1064
|
const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
|
|
@@ -1024,7 +1118,7 @@ ${instructions}`;
|
|
|
1024
1118
|
speechHandle._clearAuthorization();
|
|
1025
1119
|
const replyStartedAt = Date.now();
|
|
1026
1120
|
let transcriptionInput = llmOutput;
|
|
1027
|
-
if (this.useTtsAlignedTranscript && ((
|
|
1121
|
+
if (this.useTtsAlignedTranscript && ((_b = this.tts) == null ? void 0 : _b.capabilities.alignedTranscript) && ttsGenData) {
|
|
1028
1122
|
const timedTextsStream = await Promise.race([
|
|
1029
1123
|
ttsGenData.timedTextsFut.await,
|
|
1030
1124
|
(ttsTask == null ? void 0 : ttsTask.result.catch(
|
|
@@ -1098,11 +1192,11 @@ ${instructions}`;
|
|
|
1098
1192
|
for (const msg of toolsMessages) {
|
|
1099
1193
|
msg.createdAt = replyStartedAt;
|
|
1100
1194
|
}
|
|
1101
|
-
this.agent._chatCtx.insert(toolsMessages);
|
|
1102
1195
|
const toolCallOutputs = toolsMessages.filter(
|
|
1103
1196
|
(m) => m.type === "function_call_output"
|
|
1104
1197
|
);
|
|
1105
1198
|
if (toolCallOutputs.length > 0) {
|
|
1199
|
+
this.agent._chatCtx.insert(toolCallOutputs);
|
|
1106
1200
|
this.agentSession._toolItemsAdded(toolCallOutputs);
|
|
1107
1201
|
}
|
|
1108
1202
|
}
|
|
@@ -1190,45 +1284,15 @@ ${instructions}`;
|
|
|
1190
1284
|
);
|
|
1191
1285
|
return;
|
|
1192
1286
|
}
|
|
1193
|
-
const functionToolsExecutedEvent =
|
|
1194
|
-
functionCalls: [],
|
|
1195
|
-
functionCallOutputs: []
|
|
1196
|
-
});
|
|
1197
|
-
let shouldGenerateToolReply = false;
|
|
1198
|
-
let newAgentTask = null;
|
|
1199
|
-
let ignoreTaskSwitch = false;
|
|
1200
|
-
for (const sanitizedOut of toolOutput.output) {
|
|
1201
|
-
if (sanitizedOut.toolCallOutput !== void 0) {
|
|
1202
|
-
functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
|
|
1203
|
-
functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
|
|
1204
|
-
if (sanitizedOut.replyRequired) {
|
|
1205
|
-
shouldGenerateToolReply = true;
|
|
1206
|
-
}
|
|
1207
|
-
}
|
|
1208
|
-
if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
|
|
1209
|
-
this.logger.error("expected to receive only one agent task from the tool executions");
|
|
1210
|
-
ignoreTaskSwitch = true;
|
|
1211
|
-
}
|
|
1212
|
-
newAgentTask = sanitizedOut.agentTask ?? null;
|
|
1213
|
-
this.logger.debug(
|
|
1214
|
-
{
|
|
1215
|
-
speechId: speechHandle.id,
|
|
1216
|
-
name: (_b = sanitizedOut.toolCall) == null ? void 0 : _b.name,
|
|
1217
|
-
args: sanitizedOut.toolCall.args,
|
|
1218
|
-
output: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.output,
|
|
1219
|
-
isError: (_d = sanitizedOut.toolCallOutput) == null ? void 0 : _d.isError
|
|
1220
|
-
},
|
|
1221
|
-
"Tool call execution finished"
|
|
1222
|
-
);
|
|
1223
|
-
}
|
|
1287
|
+
const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } = this.summarizeToolExecutionOutput(toolOutput, speechHandle);
|
|
1224
1288
|
this.agentSession.emit(
|
|
1225
1289
|
AgentSessionEventTypes.FunctionToolsExecuted,
|
|
1226
1290
|
functionToolsExecutedEvent
|
|
1227
1291
|
);
|
|
1228
|
-
let
|
|
1292
|
+
let schedulingPaused = this.schedulingPaused;
|
|
1229
1293
|
if (!ignoreTaskSwitch && newAgentTask !== null) {
|
|
1230
1294
|
this.agentSession.updateAgent(newAgentTask);
|
|
1231
|
-
|
|
1295
|
+
schedulingPaused = true;
|
|
1232
1296
|
}
|
|
1233
1297
|
const toolMessages = [
|
|
1234
1298
|
...functionToolsExecutedEvent.functionCalls,
|
|
@@ -1237,34 +1301,32 @@ ${instructions}`;
|
|
|
1237
1301
|
if (shouldGenerateToolReply) {
|
|
1238
1302
|
chatCtx.insert(toolMessages);
|
|
1239
1303
|
speechHandle._numSteps += 1;
|
|
1240
|
-
const respondToolChoice =
|
|
1304
|
+
const respondToolChoice = schedulingPaused || modelSettings.toolChoice === "none" ? "none" : "auto";
|
|
1241
1305
|
const toolResponseTask = this.createSpeechTask({
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
|
|
1245
|
-
|
|
1246
|
-
|
|
1247
|
-
|
|
1248
|
-
|
|
1249
|
-
|
|
1250
|
-
|
|
1251
|
-
toolMessages
|
|
1252
|
-
)
|
|
1306
|
+
taskFn: () => this.pipelineReplyTask(
|
|
1307
|
+
speechHandle,
|
|
1308
|
+
chatCtx,
|
|
1309
|
+
toolCtx,
|
|
1310
|
+
{ toolChoice: respondToolChoice },
|
|
1311
|
+
replyAbortController,
|
|
1312
|
+
instructions,
|
|
1313
|
+
void 0,
|
|
1314
|
+
toolMessages
|
|
1253
1315
|
),
|
|
1254
1316
|
ownedSpeechHandle: speechHandle,
|
|
1255
1317
|
name: "AgentActivity.pipelineReply"
|
|
1256
1318
|
});
|
|
1257
|
-
toolResponseTask.finally(() => this.onPipelineReplyDone());
|
|
1319
|
+
toolResponseTask.result.finally(() => this.onPipelineReplyDone());
|
|
1258
1320
|
this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
|
|
1259
1321
|
} else if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
|
|
1260
1322
|
for (const msg of toolMessages) {
|
|
1261
1323
|
msg.createdAt = replyStartedAt;
|
|
1262
1324
|
}
|
|
1263
|
-
this.agent._chatCtx.insert(toolMessages);
|
|
1264
1325
|
const toolCallOutputs = toolMessages.filter(
|
|
1265
1326
|
(m) => m.type === "function_call_output"
|
|
1266
1327
|
);
|
|
1267
1328
|
if (toolCallOutputs.length > 0) {
|
|
1329
|
+
this.agent._chatCtx.insert(toolCallOutputs);
|
|
1268
1330
|
this.agentSession._toolItemsAdded(toolCallOutputs);
|
|
1269
1331
|
}
|
|
1270
1332
|
}
|
|
@@ -1308,9 +1370,13 @@ ${instructions}`;
|
|
|
1308
1370
|
replyAbortController,
|
|
1309
1371
|
span
|
|
1310
1372
|
}) {
|
|
1311
|
-
var _a
|
|
1373
|
+
var _a;
|
|
1312
1374
|
speechHandle._agentTurnContext = otelContext.active();
|
|
1313
1375
|
span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
|
|
1376
|
+
const localParticipant = (_a = this.agentSession._roomIO) == null ? void 0 : _a.localParticipant;
|
|
1377
|
+
if (localParticipant) {
|
|
1378
|
+
setParticipantSpanAttributes(span, localParticipant);
|
|
1379
|
+
}
|
|
1314
1380
|
speechHandleStorage.enterWith(speechHandle);
|
|
1315
1381
|
if (!this.realtimeSession) {
|
|
1316
1382
|
throw new Error("realtime session is not initialized");
|
|
@@ -1564,44 +1630,15 @@ ${instructions}`;
|
|
|
1564
1630
|
);
|
|
1565
1631
|
return;
|
|
1566
1632
|
}
|
|
1567
|
-
const functionToolsExecutedEvent =
|
|
1568
|
-
functionCalls: [],
|
|
1569
|
-
functionCallOutputs: []
|
|
1570
|
-
});
|
|
1571
|
-
let shouldGenerateToolReply = false;
|
|
1572
|
-
let newAgentTask = null;
|
|
1573
|
-
let ignoreTaskSwitch = false;
|
|
1574
|
-
for (const sanitizedOut of toolOutput.output) {
|
|
1575
|
-
if (sanitizedOut.toolCallOutput !== void 0) {
|
|
1576
|
-
functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
|
|
1577
|
-
if (sanitizedOut.replyRequired) {
|
|
1578
|
-
shouldGenerateToolReply = true;
|
|
1579
|
-
}
|
|
1580
|
-
}
|
|
1581
|
-
if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
|
|
1582
|
-
this.logger.error("expected to receive only one agent task from the tool executions");
|
|
1583
|
-
ignoreTaskSwitch = true;
|
|
1584
|
-
}
|
|
1585
|
-
newAgentTask = sanitizedOut.agentTask ?? null;
|
|
1586
|
-
this.logger.debug(
|
|
1587
|
-
{
|
|
1588
|
-
speechId: speechHandle.id,
|
|
1589
|
-
name: (_a = sanitizedOut.toolCall) == null ? void 0 : _a.name,
|
|
1590
|
-
args: sanitizedOut.toolCall.args,
|
|
1591
|
-
output: (_b = sanitizedOut.toolCallOutput) == null ? void 0 : _b.output,
|
|
1592
|
-
isError: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.isError
|
|
1593
|
-
},
|
|
1594
|
-
"Tool call execution finished"
|
|
1595
|
-
);
|
|
1596
|
-
}
|
|
1633
|
+
const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } = this.summarizeToolExecutionOutput(toolOutput, speechHandle);
|
|
1597
1634
|
this.agentSession.emit(
|
|
1598
1635
|
AgentSessionEventTypes.FunctionToolsExecuted,
|
|
1599
1636
|
functionToolsExecutedEvent
|
|
1600
1637
|
);
|
|
1601
|
-
let
|
|
1638
|
+
let schedulingPaused = this.schedulingPaused;
|
|
1602
1639
|
if (!ignoreTaskSwitch && newAgentTask !== null) {
|
|
1603
1640
|
this.agentSession.updateAgent(newAgentTask);
|
|
1604
|
-
|
|
1641
|
+
schedulingPaused = true;
|
|
1605
1642
|
}
|
|
1606
1643
|
if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
|
|
1607
1644
|
while (this.currentSpeech || this.speechQueue.size() > 0) {
|
|
@@ -1642,20 +1679,58 @@ ${instructions}`;
|
|
|
1642
1679
|
speechHandle: replySpeechHandle
|
|
1643
1680
|
})
|
|
1644
1681
|
);
|
|
1645
|
-
const toolChoice =
|
|
1682
|
+
const toolChoice = schedulingPaused || modelSettings.toolChoice === "none" ? "none" : "auto";
|
|
1646
1683
|
this.createSpeechTask({
|
|
1647
|
-
|
|
1648
|
-
|
|
1649
|
-
|
|
1650
|
-
|
|
1651
|
-
|
|
1652
|
-
})
|
|
1653
|
-
),
|
|
1684
|
+
taskFn: (abortController) => this.realtimeReplyTask({
|
|
1685
|
+
speechHandle: replySpeechHandle,
|
|
1686
|
+
modelSettings: { toolChoice },
|
|
1687
|
+
abortController
|
|
1688
|
+
}),
|
|
1654
1689
|
ownedSpeechHandle: replySpeechHandle,
|
|
1655
1690
|
name: "AgentActivity.realtime_reply"
|
|
1656
1691
|
});
|
|
1657
1692
|
this.scheduleSpeech(replySpeechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
|
|
1658
1693
|
}
|
|
1694
|
+
summarizeToolExecutionOutput(toolOutput, speechHandle) {
|
|
1695
|
+
var _a, _b, _c;
|
|
1696
|
+
const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
|
|
1697
|
+
functionCalls: [],
|
|
1698
|
+
functionCallOutputs: []
|
|
1699
|
+
});
|
|
1700
|
+
let shouldGenerateToolReply = false;
|
|
1701
|
+
let newAgentTask = null;
|
|
1702
|
+
let ignoreTaskSwitch = false;
|
|
1703
|
+
for (const sanitizedOut of toolOutput.output) {
|
|
1704
|
+
if (sanitizedOut.toolCallOutput !== void 0) {
|
|
1705
|
+
functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
|
|
1706
|
+
functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
|
|
1707
|
+
if (sanitizedOut.replyRequired) {
|
|
1708
|
+
shouldGenerateToolReply = true;
|
|
1709
|
+
}
|
|
1710
|
+
}
|
|
1711
|
+
if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
|
|
1712
|
+
this.logger.error("expected to receive only one agent task from the tool executions");
|
|
1713
|
+
ignoreTaskSwitch = true;
|
|
1714
|
+
}
|
|
1715
|
+
newAgentTask = sanitizedOut.agentTask ?? null;
|
|
1716
|
+
this.logger.debug(
|
|
1717
|
+
{
|
|
1718
|
+
speechId: speechHandle.id,
|
|
1719
|
+
name: (_a = sanitizedOut.toolCall) == null ? void 0 : _a.name,
|
|
1720
|
+
args: sanitizedOut.toolCall.args,
|
|
1721
|
+
output: (_b = sanitizedOut.toolCallOutput) == null ? void 0 : _b.output,
|
|
1722
|
+
isError: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.isError
|
|
1723
|
+
},
|
|
1724
|
+
"Tool call execution finished"
|
|
1725
|
+
);
|
|
1726
|
+
}
|
|
1727
|
+
return {
|
|
1728
|
+
functionToolsExecutedEvent,
|
|
1729
|
+
shouldGenerateToolReply,
|
|
1730
|
+
newAgentTask,
|
|
1731
|
+
ignoreTaskSwitch
|
|
1732
|
+
};
|
|
1733
|
+
}
|
|
1659
1734
|
async realtimeReplyTask({
|
|
1660
1735
|
speechHandle,
|
|
1661
1736
|
modelSettings: { toolChoice },
|
|
@@ -1697,13 +1772,45 @@ ${instructions}`;
|
|
|
1697
1772
|
}
|
|
1698
1773
|
}
|
|
1699
1774
|
scheduleSpeech(speechHandle, priority, force = false) {
|
|
1700
|
-
if (this.
|
|
1701
|
-
throw new Error("cannot schedule new speech, the
|
|
1775
|
+
if (this.schedulingPaused && !force) {
|
|
1776
|
+
throw new Error("cannot schedule new speech, the speech scheduling is draining/pausing");
|
|
1702
1777
|
}
|
|
1703
1778
|
this.speechQueue.push([priority, Number(process.hrtime.bigint()), speechHandle]);
|
|
1704
1779
|
speechHandle._markScheduled();
|
|
1705
1780
|
this.wakeupMainTask();
|
|
1706
1781
|
}
|
|
1782
|
+
async _pauseSchedulingTask(blockedTasks) {
|
|
1783
|
+
if (this._schedulingPaused) return;
|
|
1784
|
+
this._schedulingPaused = true;
|
|
1785
|
+
this._drainBlockedTasks = blockedTasks;
|
|
1786
|
+
this.wakeupMainTask();
|
|
1787
|
+
if (this._mainTask) {
|
|
1788
|
+
await this._mainTask.result;
|
|
1789
|
+
}
|
|
1790
|
+
}
|
|
1791
|
+
_resumeSchedulingTask() {
|
|
1792
|
+
if (!this._schedulingPaused) return;
|
|
1793
|
+
this._schedulingPaused = false;
|
|
1794
|
+
this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
|
|
1795
|
+
}
|
|
1796
|
+
async pause(options = {}) {
|
|
1797
|
+
const { blockedTasks = [] } = options;
|
|
1798
|
+
const unlock = await this.lock.lock();
|
|
1799
|
+
try {
|
|
1800
|
+
const span = tracer.startSpan({
|
|
1801
|
+
name: "pause_agent_activity",
|
|
1802
|
+
attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
|
|
1803
|
+
});
|
|
1804
|
+
try {
|
|
1805
|
+
await this._pauseSchedulingTask(blockedTasks);
|
|
1806
|
+
await this._closeSessionResources();
|
|
1807
|
+
} finally {
|
|
1808
|
+
span.end();
|
|
1809
|
+
}
|
|
1810
|
+
} finally {
|
|
1811
|
+
unlock();
|
|
1812
|
+
}
|
|
1813
|
+
}
|
|
1707
1814
|
async drain() {
|
|
1708
1815
|
return tracer.startActiveSpan(async (span) => this._drainImpl(span), {
|
|
1709
1816
|
name: "drain_agent_activity",
|
|
@@ -1711,71 +1818,79 @@ ${instructions}`;
|
|
|
1711
1818
|
});
|
|
1712
1819
|
}
|
|
1713
1820
|
async _drainImpl(span) {
|
|
1714
|
-
var _a;
|
|
1715
1821
|
span.setAttribute(traceTypes.ATTR_AGENT_LABEL, this.agent.id);
|
|
1716
1822
|
const unlock = await this.lock.lock();
|
|
1717
1823
|
try {
|
|
1718
|
-
if (this.
|
|
1719
|
-
this.
|
|
1720
|
-
|
|
1721
|
-
|
|
1722
|
-
|
|
1723
|
-
|
|
1724
|
-
|
|
1725
|
-
task: Task.from(() => onExitTask),
|
|
1824
|
+
if (this._schedulingPaused) return;
|
|
1825
|
+
this._onExitTask = this.createSpeechTask({
|
|
1826
|
+
taskFn: () => tracer.startActiveSpan(async () => this.agent.onExit(), {
|
|
1827
|
+
name: "on_exit",
|
|
1828
|
+
attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
|
|
1829
|
+
}),
|
|
1830
|
+
inlineTask: true,
|
|
1726
1831
|
name: "AgentActivity_onExit"
|
|
1727
1832
|
});
|
|
1728
|
-
this.
|
|
1729
|
-
this.
|
|
1730
|
-
await
|
|
1833
|
+
this.cancelPreemptiveGeneration();
|
|
1834
|
+
await this._onExitTask.result;
|
|
1835
|
+
await this._pauseSchedulingTask([]);
|
|
1731
1836
|
} finally {
|
|
1732
1837
|
unlock();
|
|
1733
1838
|
}
|
|
1734
1839
|
}
|
|
1735
1840
|
async close() {
|
|
1736
|
-
var _a, _b, _c, _d;
|
|
1737
1841
|
const unlock = await this.lock.lock();
|
|
1738
1842
|
try {
|
|
1739
|
-
if (!this._draining) {
|
|
1740
|
-
this.logger.warn("task closing without draining");
|
|
1741
|
-
}
|
|
1742
1843
|
this.cancelPreemptiveGeneration();
|
|
1743
|
-
|
|
1744
|
-
|
|
1745
|
-
|
|
1746
|
-
if (this.realtimeSession) {
|
|
1747
|
-
this.realtimeSession.off("generation_created", this.onGenerationCreated);
|
|
1748
|
-
this.realtimeSession.off("input_speech_started", this.onInputSpeechStarted);
|
|
1749
|
-
this.realtimeSession.off("input_speech_stopped", this.onInputSpeechStopped);
|
|
1750
|
-
this.realtimeSession.off(
|
|
1751
|
-
"input_audio_transcription_completed",
|
|
1752
|
-
this.onInputAudioTranscriptionCompleted
|
|
1753
|
-
);
|
|
1754
|
-
this.realtimeSession.off("metrics_collected", this.onMetricsCollected);
|
|
1755
|
-
}
|
|
1756
|
-
if (this.stt instanceof STT) {
|
|
1757
|
-
this.stt.off("metrics_collected", this.onMetricsCollected);
|
|
1758
|
-
}
|
|
1759
|
-
if (this.tts instanceof TTS) {
|
|
1760
|
-
this.tts.off("metrics_collected", this.onMetricsCollected);
|
|
1844
|
+
await this._closeSessionResources();
|
|
1845
|
+
if (this._mainTask) {
|
|
1846
|
+
await this._mainTask.cancelAndWait();
|
|
1761
1847
|
}
|
|
1762
|
-
|
|
1763
|
-
this.vad.off("metrics_collected", this.onMetricsCollected);
|
|
1764
|
-
}
|
|
1765
|
-
this.detachAudioInput();
|
|
1766
|
-
(_a = this.realtimeSpans) == null ? void 0 : _a.clear();
|
|
1767
|
-
await ((_b = this.realtimeSession) == null ? void 0 : _b.close());
|
|
1768
|
-
await ((_c = this.audioRecognition) == null ? void 0 : _c.close());
|
|
1769
|
-
await ((_d = this._mainTask) == null ? void 0 : _d.cancelAndWait());
|
|
1848
|
+
this.agent._agentActivity = void 0;
|
|
1770
1849
|
} finally {
|
|
1771
1850
|
unlock();
|
|
1772
1851
|
}
|
|
1773
1852
|
}
|
|
1853
|
+
async _closeSessionResources() {
|
|
1854
|
+
var _a, _b, _c;
|
|
1855
|
+
if (this.llm instanceof LLM) {
|
|
1856
|
+
this.llm.off("metrics_collected", this.onMetricsCollected);
|
|
1857
|
+
this.llm.off("error", this.onModelError);
|
|
1858
|
+
}
|
|
1859
|
+
if (this.realtimeSession) {
|
|
1860
|
+
this.realtimeSession.off("generation_created", this.onRealtimeGenerationCreated);
|
|
1861
|
+
this.realtimeSession.off("input_speech_started", this.onRealtimeInputSpeechStarted);
|
|
1862
|
+
this.realtimeSession.off("input_speech_stopped", this.onRealtimeInputSpeechStopped);
|
|
1863
|
+
this.realtimeSession.off(
|
|
1864
|
+
"input_audio_transcription_completed",
|
|
1865
|
+
this.onRealtimeInputAudioTranscriptionCompleted
|
|
1866
|
+
);
|
|
1867
|
+
this.realtimeSession.off("metrics_collected", this.onMetricsCollected);
|
|
1868
|
+
this.realtimeSession.off("error", this.onModelError);
|
|
1869
|
+
}
|
|
1870
|
+
if (this.stt instanceof STT) {
|
|
1871
|
+
this.stt.off("metrics_collected", this.onMetricsCollected);
|
|
1872
|
+
this.stt.off("error", this.onModelError);
|
|
1873
|
+
}
|
|
1874
|
+
if (this.tts instanceof TTS) {
|
|
1875
|
+
this.tts.off("metrics_collected", this.onMetricsCollected);
|
|
1876
|
+
this.tts.off("error", this.onModelError);
|
|
1877
|
+
}
|
|
1878
|
+
if (this.vad instanceof VAD) {
|
|
1879
|
+
this.vad.off("metrics_collected", this.onMetricsCollected);
|
|
1880
|
+
}
|
|
1881
|
+
this.detachAudioInput();
|
|
1882
|
+
(_a = this.realtimeSpans) == null ? void 0 : _a.clear();
|
|
1883
|
+
await ((_b = this.realtimeSession) == null ? void 0 : _b.close());
|
|
1884
|
+
await ((_c = this.audioRecognition) == null ? void 0 : _c.close());
|
|
1885
|
+
this.realtimeSession = void 0;
|
|
1886
|
+
this.audioRecognition = void 0;
|
|
1887
|
+
}
|
|
1774
1888
|
}
|
|
1775
1889
|
function toOaiToolChoice(toolChoice) {
|
|
1776
1890
|
return toolChoice !== null ? toolChoice : void 0;
|
|
1777
1891
|
}
|
|
1778
1892
|
export {
|
|
1779
|
-
AgentActivity
|
|
1893
|
+
AgentActivity,
|
|
1894
|
+
agentActivityStorage
|
|
1780
1895
|
};
|
|
1781
1896
|
//# sourceMappingURL=agent_activity.js.map
|