@livekit/agents 1.0.45 → 1.0.47
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.cjs +14 -20
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +14 -20
- package/dist/cli.js.map +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs +14 -5
- package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/job_proc_lazy_main.js +14 -5
- package/dist/ipc/job_proc_lazy_main.js.map +1 -1
- package/dist/llm/chat_context.cjs +19 -0
- package/dist/llm/chat_context.cjs.map +1 -1
- package/dist/llm/chat_context.d.cts +4 -0
- package/dist/llm/chat_context.d.ts +4 -0
- package/dist/llm/chat_context.d.ts.map +1 -1
- package/dist/llm/chat_context.js +19 -0
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/provider_format/index.cjs +2 -0
- package/dist/llm/provider_format/index.cjs.map +1 -1
- package/dist/llm/provider_format/index.d.cts +1 -1
- package/dist/llm/provider_format/index.d.ts +1 -1
- package/dist/llm/provider_format/index.d.ts.map +1 -1
- package/dist/llm/provider_format/index.js +6 -1
- package/dist/llm/provider_format/index.js.map +1 -1
- package/dist/llm/provider_format/openai.cjs +82 -2
- package/dist/llm/provider_format/openai.cjs.map +1 -1
- package/dist/llm/provider_format/openai.d.cts +1 -0
- package/dist/llm/provider_format/openai.d.ts +1 -0
- package/dist/llm/provider_format/openai.d.ts.map +1 -1
- package/dist/llm/provider_format/openai.js +80 -1
- package/dist/llm/provider_format/openai.js.map +1 -1
- package/dist/llm/provider_format/openai.test.cjs +326 -0
- package/dist/llm/provider_format/openai.test.cjs.map +1 -1
- package/dist/llm/provider_format/openai.test.js +327 -1
- package/dist/llm/provider_format/openai.test.js.map +1 -1
- package/dist/llm/provider_format/utils.cjs +4 -3
- package/dist/llm/provider_format/utils.cjs.map +1 -1
- package/dist/llm/provider_format/utils.d.ts.map +1 -1
- package/dist/llm/provider_format/utils.js +4 -3
- package/dist/llm/provider_format/utils.js.map +1 -1
- package/dist/llm/realtime.cjs.map +1 -1
- package/dist/llm/realtime.d.cts +1 -0
- package/dist/llm/realtime.d.ts +1 -0
- package/dist/llm/realtime.d.ts.map +1 -1
- package/dist/llm/realtime.js.map +1 -1
- package/dist/log.cjs +5 -2
- package/dist/log.cjs.map +1 -1
- package/dist/log.d.ts.map +1 -1
- package/dist/log.js +5 -2
- package/dist/log.js.map +1 -1
- package/dist/stream/deferred_stream.cjs +15 -6
- package/dist/stream/deferred_stream.cjs.map +1 -1
- package/dist/stream/deferred_stream.d.ts.map +1 -1
- package/dist/stream/deferred_stream.js +15 -6
- package/dist/stream/deferred_stream.js.map +1 -1
- package/dist/stream/index.cjs +3 -0
- package/dist/stream/index.cjs.map +1 -1
- package/dist/stream/index.d.cts +1 -0
- package/dist/stream/index.d.ts +1 -0
- package/dist/stream/index.d.ts.map +1 -1
- package/dist/stream/index.js +2 -0
- package/dist/stream/index.js.map +1 -1
- package/dist/stream/multi_input_stream.cjs +139 -0
- package/dist/stream/multi_input_stream.cjs.map +1 -0
- package/dist/stream/multi_input_stream.d.cts +55 -0
- package/dist/stream/multi_input_stream.d.ts +55 -0
- package/dist/stream/multi_input_stream.d.ts.map +1 -0
- package/dist/stream/multi_input_stream.js +115 -0
- package/dist/stream/multi_input_stream.js.map +1 -0
- package/dist/stream/multi_input_stream.test.cjs +340 -0
- package/dist/stream/multi_input_stream.test.cjs.map +1 -0
- package/dist/stream/multi_input_stream.test.js +339 -0
- package/dist/stream/multi_input_stream.test.js.map +1 -0
- package/dist/telemetry/trace_types.cjs +42 -0
- package/dist/telemetry/trace_types.cjs.map +1 -1
- package/dist/telemetry/trace_types.d.cts +14 -0
- package/dist/telemetry/trace_types.d.ts +14 -0
- package/dist/telemetry/trace_types.d.ts.map +1 -1
- package/dist/telemetry/trace_types.js +28 -0
- package/dist/telemetry/trace_types.js.map +1 -1
- package/dist/utils.cjs +44 -2
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +8 -0
- package/dist/utils.d.ts +8 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +44 -2
- package/dist/utils.js.map +1 -1
- package/dist/utils.test.cjs +71 -0
- package/dist/utils.test.cjs.map +1 -1
- package/dist/utils.test.js +71 -0
- package/dist/utils.test.js.map +1 -1
- package/dist/version.cjs +1 -1
- package/dist/version.cjs.map +1 -1
- package/dist/version.d.cts +1 -1
- package/dist/version.d.ts +1 -1
- package/dist/version.d.ts.map +1 -1
- package/dist/version.js +1 -1
- package/dist/version.js.map +1 -1
- package/dist/voice/agent.cjs +144 -12
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.cts +29 -4
- package/dist/voice/agent.d.ts +29 -4
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +140 -11
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent.test.cjs +120 -0
- package/dist/voice/agent.test.cjs.map +1 -1
- package/dist/voice/agent.test.js +122 -2
- package/dist/voice/agent.test.js.map +1 -1
- package/dist/voice/agent_activity.cjs +402 -292
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +35 -7
- package/dist/voice/agent_activity.d.ts +35 -7
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +402 -287
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +156 -44
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +22 -9
- package/dist/voice/agent_session.d.ts +22 -9
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +156 -44
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +89 -36
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +22 -1
- package/dist/voice/audio_recognition.d.ts +22 -1
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +93 -36
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/audio_recognition_span.test.cjs +233 -0
- package/dist/voice/audio_recognition_span.test.cjs.map +1 -0
- package/dist/voice/audio_recognition_span.test.js +232 -0
- package/dist/voice/audio_recognition_span.test.js.map +1 -0
- package/dist/voice/generation.cjs +39 -19
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +44 -20
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/index.cjs +2 -0
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +1 -1
- package/dist/voice/index.d.ts +1 -1
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js +2 -1
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/io.cjs +6 -3
- package/dist/voice/io.cjs.map +1 -1
- package/dist/voice/io.d.cts +3 -2
- package/dist/voice/io.d.ts +3 -2
- package/dist/voice/io.d.ts.map +1 -1
- package/dist/voice/io.js +6 -3
- package/dist/voice/io.js.map +1 -1
- package/dist/voice/recorder_io/recorder_io.cjs +3 -1
- package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
- package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
- package/dist/voice/recorder_io/recorder_io.js +3 -1
- package/dist/voice/recorder_io/recorder_io.js.map +1 -1
- package/dist/voice/room_io/_input.cjs +17 -17
- package/dist/voice/room_io/_input.cjs.map +1 -1
- package/dist/voice/room_io/_input.d.cts +2 -2
- package/dist/voice/room_io/_input.d.ts +2 -2
- package/dist/voice/room_io/_input.d.ts.map +1 -1
- package/dist/voice/room_io/_input.js +7 -6
- package/dist/voice/room_io/_input.js.map +1 -1
- package/dist/voice/room_io/room_io.cjs +9 -0
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.cts +3 -1
- package/dist/voice/room_io/room_io.d.ts +3 -1
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js +9 -0
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/speech_handle.cjs +7 -1
- package/dist/voice/speech_handle.cjs.map +1 -1
- package/dist/voice/speech_handle.d.cts +2 -0
- package/dist/voice/speech_handle.d.ts +2 -0
- package/dist/voice/speech_handle.d.ts.map +1 -1
- package/dist/voice/speech_handle.js +8 -2
- package/dist/voice/speech_handle.js.map +1 -1
- package/dist/voice/testing/run_result.cjs +66 -15
- package/dist/voice/testing/run_result.cjs.map +1 -1
- package/dist/voice/testing/run_result.d.cts +14 -3
- package/dist/voice/testing/run_result.d.ts +14 -3
- package/dist/voice/testing/run_result.d.ts.map +1 -1
- package/dist/voice/testing/run_result.js +66 -15
- package/dist/voice/testing/run_result.js.map +1 -1
- package/dist/voice/utils.cjs +47 -0
- package/dist/voice/utils.cjs.map +1 -0
- package/dist/voice/utils.d.cts +4 -0
- package/dist/voice/utils.d.ts +4 -0
- package/dist/voice/utils.d.ts.map +1 -0
- package/dist/voice/utils.js +23 -0
- package/dist/voice/utils.js.map +1 -0
- package/package.json +1 -1
- package/src/cli.ts +20 -33
- package/src/ipc/job_proc_lazy_main.ts +16 -5
- package/src/llm/chat_context.ts +35 -0
- package/src/llm/provider_format/index.ts +7 -2
- package/src/llm/provider_format/openai.test.ts +385 -1
- package/src/llm/provider_format/openai.ts +103 -0
- package/src/llm/provider_format/utils.ts +6 -4
- package/src/llm/realtime.ts +1 -0
- package/src/log.ts +5 -2
- package/src/stream/deferred_stream.ts +17 -6
- package/src/stream/index.ts +1 -0
- package/src/stream/multi_input_stream.test.ts +540 -0
- package/src/stream/multi_input_stream.ts +172 -0
- package/src/telemetry/trace_types.ts +18 -0
- package/src/utils.test.ts +87 -0
- package/src/utils.ts +52 -2
- package/src/version.ts +1 -1
- package/src/voice/agent.test.ts +140 -2
- package/src/voice/agent.ts +189 -10
- package/src/voice/agent_activity.ts +449 -286
- package/src/voice/agent_session.ts +195 -51
- package/src/voice/audio_recognition.ts +118 -38
- package/src/voice/audio_recognition_span.test.ts +261 -0
- package/src/voice/generation.ts +52 -23
- package/src/voice/index.ts +1 -1
- package/src/voice/io.ts +7 -4
- package/src/voice/recorder_io/recorder_io.ts +2 -1
- package/src/voice/room_io/_input.ts +11 -7
- package/src/voice/room_io/room_io.ts +12 -0
- package/src/voice/speech_handle.ts +9 -2
- package/src/voice/testing/run_result.ts +81 -23
- package/src/voice/utils.ts +29 -0
|
@@ -18,7 +18,8 @@ var __copyProps = (to, from, except, desc) => {
|
|
|
18
18
|
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
19
19
|
var agent_activity_exports = {};
|
|
20
20
|
__export(agent_activity_exports, {
|
|
21
|
-
AgentActivity: () => AgentActivity
|
|
21
|
+
AgentActivity: () => AgentActivity,
|
|
22
|
+
agentActivityStorage: () => agentActivityStorage
|
|
22
23
|
});
|
|
23
24
|
module.exports = __toCommonJS(agent_activity_exports);
|
|
24
25
|
var import_mutex = require("@livekit/mutex");
|
|
@@ -30,7 +31,7 @@ var import_chat_context = require("../llm/chat_context.cjs");
|
|
|
30
31
|
var import_llm = require("../llm/index.cjs");
|
|
31
32
|
var import_tool_context = require("../llm/tool_context.cjs");
|
|
32
33
|
var import_log = require("../log.cjs");
|
|
33
|
-
var
|
|
34
|
+
var import_multi_input_stream = require("../stream/multi_input_stream.cjs");
|
|
34
35
|
var import_stt = require("../stt/stt.cjs");
|
|
35
36
|
var import_telemetry = require("../telemetry/index.cjs");
|
|
36
37
|
var import_word = require("../tokenize/basic/word.cjs");
|
|
@@ -43,8 +44,11 @@ var import_audio_recognition = require("./audio_recognition.cjs");
|
|
|
43
44
|
var import_events = require("./events.cjs");
|
|
44
45
|
var import_generation = require("./generation.cjs");
|
|
45
46
|
var import_speech_handle = require("./speech_handle.cjs");
|
|
46
|
-
|
|
47
|
+
var import_utils2 = require("./utils.cjs");
|
|
48
|
+
const agentActivityStorage = new import_node_async_hooks.AsyncLocalStorage();
|
|
47
49
|
class AgentActivity {
|
|
50
|
+
agent;
|
|
51
|
+
agentSession;
|
|
48
52
|
static REPLY_TASK_CANCEL_TIMEOUT = 5e3;
|
|
49
53
|
started = false;
|
|
50
54
|
audioRecognition;
|
|
@@ -53,22 +57,29 @@ class AgentActivity {
|
|
|
53
57
|
// Maps response_id to OTEL span for metrics recording
|
|
54
58
|
turnDetectionMode;
|
|
55
59
|
logger = (0, import_log.log)();
|
|
56
|
-
|
|
60
|
+
_schedulingPaused = true;
|
|
61
|
+
_drainBlockedTasks = [];
|
|
57
62
|
_currentSpeech;
|
|
58
63
|
speechQueue;
|
|
59
64
|
// [priority, timestamp, speechHandle]
|
|
60
65
|
q_updated;
|
|
61
66
|
speechTasks = /* @__PURE__ */ new Set();
|
|
62
67
|
lock = new import_mutex.Mutex();
|
|
63
|
-
audioStream = new
|
|
68
|
+
audioStream = new import_multi_input_stream.MultiInputStream();
|
|
69
|
+
audioStreamId;
|
|
64
70
|
// default to null as None, which maps to the default provider tool choice value
|
|
65
71
|
toolChoice = null;
|
|
66
72
|
_preemptiveGeneration;
|
|
67
|
-
agent;
|
|
68
|
-
agentSession;
|
|
69
73
|
/** @internal */
|
|
70
74
|
_mainTask;
|
|
75
|
+
_onEnterTask;
|
|
76
|
+
_onExitTask;
|
|
71
77
|
_userTurnCompletedTask;
|
|
78
|
+
onRealtimeGenerationCreated = (ev) => this.onGenerationCreated(ev);
|
|
79
|
+
onRealtimeInputSpeechStarted = (ev) => this.onInputSpeechStarted(ev);
|
|
80
|
+
onRealtimeInputSpeechStopped = (ev) => this.onInputSpeechStopped(ev);
|
|
81
|
+
onRealtimeInputAudioTranscriptionCompleted = (ev) => this.onInputAudioTranscriptionCompleted(ev);
|
|
82
|
+
onModelError = (ev) => this.onError(ev);
|
|
72
83
|
constructor(agent, agentSession) {
|
|
73
84
|
this.agent = agent;
|
|
74
85
|
this.agentSession = agentSession;
|
|
@@ -79,7 +90,7 @@ class AgentActivity {
|
|
|
79
90
|
this.turnDetectionMode = typeof this.turnDetection === "string" ? this.turnDetection : void 0;
|
|
80
91
|
if (this.turnDetectionMode === "vad" && this.vad === void 0) {
|
|
81
92
|
this.logger.warn(
|
|
82
|
-
'turnDetection is set to "vad", but no VAD model is provided, ignoring the
|
|
93
|
+
'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDetection setting'
|
|
83
94
|
);
|
|
84
95
|
this.turnDetectionMode = void 0;
|
|
85
96
|
}
|
|
@@ -131,98 +142,119 @@ class AgentActivity {
|
|
|
131
142
|
async start() {
|
|
132
143
|
const unlock = await this.lock.lock();
|
|
133
144
|
try {
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
} else if (this.llm instanceof import_llm.LLM) {
|
|
174
|
-
try {
|
|
175
|
-
(0, import_generation.updateInstructions)({
|
|
176
|
-
chatCtx: this.agent._chatCtx,
|
|
177
|
-
instructions: this.agent.instructions,
|
|
178
|
-
addIfMissing: true
|
|
179
|
-
});
|
|
180
|
-
} catch (error) {
|
|
181
|
-
this.logger.error("failed to update the instructions", error);
|
|
182
|
-
}
|
|
145
|
+
await this._startSession({ spanName: "start_agent_activity", runOnEnter: true });
|
|
146
|
+
} finally {
|
|
147
|
+
unlock();
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
async resume() {
|
|
151
|
+
const unlock = await this.lock.lock();
|
|
152
|
+
try {
|
|
153
|
+
await this._startSession({ spanName: "resume_agent_activity", runOnEnter: false });
|
|
154
|
+
} finally {
|
|
155
|
+
unlock();
|
|
156
|
+
}
|
|
157
|
+
}
|
|
158
|
+
async _startSession(options) {
|
|
159
|
+
var _a;
|
|
160
|
+
const { spanName, runOnEnter } = options;
|
|
161
|
+
const startSpan = import_telemetry.tracer.startSpan({
|
|
162
|
+
name: spanName,
|
|
163
|
+
attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
|
|
164
|
+
context: import_api.ROOT_CONTEXT
|
|
165
|
+
});
|
|
166
|
+
this.agent._agentActivity = this;
|
|
167
|
+
if (this.llm instanceof import_llm.RealtimeModel) {
|
|
168
|
+
this.realtimeSession = this.llm.session();
|
|
169
|
+
this.realtimeSpans = /* @__PURE__ */ new Map();
|
|
170
|
+
this.realtimeSession.on("generation_created", this.onRealtimeGenerationCreated);
|
|
171
|
+
this.realtimeSession.on("input_speech_started", this.onRealtimeInputSpeechStarted);
|
|
172
|
+
this.realtimeSession.on("input_speech_stopped", this.onRealtimeInputSpeechStopped);
|
|
173
|
+
this.realtimeSession.on(
|
|
174
|
+
"input_audio_transcription_completed",
|
|
175
|
+
this.onRealtimeInputAudioTranscriptionCompleted
|
|
176
|
+
);
|
|
177
|
+
this.realtimeSession.on("metrics_collected", this.onMetricsCollected);
|
|
178
|
+
this.realtimeSession.on("error", this.onModelError);
|
|
179
|
+
(0, import_generation.removeInstructions)(this.agent._chatCtx);
|
|
180
|
+
try {
|
|
181
|
+
await this.realtimeSession.updateInstructions(this.agent.instructions);
|
|
182
|
+
} catch (error) {
|
|
183
|
+
this.logger.error(error, "failed to update the instructions");
|
|
183
184
|
}
|
|
184
|
-
|
|
185
|
-
this.
|
|
186
|
-
|
|
185
|
+
try {
|
|
186
|
+
await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
|
|
187
|
+
} catch (error) {
|
|
188
|
+
this.logger.error(error, "failed to update the chat context");
|
|
187
189
|
}
|
|
188
|
-
|
|
189
|
-
this.
|
|
190
|
-
|
|
190
|
+
try {
|
|
191
|
+
await this.realtimeSession.updateTools(this.tools);
|
|
192
|
+
} catch (error) {
|
|
193
|
+
this.logger.error(error, "failed to update the tools");
|
|
191
194
|
}
|
|
192
|
-
if (this.tts
|
|
193
|
-
this.
|
|
194
|
-
|
|
195
|
+
if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
|
|
196
|
+
this.logger.error(
|
|
197
|
+
"audio output is enabled but RealtimeModel has no audio modality and no TTS is set. Either enable audio modality in the RealtimeModel or set a TTS model."
|
|
198
|
+
);
|
|
195
199
|
}
|
|
196
|
-
|
|
197
|
-
|
|
200
|
+
} else if (this.llm instanceof import_llm.LLM) {
|
|
201
|
+
try {
|
|
202
|
+
(0, import_generation.updateInstructions)({
|
|
203
|
+
chatCtx: this.agent._chatCtx,
|
|
204
|
+
instructions: this.agent.instructions,
|
|
205
|
+
addIfMissing: true
|
|
206
|
+
});
|
|
207
|
+
} catch (error) {
|
|
208
|
+
this.logger.error("failed to update the instructions", error);
|
|
198
209
|
}
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
this.
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
this.
|
|
219
|
-
|
|
210
|
+
}
|
|
211
|
+
if (this.llm instanceof import_llm.LLM) {
|
|
212
|
+
this.llm.on("metrics_collected", this.onMetricsCollected);
|
|
213
|
+
this.llm.on("error", this.onModelError);
|
|
214
|
+
}
|
|
215
|
+
if (this.stt instanceof import_stt.STT) {
|
|
216
|
+
this.stt.on("metrics_collected", this.onMetricsCollected);
|
|
217
|
+
this.stt.on("error", this.onModelError);
|
|
218
|
+
}
|
|
219
|
+
if (this.tts instanceof import_tts.TTS) {
|
|
220
|
+
this.tts.on("metrics_collected", this.onMetricsCollected);
|
|
221
|
+
this.tts.on("error", this.onModelError);
|
|
222
|
+
}
|
|
223
|
+
if (this.vad instanceof import_vad.VAD) {
|
|
224
|
+
this.vad.on("metrics_collected", this.onMetricsCollected);
|
|
225
|
+
}
|
|
226
|
+
this.audioRecognition = new import_audio_recognition.AudioRecognition({
|
|
227
|
+
recognitionHooks: this,
|
|
228
|
+
// Disable stt node if stt is not provided
|
|
229
|
+
stt: this.stt ? (...args) => this.agent.sttNode(...args) : void 0,
|
|
230
|
+
vad: this.vad,
|
|
231
|
+
turnDetector: typeof this.turnDetection === "string" ? void 0 : this.turnDetection,
|
|
232
|
+
turnDetectionMode: this.turnDetectionMode,
|
|
233
|
+
minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
|
|
234
|
+
maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
|
|
235
|
+
rootSpanContext: this.agentSession.rootSpanContext,
|
|
236
|
+
sttModel: (_a = this.stt) == null ? void 0 : _a.label,
|
|
237
|
+
sttProvider: this.getSttProvider(),
|
|
238
|
+
getLinkedParticipant: () => {
|
|
239
|
+
var _a2;
|
|
240
|
+
return (_a2 = this.agentSession._roomIO) == null ? void 0 : _a2.linkedParticipant;
|
|
241
|
+
}
|
|
242
|
+
});
|
|
243
|
+
this.audioRecognition.start();
|
|
244
|
+
this.started = true;
|
|
245
|
+
this._resumeSchedulingTask();
|
|
246
|
+
if (runOnEnter) {
|
|
247
|
+
this._onEnterTask = this.createSpeechTask({
|
|
248
|
+
taskFn: () => import_telemetry.tracer.startActiveSpan(async () => this.agent.onEnter(), {
|
|
249
|
+
name: "on_enter",
|
|
250
|
+
context: import_api.trace.setSpan(import_api.ROOT_CONTEXT, startSpan),
|
|
251
|
+
attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
|
|
252
|
+
}),
|
|
253
|
+
inlineTask: true,
|
|
220
254
|
name: "AgentActivity_onEnter"
|
|
221
255
|
});
|
|
222
|
-
startSpan.end();
|
|
223
|
-
} finally {
|
|
224
|
-
unlock();
|
|
225
256
|
}
|
|
257
|
+
startSpan.end();
|
|
226
258
|
}
|
|
227
259
|
get currentSpeech() {
|
|
228
260
|
return this._currentSpeech;
|
|
@@ -233,6 +265,15 @@ class AgentActivity {
|
|
|
233
265
|
get stt() {
|
|
234
266
|
return this.agent.stt || this.agentSession.stt;
|
|
235
267
|
}
|
|
268
|
+
getSttProvider() {
|
|
269
|
+
var _a;
|
|
270
|
+
const label = (_a = this.stt) == null ? void 0 : _a.label;
|
|
271
|
+
if (!label) {
|
|
272
|
+
return void 0;
|
|
273
|
+
}
|
|
274
|
+
const [provider] = label.split("-", 1);
|
|
275
|
+
return provider || label;
|
|
276
|
+
}
|
|
236
277
|
get llm() {
|
|
237
278
|
return this.agent.llm || this.agentSession.llm;
|
|
238
279
|
}
|
|
@@ -242,8 +283,8 @@ class AgentActivity {
|
|
|
242
283
|
get tools() {
|
|
243
284
|
return this.agent.toolCtx;
|
|
244
285
|
}
|
|
245
|
-
get
|
|
246
|
-
return this.
|
|
286
|
+
get schedulingPaused() {
|
|
287
|
+
return this._schedulingPaused;
|
|
247
288
|
}
|
|
248
289
|
get realtimeLLMSession() {
|
|
249
290
|
return this.realtimeSession;
|
|
@@ -283,11 +324,9 @@ class AgentActivity {
|
|
|
283
324
|
}
|
|
284
325
|
}
|
|
285
326
|
attachAudioInput(audioStream) {
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
}
|
|
290
|
-
this.audioStream.setSource(audioStream);
|
|
327
|
+
void this.audioStream.close();
|
|
328
|
+
this.audioStream = new import_multi_input_stream.MultiInputStream();
|
|
329
|
+
this.audioStreamId = this.audioStream.addInputStream(audioStream);
|
|
291
330
|
const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
|
|
292
331
|
if (this.realtimeSession) {
|
|
293
332
|
this.realtimeSession.setInputAudioStream(realtimeAudioStream);
|
|
@@ -297,13 +336,21 @@ class AgentActivity {
|
|
|
297
336
|
}
|
|
298
337
|
}
|
|
299
338
|
detachAudioInput() {
|
|
300
|
-
this.
|
|
339
|
+
if (this.audioStreamId === void 0) {
|
|
340
|
+
return;
|
|
341
|
+
}
|
|
342
|
+
void this.audioStream.close();
|
|
343
|
+
this.audioStream = new import_multi_input_stream.MultiInputStream();
|
|
344
|
+
this.audioStreamId = void 0;
|
|
301
345
|
}
|
|
302
|
-
commitUserTurn() {
|
|
346
|
+
commitUserTurn(options = {}) {
|
|
347
|
+
const { audioDetached = false, throwIfNotReady = true } = options;
|
|
303
348
|
if (!this.audioRecognition) {
|
|
304
|
-
|
|
349
|
+
if (throwIfNotReady) {
|
|
350
|
+
throw new Error("AudioRecognition is not initialized");
|
|
351
|
+
}
|
|
352
|
+
return;
|
|
305
353
|
}
|
|
306
|
-
const audioDetached = false;
|
|
307
354
|
this.audioRecognition.commitUserTurn(audioDetached);
|
|
308
355
|
}
|
|
309
356
|
clearUserTurn() {
|
|
@@ -339,19 +386,17 @@ class AgentActivity {
|
|
|
339
386
|
})
|
|
340
387
|
);
|
|
341
388
|
const task = this.createSpeechTask({
|
|
342
|
-
|
|
343
|
-
(abortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio)
|
|
344
|
-
),
|
|
389
|
+
taskFn: (abortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio),
|
|
345
390
|
ownedSpeechHandle: handle,
|
|
346
391
|
name: "AgentActivity.say_tts"
|
|
347
392
|
});
|
|
348
|
-
task.finally(() => this.onPipelineReplyDone());
|
|
393
|
+
task.result.finally(() => this.onPipelineReplyDone());
|
|
349
394
|
this.scheduleSpeech(handle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
350
395
|
return handle;
|
|
351
396
|
}
|
|
352
397
|
// -- Metrics and errors --
|
|
353
398
|
onMetricsCollected = (ev) => {
|
|
354
|
-
const speechHandle = speechHandleStorage.getStore();
|
|
399
|
+
const speechHandle = import_agent.speechHandleStorage.getStore();
|
|
355
400
|
if (speechHandle && (ev.type === "llm_metrics" || ev.type === "tts_metrics")) {
|
|
356
401
|
ev.speechId = speechHandle.id;
|
|
357
402
|
}
|
|
@@ -435,8 +480,8 @@ class AgentActivity {
|
|
|
435
480
|
if (ev.userInitiated) {
|
|
436
481
|
return;
|
|
437
482
|
}
|
|
438
|
-
if (this.
|
|
439
|
-
this.logger.warn("skipping new realtime generation, the
|
|
483
|
+
if (this.schedulingPaused) {
|
|
484
|
+
this.logger.warn("skipping new realtime generation, the speech scheduling is not running");
|
|
440
485
|
return;
|
|
441
486
|
}
|
|
442
487
|
const handle = import_speech_handle.SpeechHandle.create({
|
|
@@ -452,9 +497,7 @@ class AgentActivity {
|
|
|
452
497
|
);
|
|
453
498
|
this.logger.info({ speech_id: handle.id }, "Creating speech handle");
|
|
454
499
|
this.createSpeechTask({
|
|
455
|
-
|
|
456
|
-
(abortController) => this.realtimeGenerationTask(handle, ev, {}, abortController)
|
|
457
|
-
),
|
|
500
|
+
taskFn: (abortController) => this.realtimeGenerationTask(handle, ev, {}, abortController),
|
|
458
501
|
ownedSpeechHandle: handle,
|
|
459
502
|
name: "AgentActivity.realtimeGeneration"
|
|
460
503
|
});
|
|
@@ -541,7 +584,7 @@ class AgentActivity {
|
|
|
541
584
|
}
|
|
542
585
|
}
|
|
543
586
|
onPreemptiveGeneration(info) {
|
|
544
|
-
if (!this.agentSession.options.preemptiveGeneration || this.
|
|
587
|
+
if (!this.agentSession.options.preemptiveGeneration || this.schedulingPaused || this._currentSpeech !== void 0 && !this._currentSpeech.interrupted || !(this.llm instanceof import_llm.LLM)) {
|
|
545
588
|
return;
|
|
546
589
|
}
|
|
547
590
|
this.cancelPreemptiveGeneration();
|
|
@@ -579,7 +622,21 @@ class AgentActivity {
|
|
|
579
622
|
}
|
|
580
623
|
}
|
|
581
624
|
createSpeechTask(options) {
|
|
582
|
-
const {
|
|
625
|
+
const { taskFn, controller, ownedSpeechHandle, inlineTask, name } = options;
|
|
626
|
+
const wrappedFn = (ctrl) => {
|
|
627
|
+
return agentActivityStorage.run(this, () => {
|
|
628
|
+
const currentTask = import_utils.Task.current();
|
|
629
|
+
if (currentTask) {
|
|
630
|
+
(0, import_agent._setActivityTaskInfo)(currentTask, { speechHandle: ownedSpeechHandle, inlineTask });
|
|
631
|
+
}
|
|
632
|
+
if (ownedSpeechHandle) {
|
|
633
|
+
return import_agent.speechHandleStorage.run(ownedSpeechHandle, () => taskFn(ctrl));
|
|
634
|
+
}
|
|
635
|
+
return taskFn(ctrl);
|
|
636
|
+
});
|
|
637
|
+
};
|
|
638
|
+
const task = import_utils.Task.from(wrappedFn, controller, name);
|
|
639
|
+
(0, import_agent._setActivityTaskInfo)(task, { speechHandle: ownedSpeechHandle, inlineTask });
|
|
583
640
|
this.speechTasks.add(task);
|
|
584
641
|
task.addDoneCallback(() => {
|
|
585
642
|
this.speechTasks.delete(task);
|
|
@@ -595,12 +652,15 @@ class AgentActivity {
|
|
|
595
652
|
task.addDoneCallback(() => {
|
|
596
653
|
this.wakeupMainTask();
|
|
597
654
|
});
|
|
598
|
-
return task
|
|
655
|
+
return task;
|
|
599
656
|
}
|
|
600
657
|
async onEndOfTurn(info) {
|
|
601
|
-
if (this.
|
|
658
|
+
if (this.schedulingPaused) {
|
|
602
659
|
this.cancelPreemptiveGeneration();
|
|
603
|
-
this.logger.warn(
|
|
660
|
+
this.logger.warn(
|
|
661
|
+
{ user_input: info.newTranscript },
|
|
662
|
+
"skipping user input, speech scheduling is paused"
|
|
663
|
+
);
|
|
604
664
|
return true;
|
|
605
665
|
}
|
|
606
666
|
if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && this.agentSession.options.minInterruptionWords > 0) {
|
|
@@ -619,7 +679,7 @@ class AgentActivity {
|
|
|
619
679
|
}
|
|
620
680
|
const oldTask = this._userTurnCompletedTask;
|
|
621
681
|
this._userTurnCompletedTask = this.createSpeechTask({
|
|
622
|
-
|
|
682
|
+
taskFn: () => this.userTurnCompleted(info, oldTask),
|
|
623
683
|
name: "AgentActivity.userTurnCompleted"
|
|
624
684
|
});
|
|
625
685
|
return true;
|
|
@@ -649,14 +709,41 @@ class AgentActivity {
|
|
|
649
709
|
await speechHandle._waitForGeneration();
|
|
650
710
|
this._currentSpeech = void 0;
|
|
651
711
|
}
|
|
652
|
-
|
|
653
|
-
|
|
712
|
+
const toWait = this.getDrainPendingSpeechTasks();
|
|
713
|
+
if (this._schedulingPaused && toWait.length === 0) {
|
|
714
|
+
this.logger.info("mainTask: scheduling paused and no more speech tasks to wait");
|
|
654
715
|
break;
|
|
655
716
|
}
|
|
656
717
|
this.q_updated = new import_utils.Future();
|
|
657
718
|
}
|
|
658
719
|
this.logger.info("AgentActivity mainTask: exiting");
|
|
659
720
|
}
|
|
721
|
+
getDrainPendingSpeechTasks() {
|
|
722
|
+
const blockedHandles = [];
|
|
723
|
+
for (const task of this._drainBlockedTasks) {
|
|
724
|
+
const info = (0, import_agent._getActivityTaskInfo)(task);
|
|
725
|
+
if (!info) {
|
|
726
|
+
this.logger.error("blocked task without activity info; skipping.");
|
|
727
|
+
continue;
|
|
728
|
+
}
|
|
729
|
+
if (!info.speechHandle) {
|
|
730
|
+
continue;
|
|
731
|
+
}
|
|
732
|
+
blockedHandles.push(info.speechHandle);
|
|
733
|
+
}
|
|
734
|
+
const toWait = [];
|
|
735
|
+
for (const task of this.speechTasks) {
|
|
736
|
+
if (this._drainBlockedTasks.includes(task)) {
|
|
737
|
+
continue;
|
|
738
|
+
}
|
|
739
|
+
const info = (0, import_agent._getActivityTaskInfo)(task);
|
|
740
|
+
if (info && info.speechHandle && blockedHandles.includes(info.speechHandle)) {
|
|
741
|
+
continue;
|
|
742
|
+
}
|
|
743
|
+
toWait.push(task);
|
|
744
|
+
}
|
|
745
|
+
return toWait;
|
|
746
|
+
}
|
|
660
747
|
wakeupMainTask() {
|
|
661
748
|
this.q_updated.resolve();
|
|
662
749
|
}
|
|
@@ -682,7 +769,7 @@ class AgentActivity {
|
|
|
682
769
|
if (this.llm === void 0) {
|
|
683
770
|
throw new Error("trying to generate reply without an LLM model");
|
|
684
771
|
}
|
|
685
|
-
const functionCall = (_a = import_agent.
|
|
772
|
+
const functionCall = (_a = import_agent.functionCallStorage.getStore()) == null ? void 0 : _a.functionCall;
|
|
686
773
|
if (toolChoice === void 0 && functionCall !== void 0) {
|
|
687
774
|
toolChoice = "none";
|
|
688
775
|
}
|
|
@@ -700,19 +787,17 @@ class AgentActivity {
|
|
|
700
787
|
this.logger.info({ speech_id: handle.id }, "Creating speech handle");
|
|
701
788
|
if (this.llm instanceof import_llm.RealtimeModel) {
|
|
702
789
|
this.createSpeechTask({
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
})
|
|
715
|
-
),
|
|
790
|
+
taskFn: (abortController) => this.realtimeReplyTask({
|
|
791
|
+
speechHandle: handle,
|
|
792
|
+
// TODO(brian): support llm.ChatMessage for the realtime model
|
|
793
|
+
userInput: userMessage == null ? void 0 : userMessage.textContent,
|
|
794
|
+
instructions,
|
|
795
|
+
modelSettings: {
|
|
796
|
+
// isGiven(toolChoice) = toolChoice !== undefined
|
|
797
|
+
toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
|
|
798
|
+
},
|
|
799
|
+
abortController
|
|
800
|
+
}),
|
|
716
801
|
ownedSpeechHandle: handle,
|
|
717
802
|
name: "AgentActivity.realtimeReply"
|
|
718
803
|
});
|
|
@@ -722,36 +807,36 @@ class AgentActivity {
|
|
|
722
807
|
${instructions}`;
|
|
723
808
|
}
|
|
724
809
|
const task = this.createSpeechTask({
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
userMessage
|
|
736
|
-
)
|
|
810
|
+
taskFn: (abortController) => this.pipelineReplyTask(
|
|
811
|
+
handle,
|
|
812
|
+
chatCtx ?? this.agent.chatCtx,
|
|
813
|
+
this.agent.toolCtx,
|
|
814
|
+
{
|
|
815
|
+
toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
|
|
816
|
+
},
|
|
817
|
+
abortController,
|
|
818
|
+
instructions,
|
|
819
|
+
userMessage
|
|
737
820
|
),
|
|
738
821
|
ownedSpeechHandle: handle,
|
|
739
822
|
name: "AgentActivity.pipelineReply"
|
|
740
823
|
});
|
|
741
|
-
task.finally(() => this.onPipelineReplyDone());
|
|
824
|
+
task.result.finally(() => this.onPipelineReplyDone());
|
|
742
825
|
}
|
|
743
826
|
if (scheduleSpeech) {
|
|
744
827
|
this.scheduleSpeech(handle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
745
828
|
}
|
|
746
829
|
return handle;
|
|
747
830
|
}
|
|
748
|
-
interrupt() {
|
|
831
|
+
interrupt(options = {}) {
|
|
749
832
|
var _a;
|
|
833
|
+
const { force = false } = options;
|
|
834
|
+
this.cancelPreemptiveGeneration();
|
|
750
835
|
const future = new import_utils.Future();
|
|
751
836
|
const currentSpeech = this._currentSpeech;
|
|
752
|
-
currentSpeech == null ? void 0 : currentSpeech.interrupt();
|
|
837
|
+
currentSpeech == null ? void 0 : currentSpeech.interrupt(force);
|
|
753
838
|
for (const [_, __, speech] of this.speechQueue) {
|
|
754
|
-
speech.interrupt();
|
|
839
|
+
speech.interrupt(force);
|
|
755
840
|
}
|
|
756
841
|
(_a = this.realtimeSession) == null ? void 0 : _a.interrupt();
|
|
757
842
|
if (currentSpeech === void 0) {
|
|
@@ -772,7 +857,7 @@ ${instructions}`;
|
|
|
772
857
|
async userTurnCompleted(info, oldTask) {
|
|
773
858
|
var _a, _b;
|
|
774
859
|
if (oldTask) {
|
|
775
|
-
await oldTask;
|
|
860
|
+
await oldTask.result;
|
|
776
861
|
}
|
|
777
862
|
if (this.llm instanceof import_llm.RealtimeModel) {
|
|
778
863
|
if (this.llm.capabilities.turnDetection) {
|
|
@@ -854,7 +939,7 @@ ${instructions}`;
|
|
|
854
939
|
}
|
|
855
940
|
async ttsTask(speechHandle, text, addToChatCtx, modelSettings, replyAbortController, audio) {
|
|
856
941
|
speechHandle._agentTurnContext = import_api.context.active();
|
|
857
|
-
speechHandleStorage.enterWith(speechHandle);
|
|
942
|
+
import_agent.speechHandleStorage.enterWith(speechHandle);
|
|
858
943
|
const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
|
|
859
944
|
const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
|
|
860
945
|
await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
|
|
@@ -959,7 +1044,7 @@ ${instructions}`;
|
|
|
959
1044
|
toolsMessages,
|
|
960
1045
|
span
|
|
961
1046
|
}) => {
|
|
962
|
-
var _a, _b
|
|
1047
|
+
var _a, _b;
|
|
963
1048
|
speechHandle._agentTurnContext = import_api.context.active();
|
|
964
1049
|
span.setAttribute(import_telemetry.traceTypes.ATTR_SPEECH_ID, speechHandle.id);
|
|
965
1050
|
if (instructions) {
|
|
@@ -968,7 +1053,11 @@ ${instructions}`;
|
|
|
968
1053
|
if (newMessage) {
|
|
969
1054
|
span.setAttribute(import_telemetry.traceTypes.ATTR_USER_INPUT, newMessage.textContent || "");
|
|
970
1055
|
}
|
|
971
|
-
|
|
1056
|
+
const localParticipant = (_a = this.agentSession._roomIO) == null ? void 0 : _a.localParticipant;
|
|
1057
|
+
if (localParticipant) {
|
|
1058
|
+
(0, import_utils2.setParticipantSpanAttributes)(span, localParticipant);
|
|
1059
|
+
}
|
|
1060
|
+
import_agent.speechHandleStorage.enterWith(speechHandle);
|
|
972
1061
|
const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
|
|
973
1062
|
const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
|
|
974
1063
|
chatCtx = chatCtx.copy();
|
|
@@ -1027,7 +1116,7 @@ ${instructions}`;
|
|
|
1027
1116
|
speechHandle._clearAuthorization();
|
|
1028
1117
|
const replyStartedAt = Date.now();
|
|
1029
1118
|
let transcriptionInput = llmOutput;
|
|
1030
|
-
if (this.useTtsAlignedTranscript && ((
|
|
1119
|
+
if (this.useTtsAlignedTranscript && ((_b = this.tts) == null ? void 0 : _b.capabilities.alignedTranscript) && ttsGenData) {
|
|
1031
1120
|
const timedTextsStream = await Promise.race([
|
|
1032
1121
|
ttsGenData.timedTextsFut.await,
|
|
1033
1122
|
(ttsTask == null ? void 0 : ttsTask.result.catch(
|
|
@@ -1101,11 +1190,11 @@ ${instructions}`;
|
|
|
1101
1190
|
for (const msg of toolsMessages) {
|
|
1102
1191
|
msg.createdAt = replyStartedAt;
|
|
1103
1192
|
}
|
|
1104
|
-
this.agent._chatCtx.insert(toolsMessages);
|
|
1105
1193
|
const toolCallOutputs = toolsMessages.filter(
|
|
1106
1194
|
(m) => m.type === "function_call_output"
|
|
1107
1195
|
);
|
|
1108
1196
|
if (toolCallOutputs.length > 0) {
|
|
1197
|
+
this.agent._chatCtx.insert(toolCallOutputs);
|
|
1109
1198
|
this.agentSession._toolItemsAdded(toolCallOutputs);
|
|
1110
1199
|
}
|
|
1111
1200
|
}
|
|
@@ -1193,45 +1282,15 @@ ${instructions}`;
|
|
|
1193
1282
|
);
|
|
1194
1283
|
return;
|
|
1195
1284
|
}
|
|
1196
|
-
const functionToolsExecutedEvent = (
|
|
1197
|
-
functionCalls: [],
|
|
1198
|
-
functionCallOutputs: []
|
|
1199
|
-
});
|
|
1200
|
-
let shouldGenerateToolReply = false;
|
|
1201
|
-
let newAgentTask = null;
|
|
1202
|
-
let ignoreTaskSwitch = false;
|
|
1203
|
-
for (const sanitizedOut of toolOutput.output) {
|
|
1204
|
-
if (sanitizedOut.toolCallOutput !== void 0) {
|
|
1205
|
-
functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
|
|
1206
|
-
functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
|
|
1207
|
-
if (sanitizedOut.replyRequired) {
|
|
1208
|
-
shouldGenerateToolReply = true;
|
|
1209
|
-
}
|
|
1210
|
-
}
|
|
1211
|
-
if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
|
|
1212
|
-
this.logger.error("expected to receive only one agent task from the tool executions");
|
|
1213
|
-
ignoreTaskSwitch = true;
|
|
1214
|
-
}
|
|
1215
|
-
newAgentTask = sanitizedOut.agentTask ?? null;
|
|
1216
|
-
this.logger.debug(
|
|
1217
|
-
{
|
|
1218
|
-
speechId: speechHandle.id,
|
|
1219
|
-
name: (_b = sanitizedOut.toolCall) == null ? void 0 : _b.name,
|
|
1220
|
-
args: sanitizedOut.toolCall.args,
|
|
1221
|
-
output: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.output,
|
|
1222
|
-
isError: (_d = sanitizedOut.toolCallOutput) == null ? void 0 : _d.isError
|
|
1223
|
-
},
|
|
1224
|
-
"Tool call execution finished"
|
|
1225
|
-
);
|
|
1226
|
-
}
|
|
1285
|
+
const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } = this.summarizeToolExecutionOutput(toolOutput, speechHandle);
|
|
1227
1286
|
this.agentSession.emit(
|
|
1228
1287
|
import_events.AgentSessionEventTypes.FunctionToolsExecuted,
|
|
1229
1288
|
functionToolsExecutedEvent
|
|
1230
1289
|
);
|
|
1231
|
-
let
|
|
1290
|
+
let schedulingPaused = this.schedulingPaused;
|
|
1232
1291
|
if (!ignoreTaskSwitch && newAgentTask !== null) {
|
|
1233
1292
|
this.agentSession.updateAgent(newAgentTask);
|
|
1234
|
-
|
|
1293
|
+
schedulingPaused = true;
|
|
1235
1294
|
}
|
|
1236
1295
|
const toolMessages = [
|
|
1237
1296
|
...functionToolsExecutedEvent.functionCalls,
|
|
@@ -1240,34 +1299,32 @@ ${instructions}`;
|
|
|
1240
1299
|
if (shouldGenerateToolReply) {
|
|
1241
1300
|
chatCtx.insert(toolMessages);
|
|
1242
1301
|
speechHandle._numSteps += 1;
|
|
1243
|
-
const respondToolChoice =
|
|
1302
|
+
const respondToolChoice = schedulingPaused || modelSettings.toolChoice === "none" ? "none" : "auto";
|
|
1244
1303
|
const toolResponseTask = this.createSpeechTask({
|
|
1245
|
-
|
|
1246
|
-
|
|
1247
|
-
|
|
1248
|
-
|
|
1249
|
-
|
|
1250
|
-
|
|
1251
|
-
|
|
1252
|
-
|
|
1253
|
-
|
|
1254
|
-
toolMessages
|
|
1255
|
-
)
|
|
1304
|
+
taskFn: () => this.pipelineReplyTask(
|
|
1305
|
+
speechHandle,
|
|
1306
|
+
chatCtx,
|
|
1307
|
+
toolCtx,
|
|
1308
|
+
{ toolChoice: respondToolChoice },
|
|
1309
|
+
replyAbortController,
|
|
1310
|
+
instructions,
|
|
1311
|
+
void 0,
|
|
1312
|
+
toolMessages
|
|
1256
1313
|
),
|
|
1257
1314
|
ownedSpeechHandle: speechHandle,
|
|
1258
1315
|
name: "AgentActivity.pipelineReply"
|
|
1259
1316
|
});
|
|
1260
|
-
toolResponseTask.finally(() => this.onPipelineReplyDone());
|
|
1317
|
+
toolResponseTask.result.finally(() => this.onPipelineReplyDone());
|
|
1261
1318
|
this.scheduleSpeech(speechHandle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
|
|
1262
1319
|
} else if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
|
|
1263
1320
|
for (const msg of toolMessages) {
|
|
1264
1321
|
msg.createdAt = replyStartedAt;
|
|
1265
1322
|
}
|
|
1266
|
-
this.agent._chatCtx.insert(toolMessages);
|
|
1267
1323
|
const toolCallOutputs = toolMessages.filter(
|
|
1268
1324
|
(m) => m.type === "function_call_output"
|
|
1269
1325
|
);
|
|
1270
1326
|
if (toolCallOutputs.length > 0) {
|
|
1327
|
+
this.agent._chatCtx.insert(toolCallOutputs);
|
|
1271
1328
|
this.agentSession._toolItemsAdded(toolCallOutputs);
|
|
1272
1329
|
}
|
|
1273
1330
|
}
|
|
@@ -1311,10 +1368,14 @@ ${instructions}`;
|
|
|
1311
1368
|
replyAbortController,
|
|
1312
1369
|
span
|
|
1313
1370
|
}) {
|
|
1314
|
-
var _a
|
|
1371
|
+
var _a;
|
|
1315
1372
|
speechHandle._agentTurnContext = import_api.context.active();
|
|
1316
1373
|
span.setAttribute(import_telemetry.traceTypes.ATTR_SPEECH_ID, speechHandle.id);
|
|
1317
|
-
|
|
1374
|
+
const localParticipant = (_a = this.agentSession._roomIO) == null ? void 0 : _a.localParticipant;
|
|
1375
|
+
if (localParticipant) {
|
|
1376
|
+
(0, import_utils2.setParticipantSpanAttributes)(span, localParticipant);
|
|
1377
|
+
}
|
|
1378
|
+
import_agent.speechHandleStorage.enterWith(speechHandle);
|
|
1318
1379
|
if (!this.realtimeSession) {
|
|
1319
1380
|
throw new Error("realtime session is not initialized");
|
|
1320
1381
|
}
|
|
@@ -1567,44 +1628,15 @@ ${instructions}`;
|
|
|
1567
1628
|
);
|
|
1568
1629
|
return;
|
|
1569
1630
|
}
|
|
1570
|
-
const functionToolsExecutedEvent = (
|
|
1571
|
-
functionCalls: [],
|
|
1572
|
-
functionCallOutputs: []
|
|
1573
|
-
});
|
|
1574
|
-
let shouldGenerateToolReply = false;
|
|
1575
|
-
let newAgentTask = null;
|
|
1576
|
-
let ignoreTaskSwitch = false;
|
|
1577
|
-
for (const sanitizedOut of toolOutput.output) {
|
|
1578
|
-
if (sanitizedOut.toolCallOutput !== void 0) {
|
|
1579
|
-
functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
|
|
1580
|
-
if (sanitizedOut.replyRequired) {
|
|
1581
|
-
shouldGenerateToolReply = true;
|
|
1582
|
-
}
|
|
1583
|
-
}
|
|
1584
|
-
if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
|
|
1585
|
-
this.logger.error("expected to receive only one agent task from the tool executions");
|
|
1586
|
-
ignoreTaskSwitch = true;
|
|
1587
|
-
}
|
|
1588
|
-
newAgentTask = sanitizedOut.agentTask ?? null;
|
|
1589
|
-
this.logger.debug(
|
|
1590
|
-
{
|
|
1591
|
-
speechId: speechHandle.id,
|
|
1592
|
-
name: (_a = sanitizedOut.toolCall) == null ? void 0 : _a.name,
|
|
1593
|
-
args: sanitizedOut.toolCall.args,
|
|
1594
|
-
output: (_b = sanitizedOut.toolCallOutput) == null ? void 0 : _b.output,
|
|
1595
|
-
isError: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.isError
|
|
1596
|
-
},
|
|
1597
|
-
"Tool call execution finished"
|
|
1598
|
-
);
|
|
1599
|
-
}
|
|
1631
|
+
const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } = this.summarizeToolExecutionOutput(toolOutput, speechHandle);
|
|
1600
1632
|
this.agentSession.emit(
|
|
1601
1633
|
import_events.AgentSessionEventTypes.FunctionToolsExecuted,
|
|
1602
1634
|
functionToolsExecutedEvent
|
|
1603
1635
|
);
|
|
1604
|
-
let
|
|
1636
|
+
let schedulingPaused = this.schedulingPaused;
|
|
1605
1637
|
if (!ignoreTaskSwitch && newAgentTask !== null) {
|
|
1606
1638
|
this.agentSession.updateAgent(newAgentTask);
|
|
1607
|
-
|
|
1639
|
+
schedulingPaused = true;
|
|
1608
1640
|
}
|
|
1609
1641
|
if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
|
|
1610
1642
|
while (this.currentSpeech || this.speechQueue.size() > 0) {
|
|
@@ -1645,20 +1677,58 @@ ${instructions}`;
|
|
|
1645
1677
|
speechHandle: replySpeechHandle
|
|
1646
1678
|
})
|
|
1647
1679
|
);
|
|
1648
|
-
const toolChoice =
|
|
1680
|
+
const toolChoice = schedulingPaused || modelSettings.toolChoice === "none" ? "none" : "auto";
|
|
1649
1681
|
this.createSpeechTask({
|
|
1650
|
-
|
|
1651
|
-
|
|
1652
|
-
|
|
1653
|
-
|
|
1654
|
-
|
|
1655
|
-
})
|
|
1656
|
-
),
|
|
1682
|
+
taskFn: (abortController) => this.realtimeReplyTask({
|
|
1683
|
+
speechHandle: replySpeechHandle,
|
|
1684
|
+
modelSettings: { toolChoice },
|
|
1685
|
+
abortController
|
|
1686
|
+
}),
|
|
1657
1687
|
ownedSpeechHandle: replySpeechHandle,
|
|
1658
1688
|
name: "AgentActivity.realtime_reply"
|
|
1659
1689
|
});
|
|
1660
1690
|
this.scheduleSpeech(replySpeechHandle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
|
|
1661
1691
|
}
|
|
1692
|
+
summarizeToolExecutionOutput(toolOutput, speechHandle) {
|
|
1693
|
+
var _a, _b, _c;
|
|
1694
|
+
const functionToolsExecutedEvent = (0, import_events.createFunctionToolsExecutedEvent)({
|
|
1695
|
+
functionCalls: [],
|
|
1696
|
+
functionCallOutputs: []
|
|
1697
|
+
});
|
|
1698
|
+
let shouldGenerateToolReply = false;
|
|
1699
|
+
let newAgentTask = null;
|
|
1700
|
+
let ignoreTaskSwitch = false;
|
|
1701
|
+
for (const sanitizedOut of toolOutput.output) {
|
|
1702
|
+
if (sanitizedOut.toolCallOutput !== void 0) {
|
|
1703
|
+
functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
|
|
1704
|
+
functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
|
|
1705
|
+
if (sanitizedOut.replyRequired) {
|
|
1706
|
+
shouldGenerateToolReply = true;
|
|
1707
|
+
}
|
|
1708
|
+
}
|
|
1709
|
+
if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
|
|
1710
|
+
this.logger.error("expected to receive only one agent task from the tool executions");
|
|
1711
|
+
ignoreTaskSwitch = true;
|
|
1712
|
+
}
|
|
1713
|
+
newAgentTask = sanitizedOut.agentTask ?? null;
|
|
1714
|
+
this.logger.debug(
|
|
1715
|
+
{
|
|
1716
|
+
speechId: speechHandle.id,
|
|
1717
|
+
name: (_a = sanitizedOut.toolCall) == null ? void 0 : _a.name,
|
|
1718
|
+
args: sanitizedOut.toolCall.args,
|
|
1719
|
+
output: (_b = sanitizedOut.toolCallOutput) == null ? void 0 : _b.output,
|
|
1720
|
+
isError: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.isError
|
|
1721
|
+
},
|
|
1722
|
+
"Tool call execution finished"
|
|
1723
|
+
);
|
|
1724
|
+
}
|
|
1725
|
+
return {
|
|
1726
|
+
functionToolsExecutedEvent,
|
|
1727
|
+
shouldGenerateToolReply,
|
|
1728
|
+
newAgentTask,
|
|
1729
|
+
ignoreTaskSwitch
|
|
1730
|
+
};
|
|
1731
|
+
}
|
|
1662
1732
|
async realtimeReplyTask({
|
|
1663
1733
|
speechHandle,
|
|
1664
1734
|
modelSettings: { toolChoice },
|
|
@@ -1666,7 +1736,7 @@ ${instructions}`;
|
|
|
1666
1736
|
instructions,
|
|
1667
1737
|
abortController
|
|
1668
1738
|
}) {
|
|
1669
|
-
speechHandleStorage.enterWith(speechHandle);
|
|
1739
|
+
import_agent.speechHandleStorage.enterWith(speechHandle);
|
|
1670
1740
|
if (!this.realtimeSession) {
|
|
1671
1741
|
throw new Error("realtime session is not available");
|
|
1672
1742
|
}
|
|
@@ -1700,13 +1770,45 @@ ${instructions}`;
|
|
|
1700
1770
|
}
|
|
1701
1771
|
}
|
|
1702
1772
|
scheduleSpeech(speechHandle, priority, force = false) {
|
|
1703
|
-
if (this.
|
|
1704
|
-
throw new Error("cannot schedule new speech, the
|
|
1773
|
+
if (this.schedulingPaused && !force) {
|
|
1774
|
+
throw new Error("cannot schedule new speech, the speech scheduling is draining/pausing");
|
|
1705
1775
|
}
|
|
1706
1776
|
this.speechQueue.push([priority, Number(process.hrtime.bigint()), speechHandle]);
|
|
1707
1777
|
speechHandle._markScheduled();
|
|
1708
1778
|
this.wakeupMainTask();
|
|
1709
1779
|
}
|
|
1780
|
+
async _pauseSchedulingTask(blockedTasks) {
|
|
1781
|
+
if (this._schedulingPaused) return;
|
|
1782
|
+
this._schedulingPaused = true;
|
|
1783
|
+
this._drainBlockedTasks = blockedTasks;
|
|
1784
|
+
this.wakeupMainTask();
|
|
1785
|
+
if (this._mainTask) {
|
|
1786
|
+
await this._mainTask.result;
|
|
1787
|
+
}
|
|
1788
|
+
}
|
|
1789
|
+
_resumeSchedulingTask() {
|
|
1790
|
+
if (!this._schedulingPaused) return;
|
|
1791
|
+
this._schedulingPaused = false;
|
|
1792
|
+
this._mainTask = import_utils.Task.from(({ signal }) => this.mainTask(signal));
|
|
1793
|
+
}
|
|
1794
|
+
async pause(options = {}) {
|
|
1795
|
+
const { blockedTasks = [] } = options;
|
|
1796
|
+
const unlock = await this.lock.lock();
|
|
1797
|
+
try {
|
|
1798
|
+
const span = import_telemetry.tracer.startSpan({
|
|
1799
|
+
name: "pause_agent_activity",
|
|
1800
|
+
attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
|
|
1801
|
+
});
|
|
1802
|
+
try {
|
|
1803
|
+
await this._pauseSchedulingTask(blockedTasks);
|
|
1804
|
+
await this._closeSessionResources();
|
|
1805
|
+
} finally {
|
|
1806
|
+
span.end();
|
|
1807
|
+
}
|
|
1808
|
+
} finally {
|
|
1809
|
+
unlock();
|
|
1810
|
+
}
|
|
1811
|
+
}
|
|
1710
1812
|
async drain() {
|
|
1711
1813
|
return import_telemetry.tracer.startActiveSpan(async (span) => this._drainImpl(span), {
|
|
1712
1814
|
name: "drain_agent_activity",
|
|
@@ -1714,72 +1816,80 @@ ${instructions}`;
|
|
|
1714
1816
|
});
|
|
1715
1817
|
}
|
|
1716
1818
|
async _drainImpl(span) {
|
|
1717
|
-
var _a;
|
|
1718
1819
|
span.setAttribute(import_telemetry.traceTypes.ATTR_AGENT_LABEL, this.agent.id);
|
|
1719
1820
|
const unlock = await this.lock.lock();
|
|
1720
1821
|
try {
|
|
1721
|
-
if (this.
|
|
1722
|
-
this.
|
|
1723
|
-
|
|
1724
|
-
|
|
1725
|
-
|
|
1726
|
-
|
|
1727
|
-
|
|
1728
|
-
task: import_utils.Task.from(() => onExitTask),
|
|
1822
|
+
if (this._schedulingPaused) return;
|
|
1823
|
+
this._onExitTask = this.createSpeechTask({
|
|
1824
|
+
taskFn: () => import_telemetry.tracer.startActiveSpan(async () => this.agent.onExit(), {
|
|
1825
|
+
name: "on_exit",
|
|
1826
|
+
attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
|
|
1827
|
+
}),
|
|
1828
|
+
inlineTask: true,
|
|
1729
1829
|
name: "AgentActivity_onExit"
|
|
1730
1830
|
});
|
|
1731
|
-
this.
|
|
1732
|
-
this.
|
|
1733
|
-
await
|
|
1831
|
+
this.cancelPreemptiveGeneration();
|
|
1832
|
+
await this._onExitTask.result;
|
|
1833
|
+
await this._pauseSchedulingTask([]);
|
|
1734
1834
|
} finally {
|
|
1735
1835
|
unlock();
|
|
1736
1836
|
}
|
|
1737
1837
|
}
|
|
1738
1838
|
async close() {
|
|
1739
|
-
var _a, _b, _c, _d;
|
|
1740
1839
|
const unlock = await this.lock.lock();
|
|
1741
1840
|
try {
|
|
1742
|
-
if (!this._draining) {
|
|
1743
|
-
this.logger.warn("task closing without draining");
|
|
1744
|
-
}
|
|
1745
1841
|
this.cancelPreemptiveGeneration();
|
|
1746
|
-
|
|
1747
|
-
|
|
1748
|
-
|
|
1749
|
-
if (this.realtimeSession) {
|
|
1750
|
-
this.realtimeSession.off("generation_created", this.onGenerationCreated);
|
|
1751
|
-
this.realtimeSession.off("input_speech_started", this.onInputSpeechStarted);
|
|
1752
|
-
this.realtimeSession.off("input_speech_stopped", this.onInputSpeechStopped);
|
|
1753
|
-
this.realtimeSession.off(
|
|
1754
|
-
"input_audio_transcription_completed",
|
|
1755
|
-
this.onInputAudioTranscriptionCompleted
|
|
1756
|
-
);
|
|
1757
|
-
this.realtimeSession.off("metrics_collected", this.onMetricsCollected);
|
|
1758
|
-
}
|
|
1759
|
-
if (this.stt instanceof import_stt.STT) {
|
|
1760
|
-
this.stt.off("metrics_collected", this.onMetricsCollected);
|
|
1761
|
-
}
|
|
1762
|
-
if (this.tts instanceof import_tts.TTS) {
|
|
1763
|
-
this.tts.off("metrics_collected", this.onMetricsCollected);
|
|
1842
|
+
await this._closeSessionResources();
|
|
1843
|
+
if (this._mainTask) {
|
|
1844
|
+
await this._mainTask.cancelAndWait();
|
|
1764
1845
|
}
|
|
1765
|
-
|
|
1766
|
-
this.vad.off("metrics_collected", this.onMetricsCollected);
|
|
1767
|
-
}
|
|
1768
|
-
this.detachAudioInput();
|
|
1769
|
-
(_a = this.realtimeSpans) == null ? void 0 : _a.clear();
|
|
1770
|
-
await ((_b = this.realtimeSession) == null ? void 0 : _b.close());
|
|
1771
|
-
await ((_c = this.audioRecognition) == null ? void 0 : _c.close());
|
|
1772
|
-
await ((_d = this._mainTask) == null ? void 0 : _d.cancelAndWait());
|
|
1846
|
+
this.agent._agentActivity = void 0;
|
|
1773
1847
|
} finally {
|
|
1774
1848
|
unlock();
|
|
1775
1849
|
}
|
|
1776
1850
|
}
|
|
1851
|
+
async _closeSessionResources() {
|
|
1852
|
+
var _a, _b, _c;
|
|
1853
|
+
if (this.llm instanceof import_llm.LLM) {
|
|
1854
|
+
this.llm.off("metrics_collected", this.onMetricsCollected);
|
|
1855
|
+
this.llm.off("error", this.onModelError);
|
|
1856
|
+
}
|
|
1857
|
+
if (this.realtimeSession) {
|
|
1858
|
+
this.realtimeSession.off("generation_created", this.onRealtimeGenerationCreated);
|
|
1859
|
+
this.realtimeSession.off("input_speech_started", this.onRealtimeInputSpeechStarted);
|
|
1860
|
+
this.realtimeSession.off("input_speech_stopped", this.onRealtimeInputSpeechStopped);
|
|
1861
|
+
this.realtimeSession.off(
|
|
1862
|
+
"input_audio_transcription_completed",
|
|
1863
|
+
this.onRealtimeInputAudioTranscriptionCompleted
|
|
1864
|
+
);
|
|
1865
|
+
this.realtimeSession.off("metrics_collected", this.onMetricsCollected);
|
|
1866
|
+
this.realtimeSession.off("error", this.onModelError);
|
|
1867
|
+
}
|
|
1868
|
+
if (this.stt instanceof import_stt.STT) {
|
|
1869
|
+
this.stt.off("metrics_collected", this.onMetricsCollected);
|
|
1870
|
+
this.stt.off("error", this.onModelError);
|
|
1871
|
+
}
|
|
1872
|
+
if (this.tts instanceof import_tts.TTS) {
|
|
1873
|
+
this.tts.off("metrics_collected", this.onMetricsCollected);
|
|
1874
|
+
this.tts.off("error", this.onModelError);
|
|
1875
|
+
}
|
|
1876
|
+
if (this.vad instanceof import_vad.VAD) {
|
|
1877
|
+
this.vad.off("metrics_collected", this.onMetricsCollected);
|
|
1878
|
+
}
|
|
1879
|
+
this.detachAudioInput();
|
|
1880
|
+
(_a = this.realtimeSpans) == null ? void 0 : _a.clear();
|
|
1881
|
+
await ((_b = this.realtimeSession) == null ? void 0 : _b.close());
|
|
1882
|
+
await ((_c = this.audioRecognition) == null ? void 0 : _c.close());
|
|
1883
|
+
this.realtimeSession = void 0;
|
|
1884
|
+
this.audioRecognition = void 0;
|
|
1885
|
+
}
|
|
1777
1886
|
}
|
|
1778
1887
|
function toOaiToolChoice(toolChoice) {
|
|
1779
1888
|
return toolChoice !== null ? toolChoice : void 0;
|
|
1780
1889
|
}
|
|
1781
1890
|
// Annotate the CommonJS export names for ESM import in node:
|
|
1782
1891
|
0 && (module.exports = {
|
|
1783
|
-
AgentActivity
|
|
1892
|
+
AgentActivity,
|
|
1893
|
+
agentActivityStorage
|
|
1784
1894
|
});
|
|
1785
1895
|
//# sourceMappingURL=agent_activity.cjs.map
|