@livekit/agents 1.0.46 → 1.0.48
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/beta/index.cjs +29 -0
- package/dist/beta/index.cjs.map +1 -0
- package/dist/beta/index.d.cts +2 -0
- package/dist/beta/index.d.ts +2 -0
- package/dist/beta/index.d.ts.map +1 -0
- package/dist/beta/index.js +7 -0
- package/dist/beta/index.js.map +1 -0
- package/dist/beta/workflows/index.cjs +29 -0
- package/dist/beta/workflows/index.cjs.map +1 -0
- package/dist/beta/workflows/index.d.cts +2 -0
- package/dist/beta/workflows/index.d.ts +2 -0
- package/dist/beta/workflows/index.d.ts.map +1 -0
- package/dist/beta/workflows/index.js +7 -0
- package/dist/beta/workflows/index.js.map +1 -0
- package/dist/beta/workflows/task_group.cjs +162 -0
- package/dist/beta/workflows/task_group.cjs.map +1 -0
- package/dist/beta/workflows/task_group.d.cts +32 -0
- package/dist/beta/workflows/task_group.d.ts +32 -0
- package/dist/beta/workflows/task_group.d.ts.map +1 -0
- package/dist/beta/workflows/task_group.js +138 -0
- package/dist/beta/workflows/task_group.js.map +1 -0
- package/dist/cli.cjs +14 -20
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +14 -20
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +3 -0
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +2 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -0
- package/dist/index.js.map +1 -1
- package/dist/inference/api_protos.d.cts +59 -59
- package/dist/inference/api_protos.d.ts +59 -59
- package/dist/ipc/job_proc_lazy_main.cjs +14 -5
- package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/job_proc_lazy_main.js +14 -5
- package/dist/ipc/job_proc_lazy_main.js.map +1 -1
- package/dist/llm/chat_context.cjs +108 -1
- package/dist/llm/chat_context.cjs.map +1 -1
- package/dist/llm/chat_context.d.cts +14 -1
- package/dist/llm/chat_context.d.ts +14 -1
- package/dist/llm/chat_context.d.ts.map +1 -1
- package/dist/llm/chat_context.js +108 -1
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/chat_context.test.cjs +43 -0
- package/dist/llm/chat_context.test.cjs.map +1 -1
- package/dist/llm/chat_context.test.js +43 -0
- package/dist/llm/chat_context.test.js.map +1 -1
- package/dist/llm/index.cjs +2 -0
- package/dist/llm/index.cjs.map +1 -1
- package/dist/llm/index.d.cts +1 -1
- package/dist/llm/index.d.ts +1 -1
- package/dist/llm/index.d.ts.map +1 -1
- package/dist/llm/index.js +3 -1
- package/dist/llm/index.js.map +1 -1
- package/dist/llm/provider_format/index.cjs +2 -0
- package/dist/llm/provider_format/index.cjs.map +1 -1
- package/dist/llm/provider_format/index.d.cts +2 -2
- package/dist/llm/provider_format/index.d.ts +2 -2
- package/dist/llm/provider_format/index.d.ts.map +1 -1
- package/dist/llm/provider_format/index.js +6 -1
- package/dist/llm/provider_format/index.js.map +1 -1
- package/dist/llm/provider_format/openai.cjs +82 -2
- package/dist/llm/provider_format/openai.cjs.map +1 -1
- package/dist/llm/provider_format/openai.d.cts +1 -0
- package/dist/llm/provider_format/openai.d.ts +1 -0
- package/dist/llm/provider_format/openai.d.ts.map +1 -1
- package/dist/llm/provider_format/openai.js +80 -1
- package/dist/llm/provider_format/openai.js.map +1 -1
- package/dist/llm/provider_format/openai.test.cjs +326 -0
- package/dist/llm/provider_format/openai.test.cjs.map +1 -1
- package/dist/llm/provider_format/openai.test.js +327 -1
- package/dist/llm/provider_format/openai.test.js.map +1 -1
- package/dist/llm/provider_format/utils.cjs +4 -3
- package/dist/llm/provider_format/utils.cjs.map +1 -1
- package/dist/llm/provider_format/utils.d.ts.map +1 -1
- package/dist/llm/provider_format/utils.js +4 -3
- package/dist/llm/provider_format/utils.js.map +1 -1
- package/dist/llm/realtime.cjs.map +1 -1
- package/dist/llm/realtime.d.cts +1 -0
- package/dist/llm/realtime.d.ts +1 -0
- package/dist/llm/realtime.d.ts.map +1 -1
- package/dist/llm/realtime.js.map +1 -1
- package/dist/llm/tool_context.cjs +7 -0
- package/dist/llm/tool_context.cjs.map +1 -1
- package/dist/llm/tool_context.d.cts +10 -2
- package/dist/llm/tool_context.d.ts +10 -2
- package/dist/llm/tool_context.d.ts.map +1 -1
- package/dist/llm/tool_context.js +6 -0
- package/dist/llm/tool_context.js.map +1 -1
- package/dist/log.cjs +5 -2
- package/dist/log.cjs.map +1 -1
- package/dist/log.d.ts.map +1 -1
- package/dist/log.js +5 -2
- package/dist/log.js.map +1 -1
- package/dist/stream/deferred_stream.cjs +15 -6
- package/dist/stream/deferred_stream.cjs.map +1 -1
- package/dist/stream/deferred_stream.d.ts.map +1 -1
- package/dist/stream/deferred_stream.js +15 -6
- package/dist/stream/deferred_stream.js.map +1 -1
- package/dist/utils.cjs +32 -2
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +7 -0
- package/dist/utils.d.ts +7 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +32 -2
- package/dist/utils.js.map +1 -1
- package/dist/utils.test.cjs +71 -0
- package/dist/utils.test.cjs.map +1 -1
- package/dist/utils.test.js +71 -0
- package/dist/utils.test.js.map +1 -1
- package/dist/version.cjs +1 -1
- package/dist/version.cjs.map +1 -1
- package/dist/version.d.cts +1 -1
- package/dist/version.d.ts +1 -1
- package/dist/version.d.ts.map +1 -1
- package/dist/version.js +1 -1
- package/dist/version.js.map +1 -1
- package/dist/voice/agent.cjs +153 -12
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.cts +30 -4
- package/dist/voice/agent.d.ts +30 -4
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +149 -11
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent.test.cjs +120 -0
- package/dist/voice/agent.test.cjs.map +1 -1
- package/dist/voice/agent.test.js +122 -2
- package/dist/voice/agent.test.js.map +1 -1
- package/dist/voice/agent_activity.cjs +406 -298
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +41 -7
- package/dist/voice/agent_activity.d.ts +41 -7
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +407 -294
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +140 -40
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +19 -7
- package/dist/voice/agent_session.d.ts +19 -7
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +137 -37
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +4 -0
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +4 -0
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/generation.cjs +39 -19
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +44 -20
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/index.cjs +2 -0
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +1 -1
- package/dist/voice/index.d.ts +1 -1
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js +2 -1
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/room_io/room_io.cjs +11 -2
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js +12 -3
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/speech_handle.cjs +7 -1
- package/dist/voice/speech_handle.cjs.map +1 -1
- package/dist/voice/speech_handle.d.cts +2 -0
- package/dist/voice/speech_handle.d.ts +2 -0
- package/dist/voice/speech_handle.d.ts.map +1 -1
- package/dist/voice/speech_handle.js +8 -2
- package/dist/voice/speech_handle.js.map +1 -1
- package/dist/voice/testing/fake_llm.cjs +127 -0
- package/dist/voice/testing/fake_llm.cjs.map +1 -0
- package/dist/voice/testing/fake_llm.d.cts +30 -0
- package/dist/voice/testing/fake_llm.d.ts +30 -0
- package/dist/voice/testing/fake_llm.d.ts.map +1 -0
- package/dist/voice/testing/fake_llm.js +103 -0
- package/dist/voice/testing/fake_llm.js.map +1 -0
- package/dist/voice/testing/index.cjs +3 -0
- package/dist/voice/testing/index.cjs.map +1 -1
- package/dist/voice/testing/index.d.cts +1 -0
- package/dist/voice/testing/index.d.ts +1 -0
- package/dist/voice/testing/index.d.ts.map +1 -1
- package/dist/voice/testing/index.js +2 -0
- package/dist/voice/testing/index.js.map +1 -1
- package/dist/voice/testing/run_result.cjs +66 -15
- package/dist/voice/testing/run_result.cjs.map +1 -1
- package/dist/voice/testing/run_result.d.cts +14 -3
- package/dist/voice/testing/run_result.d.ts +14 -3
- package/dist/voice/testing/run_result.d.ts.map +1 -1
- package/dist/voice/testing/run_result.js +66 -15
- package/dist/voice/testing/run_result.js.map +1 -1
- package/package.json +1 -1
- package/src/beta/index.ts +9 -0
- package/src/beta/workflows/index.ts +9 -0
- package/src/beta/workflows/task_group.ts +194 -0
- package/src/cli.ts +20 -33
- package/src/index.ts +2 -1
- package/src/ipc/job_proc_lazy_main.ts +16 -5
- package/src/llm/chat_context.test.ts +48 -0
- package/src/llm/chat_context.ts +158 -0
- package/src/llm/index.ts +1 -0
- package/src/llm/provider_format/index.ts +7 -2
- package/src/llm/provider_format/openai.test.ts +385 -1
- package/src/llm/provider_format/openai.ts +103 -0
- package/src/llm/provider_format/utils.ts +6 -4
- package/src/llm/realtime.ts +1 -0
- package/src/llm/tool_context.ts +14 -0
- package/src/log.ts +5 -2
- package/src/stream/deferred_stream.ts +17 -6
- package/src/utils.test.ts +87 -0
- package/src/utils.ts +41 -2
- package/src/version.ts +1 -1
- package/src/voice/agent.test.ts +140 -2
- package/src/voice/agent.ts +200 -10
- package/src/voice/agent_activity.ts +466 -290
- package/src/voice/agent_session.ts +178 -40
- package/src/voice/audio_recognition.ts +4 -0
- package/src/voice/generation.ts +52 -23
- package/src/voice/index.ts +1 -1
- package/src/voice/room_io/room_io.ts +14 -3
- package/src/voice/speech_handle.ts +9 -2
- package/src/voice/testing/fake_llm.ts +138 -0
- package/src/voice/testing/index.ts +2 -0
- package/src/voice/testing/run_result.ts +81 -23
|
@@ -23,6 +23,7 @@ import {
|
|
|
23
23
|
type RealtimeSession,
|
|
24
24
|
type ToolChoice,
|
|
25
25
|
type ToolContext,
|
|
26
|
+
ToolFlag,
|
|
26
27
|
} from '../llm/index.js';
|
|
27
28
|
import type { LLMError } from '../llm/llm.js';
|
|
28
29
|
import { isSameToolChoice, isSameToolContext } from '../llm/tool_context.js';
|
|
@@ -35,7 +36,7 @@ import type {
|
|
|
35
36
|
TTSMetrics,
|
|
36
37
|
VADMetrics,
|
|
37
38
|
} from '../metrics/base.js';
|
|
38
|
-
import {
|
|
39
|
+
import { MultiInputStream } from '../stream/multi_input_stream.js';
|
|
39
40
|
import { STT, type STTError, type SpeechEvent } from '../stt/stt.js';
|
|
40
41
|
import { recordRealtimeMetrics, traceTypes, tracer } from '../telemetry/index.js';
|
|
41
42
|
import { splitWords } from '../tokenize/basic/word.js';
|
|
@@ -43,7 +44,13 @@ import { TTS, type TTSError } from '../tts/tts.js';
|
|
|
43
44
|
import { Future, Task, cancelAndWait, waitFor } from '../utils.js';
|
|
44
45
|
import { VAD, type VADEvent } from '../vad.js';
|
|
45
46
|
import type { Agent, ModelSettings } from './agent.js';
|
|
46
|
-
import {
|
|
47
|
+
import {
|
|
48
|
+
StopResponse,
|
|
49
|
+
_getActivityTaskInfo,
|
|
50
|
+
_setActivityTaskInfo,
|
|
51
|
+
functionCallStorage,
|
|
52
|
+
speechHandleStorage,
|
|
53
|
+
} from './agent.js';
|
|
47
54
|
import { type AgentSession, type TurnDetectionMode } from './agent_session.js';
|
|
48
55
|
import {
|
|
49
56
|
AudioRecognition,
|
|
@@ -60,7 +67,7 @@ import {
|
|
|
60
67
|
createSpeechCreatedEvent,
|
|
61
68
|
createUserInputTranscribedEvent,
|
|
62
69
|
} from './events.js';
|
|
63
|
-
import type { ToolExecutionOutput, _TTSGenerationData } from './generation.js';
|
|
70
|
+
import type { ToolExecutionOutput, ToolOutput, _TTSGenerationData } from './generation.js';
|
|
64
71
|
import {
|
|
65
72
|
type _AudioOut,
|
|
66
73
|
type _TextOut,
|
|
@@ -76,7 +83,13 @@ import type { TimedString } from './io.js';
|
|
|
76
83
|
import { SpeechHandle } from './speech_handle.js';
|
|
77
84
|
import { setParticipantSpanAttributes } from './utils.js';
|
|
78
85
|
|
|
79
|
-
const
|
|
86
|
+
export const agentActivityStorage = new AsyncLocalStorage<AgentActivity>();
|
|
87
|
+
export const onEnterStorage = new AsyncLocalStorage<OnEnterData>();
|
|
88
|
+
|
|
89
|
+
interface OnEnterData {
|
|
90
|
+
session: AgentSession;
|
|
91
|
+
agent: Agent;
|
|
92
|
+
}
|
|
80
93
|
|
|
81
94
|
interface PreemptiveGeneration {
|
|
82
95
|
speechHandle: SpeechHandle;
|
|
@@ -89,31 +102,47 @@ interface PreemptiveGeneration {
|
|
|
89
102
|
}
|
|
90
103
|
|
|
91
104
|
export class AgentActivity implements RecognitionHooks {
|
|
105
|
+
agent: Agent;
|
|
106
|
+
agentSession: AgentSession;
|
|
107
|
+
|
|
92
108
|
private static readonly REPLY_TASK_CANCEL_TIMEOUT = 5000;
|
|
109
|
+
|
|
93
110
|
private started = false;
|
|
94
111
|
private audioRecognition?: AudioRecognition;
|
|
95
112
|
private realtimeSession?: RealtimeSession;
|
|
96
113
|
private realtimeSpans?: Map<string, Span>; // Maps response_id to OTEL span for metrics recording
|
|
97
114
|
private turnDetectionMode?: Exclude<TurnDetectionMode, _TurnDetector>;
|
|
98
115
|
private logger = log();
|
|
99
|
-
private
|
|
116
|
+
private _schedulingPaused = true;
|
|
117
|
+
private _drainBlockedTasks: Task<any>[] = [];
|
|
100
118
|
private _currentSpeech?: SpeechHandle;
|
|
101
119
|
private speechQueue: Heap<[number, number, SpeechHandle]>; // [priority, timestamp, speechHandle]
|
|
102
120
|
private q_updated: Future;
|
|
103
121
|
private speechTasks: Set<Task<void>> = new Set();
|
|
104
122
|
private lock = new Mutex();
|
|
105
|
-
private audioStream = new
|
|
123
|
+
private audioStream = new MultiInputStream<AudioFrame>();
|
|
124
|
+
private audioStreamId?: string;
|
|
125
|
+
|
|
106
126
|
// default to null as None, which maps to the default provider tool choice value
|
|
107
127
|
private toolChoice: ToolChoice | null = null;
|
|
108
128
|
private _preemptiveGeneration?: PreemptiveGeneration;
|
|
109
129
|
|
|
110
|
-
agent: Agent;
|
|
111
|
-
agentSession: AgentSession;
|
|
112
|
-
|
|
113
130
|
/** @internal */
|
|
114
131
|
_mainTask?: Task<void>;
|
|
115
|
-
|
|
116
|
-
|
|
132
|
+
_onEnterTask?: Task<void>;
|
|
133
|
+
_onExitTask?: Task<void>;
|
|
134
|
+
_userTurnCompletedTask?: Task<void>;
|
|
135
|
+
|
|
136
|
+
private readonly onRealtimeGenerationCreated = (ev: GenerationCreatedEvent) =>
|
|
137
|
+
this.onGenerationCreated(ev);
|
|
138
|
+
private readonly onRealtimeInputSpeechStarted = (ev: InputSpeechStartedEvent) =>
|
|
139
|
+
this.onInputSpeechStarted(ev);
|
|
140
|
+
private readonly onRealtimeInputSpeechStopped = (ev: InputSpeechStoppedEvent) =>
|
|
141
|
+
this.onInputSpeechStopped(ev);
|
|
142
|
+
private readonly onRealtimeInputAudioTranscriptionCompleted = (ev: InputTranscriptionCompleted) =>
|
|
143
|
+
this.onInputAudioTranscriptionCompleted(ev);
|
|
144
|
+
private readonly onModelError = (ev: RealtimeModelError | STTError | TTSError | LLMError) =>
|
|
145
|
+
this.onError(ev);
|
|
117
146
|
constructor(agent: Agent, agentSession: AgentSession) {
|
|
118
147
|
this.agent = agent;
|
|
119
148
|
this.agentSession = agentSession;
|
|
@@ -133,7 +162,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
133
162
|
|
|
134
163
|
if (this.turnDetectionMode === 'vad' && this.vad === undefined) {
|
|
135
164
|
this.logger.warn(
|
|
136
|
-
'turnDetection is set to "vad", but no VAD model is provided, ignoring the
|
|
165
|
+
'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDetection setting',
|
|
137
166
|
);
|
|
138
167
|
this.turnDetectionMode = undefined;
|
|
139
168
|
}
|
|
@@ -211,120 +240,142 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
211
240
|
async start(): Promise<void> {
|
|
212
241
|
const unlock = await this.lock.lock();
|
|
213
242
|
try {
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
});
|
|
243
|
+
await this._startSession({ spanName: 'start_agent_activity', runOnEnter: true });
|
|
244
|
+
} finally {
|
|
245
|
+
unlock();
|
|
246
|
+
}
|
|
247
|
+
}
|
|
220
248
|
|
|
221
|
-
|
|
249
|
+
async resume(): Promise<void> {
|
|
250
|
+
const unlock = await this.lock.lock();
|
|
251
|
+
try {
|
|
252
|
+
await this._startSession({ spanName: 'resume_agent_activity', runOnEnter: false });
|
|
253
|
+
} finally {
|
|
254
|
+
unlock();
|
|
255
|
+
}
|
|
256
|
+
}
|
|
222
257
|
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
this.realtimeSession.on('error', (ev) => this.onError(ev));
|
|
234
|
-
|
|
235
|
-
removeInstructions(this.agent._chatCtx);
|
|
236
|
-
try {
|
|
237
|
-
await this.realtimeSession.updateInstructions(this.agent.instructions);
|
|
238
|
-
} catch (error) {
|
|
239
|
-
this.logger.error(error, 'failed to update the instructions');
|
|
240
|
-
}
|
|
258
|
+
private async _startSession(options: {
|
|
259
|
+
spanName: 'start_agent_activity' | 'resume_agent_activity';
|
|
260
|
+
runOnEnter: boolean;
|
|
261
|
+
}): Promise<void> {
|
|
262
|
+
const { spanName, runOnEnter } = options;
|
|
263
|
+
const startSpan = tracer.startSpan({
|
|
264
|
+
name: spanName,
|
|
265
|
+
attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
|
|
266
|
+
context: ROOT_CONTEXT,
|
|
267
|
+
});
|
|
241
268
|
|
|
242
|
-
|
|
243
|
-
await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
|
|
244
|
-
} catch (error) {
|
|
245
|
-
this.logger.error(error, 'failed to update the chat context');
|
|
246
|
-
}
|
|
269
|
+
this.agent._agentActivity = this;
|
|
247
270
|
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
271
|
+
if (this.llm instanceof RealtimeModel) {
|
|
272
|
+
this.realtimeSession = this.llm.session();
|
|
273
|
+
this.realtimeSpans = new Map<string, Span>();
|
|
274
|
+
this.realtimeSession.on('generation_created', this.onRealtimeGenerationCreated);
|
|
275
|
+
this.realtimeSession.on('input_speech_started', this.onRealtimeInputSpeechStarted);
|
|
276
|
+
this.realtimeSession.on('input_speech_stopped', this.onRealtimeInputSpeechStopped);
|
|
277
|
+
this.realtimeSession.on(
|
|
278
|
+
'input_audio_transcription_completed',
|
|
279
|
+
this.onRealtimeInputAudioTranscriptionCompleted,
|
|
280
|
+
);
|
|
281
|
+
this.realtimeSession.on('metrics_collected', this.onMetricsCollected);
|
|
282
|
+
this.realtimeSession.on('error', this.onModelError);
|
|
253
283
|
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
);
|
|
260
|
-
}
|
|
261
|
-
} else if (this.llm instanceof LLM) {
|
|
262
|
-
try {
|
|
263
|
-
updateInstructions({
|
|
264
|
-
chatCtx: this.agent._chatCtx,
|
|
265
|
-
instructions: this.agent.instructions,
|
|
266
|
-
addIfMissing: true,
|
|
267
|
-
});
|
|
268
|
-
} catch (error) {
|
|
269
|
-
this.logger.error('failed to update the instructions', error);
|
|
270
|
-
}
|
|
284
|
+
removeInstructions(this.agent._chatCtx);
|
|
285
|
+
try {
|
|
286
|
+
await this.realtimeSession.updateInstructions(this.agent.instructions);
|
|
287
|
+
} catch (error) {
|
|
288
|
+
this.logger.error(error, 'failed to update the instructions');
|
|
271
289
|
}
|
|
272
290
|
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
this.
|
|
291
|
+
try {
|
|
292
|
+
await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
|
|
293
|
+
} catch (error) {
|
|
294
|
+
this.logger.error(error, 'failed to update the chat context');
|
|
277
295
|
}
|
|
278
296
|
|
|
279
|
-
|
|
280
|
-
this.
|
|
281
|
-
|
|
297
|
+
try {
|
|
298
|
+
await this.realtimeSession.updateTools(this.tools);
|
|
299
|
+
} catch (error) {
|
|
300
|
+
this.logger.error(error, 'failed to update the tools');
|
|
282
301
|
}
|
|
283
302
|
|
|
284
|
-
if (this.tts
|
|
285
|
-
this.
|
|
286
|
-
|
|
303
|
+
if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
|
|
304
|
+
this.logger.error(
|
|
305
|
+
'audio output is enabled but RealtimeModel has no audio modality ' +
|
|
306
|
+
'and no TTS is set. Either enable audio modality in the RealtimeModel ' +
|
|
307
|
+
'or set a TTS model.',
|
|
308
|
+
);
|
|
287
309
|
}
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
310
|
+
} else if (this.llm instanceof LLM) {
|
|
311
|
+
try {
|
|
312
|
+
updateInstructions({
|
|
313
|
+
chatCtx: this.agent._chatCtx,
|
|
314
|
+
instructions: this.agent.instructions,
|
|
315
|
+
addIfMissing: true,
|
|
316
|
+
});
|
|
317
|
+
} catch (error) {
|
|
318
|
+
this.logger.error('failed to update the instructions', error);
|
|
291
319
|
}
|
|
320
|
+
}
|
|
292
321
|
|
|
293
|
-
|
|
294
|
-
recognitionHooks: this,
|
|
295
|
-
// Disable stt node if stt is not provided
|
|
296
|
-
stt: this.stt ? (...args) => this.agent.sttNode(...args) : undefined,
|
|
297
|
-
vad: this.vad,
|
|
298
|
-
turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
|
|
299
|
-
turnDetectionMode: this.turnDetectionMode,
|
|
300
|
-
minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
|
|
301
|
-
maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
|
|
302
|
-
rootSpanContext: this.agentSession.rootSpanContext,
|
|
303
|
-
sttModel: this.stt?.label,
|
|
304
|
-
sttProvider: this.getSttProvider(),
|
|
305
|
-
getLinkedParticipant: () => this.agentSession._roomIO?.linkedParticipant,
|
|
306
|
-
});
|
|
307
|
-
this.audioRecognition.start();
|
|
308
|
-
this.started = true;
|
|
322
|
+
// TODO(parity): Record initial AgentConfigUpdate in chat context
|
|
309
323
|
|
|
310
|
-
|
|
324
|
+
// metrics and error handling
|
|
325
|
+
if (this.llm instanceof LLM) {
|
|
326
|
+
this.llm.on('metrics_collected', this.onMetricsCollected);
|
|
327
|
+
this.llm.on('error', this.onModelError);
|
|
328
|
+
}
|
|
311
329
|
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
|
|
317
|
-
});
|
|
330
|
+
if (this.stt instanceof STT) {
|
|
331
|
+
this.stt.on('metrics_collected', this.onMetricsCollected);
|
|
332
|
+
this.stt.on('error', this.onModelError);
|
|
333
|
+
}
|
|
318
334
|
|
|
319
|
-
|
|
320
|
-
|
|
335
|
+
if (this.tts instanceof TTS) {
|
|
336
|
+
this.tts.on('metrics_collected', this.onMetricsCollected);
|
|
337
|
+
this.tts.on('error', this.onModelError);
|
|
338
|
+
}
|
|
339
|
+
|
|
340
|
+
if (this.vad instanceof VAD) {
|
|
341
|
+
this.vad.on('metrics_collected', this.onMetricsCollected);
|
|
342
|
+
}
|
|
343
|
+
|
|
344
|
+
this.audioRecognition = new AudioRecognition({
|
|
345
|
+
recognitionHooks: this,
|
|
346
|
+
// Disable stt node if stt is not provided
|
|
347
|
+
stt: this.stt ? (...args) => this.agent.sttNode(...args) : undefined,
|
|
348
|
+
vad: this.vad,
|
|
349
|
+
turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
|
|
350
|
+
turnDetectionMode: this.turnDetectionMode,
|
|
351
|
+
minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
|
|
352
|
+
maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
|
|
353
|
+
rootSpanContext: this.agentSession.rootSpanContext,
|
|
354
|
+
sttModel: this.stt?.label,
|
|
355
|
+
sttProvider: this.getSttProvider(),
|
|
356
|
+
getLinkedParticipant: () => this.agentSession._roomIO?.linkedParticipant,
|
|
357
|
+
});
|
|
358
|
+
this.audioRecognition.start();
|
|
359
|
+
this.started = true;
|
|
360
|
+
|
|
361
|
+
this._resumeSchedulingTask();
|
|
362
|
+
|
|
363
|
+
if (runOnEnter) {
|
|
364
|
+
this._onEnterTask = this.createSpeechTask({
|
|
365
|
+
taskFn: () =>
|
|
366
|
+
onEnterStorage.run({ session: this.agentSession, agent: this.agent }, () =>
|
|
367
|
+
tracer.startActiveSpan(async () => this.agent.onEnter(), {
|
|
368
|
+
name: 'on_enter',
|
|
369
|
+
context: trace.setSpan(ROOT_CONTEXT, startSpan),
|
|
370
|
+
attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
|
|
371
|
+
}),
|
|
372
|
+
),
|
|
373
|
+
inlineTask: true,
|
|
321
374
|
name: 'AgentActivity_onEnter',
|
|
322
375
|
});
|
|
323
|
-
|
|
324
|
-
startSpan.end();
|
|
325
|
-
} finally {
|
|
326
|
-
unlock();
|
|
327
376
|
}
|
|
377
|
+
|
|
378
|
+
startSpan.end();
|
|
328
379
|
}
|
|
329
380
|
|
|
330
381
|
get currentSpeech(): SpeechHandle | undefined {
|
|
@@ -362,8 +413,8 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
362
413
|
return this.agent.toolCtx;
|
|
363
414
|
}
|
|
364
415
|
|
|
365
|
-
get
|
|
366
|
-
return this.
|
|
416
|
+
get schedulingPaused(): boolean {
|
|
417
|
+
return this._schedulingPaused;
|
|
367
418
|
}
|
|
368
419
|
|
|
369
420
|
get realtimeLLMSession(): RealtimeSession | undefined {
|
|
@@ -406,6 +457,20 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
406
457
|
}
|
|
407
458
|
}
|
|
408
459
|
|
|
460
|
+
// TODO: Add when AgentConfigUpdate is ported to ChatContext.
|
|
461
|
+
async updateTools(tools: ToolContext): Promise<void> {
|
|
462
|
+
this.agent._tools = { ...tools };
|
|
463
|
+
|
|
464
|
+
if (this.realtimeSession) {
|
|
465
|
+
await this.realtimeSession.updateTools(tools);
|
|
466
|
+
}
|
|
467
|
+
|
|
468
|
+
if (this.llm instanceof LLM) {
|
|
469
|
+
// for realtime LLM, we assume the server will remove invalid tool messages
|
|
470
|
+
await this.updateChatCtx(this.agent._chatCtx.copy({ toolCtx: tools }));
|
|
471
|
+
}
|
|
472
|
+
}
|
|
473
|
+
|
|
409
474
|
updateOptions({ toolChoice }: { toolChoice?: ToolChoice | null }): void {
|
|
410
475
|
if (toolChoice !== undefined) {
|
|
411
476
|
this.toolChoice = toolChoice;
|
|
@@ -417,18 +482,10 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
417
482
|
}
|
|
418
483
|
|
|
419
484
|
attachAudioInput(audioStream: ReadableStream<AudioFrame>): void {
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
this.audioStream.detachSource();
|
|
423
|
-
}
|
|
485
|
+
void this.audioStream.close();
|
|
486
|
+
this.audioStream = new MultiInputStream<AudioFrame>();
|
|
424
487
|
|
|
425
|
-
|
|
426
|
-
* We need to add a deferred ReadableStream layer on top of the audioStream from the agent session.
|
|
427
|
-
* The tee() operation should be applied to the deferred stream, not the original audioStream.
|
|
428
|
-
* This is important because teeing the original stream directly makes it very difficult—if not
|
|
429
|
-
* impossible—to implement stream unlock logic cleanly.
|
|
430
|
-
*/
|
|
431
|
-
this.audioStream.setSource(audioStream);
|
|
488
|
+
this.audioStreamId = this.audioStream.addInputStream(audioStream);
|
|
432
489
|
const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
|
|
433
490
|
|
|
434
491
|
if (this.realtimeSession) {
|
|
@@ -441,16 +498,29 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
441
498
|
}
|
|
442
499
|
|
|
443
500
|
detachAudioInput(): void {
|
|
444
|
-
this.
|
|
501
|
+
if (this.audioStreamId === undefined) {
|
|
502
|
+
return;
|
|
503
|
+
}
|
|
504
|
+
|
|
505
|
+
void this.audioStream.close();
|
|
506
|
+
this.audioStream = new MultiInputStream<AudioFrame>();
|
|
507
|
+
this.audioStreamId = undefined;
|
|
445
508
|
}
|
|
446
509
|
|
|
447
|
-
commitUserTurn(
|
|
510
|
+
commitUserTurn(
|
|
511
|
+
options: {
|
|
512
|
+
audioDetached?: boolean;
|
|
513
|
+
throwIfNotReady?: boolean;
|
|
514
|
+
} = {},
|
|
515
|
+
) {
|
|
516
|
+
const { audioDetached = false, throwIfNotReady = true } = options;
|
|
448
517
|
if (!this.audioRecognition) {
|
|
449
|
-
|
|
518
|
+
if (throwIfNotReady) {
|
|
519
|
+
throw new Error('AudioRecognition is not initialized');
|
|
520
|
+
}
|
|
521
|
+
return;
|
|
450
522
|
}
|
|
451
523
|
|
|
452
|
-
// TODO(brian): add audio_detached flag
|
|
453
|
-
const audioDetached = false;
|
|
454
524
|
this.audioRecognition.commitUserTurn(audioDetached);
|
|
455
525
|
}
|
|
456
526
|
|
|
@@ -508,14 +578,13 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
508
578
|
}),
|
|
509
579
|
);
|
|
510
580
|
const task = this.createSpeechTask({
|
|
511
|
-
|
|
581
|
+
taskFn: (abortController: AbortController) =>
|
|
512
582
|
this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio),
|
|
513
|
-
),
|
|
514
583
|
ownedSpeechHandle: handle,
|
|
515
584
|
name: 'AgentActivity.say_tts',
|
|
516
585
|
});
|
|
517
586
|
|
|
518
|
-
task.finally(() => this.onPipelineReplyDone());
|
|
587
|
+
task.result.finally(() => this.onPipelineReplyDone());
|
|
519
588
|
this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
520
589
|
return handle;
|
|
521
590
|
}
|
|
@@ -628,9 +697,9 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
628
697
|
return;
|
|
629
698
|
}
|
|
630
699
|
|
|
631
|
-
if (this.
|
|
700
|
+
if (this.schedulingPaused) {
|
|
632
701
|
// TODO(shubhra): should we "forward" this new turn to the next agent?
|
|
633
|
-
this.logger.warn('skipping new realtime generation, the
|
|
702
|
+
this.logger.warn('skipping new realtime generation, the speech scheduling is not running');
|
|
634
703
|
return;
|
|
635
704
|
}
|
|
636
705
|
|
|
@@ -648,9 +717,8 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
648
717
|
this.logger.info({ speech_id: handle.id }, 'Creating speech handle');
|
|
649
718
|
|
|
650
719
|
this.createSpeechTask({
|
|
651
|
-
|
|
720
|
+
taskFn: (abortController: AbortController) =>
|
|
652
721
|
this.realtimeGenerationTask(handle, ev, {}, abortController),
|
|
653
|
-
),
|
|
654
722
|
ownedSpeechHandle: handle,
|
|
655
723
|
name: 'AgentActivity.realtimeGeneration',
|
|
656
724
|
});
|
|
@@ -782,7 +850,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
782
850
|
onPreemptiveGeneration(info: PreemptiveGenerationInfo): void {
|
|
783
851
|
if (
|
|
784
852
|
!this.agentSession.options.preemptiveGeneration ||
|
|
785
|
-
this.
|
|
853
|
+
this.schedulingPaused ||
|
|
786
854
|
(this._currentSpeech !== undefined && !this._currentSpeech.interrupted) ||
|
|
787
855
|
!(this.llm instanceof LLM)
|
|
788
856
|
) {
|
|
@@ -829,11 +897,32 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
829
897
|
}
|
|
830
898
|
|
|
831
899
|
private createSpeechTask(options: {
|
|
832
|
-
|
|
900
|
+
taskFn: (controller: AbortController) => Promise<void>;
|
|
901
|
+
controller?: AbortController;
|
|
833
902
|
ownedSpeechHandle?: SpeechHandle;
|
|
903
|
+
inlineTask?: boolean;
|
|
834
904
|
name?: string;
|
|
835
|
-
}):
|
|
836
|
-
const {
|
|
905
|
+
}): Task<void> {
|
|
906
|
+
const { taskFn, controller, ownedSpeechHandle, inlineTask, name } = options;
|
|
907
|
+
|
|
908
|
+
const wrappedFn = (ctrl: AbortController) => {
|
|
909
|
+
return agentActivityStorage.run(this, () => {
|
|
910
|
+
// Mark inline/speech metadata at task runtime to avoid a race where taskFn executes
|
|
911
|
+
// before post-construction metadata is attached to the Task instance.
|
|
912
|
+
const currentTask = Task.current();
|
|
913
|
+
if (currentTask) {
|
|
914
|
+
_setActivityTaskInfo(currentTask, { speechHandle: ownedSpeechHandle, inlineTask });
|
|
915
|
+
}
|
|
916
|
+
|
|
917
|
+
if (ownedSpeechHandle) {
|
|
918
|
+
return speechHandleStorage.run(ownedSpeechHandle, () => taskFn(ctrl));
|
|
919
|
+
}
|
|
920
|
+
return taskFn(ctrl);
|
|
921
|
+
});
|
|
922
|
+
};
|
|
923
|
+
|
|
924
|
+
const task = Task.from(wrappedFn, controller, name);
|
|
925
|
+
_setActivityTaskInfo(task, { speechHandle: ownedSpeechHandle, inlineTask });
|
|
837
926
|
|
|
838
927
|
this.speechTasks.add(task);
|
|
839
928
|
task.addDoneCallback(() => {
|
|
@@ -853,13 +942,16 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
853
942
|
this.wakeupMainTask();
|
|
854
943
|
});
|
|
855
944
|
|
|
856
|
-
return task
|
|
945
|
+
return task;
|
|
857
946
|
}
|
|
858
947
|
|
|
859
948
|
async onEndOfTurn(info: EndOfTurnInfo): Promise<boolean> {
|
|
860
|
-
if (this.
|
|
949
|
+
if (this.schedulingPaused) {
|
|
861
950
|
this.cancelPreemptiveGeneration();
|
|
862
|
-
this.logger.warn(
|
|
951
|
+
this.logger.warn(
|
|
952
|
+
{ user_input: info.newTranscript },
|
|
953
|
+
'skipping user input, speech scheduling is paused',
|
|
954
|
+
);
|
|
863
955
|
// TODO(shubhra): should we "forward" this new turn to the next agent/activity?
|
|
864
956
|
return true;
|
|
865
957
|
}
|
|
@@ -892,7 +984,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
892
984
|
|
|
893
985
|
const oldTask = this._userTurnCompletedTask;
|
|
894
986
|
this._userTurnCompletedTask = this.createSpeechTask({
|
|
895
|
-
|
|
987
|
+
taskFn: () => this.userTurnCompleted(info, oldTask),
|
|
896
988
|
name: 'AgentActivity.userTurnCompleted',
|
|
897
989
|
});
|
|
898
990
|
return true;
|
|
@@ -928,10 +1020,12 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
928
1020
|
this._currentSpeech = undefined;
|
|
929
1021
|
}
|
|
930
1022
|
|
|
931
|
-
//
|
|
932
|
-
//
|
|
933
|
-
|
|
934
|
-
|
|
1023
|
+
// if we're draining/pausing and there are no more speech tasks, we can exit.
|
|
1024
|
+
// only speech tasks can bypass draining to create a tool response (see scheduleSpeech)
|
|
1025
|
+
const toWait = this.getDrainPendingSpeechTasks();
|
|
1026
|
+
|
|
1027
|
+
if (this._schedulingPaused && toWait.length === 0) {
|
|
1028
|
+
this.logger.info('mainTask: scheduling paused and no more speech tasks to wait');
|
|
935
1029
|
break;
|
|
936
1030
|
}
|
|
937
1031
|
|
|
@@ -941,6 +1035,39 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
941
1035
|
this.logger.info('AgentActivity mainTask: exiting');
|
|
942
1036
|
}
|
|
943
1037
|
|
|
1038
|
+
private getDrainPendingSpeechTasks(): Task<void>[] {
|
|
1039
|
+
const blockedHandles: SpeechHandle[] = [];
|
|
1040
|
+
|
|
1041
|
+
for (const task of this._drainBlockedTasks) {
|
|
1042
|
+
const info = _getActivityTaskInfo(task);
|
|
1043
|
+
if (!info) {
|
|
1044
|
+
this.logger.error('blocked task without activity info; skipping.');
|
|
1045
|
+
continue;
|
|
1046
|
+
}
|
|
1047
|
+
|
|
1048
|
+
if (!info.speechHandle) {
|
|
1049
|
+
continue; // onEnter/onExit
|
|
1050
|
+
}
|
|
1051
|
+
|
|
1052
|
+
blockedHandles.push(info.speechHandle);
|
|
1053
|
+
}
|
|
1054
|
+
|
|
1055
|
+
const toWait: Task<void>[] = [];
|
|
1056
|
+
for (const task of this.speechTasks) {
|
|
1057
|
+
if (this._drainBlockedTasks.includes(task)) {
|
|
1058
|
+
continue;
|
|
1059
|
+
}
|
|
1060
|
+
|
|
1061
|
+
const info = _getActivityTaskInfo(task);
|
|
1062
|
+
if (info && info.speechHandle && blockedHandles.includes(info.speechHandle)) {
|
|
1063
|
+
continue;
|
|
1064
|
+
}
|
|
1065
|
+
|
|
1066
|
+
toWait.push(task);
|
|
1067
|
+
}
|
|
1068
|
+
return toWait;
|
|
1069
|
+
}
|
|
1070
|
+
|
|
944
1071
|
private wakeupMainTask(): void {
|
|
945
1072
|
this.q_updated.resolve();
|
|
946
1073
|
}
|
|
@@ -982,7 +1109,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
982
1109
|
throw new Error('trying to generate reply without an LLM model');
|
|
983
1110
|
}
|
|
984
1111
|
|
|
985
|
-
const functionCall =
|
|
1112
|
+
const functionCall = functionCallStorage.getStore()?.functionCall;
|
|
986
1113
|
if (toolChoice === undefined && functionCall !== undefined) {
|
|
987
1114
|
// when generateReply is called inside a tool, set toolChoice to 'none' by default
|
|
988
1115
|
toolChoice = 'none';
|
|
@@ -1004,7 +1131,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1004
1131
|
|
|
1005
1132
|
if (this.llm instanceof RealtimeModel) {
|
|
1006
1133
|
this.createSpeechTask({
|
|
1007
|
-
|
|
1134
|
+
taskFn: (abortController: AbortController) =>
|
|
1008
1135
|
this.realtimeReplyTask({
|
|
1009
1136
|
speechHandle: handle,
|
|
1010
1137
|
// TODO(brian): support llm.ChatMessage for the realtime model
|
|
@@ -1016,7 +1143,6 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1016
1143
|
},
|
|
1017
1144
|
abortController,
|
|
1018
1145
|
}),
|
|
1019
|
-
),
|
|
1020
1146
|
ownedSpeechHandle: handle,
|
|
1021
1147
|
name: 'AgentActivity.realtimeReply',
|
|
1022
1148
|
});
|
|
@@ -1028,12 +1154,25 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1028
1154
|
instructions = `${this.agent.instructions}\n${instructions}`;
|
|
1029
1155
|
}
|
|
1030
1156
|
|
|
1157
|
+
// Filter out tools with IGNORE_ON_ENTER flag when generateReply is called inside onEnter
|
|
1158
|
+
const onEnterData = onEnterStorage.getStore();
|
|
1159
|
+
const shouldFilterTools =
|
|
1160
|
+
onEnterData?.agent === this.agent && onEnterData?.session === this.agentSession;
|
|
1161
|
+
|
|
1162
|
+
const tools = shouldFilterTools
|
|
1163
|
+
? Object.fromEntries(
|
|
1164
|
+
Object.entries(this.agent.toolCtx).filter(
|
|
1165
|
+
([, fnTool]) => !(fnTool.flags & ToolFlag.IGNORE_ON_ENTER),
|
|
1166
|
+
),
|
|
1167
|
+
)
|
|
1168
|
+
: this.agent.toolCtx;
|
|
1169
|
+
|
|
1031
1170
|
const task = this.createSpeechTask({
|
|
1032
|
-
|
|
1171
|
+
taskFn: (abortController: AbortController) =>
|
|
1033
1172
|
this.pipelineReplyTask(
|
|
1034
1173
|
handle,
|
|
1035
1174
|
chatCtx ?? this.agent.chatCtx,
|
|
1036
|
-
|
|
1175
|
+
tools,
|
|
1037
1176
|
{
|
|
1038
1177
|
toolChoice: toOaiToolChoice(toolChoice !== undefined ? toolChoice : this.toolChoice),
|
|
1039
1178
|
},
|
|
@@ -1041,12 +1180,11 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1041
1180
|
instructions,
|
|
1042
1181
|
userMessage,
|
|
1043
1182
|
),
|
|
1044
|
-
),
|
|
1045
1183
|
ownedSpeechHandle: handle,
|
|
1046
1184
|
name: 'AgentActivity.pipelineReply',
|
|
1047
1185
|
});
|
|
1048
1186
|
|
|
1049
|
-
task.finally(() => this.onPipelineReplyDone());
|
|
1187
|
+
task.result.finally(() => this.onPipelineReplyDone());
|
|
1050
1188
|
}
|
|
1051
1189
|
|
|
1052
1190
|
if (scheduleSpeech) {
|
|
@@ -1055,16 +1193,19 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1055
1193
|
return handle;
|
|
1056
1194
|
}
|
|
1057
1195
|
|
|
1058
|
-
interrupt(): Future<void> {
|
|
1196
|
+
interrupt(options: { force?: boolean } = {}): Future<void> {
|
|
1197
|
+
const { force = false } = options;
|
|
1198
|
+
this.cancelPreemptiveGeneration();
|
|
1199
|
+
|
|
1059
1200
|
const future = new Future<void>();
|
|
1060
1201
|
const currentSpeech = this._currentSpeech;
|
|
1061
1202
|
|
|
1062
1203
|
//TODO(AJS-273): add interrupt for background speeches
|
|
1063
1204
|
|
|
1064
|
-
currentSpeech?.interrupt();
|
|
1205
|
+
currentSpeech?.interrupt(force);
|
|
1065
1206
|
|
|
1066
1207
|
for (const [_, __, speech] of this.speechQueue) {
|
|
1067
|
-
speech.interrupt();
|
|
1208
|
+
speech.interrupt(force);
|
|
1068
1209
|
}
|
|
1069
1210
|
|
|
1070
1211
|
this.realtimeSession?.interrupt();
|
|
@@ -1087,13 +1228,13 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1087
1228
|
}
|
|
1088
1229
|
}
|
|
1089
1230
|
|
|
1090
|
-
private async userTurnCompleted(info: EndOfTurnInfo, oldTask?:
|
|
1231
|
+
private async userTurnCompleted(info: EndOfTurnInfo, oldTask?: Task<void>): Promise<void> {
|
|
1091
1232
|
if (oldTask) {
|
|
1092
1233
|
// We never cancel user code as this is very confusing.
|
|
1093
1234
|
// So we wait for the old execution of onUserTurnCompleted to finish.
|
|
1094
1235
|
// In practice this is OK because most speeches will be interrupted if a new turn
|
|
1095
1236
|
// is detected. So the previous execution should complete quickly.
|
|
1096
|
-
await oldTask;
|
|
1237
|
+
await oldTask.result;
|
|
1097
1238
|
}
|
|
1098
1239
|
|
|
1099
1240
|
// When the audio recognition detects the end of a user turn:
|
|
@@ -1551,13 +1692,15 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1551
1692
|
for (const msg of toolsMessages) {
|
|
1552
1693
|
msg.createdAt = replyStartedAt;
|
|
1553
1694
|
}
|
|
1554
|
-
|
|
1555
|
-
//
|
|
1556
|
-
//
|
|
1695
|
+
// Only insert FunctionCallOutput items into agent._chatCtx since FunctionCall items
|
|
1696
|
+
// were already added by onToolExecutionStarted when the tool execution began.
|
|
1697
|
+
// Inserting function_calls again would create duplicates that break provider APIs
|
|
1698
|
+
// (e.g. Google's "function response parts != function call parts" error).
|
|
1557
1699
|
const toolCallOutputs = toolsMessages.filter(
|
|
1558
1700
|
(m): m is FunctionCallOutput => m.type === 'function_call_output',
|
|
1559
1701
|
);
|
|
1560
1702
|
if (toolCallOutputs.length > 0) {
|
|
1703
|
+
this.agent._chatCtx.insert(toolCallOutputs);
|
|
1561
1704
|
this.agentSession._toolItemsAdded(toolCallOutputs);
|
|
1562
1705
|
}
|
|
1563
1706
|
}
|
|
@@ -1665,52 +1808,18 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1665
1808
|
return;
|
|
1666
1809
|
}
|
|
1667
1810
|
|
|
1668
|
-
const functionToolsExecutedEvent =
|
|
1669
|
-
|
|
1670
|
-
functionCallOutputs: [],
|
|
1671
|
-
});
|
|
1672
|
-
let shouldGenerateToolReply: boolean = false;
|
|
1673
|
-
let newAgentTask: Agent | null = null;
|
|
1674
|
-
let ignoreTaskSwitch: boolean = false;
|
|
1675
|
-
|
|
1676
|
-
for (const sanitizedOut of toolOutput.output) {
|
|
1677
|
-
if (sanitizedOut.toolCallOutput !== undefined) {
|
|
1678
|
-
functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
|
|
1679
|
-
functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
|
|
1680
|
-
if (sanitizedOut.replyRequired) {
|
|
1681
|
-
shouldGenerateToolReply = true;
|
|
1682
|
-
}
|
|
1683
|
-
}
|
|
1684
|
-
|
|
1685
|
-
if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
|
|
1686
|
-
this.logger.error('expected to receive only one agent task from the tool executions');
|
|
1687
|
-
ignoreTaskSwitch = true;
|
|
1688
|
-
// TODO(brian): should we mark the function call as failed to notify the LLM?
|
|
1689
|
-
}
|
|
1690
|
-
|
|
1691
|
-
newAgentTask = sanitizedOut.agentTask ?? null;
|
|
1692
|
-
|
|
1693
|
-
this.logger.debug(
|
|
1694
|
-
{
|
|
1695
|
-
speechId: speechHandle.id,
|
|
1696
|
-
name: sanitizedOut.toolCall?.name,
|
|
1697
|
-
args: sanitizedOut.toolCall.args,
|
|
1698
|
-
output: sanitizedOut.toolCallOutput?.output,
|
|
1699
|
-
isError: sanitizedOut.toolCallOutput?.isError,
|
|
1700
|
-
},
|
|
1701
|
-
'Tool call execution finished',
|
|
1702
|
-
);
|
|
1703
|
-
}
|
|
1811
|
+
const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } =
|
|
1812
|
+
this.summarizeToolExecutionOutput(toolOutput, speechHandle);
|
|
1704
1813
|
|
|
1705
1814
|
this.agentSession.emit(
|
|
1706
1815
|
AgentSessionEventTypes.FunctionToolsExecuted,
|
|
1707
1816
|
functionToolsExecutedEvent,
|
|
1708
1817
|
);
|
|
1709
1818
|
|
|
1710
|
-
let
|
|
1819
|
+
let schedulingPaused = this.schedulingPaused;
|
|
1711
1820
|
if (!ignoreTaskSwitch && newAgentTask !== null) {
|
|
1712
1821
|
this.agentSession.updateAgent(newAgentTask);
|
|
1713
|
-
|
|
1822
|
+
schedulingPaused = true;
|
|
1714
1823
|
}
|
|
1715
1824
|
|
|
1716
1825
|
const toolMessages = [
|
|
@@ -1725,11 +1834,12 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1725
1834
|
|
|
1726
1835
|
// Avoid setting tool_choice to "required" or a specific function when
|
|
1727
1836
|
// passing tool response back to the LLM
|
|
1728
|
-
const respondToolChoice =
|
|
1837
|
+
const respondToolChoice =
|
|
1838
|
+
schedulingPaused || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
|
|
1729
1839
|
|
|
1730
1840
|
// Reuse same speechHandle for tool response (parity with Python agent_activity.py L2122-2140)
|
|
1731
1841
|
const toolResponseTask = this.createSpeechTask({
|
|
1732
|
-
|
|
1842
|
+
taskFn: () =>
|
|
1733
1843
|
this.pipelineReplyTask(
|
|
1734
1844
|
speechHandle,
|
|
1735
1845
|
chatCtx,
|
|
@@ -1740,12 +1850,11 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1740
1850
|
undefined,
|
|
1741
1851
|
toolMessages,
|
|
1742
1852
|
),
|
|
1743
|
-
),
|
|
1744
1853
|
ownedSpeechHandle: speechHandle,
|
|
1745
1854
|
name: 'AgentActivity.pipelineReply',
|
|
1746
1855
|
});
|
|
1747
1856
|
|
|
1748
|
-
toolResponseTask.finally(() => this.onPipelineReplyDone());
|
|
1857
|
+
toolResponseTask.result.finally(() => this.onPipelineReplyDone());
|
|
1749
1858
|
|
|
1750
1859
|
this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
|
|
1751
1860
|
} else if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
|
|
@@ -1753,15 +1862,12 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1753
1862
|
msg.createdAt = replyStartedAt;
|
|
1754
1863
|
}
|
|
1755
1864
|
|
|
1756
|
-
this.agent._chatCtx.insert(toolMessages);
|
|
1757
|
-
|
|
1758
|
-
// Only add FunctionCallOutput items to session history since FunctionCall items
|
|
1759
|
-
// were already added by onToolExecutionStarted when the tool execution began
|
|
1760
1865
|
const toolCallOutputs = toolMessages.filter(
|
|
1761
1866
|
(m): m is FunctionCallOutput => m.type === 'function_call_output',
|
|
1762
1867
|
);
|
|
1763
1868
|
|
|
1764
1869
|
if (toolCallOutputs.length > 0) {
|
|
1870
|
+
this.agent._chatCtx.insert(toolCallOutputs);
|
|
1765
1871
|
this.agentSession._toolItemsAdded(toolCallOutputs);
|
|
1766
1872
|
}
|
|
1767
1873
|
}
|
|
@@ -2164,50 +2270,18 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
2164
2270
|
return;
|
|
2165
2271
|
}
|
|
2166
2272
|
|
|
2167
|
-
const functionToolsExecutedEvent =
|
|
2168
|
-
|
|
2169
|
-
functionCallOutputs: [],
|
|
2170
|
-
});
|
|
2171
|
-
let shouldGenerateToolReply: boolean = false;
|
|
2172
|
-
let newAgentTask: Agent | null = null;
|
|
2173
|
-
let ignoreTaskSwitch: boolean = false;
|
|
2174
|
-
|
|
2175
|
-
for (const sanitizedOut of toolOutput.output) {
|
|
2176
|
-
if (sanitizedOut.toolCallOutput !== undefined) {
|
|
2177
|
-
functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
|
|
2178
|
-
if (sanitizedOut.replyRequired) {
|
|
2179
|
-
shouldGenerateToolReply = true;
|
|
2180
|
-
}
|
|
2181
|
-
}
|
|
2182
|
-
|
|
2183
|
-
if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
|
|
2184
|
-
this.logger.error('expected to receive only one agent task from the tool executions');
|
|
2185
|
-
ignoreTaskSwitch = true;
|
|
2186
|
-
}
|
|
2187
|
-
|
|
2188
|
-
newAgentTask = sanitizedOut.agentTask ?? null;
|
|
2189
|
-
|
|
2190
|
-
this.logger.debug(
|
|
2191
|
-
{
|
|
2192
|
-
speechId: speechHandle.id,
|
|
2193
|
-
name: sanitizedOut.toolCall?.name,
|
|
2194
|
-
args: sanitizedOut.toolCall.args,
|
|
2195
|
-
output: sanitizedOut.toolCallOutput?.output,
|
|
2196
|
-
isError: sanitizedOut.toolCallOutput?.isError,
|
|
2197
|
-
},
|
|
2198
|
-
'Tool call execution finished',
|
|
2199
|
-
);
|
|
2200
|
-
}
|
|
2273
|
+
const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } =
|
|
2274
|
+
this.summarizeToolExecutionOutput(toolOutput, speechHandle);
|
|
2201
2275
|
|
|
2202
2276
|
this.agentSession.emit(
|
|
2203
2277
|
AgentSessionEventTypes.FunctionToolsExecuted,
|
|
2204
2278
|
functionToolsExecutedEvent,
|
|
2205
2279
|
);
|
|
2206
2280
|
|
|
2207
|
-
let
|
|
2281
|
+
let schedulingPaused = this.schedulingPaused;
|
|
2208
2282
|
if (!ignoreTaskSwitch && newAgentTask !== null) {
|
|
2209
2283
|
this.agentSession.updateAgent(newAgentTask);
|
|
2210
|
-
|
|
2284
|
+
schedulingPaused = true;
|
|
2211
2285
|
}
|
|
2212
2286
|
|
|
2213
2287
|
if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
|
|
@@ -2263,15 +2337,14 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
2263
2337
|
}),
|
|
2264
2338
|
);
|
|
2265
2339
|
|
|
2266
|
-
const toolChoice =
|
|
2340
|
+
const toolChoice = schedulingPaused || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
|
|
2267
2341
|
this.createSpeechTask({
|
|
2268
|
-
|
|
2342
|
+
taskFn: (abortController: AbortController) =>
|
|
2269
2343
|
this.realtimeReplyTask({
|
|
2270
2344
|
speechHandle: replySpeechHandle,
|
|
2271
2345
|
modelSettings: { toolChoice },
|
|
2272
2346
|
abortController,
|
|
2273
2347
|
}),
|
|
2274
|
-
),
|
|
2275
2348
|
ownedSpeechHandle: replySpeechHandle,
|
|
2276
2349
|
name: 'AgentActivity.realtime_reply',
|
|
2277
2350
|
});
|
|
@@ -2279,6 +2352,53 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
2279
2352
|
this.scheduleSpeech(replySpeechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
|
|
2280
2353
|
}
|
|
2281
2354
|
|
|
2355
|
+
private summarizeToolExecutionOutput(toolOutput: ToolOutput, speechHandle: SpeechHandle) {
|
|
2356
|
+
const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
|
|
2357
|
+
functionCalls: [],
|
|
2358
|
+
functionCallOutputs: [],
|
|
2359
|
+
});
|
|
2360
|
+
|
|
2361
|
+
let shouldGenerateToolReply = false;
|
|
2362
|
+
let newAgentTask: Agent | null = null;
|
|
2363
|
+
let ignoreTaskSwitch = false;
|
|
2364
|
+
|
|
2365
|
+
for (const sanitizedOut of toolOutput.output) {
|
|
2366
|
+
if (sanitizedOut.toolCallOutput !== undefined) {
|
|
2367
|
+
// Keep event payload symmetric for pipeline + realtime paths.
|
|
2368
|
+
functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
|
|
2369
|
+
functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
|
|
2370
|
+
if (sanitizedOut.replyRequired) {
|
|
2371
|
+
shouldGenerateToolReply = true;
|
|
2372
|
+
}
|
|
2373
|
+
}
|
|
2374
|
+
|
|
2375
|
+
if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
|
|
2376
|
+
this.logger.error('expected to receive only one agent task from the tool executions');
|
|
2377
|
+
ignoreTaskSwitch = true;
|
|
2378
|
+
}
|
|
2379
|
+
|
|
2380
|
+
newAgentTask = sanitizedOut.agentTask ?? null;
|
|
2381
|
+
|
|
2382
|
+
this.logger.debug(
|
|
2383
|
+
{
|
|
2384
|
+
speechId: speechHandle.id,
|
|
2385
|
+
name: sanitizedOut.toolCall?.name,
|
|
2386
|
+
args: sanitizedOut.toolCall.args,
|
|
2387
|
+
output: sanitizedOut.toolCallOutput?.output,
|
|
2388
|
+
isError: sanitizedOut.toolCallOutput?.isError,
|
|
2389
|
+
},
|
|
2390
|
+
'Tool call execution finished',
|
|
2391
|
+
);
|
|
2392
|
+
}
|
|
2393
|
+
|
|
2394
|
+
return {
|
|
2395
|
+
functionToolsExecutedEvent,
|
|
2396
|
+
shouldGenerateToolReply,
|
|
2397
|
+
newAgentTask,
|
|
2398
|
+
ignoreTaskSwitch,
|
|
2399
|
+
};
|
|
2400
|
+
}
|
|
2401
|
+
|
|
2282
2402
|
private async realtimeReplyTask({
|
|
2283
2403
|
speechHandle,
|
|
2284
2404
|
modelSettings: { toolChoice },
|
|
@@ -2337,10 +2457,10 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
2337
2457
|
priority: number,
|
|
2338
2458
|
force: boolean = false,
|
|
2339
2459
|
): void {
|
|
2340
|
-
// when force=true, we allow tool responses to bypass
|
|
2460
|
+
// when force=true, we allow tool responses to bypass scheduling pause
|
|
2341
2461
|
// This allows for tool responses to be generated before the AgentActivity is finalized
|
|
2342
|
-
if (this.
|
|
2343
|
-
throw new Error('cannot schedule new speech, the
|
|
2462
|
+
if (this.schedulingPaused && !force) {
|
|
2463
|
+
throw new Error('cannot schedule new speech, the speech scheduling is draining/pausing');
|
|
2344
2464
|
}
|
|
2345
2465
|
|
|
2346
2466
|
// Monotonic time to avoid near 0 collisions
|
|
@@ -2349,6 +2469,48 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
2349
2469
|
this.wakeupMainTask();
|
|
2350
2470
|
}
|
|
2351
2471
|
|
|
2472
|
+
private async _pauseSchedulingTask(blockedTasks: Task<any>[]): Promise<void> {
|
|
2473
|
+
if (this._schedulingPaused) return;
|
|
2474
|
+
|
|
2475
|
+
this._schedulingPaused = true;
|
|
2476
|
+
this._drainBlockedTasks = blockedTasks;
|
|
2477
|
+
this.wakeupMainTask();
|
|
2478
|
+
|
|
2479
|
+
if (this._mainTask) {
|
|
2480
|
+
// When pausing/draining, we ensure that all speech_tasks complete fully.
|
|
2481
|
+
// This means that even if the SpeechHandle themselves have finished,
|
|
2482
|
+
// we still wait for the entire execution (e.g function_tools)
|
|
2483
|
+
await this._mainTask.result;
|
|
2484
|
+
}
|
|
2485
|
+
}
|
|
2486
|
+
|
|
2487
|
+
private _resumeSchedulingTask(): void {
|
|
2488
|
+
if (!this._schedulingPaused) return;
|
|
2489
|
+
|
|
2490
|
+
this._schedulingPaused = false;
|
|
2491
|
+
this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
|
|
2492
|
+
}
|
|
2493
|
+
|
|
2494
|
+
async pause(options: { blockedTasks?: Task<any>[] } = {}): Promise<void> {
|
|
2495
|
+
const { blockedTasks = [] } = options;
|
|
2496
|
+
const unlock = await this.lock.lock();
|
|
2497
|
+
|
|
2498
|
+
try {
|
|
2499
|
+
const span = tracer.startSpan({
|
|
2500
|
+
name: 'pause_agent_activity',
|
|
2501
|
+
attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
|
|
2502
|
+
});
|
|
2503
|
+
try {
|
|
2504
|
+
await this._pauseSchedulingTask(blockedTasks);
|
|
2505
|
+
await this._closeSessionResources();
|
|
2506
|
+
} finally {
|
|
2507
|
+
span.end();
|
|
2508
|
+
}
|
|
2509
|
+
} finally {
|
|
2510
|
+
unlock();
|
|
2511
|
+
}
|
|
2512
|
+
}
|
|
2513
|
+
|
|
2352
2514
|
async drain(): Promise<void> {
|
|
2353
2515
|
// Create drain_agent_activity as a ROOT span (new trace) to match Python behavior
|
|
2354
2516
|
return tracer.startActiveSpan(async (span) => this._drainImpl(span), {
|
|
@@ -2362,23 +2524,22 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
2362
2524
|
|
|
2363
2525
|
const unlock = await this.lock.lock();
|
|
2364
2526
|
try {
|
|
2365
|
-
if (this.
|
|
2527
|
+
if (this._schedulingPaused) return;
|
|
2366
2528
|
|
|
2367
|
-
this.
|
|
2368
|
-
|
|
2369
|
-
|
|
2370
|
-
|
|
2371
|
-
|
|
2372
|
-
|
|
2373
|
-
|
|
2374
|
-
this.createSpeechTask({
|
|
2375
|
-
task: Task.from(() => onExitTask),
|
|
2529
|
+
this._onExitTask = this.createSpeechTask({
|
|
2530
|
+
taskFn: () =>
|
|
2531
|
+
tracer.startActiveSpan(async () => this.agent.onExit(), {
|
|
2532
|
+
name: 'on_exit',
|
|
2533
|
+
attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
|
|
2534
|
+
}),
|
|
2535
|
+
inlineTask: true,
|
|
2376
2536
|
name: 'AgentActivity_onExit',
|
|
2377
2537
|
});
|
|
2378
2538
|
|
|
2379
|
-
this.
|
|
2380
|
-
|
|
2381
|
-
await this.
|
|
2539
|
+
this.cancelPreemptiveGeneration();
|
|
2540
|
+
|
|
2541
|
+
await this._onExitTask.result;
|
|
2542
|
+
await this._pauseSchedulingTask([]);
|
|
2382
2543
|
} finally {
|
|
2383
2544
|
unlock();
|
|
2384
2545
|
}
|
|
@@ -2387,44 +2548,59 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
2387
2548
|
async close(): Promise<void> {
|
|
2388
2549
|
const unlock = await this.lock.lock();
|
|
2389
2550
|
try {
|
|
2390
|
-
if (!this._draining) {
|
|
2391
|
-
this.logger.warn('task closing without draining');
|
|
2392
|
-
}
|
|
2393
|
-
|
|
2394
2551
|
this.cancelPreemptiveGeneration();
|
|
2395
|
-
|
|
2396
|
-
|
|
2397
|
-
|
|
2398
|
-
|
|
2399
|
-
if (this.realtimeSession) {
|
|
2400
|
-
this.realtimeSession.off('generation_created', this.onGenerationCreated);
|
|
2401
|
-
this.realtimeSession.off('input_speech_started', this.onInputSpeechStarted);
|
|
2402
|
-
this.realtimeSession.off('input_speech_stopped', this.onInputSpeechStopped);
|
|
2403
|
-
this.realtimeSession.off(
|
|
2404
|
-
'input_audio_transcription_completed',
|
|
2405
|
-
this.onInputAudioTranscriptionCompleted,
|
|
2406
|
-
);
|
|
2407
|
-
this.realtimeSession.off('metrics_collected', this.onMetricsCollected);
|
|
2408
|
-
}
|
|
2409
|
-
if (this.stt instanceof STT) {
|
|
2410
|
-
this.stt.off('metrics_collected', this.onMetricsCollected);
|
|
2411
|
-
}
|
|
2412
|
-
if (this.tts instanceof TTS) {
|
|
2413
|
-
this.tts.off('metrics_collected', this.onMetricsCollected);
|
|
2414
|
-
}
|
|
2415
|
-
if (this.vad instanceof VAD) {
|
|
2416
|
-
this.vad.off('metrics_collected', this.onMetricsCollected);
|
|
2552
|
+
await this._closeSessionResources();
|
|
2553
|
+
|
|
2554
|
+
if (this._mainTask) {
|
|
2555
|
+
await this._mainTask.cancelAndWait();
|
|
2417
2556
|
}
|
|
2418
2557
|
|
|
2419
|
-
this.
|
|
2420
|
-
this.realtimeSpans?.clear();
|
|
2421
|
-
await this.realtimeSession?.close();
|
|
2422
|
-
await this.audioRecognition?.close();
|
|
2423
|
-
await this._mainTask?.cancelAndWait();
|
|
2558
|
+
this.agent._agentActivity = undefined;
|
|
2424
2559
|
} finally {
|
|
2425
2560
|
unlock();
|
|
2426
2561
|
}
|
|
2427
2562
|
}
|
|
2563
|
+
|
|
2564
|
+
private async _closeSessionResources(): Promise<void> {
|
|
2565
|
+
// Unregister event handlers to prevent duplicate metrics
|
|
2566
|
+
if (this.llm instanceof LLM) {
|
|
2567
|
+
this.llm.off('metrics_collected', this.onMetricsCollected);
|
|
2568
|
+
this.llm.off('error', this.onModelError);
|
|
2569
|
+
}
|
|
2570
|
+
|
|
2571
|
+
if (this.realtimeSession) {
|
|
2572
|
+
this.realtimeSession.off('generation_created', this.onRealtimeGenerationCreated);
|
|
2573
|
+
this.realtimeSession.off('input_speech_started', this.onRealtimeInputSpeechStarted);
|
|
2574
|
+
this.realtimeSession.off('input_speech_stopped', this.onRealtimeInputSpeechStopped);
|
|
2575
|
+
this.realtimeSession.off(
|
|
2576
|
+
'input_audio_transcription_completed',
|
|
2577
|
+
this.onRealtimeInputAudioTranscriptionCompleted,
|
|
2578
|
+
);
|
|
2579
|
+
this.realtimeSession.off('metrics_collected', this.onMetricsCollected);
|
|
2580
|
+
this.realtimeSession.off('error', this.onModelError);
|
|
2581
|
+
}
|
|
2582
|
+
|
|
2583
|
+
if (this.stt instanceof STT) {
|
|
2584
|
+
this.stt.off('metrics_collected', this.onMetricsCollected);
|
|
2585
|
+
this.stt.off('error', this.onModelError);
|
|
2586
|
+
}
|
|
2587
|
+
|
|
2588
|
+
if (this.tts instanceof TTS) {
|
|
2589
|
+
this.tts.off('metrics_collected', this.onMetricsCollected);
|
|
2590
|
+
this.tts.off('error', this.onModelError);
|
|
2591
|
+
}
|
|
2592
|
+
|
|
2593
|
+
if (this.vad instanceof VAD) {
|
|
2594
|
+
this.vad.off('metrics_collected', this.onMetricsCollected);
|
|
2595
|
+
}
|
|
2596
|
+
|
|
2597
|
+
this.detachAudioInput();
|
|
2598
|
+
this.realtimeSpans?.clear();
|
|
2599
|
+
await this.realtimeSession?.close();
|
|
2600
|
+
await this.audioRecognition?.close();
|
|
2601
|
+
this.realtimeSession = undefined;
|
|
2602
|
+
this.audioRecognition = undefined;
|
|
2603
|
+
}
|
|
2428
2604
|
}
|
|
2429
2605
|
|
|
2430
2606
|
function toOaiToolChoice(toolChoice: ToolChoice | null): ToolChoice | undefined {
|