@livekit/agents 1.0.45 → 1.0.47
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.cjs +14 -20
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +14 -20
- package/dist/cli.js.map +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs +14 -5
- package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/job_proc_lazy_main.js +14 -5
- package/dist/ipc/job_proc_lazy_main.js.map +1 -1
- package/dist/llm/chat_context.cjs +19 -0
- package/dist/llm/chat_context.cjs.map +1 -1
- package/dist/llm/chat_context.d.cts +4 -0
- package/dist/llm/chat_context.d.ts +4 -0
- package/dist/llm/chat_context.d.ts.map +1 -1
- package/dist/llm/chat_context.js +19 -0
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/provider_format/index.cjs +2 -0
- package/dist/llm/provider_format/index.cjs.map +1 -1
- package/dist/llm/provider_format/index.d.cts +1 -1
- package/dist/llm/provider_format/index.d.ts +1 -1
- package/dist/llm/provider_format/index.d.ts.map +1 -1
- package/dist/llm/provider_format/index.js +6 -1
- package/dist/llm/provider_format/index.js.map +1 -1
- package/dist/llm/provider_format/openai.cjs +82 -2
- package/dist/llm/provider_format/openai.cjs.map +1 -1
- package/dist/llm/provider_format/openai.d.cts +1 -0
- package/dist/llm/provider_format/openai.d.ts +1 -0
- package/dist/llm/provider_format/openai.d.ts.map +1 -1
- package/dist/llm/provider_format/openai.js +80 -1
- package/dist/llm/provider_format/openai.js.map +1 -1
- package/dist/llm/provider_format/openai.test.cjs +326 -0
- package/dist/llm/provider_format/openai.test.cjs.map +1 -1
- package/dist/llm/provider_format/openai.test.js +327 -1
- package/dist/llm/provider_format/openai.test.js.map +1 -1
- package/dist/llm/provider_format/utils.cjs +4 -3
- package/dist/llm/provider_format/utils.cjs.map +1 -1
- package/dist/llm/provider_format/utils.d.ts.map +1 -1
- package/dist/llm/provider_format/utils.js +4 -3
- package/dist/llm/provider_format/utils.js.map +1 -1
- package/dist/llm/realtime.cjs.map +1 -1
- package/dist/llm/realtime.d.cts +1 -0
- package/dist/llm/realtime.d.ts +1 -0
- package/dist/llm/realtime.d.ts.map +1 -1
- package/dist/llm/realtime.js.map +1 -1
- package/dist/log.cjs +5 -2
- package/dist/log.cjs.map +1 -1
- package/dist/log.d.ts.map +1 -1
- package/dist/log.js +5 -2
- package/dist/log.js.map +1 -1
- package/dist/stream/deferred_stream.cjs +15 -6
- package/dist/stream/deferred_stream.cjs.map +1 -1
- package/dist/stream/deferred_stream.d.ts.map +1 -1
- package/dist/stream/deferred_stream.js +15 -6
- package/dist/stream/deferred_stream.js.map +1 -1
- package/dist/stream/index.cjs +3 -0
- package/dist/stream/index.cjs.map +1 -1
- package/dist/stream/index.d.cts +1 -0
- package/dist/stream/index.d.ts +1 -0
- package/dist/stream/index.d.ts.map +1 -1
- package/dist/stream/index.js +2 -0
- package/dist/stream/index.js.map +1 -1
- package/dist/stream/multi_input_stream.cjs +139 -0
- package/dist/stream/multi_input_stream.cjs.map +1 -0
- package/dist/stream/multi_input_stream.d.cts +55 -0
- package/dist/stream/multi_input_stream.d.ts +55 -0
- package/dist/stream/multi_input_stream.d.ts.map +1 -0
- package/dist/stream/multi_input_stream.js +115 -0
- package/dist/stream/multi_input_stream.js.map +1 -0
- package/dist/stream/multi_input_stream.test.cjs +340 -0
- package/dist/stream/multi_input_stream.test.cjs.map +1 -0
- package/dist/stream/multi_input_stream.test.js +339 -0
- package/dist/stream/multi_input_stream.test.js.map +1 -0
- package/dist/telemetry/trace_types.cjs +42 -0
- package/dist/telemetry/trace_types.cjs.map +1 -1
- package/dist/telemetry/trace_types.d.cts +14 -0
- package/dist/telemetry/trace_types.d.ts +14 -0
- package/dist/telemetry/trace_types.d.ts.map +1 -1
- package/dist/telemetry/trace_types.js +28 -0
- package/dist/telemetry/trace_types.js.map +1 -1
- package/dist/utils.cjs +44 -2
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +8 -0
- package/dist/utils.d.ts +8 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +44 -2
- package/dist/utils.js.map +1 -1
- package/dist/utils.test.cjs +71 -0
- package/dist/utils.test.cjs.map +1 -1
- package/dist/utils.test.js +71 -0
- package/dist/utils.test.js.map +1 -1
- package/dist/version.cjs +1 -1
- package/dist/version.cjs.map +1 -1
- package/dist/version.d.cts +1 -1
- package/dist/version.d.ts +1 -1
- package/dist/version.d.ts.map +1 -1
- package/dist/version.js +1 -1
- package/dist/version.js.map +1 -1
- package/dist/voice/agent.cjs +144 -12
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.cts +29 -4
- package/dist/voice/agent.d.ts +29 -4
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +140 -11
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent.test.cjs +120 -0
- package/dist/voice/agent.test.cjs.map +1 -1
- package/dist/voice/agent.test.js +122 -2
- package/dist/voice/agent.test.js.map +1 -1
- package/dist/voice/agent_activity.cjs +402 -292
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +35 -7
- package/dist/voice/agent_activity.d.ts +35 -7
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +402 -287
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +156 -44
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +22 -9
- package/dist/voice/agent_session.d.ts +22 -9
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +156 -44
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +89 -36
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +22 -1
- package/dist/voice/audio_recognition.d.ts +22 -1
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +93 -36
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/audio_recognition_span.test.cjs +233 -0
- package/dist/voice/audio_recognition_span.test.cjs.map +1 -0
- package/dist/voice/audio_recognition_span.test.js +232 -0
- package/dist/voice/audio_recognition_span.test.js.map +1 -0
- package/dist/voice/generation.cjs +39 -19
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +44 -20
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/index.cjs +2 -0
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +1 -1
- package/dist/voice/index.d.ts +1 -1
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js +2 -1
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/io.cjs +6 -3
- package/dist/voice/io.cjs.map +1 -1
- package/dist/voice/io.d.cts +3 -2
- package/dist/voice/io.d.ts +3 -2
- package/dist/voice/io.d.ts.map +1 -1
- package/dist/voice/io.js +6 -3
- package/dist/voice/io.js.map +1 -1
- package/dist/voice/recorder_io/recorder_io.cjs +3 -1
- package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
- package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
- package/dist/voice/recorder_io/recorder_io.js +3 -1
- package/dist/voice/recorder_io/recorder_io.js.map +1 -1
- package/dist/voice/room_io/_input.cjs +17 -17
- package/dist/voice/room_io/_input.cjs.map +1 -1
- package/dist/voice/room_io/_input.d.cts +2 -2
- package/dist/voice/room_io/_input.d.ts +2 -2
- package/dist/voice/room_io/_input.d.ts.map +1 -1
- package/dist/voice/room_io/_input.js +7 -6
- package/dist/voice/room_io/_input.js.map +1 -1
- package/dist/voice/room_io/room_io.cjs +9 -0
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.cts +3 -1
- package/dist/voice/room_io/room_io.d.ts +3 -1
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js +9 -0
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/speech_handle.cjs +7 -1
- package/dist/voice/speech_handle.cjs.map +1 -1
- package/dist/voice/speech_handle.d.cts +2 -0
- package/dist/voice/speech_handle.d.ts +2 -0
- package/dist/voice/speech_handle.d.ts.map +1 -1
- package/dist/voice/speech_handle.js +8 -2
- package/dist/voice/speech_handle.js.map +1 -1
- package/dist/voice/testing/run_result.cjs +66 -15
- package/dist/voice/testing/run_result.cjs.map +1 -1
- package/dist/voice/testing/run_result.d.cts +14 -3
- package/dist/voice/testing/run_result.d.ts +14 -3
- package/dist/voice/testing/run_result.d.ts.map +1 -1
- package/dist/voice/testing/run_result.js +66 -15
- package/dist/voice/testing/run_result.js.map +1 -1
- package/dist/voice/utils.cjs +47 -0
- package/dist/voice/utils.cjs.map +1 -0
- package/dist/voice/utils.d.cts +4 -0
- package/dist/voice/utils.d.ts +4 -0
- package/dist/voice/utils.d.ts.map +1 -0
- package/dist/voice/utils.js +23 -0
- package/dist/voice/utils.js.map +1 -0
- package/package.json +1 -1
- package/src/cli.ts +20 -33
- package/src/ipc/job_proc_lazy_main.ts +16 -5
- package/src/llm/chat_context.ts +35 -0
- package/src/llm/provider_format/index.ts +7 -2
- package/src/llm/provider_format/openai.test.ts +385 -1
- package/src/llm/provider_format/openai.ts +103 -0
- package/src/llm/provider_format/utils.ts +6 -4
- package/src/llm/realtime.ts +1 -0
- package/src/log.ts +5 -2
- package/src/stream/deferred_stream.ts +17 -6
- package/src/stream/index.ts +1 -0
- package/src/stream/multi_input_stream.test.ts +540 -0
- package/src/stream/multi_input_stream.ts +172 -0
- package/src/telemetry/trace_types.ts +18 -0
- package/src/utils.test.ts +87 -0
- package/src/utils.ts +52 -2
- package/src/version.ts +1 -1
- package/src/voice/agent.test.ts +140 -2
- package/src/voice/agent.ts +189 -10
- package/src/voice/agent_activity.ts +449 -286
- package/src/voice/agent_session.ts +195 -51
- package/src/voice/audio_recognition.ts +118 -38
- package/src/voice/audio_recognition_span.test.ts +261 -0
- package/src/voice/generation.ts +52 -23
- package/src/voice/index.ts +1 -1
- package/src/voice/io.ts +7 -4
- package/src/voice/recorder_io/recorder_io.ts +2 -1
- package/src/voice/room_io/_input.ts +11 -7
- package/src/voice/room_io/room_io.ts +12 -0
- package/src/voice/speech_handle.ts +9 -2
- package/src/voice/testing/run_result.ts +81 -23
- package/src/voice/utils.ts +29 -0
The hunks below are from package/src/voice/agent_activity.ts (see the changed-file list above). Removed lines that the diff viewer truncated or elided are kept as shown.

@@ -35,7 +35,7 @@ import type {
   TTSMetrics,
   VADMetrics,
 } from '../metrics/base.js';
-import {
+import { MultiInputStream } from '../stream/multi_input_stream.js';
 import { STT, type STTError, type SpeechEvent } from '../stt/stt.js';
 import { recordRealtimeMetrics, traceTypes, tracer } from '../telemetry/index.js';
 import { splitWords } from '../tokenize/basic/word.js';
@@ -43,7 +43,13 @@ import { TTS, type TTSError } from '../tts/tts.js';
 import { Future, Task, cancelAndWait, waitFor } from '../utils.js';
 import { VAD, type VADEvent } from '../vad.js';
 import type { Agent, ModelSettings } from './agent.js';
-import {
+import {
+  StopResponse,
+  _getActivityTaskInfo,
+  _setActivityTaskInfo,
+  functionCallStorage,
+  speechHandleStorage,
+} from './agent.js';
 import { type AgentSession, type TurnDetectionMode } from './agent_session.js';
 import {
   AudioRecognition,
@@ -60,7 +66,7 @@ import {
   createSpeechCreatedEvent,
   createUserInputTranscribedEvent,
 } from './events.js';
-import type { ToolExecutionOutput, _TTSGenerationData } from './generation.js';
+import type { ToolExecutionOutput, ToolOutput, _TTSGenerationData } from './generation.js';
 import {
   type _AudioOut,
   type _TextOut,
@@ -74,8 +80,9 @@ import {
 } from './generation.js';
 import type { TimedString } from './io.js';
 import { SpeechHandle } from './speech_handle.js';
+import { setParticipantSpanAttributes } from './utils.js';
 
-const
+export const agentActivityStorage = new AsyncLocalStorage<AgentActivity>();
 
 interface PreemptiveGeneration {
   speechHandle: SpeechHandle;
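The new `agentActivityStorage` export replaces a module-private constant and is built on Node's `AsyncLocalStorage`, so code running inside a speech task can look up the owning `AgentActivity` without threading it through every call. A minimal sketch of the pattern (the `Activity` class and names below are illustrative, not the package's API):

```ts
import { AsyncLocalStorage } from 'node:async_hooks';

// Illustrative stand-in for the real AgentActivity class.
class Activity {
  constructor(readonly id: string) {}
}

const activityStorage = new AsyncLocalStorage<Activity>();

async function doWork(): Promise<void> {
  // getStore() returns whatever run() bound for this async call chain.
  console.log('running inside', activityStorage.getStore()?.id);
}

// run() scopes the store to everything started inside the callback,
// across awaits, timers, and promise chains.
void activityStorage.run(new Activity('activity-1'), () => doWork());
```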
@@ -88,31 +95,47 @@ interface PreemptiveGeneration {
 }
 
 export class AgentActivity implements RecognitionHooks {
+  agent: Agent;
+  agentSession: AgentSession;
+
   private static readonly REPLY_TASK_CANCEL_TIMEOUT = 5000;
+
   private started = false;
   private audioRecognition?: AudioRecognition;
   private realtimeSession?: RealtimeSession;
   private realtimeSpans?: Map<string, Span>; // Maps response_id to OTEL span for metrics recording
   private turnDetectionMode?: Exclude<TurnDetectionMode, _TurnDetector>;
   private logger = log();
-  private
+  private _schedulingPaused = true;
+  private _drainBlockedTasks: Task<any>[] = [];
   private _currentSpeech?: SpeechHandle;
   private speechQueue: Heap<[number, number, SpeechHandle]>; // [priority, timestamp, speechHandle]
   private q_updated: Future;
   private speechTasks: Set<Task<void>> = new Set();
   private lock = new Mutex();
-  private audioStream = new
+  private audioStream = new MultiInputStream<AudioFrame>();
+  private audioStreamId?: string;
+
   // default to null as None, which maps to the default provider tool choice value
   private toolChoice: ToolChoice | null = null;
   private _preemptiveGeneration?: PreemptiveGeneration;
 
-  agent: Agent;
-  agentSession: AgentSession;
-
   /** @internal */
   _mainTask?: Task<void>;
-
-
+  _onEnterTask?: Task<void>;
+  _onExitTask?: Task<void>;
+  _userTurnCompletedTask?: Task<void>;
+
+  private readonly onRealtimeGenerationCreated = (ev: GenerationCreatedEvent) =>
+    this.onGenerationCreated(ev);
+  private readonly onRealtimeInputSpeechStarted = (ev: InputSpeechStartedEvent) =>
+    this.onInputSpeechStarted(ev);
+  private readonly onRealtimeInputSpeechStopped = (ev: InputSpeechStoppedEvent) =>
+    this.onInputSpeechStopped(ev);
+  private readonly onRealtimeInputAudioTranscriptionCompleted = (ev: InputTranscriptionCompleted) =>
+    this.onInputAudioTranscriptionCompleted(ev);
+  private readonly onModelError = (ev: RealtimeModelError | STTError | TTSError | LLMError) =>
+    this.onError(ev);
   constructor(agent: Agent, agentSession: AgentSession) {
     this.agent = agent;
     this.agentSession = agentSession;
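The `private readonly onRealtime… = (ev) => …` fields in this hunk exist so one stable function reference can be handed to both `.on()` and `.off()`. An inline arrow such as the removed `(ev) => this.onError(ev)` allocates a fresh closure at the call site, so a later `.off()` with another arrow removes nothing. A small demonstration, with Node's `EventEmitter` standing in for the realtime session:

```ts
import { EventEmitter } from 'node:events';

const emitter = new EventEmitter();

// Leaky: two distinct closures, so off() matches nothing.
emitter.on('error', (err) => console.error(err));
emitter.off('error', (err) => console.error(err));
console.log(emitter.listenerCount('error')); // 1 -- the listener survived

// Fixed: a single stable reference, as the readonly class fields provide.
const onError = (err: unknown) => console.error(err);
emitter.on('error', onError);
emitter.off('error', onError);
console.log(emitter.listenerCount('error')); // 0
```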
@@ -132,7 +155,7 @@ export class AgentActivity implements RecognitionHooks {
 
     if (this.turnDetectionMode === 'vad' && this.vad === undefined) {
       this.logger.warn(
-        'turnDetection is set to "vad", but no VAD model is provided, ignoring the
+        'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDetection setting',
       );
       this.turnDetectionMode = undefined;
     }
@@ -210,117 +233,138 @@ export class AgentActivity implements RecognitionHooks {
   async start(): Promise<void> {
     const unlock = await this.lock.lock();
     try {
-
-
-
-
-
-      });
+      await this._startSession({ spanName: 'start_agent_activity', runOnEnter: true });
+    } finally {
+      unlock();
+    }
+  }
 
-
+  async resume(): Promise<void> {
+    const unlock = await this.lock.lock();
+    try {
+      await this._startSession({ spanName: 'resume_agent_activity', runOnEnter: false });
+    } finally {
+      unlock();
+    }
+  }
 
-
-
-
-
-
-
-
-
-
-      this.realtimeSession.on('error', (ev) => this.onError(ev));
-
-      removeInstructions(this.agent._chatCtx);
-      try {
-        await this.realtimeSession.updateInstructions(this.agent.instructions);
-      } catch (error) {
-        this.logger.error(error, 'failed to update the instructions');
-      }
+  private async _startSession(options: {
+    spanName: 'start_agent_activity' | 'resume_agent_activity';
+    runOnEnter: boolean;
+  }): Promise<void> {
+    const { spanName, runOnEnter } = options;
+    const startSpan = tracer.startSpan({
+      name: spanName,
+      attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
+      context: ROOT_CONTEXT,
+    });
 
-
-        await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
-      } catch (error) {
-        this.logger.error(error, 'failed to update the chat context');
-      }
+    this.agent._agentActivity = this;
 
-
-
-
-
-
+    if (this.llm instanceof RealtimeModel) {
+      this.realtimeSession = this.llm.session();
+      this.realtimeSpans = new Map<string, Span>();
+      this.realtimeSession.on('generation_created', this.onRealtimeGenerationCreated);
+      this.realtimeSession.on('input_speech_started', this.onRealtimeInputSpeechStarted);
+      this.realtimeSession.on('input_speech_stopped', this.onRealtimeInputSpeechStopped);
+      this.realtimeSession.on(
+        'input_audio_transcription_completed',
+        this.onRealtimeInputAudioTranscriptionCompleted,
+      );
+      this.realtimeSession.on('metrics_collected', this.onMetricsCollected);
+      this.realtimeSession.on('error', this.onModelError);
 
-
-
-
-
-
-        );
-      }
-    } else if (this.llm instanceof LLM) {
-      try {
-        updateInstructions({
-          chatCtx: this.agent._chatCtx,
-          instructions: this.agent.instructions,
-          addIfMissing: true,
-        });
-      } catch (error) {
-        this.logger.error('failed to update the instructions', error);
-      }
+      removeInstructions(this.agent._chatCtx);
+      try {
+        await this.realtimeSession.updateInstructions(this.agent.instructions);
+      } catch (error) {
+        this.logger.error(error, 'failed to update the instructions');
       }
 
-
-
-
-      this.
+      try {
+        await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
+      } catch (error) {
+        this.logger.error(error, 'failed to update the chat context');
       }
 
-
-      this.
-
+      try {
+        await this.realtimeSession.updateTools(this.tools);
+      } catch (error) {
+        this.logger.error(error, 'failed to update the tools');
       }
 
-      if (this.tts
-        this.
-
+      if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
+        this.logger.error(
+          'audio output is enabled but RealtimeModel has no audio modality ' +
+            'and no TTS is set. Either enable audio modality in the RealtimeModel ' +
+            'or set a TTS model.',
+        );
       }
-
-
-
+    } else if (this.llm instanceof LLM) {
+      try {
+        updateInstructions({
+          chatCtx: this.agent._chatCtx,
+          instructions: this.agent.instructions,
+          addIfMissing: true,
+        });
+      } catch (error) {
+        this.logger.error('failed to update the instructions', error);
       }
+    }
 
-
-
-
-
-
-        turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
-        turnDetectionMode: this.turnDetectionMode,
-        minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
-        maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
-        rootSpanContext: this.agentSession.rootSpanContext,
-      });
-      this.audioRecognition.start();
-      this.started = true;
+    // metrics and error handling
+    if (this.llm instanceof LLM) {
+      this.llm.on('metrics_collected', this.onMetricsCollected);
+      this.llm.on('error', this.onModelError);
+    }
 
-
+    if (this.stt instanceof STT) {
+      this.stt.on('metrics_collected', this.onMetricsCollected);
+      this.stt.on('error', this.onModelError);
+    }
 
-
-
-
-
-      attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
-      });
+    if (this.tts instanceof TTS) {
+      this.tts.on('metrics_collected', this.onMetricsCollected);
+      this.tts.on('error', this.onModelError);
+    }
 
-
-
+    if (this.vad instanceof VAD) {
+      this.vad.on('metrics_collected', this.onMetricsCollected);
+    }
+
+    this.audioRecognition = new AudioRecognition({
+      recognitionHooks: this,
+      // Disable stt node if stt is not provided
+      stt: this.stt ? (...args) => this.agent.sttNode(...args) : undefined,
+      vad: this.vad,
+      turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
+      turnDetectionMode: this.turnDetectionMode,
+      minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
+      maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
+      rootSpanContext: this.agentSession.rootSpanContext,
+      sttModel: this.stt?.label,
+      sttProvider: this.getSttProvider(),
+      getLinkedParticipant: () => this.agentSession._roomIO?.linkedParticipant,
+    });
+    this.audioRecognition.start();
+    this.started = true;
+
+    this._resumeSchedulingTask();
+
+    if (runOnEnter) {
+      this._onEnterTask = this.createSpeechTask({
+        taskFn: () =>
+          tracer.startActiveSpan(async () => this.agent.onEnter(), {
+            name: 'on_enter',
+            context: trace.setSpan(ROOT_CONTEXT, startSpan),
+            attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
+          }),
+        inlineTask: true,
         name: 'AgentActivity_onEnter',
       });
-
-      startSpan.end();
-    } finally {
-      unlock();
     }
+
+    startSpan.end();
   }
 
   get currentSpeech(): SpeechHandle | undefined {
@@ -335,6 +379,17 @@ export class AgentActivity implements RecognitionHooks {
     return this.agent.stt || this.agentSession.stt;
   }
 
+  private getSttProvider(): string | undefined {
+    const label = this.stt?.label;
+    if (!label) {
+      return undefined;
+    }
+
+    // Heuristic: most labels look like "<provider>-<model>"
+    const [provider] = label.split('-', 1);
+    return provider || label;
+  }
+
   get llm(): LLM | RealtimeModel | undefined {
     return this.agent.llm || this.agentSession.llm;
   }
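`getSttProvider` relies on `split` with a limit: `label.split('-', 1)` truncates the result array to one element rather than stopping the split early, so only the first dash-delimited segment is kept. For illustration (the label values are hypothetical):

```ts
const provider = (label: string) => label.split('-', 1)[0] || label;

console.log(provider('deepgram-nova-2')); // "deepgram"
console.log(provider('assemblyai')); // "assemblyai" (no dash: the whole label)
```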
@@ -347,8 +402,8 @@ export class AgentActivity implements RecognitionHooks {
     return this.agent.toolCtx;
   }
 
-  get
-    return this.
+  get schedulingPaused(): boolean {
+    return this._schedulingPaused;
   }
 
   get realtimeLLMSession(): RealtimeSession | undefined {
@@ -402,18 +457,10 @@ export class AgentActivity implements RecognitionHooks {
   }
 
   attachAudioInput(audioStream: ReadableStream<AudioFrame>): void {
-
-
-      this.audioStream.detachSource();
-    }
+    void this.audioStream.close();
+    this.audioStream = new MultiInputStream<AudioFrame>();
 
-
-     * We need to add a deferred ReadableStream layer on top of the audioStream from the agent session.
-     * The tee() operation should be applied to the deferred stream, not the original audioStream.
-     * This is important because teeing the original stream directly makes it very difficult—if not
-     * impossible—to implement stream unlock logic cleanly.
-     */
-    this.audioStream.setSource(audioStream);
+    this.audioStreamId = this.audioStream.addInputStream(audioStream);
     const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
 
     if (this.realtimeSession) {
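`attachAudioInput` now closes any previous `MultiInputStream`, builds a fresh one, and remembers the returned input id so `detachAudioInput` can tell whether an input is attached. The real implementation lives in `package/src/stream/multi_input_stream.ts`; the sketch below only mirrors the shape implied by these call sites (`addInputStream`, `close`, a `stream` property) and is not the package's code:

```ts
// A minimal funnel: several ReadableStreams in, one ReadableStream out.
class MiniMultiInputStream<T> {
  private controller!: ReadableStreamDefaultController<T>;
  private nextId = 0;

  // Consumers read (and may tee) this single output stream.
  readonly stream = new ReadableStream<T>({
    start: (c) => {
      this.controller = c;
    },
  });

  addInputStream(input: ReadableStream<T>): string {
    const id = `input-${this.nextId++}`;
    void (async () => {
      const reader = input.getReader();
      for (;;) {
        const { done, value } = await reader.read();
        if (done) break;
        this.controller.enqueue(value); // forward items into the shared output
      }
    })();
    return id;
  }

  async close(): Promise<void> {
    this.controller.close();
  }
}
```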
@@ -426,16 +473,29 @@ export class AgentActivity implements RecognitionHooks {
   }
 
   detachAudioInput(): void {
-    this.
+    if (this.audioStreamId === undefined) {
+      return;
+    }
+
+    void this.audioStream.close();
+    this.audioStream = new MultiInputStream<AudioFrame>();
+    this.audioStreamId = undefined;
   }
 
-  commitUserTurn(
+  commitUserTurn(
+    options: {
+      audioDetached?: boolean;
+      throwIfNotReady?: boolean;
+    } = {},
+  ) {
+    const { audioDetached = false, throwIfNotReady = true } = options;
     if (!this.audioRecognition) {
-
+      if (throwIfNotReady) {
+        throw new Error('AudioRecognition is not initialized');
+      }
+      return;
     }
 
-    // TODO(brian): add audio_detached flag
-    const audioDetached = false;
     this.audioRecognition.commitUserTurn(audioDetached);
   }
 
@@ -493,14 +553,13 @@ export class AgentActivity implements RecognitionHooks {
       }),
     );
     const task = this.createSpeechTask({
-
+      taskFn: (abortController: AbortController) =>
         this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio),
-      ),
       ownedSpeechHandle: handle,
       name: 'AgentActivity.say_tts',
     });
 
-    task.finally(() => this.onPipelineReplyDone());
+    task.result.finally(() => this.onPipelineReplyDone());
    this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
     return handle;
   }
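Note the repeated `task.finally(...)` → `task.result.finally(...)` changes in this and later hunks: `Task` is evidently no longer chained on directly, and completion is observed through a `result` promise. A simplified sketch of that shape (the real `Task` is in `package/src/utils.ts`):

```ts
class MiniTask<T> {
  // Completion is exposed as a plain promise instead of making the task thenable.
  readonly result: Promise<T>;

  constructor(
    fn: (signal: AbortSignal) => Promise<T>,
    private ctrl: AbortController = new AbortController(),
  ) {
    this.result = fn(this.ctrl.signal);
  }

  cancel(): void {
    this.ctrl.abort();
  }
}

const task = new MiniTask(async () => 42);
// Callers chain on task.result rather than on the task object itself.
void task.result.finally(() => console.log('task settled'));
```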
@@ -613,9 +672,9 @@ export class AgentActivity implements RecognitionHooks {
       return;
     }
 
-    if (this.
+    if (this.schedulingPaused) {
       // TODO(shubhra): should we "forward" this new turn to the next agent?
-      this.logger.warn('skipping new realtime generation, the
+      this.logger.warn('skipping new realtime generation, the speech scheduling is not running');
       return;
     }
 
@@ -633,9 +692,8 @@ export class AgentActivity implements RecognitionHooks {
     this.logger.info({ speech_id: handle.id }, 'Creating speech handle');
 
     this.createSpeechTask({
-
+      taskFn: (abortController: AbortController) =>
         this.realtimeGenerationTask(handle, ev, {}, abortController),
-      ),
       ownedSpeechHandle: handle,
       name: 'AgentActivity.realtimeGeneration',
     });
@@ -767,7 +825,7 @@ export class AgentActivity implements RecognitionHooks {
   onPreemptiveGeneration(info: PreemptiveGenerationInfo): void {
     if (
       !this.agentSession.options.preemptiveGeneration ||
-      this.
+      this.schedulingPaused ||
       (this._currentSpeech !== undefined && !this._currentSpeech.interrupted) ||
       !(this.llm instanceof LLM)
     ) {
@@ -814,11 +872,32 @@ export class AgentActivity implements RecognitionHooks {
   }
 
   private createSpeechTask(options: {
-
+    taskFn: (controller: AbortController) => Promise<void>;
+    controller?: AbortController;
     ownedSpeechHandle?: SpeechHandle;
+    inlineTask?: boolean;
     name?: string;
-  }):
-    const {
+  }): Task<void> {
+    const { taskFn, controller, ownedSpeechHandle, inlineTask, name } = options;
+
+    const wrappedFn = (ctrl: AbortController) => {
+      return agentActivityStorage.run(this, () => {
+        // Mark inline/speech metadata at task runtime to avoid a race where taskFn executes
+        // before post-construction metadata is attached to the Task instance.
+        const currentTask = Task.current();
+        if (currentTask) {
+          _setActivityTaskInfo(currentTask, { speechHandle: ownedSpeechHandle, inlineTask });
+        }
+
+        if (ownedSpeechHandle) {
+          return speechHandleStorage.run(ownedSpeechHandle, () => taskFn(ctrl));
+        }
+        return taskFn(ctrl);
+      });
+    };
+
+    const task = Task.from(wrappedFn, controller, name);
+    _setActivityTaskInfo(task, { speechHandle: ownedSpeechHandle, inlineTask });
 
     this.speechTasks.add(task);
     task.addDoneCallback(() => {
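The new `wrappedFn` nests two context scopes: every speech task runs under `agentActivityStorage.run(this, …)`, and tasks that own a handle additionally run under `speechHandleStorage.run(handle, …)`. Nested `AsyncLocalStorage.run` calls compose without interfering; a small illustration with stand-in string stores:

```ts
import { AsyncLocalStorage } from 'node:async_hooks';

const activityStore = new AsyncLocalStorage<string>();
const handleStore = new AsyncLocalStorage<string>();

function runSpeechTask(handle: string | undefined, fn: () => void): void {
  activityStore.run('activity-A', () => {
    if (handle) {
      // The inner run() adds a second store without disturbing the outer one.
      return handleStore.run(handle, fn);
    }
    return fn();
  });
}

runSpeechTask('speech-1', () => {
  console.log(activityStore.getStore()); // "activity-A"
  console.log(handleStore.getStore()); // "speech-1"
});
```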
@@ -838,13 +917,16 @@ export class AgentActivity implements RecognitionHooks {
       this.wakeupMainTask();
     });
 
-    return task
+    return task;
   }
 
   async onEndOfTurn(info: EndOfTurnInfo): Promise<boolean> {
-    if (this.
+    if (this.schedulingPaused) {
       this.cancelPreemptiveGeneration();
-      this.logger.warn(
+      this.logger.warn(
+        { user_input: info.newTranscript },
+        'skipping user input, speech scheduling is paused',
+      );
       // TODO(shubhra): should we "forward" this new turn to the next agent/activity?
       return true;
     }
@@ -877,7 +959,7 @@ export class AgentActivity implements RecognitionHooks {
 
     const oldTask = this._userTurnCompletedTask;
     this._userTurnCompletedTask = this.createSpeechTask({
-
+      taskFn: () => this.userTurnCompleted(info, oldTask),
       name: 'AgentActivity.userTurnCompleted',
     });
     return true;
@@ -913,10 +995,12 @@ export class AgentActivity implements RecognitionHooks {
         this._currentSpeech = undefined;
       }
 
-      //
-      //
-
-
+      // if we're draining/pausing and there are no more speech tasks, we can exit.
+      // only speech tasks can bypass draining to create a tool response (see scheduleSpeech)
+      const toWait = this.getDrainPendingSpeechTasks();
+
+      if (this._schedulingPaused && toWait.length === 0) {
+        this.logger.info('mainTask: scheduling paused and no more speech tasks to wait');
         break;
       }
 
@@ -926,6 +1010,39 @@ export class AgentActivity implements RecognitionHooks {
     this.logger.info('AgentActivity mainTask: exiting');
   }
 
+  private getDrainPendingSpeechTasks(): Task<void>[] {
+    const blockedHandles: SpeechHandle[] = [];
+
+    for (const task of this._drainBlockedTasks) {
+      const info = _getActivityTaskInfo(task);
+      if (!info) {
+        this.logger.error('blocked task without activity info; skipping.');
+        continue;
+      }
+
+      if (!info.speechHandle) {
+        continue; // onEnter/onExit
+      }
+
+      blockedHandles.push(info.speechHandle);
+    }
+
+    const toWait: Task<void>[] = [];
+    for (const task of this.speechTasks) {
+      if (this._drainBlockedTasks.includes(task)) {
+        continue;
+      }
+
+      const info = _getActivityTaskInfo(task);
+      if (info && info.speechHandle && blockedHandles.includes(info.speechHandle)) {
+        continue;
+      }
+
+      toWait.push(task);
+    }
+    return toWait;
+  }
+
   private wakeupMainTask(): void {
     this.q_updated.resolve();
   }
@@ -967,7 +1084,7 @@ export class AgentActivity implements RecognitionHooks {
       throw new Error('trying to generate reply without an LLM model');
     }
 
-    const functionCall =
+    const functionCall = functionCallStorage.getStore()?.functionCall;
     if (toolChoice === undefined && functionCall !== undefined) {
       // when generateReply is called inside a tool, set toolChoice to 'none' by default
       toolChoice = 'none';
@@ -989,7 +1106,7 @@ export class AgentActivity implements RecognitionHooks {
 
     if (this.llm instanceof RealtimeModel) {
       this.createSpeechTask({
-
+        taskFn: (abortController: AbortController) =>
           this.realtimeReplyTask({
             speechHandle: handle,
             // TODO(brian): support llm.ChatMessage for the realtime model
@@ -1001,7 +1118,6 @@ export class AgentActivity implements RecognitionHooks {
             },
             abortController,
           }),
-        ),
         ownedSpeechHandle: handle,
         name: 'AgentActivity.realtimeReply',
       });
@@ -1014,7 +1130,7 @@ export class AgentActivity implements RecognitionHooks {
     }
 
     const task = this.createSpeechTask({
-
+      taskFn: (abortController: AbortController) =>
         this.pipelineReplyTask(
           handle,
           chatCtx ?? this.agent.chatCtx,
@@ -1026,12 +1142,11 @@ export class AgentActivity implements RecognitionHooks {
           instructions,
           userMessage,
         ),
-      ),
       ownedSpeechHandle: handle,
       name: 'AgentActivity.pipelineReply',
     });
 
-    task.finally(() => this.onPipelineReplyDone());
+    task.result.finally(() => this.onPipelineReplyDone());
     }
 
     if (scheduleSpeech) {
@@ -1040,16 +1155,19 @@ export class AgentActivity implements RecognitionHooks {
     return handle;
   }
 
-  interrupt(): Future<void> {
+  interrupt(options: { force?: boolean } = {}): Future<void> {
+    const { force = false } = options;
+    this.cancelPreemptiveGeneration();
+
     const future = new Future<void>();
     const currentSpeech = this._currentSpeech;
 
     //TODO(AJS-273): add interrupt for background speeches
 
-    currentSpeech?.interrupt();
+    currentSpeech?.interrupt(force);
 
     for (const [_, __, speech] of this.speechQueue) {
-      speech.interrupt();
+      speech.interrupt(force);
     }
 
     this.realtimeSession?.interrupt();
@@ -1072,13 +1190,13 @@ export class AgentActivity implements RecognitionHooks {
       }
     }
 
-  private async userTurnCompleted(info: EndOfTurnInfo, oldTask?:
+  private async userTurnCompleted(info: EndOfTurnInfo, oldTask?: Task<void>): Promise<void> {
     if (oldTask) {
       // We never cancel user code as this is very confusing.
       // So we wait for the old execution of onUserTurnCompleted to finish.
       // In practice this is OK because most speeches will be interrupted if a new turn
       // is detected. So the previous execution should complete quickly.
-      await oldTask;
+      await oldTask.result;
     }
 
     // When the audio recognition detects the end of a user turn:
@@ -1355,6 +1473,11 @@ export class AgentActivity implements RecognitionHooks {
       span.setAttribute(traceTypes.ATTR_USER_INPUT, newMessage.textContent || '');
     }
 
+    const localParticipant = this.agentSession._roomIO?.localParticipant;
+    if (localParticipant) {
+      setParticipantSpanAttributes(span, localParticipant);
+    }
+
     speechHandleStorage.enterWith(speechHandle);
 
     const audioOutput = this.agentSession.output.audioEnabled
@@ -1531,13 +1654,15 @@ export class AgentActivity implements RecognitionHooks {
       for (const msg of toolsMessages) {
         msg.createdAt = replyStartedAt;
       }
-
-      //
-      //
+      // Only insert FunctionCallOutput items into agent._chatCtx since FunctionCall items
+      // were already added by onToolExecutionStarted when the tool execution began.
+      // Inserting function_calls again would create duplicates that break provider APIs
+      // (e.g. Google's "function response parts != function call parts" error).
       const toolCallOutputs = toolsMessages.filter(
         (m): m is FunctionCallOutput => m.type === 'function_call_output',
       );
       if (toolCallOutputs.length > 0) {
+        this.agent._chatCtx.insert(toolCallOutputs);
         this.agentSession._toolItemsAdded(toolCallOutputs);
       }
     }
@@ -1645,52 +1770,18 @@ export class AgentActivity implements RecognitionHooks {
       return;
     }
 
-    const functionToolsExecutedEvent =
-
-      functionCallOutputs: [],
-    });
-    let shouldGenerateToolReply: boolean = false;
-    let newAgentTask: Agent | null = null;
-    let ignoreTaskSwitch: boolean = false;
-
-    for (const sanitizedOut of toolOutput.output) {
-      if (sanitizedOut.toolCallOutput !== undefined) {
-        functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
-        functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
-        if (sanitizedOut.replyRequired) {
-          shouldGenerateToolReply = true;
-        }
-      }
-
-      if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
-        this.logger.error('expected to receive only one agent task from the tool executions');
-        ignoreTaskSwitch = true;
-        // TODO(brian): should we mark the function call as failed to notify the LLM?
-      }
-
-      newAgentTask = sanitizedOut.agentTask ?? null;
-
-      this.logger.debug(
-        {
-          speechId: speechHandle.id,
-          name: sanitizedOut.toolCall?.name,
-          args: sanitizedOut.toolCall.args,
-          output: sanitizedOut.toolCallOutput?.output,
-          isError: sanitizedOut.toolCallOutput?.isError,
-        },
-        'Tool call execution finished',
-      );
-    }
+    const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } =
+      this.summarizeToolExecutionOutput(toolOutput, speechHandle);
 
     this.agentSession.emit(
       AgentSessionEventTypes.FunctionToolsExecuted,
       functionToolsExecutedEvent,
     );
 
-    let
+    let schedulingPaused = this.schedulingPaused;
     if (!ignoreTaskSwitch && newAgentTask !== null) {
       this.agentSession.updateAgent(newAgentTask);
-
+      schedulingPaused = true;
     }
 
     const toolMessages = [
@@ -1705,11 +1796,12 @@ export class AgentActivity implements RecognitionHooks {
 
       // Avoid setting tool_choice to "required" or a specific function when
       // passing tool response back to the LLM
-      const respondToolChoice =
+      const respondToolChoice =
+        schedulingPaused || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
 
       // Reuse same speechHandle for tool response (parity with Python agent_activity.py L2122-2140)
       const toolResponseTask = this.createSpeechTask({
-
+        taskFn: () =>
           this.pipelineReplyTask(
             speechHandle,
             chatCtx,
@@ -1720,12 +1812,11 @@ export class AgentActivity implements RecognitionHooks {
             undefined,
             toolMessages,
           ),
-        ),
         ownedSpeechHandle: speechHandle,
         name: 'AgentActivity.pipelineReply',
       });
 
-      toolResponseTask.finally(() => this.onPipelineReplyDone());
+      toolResponseTask.result.finally(() => this.onPipelineReplyDone());
 
       this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
     } else if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
@@ -1733,15 +1824,12 @@ export class AgentActivity implements RecognitionHooks {
         msg.createdAt = replyStartedAt;
       }
 
-      this.agent._chatCtx.insert(toolMessages);
-
-      // Only add FunctionCallOutput items to session history since FunctionCall items
-      // were already added by onToolExecutionStarted when the tool execution began
       const toolCallOutputs = toolMessages.filter(
         (m): m is FunctionCallOutput => m.type === 'function_call_output',
       );
 
       if (toolCallOutputs.length > 0) {
+        this.agent._chatCtx.insert(toolCallOutputs);
         this.agentSession._toolItemsAdded(toolCallOutputs);
       }
     }
@@ -1815,6 +1903,11 @@ export class AgentActivity implements RecognitionHooks {
 
     span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
 
+    const localParticipant = this.agentSession._roomIO?.localParticipant;
+    if (localParticipant) {
+      setParticipantSpanAttributes(span, localParticipant);
+    }
+
     speechHandleStorage.enterWith(speechHandle);
 
     if (!this.realtimeSession) {
@@ -2139,50 +2232,18 @@ export class AgentActivity implements RecognitionHooks {
       return;
     }
 
-    const functionToolsExecutedEvent =
-
-      functionCallOutputs: [],
-    });
-    let shouldGenerateToolReply: boolean = false;
-    let newAgentTask: Agent | null = null;
-    let ignoreTaskSwitch: boolean = false;
-
-    for (const sanitizedOut of toolOutput.output) {
-      if (sanitizedOut.toolCallOutput !== undefined) {
-        functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
-        if (sanitizedOut.replyRequired) {
-          shouldGenerateToolReply = true;
-        }
-      }
-
-      if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
-        this.logger.error('expected to receive only one agent task from the tool executions');
-        ignoreTaskSwitch = true;
-      }
-
-      newAgentTask = sanitizedOut.agentTask ?? null;
-
-      this.logger.debug(
-        {
-          speechId: speechHandle.id,
-          name: sanitizedOut.toolCall?.name,
-          args: sanitizedOut.toolCall.args,
-          output: sanitizedOut.toolCallOutput?.output,
-          isError: sanitizedOut.toolCallOutput?.isError,
-        },
-        'Tool call execution finished',
-      );
-    }
+    const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } =
+      this.summarizeToolExecutionOutput(toolOutput, speechHandle);
 
     this.agentSession.emit(
       AgentSessionEventTypes.FunctionToolsExecuted,
       functionToolsExecutedEvent,
     );
 
-    let
+    let schedulingPaused = this.schedulingPaused;
     if (!ignoreTaskSwitch && newAgentTask !== null) {
       this.agentSession.updateAgent(newAgentTask);
-
+      schedulingPaused = true;
     }
 
     if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
@@ -2238,15 +2299,14 @@ export class AgentActivity implements RecognitionHooks {
         }),
       );
 
-      const toolChoice =
+      const toolChoice = schedulingPaused || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
       this.createSpeechTask({
-
+        taskFn: (abortController: AbortController) =>
          this.realtimeReplyTask({
            speechHandle: replySpeechHandle,
            modelSettings: { toolChoice },
            abortController,
          }),
-        ),
         ownedSpeechHandle: replySpeechHandle,
         name: 'AgentActivity.realtime_reply',
       });
@@ -2254,6 +2314,53 @@ export class AgentActivity implements RecognitionHooks {
     this.scheduleSpeech(replySpeechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
   }
 
+  private summarizeToolExecutionOutput(toolOutput: ToolOutput, speechHandle: SpeechHandle) {
+    const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
+      functionCalls: [],
+      functionCallOutputs: [],
+    });
+
+    let shouldGenerateToolReply = false;
+    let newAgentTask: Agent | null = null;
+    let ignoreTaskSwitch = false;
+
+    for (const sanitizedOut of toolOutput.output) {
+      if (sanitizedOut.toolCallOutput !== undefined) {
+        // Keep event payload symmetric for pipeline + realtime paths.
+        functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
+        functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
+        if (sanitizedOut.replyRequired) {
+          shouldGenerateToolReply = true;
+        }
+      }
+
+      if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
+        this.logger.error('expected to receive only one agent task from the tool executions');
+        ignoreTaskSwitch = true;
+      }
+
+      newAgentTask = sanitizedOut.agentTask ?? null;
+
+      this.logger.debug(
+        {
+          speechId: speechHandle.id,
+          name: sanitizedOut.toolCall?.name,
+          args: sanitizedOut.toolCall.args,
+          output: sanitizedOut.toolCallOutput?.output,
+          isError: sanitizedOut.toolCallOutput?.isError,
+        },
+        'Tool call execution finished',
+      );
+    }
+
+    return {
+      functionToolsExecutedEvent,
+      shouldGenerateToolReply,
+      newAgentTask,
+      ignoreTaskSwitch,
+    };
+  }
+
   private async realtimeReplyTask({
     speechHandle,
     modelSettings: { toolChoice },
@@ -2312,10 +2419,10 @@ export class AgentActivity implements RecognitionHooks {
     priority: number,
     force: boolean = false,
   ): void {
-    // when force=true, we allow tool responses to bypass
+    // when force=true, we allow tool responses to bypass scheduling pause
     // This allows for tool responses to be generated before the AgentActivity is finalized
-    if (this.
-      throw new Error('cannot schedule new speech, the
+    if (this.schedulingPaused && !force) {
+      throw new Error('cannot schedule new speech, the speech scheduling is draining/pausing');
     }
 
     // Monotonic time to avoid near 0 collisions
@@ -2324,6 +2431,48 @@ export class AgentActivity implements RecognitionHooks {
     this.wakeupMainTask();
   }
 
+  private async _pauseSchedulingTask(blockedTasks: Task<any>[]): Promise<void> {
+    if (this._schedulingPaused) return;
+
+    this._schedulingPaused = true;
+    this._drainBlockedTasks = blockedTasks;
+    this.wakeupMainTask();
+
+    if (this._mainTask) {
+      // When pausing/draining, we ensure that all speech_tasks complete fully.
+      // This means that even if the SpeechHandle themselves have finished,
+      // we still wait for the entire execution (e.g function_tools)
+      await this._mainTask.result;
+    }
+  }
+
+  private _resumeSchedulingTask(): void {
+    if (!this._schedulingPaused) return;
+
+    this._schedulingPaused = false;
+    this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
+  }
+
+  async pause(options: { blockedTasks?: Task<any>[] } = {}): Promise<void> {
+    const { blockedTasks = [] } = options;
+    const unlock = await this.lock.lock();
+
+    try {
+      const span = tracer.startSpan({
+        name: 'pause_agent_activity',
+        attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
+      });
+      try {
+        await this._pauseSchedulingTask(blockedTasks);
+        await this._closeSessionResources();
+      } finally {
+        span.end();
+      }
+    } finally {
+      unlock();
+    }
+  }
+
   async drain(): Promise<void> {
     // Create drain_agent_activity as a ROOT span (new trace) to match Python behavior
     return tracer.startActiveSpan(async (span) => this._drainImpl(span), {
|
|
|
2337
2486
|
|
|
2338
2487
|
const unlock = await this.lock.lock();
|
|
2339
2488
|
try {
|
|
2340
|
-
if (this.
|
|
2341
|
-
|
|
2342
|
-
this.cancelPreemptiveGeneration();
|
|
2489
|
+
if (this._schedulingPaused) return;
|
|
2343
2490
|
|
|
2344
|
-
|
|
2345
|
-
|
|
2346
|
-
|
|
2347
|
-
|
|
2348
|
-
|
|
2349
|
-
|
|
2350
|
-
|
|
2491
|
+
this._onExitTask = this.createSpeechTask({
|
|
2492
|
+
taskFn: () =>
|
|
2493
|
+
tracer.startActiveSpan(async () => this.agent.onExit(), {
|
|
2494
|
+
name: 'on_exit',
|
|
2495
|
+
attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
|
|
2496
|
+
}),
|
|
2497
|
+
inlineTask: true,
|
|
2351
2498
|
name: 'AgentActivity_onExit',
|
|
2352
2499
|
});
|
|
2353
2500
|
|
|
2354
|
-
this.
|
|
2355
|
-
|
|
2356
|
-
await this.
|
|
2501
|
+
this.cancelPreemptiveGeneration();
|
|
2502
|
+
|
|
2503
|
+
await this._onExitTask.result;
|
|
2504
|
+
await this._pauseSchedulingTask([]);
|
|
2357
2505
|
} finally {
|
|
2358
2506
|
unlock();
|
|
2359
2507
|
}
|
|
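The final hunk below folds `close()`'s inline teardown into a shared `_closeSessionResources()` that mirrors the subscription block in `_startSession` call for call (`on` paired with `off`, same stable handler fields, now including the `error` handlers), so `pause()` and `close()` reuse one path without leaking listeners or double-counting metrics. One way to keep such symmetry honest is a helper that returns its own teardown; a hypothetical sketch:

```ts
import { EventEmitter } from 'node:events';

// Registers listeners and returns the matching teardown, so on/off cannot drift apart.
function subscribe(
  emitter: EventEmitter,
  handlers: Record<string, (...args: unknown[]) => void>,
): () => void {
  for (const [event, handler] of Object.entries(handlers)) emitter.on(event, handler);
  return () => {
    for (const [event, handler] of Object.entries(handlers)) emitter.off(event, handler);
  };
}

const session = new EventEmitter();
const unsubscribe = subscribe(session, {
  metrics_collected: (m) => console.log('metrics', m),
  error: (e) => console.error('session error', e),
});
// ...later, during close():
unsubscribe();
```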
@@ -2362,44 +2510,59 @@ export class AgentActivity implements RecognitionHooks {
   async close(): Promise<void> {
     const unlock = await this.lock.lock();
     try {
-      if (!this._draining) {
-        this.logger.warn('task closing without draining');
-      }
-
       this.cancelPreemptiveGeneration();
-
-
-
-
-      if (this.realtimeSession) {
-        this.realtimeSession.off('generation_created', this.onGenerationCreated);
-        this.realtimeSession.off('input_speech_started', this.onInputSpeechStarted);
-        this.realtimeSession.off('input_speech_stopped', this.onInputSpeechStopped);
-        this.realtimeSession.off(
-          'input_audio_transcription_completed',
-          this.onInputAudioTranscriptionCompleted,
-        );
-        this.realtimeSession.off('metrics_collected', this.onMetricsCollected);
-      }
-      if (this.stt instanceof STT) {
-        this.stt.off('metrics_collected', this.onMetricsCollected);
-      }
-      if (this.tts instanceof TTS) {
-        this.tts.off('metrics_collected', this.onMetricsCollected);
-      }
-      if (this.vad instanceof VAD) {
-        this.vad.off('metrics_collected', this.onMetricsCollected);
+      await this._closeSessionResources();
+
+      if (this._mainTask) {
+        await this._mainTask.cancelAndWait();
       }
 
-      this.
-      this.realtimeSpans?.clear();
-      await this.realtimeSession?.close();
-      await this.audioRecognition?.close();
-      await this._mainTask?.cancelAndWait();
+      this.agent._agentActivity = undefined;
     } finally {
       unlock();
     }
   }
+
+  private async _closeSessionResources(): Promise<void> {
+    // Unregister event handlers to prevent duplicate metrics
+    if (this.llm instanceof LLM) {
+      this.llm.off('metrics_collected', this.onMetricsCollected);
+      this.llm.off('error', this.onModelError);
+    }
+
+    if (this.realtimeSession) {
+      this.realtimeSession.off('generation_created', this.onRealtimeGenerationCreated);
+      this.realtimeSession.off('input_speech_started', this.onRealtimeInputSpeechStarted);
+      this.realtimeSession.off('input_speech_stopped', this.onRealtimeInputSpeechStopped);
+      this.realtimeSession.off(
+        'input_audio_transcription_completed',
+        this.onRealtimeInputAudioTranscriptionCompleted,
+      );
+      this.realtimeSession.off('metrics_collected', this.onMetricsCollected);
+      this.realtimeSession.off('error', this.onModelError);
+    }
+
+    if (this.stt instanceof STT) {
+      this.stt.off('metrics_collected', this.onMetricsCollected);
+      this.stt.off('error', this.onModelError);
+    }
+
+    if (this.tts instanceof TTS) {
+      this.tts.off('metrics_collected', this.onMetricsCollected);
+      this.tts.off('error', this.onModelError);
+    }
+
+    if (this.vad instanceof VAD) {
+      this.vad.off('metrics_collected', this.onMetricsCollected);
+    }
+
+    this.detachAudioInput();
+    this.realtimeSpans?.clear();
+    await this.realtimeSession?.close();
+    await this.audioRecognition?.close();
+    this.realtimeSession = undefined;
+    this.audioRecognition = undefined;
+  }
 }
 
 function toOaiToolChoice(toolChoice: ToolChoice | null): ToolChoice | undefined {