@livekit/agents 1.0.46 → 1.0.47
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.cjs +14 -20
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +14 -20
- package/dist/cli.js.map +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs +14 -5
- package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/job_proc_lazy_main.js +14 -5
- package/dist/ipc/job_proc_lazy_main.js.map +1 -1
- package/dist/llm/chat_context.cjs +19 -0
- package/dist/llm/chat_context.cjs.map +1 -1
- package/dist/llm/chat_context.d.cts +4 -0
- package/dist/llm/chat_context.d.ts +4 -0
- package/dist/llm/chat_context.d.ts.map +1 -1
- package/dist/llm/chat_context.js +19 -0
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/provider_format/index.cjs +2 -0
- package/dist/llm/provider_format/index.cjs.map +1 -1
- package/dist/llm/provider_format/index.d.cts +1 -1
- package/dist/llm/provider_format/index.d.ts +1 -1
- package/dist/llm/provider_format/index.d.ts.map +1 -1
- package/dist/llm/provider_format/index.js +6 -1
- package/dist/llm/provider_format/index.js.map +1 -1
- package/dist/llm/provider_format/openai.cjs +82 -2
- package/dist/llm/provider_format/openai.cjs.map +1 -1
- package/dist/llm/provider_format/openai.d.cts +1 -0
- package/dist/llm/provider_format/openai.d.ts +1 -0
- package/dist/llm/provider_format/openai.d.ts.map +1 -1
- package/dist/llm/provider_format/openai.js +80 -1
- package/dist/llm/provider_format/openai.js.map +1 -1
- package/dist/llm/provider_format/openai.test.cjs +326 -0
- package/dist/llm/provider_format/openai.test.cjs.map +1 -1
- package/dist/llm/provider_format/openai.test.js +327 -1
- package/dist/llm/provider_format/openai.test.js.map +1 -1
- package/dist/llm/provider_format/utils.cjs +4 -3
- package/dist/llm/provider_format/utils.cjs.map +1 -1
- package/dist/llm/provider_format/utils.d.ts.map +1 -1
- package/dist/llm/provider_format/utils.js +4 -3
- package/dist/llm/provider_format/utils.js.map +1 -1
- package/dist/llm/realtime.cjs.map +1 -1
- package/dist/llm/realtime.d.cts +1 -0
- package/dist/llm/realtime.d.ts +1 -0
- package/dist/llm/realtime.d.ts.map +1 -1
- package/dist/llm/realtime.js.map +1 -1
- package/dist/log.cjs +5 -2
- package/dist/log.cjs.map +1 -1
- package/dist/log.d.ts.map +1 -1
- package/dist/log.js +5 -2
- package/dist/log.js.map +1 -1
- package/dist/stream/deferred_stream.cjs +15 -6
- package/dist/stream/deferred_stream.cjs.map +1 -1
- package/dist/stream/deferred_stream.d.ts.map +1 -1
- package/dist/stream/deferred_stream.js +15 -6
- package/dist/stream/deferred_stream.js.map +1 -1
- package/dist/utils.cjs +31 -2
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +7 -0
- package/dist/utils.d.ts +7 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +31 -2
- package/dist/utils.js.map +1 -1
- package/dist/utils.test.cjs +71 -0
- package/dist/utils.test.cjs.map +1 -1
- package/dist/utils.test.js +71 -0
- package/dist/utils.test.js.map +1 -1
- package/dist/version.cjs +1 -1
- package/dist/version.cjs.map +1 -1
- package/dist/version.d.cts +1 -1
- package/dist/version.d.ts +1 -1
- package/dist/version.d.ts.map +1 -1
- package/dist/version.js +1 -1
- package/dist/version.js.map +1 -1
- package/dist/voice/agent.cjs +144 -12
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.cts +29 -4
- package/dist/voice/agent.d.ts +29 -4
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +140 -11
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent.test.cjs +120 -0
- package/dist/voice/agent.test.cjs.map +1 -1
- package/dist/voice/agent.test.js +122 -2
- package/dist/voice/agent.test.js.map +1 -1
- package/dist/voice/agent_activity.cjs +383 -298
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +34 -7
- package/dist/voice/agent_activity.d.ts +34 -7
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +383 -293
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +140 -40
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +19 -7
- package/dist/voice/agent_session.d.ts +19 -7
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +137 -37
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +4 -0
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +4 -0
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/generation.cjs +39 -19
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +44 -20
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/index.cjs +2 -0
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +1 -1
- package/dist/voice/index.d.ts +1 -1
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js +2 -1
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/speech_handle.cjs +7 -1
- package/dist/voice/speech_handle.cjs.map +1 -1
- package/dist/voice/speech_handle.d.cts +2 -0
- package/dist/voice/speech_handle.d.ts +2 -0
- package/dist/voice/speech_handle.d.ts.map +1 -1
- package/dist/voice/speech_handle.js +8 -2
- package/dist/voice/speech_handle.js.map +1 -1
- package/dist/voice/testing/run_result.cjs +66 -15
- package/dist/voice/testing/run_result.cjs.map +1 -1
- package/dist/voice/testing/run_result.d.cts +14 -3
- package/dist/voice/testing/run_result.d.ts +14 -3
- package/dist/voice/testing/run_result.d.ts.map +1 -1
- package/dist/voice/testing/run_result.js +66 -15
- package/dist/voice/testing/run_result.js.map +1 -1
- package/package.json +1 -1
- package/src/cli.ts +20 -33
- package/src/ipc/job_proc_lazy_main.ts +16 -5
- package/src/llm/chat_context.ts +35 -0
- package/src/llm/provider_format/index.ts +7 -2
- package/src/llm/provider_format/openai.test.ts +385 -1
- package/src/llm/provider_format/openai.ts +103 -0
- package/src/llm/provider_format/utils.ts +6 -4
- package/src/llm/realtime.ts +1 -0
- package/src/log.ts +5 -2
- package/src/stream/deferred_stream.ts +17 -6
- package/src/utils.test.ts +87 -0
- package/src/utils.ts +36 -2
- package/src/version.ts +1 -1
- package/src/voice/agent.test.ts +140 -2
- package/src/voice/agent.ts +189 -10
- package/src/voice/agent_activity.ts +427 -289
- package/src/voice/agent_session.ts +178 -40
- package/src/voice/audio_recognition.ts +4 -0
- package/src/voice/generation.ts +52 -23
- package/src/voice/index.ts +1 -1
- package/src/voice/speech_handle.ts +9 -2
- package/src/voice/testing/run_result.ts +81 -23

package/src/voice/agent_activity.ts

@@ -35,7 +35,7 @@ import type {
   TTSMetrics,
   VADMetrics,
 } from '../metrics/base.js';
-import {
+import { MultiInputStream } from '../stream/multi_input_stream.js';
 import { STT, type STTError, type SpeechEvent } from '../stt/stt.js';
 import { recordRealtimeMetrics, traceTypes, tracer } from '../telemetry/index.js';
 import { splitWords } from '../tokenize/basic/word.js';
@@ -43,7 +43,13 @@ import { TTS, type TTSError } from '../tts/tts.js';
 import { Future, Task, cancelAndWait, waitFor } from '../utils.js';
 import { VAD, type VADEvent } from '../vad.js';
 import type { Agent, ModelSettings } from './agent.js';
-import {
+import {
+  StopResponse,
+  _getActivityTaskInfo,
+  _setActivityTaskInfo,
+  functionCallStorage,
+  speechHandleStorage,
+} from './agent.js';
 import { type AgentSession, type TurnDetectionMode } from './agent_session.js';
 import {
   AudioRecognition,
@@ -60,7 +66,7 @@ import {
   createSpeechCreatedEvent,
   createUserInputTranscribedEvent,
 } from './events.js';
-import type { ToolExecutionOutput, _TTSGenerationData } from './generation.js';
+import type { ToolExecutionOutput, ToolOutput, _TTSGenerationData } from './generation.js';
 import {
   type _AudioOut,
   type _TextOut,
@@ -76,7 +82,7 @@ import type { TimedString } from './io.js';
 import { SpeechHandle } from './speech_handle.js';
 import { setParticipantSpanAttributes } from './utils.js';

-const
+export const agentActivityStorage = new AsyncLocalStorage<AgentActivity>();

 interface PreemptiveGeneration {
   speechHandle: SpeechHandle;
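
Note: the new agentActivityStorage export uses Node's AsyncLocalStorage, where code running under storage.run(value, fn) can recover value via storage.getStore() at any depth of the async call chain without threading it through parameters. A minimal standalone sketch of the mechanism (ActivityLike and the ids are illustrative, not part of this package):

    import { AsyncLocalStorage } from 'node:async_hooks';

    // Illustrative stand-in for AgentActivity.
    interface ActivityLike {
      id: string;
    }

    const activityStorage = new AsyncLocalStorage<ActivityLike>();

    // Any function called under run() can read the store, even across awaits.
    async function deepHelper(): Promise<void> {
      await new Promise((resolve) => setTimeout(resolve, 10));
      console.log('current activity:', activityStorage.getStore()?.id); // "activity-1"
    }

    activityStorage.run({ id: 'activity-1' }, () => {
      void deepHelper(); // the context propagates across the await boundary
    });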
@@ -89,31 +95,47 @@ interface PreemptiveGeneration {
 }

 export class AgentActivity implements RecognitionHooks {
+  agent: Agent;
+  agentSession: AgentSession;
+
   private static readonly REPLY_TASK_CANCEL_TIMEOUT = 5000;
+
   private started = false;
   private audioRecognition?: AudioRecognition;
   private realtimeSession?: RealtimeSession;
   private realtimeSpans?: Map<string, Span>; // Maps response_id to OTEL span for metrics recording
   private turnDetectionMode?: Exclude<TurnDetectionMode, _TurnDetector>;
   private logger = log();
-  private
+  private _schedulingPaused = true;
+  private _drainBlockedTasks: Task<any>[] = [];
   private _currentSpeech?: SpeechHandle;
   private speechQueue: Heap<[number, number, SpeechHandle]>; // [priority, timestamp, speechHandle]
   private q_updated: Future;
   private speechTasks: Set<Task<void>> = new Set();
   private lock = new Mutex();
-  private audioStream = new
+  private audioStream = new MultiInputStream<AudioFrame>();
+  private audioStreamId?: string;
+
   // default to null as None, which maps to the default provider tool choice value
   private toolChoice: ToolChoice | null = null;
   private _preemptiveGeneration?: PreemptiveGeneration;

-  agent: Agent;
-  agentSession: AgentSession;
-
   /** @internal */
   _mainTask?: Task<void>;
-
-
+  _onEnterTask?: Task<void>;
+  _onExitTask?: Task<void>;
+  _userTurnCompletedTask?: Task<void>;
+
+  private readonly onRealtimeGenerationCreated = (ev: GenerationCreatedEvent) =>
+    this.onGenerationCreated(ev);
+  private readonly onRealtimeInputSpeechStarted = (ev: InputSpeechStartedEvent) =>
+    this.onInputSpeechStarted(ev);
+  private readonly onRealtimeInputSpeechStopped = (ev: InputSpeechStoppedEvent) =>
+    this.onInputSpeechStopped(ev);
+  private readonly onRealtimeInputAudioTranscriptionCompleted = (ev: InputTranscriptionCompleted) =>
+    this.onInputAudioTranscriptionCompleted(ev);
+  private readonly onModelError = (ev: RealtimeModelError | STTError | TTSError | LLMError) =>
+    this.onError(ev);
+
   constructor(agent: Agent, agentSession: AgentSession) {
     this.agent = agent;
     this.agentSession = agentSession;
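
Note: storing each listener as a readonly property (onRealtimeGenerationCreated, onModelError, ...) instead of registering inline arrows is what makes the matching emitter.off(...) calls in _closeSessionResources below actually remove the handlers: off() only matches the exact function reference that was passed to on(). A standalone sketch of the difference, using Node's EventEmitter:

    import { EventEmitter } from 'node:events';

    const emitter = new EventEmitter();

    // Inline arrow: each expression creates a fresh reference, so off() cannot match it.
    emitter.on('tick', () => console.log('handler A'));
    emitter.off('tick', () => console.log('handler A')); // no-op: different reference

    // Stored reference: on() and off() receive the identical function, so removal works.
    const onTick = () => console.log('handler B');
    emitter.on('tick', onTick);
    emitter.off('tick', onTick); // actually removed

    emitter.emit('tick'); // only "handler A" fires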
@@ -133,7 +155,7 @@ export class AgentActivity implements RecognitionHooks {

     if (this.turnDetectionMode === 'vad' && this.vad === undefined) {
       this.logger.warn(
-        'turnDetection is set to "vad", but no VAD model is provided, ignoring the
+        'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDetection setting',
       );
       this.turnDetectionMode = undefined;
     }
@@ -211,120 +233,138 @@ export class AgentActivity implements RecognitionHooks {
   async start(): Promise<void> {
     const unlock = await this.lock.lock();
     try {
-
-
-
-
-
-      });
+      await this._startSession({ spanName: 'start_agent_activity', runOnEnter: true });
+    } finally {
+      unlock();
+    }
+  }

-
+  async resume(): Promise<void> {
+    const unlock = await this.lock.lock();
+    try {
+      await this._startSession({ spanName: 'resume_agent_activity', runOnEnter: false });
+    } finally {
+      unlock();
+    }
+  }

-
-
-
-
-
-
-
-
-
-
-      this.realtimeSession.on('error', (ev) => this.onError(ev));
-
-      removeInstructions(this.agent._chatCtx);
-      try {
-        await this.realtimeSession.updateInstructions(this.agent.instructions);
-      } catch (error) {
-        this.logger.error(error, 'failed to update the instructions');
-      }
+  private async _startSession(options: {
+    spanName: 'start_agent_activity' | 'resume_agent_activity';
+    runOnEnter: boolean;
+  }): Promise<void> {
+    const { spanName, runOnEnter } = options;
+    const startSpan = tracer.startSpan({
+      name: spanName,
+      attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
+      context: ROOT_CONTEXT,
+    });

-
-        await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
-      } catch (error) {
-        this.logger.error(error, 'failed to update the chat context');
-      }
+    this.agent._agentActivity = this;

-
-
-
-
-
+    if (this.llm instanceof RealtimeModel) {
+      this.realtimeSession = this.llm.session();
+      this.realtimeSpans = new Map<string, Span>();
+      this.realtimeSession.on('generation_created', this.onRealtimeGenerationCreated);
+      this.realtimeSession.on('input_speech_started', this.onRealtimeInputSpeechStarted);
+      this.realtimeSession.on('input_speech_stopped', this.onRealtimeInputSpeechStopped);
+      this.realtimeSession.on(
+        'input_audio_transcription_completed',
+        this.onRealtimeInputAudioTranscriptionCompleted,
+      );
+      this.realtimeSession.on('metrics_collected', this.onMetricsCollected);
+      this.realtimeSession.on('error', this.onModelError);

-
-
-
-
-
-      );
-      }
-    } else if (this.llm instanceof LLM) {
-      try {
-        updateInstructions({
-          chatCtx: this.agent._chatCtx,
-          instructions: this.agent.instructions,
-          addIfMissing: true,
-        });
-      } catch (error) {
-        this.logger.error('failed to update the instructions', error);
-      }
+      removeInstructions(this.agent._chatCtx);
+      try {
+        await this.realtimeSession.updateInstructions(this.agent.instructions);
+      } catch (error) {
+        this.logger.error(error, 'failed to update the instructions');
       }

-
-
-
-      this.
+      try {
+        await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
+      } catch (error) {
+        this.logger.error(error, 'failed to update the chat context');
       }

-
-      this.
-
+      try {
+        await this.realtimeSession.updateTools(this.tools);
+      } catch (error) {
+        this.logger.error(error, 'failed to update the tools');
       }

-      if (this.tts
-        this.
-
+      if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
+        this.logger.error(
+          'audio output is enabled but RealtimeModel has no audio modality ' +
+            'and no TTS is set. Either enable audio modality in the RealtimeModel ' +
+            'or set a TTS model.',
+        );
       }
-
-
-
+    } else if (this.llm instanceof LLM) {
+      try {
+        updateInstructions({
+          chatCtx: this.agent._chatCtx,
+          instructions: this.agent.instructions,
+          addIfMissing: true,
+        });
+      } catch (error) {
+        this.logger.error('failed to update the instructions', error);
       }
+    }

-
-
-
-
-
-        turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
-        turnDetectionMode: this.turnDetectionMode,
-        minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
-        maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
-        rootSpanContext: this.agentSession.rootSpanContext,
-        sttModel: this.stt?.label,
-        sttProvider: this.getSttProvider(),
-        getLinkedParticipant: () => this.agentSession._roomIO?.linkedParticipant,
-      });
-      this.audioRecognition.start();
-      this.started = true;
+    // metrics and error handling
+    if (this.llm instanceof LLM) {
+      this.llm.on('metrics_collected', this.onMetricsCollected);
+      this.llm.on('error', this.onModelError);
+    }

-
+    if (this.stt instanceof STT) {
+      this.stt.on('metrics_collected', this.onMetricsCollected);
+      this.stt.on('error', this.onModelError);
+    }

-
-
-
-
-        attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
-      });
+    if (this.tts instanceof TTS) {
+      this.tts.on('metrics_collected', this.onMetricsCollected);
+      this.tts.on('error', this.onModelError);
+    }

-
-
+    if (this.vad instanceof VAD) {
+      this.vad.on('metrics_collected', this.onMetricsCollected);
+    }
+
+    this.audioRecognition = new AudioRecognition({
+      recognitionHooks: this,
+      // Disable stt node if stt is not provided
+      stt: this.stt ? (...args) => this.agent.sttNode(...args) : undefined,
+      vad: this.vad,
+      turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
+      turnDetectionMode: this.turnDetectionMode,
+      minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
+      maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
+      rootSpanContext: this.agentSession.rootSpanContext,
+      sttModel: this.stt?.label,
+      sttProvider: this.getSttProvider(),
+      getLinkedParticipant: () => this.agentSession._roomIO?.linkedParticipant,
+    });
+    this.audioRecognition.start();
+    this.started = true;
+
+    this._resumeSchedulingTask();
+
+    if (runOnEnter) {
+      this._onEnterTask = this.createSpeechTask({
+        taskFn: () =>
+          tracer.startActiveSpan(async () => this.agent.onEnter(), {
+            name: 'on_enter',
+            context: trace.setSpan(ROOT_CONTEXT, startSpan),
+            attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
+          }),
+        inlineTask: true,
         name: 'AgentActivity_onEnter',
       });
-
-      startSpan.end();
-    } finally {
-      unlock();
     }
+
+    startSpan.end();
   }

   get currentSpeech(): SpeechHandle | undefined {
@@ -362,8 +402,8 @@ export class AgentActivity implements RecognitionHooks {
     return this.agent.toolCtx;
   }

-  get
-    return this.
+  get schedulingPaused(): boolean {
+    return this._schedulingPaused;
   }

   get realtimeLLMSession(): RealtimeSession | undefined {
@@ -417,18 +457,10 @@ export class AgentActivity implements RecognitionHooks {
   }

   attachAudioInput(audioStream: ReadableStream<AudioFrame>): void {
-
-
-      this.audioStream.detachSource();
-    }
+    void this.audioStream.close();
+    this.audioStream = new MultiInputStream<AudioFrame>();

-
-     * We need to add a deferred ReadableStream layer on top of the audioStream from the agent session.
-     * The tee() operation should be applied to the deferred stream, not the original audioStream.
-     * This is important because teeing the original stream directly makes it very difficult—if not
-     * impossible—to implement stream unlock logic cleanly.
-     */
-    this.audioStream.setSource(audioStream);
+    this.audioStreamId = this.audioStream.addInputStream(audioStream);
     const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();

     if (this.realtimeSession) {
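
Note: tee() is the standard web-streams way to feed one source to two consumers, here the realtime session and local recognition: it splits a ReadableStream into two branches that each receive every chunk. A self-contained sketch (numbers stand in for AudioFrame chunks):

    // Split one stream into two independent consumers, as attachAudioInput does.
    const source = new ReadableStream<number>({
      start(controller) {
        [1, 2, 3].forEach((chunk) => controller.enqueue(chunk));
        controller.close();
      },
    });

    const [realtimeBranch, recognitionBranch] = source.tee();

    async function drain(label: string, stream: ReadableStream<number>): Promise<void> {
      const reader = stream.getReader();
      for (;;) {
        const { done, value } = await reader.read();
        if (done) break;
        console.log(label, value); // both branches see chunks 1, 2, 3
      }
    }

    void Promise.all([drain('realtime', realtimeBranch), drain('recognition', recognitionBranch)]);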
@@ -441,16 +473,29 @@ export class AgentActivity implements RecognitionHooks {
   }

   detachAudioInput(): void {
-    this.
+    if (this.audioStreamId === undefined) {
+      return;
+    }
+
+    void this.audioStream.close();
+    this.audioStream = new MultiInputStream<AudioFrame>();
+    this.audioStreamId = undefined;
   }

-  commitUserTurn(
+  commitUserTurn(
+    options: {
+      audioDetached?: boolean;
+      throwIfNotReady?: boolean;
+    } = {},
+  ) {
+    const { audioDetached = false, throwIfNotReady = true } = options;
     if (!this.audioRecognition) {
-
+      if (throwIfNotReady) {
+        throw new Error('AudioRecognition is not initialized');
+      }
+      return;
     }

-    // TODO(brian): add audio_detached flag
-    const audioDetached = false;
     this.audioRecognition.commitUserTurn(audioDetached);
   }

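
Note: commitUserTurn, and later interrupt and pause, all adopt the same options-object signature: an object parameter defaulting to {}, destructured with per-field defaults, so new flags can be added without breaking existing call sites. A minimal standalone sketch of the pattern:

    // Options-object pattern shared by the new commitUserTurn/interrupt/pause signatures.
    function commitTurn(
      options: {
        audioDetached?: boolean;
        throwIfNotReady?: boolean;
      } = {},
    ): void {
      const { audioDetached = false, throwIfNotReady = true } = options;
      console.log({ audioDetached, throwIfNotReady });
    }

    commitTurn(); // { audioDetached: false, throwIfNotReady: true }
    commitTurn({ audioDetached: true }); // unspecified fields keep their defaults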
@@ -508,14 +553,13 @@ export class AgentActivity implements RecognitionHooks {
       }),
     );
     const task = this.createSpeechTask({
-
+      taskFn: (abortController: AbortController) =>
         this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio),
-      ),
       ownedSpeechHandle: handle,
       name: 'AgentActivity.say_tts',
     });

-    task.finally(() => this.onPipelineReplyDone());
+    task.result.finally(() => this.onPipelineReplyDone());
     this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
     return handle;
   }
@@ -628,9 +672,9 @@ export class AgentActivity implements RecognitionHooks {
       return;
     }

-    if (this.
+    if (this.schedulingPaused) {
       // TODO(shubhra): should we "forward" this new turn to the next agent?
-      this.logger.warn('skipping new realtime generation, the
+      this.logger.warn('skipping new realtime generation, the speech scheduling is not running');
       return;
     }

@@ -648,9 +692,8 @@ export class AgentActivity implements RecognitionHooks {
     this.logger.info({ speech_id: handle.id }, 'Creating speech handle');

     this.createSpeechTask({
-
+      taskFn: (abortController: AbortController) =>
         this.realtimeGenerationTask(handle, ev, {}, abortController),
-      ),
       ownedSpeechHandle: handle,
       name: 'AgentActivity.realtimeGeneration',
     });
@@ -782,7 +825,7 @@ export class AgentActivity implements RecognitionHooks {
   onPreemptiveGeneration(info: PreemptiveGenerationInfo): void {
     if (
       !this.agentSession.options.preemptiveGeneration ||
-      this.
+      this.schedulingPaused ||
       (this._currentSpeech !== undefined && !this._currentSpeech.interrupted) ||
       !(this.llm instanceof LLM)
     ) {
@@ -829,11 +872,32 @@ export class AgentActivity implements RecognitionHooks {
   }

   private createSpeechTask(options: {
-
+    taskFn: (controller: AbortController) => Promise<void>;
+    controller?: AbortController;
     ownedSpeechHandle?: SpeechHandle;
+    inlineTask?: boolean;
     name?: string;
-  }):
-    const {
+  }): Task<void> {
+    const { taskFn, controller, ownedSpeechHandle, inlineTask, name } = options;
+
+    const wrappedFn = (ctrl: AbortController) => {
+      return agentActivityStorage.run(this, () => {
+        // Mark inline/speech metadata at task runtime to avoid a race where taskFn executes
+        // before post-construction metadata is attached to the Task instance.
+        const currentTask = Task.current();
+        if (currentTask) {
+          _setActivityTaskInfo(currentTask, { speechHandle: ownedSpeechHandle, inlineTask });
+        }
+
+        if (ownedSpeechHandle) {
+          return speechHandleStorage.run(ownedSpeechHandle, () => taskFn(ctrl));
+        }
+        return taskFn(ctrl);
+      });
+    };
+
+    const task = Task.from(wrappedFn, controller, name);
+    _setActivityTaskInfo(task, { speechHandle: ownedSpeechHandle, inlineTask });

     this.speechTasks.add(task);
     task.addDoneCallback(() => {
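
Note: createSpeechTask now layers two async contexts: every task body runs under agentActivityStorage.run, and speech-owned tasks additionally run under speechHandleStorage.run, so downstream code (such as generateReply reading functionCallStorage) can locate both without extra parameters. Nesting works because each AsyncLocalStorage is independent; a standalone sketch:

    import { AsyncLocalStorage } from 'node:async_hooks';

    // Two independent storages, mirroring agentActivityStorage and speechHandleStorage.
    const activityStore = new AsyncLocalStorage<string>();
    const speechStore = new AsyncLocalStorage<string>();

    function report(): void {
      // Each storage resolves independently at any call depth.
      console.log(activityStore.getStore(), speechStore.getStore());
    }

    activityStore.run('activity-1', () => {
      report(); // activity-1 undefined
      speechStore.run('speech-42', () => {
        report(); // activity-1 speech-42
      });
    });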
@@ -853,13 +917,16 @@ export class AgentActivity implements RecognitionHooks {
       this.wakeupMainTask();
     });

-    return task
+    return task;
   }

   async onEndOfTurn(info: EndOfTurnInfo): Promise<boolean> {
-    if (this.
+    if (this.schedulingPaused) {
       this.cancelPreemptiveGeneration();
-      this.logger.warn(
+      this.logger.warn(
+        { user_input: info.newTranscript },
+        'skipping user input, speech scheduling is paused',
+      );
       // TODO(shubhra): should we "forward" this new turn to the next agent/activity?
       return true;
     }
@@ -892,7 +959,7 @@ export class AgentActivity implements RecognitionHooks {

     const oldTask = this._userTurnCompletedTask;
     this._userTurnCompletedTask = this.createSpeechTask({
-
+      taskFn: () => this.userTurnCompleted(info, oldTask),
       name: 'AgentActivity.userTurnCompleted',
     });
     return true;
@@ -928,10 +995,12 @@ export class AgentActivity implements RecognitionHooks {
         this._currentSpeech = undefined;
       }

-      //
-      //
-
-
+      // if we're draining/pausing and there are no more speech tasks, we can exit.
+      // only speech tasks can bypass draining to create a tool response (see scheduleSpeech)
+      const toWait = this.getDrainPendingSpeechTasks();
+
+      if (this._schedulingPaused && toWait.length === 0) {
+        this.logger.info('mainTask: scheduling paused and no more speech tasks to wait');
         break;
       }

@@ -941,6 +1010,39 @@ export class AgentActivity implements RecognitionHooks {
     this.logger.info('AgentActivity mainTask: exiting');
   }

+  private getDrainPendingSpeechTasks(): Task<void>[] {
+    const blockedHandles: SpeechHandle[] = [];
+
+    for (const task of this._drainBlockedTasks) {
+      const info = _getActivityTaskInfo(task);
+      if (!info) {
+        this.logger.error('blocked task without activity info; skipping.');
+        continue;
+      }
+
+      if (!info.speechHandle) {
+        continue; // onEnter/onExit
+      }
+
+      blockedHandles.push(info.speechHandle);
+    }
+
+    const toWait: Task<void>[] = [];
+    for (const task of this.speechTasks) {
+      if (this._drainBlockedTasks.includes(task)) {
+        continue;
+      }
+
+      const info = _getActivityTaskInfo(task);
+      if (info && info.speechHandle && blockedHandles.includes(info.speechHandle)) {
+        continue;
+      }
+
+      toWait.push(task);
+    }
+    return toWait;
+  }
+
   private wakeupMainTask(): void {
     this.q_updated.resolve();
   }
@@ -982,7 +1084,7 @@ export class AgentActivity implements RecognitionHooks {
       throw new Error('trying to generate reply without an LLM model');
     }

-    const functionCall =
+    const functionCall = functionCallStorage.getStore()?.functionCall;
     if (toolChoice === undefined && functionCall !== undefined) {
       // when generateReply is called inside a tool, set toolChoice to 'none' by default
       toolChoice = 'none';
@@ -1004,7 +1106,7 @@ export class AgentActivity implements RecognitionHooks {

     if (this.llm instanceof RealtimeModel) {
       this.createSpeechTask({
-
+        taskFn: (abortController: AbortController) =>
           this.realtimeReplyTask({
             speechHandle: handle,
             // TODO(brian): support llm.ChatMessage for the realtime model
@@ -1016,7 +1118,6 @@ export class AgentActivity implements RecognitionHooks {
             },
             abortController,
           }),
-        ),
         ownedSpeechHandle: handle,
         name: 'AgentActivity.realtimeReply',
       });
@@ -1029,7 +1130,7 @@ export class AgentActivity implements RecognitionHooks {
     }

     const task = this.createSpeechTask({
-
+      taskFn: (abortController: AbortController) =>
         this.pipelineReplyTask(
           handle,
           chatCtx ?? this.agent.chatCtx,
@@ -1041,12 +1142,11 @@ export class AgentActivity implements RecognitionHooks {
           instructions,
           userMessage,
         ),
-      ),
       ownedSpeechHandle: handle,
       name: 'AgentActivity.pipelineReply',
     });

-    task.finally(() => this.onPipelineReplyDone());
+    task.result.finally(() => this.onPipelineReplyDone());
   }

   if (scheduleSpeech) {
@@ -1055,16 +1155,19 @@ export class AgentActivity implements RecognitionHooks {
     return handle;
   }

-  interrupt(): Future<void> {
+  interrupt(options: { force?: boolean } = {}): Future<void> {
+    const { force = false } = options;
+    this.cancelPreemptiveGeneration();
+
     const future = new Future<void>();
     const currentSpeech = this._currentSpeech;

    //TODO(AJS-273): add interrupt for background speeches

-    currentSpeech?.interrupt();
+    currentSpeech?.interrupt(force);

     for (const [_, __, speech] of this.speechQueue) {
-      speech.interrupt();
+      speech.interrupt(force);
    }

     this.realtimeSession?.interrupt();
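
Note: interrupt() now takes an optional force flag that is forwarded to each SpeechHandle.interrupt call; judging from the diff it is meant to interrupt speech that would otherwise resist a normal interruption. Hypothetical call sites (activity is an AgentActivity instance):

    activity.interrupt(); // force defaults to false, same behavior as the old interrupt()
    activity.interrupt({ force: true }); // forwarded to SpeechHandle.interrupt(force)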
@@ -1087,13 +1190,13 @@ export class AgentActivity implements RecognitionHooks {
     }
   }

-  private async userTurnCompleted(info: EndOfTurnInfo, oldTask?:
+  private async userTurnCompleted(info: EndOfTurnInfo, oldTask?: Task<void>): Promise<void> {
     if (oldTask) {
       // We never cancel user code as this is very confusing.
       // So we wait for the old execution of onUserTurnCompleted to finish.
       // In practice this is OK because most speeches will be interrupted if a new turn
       // is detected. So the previous execution should complete quickly.
-      await oldTask;
+      await oldTask.result;
     }

     // When the audio recognition detects the end of a user turn:
@@ -1551,13 +1654,15 @@ export class AgentActivity implements RecognitionHooks {
       for (const msg of toolsMessages) {
         msg.createdAt = replyStartedAt;
       }
-
-      //
-      //
+      // Only insert FunctionCallOutput items into agent._chatCtx since FunctionCall items
+      // were already added by onToolExecutionStarted when the tool execution began.
+      // Inserting function_calls again would create duplicates that break provider APIs
+      // (e.g. Google's "function response parts != function call parts" error).
       const toolCallOutputs = toolsMessages.filter(
         (m): m is FunctionCallOutput => m.type === 'function_call_output',
       );
       if (toolCallOutputs.length > 0) {
+        this.agent._chatCtx.insert(toolCallOutputs);
         this.agentSession._toolItemsAdded(toolCallOutputs);
       }
     }
@@ -1665,52 +1770,18 @@ export class AgentActivity implements RecognitionHooks {
       return;
     }

-    const functionToolsExecutedEvent =
-
-      functionCallOutputs: [],
-    });
-    let shouldGenerateToolReply: boolean = false;
-    let newAgentTask: Agent | null = null;
-    let ignoreTaskSwitch: boolean = false;
-
-    for (const sanitizedOut of toolOutput.output) {
-      if (sanitizedOut.toolCallOutput !== undefined) {
-        functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
-        functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
-        if (sanitizedOut.replyRequired) {
-          shouldGenerateToolReply = true;
-        }
-      }
-
-      if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
-        this.logger.error('expected to receive only one agent task from the tool executions');
-        ignoreTaskSwitch = true;
-        // TODO(brian): should we mark the function call as failed to notify the LLM?
-      }
-
-      newAgentTask = sanitizedOut.agentTask ?? null;
-
-      this.logger.debug(
-        {
-          speechId: speechHandle.id,
-          name: sanitizedOut.toolCall?.name,
-          args: sanitizedOut.toolCall.args,
-          output: sanitizedOut.toolCallOutput?.output,
-          isError: sanitizedOut.toolCallOutput?.isError,
-        },
-        'Tool call execution finished',
-      );
-    }
+    const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } =
+      this.summarizeToolExecutionOutput(toolOutput, speechHandle);

     this.agentSession.emit(
       AgentSessionEventTypes.FunctionToolsExecuted,
       functionToolsExecutedEvent,
     );

-    let
+    let schedulingPaused = this.schedulingPaused;
     if (!ignoreTaskSwitch && newAgentTask !== null) {
       this.agentSession.updateAgent(newAgentTask);
-
+      schedulingPaused = true;
     }

     const toolMessages = [
@@ -1725,11 +1796,12 @@ export class AgentActivity implements RecognitionHooks {

       // Avoid setting tool_choice to "required" or a specific function when
       // passing tool response back to the LLM
-      const respondToolChoice =
+      const respondToolChoice =
+        schedulingPaused || modelSettings.toolChoice === 'none' ? 'none' : 'auto';

       // Reuse same speechHandle for tool response (parity with Python agent_activity.py L2122-2140)
       const toolResponseTask = this.createSpeechTask({
-
+        taskFn: () =>
          this.pipelineReplyTask(
            speechHandle,
            chatCtx,
@@ -1740,12 +1812,11 @@ export class AgentActivity implements RecognitionHooks {
            undefined,
            toolMessages,
          ),
-        ),
         ownedSpeechHandle: speechHandle,
         name: 'AgentActivity.pipelineReply',
       });

-      toolResponseTask.finally(() => this.onPipelineReplyDone());
+      toolResponseTask.result.finally(() => this.onPipelineReplyDone());

       this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
     } else if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
@@ -1753,15 +1824,12 @@ export class AgentActivity implements RecognitionHooks {
         msg.createdAt = replyStartedAt;
       }

-      this.agent._chatCtx.insert(toolMessages);
-
-      // Only add FunctionCallOutput items to session history since FunctionCall items
-      // were already added by onToolExecutionStarted when the tool execution began
       const toolCallOutputs = toolMessages.filter(
         (m): m is FunctionCallOutput => m.type === 'function_call_output',
       );

       if (toolCallOutputs.length > 0) {
+        this.agent._chatCtx.insert(toolCallOutputs);
         this.agentSession._toolItemsAdded(toolCallOutputs);
       }
     }
@@ -2164,50 +2232,18 @@ export class AgentActivity implements RecognitionHooks {
       return;
     }

-    const functionToolsExecutedEvent =
-
-      functionCallOutputs: [],
-    });
-    let shouldGenerateToolReply: boolean = false;
-    let newAgentTask: Agent | null = null;
-    let ignoreTaskSwitch: boolean = false;
-
-    for (const sanitizedOut of toolOutput.output) {
-      if (sanitizedOut.toolCallOutput !== undefined) {
-        functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
-        if (sanitizedOut.replyRequired) {
-          shouldGenerateToolReply = true;
-        }
-      }
-
-      if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
-        this.logger.error('expected to receive only one agent task from the tool executions');
-        ignoreTaskSwitch = true;
-      }
-
-      newAgentTask = sanitizedOut.agentTask ?? null;
-
-      this.logger.debug(
-        {
-          speechId: speechHandle.id,
-          name: sanitizedOut.toolCall?.name,
-          args: sanitizedOut.toolCall.args,
-          output: sanitizedOut.toolCallOutput?.output,
-          isError: sanitizedOut.toolCallOutput?.isError,
-        },
-        'Tool call execution finished',
-      );
-    }
+    const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } =
+      this.summarizeToolExecutionOutput(toolOutput, speechHandle);

     this.agentSession.emit(
       AgentSessionEventTypes.FunctionToolsExecuted,
       functionToolsExecutedEvent,
     );

-    let
+    let schedulingPaused = this.schedulingPaused;
     if (!ignoreTaskSwitch && newAgentTask !== null) {
       this.agentSession.updateAgent(newAgentTask);
-
+      schedulingPaused = true;
     }

     if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
@@ -2263,15 +2299,14 @@ export class AgentActivity implements RecognitionHooks {
         }),
       );

-      const toolChoice =
+      const toolChoice = schedulingPaused || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
       this.createSpeechTask({
-
+        taskFn: (abortController: AbortController) =>
           this.realtimeReplyTask({
             speechHandle: replySpeechHandle,
             modelSettings: { toolChoice },
             abortController,
           }),
-        ),
         ownedSpeechHandle: replySpeechHandle,
         name: 'AgentActivity.realtime_reply',
       });
@@ -2279,6 +2314,53 @@ export class AgentActivity implements RecognitionHooks {
       this.scheduleSpeech(replySpeechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
   }

+  private summarizeToolExecutionOutput(toolOutput: ToolOutput, speechHandle: SpeechHandle) {
+    const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
+      functionCalls: [],
+      functionCallOutputs: [],
+    });
+
+    let shouldGenerateToolReply = false;
+    let newAgentTask: Agent | null = null;
+    let ignoreTaskSwitch = false;
+
+    for (const sanitizedOut of toolOutput.output) {
+      if (sanitizedOut.toolCallOutput !== undefined) {
+        // Keep event payload symmetric for pipeline + realtime paths.
+        functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
+        functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
+        if (sanitizedOut.replyRequired) {
+          shouldGenerateToolReply = true;
+        }
+      }
+
+      if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
+        this.logger.error('expected to receive only one agent task from the tool executions');
+        ignoreTaskSwitch = true;
+      }
+
+      newAgentTask = sanitizedOut.agentTask ?? null;
+
+      this.logger.debug(
+        {
+          speechId: speechHandle.id,
+          name: sanitizedOut.toolCall?.name,
+          args: sanitizedOut.toolCall.args,
+          output: sanitizedOut.toolCallOutput?.output,
+          isError: sanitizedOut.toolCallOutput?.isError,
+        },
+        'Tool call execution finished',
+      );
+    }
+
+    return {
+      functionToolsExecutedEvent,
+      shouldGenerateToolReply,
+      newAgentTask,
+      ignoreTaskSwitch,
+    };
+  }
+
   private async realtimeReplyTask({
     speechHandle,
     modelSettings: { toolChoice },
@@ -2337,10 +2419,10 @@ export class AgentActivity implements RecognitionHooks {
     priority: number,
     force: boolean = false,
   ): void {
-    // when force=true, we allow tool responses to bypass
+    // when force=true, we allow tool responses to bypass scheduling pause
     // This allows for tool responses to be generated before the AgentActivity is finalized
-    if (this.
-      throw new Error('cannot schedule new speech, the
+    if (this.schedulingPaused && !force) {
+      throw new Error('cannot schedule new speech, the speech scheduling is draining/pausing');
     }

     // Monotonic time to avoid near 0 collisions
@@ -2349,6 +2431,48 @@ export class AgentActivity implements RecognitionHooks {
     this.wakeupMainTask();
   }

+  private async _pauseSchedulingTask(blockedTasks: Task<any>[]): Promise<void> {
+    if (this._schedulingPaused) return;
+
+    this._schedulingPaused = true;
+    this._drainBlockedTasks = blockedTasks;
+    this.wakeupMainTask();
+
+    if (this._mainTask) {
+      // When pausing/draining, we ensure that all speech_tasks complete fully.
+      // This means that even if the SpeechHandle themselves have finished,
+      // we still wait for the entire execution (e.g function_tools)
+      await this._mainTask.result;
+    }
+  }
+
+  private _resumeSchedulingTask(): void {
+    if (!this._schedulingPaused) return;
+
+    this._schedulingPaused = false;
+    this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
+  }
+
+  async pause(options: { blockedTasks?: Task<any>[] } = {}): Promise<void> {
+    const { blockedTasks = [] } = options;
+    const unlock = await this.lock.lock();
+
+    try {
+      const span = tracer.startSpan({
+        name: 'pause_agent_activity',
+        attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
+      });
+      try {
+        await this._pauseSchedulingTask(blockedTasks);
+        await this._closeSessionResources();
+      } finally {
+        span.end();
+      }
+    } finally {
+      unlock();
+    }
+  }
+
   async drain(): Promise<void> {
     // Create drain_agent_activity as a ROOT span (new trace) to match Python behavior
     return tracer.startActiveSpan(async (span) => this._drainImpl(span), {
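
Note: the new pause()/resume() pair gives AgentActivity a suspend-and-restore lifecycle next to the existing drain()/close(): pause stops speech scheduling, waits for in-flight speech tasks apart from the explicitly blocked ones, and releases session resources without running onExit, while resume rebuilds the session through _startSession with runOnEnter: false, so onEnter is not re-run. A hedged usage sketch based only on the methods in this diff (error handling omitted):

    // Hypothetical usage of the new suspend/restore lifecycle.
    async function suspendAndRestore(activity: AgentActivity): Promise<void> {
      // Stop scheduling, wait for in-flight speech tasks, release STT/TTS/realtime resources.
      await activity.pause();

      // ... the activity is suspended here ...

      // Re-register model listeners and restart recognition; onEnter is not re-run.
      await activity.resume();
    }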
@@ -2362,23 +2486,22 @@ export class AgentActivity implements RecognitionHooks {

     const unlock = await this.lock.lock();
     try {
-      if (this.
+      if (this._schedulingPaused) return;

-      this.
-
-
-
-
-
-
-      this.createSpeechTask({
-        task: Task.from(() => onExitTask),
+      this._onExitTask = this.createSpeechTask({
+        taskFn: () =>
+          tracer.startActiveSpan(async () => this.agent.onExit(), {
+            name: 'on_exit',
+            attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
+          }),
+        inlineTask: true,
         name: 'AgentActivity_onExit',
       });

-      this.
-
-      await this.
+      this.cancelPreemptiveGeneration();
+
+      await this._onExitTask.result;
+      await this._pauseSchedulingTask([]);
     } finally {
       unlock();
     }
@@ -2387,44 +2510,59 @@ export class AgentActivity implements RecognitionHooks {
   async close(): Promise<void> {
     const unlock = await this.lock.lock();
     try {
-      if (!this._draining) {
-        this.logger.warn('task closing without draining');
-      }
-
       this.cancelPreemptiveGeneration();
-
-
-
-
-      if (this.realtimeSession) {
-        this.realtimeSession.off('generation_created', this.onGenerationCreated);
-        this.realtimeSession.off('input_speech_started', this.onInputSpeechStarted);
-        this.realtimeSession.off('input_speech_stopped', this.onInputSpeechStopped);
-        this.realtimeSession.off(
-          'input_audio_transcription_completed',
-          this.onInputAudioTranscriptionCompleted,
-        );
-        this.realtimeSession.off('metrics_collected', this.onMetricsCollected);
-      }
-      if (this.stt instanceof STT) {
-        this.stt.off('metrics_collected', this.onMetricsCollected);
-      }
-      if (this.tts instanceof TTS) {
-        this.tts.off('metrics_collected', this.onMetricsCollected);
-      }
-      if (this.vad instanceof VAD) {
-        this.vad.off('metrics_collected', this.onMetricsCollected);
+      await this._closeSessionResources();
+
+      if (this._mainTask) {
+        await this._mainTask.cancelAndWait();
       }

-      this.
-      this.realtimeSpans?.clear();
-      await this.realtimeSession?.close();
-      await this.audioRecognition?.close();
-      await this._mainTask?.cancelAndWait();
+      this.agent._agentActivity = undefined;
     } finally {
       unlock();
     }
   }
+
+  private async _closeSessionResources(): Promise<void> {
+    // Unregister event handlers to prevent duplicate metrics
+    if (this.llm instanceof LLM) {
+      this.llm.off('metrics_collected', this.onMetricsCollected);
+      this.llm.off('error', this.onModelError);
+    }
+
+    if (this.realtimeSession) {
+      this.realtimeSession.off('generation_created', this.onRealtimeGenerationCreated);
+      this.realtimeSession.off('input_speech_started', this.onRealtimeInputSpeechStarted);
+      this.realtimeSession.off('input_speech_stopped', this.onRealtimeInputSpeechStopped);
+      this.realtimeSession.off(
+        'input_audio_transcription_completed',
+        this.onRealtimeInputAudioTranscriptionCompleted,
+      );
+      this.realtimeSession.off('metrics_collected', this.onMetricsCollected);
+      this.realtimeSession.off('error', this.onModelError);
+    }
+
+    if (this.stt instanceof STT) {
+      this.stt.off('metrics_collected', this.onMetricsCollected);
+      this.stt.off('error', this.onModelError);
+    }
+
+    if (this.tts instanceof TTS) {
+      this.tts.off('metrics_collected', this.onMetricsCollected);
+      this.tts.off('error', this.onModelError);
+    }
+
+    if (this.vad instanceof VAD) {
+      this.vad.off('metrics_collected', this.onMetricsCollected);
+    }
+
+    this.detachAudioInput();
+    this.realtimeSpans?.clear();
+    await this.realtimeSession?.close();
+    await this.audioRecognition?.close();
+    this.realtimeSession = undefined;
+    this.audioRecognition = undefined;
+  }
 }

 function toOaiToolChoice(toolChoice: ToolChoice | null): ToolChoice | undefined {