@livekit/agents 1.0.45 → 1.0.47
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.cjs +14 -20
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +14 -20
- package/dist/cli.js.map +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs +14 -5
- package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/job_proc_lazy_main.js +14 -5
- package/dist/ipc/job_proc_lazy_main.js.map +1 -1
- package/dist/llm/chat_context.cjs +19 -0
- package/dist/llm/chat_context.cjs.map +1 -1
- package/dist/llm/chat_context.d.cts +4 -0
- package/dist/llm/chat_context.d.ts +4 -0
- package/dist/llm/chat_context.d.ts.map +1 -1
- package/dist/llm/chat_context.js +19 -0
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/provider_format/index.cjs +2 -0
- package/dist/llm/provider_format/index.cjs.map +1 -1
- package/dist/llm/provider_format/index.d.cts +1 -1
- package/dist/llm/provider_format/index.d.ts +1 -1
- package/dist/llm/provider_format/index.d.ts.map +1 -1
- package/dist/llm/provider_format/index.js +6 -1
- package/dist/llm/provider_format/index.js.map +1 -1
- package/dist/llm/provider_format/openai.cjs +82 -2
- package/dist/llm/provider_format/openai.cjs.map +1 -1
- package/dist/llm/provider_format/openai.d.cts +1 -0
- package/dist/llm/provider_format/openai.d.ts +1 -0
- package/dist/llm/provider_format/openai.d.ts.map +1 -1
- package/dist/llm/provider_format/openai.js +80 -1
- package/dist/llm/provider_format/openai.js.map +1 -1
- package/dist/llm/provider_format/openai.test.cjs +326 -0
- package/dist/llm/provider_format/openai.test.cjs.map +1 -1
- package/dist/llm/provider_format/openai.test.js +327 -1
- package/dist/llm/provider_format/openai.test.js.map +1 -1
- package/dist/llm/provider_format/utils.cjs +4 -3
- package/dist/llm/provider_format/utils.cjs.map +1 -1
- package/dist/llm/provider_format/utils.d.ts.map +1 -1
- package/dist/llm/provider_format/utils.js +4 -3
- package/dist/llm/provider_format/utils.js.map +1 -1
- package/dist/llm/realtime.cjs.map +1 -1
- package/dist/llm/realtime.d.cts +1 -0
- package/dist/llm/realtime.d.ts +1 -0
- package/dist/llm/realtime.d.ts.map +1 -1
- package/dist/llm/realtime.js.map +1 -1
- package/dist/log.cjs +5 -2
- package/dist/log.cjs.map +1 -1
- package/dist/log.d.ts.map +1 -1
- package/dist/log.js +5 -2
- package/dist/log.js.map +1 -1
- package/dist/stream/deferred_stream.cjs +15 -6
- package/dist/stream/deferred_stream.cjs.map +1 -1
- package/dist/stream/deferred_stream.d.ts.map +1 -1
- package/dist/stream/deferred_stream.js +15 -6
- package/dist/stream/deferred_stream.js.map +1 -1
- package/dist/stream/index.cjs +3 -0
- package/dist/stream/index.cjs.map +1 -1
- package/dist/stream/index.d.cts +1 -0
- package/dist/stream/index.d.ts +1 -0
- package/dist/stream/index.d.ts.map +1 -1
- package/dist/stream/index.js +2 -0
- package/dist/stream/index.js.map +1 -1
- package/dist/stream/multi_input_stream.cjs +139 -0
- package/dist/stream/multi_input_stream.cjs.map +1 -0
- package/dist/stream/multi_input_stream.d.cts +55 -0
- package/dist/stream/multi_input_stream.d.ts +55 -0
- package/dist/stream/multi_input_stream.d.ts.map +1 -0
- package/dist/stream/multi_input_stream.js +115 -0
- package/dist/stream/multi_input_stream.js.map +1 -0
- package/dist/stream/multi_input_stream.test.cjs +340 -0
- package/dist/stream/multi_input_stream.test.cjs.map +1 -0
- package/dist/stream/multi_input_stream.test.js +339 -0
- package/dist/stream/multi_input_stream.test.js.map +1 -0
- package/dist/telemetry/trace_types.cjs +42 -0
- package/dist/telemetry/trace_types.cjs.map +1 -1
- package/dist/telemetry/trace_types.d.cts +14 -0
- package/dist/telemetry/trace_types.d.ts +14 -0
- package/dist/telemetry/trace_types.d.ts.map +1 -1
- package/dist/telemetry/trace_types.js +28 -0
- package/dist/telemetry/trace_types.js.map +1 -1
- package/dist/utils.cjs +44 -2
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +8 -0
- package/dist/utils.d.ts +8 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +44 -2
- package/dist/utils.js.map +1 -1
- package/dist/utils.test.cjs +71 -0
- package/dist/utils.test.cjs.map +1 -1
- package/dist/utils.test.js +71 -0
- package/dist/utils.test.js.map +1 -1
- package/dist/version.cjs +1 -1
- package/dist/version.cjs.map +1 -1
- package/dist/version.d.cts +1 -1
- package/dist/version.d.ts +1 -1
- package/dist/version.d.ts.map +1 -1
- package/dist/version.js +1 -1
- package/dist/version.js.map +1 -1
- package/dist/voice/agent.cjs +144 -12
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.cts +29 -4
- package/dist/voice/agent.d.ts +29 -4
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +140 -11
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent.test.cjs +120 -0
- package/dist/voice/agent.test.cjs.map +1 -1
- package/dist/voice/agent.test.js +122 -2
- package/dist/voice/agent.test.js.map +1 -1
- package/dist/voice/agent_activity.cjs +402 -292
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +35 -7
- package/dist/voice/agent_activity.d.ts +35 -7
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +402 -287
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +156 -44
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +22 -9
- package/dist/voice/agent_session.d.ts +22 -9
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +156 -44
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +89 -36
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +22 -1
- package/dist/voice/audio_recognition.d.ts +22 -1
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +93 -36
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/audio_recognition_span.test.cjs +233 -0
- package/dist/voice/audio_recognition_span.test.cjs.map +1 -0
- package/dist/voice/audio_recognition_span.test.js +232 -0
- package/dist/voice/audio_recognition_span.test.js.map +1 -0
- package/dist/voice/generation.cjs +39 -19
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +44 -20
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/index.cjs +2 -0
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +1 -1
- package/dist/voice/index.d.ts +1 -1
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js +2 -1
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/io.cjs +6 -3
- package/dist/voice/io.cjs.map +1 -1
- package/dist/voice/io.d.cts +3 -2
- package/dist/voice/io.d.ts +3 -2
- package/dist/voice/io.d.ts.map +1 -1
- package/dist/voice/io.js +6 -3
- package/dist/voice/io.js.map +1 -1
- package/dist/voice/recorder_io/recorder_io.cjs +3 -1
- package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
- package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
- package/dist/voice/recorder_io/recorder_io.js +3 -1
- package/dist/voice/recorder_io/recorder_io.js.map +1 -1
- package/dist/voice/room_io/_input.cjs +17 -17
- package/dist/voice/room_io/_input.cjs.map +1 -1
- package/dist/voice/room_io/_input.d.cts +2 -2
- package/dist/voice/room_io/_input.d.ts +2 -2
- package/dist/voice/room_io/_input.d.ts.map +1 -1
- package/dist/voice/room_io/_input.js +7 -6
- package/dist/voice/room_io/_input.js.map +1 -1
- package/dist/voice/room_io/room_io.cjs +9 -0
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.cts +3 -1
- package/dist/voice/room_io/room_io.d.ts +3 -1
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js +9 -0
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/speech_handle.cjs +7 -1
- package/dist/voice/speech_handle.cjs.map +1 -1
- package/dist/voice/speech_handle.d.cts +2 -0
- package/dist/voice/speech_handle.d.ts +2 -0
- package/dist/voice/speech_handle.d.ts.map +1 -1
- package/dist/voice/speech_handle.js +8 -2
- package/dist/voice/speech_handle.js.map +1 -1
- package/dist/voice/testing/run_result.cjs +66 -15
- package/dist/voice/testing/run_result.cjs.map +1 -1
- package/dist/voice/testing/run_result.d.cts +14 -3
- package/dist/voice/testing/run_result.d.ts +14 -3
- package/dist/voice/testing/run_result.d.ts.map +1 -1
- package/dist/voice/testing/run_result.js +66 -15
- package/dist/voice/testing/run_result.js.map +1 -1
- package/dist/voice/utils.cjs +47 -0
- package/dist/voice/utils.cjs.map +1 -0
- package/dist/voice/utils.d.cts +4 -0
- package/dist/voice/utils.d.ts +4 -0
- package/dist/voice/utils.d.ts.map +1 -0
- package/dist/voice/utils.js +23 -0
- package/dist/voice/utils.js.map +1 -0
- package/package.json +1 -1
- package/src/cli.ts +20 -33
- package/src/ipc/job_proc_lazy_main.ts +16 -5
- package/src/llm/chat_context.ts +35 -0
- package/src/llm/provider_format/index.ts +7 -2
- package/src/llm/provider_format/openai.test.ts +385 -1
- package/src/llm/provider_format/openai.ts +103 -0
- package/src/llm/provider_format/utils.ts +6 -4
- package/src/llm/realtime.ts +1 -0
- package/src/log.ts +5 -2
- package/src/stream/deferred_stream.ts +17 -6
- package/src/stream/index.ts +1 -0
- package/src/stream/multi_input_stream.test.ts +540 -0
- package/src/stream/multi_input_stream.ts +172 -0
- package/src/telemetry/trace_types.ts +18 -0
- package/src/utils.test.ts +87 -0
- package/src/utils.ts +52 -2
- package/src/version.ts +1 -1
- package/src/voice/agent.test.ts +140 -2
- package/src/voice/agent.ts +189 -10
- package/src/voice/agent_activity.ts +449 -286
- package/src/voice/agent_session.ts +195 -51
- package/src/voice/audio_recognition.ts +118 -38
- package/src/voice/audio_recognition_span.test.ts +261 -0
- package/src/voice/generation.ts +52 -23
- package/src/voice/index.ts +1 -1
- package/src/voice/io.ts +7 -4
- package/src/voice/recorder_io/recorder_io.ts +2 -1
- package/src/voice/room_io/_input.ts +11 -7
- package/src/voice/room_io/room_io.ts +12 -0
- package/src/voice/speech_handle.ts +9 -2
- package/src/voice/testing/run_result.ts +81 -23
- package/src/voice/utils.ts +29 -0
|
@@ -1,12 +1,14 @@
|
|
|
1
1
|
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import { Mutex } from '@livekit/mutex';
|
|
4
5
|
import type { AudioFrame, Room } from '@livekit/rtc-node';
|
|
5
6
|
import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
|
|
6
7
|
import type { Context, Span } from '@opentelemetry/api';
|
|
7
8
|
import { ROOT_CONTEXT, context as otelContext, trace } from '@opentelemetry/api';
|
|
8
9
|
import { EventEmitter } from 'node:events';
|
|
9
10
|
import type { ReadableStream } from 'node:stream/web';
|
|
11
|
+
import type { z } from 'zod';
|
|
10
12
|
import {
|
|
11
13
|
LLM as InferenceLLM,
|
|
12
14
|
STT as InferenceSTT,
|
|
@@ -31,6 +33,7 @@ import {
|
|
|
31
33
|
type ResolvedSessionConnectOptions,
|
|
32
34
|
type SessionConnectOptions,
|
|
33
35
|
} from '../types.js';
|
|
36
|
+
import { Task } from '../utils.js';
|
|
34
37
|
import type { VAD } from '../vad.js';
|
|
35
38
|
import type { Agent } from './agent.js';
|
|
36
39
|
import { AgentActivity } from './agent_activity.js';
|
|
@@ -62,6 +65,7 @@ import { RoomIO, type RoomInputOptions, type RoomOutputOptions } from './room_io
|
|
|
62
65
|
import type { UnknownUserData } from './run_context.js';
|
|
63
66
|
import type { SpeechHandle } from './speech_handle.js';
|
|
64
67
|
import { RunResult } from './testing/run_result.js';
|
|
68
|
+
import { setParticipantSpanAttributes } from './utils.js';
|
|
65
69
|
|
|
66
70
|
export interface VoiceOptions {
|
|
67
71
|
allowInterruptions: boolean;
|
|
@@ -114,6 +118,13 @@ export type AgentSessionOptions<UserData = UnknownUserData> = {
|
|
|
114
118
|
connOptions?: SessionConnectOptions;
|
|
115
119
|
};
|
|
116
120
|
|
|
121
|
+
type ActivityTransitionOptions = {
|
|
122
|
+
previousActivity?: 'close' | 'pause';
|
|
123
|
+
newActivity?: 'start' | 'resume';
|
|
124
|
+
blockedTasks?: Task<any>[];
|
|
125
|
+
waitOnEnter?: boolean;
|
|
126
|
+
};
|
|
127
|
+
|
|
117
128
|
export class AgentSession<
|
|
118
129
|
UserData = UnknownUserData,
|
|
119
130
|
> extends (EventEmitter as new () => TypedEmitter<AgentSessionCallbacks>) {
|
|
@@ -128,10 +139,13 @@ export class AgentSession<
|
|
|
128
139
|
private agent?: Agent;
|
|
129
140
|
private activity?: AgentActivity;
|
|
130
141
|
private nextActivity?: AgentActivity;
|
|
142
|
+
private updateActivityTask?: Task<void>;
|
|
131
143
|
private started = false;
|
|
132
144
|
private userState: UserState = 'listening';
|
|
145
|
+
private readonly activityLock = new Mutex();
|
|
133
146
|
|
|
134
|
-
|
|
147
|
+
/** @internal */
|
|
148
|
+
_roomIO?: RoomIO;
|
|
135
149
|
private logger = log();
|
|
136
150
|
|
|
137
151
|
private _chatCtx: ChatContext;
|
|
@@ -294,7 +308,7 @@ export class AgentSession<
|
|
|
294
308
|
|
|
295
309
|
const tasks: Promise<void>[] = [];
|
|
296
310
|
|
|
297
|
-
if (room && !this.
|
|
311
|
+
if (room && !this._roomIO) {
|
|
298
312
|
// Check for existing input/output configuration and warn if needed
|
|
299
313
|
if (this.input.audio && inputOptions?.audioEnabled !== false) {
|
|
300
314
|
this.logger.warn(
|
|
@@ -314,13 +328,13 @@ export class AgentSession<
|
|
|
314
328
|
);
|
|
315
329
|
}
|
|
316
330
|
|
|
317
|
-
this.
|
|
331
|
+
this._roomIO = new RoomIO({
|
|
318
332
|
agentSession: this,
|
|
319
333
|
room,
|
|
320
334
|
inputOptions,
|
|
321
335
|
outputOptions,
|
|
322
336
|
});
|
|
323
|
-
this.
|
|
337
|
+
this._roomIO.start();
|
|
324
338
|
}
|
|
325
339
|
|
|
326
340
|
let ctx: JobContext | undefined = undefined;
|
|
@@ -358,7 +372,8 @@ export class AgentSession<
|
|
|
358
372
|
}
|
|
359
373
|
|
|
360
374
|
// TODO(AJS-265): add shutdown callback to job context
|
|
361
|
-
|
|
375
|
+
// Initial start does not wait on onEnter
|
|
376
|
+
tasks.push(this._updateActivity(this.agent, { waitOnEnter: false }));
|
|
362
377
|
|
|
363
378
|
await Promise.allSettled(tasks);
|
|
364
379
|
|
|
@@ -430,8 +445,34 @@ export class AgentSession<
|
|
|
430
445
|
updateAgent(agent: Agent): void {
|
|
431
446
|
this.agent = agent;
|
|
432
447
|
|
|
433
|
-
if (this.started) {
|
|
434
|
-
|
|
448
|
+
if (!this.started) {
|
|
449
|
+
return;
|
|
450
|
+
}
|
|
451
|
+
|
|
452
|
+
const _updateActivityTask = async (oldTask: Task<void> | undefined, agent: Agent) => {
|
|
453
|
+
if (oldTask) {
|
|
454
|
+
try {
|
|
455
|
+
await oldTask.result;
|
|
456
|
+
} catch (error) {
|
|
457
|
+
this.logger.error(error, 'previous updateAgent transition failed');
|
|
458
|
+
}
|
|
459
|
+
}
|
|
460
|
+
|
|
461
|
+
await this._updateActivity(agent);
|
|
462
|
+
};
|
|
463
|
+
|
|
464
|
+
const oldTask = this.updateActivityTask;
|
|
465
|
+
this.updateActivityTask = Task.from(
|
|
466
|
+
async () => _updateActivityTask(oldTask, agent),
|
|
467
|
+
undefined,
|
|
468
|
+
'AgentSession_updateActivityTask',
|
|
469
|
+
);
|
|
470
|
+
|
|
471
|
+
const runState = this._globalRunState;
|
|
472
|
+
if (runState) {
|
|
473
|
+
// Don't mark the RunResult as done while an agent transition is still in progress.
|
|
474
|
+
// (used to make sure we're correctly adding the AgentHandoffResult before completion)
|
|
475
|
+
runState._watchHandle(this.updateActivityTask);
|
|
435
476
|
}
|
|
436
477
|
}
|
|
437
478
|
|
|
@@ -462,24 +503,42 @@ export class AgentSession<
|
|
|
462
503
|
throw new Error('AgentSession is not running');
|
|
463
504
|
}
|
|
464
505
|
|
|
465
|
-
const doSay = (activity: AgentActivity) => {
|
|
506
|
+
const doSay = (activity: AgentActivity, nextActivity?: AgentActivity) => {
|
|
507
|
+
if (activity.schedulingPaused) {
|
|
508
|
+
if (!nextActivity) {
|
|
509
|
+
throw new Error('AgentSession is closing, cannot use say()');
|
|
510
|
+
}
|
|
511
|
+
return nextActivity.say(text, options);
|
|
512
|
+
}
|
|
466
513
|
return activity.say(text, options);
|
|
467
514
|
};
|
|
468
515
|
|
|
516
|
+
const runState = this._globalRunState;
|
|
517
|
+
let handle: SpeechHandle;
|
|
518
|
+
|
|
469
519
|
// attach to the session span if called outside of the AgentSession
|
|
470
520
|
const activeSpan = trace.getActiveSpan();
|
|
471
521
|
if (!activeSpan && this.rootSpanContext) {
|
|
472
|
-
|
|
522
|
+
handle = otelContext.with(this.rootSpanContext, () =>
|
|
523
|
+
doSay(this.activity!, this.nextActivity),
|
|
524
|
+
);
|
|
525
|
+
} else {
|
|
526
|
+
handle = doSay(this.activity, this.nextActivity);
|
|
527
|
+
}
|
|
528
|
+
|
|
529
|
+
if (runState) {
|
|
530
|
+
runState._watchHandle(handle);
|
|
473
531
|
}
|
|
474
532
|
|
|
475
|
-
return
|
|
533
|
+
return handle;
|
|
476
534
|
}
|
|
477
535
|
|
|
478
|
-
interrupt() {
|
|
536
|
+
interrupt(options?: { force?: boolean }) {
|
|
479
537
|
if (!this.activity) {
|
|
480
538
|
throw new Error('AgentSession is not running');
|
|
481
539
|
}
|
|
482
|
-
|
|
540
|
+
|
|
541
|
+
return this.activity.interrupt(options);
|
|
483
542
|
}
|
|
484
543
|
|
|
485
544
|
generateReply(options?: {
|
|
@@ -500,7 +559,7 @@ export class AgentSession<
|
|
|
500
559
|
: undefined;
|
|
501
560
|
|
|
502
561
|
const doGenerateReply = (activity: AgentActivity, nextActivity?: AgentActivity) => {
|
|
503
|
-
if (activity.
|
|
562
|
+
if (activity.schedulingPaused) {
|
|
504
563
|
if (!nextActivity) {
|
|
505
564
|
throw new Error('AgentSession is closing, cannot use generateReply()');
|
|
506
565
|
}
|
|
@@ -540,53 +599,128 @@ export class AgentSession<
|
|
|
540
599
|
* result.expect.noMoreEvents();
|
|
541
600
|
* ```
|
|
542
601
|
*
|
|
543
|
-
* @param options - Run options including user input
|
|
602
|
+
* @param options - Run options including user input and optional output type
|
|
544
603
|
* @returns A RunResult that resolves when the agent finishes responding
|
|
545
|
-
*
|
|
546
|
-
* TODO: Add outputType parameter for typed outputs (parity with Python)
|
|
547
604
|
*/
|
|
548
|
-
run
|
|
605
|
+
run<T = unknown>({
|
|
606
|
+
userInput,
|
|
607
|
+
outputType,
|
|
608
|
+
}: {
|
|
609
|
+
userInput: string;
|
|
610
|
+
outputType?: z.ZodType<T>;
|
|
611
|
+
}): RunResult<T> {
|
|
549
612
|
if (this._globalRunState && !this._globalRunState.done()) {
|
|
550
613
|
throw new Error('nested runs are not supported');
|
|
551
614
|
}
|
|
552
615
|
|
|
553
|
-
const runState = new RunResult({
|
|
616
|
+
const runState = new RunResult<T>({
|
|
617
|
+
userInput,
|
|
618
|
+
outputType,
|
|
619
|
+
});
|
|
620
|
+
|
|
554
621
|
this._globalRunState = runState;
|
|
555
|
-
|
|
622
|
+
|
|
623
|
+
// Defer generateReply through the activityLock to ensure any in-progress
|
|
624
|
+
// activity transition (e.g. AgentTask started from onEnter) completes first.
|
|
625
|
+
// TS Task.from starts onEnter synchronously, so the transition may already be
|
|
626
|
+
// mid-flight by the time run() is called after session.start() resolves.
|
|
627
|
+
// Acquiring and immediately releasing the lock guarantees FIFO ordering:
|
|
628
|
+
// the transition's lock section finishes before we route generateReply.
|
|
629
|
+
(async () => {
|
|
630
|
+
try {
|
|
631
|
+
const unlock = await this.activityLock.lock();
|
|
632
|
+
unlock();
|
|
633
|
+
this.generateReply({ userInput });
|
|
634
|
+
} catch (e) {
|
|
635
|
+
runState._reject(e instanceof Error ? e : new Error(String(e)));
|
|
636
|
+
}
|
|
637
|
+
})();
|
|
556
638
|
|
|
557
639
|
return runState;
|
|
558
640
|
}
|
|
559
641
|
|
|
560
|
-
|
|
642
|
+
/** @internal */
|
|
643
|
+
async _updateActivity(agent: Agent, options: ActivityTransitionOptions = {}): Promise<void> {
|
|
644
|
+
const { previousActivity = 'close', newActivity = 'start', blockedTasks = [] } = options;
|
|
645
|
+
const waitOnEnter = options.waitOnEnter ?? newActivity === 'start';
|
|
646
|
+
|
|
561
647
|
const runWithContext = async () => {
|
|
562
|
-
|
|
563
|
-
|
|
648
|
+
const unlock = await this.activityLock.lock();
|
|
649
|
+
let onEnterTask: Task<void> | undefined;
|
|
564
650
|
|
|
565
|
-
|
|
651
|
+
try {
|
|
652
|
+
this.agent = agent;
|
|
653
|
+
const prevActivityObj = this.activity;
|
|
654
|
+
|
|
655
|
+
if (newActivity === 'start') {
|
|
656
|
+
const prevAgent = prevActivityObj?.agent;
|
|
657
|
+
if (
|
|
658
|
+
agent._agentActivity &&
|
|
659
|
+
// allow updating the same agent that is running
|
|
660
|
+
(agent !== prevAgent || previousActivity !== 'close')
|
|
661
|
+
) {
|
|
662
|
+
throw new Error('Cannot start agent: an activity is already running');
|
|
663
|
+
}
|
|
664
|
+
this.nextActivity = new AgentActivity(agent, this);
|
|
665
|
+
} else if (newActivity === 'resume') {
|
|
666
|
+
if (!agent._agentActivity) {
|
|
667
|
+
throw new Error('Cannot resume agent: no existing activity to resume');
|
|
668
|
+
}
|
|
669
|
+
this.nextActivity = agent._agentActivity;
|
|
670
|
+
}
|
|
566
671
|
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
672
|
+
if (prevActivityObj && prevActivityObj !== this.nextActivity) {
|
|
673
|
+
if (previousActivity === 'pause') {
|
|
674
|
+
await prevActivityObj.pause({ blockedTasks });
|
|
675
|
+
} else {
|
|
676
|
+
await prevActivityObj.drain();
|
|
677
|
+
await prevActivityObj.close();
|
|
678
|
+
}
|
|
679
|
+
}
|
|
571
680
|
|
|
572
|
-
|
|
573
|
-
|
|
681
|
+
this.activity = this.nextActivity;
|
|
682
|
+
this.nextActivity = undefined;
|
|
574
683
|
|
|
575
|
-
|
|
576
|
-
new AgentHandoffItem({
|
|
577
|
-
oldAgentId:
|
|
684
|
+
const runState = this._globalRunState;
|
|
685
|
+
const handoffItem = new AgentHandoffItem({
|
|
686
|
+
oldAgentId: prevActivityObj?.agent.id,
|
|
578
687
|
newAgentId: agent.id,
|
|
579
|
-
})
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
688
|
+
});
|
|
689
|
+
|
|
690
|
+
if (runState) {
|
|
691
|
+
runState._agentHandoff({
|
|
692
|
+
item: handoffItem,
|
|
693
|
+
oldAgent: prevActivityObj?.agent,
|
|
694
|
+
newAgent: this.activity!.agent,
|
|
695
|
+
});
|
|
696
|
+
}
|
|
697
|
+
|
|
698
|
+
this._chatCtx.insert(handoffItem);
|
|
699
|
+
this.logger.debug(
|
|
700
|
+
{ previousAgentId: prevActivityObj?.agent.id, newAgentId: agent.id },
|
|
701
|
+
'Agent handoff inserted into chat context',
|
|
702
|
+
);
|
|
703
|
+
|
|
704
|
+
if (newActivity === 'start') {
|
|
705
|
+
await this.activity!.start();
|
|
706
|
+
} else {
|
|
707
|
+
await this.activity!.resume();
|
|
708
|
+
}
|
|
585
709
|
|
|
586
|
-
|
|
710
|
+
onEnterTask = this.activity!._onEnterTask;
|
|
587
711
|
|
|
588
|
-
|
|
589
|
-
|
|
712
|
+
if (this._input.audio) {
|
|
713
|
+
this.activity!.attachAudioInput(this._input.audio.stream);
|
|
714
|
+
}
|
|
715
|
+
} finally {
|
|
716
|
+
unlock();
|
|
717
|
+
}
|
|
718
|
+
|
|
719
|
+
if (waitOnEnter) {
|
|
720
|
+
if (!onEnterTask) {
|
|
721
|
+
throw new Error('expected onEnter task to be available while waitOnEnter=true');
|
|
722
|
+
}
|
|
723
|
+
await onEnterTask.result;
|
|
590
724
|
}
|
|
591
725
|
};
|
|
592
726
|
|
|
@@ -700,8 +834,10 @@ export class AgentSession<
|
|
|
700
834
|
startTime: options?.startTime,
|
|
701
835
|
});
|
|
702
836
|
|
|
703
|
-
|
|
704
|
-
|
|
837
|
+
const localParticipant = this._roomIO?.localParticipant;
|
|
838
|
+
if (localParticipant) {
|
|
839
|
+
setParticipantSpanAttributes(this.agentSpeakingSpan, localParticipant);
|
|
840
|
+
}
|
|
705
841
|
}
|
|
706
842
|
} else if (this.agentSpeakingSpan !== undefined) {
|
|
707
843
|
// TODO(brian): PR4 - Set ATTR_END_TIME attribute if available
|
|
@@ -738,8 +874,10 @@ export class AgentSession<
|
|
|
738
874
|
startTime: lastSpeakingTime,
|
|
739
875
|
});
|
|
740
876
|
|
|
741
|
-
|
|
742
|
-
|
|
877
|
+
const linked = this._roomIO?.linkedParticipant;
|
|
878
|
+
if (linked) {
|
|
879
|
+
setParticipantSpanAttributes(this.userSpeakingSpan, linked);
|
|
880
|
+
}
|
|
743
881
|
} else if (this.userSpeakingSpan !== undefined) {
|
|
744
882
|
this.userSpeakingSpan.end(lastSpeakingTime);
|
|
745
883
|
this.userSpeakingSpan = undefined;
|
|
@@ -783,7 +921,7 @@ export class AgentSession<
|
|
|
783
921
|
return;
|
|
784
922
|
}
|
|
785
923
|
|
|
786
|
-
if (this.
|
|
924
|
+
if (this._roomIO && !this._roomIO.isParticipantAvailable) {
|
|
787
925
|
return;
|
|
788
926
|
}
|
|
789
927
|
|
|
@@ -836,15 +974,21 @@ export class AgentSession<
|
|
|
836
974
|
if (this.activity) {
|
|
837
975
|
if (!drain) {
|
|
838
976
|
try {
|
|
839
|
-
this.activity.interrupt();
|
|
977
|
+
await this.activity.interrupt({ force: true }).await;
|
|
840
978
|
} catch (error) {
|
|
841
|
-
//
|
|
842
|
-
|
|
979
|
+
// Uninterruptible speech can throw during forced interruption.
|
|
980
|
+
this.logger.warn({ error }, 'Error interrupting activity');
|
|
843
981
|
}
|
|
844
982
|
}
|
|
983
|
+
|
|
845
984
|
await this.activity.drain();
|
|
846
985
|
// wait any uninterruptible speech to finish
|
|
847
986
|
await this.activity.currentSpeech?.waitForPlayout();
|
|
987
|
+
|
|
988
|
+
if (reason !== CloseReason.ERROR) {
|
|
989
|
+
this.activity.commitUserTurn({ audioDetached: true, throwIfNotReady: false });
|
|
990
|
+
}
|
|
991
|
+
|
|
848
992
|
try {
|
|
849
993
|
this.activity.detachAudioInput();
|
|
850
994
|
} catch (error) {
|
|
@@ -862,8 +1006,8 @@ export class AgentSession<
|
|
|
862
1006
|
this.output.audio = null;
|
|
863
1007
|
this.output.transcription = null;
|
|
864
1008
|
|
|
865
|
-
await this.
|
|
866
|
-
this.
|
|
1009
|
+
await this._roomIO?.close();
|
|
1010
|
+
this._roomIO = undefined;
|
|
867
1011
|
|
|
868
1012
|
await this.activity?.close();
|
|
869
1013
|
this.activity = undefined;
|
|
@@ -1,8 +1,15 @@
|
|
|
1
1
|
// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import type { ParticipantKind } from '@livekit/rtc-node';
|
|
4
5
|
import { AudioFrame } from '@livekit/rtc-node';
|
|
5
|
-
import
|
|
6
|
+
import {
|
|
7
|
+
type Context,
|
|
8
|
+
ROOT_CONTEXT,
|
|
9
|
+
type Span,
|
|
10
|
+
context as otelContext,
|
|
11
|
+
trace,
|
|
12
|
+
} from '@opentelemetry/api';
|
|
6
13
|
import type { WritableStreamDefaultWriter } from 'node:stream/web';
|
|
7
14
|
import { ReadableStream } from 'node:stream/web';
|
|
8
15
|
import { type ChatContext } from '../llm/chat_context.js';
|
|
@@ -16,6 +23,7 @@ import { Task, delay } from '../utils.js';
|
|
|
16
23
|
import { type VAD, type VADEvent, VADEventType } from '../vad.js';
|
|
17
24
|
import type { TurnDetectionMode } from './agent_session.js';
|
|
18
25
|
import type { STTNode } from './io.js';
|
|
26
|
+
import { setParticipantSpanAttributes } from './utils.js';
|
|
19
27
|
|
|
20
28
|
export interface EndOfTurnInfo {
|
|
21
29
|
/** The new transcript text from the user's speech. */
|
|
@@ -72,6 +80,22 @@ export interface AudioRecognitionOptions {
|
|
|
72
80
|
maxEndpointingDelay: number;
|
|
73
81
|
/** Root span context for tracing. */
|
|
74
82
|
rootSpanContext?: Context;
|
|
83
|
+
/** STT model name for tracing */
|
|
84
|
+
sttModel?: string;
|
|
85
|
+
/** STT provider name for tracing */
|
|
86
|
+
sttProvider?: string;
|
|
87
|
+
/** Getter for linked participant for span attribution */
|
|
88
|
+
getLinkedParticipant?: () => ParticipantLike | undefined;
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
/**
|
|
92
|
+
* Minimal participant shape for span attribution.
|
|
93
|
+
* Compatible with both `LocalParticipant` and `RemoteParticipant` from `@livekit/rtc-node`.
|
|
94
|
+
*/
|
|
95
|
+
export interface ParticipantLike {
|
|
96
|
+
sid: string | undefined;
|
|
97
|
+
identity: string;
|
|
98
|
+
kind: ParticipantKind;
|
|
75
99
|
}
|
|
76
100
|
|
|
77
101
|
export class AudioRecognition {
|
|
@@ -84,6 +108,9 @@ export class AudioRecognition {
|
|
|
84
108
|
private maxEndpointingDelay: number;
|
|
85
109
|
private lastLanguage?: string;
|
|
86
110
|
private rootSpanContext?: Context;
|
|
111
|
+
private sttModel?: string;
|
|
112
|
+
private sttProvider?: string;
|
|
113
|
+
private getLinkedParticipant?: () => ParticipantLike | undefined;
|
|
87
114
|
|
|
88
115
|
private deferredInputStream: DeferredReadableStream<AudioFrame>;
|
|
89
116
|
private logger = log();
|
|
@@ -121,6 +148,9 @@ export class AudioRecognition {
|
|
|
121
148
|
this.maxEndpointingDelay = opts.maxEndpointingDelay;
|
|
122
149
|
this.lastLanguage = undefined;
|
|
123
150
|
this.rootSpanContext = opts.rootSpanContext;
|
|
151
|
+
this.sttModel = opts.sttModel;
|
|
152
|
+
this.sttProvider = opts.sttProvider;
|
|
153
|
+
this.getLinkedParticipant = opts.getLinkedParticipant;
|
|
124
154
|
|
|
125
155
|
this.deferredInputStream = new DeferredReadableStream<AudioFrame>();
|
|
126
156
|
const [vadInputStream, sttInputStream] = this.deferredInputStream.stream.tee();
|
|
@@ -151,6 +181,37 @@ export class AudioRecognition {
|
|
|
151
181
|
});
|
|
152
182
|
}
|
|
153
183
|
|
|
184
|
+
private ensureUserTurnSpan(startTime?: number): Span {
|
|
185
|
+
if (this.userTurnSpan && this.userTurnSpan.isRecording()) {
|
|
186
|
+
return this.userTurnSpan;
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
this.userTurnSpan = tracer.startSpan({
|
|
190
|
+
name: 'user_turn',
|
|
191
|
+
context: this.rootSpanContext,
|
|
192
|
+
startTime,
|
|
193
|
+
});
|
|
194
|
+
|
|
195
|
+
const participant = this.getLinkedParticipant?.();
|
|
196
|
+
if (participant) {
|
|
197
|
+
setParticipantSpanAttributes(this.userTurnSpan, participant);
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
if (this.sttModel) {
|
|
201
|
+
this.userTurnSpan.setAttribute(traceTypes.ATTR_GEN_AI_REQUEST_MODEL, this.sttModel);
|
|
202
|
+
}
|
|
203
|
+
if (this.sttProvider) {
|
|
204
|
+
this.userTurnSpan.setAttribute(traceTypes.ATTR_GEN_AI_PROVIDER_NAME, this.sttProvider);
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
return this.userTurnSpan;
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
private userTurnContext(span: Span): Context {
|
|
211
|
+
const base = this.rootSpanContext ?? ROOT_CONTEXT;
|
|
212
|
+
return trace.setSpan(base, span);
|
|
213
|
+
}
|
|
214
|
+
|
|
154
215
|
private async onSTTEvent(ev: SpeechEvent) {
|
|
155
216
|
if (
|
|
156
217
|
this.turnDetectionMode === 'manual' &&
|
|
@@ -299,19 +360,25 @@ export class AudioRecognition {
|
|
|
299
360
|
break;
|
|
300
361
|
case SpeechEventType.START_OF_SPEECH:
|
|
301
362
|
if (this.turnDetectionMode !== 'stt') break;
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
363
|
+
{
|
|
364
|
+
const span = this.ensureUserTurnSpan(Date.now());
|
|
365
|
+
const ctx = this.userTurnContext(span);
|
|
366
|
+
otelContext.with(ctx, () => {
|
|
367
|
+
this.hooks.onStartOfSpeech({
|
|
368
|
+
type: VADEventType.START_OF_SPEECH,
|
|
369
|
+
samplesIndex: 0,
|
|
370
|
+
timestamp: Date.now(),
|
|
371
|
+
speechDuration: 0,
|
|
372
|
+
silenceDuration: 0,
|
|
373
|
+
frames: [],
|
|
374
|
+
probability: 0,
|
|
375
|
+
inferenceDuration: 0,
|
|
376
|
+
speaking: true,
|
|
377
|
+
rawAccumulatedSilence: 0,
|
|
378
|
+
rawAccumulatedSpeech: 0,
|
|
379
|
+
});
|
|
380
|
+
});
|
|
381
|
+
}
|
|
315
382
|
this.speaking = true;
|
|
316
383
|
this.lastSpeakingTime = Date.now();
|
|
317
384
|
|
|
@@ -319,19 +386,25 @@ export class AudioRecognition {
|
|
|
319
386
|
break;
|
|
320
387
|
case SpeechEventType.END_OF_SPEECH:
|
|
321
388
|
if (this.turnDetectionMode !== 'stt') break;
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
389
|
+
{
|
|
390
|
+
const span = this.ensureUserTurnSpan();
|
|
391
|
+
const ctx = this.userTurnContext(span);
|
|
392
|
+
otelContext.with(ctx, () => {
|
|
393
|
+
this.hooks.onEndOfSpeech({
|
|
394
|
+
type: VADEventType.END_OF_SPEECH,
|
|
395
|
+
samplesIndex: 0,
|
|
396
|
+
timestamp: Date.now(),
|
|
397
|
+
speechDuration: 0,
|
|
398
|
+
silenceDuration: 0,
|
|
399
|
+
frames: [],
|
|
400
|
+
probability: 0,
|
|
401
|
+
inferenceDuration: 0,
|
|
402
|
+
speaking: false,
|
|
403
|
+
rawAccumulatedSilence: 0,
|
|
404
|
+
rawAccumulatedSpeech: 0,
|
|
405
|
+
});
|
|
406
|
+
});
|
|
407
|
+
}
|
|
335
408
|
this.speaking = false;
|
|
336
409
|
this.userTurnCommitted = true;
|
|
337
410
|
this.lastSpeakingTime = Date.now();
|
|
@@ -376,6 +449,9 @@ export class AudioRecognition {
|
|
|
376
449
|
async (controller: AbortController) => {
|
|
377
450
|
let endpointingDelay = this.minEndpointingDelay;
|
|
378
451
|
|
|
452
|
+
const userTurnSpan = this.ensureUserTurnSpan();
|
|
453
|
+
const userTurnCtx = this.userTurnContext(userTurnSpan);
|
|
454
|
+
|
|
379
455
|
if (turnDetector) {
|
|
380
456
|
await tracer.startActiveSpan(
|
|
381
457
|
async (span) => {
|
|
@@ -415,7 +491,7 @@ export class AudioRecognition {
|
|
|
415
491
|
},
|
|
416
492
|
{
|
|
417
493
|
name: 'eou_detection',
|
|
418
|
-
context:
|
|
494
|
+
context: userTurnCtx,
|
|
419
495
|
},
|
|
420
496
|
);
|
|
421
497
|
}
|
|
@@ -577,17 +653,13 @@ export class AudioRecognition {
|
|
|
577
653
|
switch (ev.type) {
|
|
578
654
|
case VADEventType.START_OF_SPEECH:
|
|
579
655
|
this.logger.debug('VAD task: START_OF_SPEECH');
|
|
580
|
-
|
|
581
|
-
this.speaking = true;
|
|
582
|
-
|
|
583
|
-
if (!this.userTurnSpan) {
|
|
656
|
+
{
|
|
584
657
|
const startTime = Date.now() - ev.speechDuration;
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
startTime,
|
|
589
|
-
});
|
|
658
|
+
const span = this.ensureUserTurnSpan(startTime);
|
|
659
|
+
const ctx = this.userTurnContext(span);
|
|
660
|
+
otelContext.with(ctx, () => this.hooks.onStartOfSpeech(ev));
|
|
590
661
|
}
|
|
662
|
+
this.speaking = true;
|
|
591
663
|
|
|
592
664
|
// Capture sample rate from the first VAD event if not already set
|
|
593
665
|
if (ev.frames.length > 0 && ev.frames[0]) {
|
|
@@ -609,7 +681,11 @@ export class AudioRecognition {
|
|
|
609
681
|
break;
|
|
610
682
|
case VADEventType.END_OF_SPEECH:
|
|
611
683
|
this.logger.debug('VAD task: END_OF_SPEECH');
|
|
612
|
-
|
|
684
|
+
{
|
|
685
|
+
const span = this.ensureUserTurnSpan();
|
|
686
|
+
const ctx = this.userTurnContext(span);
|
|
687
|
+
otelContext.with(ctx, () => this.hooks.onEndOfSpeech(ev));
|
|
688
|
+
}
|
|
613
689
|
|
|
614
690
|
// when VAD fires END_OF_SPEECH, it already waited for the silence_duration
|
|
615
691
|
this.speaking = false;
|
|
@@ -692,6 +768,10 @@ export class AudioRecognition {
|
|
|
692
768
|
this.logger.debug('User turn committed');
|
|
693
769
|
})
|
|
694
770
|
.catch((err: unknown) => {
|
|
771
|
+
if (err instanceof Error && err.name === 'AbortError') {
|
|
772
|
+
this.logger.debug('User turn commit task cancelled');
|
|
773
|
+
return;
|
|
774
|
+
}
|
|
695
775
|
this.logger.error(err, 'Error in user turn commit task:');
|
|
696
776
|
});
|
|
697
777
|
}
|