@livekit/agents 1.0.15 → 1.0.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.cjs +12 -12
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.d.cts +3 -3
- package/dist/cli.d.ts +3 -3
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +13 -13
- package/dist/cli.js.map +1 -1
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +1 -1
- package/dist/inference/stt.js.map +1 -1
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.cts +2 -1
- package/dist/inference/tts.d.ts +2 -1
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js +1 -5
- package/dist/inference/tts.js.map +1 -1
- package/dist/llm/chat_context.cjs +78 -0
- package/dist/llm/chat_context.cjs.map +1 -1
- package/dist/llm/chat_context.d.cts +16 -0
- package/dist/llm/chat_context.d.ts +16 -0
- package/dist/llm/chat_context.d.ts.map +1 -1
- package/dist/llm/chat_context.js +78 -0
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/chat_context.test.cjs +531 -0
- package/dist/llm/chat_context.test.cjs.map +1 -1
- package/dist/llm/chat_context.test.js +531 -0
- package/dist/llm/chat_context.test.js.map +1 -1
- package/dist/llm/tool_context.cjs +40 -0
- package/dist/llm/tool_context.cjs.map +1 -1
- package/dist/llm/tool_context.d.cts +2 -0
- package/dist/llm/tool_context.d.ts +2 -0
- package/dist/llm/tool_context.d.ts.map +1 -1
- package/dist/llm/tool_context.js +38 -0
- package/dist/llm/tool_context.js.map +1 -1
- package/dist/metrics/base.cjs.map +1 -1
- package/dist/metrics/base.d.cts +7 -0
- package/dist/metrics/base.d.ts +7 -0
- package/dist/metrics/base.d.ts.map +1 -1
- package/dist/stt/stt.cjs +1 -1
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.cts +7 -1
- package/dist/stt/stt.d.ts +7 -1
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +1 -1
- package/dist/stt/stt.js.map +1 -1
- package/dist/tts/tts.cjs +2 -4
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js +3 -5
- package/dist/tts/tts.js.map +1 -1
- package/dist/voice/agent_activity.cjs +83 -8
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +6 -2
- package/dist/voice/agent_activity.d.ts +6 -2
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +83 -8
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +3 -2
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +2 -1
- package/dist/voice/agent_session.d.ts +2 -1
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +3 -2
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +138 -16
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +11 -0
- package/dist/voice/audio_recognition.d.ts +11 -0
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +138 -16
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/room_io/_input.cjs.map +1 -1
- package/dist/voice/room_io/_input.d.ts.map +1 -1
- package/dist/voice/room_io/_input.js +0 -1
- package/dist/voice/room_io/_input.js.map +1 -1
- package/dist/worker.cjs +17 -11
- package/dist/worker.cjs.map +1 -1
- package/dist/worker.d.cts +16 -9
- package/dist/worker.d.ts +16 -9
- package/dist/worker.d.ts.map +1 -1
- package/dist/worker.js +16 -12
- package/dist/worker.js.map +1 -1
- package/package.json +1 -1
- package/src/cli.ts +17 -17
- package/src/inference/stt.ts +2 -1
- package/src/inference/tts.ts +2 -5
- package/src/llm/chat_context.test.ts +607 -0
- package/src/llm/chat_context.ts +106 -0
- package/src/llm/tool_context.ts +44 -0
- package/src/metrics/base.ts +7 -0
- package/src/stt/stt.ts +8 -1
- package/src/tts/tts.ts +7 -5
- package/src/voice/agent_activity.ts +119 -9
- package/src/voice/agent_session.ts +3 -1
- package/src/voice/audio_recognition.ts +235 -57
- package/src/voice/room_io/_input.ts +1 -1
- package/src/worker.ts +29 -18
package/src/llm/tool_context.ts
CHANGED
|
@@ -187,6 +187,50 @@ export type ToolContext<UserData = UnknownUserData> = {
|
|
|
187
187
|
[name: string]: FunctionTool<any, UserData, any>;
|
|
188
188
|
};
|
|
189
189
|
|
|
190
|
+
export function isSameToolContext(ctx1: ToolContext, ctx2: ToolContext): boolean {
|
|
191
|
+
const toolNames = new Set(Object.keys(ctx1));
|
|
192
|
+
const toolNames2 = new Set(Object.keys(ctx2));
|
|
193
|
+
|
|
194
|
+
if (toolNames.size !== toolNames2.size) {
|
|
195
|
+
return false;
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
for (const name of toolNames) {
|
|
199
|
+
if (!toolNames2.has(name)) {
|
|
200
|
+
return false;
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
const tool1 = ctx1[name];
|
|
204
|
+
const tool2 = ctx2[name];
|
|
205
|
+
|
|
206
|
+
if (!tool1 || !tool2) {
|
|
207
|
+
return false;
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
if (tool1.description !== tool2.description) {
|
|
211
|
+
return false;
|
|
212
|
+
}
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
return true;
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
export function isSameToolChoice(choice1: ToolChoice | null, choice2: ToolChoice | null): boolean {
|
|
219
|
+
if (choice1 === choice2) {
|
|
220
|
+
return true;
|
|
221
|
+
}
|
|
222
|
+
if (choice1 === null || choice2 === null) {
|
|
223
|
+
return false;
|
|
224
|
+
}
|
|
225
|
+
if (typeof choice1 === 'string' && typeof choice2 === 'string') {
|
|
226
|
+
return choice1 === choice2;
|
|
227
|
+
}
|
|
228
|
+
if (typeof choice1 === 'object' && typeof choice2 === 'object') {
|
|
229
|
+
return choice1.type === choice2.type && choice1.function.name === choice2.function.name;
|
|
230
|
+
}
|
|
231
|
+
return false;
|
|
232
|
+
}
|
|
233
|
+
|
|
190
234
|
/**
|
|
191
235
|
* Create a function tool with inferred parameters from the schema.
|
|
192
236
|
*/
|
package/src/metrics/base.ts
CHANGED
|
@@ -91,6 +91,13 @@ export type EOUMetrics = {
|
|
|
91
91
|
* Time taken to invoke the user's `Agent.onUserTurnCompleted` callback.
|
|
92
92
|
*/
|
|
93
93
|
onUserTurnCompletedDelayMs: number;
|
|
94
|
+
/**
|
|
95
|
+
* The time the user stopped speaking.
|
|
96
|
+
*/
|
|
97
|
+
lastSpeakingTimeMs: number;
|
|
98
|
+
/**
|
|
99
|
+
* The ID of the speech handle.
|
|
100
|
+
*/
|
|
94
101
|
speechId?: string;
|
|
95
102
|
};
|
|
96
103
|
|
package/src/stt/stt.ts
CHANGED
|
@@ -38,6 +38,12 @@ export enum SpeechEventType {
|
|
|
38
38
|
END_OF_SPEECH = 3,
|
|
39
39
|
/** Usage event, emitted periodically to indicate usage metrics. */
|
|
40
40
|
RECOGNITION_USAGE = 4,
|
|
41
|
+
/**
|
|
42
|
+
* Preflight transcript, emitted before final transcript when STT has high confidence
|
|
43
|
+
* but hasn't fully committed yet. Includes all pre-committed transcripts including
|
|
44
|
+
* final transcript from the previous STT run.
|
|
45
|
+
*/
|
|
46
|
+
PREFLIGHT_TRANSCRIPT = 5,
|
|
41
47
|
}
|
|
42
48
|
|
|
43
49
|
/** SpeechData contains metadata about this {@link SpeechEvent}. */
|
|
@@ -198,7 +204,8 @@ export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent>
|
|
|
198
204
|
options: { retryable: false },
|
|
199
205
|
});
|
|
200
206
|
} else {
|
|
201
|
-
|
|
207
|
+
// Don't emit error event for recoverable errors during retry loop
|
|
208
|
+
// to avoid ERR_UNHANDLED_ERROR or premature session termination
|
|
202
209
|
this.logger.warn(
|
|
203
210
|
{ tts: this.#stt.label, attempt: i + 1, error },
|
|
204
211
|
`failed to recognize speech, retrying in ${retryInterval}s`,
|
package/src/tts/tts.ts
CHANGED
|
@@ -5,7 +5,7 @@ import type { AudioFrame } from '@livekit/rtc-node';
|
|
|
5
5
|
import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
|
|
6
6
|
import { EventEmitter } from 'node:events';
|
|
7
7
|
import type { ReadableStream } from 'node:stream/web';
|
|
8
|
-
import { APIConnectionError,
|
|
8
|
+
import { APIConnectionError, APIError } from '../_exceptions.js';
|
|
9
9
|
import { log } from '../log.js';
|
|
10
10
|
import type { TTSMetrics } from '../metrics/base.js';
|
|
11
11
|
import { DeferredReadableStream } from '../stream/deferred_stream.js';
|
|
@@ -161,7 +161,7 @@ export abstract class SynthesizeStream
|
|
|
161
161
|
try {
|
|
162
162
|
return await this.run();
|
|
163
163
|
} catch (error) {
|
|
164
|
-
if (error instanceof
|
|
164
|
+
if (error instanceof APIError) {
|
|
165
165
|
const retryInterval = this._connOptions._intervalForRetry(i);
|
|
166
166
|
|
|
167
167
|
if (this._connOptions.maxRetry === 0 || !error.retryable) {
|
|
@@ -174,7 +174,8 @@ export abstract class SynthesizeStream
|
|
|
174
174
|
options: { retryable: false },
|
|
175
175
|
});
|
|
176
176
|
} else {
|
|
177
|
-
|
|
177
|
+
// Don't emit error event for recoverable errors during retry loop
|
|
178
|
+
// to avoid ERR_UNHANDLED_ERROR or premature session termination
|
|
178
179
|
this.logger.warn(
|
|
179
180
|
{ tts: this.#tts.label, attempt: i + 1, error },
|
|
180
181
|
`failed to synthesize speech, retrying in ${retryInterval}s`,
|
|
@@ -388,7 +389,7 @@ export abstract class ChunkedStream implements AsyncIterableIterator<Synthesized
|
|
|
388
389
|
try {
|
|
389
390
|
return await this.run();
|
|
390
391
|
} catch (error) {
|
|
391
|
-
if (error instanceof
|
|
392
|
+
if (error instanceof APIError) {
|
|
392
393
|
const retryInterval = this._connOptions._intervalForRetry(i);
|
|
393
394
|
|
|
394
395
|
if (this._connOptions.maxRetry === 0 || !error.retryable) {
|
|
@@ -401,7 +402,8 @@ export abstract class ChunkedStream implements AsyncIterableIterator<Synthesized
|
|
|
401
402
|
options: { retryable: false },
|
|
402
403
|
});
|
|
403
404
|
} else {
|
|
404
|
-
|
|
405
|
+
// Don't emit error event for recoverable errors during retry loop
|
|
406
|
+
// to avoid ERR_UNHANDLED_ERROR or premature session termination
|
|
405
407
|
this.logger.warn(
|
|
406
408
|
{ tts: this.#tts.label, attempt: i + 1, error },
|
|
407
409
|
`failed to generate TTS completion, retrying in ${retryInterval}s`,
|
package/src/voice/agent_activity.ts
CHANGED
|
@@ -22,6 +22,7 @@ import {
|
|
|
22
22
|
type ToolContext,
|
|
23
23
|
} from '../llm/index.js';
|
|
24
24
|
import type { LLMError } from '../llm/llm.js';
|
|
25
|
+
import { isSameToolChoice, isSameToolContext } from '../llm/tool_context.js';
|
|
25
26
|
import { log } from '../log.js';
|
|
26
27
|
import type {
|
|
27
28
|
EOUMetrics,
|
|
@@ -43,6 +44,7 @@ import { type AgentSession, type TurnDetectionMode } from './agent_session.js';
|
|
|
43
44
|
import {
|
|
44
45
|
AudioRecognition,
|
|
45
46
|
type EndOfTurnInfo,
|
|
47
|
+
type PreemptiveGenerationInfo,
|
|
46
48
|
type RecognitionHooks,
|
|
47
49
|
type _TurnDetector,
|
|
48
50
|
} from './audio_recognition.js';
|
|
@@ -71,6 +73,16 @@ import { SpeechHandle } from './speech_handle.js';
|
|
|
71
73
|
// equivalent to Python's contextvars
|
|
72
74
|
const speechHandleStorage = new AsyncLocalStorage<SpeechHandle>();
|
|
73
75
|
|
|
76
|
+
interface PreemptiveGeneration {
|
|
77
|
+
speechHandle: SpeechHandle;
|
|
78
|
+
userMessage: ChatMessage;
|
|
79
|
+
info: PreemptiveGenerationInfo;
|
|
80
|
+
chatCtx: ChatContext;
|
|
81
|
+
tools: ToolContext;
|
|
82
|
+
toolChoice: ToolChoice | null;
|
|
83
|
+
createdAt: number;
|
|
84
|
+
}
|
|
85
|
+
|
|
74
86
|
export class AgentActivity implements RecognitionHooks {
|
|
75
87
|
private static readonly REPLY_TASK_CANCEL_TIMEOUT = 5000;
|
|
76
88
|
private started = false;
|
|
@@ -87,6 +99,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
87
99
|
private audioStream = new DeferredReadableStream<AudioFrame>();
|
|
88
100
|
// default to null as None, which maps to the default provider tool choice value
|
|
89
101
|
private toolChoice: ToolChoice | null = null;
|
|
102
|
+
private _preemptiveGeneration?: PreemptiveGeneration;
|
|
90
103
|
|
|
91
104
|
agent: Agent;
|
|
92
105
|
agentSession: AgentSession;
|
|
@@ -589,8 +602,12 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
589
602
|
this.agentSession._updateUserState('speaking');
|
|
590
603
|
}
|
|
591
604
|
|
|
592
|
-
onEndOfSpeech(
|
|
593
|
-
|
|
605
|
+
onEndOfSpeech(ev: VADEvent): void {
|
|
606
|
+
let speechEndTime = Date.now();
|
|
607
|
+
if (ev) {
|
|
608
|
+
speechEndTime = speechEndTime - ev.silenceDuration;
|
|
609
|
+
}
|
|
610
|
+
this.agentSession._updateUserState('listening', speechEndTime);
|
|
594
611
|
}
|
|
595
612
|
|
|
596
613
|
onVADInferenceDone(ev: VADEvent): void {
|
|
@@ -664,6 +681,55 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
664
681
|
);
|
|
665
682
|
}
|
|
666
683
|
|
|
684
|
+
onPreemptiveGeneration(info: PreemptiveGenerationInfo): void {
|
|
685
|
+
if (
|
|
686
|
+
!this.agentSession.options.preemptiveGeneration ||
|
|
687
|
+
this.draining ||
|
|
688
|
+
(this._currentSpeech !== undefined && !this._currentSpeech.interrupted) ||
|
|
689
|
+
!(this.llm instanceof LLM)
|
|
690
|
+
) {
|
|
691
|
+
return;
|
|
692
|
+
}
|
|
693
|
+
|
|
694
|
+
this.cancelPreemptiveGeneration();
|
|
695
|
+
|
|
696
|
+
this.logger.info(
|
|
697
|
+
{
|
|
698
|
+
newTranscript: info.newTranscript,
|
|
699
|
+
transcriptConfidence: info.transcriptConfidence,
|
|
700
|
+
},
|
|
701
|
+
'starting preemptive generation',
|
|
702
|
+
);
|
|
703
|
+
|
|
704
|
+
const userMessage = ChatMessage.create({
|
|
705
|
+
role: 'user',
|
|
706
|
+
content: info.newTranscript,
|
|
707
|
+
});
|
|
708
|
+
const chatCtx = this.agent.chatCtx.copy();
|
|
709
|
+
const speechHandle = this.generateReply({
|
|
710
|
+
userMessage,
|
|
711
|
+
chatCtx,
|
|
712
|
+
scheduleSpeech: false,
|
|
713
|
+
});
|
|
714
|
+
|
|
715
|
+
this._preemptiveGeneration = {
|
|
716
|
+
speechHandle,
|
|
717
|
+
userMessage,
|
|
718
|
+
info,
|
|
719
|
+
chatCtx: chatCtx.copy(),
|
|
720
|
+
tools: { ...this.tools },
|
|
721
|
+
toolChoice: this.toolChoice,
|
|
722
|
+
createdAt: Date.now(),
|
|
723
|
+
};
|
|
724
|
+
}
|
|
725
|
+
|
|
726
|
+
private cancelPreemptiveGeneration(): void {
|
|
727
|
+
if (this._preemptiveGeneration !== undefined) {
|
|
728
|
+
this._preemptiveGeneration.speechHandle._cancel();
|
|
729
|
+
this._preemptiveGeneration = undefined;
|
|
730
|
+
}
|
|
731
|
+
}
|
|
732
|
+
|
|
667
733
|
private createSpeechTask(options: {
|
|
668
734
|
task: Task<void>;
|
|
669
735
|
ownedSpeechHandle?: SpeechHandle;
|
|
@@ -694,6 +760,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
694
760
|
|
|
695
761
|
async onEndOfTurn(info: EndOfTurnInfo): Promise<boolean> {
|
|
696
762
|
if (this.draining) {
|
|
763
|
+
this.cancelPreemptiveGeneration();
|
|
697
764
|
this.logger.warn({ user_input: info.newTranscript }, 'skipping user input, task is draining');
|
|
698
765
|
// copied from python:
|
|
699
766
|
// TODO(shubhra): should we "forward" this new turn to the next agent/activity?
|
|
@@ -710,6 +777,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
710
777
|
info.newTranscript.split(' ').length < this.agentSession.options.minInterruptionWords
|
|
711
778
|
) {
|
|
712
779
|
// avoid interruption if the new_transcript is too short
|
|
780
|
+
this.cancelPreemptiveGeneration();
|
|
713
781
|
this.logger.info('skipping user input, new_transcript is too short');
|
|
714
782
|
return false;
|
|
715
783
|
}
|
|
@@ -775,6 +843,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
775
843
|
instructions?: string;
|
|
776
844
|
toolChoice?: ToolChoice | null;
|
|
777
845
|
allowInterruptions?: boolean;
|
|
846
|
+
scheduleSpeech?: boolean;
|
|
778
847
|
}): SpeechHandle {
|
|
779
848
|
const {
|
|
780
849
|
userMessage,
|
|
@@ -782,6 +851,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
782
851
|
instructions: defaultInstructions,
|
|
783
852
|
toolChoice: defaultToolChoice,
|
|
784
853
|
allowInterruptions: defaultAllowInterruptions,
|
|
854
|
+
scheduleSpeech = true,
|
|
785
855
|
} = options;
|
|
786
856
|
|
|
787
857
|
let instructions = defaultInstructions;
|
|
@@ -871,7 +941,9 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
871
941
|
task.finally(() => this.onPipelineReplyDone());
|
|
872
942
|
}
|
|
873
943
|
|
|
874
|
-
|
|
944
|
+
if (scheduleSpeech) {
|
|
945
|
+
this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
946
|
+
}
|
|
875
947
|
return handle;
|
|
876
948
|
}
|
|
877
949
|
|
|
@@ -977,9 +1049,40 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
977
1049
|
return;
|
|
978
1050
|
}
|
|
979
1051
|
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
1052
|
+
let speechHandle: SpeechHandle | undefined;
|
|
1053
|
+
if (this._preemptiveGeneration !== undefined) {
|
|
1054
|
+
const preemptive = this._preemptiveGeneration;
|
|
1055
|
+
// make sure the onUserTurnCompleted didn't change some request parameters
|
|
1056
|
+
// otherwise invalidate the preemptive generation
|
|
1057
|
+
if (
|
|
1058
|
+
preemptive.info.newTranscript === userMessage?.textContent &&
|
|
1059
|
+
preemptive.chatCtx.isEquivalent(chatCtx) &&
|
|
1060
|
+
isSameToolContext(preemptive.tools, this.tools) &&
|
|
1061
|
+
isSameToolChoice(preemptive.toolChoice, this.toolChoice)
|
|
1062
|
+
) {
|
|
1063
|
+
speechHandle = preemptive.speechHandle;
|
|
1064
|
+
this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
1065
|
+
this.logger.debug(
|
|
1066
|
+
{
|
|
1067
|
+
preemptiveLeadTime: Date.now() - preemptive.createdAt,
|
|
1068
|
+
},
|
|
1069
|
+
'using preemptive generation',
|
|
1070
|
+
);
|
|
1071
|
+
} else {
|
|
1072
|
+
this.logger.warn(
|
|
1073
|
+
'preemptive generation enabled but chat context or tools have changed after `onUserTurnCompleted`',
|
|
1074
|
+
);
|
|
1075
|
+
preemptive.speechHandle._cancel();
|
|
1076
|
+
}
|
|
1077
|
+
|
|
1078
|
+
this._preemptiveGeneration = undefined;
|
|
1079
|
+
}
|
|
1080
|
+
|
|
1081
|
+
if (speechHandle === undefined) {
|
|
1082
|
+
// Ensure the new message is passed to generateReply
|
|
1083
|
+
// This preserves the original message id, making it easier for users to track responses
|
|
1084
|
+
speechHandle = this.generateReply({ userMessage, chatCtx });
|
|
1085
|
+
}
|
|
983
1086
|
|
|
984
1087
|
const eouMetrics: EOUMetrics = {
|
|
985
1088
|
type: 'eou_metrics',
|
|
@@ -987,6 +1090,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
987
1090
|
endOfUtteranceDelayMs: info.endOfUtteranceDelay,
|
|
988
1091
|
transcriptionDelayMs: info.transcriptionDelay,
|
|
989
1092
|
onUserTurnCompletedDelayMs: callbackDuration,
|
|
1093
|
+
lastSpeakingTimeMs: info.stoppedSpeakingAt ?? 0,
|
|
990
1094
|
speechId: speechHandle.id,
|
|
991
1095
|
};
|
|
992
1096
|
|
|
@@ -1139,10 +1243,9 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1139
1243
|
|
|
1140
1244
|
chatCtx = chatCtx.copy();
|
|
1141
1245
|
|
|
1246
|
+
// Insert new message into temporary chat context for LLM inference
|
|
1142
1247
|
if (newMessage) {
|
|
1143
1248
|
chatCtx.insert(newMessage);
|
|
1144
|
-
this.agent._chatCtx.insert(newMessage);
|
|
1145
|
-
this.agentSession._conversationItemAdded(newMessage);
|
|
1146
1249
|
}
|
|
1147
1250
|
|
|
1148
1251
|
if (instructions) {
|
|
@@ -1157,7 +1260,6 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1157
1260
|
}
|
|
1158
1261
|
}
|
|
1159
1262
|
|
|
1160
|
-
this.agentSession._updateAgentState('thinking');
|
|
1161
1263
|
const tasks: Array<Task<void>> = [];
|
|
1162
1264
|
const [llmTask, llmGenData] = performLLMInference(
|
|
1163
1265
|
// preserve `this` context in llmNode
|
|
@@ -1185,6 +1287,12 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1185
1287
|
|
|
1186
1288
|
await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
|
|
1187
1289
|
|
|
1290
|
+
// Add new message to actual chat context if the speech is scheduled
|
|
1291
|
+
if (newMessage && speechHandle.scheduled) {
|
|
1292
|
+
this.agent._chatCtx.insert(newMessage);
|
|
1293
|
+
this.agentSession._conversationItemAdded(newMessage);
|
|
1294
|
+
}
|
|
1295
|
+
|
|
1188
1296
|
if (speechHandle.interrupted) {
|
|
1189
1297
|
replyAbortController.abort();
|
|
1190
1298
|
await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
|
|
@@ -1917,6 +2025,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1917
2025
|
try {
|
|
1918
2026
|
if (this._draining) return;
|
|
1919
2027
|
|
|
2028
|
+
this.cancelPreemptiveGeneration();
|
|
1920
2029
|
this.createSpeechTask({
|
|
1921
2030
|
task: Task.from(() => this.agent.onExit()),
|
|
1922
2031
|
name: 'AgentActivity_onExit',
|
|
@@ -1937,6 +2046,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1937
2046
|
this.logger.warn('task closing without draining');
|
|
1938
2047
|
}
|
|
1939
2048
|
|
|
2049
|
+
this.cancelPreemptiveGeneration();
|
|
1940
2050
|
// Unregister event handlers to prevent duplicate metrics
|
|
1941
2051
|
if (this.llm instanceof LLM) {
|
|
1942
2052
|
this.llm.off('metrics_collected', this.onMetricsCollected);
|
package/src/voice/agent_session.ts
CHANGED
|
@@ -57,6 +57,7 @@ export interface VoiceOptions {
|
|
|
57
57
|
minEndpointingDelay: number;
|
|
58
58
|
maxEndpointingDelay: number;
|
|
59
59
|
maxToolSteps: number;
|
|
60
|
+
preemptiveGeneration: boolean;
|
|
60
61
|
}
|
|
61
62
|
|
|
62
63
|
const defaultVoiceOptions: VoiceOptions = {
|
|
@@ -67,6 +68,7 @@ const defaultVoiceOptions: VoiceOptions = {
|
|
|
67
68
|
minEndpointingDelay: 500,
|
|
68
69
|
maxEndpointingDelay: 6000,
|
|
69
70
|
maxToolSteps: 3,
|
|
71
|
+
preemptiveGeneration: false,
|
|
70
72
|
} as const;
|
|
71
73
|
|
|
72
74
|
export type TurnDetectionMode = 'stt' | 'vad' | 'realtime_llm' | 'manual' | _TurnDetector;
|
|
@@ -421,7 +423,7 @@ export class AgentSession<
|
|
|
421
423
|
}
|
|
422
424
|
|
|
423
425
|
/** @internal */
|
|
424
|
-
_updateUserState(state: UserState) {
|
|
426
|
+
_updateUserState(state: UserState, _lastSpeakingTime?: number) {
|
|
425
427
|
if (this.userState === state) {
|
|
426
428
|
return;
|
|
427
429
|
}
|