@livekit/agents 1.0.14 → 1.0.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.cjs +12 -12
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.d.cts +3 -3
- package/dist/cli.d.ts +3 -3
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +13 -13
- package/dist/cli.js.map +1 -1
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +1 -1
- package/dist/inference/stt.js.map +1 -1
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.cts +2 -1
- package/dist/inference/tts.d.ts +2 -1
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js +1 -5
- package/dist/inference/tts.js.map +1 -1
- package/dist/llm/chat_context.cjs +78 -0
- package/dist/llm/chat_context.cjs.map +1 -1
- package/dist/llm/chat_context.d.cts +16 -0
- package/dist/llm/chat_context.d.ts +16 -0
- package/dist/llm/chat_context.d.ts.map +1 -1
- package/dist/llm/chat_context.js +78 -0
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/chat_context.test.cjs +531 -0
- package/dist/llm/chat_context.test.cjs.map +1 -1
- package/dist/llm/chat_context.test.js +531 -0
- package/dist/llm/chat_context.test.js.map +1 -1
- package/dist/llm/tool_context.cjs +43 -2
- package/dist/llm/tool_context.cjs.map +1 -1
- package/dist/llm/tool_context.d.cts +39 -11
- package/dist/llm/tool_context.d.ts +39 -11
- package/dist/llm/tool_context.d.ts.map +1 -1
- package/dist/llm/tool_context.js +42 -3
- package/dist/llm/tool_context.js.map +1 -1
- package/dist/llm/tool_context.test.cjs +197 -0
- package/dist/llm/tool_context.test.cjs.map +1 -1
- package/dist/llm/tool_context.test.js +175 -0
- package/dist/llm/tool_context.test.js.map +1 -1
- package/dist/llm/utils.cjs +17 -11
- package/dist/llm/utils.cjs.map +1 -1
- package/dist/llm/utils.d.cts +1 -2
- package/dist/llm/utils.d.ts +1 -2
- package/dist/llm/utils.d.ts.map +1 -1
- package/dist/llm/utils.js +17 -11
- package/dist/llm/utils.js.map +1 -1
- package/dist/llm/zod-utils.cjs +99 -0
- package/dist/llm/zod-utils.cjs.map +1 -0
- package/dist/llm/zod-utils.d.cts +65 -0
- package/dist/llm/zod-utils.d.ts +65 -0
- package/dist/llm/zod-utils.d.ts.map +1 -0
- package/dist/llm/zod-utils.js +61 -0
- package/dist/llm/zod-utils.js.map +1 -0
- package/dist/llm/zod-utils.test.cjs +389 -0
- package/dist/llm/zod-utils.test.cjs.map +1 -0
- package/dist/llm/zod-utils.test.js +372 -0
- package/dist/llm/zod-utils.test.js.map +1 -0
- package/dist/metrics/base.cjs.map +1 -1
- package/dist/metrics/base.d.cts +7 -0
- package/dist/metrics/base.d.ts +7 -0
- package/dist/metrics/base.d.ts.map +1 -1
- package/dist/stt/stt.cjs +1 -0
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.cts +7 -1
- package/dist/stt/stt.d.ts +7 -1
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +1 -0
- package/dist/stt/stt.js.map +1 -1
- package/dist/vad.cjs +16 -0
- package/dist/vad.cjs.map +1 -1
- package/dist/vad.d.cts +6 -0
- package/dist/vad.d.ts +6 -0
- package/dist/vad.d.ts.map +1 -1
- package/dist/vad.js +16 -0
- package/dist/vad.js.map +1 -1
- package/dist/voice/agent_activity.cjs +83 -8
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +6 -2
- package/dist/voice/agent_activity.d.ts +6 -2
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +83 -8
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +3 -2
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +2 -1
- package/dist/voice/agent_session.d.ts +2 -1
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +3 -2
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +138 -16
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +11 -0
- package/dist/voice/audio_recognition.d.ts +11 -0
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +138 -16
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/generation.cjs +8 -3
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +8 -3
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/room_io/_input.cjs.map +1 -1
- package/dist/voice/room_io/_input.d.ts.map +1 -1
- package/dist/voice/room_io/_input.js +0 -1
- package/dist/voice/room_io/_input.js.map +1 -1
- package/dist/worker.cjs +17 -11
- package/dist/worker.cjs.map +1 -1
- package/dist/worker.d.cts +16 -9
- package/dist/worker.d.ts +16 -9
- package/dist/worker.d.ts.map +1 -1
- package/dist/worker.js +16 -12
- package/dist/worker.js.map +1 -1
- package/package.json +5 -4
- package/src/cli.ts +17 -17
- package/src/inference/stt.ts +2 -1
- package/src/inference/tts.ts +2 -5
- package/src/llm/__snapshots__/zod-utils.test.ts.snap +341 -0
- package/src/llm/chat_context.test.ts +607 -0
- package/src/llm/chat_context.ts +106 -0
- package/src/llm/tool_context.test.ts +210 -1
- package/src/llm/tool_context.ts +101 -17
- package/src/llm/utils.ts +18 -15
- package/src/llm/zod-utils.test.ts +476 -0
- package/src/llm/zod-utils.ts +144 -0
- package/src/metrics/base.ts +7 -0
- package/src/stt/stt.ts +6 -0
- package/src/vad.ts +18 -0
- package/src/voice/agent_activity.ts +119 -9
- package/src/voice/agent_session.ts +3 -1
- package/src/voice/audio_recognition.ts +235 -57
- package/src/voice/generation.ts +8 -3
- package/src/voice/room_io/_input.ts +1 -1
- package/src/worker.ts +29 -18
|
@@ -22,6 +22,7 @@ import {
|
|
|
22
22
|
type ToolContext,
|
|
23
23
|
} from '../llm/index.js';
|
|
24
24
|
import type { LLMError } from '../llm/llm.js';
|
|
25
|
+
import { isSameToolChoice, isSameToolContext } from '../llm/tool_context.js';
|
|
25
26
|
import { log } from '../log.js';
|
|
26
27
|
import type {
|
|
27
28
|
EOUMetrics,
|
|
@@ -43,6 +44,7 @@ import { type AgentSession, type TurnDetectionMode } from './agent_session.js';
|
|
|
43
44
|
import {
|
|
44
45
|
AudioRecognition,
|
|
45
46
|
type EndOfTurnInfo,
|
|
47
|
+
type PreemptiveGenerationInfo,
|
|
46
48
|
type RecognitionHooks,
|
|
47
49
|
type _TurnDetector,
|
|
48
50
|
} from './audio_recognition.js';
|
|
@@ -71,6 +73,16 @@ import { SpeechHandle } from './speech_handle.js';
|
|
|
71
73
|
// equivalent to Python's contextvars
|
|
72
74
|
const speechHandleStorage = new AsyncLocalStorage<SpeechHandle>();
|
|
73
75
|
|
|
76
|
+
interface PreemptiveGeneration {
|
|
77
|
+
speechHandle: SpeechHandle;
|
|
78
|
+
userMessage: ChatMessage;
|
|
79
|
+
info: PreemptiveGenerationInfo;
|
|
80
|
+
chatCtx: ChatContext;
|
|
81
|
+
tools: ToolContext;
|
|
82
|
+
toolChoice: ToolChoice | null;
|
|
83
|
+
createdAt: number;
|
|
84
|
+
}
|
|
85
|
+
|
|
74
86
|
export class AgentActivity implements RecognitionHooks {
|
|
75
87
|
private static readonly REPLY_TASK_CANCEL_TIMEOUT = 5000;
|
|
76
88
|
private started = false;
|
|
@@ -87,6 +99,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
87
99
|
private audioStream = new DeferredReadableStream<AudioFrame>();
|
|
88
100
|
// default to null as None, which maps to the default provider tool choice value
|
|
89
101
|
private toolChoice: ToolChoice | null = null;
|
|
102
|
+
private _preemptiveGeneration?: PreemptiveGeneration;
|
|
90
103
|
|
|
91
104
|
agent: Agent;
|
|
92
105
|
agentSession: AgentSession;
|
|
@@ -589,8 +602,12 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
589
602
|
this.agentSession._updateUserState('speaking');
|
|
590
603
|
}
|
|
591
604
|
|
|
592
|
-
onEndOfSpeech(
|
|
593
|
-
|
|
605
|
+
onEndOfSpeech(ev: VADEvent): void {
|
|
606
|
+
let speechEndTime = Date.now();
|
|
607
|
+
if (ev) {
|
|
608
|
+
speechEndTime = speechEndTime - ev.silenceDuration;
|
|
609
|
+
}
|
|
610
|
+
this.agentSession._updateUserState('listening', speechEndTime);
|
|
594
611
|
}
|
|
595
612
|
|
|
596
613
|
onVADInferenceDone(ev: VADEvent): void {
|
|
@@ -664,6 +681,55 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
664
681
|
);
|
|
665
682
|
}
|
|
666
683
|
|
|
684
|
+
onPreemptiveGeneration(info: PreemptiveGenerationInfo): void {
|
|
685
|
+
if (
|
|
686
|
+
!this.agentSession.options.preemptiveGeneration ||
|
|
687
|
+
this.draining ||
|
|
688
|
+
(this._currentSpeech !== undefined && !this._currentSpeech.interrupted) ||
|
|
689
|
+
!(this.llm instanceof LLM)
|
|
690
|
+
) {
|
|
691
|
+
return;
|
|
692
|
+
}
|
|
693
|
+
|
|
694
|
+
this.cancelPreemptiveGeneration();
|
|
695
|
+
|
|
696
|
+
this.logger.info(
|
|
697
|
+
{
|
|
698
|
+
newTranscript: info.newTranscript,
|
|
699
|
+
transcriptConfidence: info.transcriptConfidence,
|
|
700
|
+
},
|
|
701
|
+
'starting preemptive generation',
|
|
702
|
+
);
|
|
703
|
+
|
|
704
|
+
const userMessage = ChatMessage.create({
|
|
705
|
+
role: 'user',
|
|
706
|
+
content: info.newTranscript,
|
|
707
|
+
});
|
|
708
|
+
const chatCtx = this.agent.chatCtx.copy();
|
|
709
|
+
const speechHandle = this.generateReply({
|
|
710
|
+
userMessage,
|
|
711
|
+
chatCtx,
|
|
712
|
+
scheduleSpeech: false,
|
|
713
|
+
});
|
|
714
|
+
|
|
715
|
+
this._preemptiveGeneration = {
|
|
716
|
+
speechHandle,
|
|
717
|
+
userMessage,
|
|
718
|
+
info,
|
|
719
|
+
chatCtx: chatCtx.copy(),
|
|
720
|
+
tools: { ...this.tools },
|
|
721
|
+
toolChoice: this.toolChoice,
|
|
722
|
+
createdAt: Date.now(),
|
|
723
|
+
};
|
|
724
|
+
}
|
|
725
|
+
|
|
726
|
+
private cancelPreemptiveGeneration(): void {
|
|
727
|
+
if (this._preemptiveGeneration !== undefined) {
|
|
728
|
+
this._preemptiveGeneration.speechHandle._cancel();
|
|
729
|
+
this._preemptiveGeneration = undefined;
|
|
730
|
+
}
|
|
731
|
+
}
|
|
732
|
+
|
|
667
733
|
private createSpeechTask(options: {
|
|
668
734
|
task: Task<void>;
|
|
669
735
|
ownedSpeechHandle?: SpeechHandle;
|
|
@@ -694,6 +760,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
694
760
|
|
|
695
761
|
async onEndOfTurn(info: EndOfTurnInfo): Promise<boolean> {
|
|
696
762
|
if (this.draining) {
|
|
763
|
+
this.cancelPreemptiveGeneration();
|
|
697
764
|
this.logger.warn({ user_input: info.newTranscript }, 'skipping user input, task is draining');
|
|
698
765
|
// copied from python:
|
|
699
766
|
// TODO(shubhra): should we "forward" this new turn to the next agent/activity?
|
|
@@ -710,6 +777,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
710
777
|
info.newTranscript.split(' ').length < this.agentSession.options.minInterruptionWords
|
|
711
778
|
) {
|
|
712
779
|
// avoid interruption if the new_transcript is too short
|
|
780
|
+
this.cancelPreemptiveGeneration();
|
|
713
781
|
this.logger.info('skipping user input, new_transcript is too short');
|
|
714
782
|
return false;
|
|
715
783
|
}
|
|
@@ -775,6 +843,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
775
843
|
instructions?: string;
|
|
776
844
|
toolChoice?: ToolChoice | null;
|
|
777
845
|
allowInterruptions?: boolean;
|
|
846
|
+
scheduleSpeech?: boolean;
|
|
778
847
|
}): SpeechHandle {
|
|
779
848
|
const {
|
|
780
849
|
userMessage,
|
|
@@ -782,6 +851,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
782
851
|
instructions: defaultInstructions,
|
|
783
852
|
toolChoice: defaultToolChoice,
|
|
784
853
|
allowInterruptions: defaultAllowInterruptions,
|
|
854
|
+
scheduleSpeech = true,
|
|
785
855
|
} = options;
|
|
786
856
|
|
|
787
857
|
let instructions = defaultInstructions;
|
|
@@ -871,7 +941,9 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
871
941
|
task.finally(() => this.onPipelineReplyDone());
|
|
872
942
|
}
|
|
873
943
|
|
|
874
|
-
|
|
944
|
+
if (scheduleSpeech) {
|
|
945
|
+
this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
946
|
+
}
|
|
875
947
|
return handle;
|
|
876
948
|
}
|
|
877
949
|
|
|
@@ -977,9 +1049,40 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
977
1049
|
return;
|
|
978
1050
|
}
|
|
979
1051
|
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
1052
|
+
let speechHandle: SpeechHandle | undefined;
|
|
1053
|
+
if (this._preemptiveGeneration !== undefined) {
|
|
1054
|
+
const preemptive = this._preemptiveGeneration;
|
|
1055
|
+
// make sure the onUserTurnCompleted didn't change some request parameters
|
|
1056
|
+
// otherwise invalidate the preemptive generation
|
|
1057
|
+
if (
|
|
1058
|
+
preemptive.info.newTranscript === userMessage?.textContent &&
|
|
1059
|
+
preemptive.chatCtx.isEquivalent(chatCtx) &&
|
|
1060
|
+
isSameToolContext(preemptive.tools, this.tools) &&
|
|
1061
|
+
isSameToolChoice(preemptive.toolChoice, this.toolChoice)
|
|
1062
|
+
) {
|
|
1063
|
+
speechHandle = preemptive.speechHandle;
|
|
1064
|
+
this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
1065
|
+
this.logger.debug(
|
|
1066
|
+
{
|
|
1067
|
+
preemptiveLeadTime: Date.now() - preemptive.createdAt,
|
|
1068
|
+
},
|
|
1069
|
+
'using preemptive generation',
|
|
1070
|
+
);
|
|
1071
|
+
} else {
|
|
1072
|
+
this.logger.warn(
|
|
1073
|
+
'preemptive generation enabled but chat context or tools have changed after `onUserTurnCompleted`',
|
|
1074
|
+
);
|
|
1075
|
+
preemptive.speechHandle._cancel();
|
|
1076
|
+
}
|
|
1077
|
+
|
|
1078
|
+
this._preemptiveGeneration = undefined;
|
|
1079
|
+
}
|
|
1080
|
+
|
|
1081
|
+
if (speechHandle === undefined) {
|
|
1082
|
+
// Ensure the new message is passed to generateReply
|
|
1083
|
+
// This preserves the original message id, making it easier for users to track responses
|
|
1084
|
+
speechHandle = this.generateReply({ userMessage, chatCtx });
|
|
1085
|
+
}
|
|
983
1086
|
|
|
984
1087
|
const eouMetrics: EOUMetrics = {
|
|
985
1088
|
type: 'eou_metrics',
|
|
@@ -987,6 +1090,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
987
1090
|
endOfUtteranceDelayMs: info.endOfUtteranceDelay,
|
|
988
1091
|
transcriptionDelayMs: info.transcriptionDelay,
|
|
989
1092
|
onUserTurnCompletedDelayMs: callbackDuration,
|
|
1093
|
+
lastSpeakingTimeMs: info.stoppedSpeakingAt ?? 0,
|
|
990
1094
|
speechId: speechHandle.id,
|
|
991
1095
|
};
|
|
992
1096
|
|
|
@@ -1139,10 +1243,9 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1139
1243
|
|
|
1140
1244
|
chatCtx = chatCtx.copy();
|
|
1141
1245
|
|
|
1246
|
+
// Insert new message into temporary chat context for LLM inference
|
|
1142
1247
|
if (newMessage) {
|
|
1143
1248
|
chatCtx.insert(newMessage);
|
|
1144
|
-
this.agent._chatCtx.insert(newMessage);
|
|
1145
|
-
this.agentSession._conversationItemAdded(newMessage);
|
|
1146
1249
|
}
|
|
1147
1250
|
|
|
1148
1251
|
if (instructions) {
|
|
@@ -1157,7 +1260,6 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1157
1260
|
}
|
|
1158
1261
|
}
|
|
1159
1262
|
|
|
1160
|
-
this.agentSession._updateAgentState('thinking');
|
|
1161
1263
|
const tasks: Array<Task<void>> = [];
|
|
1162
1264
|
const [llmTask, llmGenData] = performLLMInference(
|
|
1163
1265
|
// preserve `this` context in llmNode
|
|
@@ -1185,6 +1287,12 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1185
1287
|
|
|
1186
1288
|
await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
|
|
1187
1289
|
|
|
1290
|
+
// Add new message to actual chat context if the speech is scheduled
|
|
1291
|
+
if (newMessage && speechHandle.scheduled) {
|
|
1292
|
+
this.agent._chatCtx.insert(newMessage);
|
|
1293
|
+
this.agentSession._conversationItemAdded(newMessage);
|
|
1294
|
+
}
|
|
1295
|
+
|
|
1188
1296
|
if (speechHandle.interrupted) {
|
|
1189
1297
|
replyAbortController.abort();
|
|
1190
1298
|
await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
|
|
@@ -1917,6 +2025,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1917
2025
|
try {
|
|
1918
2026
|
if (this._draining) return;
|
|
1919
2027
|
|
|
2028
|
+
this.cancelPreemptiveGeneration();
|
|
1920
2029
|
this.createSpeechTask({
|
|
1921
2030
|
task: Task.from(() => this.agent.onExit()),
|
|
1922
2031
|
name: 'AgentActivity_onExit',
|
|
@@ -1937,6 +2046,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1937
2046
|
this.logger.warn('task closing without draining');
|
|
1938
2047
|
}
|
|
1939
2048
|
|
|
2049
|
+
this.cancelPreemptiveGeneration();
|
|
1940
2050
|
// Unregister event handlers to prevent duplicate metrics
|
|
1941
2051
|
if (this.llm instanceof LLM) {
|
|
1942
2052
|
this.llm.off('metrics_collected', this.onMetricsCollected);
|
|
@@ -57,6 +57,7 @@ export interface VoiceOptions {
|
|
|
57
57
|
minEndpointingDelay: number;
|
|
58
58
|
maxEndpointingDelay: number;
|
|
59
59
|
maxToolSteps: number;
|
|
60
|
+
preemptiveGeneration: boolean;
|
|
60
61
|
}
|
|
61
62
|
|
|
62
63
|
const defaultVoiceOptions: VoiceOptions = {
|
|
@@ -67,6 +68,7 @@ const defaultVoiceOptions: VoiceOptions = {
|
|
|
67
68
|
minEndpointingDelay: 500,
|
|
68
69
|
maxEndpointingDelay: 6000,
|
|
69
70
|
maxToolSteps: 3,
|
|
71
|
+
preemptiveGeneration: false,
|
|
70
72
|
} as const;
|
|
71
73
|
|
|
72
74
|
export type TurnDetectionMode = 'stt' | 'vad' | 'realtime_llm' | 'manual' | _TurnDetector;
|
|
@@ -421,7 +423,7 @@ export class AgentSession<
|
|
|
421
423
|
}
|
|
422
424
|
|
|
423
425
|
/** @internal */
|
|
424
|
-
_updateUserState(state: UserState) {
|
|
426
|
+
_updateUserState(state: UserState, _lastSpeakingTime?: number) {
|
|
425
427
|
if (this.userState === state) {
|
|
426
428
|
return;
|
|
427
429
|
}
|
|
@@ -17,8 +17,16 @@ import type { STTNode } from './io.js';
|
|
|
17
17
|
|
|
18
18
|
export interface EndOfTurnInfo {
|
|
19
19
|
newTranscript: string;
|
|
20
|
+
transcriptConfidence: number;
|
|
20
21
|
transcriptionDelay: number;
|
|
21
22
|
endOfUtteranceDelay: number;
|
|
23
|
+
startedSpeakingAt: number | undefined;
|
|
24
|
+
stoppedSpeakingAt: number | undefined;
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
export interface PreemptiveGenerationInfo {
|
|
28
|
+
newTranscript: string;
|
|
29
|
+
transcriptConfidence: number;
|
|
22
30
|
}
|
|
23
31
|
|
|
24
32
|
export interface RecognitionHooks {
|
|
@@ -28,6 +36,7 @@ export interface RecognitionHooks {
|
|
|
28
36
|
onInterimTranscript: (ev: SpeechEvent) => void;
|
|
29
37
|
onFinalTranscript: (ev: SpeechEvent) => void;
|
|
30
38
|
onEndOfTurn: (info: EndOfTurnInfo) => Promise<boolean>;
|
|
39
|
+
onPreemptiveGeneration: (info: PreemptiveGenerationInfo) => void;
|
|
31
40
|
|
|
32
41
|
retrieveChatCtx: () => ChatContext;
|
|
33
42
|
}
|
|
@@ -63,7 +72,10 @@ export class AudioRecognition {
|
|
|
63
72
|
private lastFinalTranscriptTime = 0;
|
|
64
73
|
private audioTranscript = '';
|
|
65
74
|
private audioInterimTranscript = '';
|
|
66
|
-
private
|
|
75
|
+
private audioPreflightTranscript = '';
|
|
76
|
+
private finalTranscriptConfidence: number[] = [];
|
|
77
|
+
private lastSpeakingTime: number | undefined;
|
|
78
|
+
private speechStartTime: number | undefined;
|
|
67
79
|
private userTurnCommitted = false;
|
|
68
80
|
private speaking = false;
|
|
69
81
|
private sampleRate?: number;
|
|
@@ -144,6 +156,7 @@ export class AudioRecognition {
|
|
|
144
156
|
case SpeechEventType.FINAL_TRANSCRIPT:
|
|
145
157
|
this.hooks.onFinalTranscript(ev);
|
|
146
158
|
const transcript = ev.alternatives?.[0]?.text;
|
|
159
|
+
const confidence = ev.alternatives?.[0]?.confidence ?? 0;
|
|
147
160
|
this.lastLanguage = ev.alternatives?.[0]?.language;
|
|
148
161
|
|
|
149
162
|
if (!transcript) {
|
|
@@ -162,34 +175,144 @@ export class AudioRecognition {
|
|
|
162
175
|
this.lastFinalTranscriptTime = Date.now();
|
|
163
176
|
this.audioTranscript += ` ${transcript}`;
|
|
164
177
|
this.audioTranscript = this.audioTranscript.trimStart();
|
|
178
|
+
this.finalTranscriptConfidence.push(confidence);
|
|
179
|
+
const transcriptChanged = this.audioTranscript !== this.audioPreflightTranscript;
|
|
165
180
|
this.audioInterimTranscript = '';
|
|
181
|
+
this.audioPreflightTranscript = '';
|
|
182
|
+
|
|
183
|
+
if (!this.vad || this.lastSpeakingTime === undefined) {
|
|
184
|
+
// vad disabled, use stt timestamp
|
|
185
|
+
// TODO: this would screw up transcription latency metrics
|
|
186
|
+
// but we'll live with it for now.
|
|
187
|
+
// the correct way is to ensure STT fires SpeechEventType.END_OF_SPEECH
|
|
188
|
+
// and using that timestamp for lastSpeakingTime
|
|
189
|
+
this.lastSpeakingTime = Date.now();
|
|
190
|
+
}
|
|
166
191
|
|
|
167
|
-
if (
|
|
168
|
-
if (
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
192
|
+
if (this.vadBaseTurnDetection || this.userTurnCommitted) {
|
|
193
|
+
if (transcriptChanged) {
|
|
194
|
+
this.logger.debug(
|
|
195
|
+
{ transcript: this.audioTranscript },
|
|
196
|
+
'triggering preemptive generation (FINAL_TRANSCRIPT)',
|
|
197
|
+
);
|
|
198
|
+
this.hooks.onPreemptiveGeneration({
|
|
199
|
+
newTranscript: this.audioTranscript,
|
|
200
|
+
transcriptConfidence:
|
|
201
|
+
this.finalTranscriptConfidence.length > 0
|
|
202
|
+
? this.finalTranscriptConfidence.reduce((a, b) => a + b, 0) /
|
|
203
|
+
this.finalTranscriptConfidence.length
|
|
204
|
+
: 0,
|
|
205
|
+
});
|
|
176
206
|
}
|
|
177
207
|
|
|
178
|
-
if (this.
|
|
208
|
+
if (!this.speaking) {
|
|
179
209
|
const chatCtx = this.hooks.retrieveChatCtx();
|
|
180
210
|
this.logger.debug('running EOU detection on stt FINAL_TRANSCRIPT');
|
|
181
211
|
this.runEOUDetection(chatCtx);
|
|
182
212
|
}
|
|
183
213
|
}
|
|
184
214
|
break;
|
|
215
|
+
case SpeechEventType.PREFLIGHT_TRANSCRIPT:
|
|
216
|
+
this.hooks.onInterimTranscript(ev);
|
|
217
|
+
const preflightTranscript = ev.alternatives?.[0]?.text ?? '';
|
|
218
|
+
const preflightConfidence = ev.alternatives?.[0]?.confidence ?? 0;
|
|
219
|
+
const preflightLanguage = ev.alternatives?.[0]?.language;
|
|
220
|
+
|
|
221
|
+
const MIN_LANGUAGE_DETECTION_LENGTH = 5;
|
|
222
|
+
if (
|
|
223
|
+
!this.lastLanguage ||
|
|
224
|
+
(preflightLanguage && preflightTranscript.length > MIN_LANGUAGE_DETECTION_LENGTH)
|
|
225
|
+
) {
|
|
226
|
+
this.lastLanguage = preflightLanguage;
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
if (!preflightTranscript) {
|
|
230
|
+
return;
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
this.logger.debug(
|
|
234
|
+
{
|
|
235
|
+
user_transcript: preflightTranscript,
|
|
236
|
+
language: this.lastLanguage,
|
|
237
|
+
},
|
|
238
|
+
'received user preflight transcript',
|
|
239
|
+
);
|
|
240
|
+
|
|
241
|
+
// still need to increment it as it's used for turn detection,
|
|
242
|
+
this.lastFinalTranscriptTime = Date.now();
|
|
243
|
+
// preflight transcript includes all pre-committed transcripts (including final transcript from the previous STT run)
|
|
244
|
+
this.audioPreflightTranscript =
|
|
245
|
+
`${this.audioTranscript} ${preflightTranscript}`.trimStart();
|
|
246
|
+
this.audioInterimTranscript = preflightTranscript;
|
|
247
|
+
|
|
248
|
+
if (!this.vad || this.lastSpeakingTime === undefined) {
|
|
249
|
+
// vad disabled, use stt timestamp
|
|
250
|
+
this.lastSpeakingTime = Date.now();
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
if (this.turnDetectionMode !== 'manual' || this.userTurnCommitted) {
|
|
254
|
+
const confidenceVals = [...this.finalTranscriptConfidence, preflightConfidence];
|
|
255
|
+
this.logger.debug(
|
|
256
|
+
{
|
|
257
|
+
transcript:
|
|
258
|
+
this.audioPreflightTranscript.length > 100
|
|
259
|
+
? this.audioPreflightTranscript.slice(0, 100) + '...'
|
|
260
|
+
: this.audioPreflightTranscript,
|
|
261
|
+
},
|
|
262
|
+
'triggering preemptive generation (PREFLIGHT_TRANSCRIPT)',
|
|
263
|
+
);
|
|
264
|
+
this.hooks.onPreemptiveGeneration({
|
|
265
|
+
newTranscript: this.audioPreflightTranscript,
|
|
266
|
+
transcriptConfidence:
|
|
267
|
+
confidenceVals.length > 0
|
|
268
|
+
? confidenceVals.reduce((a, b) => a + b, 0) / confidenceVals.length
|
|
269
|
+
: 0,
|
|
270
|
+
});
|
|
271
|
+
}
|
|
272
|
+
break;
|
|
185
273
|
case SpeechEventType.INTERIM_TRANSCRIPT:
|
|
186
274
|
this.logger.debug({ transcript: ev.alternatives?.[0]?.text }, 'interim transcript');
|
|
187
275
|
this.hooks.onInterimTranscript(ev);
|
|
188
276
|
this.audioInterimTranscript = ev.alternatives?.[0]?.text ?? '';
|
|
189
277
|
break;
|
|
278
|
+
case SpeechEventType.START_OF_SPEECH:
|
|
279
|
+
if (this.turnDetectionMode !== 'stt') break;
|
|
280
|
+
this.hooks.onStartOfSpeech({
|
|
281
|
+
type: VADEventType.START_OF_SPEECH,
|
|
282
|
+
samplesIndex: 0,
|
|
283
|
+
timestamp: Date.now(),
|
|
284
|
+
speechDuration: 0,
|
|
285
|
+
silenceDuration: 0,
|
|
286
|
+
frames: [],
|
|
287
|
+
probability: 0,
|
|
288
|
+
inferenceDuration: 0,
|
|
289
|
+
speaking: true,
|
|
290
|
+
rawAccumulatedSilence: 0,
|
|
291
|
+
rawAccumulatedSpeech: 0,
|
|
292
|
+
});
|
|
293
|
+
this.speaking = true;
|
|
294
|
+
this.lastSpeakingTime = Date.now();
|
|
295
|
+
|
|
296
|
+
this.bounceEOUTask?.cancel();
|
|
297
|
+
break;
|
|
190
298
|
case SpeechEventType.END_OF_SPEECH:
|
|
191
299
|
if (this.turnDetectionMode !== 'stt') break;
|
|
300
|
+
this.hooks.onEndOfSpeech({
|
|
301
|
+
type: VADEventType.END_OF_SPEECH,
|
|
302
|
+
samplesIndex: 0,
|
|
303
|
+
timestamp: Date.now(),
|
|
304
|
+
speechDuration: 0,
|
|
305
|
+
silenceDuration: 0,
|
|
306
|
+
frames: [],
|
|
307
|
+
probability: 0,
|
|
308
|
+
inferenceDuration: 0,
|
|
309
|
+
speaking: false,
|
|
310
|
+
rawAccumulatedSilence: 0,
|
|
311
|
+
rawAccumulatedSpeech: 0,
|
|
312
|
+
});
|
|
313
|
+
this.speaking = false;
|
|
192
314
|
this.userTurnCommitted = true;
|
|
315
|
+
this.lastSpeakingTime = Date.now();
|
|
193
316
|
|
|
194
317
|
if (!this.speaking) {
|
|
195
318
|
const chatCtx = this.hooks.retrieveChatCtx();
|
|
@@ -222,61 +345,106 @@ export class AudioRecognition {
|
|
|
222
345
|
// disable EOU model if manual turn detection enabled
|
|
223
346
|
this.audioTranscript && this.turnDetectionMode !== 'manual' ? this.turnDetector : undefined;
|
|
224
347
|
|
|
225
|
-
const bounceEOUTask =
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
} else {
|
|
234
|
-
const endOfTurnProbability = await turnDetector.predictEndOfTurn(chatCtx);
|
|
235
|
-
this.logger.debug(
|
|
236
|
-
{ endOfTurnProbability, language: this.lastLanguage },
|
|
237
|
-
'end of turn probability',
|
|
238
|
-
);
|
|
239
|
-
|
|
240
|
-
const unlikelyThreshold = await turnDetector.unlikelyThreshold(this.lastLanguage);
|
|
241
|
-
this.logger.debug(
|
|
242
|
-
{
|
|
243
|
-
unlikelyThreshold,
|
|
244
|
-
endOfTurnProbability,
|
|
245
|
-
language: this.lastLanguage,
|
|
246
|
-
transcript: this.audioTranscript,
|
|
247
|
-
},
|
|
248
|
-
'EOU Detection',
|
|
249
|
-
);
|
|
348
|
+
const bounceEOUTask =
|
|
349
|
+
(
|
|
350
|
+
lastSpeakingTime: number | undefined,
|
|
351
|
+
lastFinalTranscriptTime: number,
|
|
352
|
+
speechStartTime: number | undefined,
|
|
353
|
+
) =>
|
|
354
|
+
async (controller: AbortController) => {
|
|
355
|
+
let endpointingDelay = this.minEndpointingDelay;
|
|
250
356
|
|
|
251
|
-
|
|
252
|
-
|
|
357
|
+
if (turnDetector) {
|
|
358
|
+
this.logger.debug('Running turn detector model');
|
|
359
|
+
if (!turnDetector.supportsLanguage(this.lastLanguage)) {
|
|
360
|
+
this.logger.debug(`Turn detector does not support language ${this.lastLanguage}`);
|
|
361
|
+
} else {
|
|
362
|
+
const endOfTurnProbability = await turnDetector.predictEndOfTurn(chatCtx);
|
|
363
|
+
this.logger.debug(
|
|
364
|
+
{ endOfTurnProbability, language: this.lastLanguage },
|
|
365
|
+
'end of turn probability',
|
|
366
|
+
);
|
|
367
|
+
|
|
368
|
+
const unlikelyThreshold = await turnDetector.unlikelyThreshold(this.lastLanguage);
|
|
369
|
+
this.logger.debug(
|
|
370
|
+
{
|
|
371
|
+
unlikelyThreshold,
|
|
372
|
+
endOfTurnProbability,
|
|
373
|
+
language: this.lastLanguage,
|
|
374
|
+
transcript: this.audioTranscript,
|
|
375
|
+
},
|
|
376
|
+
'EOU Detection',
|
|
377
|
+
);
|
|
378
|
+
|
|
379
|
+
if (unlikelyThreshold && endOfTurnProbability < unlikelyThreshold) {
|
|
380
|
+
endpointingDelay = this.maxEndpointingDelay;
|
|
381
|
+
}
|
|
253
382
|
}
|
|
254
383
|
}
|
|
255
|
-
}
|
|
256
384
|
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
385
|
+
let extraSleep = endpointingDelay;
|
|
386
|
+
if (lastSpeakingTime !== undefined) {
|
|
387
|
+
extraSleep += lastSpeakingTime - Date.now();
|
|
388
|
+
}
|
|
260
389
|
|
|
261
|
-
|
|
390
|
+
if (extraSleep > 0) {
|
|
391
|
+
// add delay to see if there's a potential upcoming EOU task that cancels this one
|
|
392
|
+
await delay(Math.max(extraSleep, 0), { signal: controller.signal });
|
|
393
|
+
}
|
|
262
394
|
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
395
|
+
this.logger.debug({ transcript: this.audioTranscript }, 'end of user turn');
|
|
396
|
+
|
|
397
|
+
const confidenceAvg =
|
|
398
|
+
this.finalTranscriptConfidence.length > 0
|
|
399
|
+
? this.finalTranscriptConfidence.reduce((a, b) => a + b, 0) /
|
|
400
|
+
this.finalTranscriptConfidence.length
|
|
401
|
+
: 0;
|
|
402
|
+
|
|
403
|
+
let startedSpeakingAt: number | undefined;
|
|
404
|
+
let stoppedSpeakingAt: number | undefined;
|
|
405
|
+
let transcriptionDelay: number | undefined;
|
|
406
|
+
let endOfUtteranceDelay: number | undefined;
|
|
407
|
+
|
|
408
|
+
// sometimes, we can't calculate the metrics because VAD was unreliable.
|
|
409
|
+
// in this case, we just ignore the calculation, it's better than providing likely wrong values
|
|
410
|
+
if (
|
|
411
|
+
lastFinalTranscriptTime !== 0 &&
|
|
412
|
+
lastSpeakingTime !== undefined &&
|
|
413
|
+
speechStartTime !== undefined
|
|
414
|
+
) {
|
|
415
|
+
startedSpeakingAt = speechStartTime;
|
|
416
|
+
stoppedSpeakingAt = lastSpeakingTime;
|
|
417
|
+
transcriptionDelay = Math.max(lastFinalTranscriptTime - lastSpeakingTime, 0);
|
|
418
|
+
endOfUtteranceDelay = Date.now() - lastSpeakingTime;
|
|
419
|
+
}
|
|
268
420
|
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
421
|
+
const committed = await this.hooks.onEndOfTurn({
|
|
422
|
+
newTranscript: this.audioTranscript,
|
|
423
|
+
transcriptConfidence: confidenceAvg,
|
|
424
|
+
transcriptionDelay: transcriptionDelay ?? 0,
|
|
425
|
+
endOfUtteranceDelay: endOfUtteranceDelay ?? 0,
|
|
426
|
+
startedSpeakingAt,
|
|
427
|
+
stoppedSpeakingAt,
|
|
428
|
+
});
|
|
429
|
+
|
|
430
|
+
if (committed) {
|
|
431
|
+
// clear the transcript if the user turn was committed
|
|
432
|
+
this.audioTranscript = '';
|
|
433
|
+
this.finalTranscriptConfidence = [];
|
|
434
|
+
this.lastSpeakingTime = undefined;
|
|
435
|
+
this.lastFinalTranscriptTime = 0;
|
|
436
|
+
this.speechStartTime = undefined;
|
|
437
|
+
}
|
|
273
438
|
|
|
274
|
-
|
|
275
|
-
|
|
439
|
+
this.userTurnCommitted = false;
|
|
440
|
+
};
|
|
276
441
|
|
|
277
442
|
// cancel any existing EOU task
|
|
278
443
|
this.bounceEOUTask?.cancel();
|
|
279
|
-
|
|
444
|
+
// copy the values before awaiting (the values can change)
|
|
445
|
+
this.bounceEOUTask = Task.from(
|
|
446
|
+
bounceEOUTask(this.lastSpeakingTime, this.lastFinalTranscriptTime, this.speechStartTime),
|
|
447
|
+
);
|
|
280
448
|
|
|
281
449
|
this.bounceEOUTask.result
|
|
282
450
|
.then(() => {
|
|
@@ -376,13 +544,21 @@ export class AudioRecognition {
|
|
|
376
544
|
break;
|
|
377
545
|
case VADEventType.INFERENCE_DONE:
|
|
378
546
|
this.hooks.onVADInferenceDone(ev);
|
|
547
|
+
// for metrics, get the "earliest" signal of speech as possible
|
|
548
|
+
if (ev.rawAccumulatedSpeech > 0.0) {
|
|
549
|
+
this.lastSpeakingTime = Date.now();
|
|
550
|
+
|
|
551
|
+
if (this.speechStartTime === undefined) {
|
|
552
|
+
this.speechStartTime = Date.now();
|
|
553
|
+
}
|
|
554
|
+
}
|
|
379
555
|
break;
|
|
380
556
|
case VADEventType.END_OF_SPEECH:
|
|
381
557
|
this.logger.debug('VAD task: END_OF_SPEECH');
|
|
382
558
|
this.hooks.onEndOfSpeech(ev);
|
|
383
|
-
|
|
559
|
+
|
|
384
560
|
// when VAD fires END_OF_SPEECH, it already waited for the silence_duration
|
|
385
|
-
this.
|
|
561
|
+
this.speaking = false;
|
|
386
562
|
|
|
387
563
|
if (
|
|
388
564
|
this.vadBaseTurnDetection ||
|
|
@@ -412,6 +588,8 @@ export class AudioRecognition {
|
|
|
412
588
|
clearUserTurn() {
|
|
413
589
|
this.audioTranscript = '';
|
|
414
590
|
this.audioInterimTranscript = '';
|
|
591
|
+
this.audioPreflightTranscript = '';
|
|
592
|
+
this.finalTranscriptConfidence = [];
|
|
415
593
|
this.userTurnCommitted = false;
|
|
416
594
|
|
|
417
595
|
this.sttTask?.cancelAndWait().finally(() => {
|
package/src/voice/generation.ts
CHANGED
|
@@ -4,7 +4,6 @@
|
|
|
4
4
|
import type { AudioFrame } from '@livekit/rtc-node';
|
|
5
5
|
import { AudioResampler } from '@livekit/rtc-node';
|
|
6
6
|
import type { ReadableStream, ReadableStreamDefaultReader } from 'stream/web';
|
|
7
|
-
import { ZodObject } from 'zod';
|
|
8
7
|
import {
|
|
9
8
|
type ChatContext,
|
|
10
9
|
ChatMessage,
|
|
@@ -19,6 +18,7 @@ import {
|
|
|
19
18
|
isFunctionTool,
|
|
20
19
|
isToolError,
|
|
21
20
|
} from '../llm/tool_context.js';
|
|
21
|
+
import { isZodSchema, parseZodSchema } from '../llm/zod-utils.js';
|
|
22
22
|
import { log } from '../log.js';
|
|
23
23
|
import { IdentityTransform } from '../stream/identity_transform.js';
|
|
24
24
|
import { Future, Task, shortuuid, toError } from '../utils.js';
|
|
@@ -732,8 +732,13 @@ export function performToolExecutions({
|
|
|
732
732
|
try {
|
|
733
733
|
const jsonArgs = JSON.parse(toolCall.args);
|
|
734
734
|
|
|
735
|
-
if (tool.parameters
|
|
736
|
-
|
|
735
|
+
if (isZodSchema(tool.parameters)) {
|
|
736
|
+
const result = await parseZodSchema<object>(tool.parameters, jsonArgs);
|
|
737
|
+
if (result.success) {
|
|
738
|
+
parsedArgs = result.data;
|
|
739
|
+
} else {
|
|
740
|
+
throw result.error;
|
|
741
|
+
}
|
|
737
742
|
} else {
|
|
738
743
|
parsedArgs = jsonArgs;
|
|
739
744
|
}
|