@livekit/agents 1.0.38 → 1.0.39
- package/dist/inference/llm.cjs +7 -3
- package/dist/inference/llm.cjs.map +1 -1
- package/dist/inference/llm.d.cts +5 -6
- package/dist/inference/llm.d.ts +5 -6
- package/dist/inference/llm.d.ts.map +1 -1
- package/dist/inference/llm.js +7 -3
- package/dist/inference/llm.js.map +1 -1
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.cts +5 -4
- package/dist/inference/stt.d.ts +5 -4
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js.map +1 -1
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.cts +10 -7
- package/dist/inference/tts.d.ts +10 -7
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js.map +1 -1
- package/dist/stt/stream_adapter.cjs +9 -1
- package/dist/stt/stream_adapter.cjs.map +1 -1
- package/dist/stt/stream_adapter.d.ts.map +1 -1
- package/dist/stt/stream_adapter.js +9 -1
- package/dist/stt/stream_adapter.js.map +1 -1
- package/dist/utils.cjs +5 -0
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +8 -0
- package/dist/utils.d.ts +8 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +4 -0
- package/dist/utils.js.map +1 -1
- package/dist/voice/agent.cjs +1 -2
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.js +1 -2
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent_activity.cjs +23 -14
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +1 -0
- package/dist/voice/agent_activity.d.ts +1 -0
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +23 -14
- package/dist/voice/agent_activity.js.map +1 -1
- package/package.json +2 -2
- package/src/inference/llm.ts +20 -15
- package/src/inference/stt.ts +9 -7
- package/src/inference/tts.ts +36 -16
- package/src/stt/stream_adapter.ts +12 -1
- package/src/utils.ts +14 -0
- package/src/voice/agent.ts +2 -2
- package/src/voice/agent_activity.ts +36 -15
package/src/inference/llm.ts
CHANGED

@@ -17,6 +17,10 @@ import { type AnyString, createAccessToken } from './utils.js';
 const DEFAULT_BASE_URL = 'https://agent-gateway.livekit.cloud/v1';
 
 export type OpenAIModels =
+  | 'openai/gpt-5.2'
+  | 'openai/gpt-5.2-chat-latest'
+  | 'openai/gpt-5.1'
+  | 'openai/gpt-5.1-chat-latest'
   | 'openai/gpt-5'
   | 'openai/gpt-5-mini'
   | 'openai/gpt-5-nano'
@@ -28,19 +32,17 @@ export type OpenAIModels =
   | 'openai/gpt-oss-120b';
 
 export type GoogleModels =
-  | 'google/gemini-3-pro
-  | 'google/gemini-3-flash
+  | 'google/gemini-3-pro'
+  | 'google/gemini-3-flash'
   | 'google/gemini-2.5-pro'
   | 'google/gemini-2.5-flash'
   | 'google/gemini-2.5-flash-lite'
   | 'google/gemini-2.0-flash'
   | 'google/gemini-2.0-flash-lite';
 
-export type
+export type MoonshotModels = 'moonshotai/kimi-k2-instruct';
 
-export type
-
-export type DeepSeekModels = 'deepseek-ai/deepseek-v3';
+export type DeepSeekModels = 'deepseek-ai/deepseek-v3' | 'deepseek-ai/deepseek-v3.2';
 
 type ChatCompletionPredictionContentParam =
   Expand<OpenAI.Chat.Completions.ChatCompletionPredictionContent>;
@@ -80,13 +82,7 @@ export interface ChatCompletionOptions extends Record<string, unknown> {
   // response_format?: OpenAI.Chat.Completions.ChatCompletionCreateParams['response_format']
 }
 
-export type LLMModels =
-  | OpenAIModels
-  | GoogleModels
-  | QwenModels
-  | KimiModels
-  | DeepSeekModels
-  | AnyString;
+export type LLMModels = OpenAIModels | GoogleModels | MoonshotModels | DeepSeekModels | AnyString;
 
 export interface InferenceLLMOptions {
   model: LLMModels;
@@ -437,7 +433,10 @@ export class LLMStream extends llm.LLMStream {
     if (this.toolCallId && tool.id && tool.index !== this.toolIndex) {
       callChunk = this.createRunningToolCallChunk(id, delta);
       this.toolCallId = this.fncName = this.fncRawArguments = undefined;
-
+      // Note: We intentionally do NOT reset toolExtra here.
+      // For Gemini 3+, the thought_signature is only provided on the first tool call
+      // in a parallel batch, but must be applied to ALL tool calls in the batch.
+      // We preserve toolExtra so subsequent tool calls inherit the thought_signature.
     }
 
     // Start or continue building the current tool call
@@ -447,9 +446,14 @@ export class LLMStream extends llm.LLMStream {
       this.fncName = tool.function.name;
      this.fncRawArguments = tool.function.arguments || '';
       // Extract extra from tool call (e.g., Google thought signatures)
-      this.
+      // Only update toolExtra if this tool call has extra_content.
+      // Otherwise, inherit from previous tool call (for parallel Gemini tool calls).
+      const newToolExtra =
         // eslint-disable-next-line @typescript-eslint/no-explicit-any
         ((tool as any).extra_content as Record<string, unknown> | undefined) ?? undefined;
+      if (newToolExtra) {
+        this.toolExtra = newToolExtra;
+      }
     } else if (tool.function.arguments) {
       this.fncRawArguments = (this.fncRawArguments || '') + tool.function.arguments;
     }
@@ -468,6 +472,7 @@ export class LLMStream extends llm.LLMStream {
   ) {
     const callChunk = this.createRunningToolCallChunk(id, delta);
     this.toolCallId = this.fncName = this.fncRawArguments = undefined;
+    // Reset toolExtra at the end of the response (not between parallel tool calls)
    this.toolExtra = undefined;
     return callChunk;
   }
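
The preserved-toolExtra rule above is subtle, so here is a minimal standalone sketch of the same inheritance logic. ToolCallChunk and ToolExtraTracker are simplified stand-ins for illustration, not types from the package:

// Sketch of the thought_signature inheritance rule (simplified stand-ins).
interface ToolCallChunk {
  index: number;
  id?: string;
  // e.g. { google: { thought_signature: '...' } } on the first call of a batch
  extra_content?: Record<string, unknown>;
}

class ToolExtraTracker {
  private toolExtra?: Record<string, unknown>;

  // Called per streamed tool-call chunk: update only when the chunk carries
  // extra_content; otherwise keep the value from the first call in the batch.
  onToolCall(tool: ToolCallChunk): Record<string, unknown> | undefined {
    if (tool.extra_content) {
      this.toolExtra = tool.extra_content;
    }
    return this.toolExtra;
  }

  // Reset at the end of the response, never between parallel tool calls.
  onResponseEnd(): void {
    this.toolExtra = undefined;
  }
}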
package/src/inference/stt.ts
CHANGED

@@ -25,19 +25,21 @@ import {
 import { type AnyString, connectWs, createAccessToken } from './utils.js';
 
 export type DeepgramModels =
-  | 'deepgram'
+  | 'deepgram/flux-general'
   | 'deepgram/nova-3'
-  | 'deepgram/nova-3-general'
   | 'deepgram/nova-3-medical'
-  | 'deepgram/nova-2-conversationalai'
   | 'deepgram/nova-2'
-  | 'deepgram/nova-2-general'
   | 'deepgram/nova-2-medical'
+  | 'deepgram/nova-2-conversationalai'
   | 'deepgram/nova-2-phonecall';
 
-export type CartesiaModels = 'cartesia
+export type CartesiaModels = 'cartesia/ink-whisper';
+
+export type AssemblyaiModels =
+  | 'assemblyai/universal-streaming'
+  | 'assemblyai/universal-streaming-multilingual';
 
-export type
+export type ElevenlabsSTTModels = 'elevenlabs/scribe_v2_realtime';
 
 export interface CartesiaOptions {
   min_volume?: number; // default: not specified
@@ -77,7 +79,7 @@ export type STTLanguages =
   | 'hi'
   | AnyString;
 
-type _STTModels = DeepgramModels | CartesiaModels | AssemblyaiModels;
+type _STTModels = DeepgramModels | CartesiaModels | AssemblyaiModels | ElevenlabsSTTModels;
 
 export type STTModels = _STTModels | 'auto' | AnyString;
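
For context, the AnyString member these unions end with behaves like the usual `string & {}` trick: known model IDs stay in autocomplete while arbitrary strings remain assignable. A self-contained sketch, assuming that pattern (types inlined, pickModel is a hypothetical helper):

type AnyString = string & {};
type STTModels = 'deepgram/flux-general' | 'elevenlabs/scribe_v2_realtime' | 'auto' | AnyString;

// Hypothetical helper, just to exercise the type.
function pickModel(model: STTModels): string {
  return model;
}

pickModel('elevenlabs/scribe_v2_realtime'); // newly listed in 1.0.39
pickModel('vendor/custom-model'); // still accepted via AnyString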
package/src/inference/tts.ts
CHANGED

@@ -23,22 +23,27 @@ import {
 import { type AnyString, connectWs, createAccessToken } from './utils.js';
 
 export type CartesiaModels =
-  | 'cartesia'
-  | 'cartesia/sonic'
+  | 'cartesia/sonic-3'
   | 'cartesia/sonic-2'
-  | 'cartesia/sonic-turbo'
+  | 'cartesia/sonic-turbo'
+  | 'cartesia/sonic';
+
+export type DeepgramTTSModels = 'deepgram/aura' | 'deepgram/aura-2';
 
 export type ElevenlabsModels =
-  | 'elevenlabs'
   | 'elevenlabs/eleven_flash_v2'
   | 'elevenlabs/eleven_flash_v2_5'
   | 'elevenlabs/eleven_turbo_v2'
   | 'elevenlabs/eleven_turbo_v2_5'
   | 'elevenlabs/eleven_multilingual_v2';
 
-export type
+export type InworldModels =
+  | 'inworld/inworld-tts-1.5-max'
+  | 'inworld/inworld-tts-1.5-mini'
+  | 'inworld/inworld-tts-1-max'
+  | 'inworld/inworld-tts-1';
 
-export type
+export type RimeModels = 'rime/arcana' | 'rime/mistv2';
 
 export interface CartesiaOptions {
   duration?: number; // max duration of audio in seconds
@@ -50,25 +55,40 @@ export interface ElevenlabsOptions {
   apply_text_normalization?: 'auto' | 'off' | 'on'; // default: "auto"
 }
 
+export interface DeepgramTTSOptions {}
+
 export interface RimeOptions {}
 
 export interface InworldOptions {}
 
-type _TTSModels =
-
-
+type _TTSModels =
+  | CartesiaModels
+  | DeepgramTTSModels
+  | ElevenlabsModels
+  | RimeModels
+  | InworldModels;
+
+export type TTSModels =
+  | CartesiaModels
+  | DeepgramTTSModels
+  | ElevenlabsModels
+  | RimeModels
+  | InworldModels
+  | AnyString;
 
 export type ModelWithVoice = `${_TTSModels}:${string}` | TTSModels;
 
 export type TTSOptions<TModel extends TTSModels> = TModel extends CartesiaModels
   ? CartesiaOptions
-  : TModel extends
-  ?
-  : TModel extends
-  ?
-  : TModel extends
-  ?
-  :
+  : TModel extends DeepgramTTSModels
+    ? DeepgramTTSOptions
+    : TModel extends ElevenlabsModels
+      ? ElevenlabsOptions
+      : TModel extends RimeModels
+        ? RimeOptions
+        : TModel extends InworldModels
+          ? InworldOptions
+          : Record<string, unknown>;
 
 type TTSEncoding = 'pcm_s16le';
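
The rebuilt TTSOptions conditional type resolves each model literal to its provider's options interface. A condensed sketch of how such a chain resolves (definitions inlined, only two branches shown, TTSOptionsSketch is illustrative):

type DeepgramTTSModels = 'deepgram/aura' | 'deepgram/aura-2';
interface CartesiaOptions {
  duration?: number;
}
interface DeepgramTTSOptions {}

type TTSOptionsSketch<TModel> = TModel extends 'cartesia/sonic-3'
  ? CartesiaOptions
  : TModel extends DeepgramTTSModels
    ? DeepgramTTSOptions
    : Record<string, unknown>;

type A = TTSOptionsSketch<'cartesia/sonic-3'>; // CartesiaOptions
type B = TTSOptionsSketch<'deepgram/aura-2'>; // DeepgramTTSOptions
type C = TTSOptionsSketch<'rime/arcana'>; // falls through to Record<string, unknown> here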
package/src/stt/stream_adapter.ts
CHANGED

@@ -4,6 +4,7 @@
 import type { AudioFrame } from '@livekit/rtc-node';
 import { log } from '../log.js';
 import type { APIConnectOptions } from '../types.js';
+import { isStreamClosedError } from '../utils.js';
 import type { VAD, VADStream } from '../vad.js';
 import { VADEventType } from '../vad.js';
 import type { SpeechEvent } from './stt.js';
@@ -68,7 +69,17 @@ export class StreamAdapterWrapper extends SpeechStream {
           this.#vadStream.pushFrame(input);
         }
       }
-
+
+      // Guard against calling endInput() on already-closed stream
+      // This happens during handover when close() is called while forwardInput is running
+      try {
+        this.#vadStream.endInput();
+      } catch (e) {
+        if (isStreamClosedError(e)) {
+          return;
+        }
+        throw e;
+      }
     };
 
     const recognize = async () => {
package/src/utils.ts
CHANGED

@@ -675,6 +675,20 @@ export class InvalidErrorType extends Error {
   }
 }
 
+/**
+ * Check if an error is a stream closed error that can be safely ignored during cleanup.
+ * This happens during handover/cleanup when close() is called while operations are still running.
+ *
+ * @param error - The error to check.
+ * @returns True if the error is a stream closed error.
+ */
+export function isStreamClosedError(error: unknown): boolean {
+  return (
+    error instanceof Error &&
+    (error.message === 'Stream is closed' || error.message === 'Input is closed')
+  );
+}
+
 /**
  * In JS an error can be any arbitrary value.
  * This function converts an unknown error to an Error and stores the original value in the error object.
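
The intended call pattern is the one stream_adapter.ts adopts above: swallow only the benign already-closed failure during cleanup and rethrow everything else. Roughly (safeEndInput is an illustrative helper, not part of the package):

import { isStreamClosedError } from './utils.js';

// Illustrative helper: end input on a stream that may already be closed.
function safeEndInput(stream: { endInput(): void }): void {
  try {
    stream.endInput();
  } catch (e) {
    if (isStreamClosedError(e)) {
      return; // close() raced with cleanup; safe to ignore
    }
    throw e; // anything else is a real error
  }
}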
package/src/voice/agent.ts
CHANGED

@@ -325,16 +325,16 @@ export class Agent<UserData = any> {
       );
     }
 
-    // TODO(brian): make parallelToolCalls configurable
     const { toolChoice } = modelSettings;
     const connOptions = activity.agentSession.connOptions.llmConnOptions;
 
+    // parallelToolCalls is not passed here - it will use the value from LLM's modelOptions
+    // This allows users to configure it via: new inference.LLM({ modelOptions: { parallel_tool_calls: false } })
     const stream = activity.llm.chat({
       chatCtx,
       toolCtx,
       toolChoice,
       connOptions,
-      parallelToolCalls: true,
     });
 
     let cleaned = false;
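
Per the new comment, parallel tool calling moves from a hardcoded chat() argument to the LLM's own modelOptions. Assuming `inference` is exported from the package root as the comment implies, configuration would look roughly like:

import { inference } from '@livekit/agents';

// Sketch based on the comment in agent.ts, not verified against API docs.
const llm = new inference.LLM({
  model: 'openai/gpt-5.1',
  modelOptions: { parallel_tool_calls: false },
});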
package/src/voice/agent_activity.ts
CHANGED

@@ -194,12 +194,13 @@ export class AgentActivity implements RecognitionHooks {
     if (
       !this.vad &&
       this.stt &&
+      !this.stt.capabilities.streaming &&
       this.llm instanceof LLM &&
       this.allowInterruptions &&
       this.turnDetectionMode === undefined
     ) {
       this.logger.warn(
-        'VAD is not set. Enabling VAD is recommended when using LLM and STT ' +
+        'VAD is not set. Enabling VAD is recommended when using LLM and non-streaming STT ' +
           'for more responsive interruption handling.',
       );
     }
@@ -659,12 +660,14 @@ export class AgentActivity implements RecognitionHooks {
       return;
     }
 
-    if (
-
-      return;
+    if (ev.speechDuration >= this.agentSession.options.minInterruptionDuration) {
+      this.interruptByAudioActivity();
     }
+  }
 
-
+  private interruptByAudioActivity(): void {
+    if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
+      // skip speech handle interruption if server side turn detection is enabled
       return;
     }
 
@@ -694,7 +697,10 @@ export class AgentActivity implements RecognitionHooks {
       !this._currentSpeech.interrupted &&
       this._currentSpeech.allowInterruptions
     ) {
-      this.logger.info(
+      this.logger.info(
+        { 'speech id': this._currentSpeech.id },
+        'speech interrupted by audio activity',
+      );
       this.realtimeSession?.interrupt();
       this._currentSpeech.interrupt();
     }
@@ -715,6 +721,10 @@ export class AgentActivity implements RecognitionHooks {
         // TODO(AJS-106): add multi participant support
       }),
     );
+
+    if (ev.alternatives![0].text) {
+      this.interruptByAudioActivity();
+    }
   }
 
   onFinalTranscript(ev: SpeechEvent): void {
@@ -732,6 +742,20 @@ export class AgentActivity implements RecognitionHooks {
         // TODO(AJS-106): add multi participant support
       }),
     );
+
+    // agent speech might not be interrupted if VAD failed and a final transcript is received
+    // we call interruptByAudioActivity (idempotent) to pause the speech, if possible
+    if (
+      this.audioRecognition &&
+      this.turnDetection !== 'manual' &&
+      this.turnDetection !== 'realtime_llm'
+    ) {
+      this.interruptByAudioActivity();
+
+      // TODO: resume false interruption - schedule a resume timer if interrupted after end_of_speech
+    }
+
+    // TODO: resume false interruption - start interrupt paused speech task
   }
 
   onPreemptiveGeneration(info: PreemptiveGenerationInfo): void {
@@ -1982,7 +2006,6 @@ export class AgentActivity implements RecognitionHooks {
 
     if (audioOutput) {
       await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
-      this.agentSession._updateAgentState('listening');
     }
 
     if (speechHandle.interrupted) {
@@ -2069,17 +2092,15 @@ export class AgentActivity implements RecognitionHooks {
     speechHandle._markGenerationDone();
     // TODO(brian): close tees
 
-    toolOutput.firstToolStartedFuture.await.finally(() => {
-      this.agentSession._updateAgentState('thinking');
-    });
-
     await executeToolsTask.result;
 
+    if (toolOutput.output.length > 0) {
+      this.agentSession._updateAgentState('thinking');
+    } else if (this.agentSession.agentState === 'speaking') {
+      this.agentSession._updateAgentState('listening');
+    }
+
     if (toolOutput.output.length === 0) {
-      // return to listening state for thinking-only turns (no audio output, no tools)
-      if (!speechHandle.interrupted) {
-        this.agentSession._updateAgentState('listening');
-      }
       return;
     }
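
Taken together, these hunks make interruptByAudioActivity the single idempotent entry point for interruption, reachable from VAD inference, interim transcripts, and final transcripts (the VAD-failure fallback). A condensed sketch of that flow, with simplified stand-ins for the real AgentActivity internals:

interface VADInferenceEvent {
  speechDuration: number;
}

class InterruptionFlowSketch {
  constructor(
    private minInterruptionDuration: number,
    private serverSideTurnDetection: boolean,
  ) {}

  // VAD path: interrupt once user speech has lasted long enough.
  onVADInferenceDone(ev: VADInferenceEvent): void {
    if (ev.speechDuration >= this.minInterruptionDuration) {
      this.interruptByAudioActivity();
    }
  }

  // Transcript paths funnel into the same idempotent method, so speech is
  // still interrupted even when VAD missed the user's audio.
  onTranscript(text: string): void {
    if (text) this.interruptByAudioActivity();
  }

  private interruptByAudioActivity(): void {
    if (this.serverSideTurnDetection) {
      return; // the realtime model handles turn detection itself
    }
    // ...interrupt the current speech handle if it allows interruptions...
  }
}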