@livekit/agents 0.7.6 → 0.7.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/audio.d.cts +9 -0
- package/dist/cli.d.cts +14 -0
- package/dist/constants.d.cts +5 -0
- package/dist/generator.d.cts +23 -0
- package/dist/http_server.cjs.map +1 -1
- package/dist/http_server.d.cts +19 -0
- package/dist/http_server.d.ts +1 -0
- package/dist/http_server.d.ts.map +1 -1
- package/dist/http_server.js.map +1 -1
- package/dist/index.d.cts +29 -0
- package/dist/inference_runner.d.cts +12 -0
- package/dist/ipc/index.d.cts +2 -0
- package/dist/ipc/inference_executor.d.cts +4 -0
- package/dist/ipc/inference_proc_executor.cjs +3 -2
- package/dist/ipc/inference_proc_executor.cjs.map +1 -1
- package/dist/ipc/inference_proc_executor.d.cts +23 -0
- package/dist/ipc/inference_proc_executor.js +1 -1
- package/dist/ipc/inference_proc_executor.js.map +1 -1
- package/dist/ipc/inference_proc_lazy_main.d.cts +2 -0
- package/dist/ipc/job_executor.d.cts +18 -0
- package/dist/ipc/job_proc_executor.cjs +3 -2
- package/dist/ipc/job_proc_executor.cjs.map +1 -1
- package/dist/ipc/job_proc_executor.d.cts +19 -0
- package/dist/ipc/job_proc_executor.js +1 -1
- package/dist/ipc/job_proc_executor.js.map +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/job_proc_lazy_main.d.cts +2 -0
- package/dist/ipc/job_proc_lazy_main.js +1 -1
- package/dist/ipc/job_proc_lazy_main.js.map +1 -1
- package/dist/ipc/message.d.cts +58 -0
- package/dist/ipc/proc_pool.d.cts +31 -0
- package/dist/ipc/supervised_proc.d.cts +30 -0
- package/dist/job.d.cts +113 -0
- package/dist/llm/chat_context.d.cts +66 -0
- package/dist/llm/function_context.d.cts +47 -0
- package/dist/llm/index.d.cts +4 -0
- package/dist/llm/llm.cjs +3 -3
- package/dist/llm/llm.cjs.map +1 -1
- package/dist/llm/llm.d.cts +66 -0
- package/dist/llm/llm.js +3 -3
- package/dist/llm/llm.js.map +1 -1
- package/dist/log.d.cts +13 -0
- package/dist/metrics/base.d.cts +96 -0
- package/dist/metrics/index.d.cts +5 -0
- package/dist/metrics/usage_collector.d.cts +14 -0
- package/dist/metrics/utils.d.cts +10 -0
- package/dist/multimodal/agent_playout.d.cts +34 -0
- package/dist/multimodal/index.d.cts +3 -0
- package/dist/multimodal/multimodal_agent.d.cts +48 -0
- package/dist/pipeline/agent_output.d.cts +33 -0
- package/dist/pipeline/agent_playout.d.cts +40 -0
- package/dist/pipeline/human_input.d.cts +30 -0
- package/dist/pipeline/index.d.cts +2 -0
- package/dist/pipeline/pipeline_agent.d.cts +151 -0
- package/dist/pipeline/speech_handle.d.cts +37 -0
- package/dist/plugin.d.cts +10 -0
- package/dist/stt/index.d.cts +3 -0
- package/dist/stt/stream_adapter.d.cts +18 -0
- package/dist/stt/stt.d.cts +124 -0
- package/dist/tokenize/basic/basic.d.cts +18 -0
- package/dist/tokenize/basic/hyphenator.d.cts +17 -0
- package/dist/tokenize/basic/index.d.cts +2 -0
- package/dist/tokenize/basic/paragraph.d.cts +5 -0
- package/dist/tokenize/basic/sentence.d.cts +5 -0
- package/dist/tokenize/basic/word.d.cts +5 -0
- package/dist/tokenize/index.d.cts +5 -0
- package/dist/tokenize/token_stream.d.cts +39 -0
- package/dist/tokenize/tokenizer.d.cts +55 -0
- package/dist/transcription.d.cts +31 -0
- package/dist/tts/index.d.cts +3 -0
- package/dist/tts/stream_adapter.d.cts +17 -0
- package/dist/tts/tts.cjs +4 -4
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.cts +127 -0
- package/dist/tts/tts.js +4 -4
- package/dist/tts/tts.js.map +1 -1
- package/dist/utils.d.cts +72 -0
- package/dist/vad.d.cts +78 -0
- package/dist/version.d.cts +2 -0
- package/dist/worker.cjs +2 -1
- package/dist/worker.cjs.map +1 -1
- package/dist/worker.d.cts +109 -0
- package/dist/worker.d.ts.map +1 -1
- package/dist/worker.js +2 -1
- package/dist/worker.js.map +1 -1
- package/package.json +9 -5
- package/src/http_server.ts +1 -0
- package/src/ipc/inference_proc_executor.ts +1 -1
- package/src/ipc/job_proc_executor.ts +1 -1
- package/src/ipc/job_proc_lazy_main.ts +1 -1
- package/src/llm/llm.ts +3 -3
- package/src/tts/tts.ts +4 -4
- package/src/worker.ts +1 -0
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import type { AgentMetrics, LLMMetrics, PipelineEOUMetrics, PipelineLLMMetrics, PipelineTTSMetrics, STTMetrics, TTSMetrics, VADMetrics } from './base.js';
|
|
2
|
+
export declare const logMetrics: (metrics: AgentMetrics) => void;
|
|
3
|
+
export declare const isLLMMetrics: (metrics: AgentMetrics) => metrics is LLMMetrics;
|
|
4
|
+
export declare const isPipelineLLMMetrics: (metrics: AgentMetrics) => metrics is PipelineLLMMetrics;
|
|
5
|
+
export declare const isVADMetrics: (metrics: AgentMetrics) => metrics is VADMetrics;
|
|
6
|
+
export declare const isPipelineEOUMetrics: (metrics: AgentMetrics) => metrics is PipelineEOUMetrics;
|
|
7
|
+
export declare const isTTSMetrics: (metrics: AgentMetrics) => metrics is TTSMetrics;
|
|
8
|
+
export declare const isPipelineTTSMetrics: (metrics: AgentMetrics) => metrics is PipelineTTSMetrics;
|
|
9
|
+
export declare const isSTTMetrics: (metrics: AgentMetrics) => metrics is STTMetrics;
|
|
10
|
+
//# sourceMappingURL=utils.d.ts.map
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/// <reference types="node" resolution-mode="require"/>
|
|
2
|
+
import type { AudioFrame } from '@livekit/rtc-node';
|
|
3
|
+
import { type AudioSource } from '@livekit/rtc-node';
|
|
4
|
+
import { EventEmitter } from 'node:events';
|
|
5
|
+
import type { TextAudioSynchronizer } from '../transcription.js';
|
|
6
|
+
import { type AsyncIterableQueue, Future } from '../utils.js';
|
|
7
|
+
export declare const proto: {};
|
|
8
|
+
export declare class PlayoutHandle extends EventEmitter {
|
|
9
|
+
#private;
|
|
10
|
+
/** @internal */
|
|
11
|
+
synchronizer: TextAudioSynchronizer;
|
|
12
|
+
/** @internal */
|
|
13
|
+
doneFut: Future;
|
|
14
|
+
/** @internal */
|
|
15
|
+
intFut: Future;
|
|
16
|
+
/** @internal */
|
|
17
|
+
pushedDuration: number;
|
|
18
|
+
/** @internal */
|
|
19
|
+
totalPlayedTime: number | undefined;
|
|
20
|
+
constructor(audioSource: AudioSource, sampleRate: number, itemId: string, contentIndex: number, synchronizer: TextAudioSynchronizer);
|
|
21
|
+
get itemId(): string;
|
|
22
|
+
get audioSamples(): number;
|
|
23
|
+
get textChars(): number;
|
|
24
|
+
get contentIndex(): number;
|
|
25
|
+
get interrupted(): boolean;
|
|
26
|
+
get done(): boolean;
|
|
27
|
+
interrupt(): void;
|
|
28
|
+
}
|
|
29
|
+
export declare class AgentPlayout extends EventEmitter {
|
|
30
|
+
#private;
|
|
31
|
+
constructor(audioSource: AudioSource, sampleRate: number, numChannels: number, inFrameSize: number, outFrameSize: number);
|
|
32
|
+
play(itemId: string, contentIndex: number, synchronizer: TextAudioSynchronizer, textStream: AsyncIterableQueue<string>, audioStream: AsyncIterableQueue<AudioFrame>): PlayoutHandle;
|
|
33
|
+
}
|
|
34
|
+
//# sourceMappingURL=agent_playout.d.ts.map
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/// <reference types="node" resolution-mode="require"/>
|
|
2
|
+
import type { NoiseCancellationOptions, RemoteAudioTrack, RemoteParticipant, Room } from '@livekit/rtc-node';
|
|
3
|
+
import { EventEmitter } from 'node:events';
|
|
4
|
+
import * as llm from '../llm/index.js';
|
|
5
|
+
/**
|
|
6
|
+
* @internal
|
|
7
|
+
* @beta
|
|
8
|
+
*/
|
|
9
|
+
export declare abstract class RealtimeSession extends EventEmitter {
|
|
10
|
+
abstract conversation: any;
|
|
11
|
+
abstract inputAudioBuffer: any;
|
|
12
|
+
abstract fncCtx: llm.FunctionContext | undefined;
|
|
13
|
+
abstract recoverFromTextResponse(itemId: string): void;
|
|
14
|
+
}
|
|
15
|
+
/**
|
|
16
|
+
* @internal
|
|
17
|
+
* @beta
|
|
18
|
+
*/
|
|
19
|
+
export declare abstract class RealtimeModel {
|
|
20
|
+
abstract session(options: any): RealtimeSession;
|
|
21
|
+
abstract close(): Promise<void>;
|
|
22
|
+
abstract sampleRate: number;
|
|
23
|
+
abstract numChannels: number;
|
|
24
|
+
abstract inFrameSize: number;
|
|
25
|
+
abstract outFrameSize: number;
|
|
26
|
+
}
|
|
27
|
+
export type AgentState = 'initializing' | 'thinking' | 'listening' | 'speaking';
|
|
28
|
+
export declare const AGENT_STATE_ATTRIBUTE = "lk.agent.state";
|
|
29
|
+
/** @beta */
|
|
30
|
+
export declare class MultimodalAgent extends EventEmitter {
|
|
31
|
+
#private;
|
|
32
|
+
model: RealtimeModel;
|
|
33
|
+
room: Room | null;
|
|
34
|
+
linkedParticipant: RemoteParticipant | null;
|
|
35
|
+
subscribedTrack: RemoteAudioTrack | null;
|
|
36
|
+
readMicroTask: Promise<void> | null;
|
|
37
|
+
constructor({ model, chatCtx, fncCtx, maxTextResponseRetries, noiseCancellation, }: {
|
|
38
|
+
model: RealtimeModel;
|
|
39
|
+
chatCtx?: llm.ChatContext;
|
|
40
|
+
fncCtx?: llm.FunctionContext;
|
|
41
|
+
maxTextResponseRetries?: number;
|
|
42
|
+
noiseCancellation?: NoiseCancellationOptions;
|
|
43
|
+
});
|
|
44
|
+
get fncCtx(): llm.FunctionContext | undefined;
|
|
45
|
+
set fncCtx(ctx: llm.FunctionContext | undefined);
|
|
46
|
+
start(room: Room, participant?: RemoteParticipant | string | null): Promise<RealtimeSession>;
|
|
47
|
+
}
|
|
48
|
+
//# sourceMappingURL=multimodal_agent.d.ts.map
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import type { AudioFrame } from '@livekit/rtc-node';
|
|
2
|
+
import type { TextAudioSynchronizer } from '../transcription.js';
|
|
3
|
+
import { type TTS } from '../tts/index.js';
|
|
4
|
+
import { AsyncIterableQueue, Future } from '../utils.js';
|
|
5
|
+
import type { AgentPlayout, PlayoutHandle } from './agent_playout.js';
|
|
6
|
+
export type SpeechSource = AsyncIterable<string> | string | Promise<string>;
|
|
7
|
+
export declare class SynthesisHandle {
|
|
8
|
+
#private;
|
|
9
|
+
static readonly FLUSH_SENTINEL: unique symbol;
|
|
10
|
+
text?: string;
|
|
11
|
+
ttsSource: SpeechSource;
|
|
12
|
+
tts: TTS;
|
|
13
|
+
queue: AsyncIterableQueue<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>;
|
|
14
|
+
intFut: Future;
|
|
15
|
+
synchronizer: TextAudioSynchronizer;
|
|
16
|
+
constructor(speechId: string, ttsSource: SpeechSource, agentPlayout: AgentPlayout, tts: TTS, synchronizer: TextAudioSynchronizer);
|
|
17
|
+
get speechId(): string;
|
|
18
|
+
get validated(): boolean;
|
|
19
|
+
get interrupted(): boolean;
|
|
20
|
+
get playHandle(): PlayoutHandle | undefined;
|
|
21
|
+
/** Validate the speech for playout. */
|
|
22
|
+
play(): PlayoutHandle;
|
|
23
|
+
/** Interrupt the speech. */
|
|
24
|
+
interrupt(): void;
|
|
25
|
+
}
|
|
26
|
+
export declare class AgentOutput {
|
|
27
|
+
#private;
|
|
28
|
+
constructor(agentPlayout: AgentPlayout, tts: TTS);
|
|
29
|
+
get playout(): AgentPlayout;
|
|
30
|
+
close(): Promise<void>;
|
|
31
|
+
synthesize(speechId: string, ttsSource: SpeechSource, synchronizer: TextAudioSynchronizer): SynthesisHandle;
|
|
32
|
+
}
|
|
33
|
+
//# sourceMappingURL=agent_output.d.ts.map
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import type { AudioFrame, AudioSource } from '@livekit/rtc-node';
|
|
2
|
+
import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
|
|
3
|
+
import type { TextAudioSynchronizer } from '../transcription.js';
|
|
4
|
+
import { Future } from '../utils.js';
|
|
5
|
+
import { SynthesisHandle } from './agent_output.js';
|
|
6
|
+
export declare enum AgentPlayoutEvent {
|
|
7
|
+
PLAYOUT_STARTED = 0,
|
|
8
|
+
PLAYOUT_STOPPED = 1
|
|
9
|
+
}
|
|
10
|
+
export type AgentPlayoutCallbacks = {
|
|
11
|
+
[AgentPlayoutEvent.PLAYOUT_STARTED]: () => void;
|
|
12
|
+
[AgentPlayoutEvent.PLAYOUT_STOPPED]: (interrupt: boolean) => void;
|
|
13
|
+
};
|
|
14
|
+
export declare class PlayoutHandle {
|
|
15
|
+
#private;
|
|
16
|
+
playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>;
|
|
17
|
+
totalPlayedTime?: number;
|
|
18
|
+
synchronizer: TextAudioSynchronizer;
|
|
19
|
+
pushedDuration: number;
|
|
20
|
+
intFut: Future;
|
|
21
|
+
doneFut: Future;
|
|
22
|
+
constructor(speechId: string, audioSource: AudioSource, playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>, synchronizer: TextAudioSynchronizer);
|
|
23
|
+
get speechId(): string;
|
|
24
|
+
get interrupted(): boolean;
|
|
25
|
+
get timePlayed(): number;
|
|
26
|
+
get done(): boolean;
|
|
27
|
+
interrupt(): void;
|
|
28
|
+
join(): Future;
|
|
29
|
+
}
|
|
30
|
+
declare const AgentPlayout_base: new () => TypedEmitter<AgentPlayoutCallbacks>;
|
|
31
|
+
export declare class AgentPlayout extends AgentPlayout_base {
|
|
32
|
+
#private;
|
|
33
|
+
constructor(audioSource: AudioSource);
|
|
34
|
+
get targetVolume(): number;
|
|
35
|
+
set targetVolume(vol: number);
|
|
36
|
+
play(speechId: string, playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>, synchronizer: TextAudioSynchronizer): PlayoutHandle;
|
|
37
|
+
close(): Promise<void>;
|
|
38
|
+
}
|
|
39
|
+
export {};
|
|
40
|
+
//# sourceMappingURL=agent_playout.d.ts.map
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import type { NoiseCancellationOptions, RemoteAudioTrack, RemoteParticipant, Room } from '@livekit/rtc-node';
|
|
2
|
+
import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
|
|
3
|
+
import type { STT, SpeechEvent } from '../stt/stt.js';
|
|
4
|
+
import type { VAD, VADEvent } from '../vad.js';
|
|
5
|
+
export declare enum HumanInputEvent {
|
|
6
|
+
START_OF_SPEECH = 0,
|
|
7
|
+
VAD_INFERENCE_DONE = 1,
|
|
8
|
+
END_OF_SPEECH = 2,
|
|
9
|
+
FINAL_TRANSCRIPT = 3,
|
|
10
|
+
INTERIM_TRANSCRIPT = 4
|
|
11
|
+
}
|
|
12
|
+
export type HumanInputCallbacks = {
|
|
13
|
+
[HumanInputEvent.START_OF_SPEECH]: (event: VADEvent) => void;
|
|
14
|
+
[HumanInputEvent.VAD_INFERENCE_DONE]: (event: VADEvent) => void;
|
|
15
|
+
[HumanInputEvent.END_OF_SPEECH]: (event: VADEvent) => void;
|
|
16
|
+
[HumanInputEvent.FINAL_TRANSCRIPT]: (event: SpeechEvent) => void;
|
|
17
|
+
[HumanInputEvent.INTERIM_TRANSCRIPT]: (event: SpeechEvent) => void;
|
|
18
|
+
};
|
|
19
|
+
declare const HumanInput_base: new () => TypedEmitter<HumanInputCallbacks>;
|
|
20
|
+
export declare class HumanInput extends HumanInput_base {
|
|
21
|
+
#private;
|
|
22
|
+
constructor(room: Room, vad: VAD, stt: STT, participant: RemoteParticipant, noiseCancellation?: NoiseCancellationOptions);
|
|
23
|
+
get participant(): RemoteParticipant;
|
|
24
|
+
get subscribedTrack(): RemoteAudioTrack | undefined;
|
|
25
|
+
get speaking(): boolean;
|
|
26
|
+
get speakingProbability(): number;
|
|
27
|
+
close(): Promise<void>;
|
|
28
|
+
}
|
|
29
|
+
export {};
|
|
30
|
+
//# sourceMappingURL=human_input.d.ts.map
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import type { NoiseCancellationOptions, RemoteParticipant, Room } from '@livekit/rtc-node';
|
|
2
|
+
import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
|
|
3
|
+
import type { CallableFunctionResult, FunctionCallInfo, FunctionContext, LLM } from '../llm/index.js';
|
|
4
|
+
import { LLMStream } from '../llm/index.js';
|
|
5
|
+
import { ChatContext, ChatMessage } from '../llm/index.js';
|
|
6
|
+
import type { AgentMetrics } from '../metrics/base.js';
|
|
7
|
+
import { type STT } from '../stt/index.js';
|
|
8
|
+
import type { SentenceTokenizer, WordTokenizer } from '../tokenize/tokenizer.js';
|
|
9
|
+
import type { TTS } from '../tts/index.js';
|
|
10
|
+
import { type VAD } from '../vad.js';
|
|
11
|
+
import type { SpeechSource } from './agent_output.js';
|
|
12
|
+
import { SpeechHandle } from './speech_handle.js';
|
|
13
|
+
export type AgentState = 'initializing' | 'thinking' | 'listening' | 'speaking';
|
|
14
|
+
export declare const AGENT_STATE_ATTRIBUTE = "lk.agent.state";
|
|
15
|
+
export type BeforeLLMCallback = (agent: VoicePipelineAgent, chatCtx: ChatContext) => LLMStream | false | void | Promise<LLMStream | false | void>;
|
|
16
|
+
export type BeforeTTSCallback = (agent: VoicePipelineAgent, source: string | AsyncIterable<string>) => SpeechSource;
|
|
17
|
+
export declare enum VPAEvent {
|
|
18
|
+
USER_STARTED_SPEAKING = 0,
|
|
19
|
+
USER_STOPPED_SPEAKING = 1,
|
|
20
|
+
AGENT_STARTED_SPEAKING = 2,
|
|
21
|
+
AGENT_STOPPED_SPEAKING = 3,
|
|
22
|
+
USER_SPEECH_COMMITTED = 4,
|
|
23
|
+
AGENT_SPEECH_COMMITTED = 5,
|
|
24
|
+
AGENT_SPEECH_INTERRUPTED = 6,
|
|
25
|
+
FUNCTION_CALLS_COLLECTED = 7,
|
|
26
|
+
FUNCTION_CALLS_FINISHED = 8,
|
|
27
|
+
METRICS_COLLECTED = 9
|
|
28
|
+
}
|
|
29
|
+
export type VPACallbacks = {
|
|
30
|
+
[VPAEvent.USER_STARTED_SPEAKING]: () => void;
|
|
31
|
+
[VPAEvent.USER_STOPPED_SPEAKING]: () => void;
|
|
32
|
+
[VPAEvent.AGENT_STARTED_SPEAKING]: () => void;
|
|
33
|
+
[VPAEvent.AGENT_STOPPED_SPEAKING]: () => void;
|
|
34
|
+
[VPAEvent.USER_SPEECH_COMMITTED]: (msg: ChatMessage) => void;
|
|
35
|
+
[VPAEvent.AGENT_SPEECH_COMMITTED]: (msg: ChatMessage) => void;
|
|
36
|
+
[VPAEvent.AGENT_SPEECH_INTERRUPTED]: (msg: ChatMessage) => void;
|
|
37
|
+
[VPAEvent.FUNCTION_CALLS_COLLECTED]: (funcs: FunctionCallInfo[]) => void;
|
|
38
|
+
[VPAEvent.FUNCTION_CALLS_FINISHED]: (funcs: CallableFunctionResult[]) => void;
|
|
39
|
+
[VPAEvent.METRICS_COLLECTED]: (metrics: AgentMetrics) => void;
|
|
40
|
+
};
|
|
41
|
+
interface TurnDetector {
|
|
42
|
+
unlikelyThreshold: number;
|
|
43
|
+
supportsLanguage: (language?: string) => boolean;
|
|
44
|
+
predictEndOfTurn: (chatCtx: ChatContext) => Promise<number>;
|
|
45
|
+
}
|
|
46
|
+
export declare class AgentCallContext {
|
|
47
|
+
#private;
|
|
48
|
+
constructor(agent: VoicePipelineAgent, llmStream: LLMStream);
|
|
49
|
+
static getCurrent(): AgentCallContext;
|
|
50
|
+
get agent(): VoicePipelineAgent;
|
|
51
|
+
storeMetadata(key: string, value: any): void;
|
|
52
|
+
getMetadata(key: string, orDefault?: any): any;
|
|
53
|
+
get llmStream(): LLMStream;
|
|
54
|
+
get extraChatMessages(): ChatMessage[];
|
|
55
|
+
addExtraChatMessage(message: ChatMessage): void;
|
|
56
|
+
}
|
|
57
|
+
export interface AgentTranscriptionOptions {
|
|
58
|
+
/** Whether to forward the user transcription to the client */
|
|
59
|
+
userTranscription: boolean;
|
|
60
|
+
/** Whether to forward the agent transcription to the client */
|
|
61
|
+
agentTranscription: boolean;
|
|
62
|
+
/**
|
|
63
|
+
* The speed at which the agent's speech transcription is forwarded to the client.
|
|
64
|
+
* We try to mimic the agent's speech speed by adjusting the transcription speed.
|
|
65
|
+
*/
|
|
66
|
+
agentTranscriptionSpeech: number;
|
|
67
|
+
/**
|
|
68
|
+
* The tokenizer used to split the speech into sentences.
|
|
69
|
+
* This is used to decide when to mark a transcript as final for the agent transcription.
|
|
70
|
+
*/
|
|
71
|
+
sentenceTokenizer: SentenceTokenizer;
|
|
72
|
+
/**
|
|
73
|
+
* The tokenizer used to split the speech into words.
|
|
74
|
+
* This is used to simulate the "interim results" of the agent transcription.
|
|
75
|
+
*/
|
|
76
|
+
wordTokenizer: WordTokenizer;
|
|
77
|
+
/**
|
|
78
|
+
* A function that takes a string (word) as input and returns a list of strings,
|
|
79
|
+
* representing the hyphenated parts of the word.
|
|
80
|
+
*/
|
|
81
|
+
hyphenateWord: (word: string) => string[];
|
|
82
|
+
}
|
|
83
|
+
export interface VPAOptions {
|
|
84
|
+
/** Chat context for the assistant. */
|
|
85
|
+
chatCtx?: ChatContext;
|
|
86
|
+
/** Function context for the assistant. */
|
|
87
|
+
fncCtx?: FunctionContext;
|
|
88
|
+
/** Whether to allow the user to interrupt the assistant. */
|
|
89
|
+
allowInterruptions: boolean;
|
|
90
|
+
/** Minimum duration of speech to consider for interruption. */
|
|
91
|
+
interruptSpeechDuration: number;
|
|
92
|
+
/** Minimum number of words to consider for interruption. This may increase latency. */
|
|
93
|
+
interruptMinWords: number;
|
|
94
|
+
/** Delay to wait before considering the user speech done. */
|
|
95
|
+
minEndpointingDelay: number;
|
|
96
|
+
maxNestedFncCalls: number;
|
|
97
|
+
preemptiveSynthesis: boolean;
|
|
98
|
+
beforeLLMCallback: BeforeLLMCallback;
|
|
99
|
+
beforeTTSCallback: BeforeTTSCallback;
|
|
100
|
+
/** Options for assistant transcription. */
|
|
101
|
+
transcription: AgentTranscriptionOptions;
|
|
102
|
+
/** Turn detection model to use. */
|
|
103
|
+
turnDetector?: TurnDetector;
|
|
104
|
+
/** Noise cancellation options. */
|
|
105
|
+
noiseCancellation?: NoiseCancellationOptions;
|
|
106
|
+
}
|
|
107
|
+
declare const VoicePipelineAgent_base: new () => TypedEmitter<VPACallbacks>;
|
|
108
|
+
/** A pipeline agent (VAD + STT + LLM + TTS) implementation. */
|
|
109
|
+
export declare class VoicePipelineAgent extends VoicePipelineAgent_base {
|
|
110
|
+
#private;
|
|
111
|
+
/** Minimum time played for the user speech to be committed to the chat context. */
|
|
112
|
+
readonly MIN_TIME_PLAYED_FOR_COMMIT = 1.5;
|
|
113
|
+
protected static readonly FLUSH_SENTINEL: unique symbol;
|
|
114
|
+
transcribedText: string;
|
|
115
|
+
constructor(
|
|
116
|
+
/** Voice Activity Detection instance. */
|
|
117
|
+
vad: VAD,
|
|
118
|
+
/** Speech-to-Text instance. */
|
|
119
|
+
stt: STT,
|
|
120
|
+
/** Large Language Model instance. */
|
|
121
|
+
llm: LLM,
|
|
122
|
+
/** Text-to-Speech instance. */
|
|
123
|
+
tts: TTS,
|
|
124
|
+
/** Additional VoicePipelineAgent options. */
|
|
125
|
+
opts?: Partial<VPAOptions>);
|
|
126
|
+
get fncCtx(): FunctionContext | undefined;
|
|
127
|
+
set fncCtx(ctx: FunctionContext);
|
|
128
|
+
get chatCtx(): ChatContext;
|
|
129
|
+
get llm(): LLM;
|
|
130
|
+
get tts(): TTS;
|
|
131
|
+
get stt(): STT;
|
|
132
|
+
get vad(): VAD;
|
|
133
|
+
/** Start the voice assistant. */
|
|
134
|
+
start(
|
|
135
|
+
/** The room to connect to. */
|
|
136
|
+
room: Room,
|
|
137
|
+
/**
|
|
138
|
+
* The participant to listen to.
|
|
139
|
+
*
|
|
140
|
+
* @remarks
|
|
141
|
+
* Can be a participant or an identity.
|
|
142
|
+
* If omitted, the first participant in the room will be selected.
|
|
143
|
+
*/
|
|
144
|
+
participant?: RemoteParticipant | string | null): void;
|
|
145
|
+
/** Play a speech source through the voice assistant. */
|
|
146
|
+
say(source: string | LLMStream | AsyncIterable<string>, allowInterruptions?: boolean, addToChatCtx?: boolean): Promise<SpeechHandle>;
|
|
147
|
+
/** Close the voice assistant. */
|
|
148
|
+
close(): Promise<void>;
|
|
149
|
+
}
|
|
150
|
+
export {};
|
|
151
|
+
//# sourceMappingURL=pipeline_agent.d.ts.map
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import type { ChatMessage, LLMStream } from '../llm/index.js';
|
|
2
|
+
import type { SynthesisHandle } from './agent_output.js';
|
|
3
|
+
export declare class SpeechHandle {
|
|
4
|
+
#private;
|
|
5
|
+
constructor(id: string, allowInterruptions: boolean, addToChatCtx: boolean, isReply: boolean, userQuestion: string, fncNestedDepth?: number, extraToolsMessages?: ChatMessage[] | undefined);
|
|
6
|
+
static createAssistantReply(allowInterruptions: boolean, addToChatCtx: boolean, userQuestion: string): SpeechHandle;
|
|
7
|
+
static createAssistantSpeech(allowInterruptions: boolean, addToChatCtx: boolean): SpeechHandle;
|
|
8
|
+
static createToolSpeech(allowInterruptions: boolean, addToChatCtx: boolean, fncNestedDepth: number, extraToolsMessages: ChatMessage[]): SpeechHandle;
|
|
9
|
+
waitForInitialization(): Promise<void>;
|
|
10
|
+
initialize(source: string | LLMStream | AsyncIterable<string>, synthesisHandle: SynthesisHandle): void;
|
|
11
|
+
markUserCommitted(): void;
|
|
12
|
+
markSpeechCommitted(): void;
|
|
13
|
+
get userCommitted(): boolean;
|
|
14
|
+
get speechCommitted(): boolean;
|
|
15
|
+
get id(): string;
|
|
16
|
+
get allowInterruptions(): boolean;
|
|
17
|
+
get addToChatCtx(): boolean;
|
|
18
|
+
get source(): string | LLMStream | AsyncIterable<string>;
|
|
19
|
+
get synthesisHandle(): SynthesisHandle;
|
|
20
|
+
set synthesisHandle(handle: SynthesisHandle);
|
|
21
|
+
get initialized(): boolean;
|
|
22
|
+
get isReply(): boolean;
|
|
23
|
+
get userQuestion(): string;
|
|
24
|
+
get interrupted(): boolean;
|
|
25
|
+
get fncNestedDepth(): number;
|
|
26
|
+
get extraToolsMessages(): ChatMessage[] | undefined;
|
|
27
|
+
addNestedSpeech(handle: SpeechHandle): void;
|
|
28
|
+
get nestedSpeechHandles(): SpeechHandle[];
|
|
29
|
+
nestedSpeechChanged(): Promise<void>;
|
|
30
|
+
get nestedSpeechFinished(): boolean;
|
|
31
|
+
markNestedSpeechFinished(): void;
|
|
32
|
+
join(): Promise<void>;
|
|
33
|
+
setDone(): void;
|
|
34
|
+
interrupt(): void;
|
|
35
|
+
cancel(): void;
|
|
36
|
+
}
|
|
37
|
+
//# sourceMappingURL=speech_handle.d.ts.map
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
export declare abstract class Plugin {
|
|
2
|
+
#private;
|
|
3
|
+
registeredPlugins: Plugin[];
|
|
4
|
+
constructor(title: string, version: string);
|
|
5
|
+
static registerPlugins(plugin: Plugin): void;
|
|
6
|
+
abstract downloadFiles(): void;
|
|
7
|
+
get title(): string;
|
|
8
|
+
get version(): string;
|
|
9
|
+
}
|
|
10
|
+
//# sourceMappingURL=plugin.d.ts.map
|
|
@@ -0,0 +1,3 @@
|
|
|
1
|
+
export { type SpeechEvent, type SpeechData, type STTCapabilities, type RecognitionUsage, type STTCallbacks, SpeechEventType, STT, SpeechStream, } from './stt.js';
|
|
2
|
+
export { StreamAdapter, StreamAdapterWrapper } from './stream_adapter.js';
|
|
3
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import type { AudioFrame } from '@livekit/rtc-node';
|
|
2
|
+
import type { VAD } from '../vad.js';
|
|
3
|
+
import type { SpeechEvent } from './stt.js';
|
|
4
|
+
import { STT, SpeechStream } from './stt.js';
|
|
5
|
+
export declare class StreamAdapter extends STT {
|
|
6
|
+
#private;
|
|
7
|
+
label: string;
|
|
8
|
+
constructor(stt: STT, vad: VAD);
|
|
9
|
+
_recognize(frame: AudioFrame): Promise<SpeechEvent>;
|
|
10
|
+
stream(): StreamAdapterWrapper;
|
|
11
|
+
}
|
|
12
|
+
export declare class StreamAdapterWrapper extends SpeechStream {
|
|
13
|
+
#private;
|
|
14
|
+
label: string;
|
|
15
|
+
constructor(stt: STT, vad: VAD);
|
|
16
|
+
monitorMetrics(): Promise<void>;
|
|
17
|
+
}
|
|
18
|
+
//# sourceMappingURL=stream_adapter.d.ts.map
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
import type { AudioFrame } from '@livekit/rtc-node';
|
|
2
|
+
import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
|
|
3
|
+
import type { STTMetrics } from '../metrics/base.js';
|
|
4
|
+
import type { AudioBuffer } from '../utils.js';
|
|
5
|
+
import { AsyncIterableQueue } from '../utils.js';
|
|
6
|
+
/** Indicates start/middle/end of speech */
|
|
7
|
+
export declare enum SpeechEventType {
|
|
8
|
+
/**
|
|
9
|
+
* Indicate the start of speech.
|
|
10
|
+
* If the STT doesn't support this event, this will be emitted at the same time
|
|
11
|
+
* as the first INTERIM_TRANSCRIPT.
|
|
12
|
+
*/
|
|
13
|
+
START_OF_SPEECH = 0,
|
|
14
|
+
/**
|
|
15
|
+
* Interim transcript, useful for real-time transcription.
|
|
16
|
+
*/
|
|
17
|
+
INTERIM_TRANSCRIPT = 1,
|
|
18
|
+
/**
|
|
19
|
+
* Final transcript, emitted when the STT is confident enough that a certain
|
|
20
|
+
* portion of the speech will not change.
|
|
21
|
+
*/
|
|
22
|
+
FINAL_TRANSCRIPT = 2,
|
|
23
|
+
/**
|
|
24
|
+
* Indicate the end of speech, emitted when the user stops speaking.
|
|
25
|
+
* The first alternative is a combination of all the previous FINAL_TRANSCRIPT events.
|
|
26
|
+
*/
|
|
27
|
+
END_OF_SPEECH = 3,
|
|
28
|
+
/** Usage event, emitted periodically to indicate usage metrics. */
|
|
29
|
+
RECOGNITION_USAGE = 4,
|
|
30
|
+
METRICS_COLLECTED = 5
|
|
31
|
+
}
|
|
32
|
+
/** SpeechData contains metadata about this {@link SpeechEvent}. */
|
|
33
|
+
export interface SpeechData {
|
|
34
|
+
language: string;
|
|
35
|
+
text: string;
|
|
36
|
+
startTime: number;
|
|
37
|
+
endTime: number;
|
|
38
|
+
confidence: number;
|
|
39
|
+
}
|
|
40
|
+
export interface RecognitionUsage {
|
|
41
|
+
audioDuration: number;
|
|
42
|
+
}
|
|
43
|
+
/** SpeechEvent is a packet of speech-to-text data. */
|
|
44
|
+
export interface SpeechEvent {
|
|
45
|
+
type: SpeechEventType;
|
|
46
|
+
alternatives?: [SpeechData, ...SpeechData[]];
|
|
47
|
+
requestId?: string;
|
|
48
|
+
recognitionUsage?: RecognitionUsage;
|
|
49
|
+
}
|
|
50
|
+
/**
|
|
51
|
+
* Describes the capabilities of the STT provider.
|
|
52
|
+
*
|
|
53
|
+
* @remarks
|
|
54
|
+
* At present, the framework only supports providers that have a streaming endpoint.
|
|
55
|
+
*/
|
|
56
|
+
export interface STTCapabilities {
|
|
57
|
+
streaming: boolean;
|
|
58
|
+
interimResults: boolean;
|
|
59
|
+
}
|
|
60
|
+
export type STTCallbacks = {
|
|
61
|
+
[SpeechEventType.METRICS_COLLECTED]: (metrics: STTMetrics) => void;
|
|
62
|
+
};
|
|
63
|
+
declare const STT_base: new () => TypedEmitter<STTCallbacks>;
|
|
64
|
+
/**
|
|
65
|
+
* An instance of a speech-to-text adapter.
|
|
66
|
+
*
|
|
67
|
+
* @remarks
|
|
68
|
+
* This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that
|
|
69
|
+
* exports its own child STT class, which inherits this class's methods.
|
|
70
|
+
*/
|
|
71
|
+
export declare abstract class STT extends STT_base {
|
|
72
|
+
#private;
|
|
73
|
+
abstract label: string;
|
|
74
|
+
constructor(capabilities: STTCapabilities);
|
|
75
|
+
/** Returns this STT's capabilities */
|
|
76
|
+
get capabilities(): STTCapabilities;
|
|
77
|
+
/** Receives an audio buffer and returns transcription in the form of a {@link SpeechEvent} */
|
|
78
|
+
recognize(frame: AudioBuffer): Promise<SpeechEvent>;
|
|
79
|
+
protected abstract _recognize(frame: AudioBuffer): Promise<SpeechEvent>;
|
|
80
|
+
/**
|
|
81
|
+
* Returns a {@link SpeechStream} that can be used to push audio frames and receive
|
|
82
|
+
* transcriptions
|
|
83
|
+
*/
|
|
84
|
+
abstract stream(): SpeechStream;
|
|
85
|
+
}
|
|
86
|
+
/**
|
|
87
|
+
* An instance of a speech-to-text stream, as an asynchronous iterable iterator.
|
|
88
|
+
*
|
|
89
|
+
* @example Looping through speech events
|
|
90
|
+
* ```ts
|
|
91
|
+
* for await (const event of stream) {
|
|
92
|
+
* if (event.type === SpeechEventType.FINAL_TRANSCRIPT) {
|
|
93
|
+
* console.log(event.alternatives?.[0].text)
|
|
94
|
+
* }
|
|
95
|
+
* }
|
|
96
|
+
* ```
|
|
97
|
+
*
|
|
98
|
+
* @remarks
|
|
99
|
+
* This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that
|
|
100
|
+
* exports its own child SpeechStream class, which inherits this class's methods.
|
|
101
|
+
*/
|
|
102
|
+
export declare abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent> {
|
|
103
|
+
#private;
|
|
104
|
+
protected static readonly FLUSH_SENTINEL: unique symbol;
|
|
105
|
+
protected input: AsyncIterableQueue<AudioFrame | typeof SpeechStream.FLUSH_SENTINEL>;
|
|
106
|
+
protected output: AsyncIterableQueue<SpeechEvent>;
|
|
107
|
+
protected queue: AsyncIterableQueue<SpeechEvent>;
|
|
108
|
+
abstract label: string;
|
|
109
|
+
protected closed: boolean;
|
|
110
|
+
constructor(stt: STT);
|
|
111
|
+
protected monitorMetrics(): Promise<void>;
|
|
112
|
+
/** Push an audio frame to the STT */
|
|
113
|
+
pushFrame(frame: AudioFrame): void;
|
|
114
|
+
/** Flush the STT, causing it to process all pending audio frames */
|
|
115
|
+
flush(): void;
|
|
116
|
+
/** Mark the input as ended and forbid additional pushes */
|
|
117
|
+
endInput(): void;
|
|
118
|
+
next(): Promise<IteratorResult<SpeechEvent>>;
|
|
119
|
+
/** Close both the input and output of the STT stream */
|
|
120
|
+
close(): void;
|
|
121
|
+
[Symbol.asyncIterator](): SpeechStream;
|
|
122
|
+
}
|
|
123
|
+
export {};
|
|
124
|
+
//# sourceMappingURL=stt.d.ts.map
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import * as tokenizer from '../tokenizer.js';
|
|
2
|
+
import { splitWords } from './word.js';
|
|
3
|
+
export declare class SentenceTokenizer extends tokenizer.SentenceTokenizer {
|
|
4
|
+
#private;
|
|
5
|
+
constructor(language?: string, minSentenceLength?: number, streamContextLength?: number);
|
|
6
|
+
tokenize(text: string, language?: string): string[];
|
|
7
|
+
stream(language?: string): tokenizer.SentenceStream;
|
|
8
|
+
}
|
|
9
|
+
export declare class WordTokenizer extends tokenizer.WordTokenizer {
|
|
10
|
+
#private;
|
|
11
|
+
constructor(ignorePunctuation?: boolean);
|
|
12
|
+
tokenize(text: string, language?: string): string[];
|
|
13
|
+
stream(language?: string): tokenizer.WordStream;
|
|
14
|
+
}
|
|
15
|
+
export declare const hyphenateWord: (word: string) => string[];
|
|
16
|
+
export { splitWords };
|
|
17
|
+
export declare const tokenizeParagraphs: (text: string) => string[];
|
|
18
|
+
//# sourceMappingURL=basic.d.ts.map
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
declare const END: unique symbol;
|
|
2
|
+
interface Tree {
|
|
3
|
+
[id: string]: Tree | string;
|
|
4
|
+
[END]?: number[];
|
|
5
|
+
}
|
|
6
|
+
declare class Hyphenator {
|
|
7
|
+
#private;
|
|
8
|
+
tree: Tree;
|
|
9
|
+
exceptions: {
|
|
10
|
+
[id: string]: number[];
|
|
11
|
+
};
|
|
12
|
+
constructor(patterns: string, exceptions: string);
|
|
13
|
+
hyphenateWord(word: string): string[];
|
|
14
|
+
}
|
|
15
|
+
export declare const hyphenator: Hyphenator;
|
|
16
|
+
export {};
|
|
17
|
+
//# sourceMappingURL=hyphenator.d.ts.map
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
import * as basic from './basic/index.js';
|
|
2
|
+
export { type TokenData, SentenceTokenizer, SentenceStream, WordTokenizer, WordStream, } from './tokenizer.js';
|
|
3
|
+
export { BufferedSentenceStream, BufferedTokenStream, BufferedWordStream } from './token_stream.js';
|
|
4
|
+
export { basic };
|
|
5
|
+
//# sourceMappingURL=index.d.ts.map
|