@livekit/agents 0.5.1 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +3 -0
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -0
- package/dist/index.js.map +1 -1
- package/dist/llm/index.cjs +2 -0
- package/dist/llm/index.cjs.map +1 -1
- package/dist/llm/index.d.ts +1 -1
- package/dist/llm/index.d.ts.map +1 -1
- package/dist/llm/index.js +2 -0
- package/dist/llm/index.js.map +1 -1
- package/dist/llm/llm.cjs +47 -3
- package/dist/llm/llm.cjs.map +1 -1
- package/dist/llm/llm.d.ts +15 -2
- package/dist/llm/llm.d.ts.map +1 -1
- package/dist/llm/llm.js +46 -3
- package/dist/llm/llm.js.map +1 -1
- package/dist/metrics/base.cjs +44 -0
- package/dist/metrics/base.cjs.map +1 -0
- package/dist/metrics/base.d.ts +96 -0
- package/dist/metrics/base.d.ts.map +1 -0
- package/dist/metrics/base.js +20 -0
- package/dist/metrics/base.js.map +1 -0
- package/dist/metrics/index.cjs +35 -0
- package/dist/metrics/index.cjs.map +1 -0
- package/dist/metrics/index.d.ts +5 -0
- package/dist/metrics/index.d.ts.map +1 -0
- package/dist/metrics/index.js +9 -0
- package/dist/metrics/index.js.map +1 -0
- package/dist/metrics/usage_collector.cjs +53 -0
- package/dist/metrics/usage_collector.cjs.map +1 -0
- package/dist/metrics/usage_collector.d.ts +14 -0
- package/dist/metrics/usage_collector.d.ts.map +1 -0
- package/dist/metrics/usage_collector.js +29 -0
- package/dist/metrics/usage_collector.js.map +1 -0
- package/dist/metrics/utils.cjs +104 -0
- package/dist/metrics/utils.cjs.map +1 -0
- package/dist/metrics/utils.d.ts +10 -0
- package/dist/metrics/utils.d.ts.map +1 -0
- package/dist/metrics/utils.js +73 -0
- package/dist/metrics/utils.js.map +1 -0
- package/dist/multimodal/multimodal_agent.cjs +7 -13
- package/dist/multimodal/multimodal_agent.cjs.map +1 -1
- package/dist/multimodal/multimodal_agent.d.ts +1 -4
- package/dist/multimodal/multimodal_agent.d.ts.map +1 -1
- package/dist/multimodal/multimodal_agent.js +7 -13
- package/dist/multimodal/multimodal_agent.js.map +1 -1
- package/dist/pipeline/agent_output.cjs +9 -2
- package/dist/pipeline/agent_output.cjs.map +1 -1
- package/dist/pipeline/agent_output.d.ts +1 -0
- package/dist/pipeline/agent_output.d.ts.map +1 -1
- package/dist/pipeline/agent_output.js +9 -2
- package/dist/pipeline/agent_output.js.map +1 -1
- package/dist/pipeline/index.cjs +2 -0
- package/dist/pipeline/index.cjs.map +1 -1
- package/dist/pipeline/index.d.ts +1 -1
- package/dist/pipeline/index.d.ts.map +1 -1
- package/dist/pipeline/index.js +3 -1
- package/dist/pipeline/index.js.map +1 -1
- package/dist/pipeline/pipeline_agent.cjs +168 -70
- package/dist/pipeline/pipeline_agent.cjs.map +1 -1
- package/dist/pipeline/pipeline_agent.d.ts +10 -4
- package/dist/pipeline/pipeline_agent.d.ts.map +1 -1
- package/dist/pipeline/pipeline_agent.js +171 -73
- package/dist/pipeline/pipeline_agent.js.map +1 -1
- package/dist/pipeline/speech_handle.cjs +49 -1
- package/dist/pipeline/speech_handle.cjs.map +1 -1
- package/dist/pipeline/speech_handle.d.ts +12 -2
- package/dist/pipeline/speech_handle.d.ts.map +1 -1
- package/dist/pipeline/speech_handle.js +50 -2
- package/dist/pipeline/speech_handle.js.map +1 -1
- package/dist/stt/index.cjs.map +1 -1
- package/dist/stt/index.d.ts +1 -1
- package/dist/stt/index.d.ts.map +1 -1
- package/dist/stt/index.js.map +1 -1
- package/dist/stt/stream_adapter.cjs +15 -5
- package/dist/stt/stream_adapter.cjs.map +1 -1
- package/dist/stt/stream_adapter.d.ts +4 -1
- package/dist/stt/stream_adapter.d.ts.map +1 -1
- package/dist/stt/stream_adapter.js +15 -5
- package/dist/stt/stream_adapter.js.map +1 -1
- package/dist/stt/stt.cjs +46 -2
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.ts +25 -3
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +46 -2
- package/dist/stt/stt.js.map +1 -1
- package/dist/tts/index.cjs +4 -2
- package/dist/tts/index.cjs.map +1 -1
- package/dist/tts/index.d.ts +1 -1
- package/dist/tts/index.d.ts.map +1 -1
- package/dist/tts/index.js +3 -1
- package/dist/tts/index.js.map +1 -1
- package/dist/tts/stream_adapter.cjs +14 -3
- package/dist/tts/stream_adapter.cjs.map +1 -1
- package/dist/tts/stream_adapter.d.ts +3 -0
- package/dist/tts/stream_adapter.d.ts.map +1 -1
- package/dist/tts/stream_adapter.js +15 -4
- package/dist/tts/stream_adapter.js.map +1 -1
- package/dist/tts/tts.cjs +109 -6
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.ts +24 -1
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js +107 -5
- package/dist/tts/tts.js.map +1 -1
- package/dist/vad.cjs +43 -2
- package/dist/vad.cjs.map +1 -1
- package/dist/vad.d.ts +21 -4
- package/dist/vad.d.ts.map +1 -1
- package/dist/vad.js +43 -2
- package/dist/vad.js.map +1 -1
- package/package.json +1 -1
- package/src/index.ts +2 -1
- package/src/llm/index.ts +2 -0
- package/src/llm/llm.ts +55 -3
- package/src/metrics/base.ts +127 -0
- package/src/metrics/index.ts +20 -0
- package/src/metrics/usage_collector.ts +40 -0
- package/src/metrics/utils.ts +100 -0
- package/src/multimodal/multimodal_agent.ts +12 -17
- package/src/pipeline/agent_output.ts +14 -7
- package/src/pipeline/index.ts +1 -1
- package/src/pipeline/pipeline_agent.ts +210 -95
- package/src/pipeline/speech_handle.ts +67 -2
- package/src/stt/index.ts +2 -0
- package/src/stt/stream_adapter.ts +17 -5
- package/src/stt/stt.ts +67 -3
- package/src/tts/index.ts +2 -0
- package/src/tts/stream_adapter.ts +17 -4
- package/src/tts/tts.ts +127 -4
- package/src/vad.ts +61 -4

package/dist/pipeline/pipeline_agent.d.ts

@@ -3,11 +3,13 @@ import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
 import type { CallableFunctionResult, FunctionCallInfo, FunctionContext, LLM } from '../llm/index.js';
 import { LLMStream } from '../llm/index.js';
 import { ChatContext, ChatMessage } from '../llm/index.js';
+import type { AgentMetrics } from '../metrics/base.js';
 import { type STT } from '../stt/index.js';
 import type { SentenceTokenizer, WordTokenizer } from '../tokenize/tokenizer.js';
 import type { TTS } from '../tts/index.js';
-import type { VAD } from '../vad.js';
+import { type VAD } from '../vad.js';
 import type { SpeechSource } from './agent_output.js';
+import { SpeechHandle } from './speech_handle.js';
 export type AgentState = 'initializing' | 'thinking' | 'listening' | 'speaking';
 export declare const AGENT_STATE_ATTRIBUTE = "lk.agent.state";
 export type BeforeLLMCallback = (agent: VoicePipelineAgent, chatCtx: ChatContext) => LLMStream | false | void | Promise<LLMStream | false | void>;
@@ -21,7 +23,8 @@ export declare enum VPAEvent {
     AGENT_SPEECH_COMMITTED = 5,
     AGENT_SPEECH_INTERRUPTED = 6,
     FUNCTION_CALLS_COLLECTED = 7,
-    FUNCTION_CALLS_FINISHED = 8
+    FUNCTION_CALLS_FINISHED = 8,
+    METRICS_COLLECTED = 9
 }
 export type VPACallbacks = {
     [VPAEvent.USER_STARTED_SPEAKING]: () => void;
@@ -33,6 +36,7 @@ export type VPACallbacks = {
     [VPAEvent.AGENT_SPEECH_INTERRUPTED]: (msg: ChatMessage) => void;
     [VPAEvent.FUNCTION_CALLS_COLLECTED]: (funcs: FunctionCallInfo[]) => void;
     [VPAEvent.FUNCTION_CALLS_FINISHED]: (funcs: CallableFunctionResult[]) => void;
+    [VPAEvent.METRICS_COLLECTED]: (metrics: AgentMetrics) => void;
 };
 export declare class AgentCallContext {
     #private;
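The typings above introduce a single agent-level METRICS_COLLECTED event whose payload is the new AgentMetrics type from the metrics module. A minimal sketch of consuming it, assuming `pipeline` is the namespace export used in earlier releases and that the listener is attached to an already constructed VoicePipelineAgent:

```ts
import { pipeline } from '@livekit/agents';

// Construction of the agent (VAD/STT/LLM/TTS plugins) is elided here.
declare const agent: pipeline.VoicePipelineAgent;

agent.on(pipeline.VPAEvent.METRICS_COLLECTED, (m) => {
  // VAD, STT, LLM and TTS metrics, plus end-of-utterance timing,
  // are all forwarded through this one event (see the compiled output below).
  console.log('metrics collected', m);
});
```
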
@@ -42,6 +46,8 @@ export declare class AgentCallContext {
     storeMetadata(key: string, value: any): void;
     getMetadata(key: string, orDefault?: any): any;
     get llmStream(): LLMStream;
+    get extraChatMessages(): ChatMessage[];
+    addExtraChatMessage(message: ChatMessage): void;
 }
 export interface AgentTranscriptionOptions {
     /** Whether to forward the user transcription to the client */
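AgentCallContext can now accumulate extra chat messages during a function call, which the pipeline later merges into the follow-up LLM request (see the compiled function-call handling further down). A sketch of using it from inside an executing AI function, assuming AgentCallContext is re-exported from the pipeline entry point and `llm` is the usual namespace export:

```ts
import { llm, pipeline } from '@livekit/agents';

// Hypothetical helper called from within an AI function handler.
function noteAssistantRemark(text: string) {
  const callCtx = pipeline.AgentCallContext.getCurrent();
  callCtx.addExtraChatMessage(
    llm.ChatMessage.create({ text, role: llm.ChatRole.ASSISTANT }),
  );
  // The messages are readable back via the matching getter.
  console.log('extra messages so far:', callCtx.extraChatMessages.length);
}
```
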
@@ -82,7 +88,7 @@ export interface VPAOptions {
     interruptMinWords: number;
     /** Delay to wait before considering the user speech done. */
     minEndpointingDelay: number;
-
+    maxNestedFncCalls: number;
     preemptiveSynthesis: boolean;
     beforeLLMCallback: BeforeLLMCallback;
     beforeTTSCallback: BeforeTTSCallback;
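VPAOptions gains maxNestedFncCalls, which bounds how many rounds of tool-call follow-up speech a single turn may trigger (the compiled defaults below set it to 1). A construction sketch; the partial options object relies on the runtime merge over defaultVPAOptions, and the loose cast is only there because the full VPAOptions shape is not reproduced here:

```ts
import { pipeline } from '@livekit/agents';

// Concrete VAD/STT/LLM/TTS plugin instances are elided.
declare const vad: any, stt: any, llm: any, tts: any;

const agent = new pipeline.VoicePipelineAgent(vad, stt, llm, tts, {
  maxNestedFncCalls: 3, // allow up to three chained tool-call replies per turn
} as any); // options are spread over the defaults at runtime
```
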
@@ -127,7 +133,7 @@ export declare class VoicePipelineAgent extends VoicePipelineAgent_base {
      */
     participant?: RemoteParticipant | string | null): void;
     /** Play a speech source through the voice assistant. */
-    say(source: string | LLMStream | AsyncIterable<string>, allowInterruptions?: boolean, addToChatCtx?: boolean): Promise<
+    say(source: string | LLMStream | AsyncIterable<string>, allowInterruptions?: boolean, addToChatCtx?: boolean): Promise<SpeechHandle>;
     /** Close the voice assistant. */
     close(): Promise<void>;
 }
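say() now resolves with the SpeechHandle it queued rather than an unparameterized promise, so callers can keep a reference to the utterance. A sketch using only members that appear elsewhere in this diff (`id`, `interrupted`):

```ts
import { pipeline } from '@livekit/agents';

declare const agent: pipeline.VoicePipelineAgent;

async function greet() {
  const handle = await agent.say('Give me a moment to check.', true, true);
  console.log('queued speech', handle.id);
  // Later on, the same handle records whether playback was cut off.
  if (handle.interrupted) {
    console.log('the utterance was interrupted');
  }
}
```
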

package/dist/pipeline/pipeline_agent.d.ts.map

@@ -1 +1 @@
(regenerated source map for pipeline_agent.d.ts; single-line mappings omitted)

package/dist/pipeline/pipeline_agent.js

@@ -6,22 +6,24 @@ import {
   TrackSource
 } from "@livekit/rtc-node";
 import EventEmitter from "node:events";
-import { LLMStream } from "../llm/index.js";
+import { LLMEvent, LLMStream } from "../llm/index.js";
 import { ChatContext, ChatMessage, ChatRole } from "../llm/index.js";
 import { log } from "../log.js";
-import { StreamAdapter as STTStreamAdapter } from "../stt/index.js";
+import { StreamAdapter as STTStreamAdapter, SpeechEventType } from "../stt/index.js";
 import {
   SentenceTokenizer as BasicSentenceTokenizer,
   WordTokenizer as BasicWordTokenizer,
   hyphenateWord
 } from "../tokenize/basic/index.js";
-import { StreamAdapter as TTSStreamAdapter } from "../tts/index.js";
+import { TTSEvent, StreamAdapter as TTSStreamAdapter } from "../tts/index.js";
 import { AsyncIterableQueue, CancellablePromise, Future, gracefullyCancel } from "../utils.js";
+import { VADEventType } from "../vad.js";
 import { AgentOutput } from "./agent_output.js";
 import { AgentPlayout, AgentPlayoutEvent } from "./agent_playout.js";
 import { HumanInput, HumanInputEvent } from "./human_input.js";
 import { SpeechHandle } from "./speech_handle.js";
 const AGENT_STATE_ATTRIBUTE = "lk.agent.state";
+let speechData;
 var VPAEvent = /* @__PURE__ */ ((VPAEvent2) => {
   VPAEvent2[VPAEvent2["USER_STARTED_SPEAKING"] = 0] = "USER_STARTED_SPEAKING";
   VPAEvent2[VPAEvent2["USER_STOPPED_SPEAKING"] = 1] = "USER_STOPPED_SPEAKING";
@@ -32,12 +34,14 @@ var VPAEvent = /* @__PURE__ */ ((VPAEvent2) => {
   VPAEvent2[VPAEvent2["AGENT_SPEECH_INTERRUPTED"] = 6] = "AGENT_SPEECH_INTERRUPTED";
   VPAEvent2[VPAEvent2["FUNCTION_CALLS_COLLECTED"] = 7] = "FUNCTION_CALLS_COLLECTED";
   VPAEvent2[VPAEvent2["FUNCTION_CALLS_FINISHED"] = 8] = "FUNCTION_CALLS_FINISHED";
+  VPAEvent2[VPAEvent2["METRICS_COLLECTED"] = 9] = "METRICS_COLLECTED";
   return VPAEvent2;
 })(VPAEvent || {});
 class AgentCallContext {
   #agent;
   #llmStream;
   #metadata = /* @__PURE__ */ new Map();
+  #extraChatMessages = [];
   static #current;
   constructor(agent, llmStream) {
     this.#agent = agent;
@@ -59,6 +63,12 @@ class AgentCallContext {
   get llmStream() {
     return this.#llmStream;
   }
+  get extraChatMessages() {
+    return this.#extraChatMessages;
+  }
+  addExtraChatMessage(message) {
+    this.#extraChatMessages.push(message);
+  }
 }
 const defaultBeforeLLMCallback = (agent, chatCtx) => {
   return agent.llm.chat({ chatCtx, fncCtx: agent.fncCtx });
@@ -80,7 +90,7 @@ const defaultVPAOptions = {
   interruptSpeechDuration: 50,
   interruptMinWords: 0,
   minEndpointingDelay: 500,
-
+  maxNestedFncCalls: 1,
   preemptiveSynthesis: false,
   beforeLLMCallback: defaultBeforeLLMCallback,
   beforeTTSCallback: defaultBeforeTTSCallback,
@@ -105,7 +115,6 @@ class VoicePipelineAgent extends EventEmitter {
   #transcribedInterimText = "";
   #speechQueueOpen = new Future();
   #speechQueue = new AsyncIterableQueue();
-  #lastEndOfSpeechTime;
   #updateStateTask;
   #started = false;
   #room;
@@ -113,6 +122,8 @@ class VoicePipelineAgent extends EventEmitter {
   #deferredValidation;
   #logger = log();
   #agentPublication;
+  #lastFinalTranscriptTime;
+  #lastSpeechTime;
   constructor(vad, stt, llm, tts, opts = defaultVPAOptions) {
     super();
     this.#opts = { ...defaultVPAOptions, ...opts };
@@ -157,6 +168,20 @@ class VoicePipelineAgent extends EventEmitter {
     if (this.#started) {
       throw new Error("voice assistant already started");
     }
+    this.#stt.on(SpeechEventType.METRICS_COLLECTED, (metrics) => {
+      this.emit(9 /* METRICS_COLLECTED */, metrics);
+    });
+    this.#tts.on(TTSEvent.METRICS_COLLECTED, (metrics) => {
+      if (!speechData) return;
+      this.emit(9 /* METRICS_COLLECTED */, { ...metrics, sequenceId: speechData.sequenceId });
+    });
+    this.#llm.on(LLMEvent.METRICS_COLLECTED, (metrics) => {
+      if (!speechData) return;
+      this.emit(9 /* METRICS_COLLECTED */, { ...metrics, sequenceId: speechData.sequenceId });
+    });
+    this.#vad.on(VADEventType.METRICS_COLLECTED, (metrics) => {
+      this.emit(9 /* METRICS_COLLECTED */, metrics);
+    });
     room.on(RoomEvent.ParticipantConnected, (participant2) => {
       if (this.#participant) {
         return;
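The wiring above re-emits each component's metrics on the agent, stamping LLM and TTS metrics with the sequenceId of the speech being synthesized. A sketch of grouping forwarded metrics per turn; since this diff does not spell out the full AgentMetrics shape, the sequenceId access is kept defensive:

```ts
import { pipeline } from '@livekit/agents';

declare const agent: pipeline.VoicePipelineAgent;

const perTurn = new Map<string, unknown[]>();
agent.on(pipeline.VPAEvent.METRICS_COLLECTED, (m) => {
  const key = (m as { sequenceId?: string }).sequenceId ?? 'unscoped';
  perTurn.set(key, [...(perTurn.get(key) ?? []), m]);
});
```
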
@@ -177,10 +202,43 @@ class VoicePipelineAgent extends EventEmitter {
   /** Play a speech source through the voice assistant. */
   async say(source, allowInterruptions = true, addToChatCtx = true) {
     await this.#trackPublishedFut.await;
+    let callContext;
+    let fncSource;
+    if (addToChatCtx) {
+      callContext = AgentCallContext.getCurrent();
+      if (source instanceof LLMStream) {
+        this.#logger.warn("LLMStream will be ignored for function call chat context");
+      } else if (typeof source === "string") {
+        fncSource = source;
+      } else {
+        fncSource = source;
+        source = new AsyncIterableQueue();
+      }
+    }
     const newHandle = SpeechHandle.createAssistantSpeech(allowInterruptions, addToChatCtx);
     const synthesisHandle = this.#synthesizeAgentSpeech(newHandle.id, source);
     newHandle.initialize(source, synthesisHandle);
-    this.#
+    if (this.#playingSpeech && !this.#playingSpeech.nestedSpeechFinished) {
+      this.#playingSpeech.addNestedSpeech(newHandle);
+    } else {
+      this.#addSpeechForPlayout(newHandle);
+    }
+    if (callContext && fncSource) {
+      let text;
+      if (typeof source === "string") {
+        text = fncSource;
+      } else {
+        text = "";
+        for await (const chunk of fncSource) {
+          source.put(chunk);
+          text += chunk;
+        }
+        source.close();
+      }
+      callContext.addExtraChatMessage(ChatMessage.create({ text, role: ChatRole.ASSISTANT }));
+      this.#logger.child({ text }).debug("added speech to function call chat context");
+    }
+    return newHandle;
   }
   #updateState(state, delay = 0) {
     const runTask = (delay2) => {
@@ -234,11 +292,13 @@ class VoicePipelineAgent extends EventEmitter {
       if (event.speechDuration >= this.#opts.interruptSpeechDuration) {
         this.#interruptIfPossible();
       }
+      if (event.rawAccumulatedSpeech > 0) {
+        this.#lastSpeechTime = Date.now() - event.rawAccumulatedSilence;
+      }
     });
     this.#humanInput.on(HumanInputEvent.END_OF_SPEECH, (event) => {
       this.emit(0 /* USER_STARTED_SPEAKING */);
       this.#deferredValidation.onHumanEndOfSpeech(event);
-      this.#lastEndOfSpeechTime = Date.now();
     });
     this.#humanInput.on(HumanInputEvent.INTERIM_TRANSCRIPT, (event) => {
       this.#transcribedInterimText = event.alternatives[0].text;
@@ -246,7 +306,7 @@ class VoicePipelineAgent extends EventEmitter {
     this.#humanInput.on(HumanInputEvent.FINAL_TRANSCRIPT, (event) => {
       const newTranscript = event.alternatives[0].text;
       if (!newTranscript) return;
-      this.#
+      this.#lastFinalTranscriptTime = Date.now();
       this.#transcribedText += (this.#transcribedText ? " " : "") + newTranscript;
       if (this.#opts.preemptiveSynthesis && (!this.#playingSpeech || this.#playingSpeech.allowInterruptions)) {
         this.#synthesizeAgentReply();
@@ -318,8 +378,7 @@ class VoicePipelineAgent extends EventEmitter {
       if ((!playingSpeech.userQuestion || playingSpeech.userCommitted) && !playingSpeech.speechCommitted) {
         copiedCtx.messages.push(
           ChatMessage.create({
-
-            // text: playingSpeech.synthesisHandle.(theres no ttsForwarder here)
+            text: playingSpeech.synthesisHandle.text,
             role: ChatRole.ASSISTANT
           })
         );
@@ -331,23 +390,26 @@ class VoicePipelineAgent extends EventEmitter {
           role: ChatRole.USER
         })
       );
-
-
-
-
-
-
-
-
-
-
-
-
+      speechData = { sequenceId: handle.id };
+      try {
+        if (cancelled) resolve();
+        let llmStream = await this.#opts.beforeLLMCallback(this, copiedCtx);
+        if (llmStream === false) {
+          handle == null ? void 0 : handle.cancel();
+          return;
+        }
+        if (cancelled) resolve();
+        if (!(llmStream instanceof LLMStream)) {
+          llmStream = await defaultBeforeLLMCallback(this, copiedCtx);
+        }
+        if (handle.interrupted) {
+          return;
+        }
+        const synthesisHandle = this.#synthesizeAgentSpeech(handle.id, llmStream);
+        handle.initialize(llmStream, synthesisHandle);
+      } finally {
+        speechData = void 0;
       }
-      const synthesisHandle = this.#synthesizeAgentSpeech(handle.id, llmStream);
-      handle.initialize(llmStream, synthesisHandle);
-      const elapsed = !!this.#lastEndOfSpeechTime ? Math.round((Date.now() - this.#lastEndOfSpeechTime) * 1e3) / 1e3 : -1;
-      this.#logger.child({ speechId: handle.id, elapsed }).debug("synthesizing agent reply");
       resolve();
     });
   }
@@ -387,62 +449,83 @@ class VoicePipelineAgent extends EventEmitter {
       if (handle.interrupted) break;
     }
     commitUserQuestionIfNeeded();
-
+    const collectedText = handle.synthesisHandle.text;
     const isUsingTools = handle.source instanceof LLMStream && !!handle.source.functionCalls.length;
-    const
-
-
+    const interrupted = handle.interrupted;
+    const executeFunctionCalls = async () => {
+      if (!isUsingTools || interrupted) return;
+      if (handle.fncNestedDepth >= this.#opts.maxNestedFncCalls) {
+        this.#logger.child({ speechId: handle.id, fncNestedDepth: handle.fncNestedDepth }).warn("max function calls nested depth reached");
+        return;
+      }
       if (!userQuestion || !handle.userCommitted) {
         throw new Error("user speech should have been committed before using tools");
       }
       const llmStream = handle.source;
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-      }
-    }
-    const toolCallsInfo = [];
-    const toolCallsResults = [];
-    for (const fnc of calledFuncs) {
-      const task = await fnc.task;
-      if (!task || task.result === void 0) continue;
-      toolCallsInfo.push(fnc);
-      toolCallsResults.push(ChatMessage.createToolFromFunctionResult(task));
+      const newFunctionCalls = llmStream.functionCalls;
+      new AgentCallContext(this, llmStream);
+      this.emit(7 /* FUNCTION_CALLS_COLLECTED */, newFunctionCalls);
+      const calledFuncs = [];
+      for (const func of newFunctionCalls) {
+        const task2 = func.func.execute(func.params).then(
+          (result) => ({ name: func.name, toolCallId: func.toolCallId, result }),
+          (error) => ({ name: func.name, toolCallId: func.toolCallId, error })
+        );
+        calledFuncs.push({ ...func, task: task2 });
+        this.#logger.child({ function: func.name, speechId: handle.id }).debug("executing AI function");
+        try {
+          await task2;
+        } catch {
+          this.#logger.child({ function: func.name, speechId: handle.id }).error("error executing AI function");
        }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+      }
+      const toolCallsInfo = [];
+      const toolCallsResults = [];
+      for (const fnc of calledFuncs) {
+        const task2 = await fnc.task;
+        if (!task2 || task2.result === void 0) continue;
+        toolCallsInfo.push(fnc);
+        toolCallsResults.push(ChatMessage.createToolFromFunctionResult(task2));
+      }
+      if (!toolCallsInfo.length) return;
+      const extraToolsMessages = [ChatMessage.createToolCalls(toolCallsInfo, collectedText)];
+      extraToolsMessages.push(...toolCallsResults);
+      const newSpeechHandle = SpeechHandle.createToolSpeech(
+        handle.allowInterruptions,
+        handle.addToChatCtx,
+        handle.fncNestedDepth + 1,
+        extraToolsMessages
+      );
+      const chatCtx = handle.source.chatCtx.copy();
+      chatCtx.messages.push(...extraToolsMessages);
+      chatCtx.messages.push(...AgentCallContext.getCurrent().extraChatMessages);
+      const answerLLMStream = this.llm.chat({
+        chatCtx,
+        fncCtx: this.fncCtx
+      });
+      const answerSynthesis = this.#synthesizeAgentSpeech(newSpeechHandle.id, answerLLMStream);
+      newSpeechHandle.initialize(answerLLMStream, answerSynthesis);
+      handle.addNestedSpeech(newSpeechHandle);
+      this.emit(8 /* FUNCTION_CALLS_FINISHED */, calledFuncs);
+    };
+    const task = executeFunctionCalls().then(() => {
+      handle.markNestedSpeechFinished();
+    });
+    while (!handle.nestedSpeechFinished) {
+      const changed = handle.nestedSpeechChanged();
+      await Promise.race([changed, task]);
+      while (handle.nestedSpeechHandles.length) {
+        const speech = handle.nestedSpeechHandles[0];
+        this.#playingSpeech = speech;
+        await this.#playSpeech(speech);
+        handle.nestedSpeechHandles.shift();
+        this.#playingSpeech = handle;
       }
     }
     if (handle.addToChatCtx && (!userQuestion || handle.userCommitted)) {
-
+      if (handle.extraToolsMessages) {
+        this.chatCtx.messages.push(...handle.extraToolsMessages);
+      }
       if (interrupted) {
         collectedText + "\u2026";
       }
@@ -459,6 +542,7 @@ class VoicePipelineAgent extends EventEmitter {
         interrupted,
         speechId: handle.id
       }).debug("committed agent speech");
+      handle.setDone();
     }
   }
   #synthesizeAgentSpeech(speechId, source) {
@@ -499,6 +583,20 @@ class VoicePipelineAgent extends EventEmitter {
       }
     }
     this.#logger.child({ speechId: this.#pendingAgentReply.id }).debug("validated agent reply");
+    if (this.#lastSpeechTime) {
+      const timeSinceLastSpeech = Date.now() - this.#lastSpeechTime;
+      const transcriptionDelay = Math.max(
+        (this.#lastFinalTranscriptTime || 0) - this.#lastSpeechTime,
+        0
+      );
+      const metrics = {
+        timestamp: Date.now(),
+        sequenceId: this.#pendingAgentReply.id,
+        endOfUtteranceDelay: timeSinceLastSpeech,
+        transcriptionDelay
+      };
+      this.emit(9 /* METRICS_COLLECTED */, metrics);
+    }
     this.#addSpeechForPlayout(this.#pendingAgentReply);
     this.#pendingAgentReply = void 0;
     this.#transcribedInterimText = "";
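The final hunk derives end-of-utterance metrics from the two timestamps the agent now tracks: #lastSpeechTime (when VAD last heard the user, adjusted for trailing silence) and #lastFinalTranscriptTime (when the final STT transcript arrived). Restated as plain functions with the same formulas, in milliseconds:

```ts
// endOfUtteranceDelay: how long after the user stopped speaking the reply was validated.
// transcriptionDelay: how long after end of speech the final transcript arrived (never negative).
function computeEouMetrics(
  now: number,
  lastSpeechTime: number,
  lastFinalTranscriptTime?: number,
) {
  return {
    endOfUtteranceDelay: now - lastSpeechTime,
    transcriptionDelay: Math.max((lastFinalTranscriptTime ?? 0) - lastSpeechTime, 0),
  };
}

// Example: speech ends at t=10000, final transcript at t=10250, reply validated at t=10600
// -> endOfUtteranceDelay = 600, transcriptionDelay = 250.
```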