@livekit/agents 0.3.4 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +1 -1
- package/CHANGELOG.md +40 -0
- package/dist/audio.js +17 -30
- package/dist/audio.js.map +1 -1
- package/dist/cli.js +3 -14
- package/dist/cli.js.map +1 -1
- package/dist/http_server.d.ts +1 -1
- package/dist/http_server.js +5 -9
- package/dist/http_server.js.map +1 -1
- package/dist/index.d.ts +3 -2
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +14 -2
- package/dist/index.js.map +1 -1
- package/dist/ipc/job_executor.js +3 -5
- package/dist/ipc/job_executor.js.map +1 -1
- package/dist/ipc/job_main.d.ts +1 -1
- package/dist/ipc/proc_job_executor.js +66 -80
- package/dist/ipc/proc_job_executor.js.map +1 -1
- package/dist/ipc/proc_pool.d.ts +3 -3
- package/dist/ipc/proc_pool.d.ts.map +1 -1
- package/dist/ipc/proc_pool.js +16 -11
- package/dist/ipc/proc_pool.js.map +1 -1
- package/dist/job.js +56 -73
- package/dist/job.js.map +1 -1
- package/dist/llm/chat_context.d.ts +66 -0
- package/dist/llm/chat_context.d.ts.map +1 -0
- package/dist/llm/chat_context.js +93 -0
- package/dist/llm/chat_context.js.map +1 -0
- package/dist/llm/function_context.d.ts +19 -1
- package/dist/llm/function_context.d.ts.map +1 -1
- package/dist/llm/function_context.js +54 -18
- package/dist/llm/function_context.js.map +1 -1
- package/dist/llm/function_context.test.d.ts +2 -0
- package/dist/llm/function_context.test.d.ts.map +1 -0
- package/dist/llm/function_context.test.js +218 -0
- package/dist/llm/function_context.test.js.map +1 -0
- package/dist/llm/index.d.ts +3 -2
- package/dist/llm/index.d.ts.map +1 -1
- package/dist/llm/index.js +3 -2
- package/dist/llm/index.js.map +1 -1
- package/dist/llm/llm.d.ts +53 -0
- package/dist/llm/llm.d.ts.map +1 -0
- package/dist/llm/llm.js +45 -0
- package/dist/llm/llm.js.map +1 -0
- package/dist/multimodal/agent_playout.d.ts +1 -1
- package/dist/multimodal/agent_playout.js +116 -153
- package/dist/multimodal/agent_playout.js.map +1 -1
- package/dist/multimodal/multimodal_agent.d.ts +4 -3
- package/dist/multimodal/multimodal_agent.d.ts.map +1 -1
- package/dist/multimodal/multimodal_agent.js +214 -237
- package/dist/multimodal/multimodal_agent.js.map +1 -1
- package/dist/pipeline/agent_output.d.ts +30 -0
- package/dist/pipeline/agent_output.d.ts.map +1 -0
- package/dist/pipeline/agent_output.js +155 -0
- package/dist/pipeline/agent_output.js.map +1 -0
- package/dist/pipeline/agent_playout.d.ts +38 -0
- package/dist/pipeline/agent_playout.d.ts.map +1 -0
- package/dist/pipeline/agent_playout.js +142 -0
- package/dist/pipeline/agent_playout.js.map +1 -0
- package/dist/pipeline/human_input.d.ts +28 -0
- package/dist/pipeline/human_input.d.ts.map +1 -0
- package/dist/pipeline/human_input.js +134 -0
- package/dist/pipeline/human_input.js.map +1 -0
- package/dist/pipeline/index.d.ts +2 -0
- package/dist/pipeline/index.d.ts.map +1 -0
- package/dist/pipeline/index.js +5 -0
- package/dist/pipeline/index.js.map +1 -0
- package/dist/pipeline/pipeline_agent.d.ts +134 -0
- package/dist/pipeline/pipeline_agent.d.ts.map +1 -0
- package/dist/pipeline/pipeline_agent.js +661 -0
- package/dist/pipeline/pipeline_agent.js.map +1 -0
- package/dist/pipeline/speech_handle.d.ts +27 -0
- package/dist/pipeline/speech_handle.d.ts.map +1 -0
- package/dist/pipeline/speech_handle.js +102 -0
- package/dist/pipeline/speech_handle.js.map +1 -0
- package/dist/plugin.js +7 -20
- package/dist/plugin.js.map +1 -1
- package/dist/stt/index.d.ts +1 -2
- package/dist/stt/index.d.ts.map +1 -1
- package/dist/stt/index.js +1 -2
- package/dist/stt/index.js.map +1 -1
- package/dist/stt/stt.d.ts +62 -24
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +77 -27
- package/dist/stt/stt.js.map +1 -1
- package/dist/tokenize/basic/basic.d.ts +16 -0
- package/dist/tokenize/basic/basic.d.ts.map +1 -0
- package/dist/tokenize/basic/basic.js +50 -0
- package/dist/tokenize/basic/basic.js.map +1 -0
- package/dist/tokenize/basic/hyphenator.d.ts +17 -0
- package/dist/tokenize/basic/hyphenator.d.ts.map +1 -0
- package/dist/tokenize/basic/hyphenator.js +420 -0
- package/dist/tokenize/basic/hyphenator.js.map +1 -0
- package/dist/tokenize/basic/index.d.ts +2 -0
- package/dist/tokenize/basic/index.d.ts.map +1 -0
- package/dist/tokenize/basic/index.js +5 -0
- package/dist/tokenize/basic/index.js.map +1 -0
- package/dist/tokenize/basic/paragraph.d.ts +5 -0
- package/dist/tokenize/basic/paragraph.d.ts.map +1 -0
- package/dist/tokenize/basic/paragraph.js +38 -0
- package/dist/tokenize/basic/paragraph.js.map +1 -0
- package/dist/tokenize/basic/sentence.d.ts +5 -0
- package/dist/tokenize/basic/sentence.d.ts.map +1 -0
- package/dist/tokenize/basic/sentence.js +60 -0
- package/dist/tokenize/basic/sentence.js.map +1 -0
- package/dist/tokenize/basic/word.d.ts +5 -0
- package/dist/tokenize/basic/word.d.ts.map +1 -0
- package/dist/tokenize/basic/word.js +23 -0
- package/dist/tokenize/basic/word.js.map +1 -0
- package/dist/tokenize/index.d.ts +5 -0
- package/dist/tokenize/index.d.ts.map +1 -0
- package/dist/tokenize/index.js +8 -0
- package/dist/tokenize/index.js.map +1 -0
- package/dist/tokenize/token_stream.d.ts +36 -0
- package/dist/tokenize/token_stream.d.ts.map +1 -0
- package/dist/tokenize/token_stream.js +136 -0
- package/dist/tokenize/token_stream.js.map +1 -0
- package/dist/tokenize/tokenizer.d.ts +55 -0
- package/dist/tokenize/tokenizer.d.ts.map +1 -0
- package/dist/tokenize/tokenizer.js +117 -0
- package/dist/tokenize/tokenizer.js.map +1 -0
- package/dist/transcription.js +78 -89
- package/dist/transcription.js.map +1 -1
- package/dist/tts/index.d.ts +1 -3
- package/dist/tts/index.d.ts.map +1 -1
- package/dist/tts/index.js +1 -3
- package/dist/tts/index.js.map +1 -1
- package/dist/tts/tts.d.ts +66 -37
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js +79 -74
- package/dist/tts/tts.js.map +1 -1
- package/dist/utils.d.ts +21 -6
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +120 -76
- package/dist/utils.js.map +1 -1
- package/dist/vad.d.ts +43 -39
- package/dist/vad.d.ts.map +1 -1
- package/dist/vad.js +51 -4
- package/dist/vad.js.map +1 -1
- package/dist/worker.d.ts +1 -1
- package/dist/worker.js +257 -247
- package/dist/worker.js.map +1 -1
- package/package.json +4 -3
- package/src/index.ts +16 -2
- package/src/ipc/proc_pool.ts +4 -4
- package/src/llm/chat_context.ts +147 -0
- package/src/llm/function_context.test.ts +248 -0
- package/src/llm/function_context.ts +77 -18
- package/src/llm/index.ts +21 -2
- package/src/llm/llm.ts +102 -0
- package/src/multimodal/multimodal_agent.ts +19 -6
- package/src/pipeline/agent_output.ts +185 -0
- package/src/pipeline/agent_playout.ts +187 -0
- package/src/pipeline/human_input.ts +166 -0
- package/src/pipeline/index.ts +15 -0
- package/src/pipeline/pipeline_agent.ts +917 -0
- package/src/pipeline/speech_handle.ts +136 -0
- package/src/stt/index.ts +8 -2
- package/src/stt/stt.ts +98 -31
- package/src/tokenize/basic/basic.ts +73 -0
- package/src/tokenize/basic/hyphenator.ts +436 -0
- package/src/tokenize/basic/index.ts +5 -0
- package/src/tokenize/basic/paragraph.ts +43 -0
- package/src/tokenize/basic/sentence.ts +69 -0
- package/src/tokenize/basic/word.ts +27 -0
- package/src/tokenize/index.ts +16 -0
- package/src/tokenize/token_stream.ts +163 -0
- package/src/tokenize/tokenizer.ts +152 -0
- package/src/tts/index.ts +1 -20
- package/src/tts/tts.ts +110 -57
- package/src/utils.ts +95 -25
- package/src/vad.ts +86 -45
- package/tsconfig.tsbuildinfo +1 -1
- package/dist/stt/stream_adapter.d.ts +0 -19
- package/dist/stt/stream_adapter.d.ts.map +0 -1
- package/dist/stt/stream_adapter.js +0 -96
- package/dist/stt/stream_adapter.js.map +0 -1
- package/dist/tokenize.d.ts +0 -15
- package/dist/tokenize.d.ts.map +0 -1
- package/dist/tokenize.js +0 -12
- package/dist/tokenize.js.map +0 -1
- package/dist/tts/stream_adapter.d.ts +0 -19
- package/dist/tts/stream_adapter.d.ts.map +0 -1
- package/dist/tts/stream_adapter.js +0 -111
- package/dist/tts/stream_adapter.js.map +0 -1
- package/src/stt/stream_adapter.ts +0 -104
- package/src/tokenize.ts +0 -22
- package/src/tts/stream_adapter.ts +0 -93
package/src/llm/index.ts
CHANGED
|
@@ -1,11 +1,30 @@
|
|
|
1
1
|
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
-
|
|
4
|
+
export {
|
|
5
5
|
type CallableFunction,
|
|
6
|
+
type FunctionCallInfo,
|
|
7
|
+
type CallableFunctionResult,
|
|
6
8
|
type FunctionContext,
|
|
7
9
|
type inferParameters,
|
|
8
10
|
oaiParams,
|
|
11
|
+
oaiBuildFunctionInfo,
|
|
9
12
|
} from './function_context.js';
|
|
10
13
|
|
|
11
|
-
export {
|
|
14
|
+
export {
|
|
15
|
+
type ChatImage,
|
|
16
|
+
type ChatAudio,
|
|
17
|
+
type ChatContent,
|
|
18
|
+
ChatRole,
|
|
19
|
+
ChatMessage,
|
|
20
|
+
ChatContext,
|
|
21
|
+
} from './chat_context.js';
|
|
22
|
+
|
|
23
|
+
export {
|
|
24
|
+
type ChoiceDelta,
|
|
25
|
+
type CompletionUsage,
|
|
26
|
+
type Choice,
|
|
27
|
+
type ChatChunk,
|
|
28
|
+
LLM,
|
|
29
|
+
LLMStream,
|
|
30
|
+
} from './llm.js';
|
package/src/llm/llm.ts
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
|
|
2
|
+
//
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import { AsyncIterableQueue } from '../utils.js';
|
|
5
|
+
import type { ChatContext, ChatRole } from './chat_context.js';
|
|
6
|
+
import type { FunctionCallInfo, FunctionContext } from './function_context.js';
|
|
7
|
+
|
|
8
|
+
/** Delta payload carried by a {@link Choice}. */
export interface ChoiceDelta {
  /** Role attached to this delta. */
  role: ChatRole;
  /** Text content of the delta, if any. */
  content?: string;
  /** Function calls attached to the delta, if any. */
  toolCalls?: Array<FunctionCallInfo>;
}
|
|
13
|
+
|
|
14
|
+
/** Token counters optionally reported on a {@link ChatChunk}. */
export interface CompletionUsage {
  /** Number of prompt tokens. */
  promptTokens: number;
  /** Number of completion tokens. */
  completionTokens: number;
  /** Total number of tokens. */
  totalTokens: number;
}
|
|
19
|
+
|
|
20
|
+
/** One entry of {@link ChatChunk.choices}: a delta together with its index. */
export interface Choice {
  /** Delta payload of this choice. */
  delta: ChoiceDelta;
  /** Index of this choice. */
  index: number;
}
|
|
24
|
+
|
|
25
|
+
/** Item type yielded by an {@link LLMStream}. */
export interface ChatChunk {
  /** Identifier of the request this chunk belongs to. */
  requestId: string;
  /** Choices carried by this chunk. */
  choices: Array<Choice>;
  /** Token usage, when present. */
  usage?: CompletionUsage;
}
|
|
30
|
+
|
|
31
|
+
/** Abstract base for LLM implementations; subclasses provide {@link LLM.chat}. */
export abstract class LLM {
  /**
   * Returns the {@link LLMStream} for a chat request described by `options`.
   *
   * @param options - Request options: the chat context (required) plus optional function
   *   context, `temperature`, `n` and `parallelToolCalls` settings.
   */
  abstract chat(options: {
    chatCtx: ChatContext;
    fncCtx?: FunctionContext;
    temperature?: number;
    n?: number;
    parallelToolCalls?: boolean;
  }): LLMStream;
}
|
|
49
|
+
|
|
50
|
+
/**
 * Async-iterable stream of {@link ChatChunk}s.
 *
 * Chunks are read from the protected `queue`, and `_functionCalls` holds the calls that
 * {@link LLMStream.executeFunctions} runs. This base class never fills either of them.
 */
export abstract class LLMStream implements AsyncIterableIterator<ChatChunk> {
  protected queue = new AsyncIterableQueue<ChatChunk>();
  protected closed = false;
  protected _functionCalls: FunctionCallInfo[] = [];

  #chatCtx: ChatContext;
  #fncCtx?: FunctionContext;

  /**
   * @param chatCtx - Chat context exposed through {@link LLMStream.chatCtx}.
   * @param fncCtx - Optional function context exposed through {@link LLMStream.fncCtx}.
   */
  constructor(chatCtx: ChatContext, fncCtx?: FunctionContext) {
    this.#chatCtx = chatCtx;
    this.#fncCtx = fncCtx;
  }

  /** Function calls collected on this stream. */
  get functionCalls(): FunctionCallInfo[] {
    return this._functionCalls;
  }

  /** Function context this stream was created with, if any. */
  get fncCtx(): FunctionContext | undefined {
    return this.#fncCtx;
  }

  /** Chat context this stream was created with. */
  get chatCtx(): ChatContext {
    return this.#chatCtx;
  }

  /**
   * Starts every collected function call without awaiting any of them and stores the
   * pending outcome on each call's `task`.
   *
   * A task never rejects: it settles to `{ name, toolCallId, result }` on success or
   * `{ name, toolCallId, error }` on failure.
   *
   * @returns The same list of function calls, now with `task` populated.
   */
  executeFunctions(): FunctionCallInfo[] {
    this._functionCalls.forEach((call) => {
      const pending = call.func.execute(call.params);
      call.task = pending.then(
        (result) => ({ name: call.name, toolCallId: call.toolCallId, result }),
        (error) => ({ name: call.name, toolCallId: call.toolCallId, error }),
      );
    });
    return this._functionCalls;
  }

  /** Pulls the next chunk from the internal queue. */
  next(): Promise<IteratorResult<ChatChunk>> {
    return this.queue.next();
  }

  /** Closes the internal queue and marks the stream as closed. */
  close() {
    this.queue.close();
    this.closed = true;
  }

  /** The stream is its own async iterator. */
  [Symbol.asyncIterator](): LLMStream {
    return this;
  }
}
|
|
@@ -64,13 +64,16 @@ export class MultimodalAgent extends EventEmitter {
|
|
|
64
64
|
|
|
65
65
|
constructor({
|
|
66
66
|
model,
|
|
67
|
+
chatCtx,
|
|
67
68
|
fncCtx,
|
|
68
69
|
}: {
|
|
69
70
|
model: RealtimeModel;
|
|
70
|
-
|
|
71
|
+
chatCtx?: llm.ChatContext;
|
|
72
|
+
fncCtx?: llm.FunctionContext;
|
|
71
73
|
}) {
|
|
72
74
|
super();
|
|
73
75
|
this.model = model;
|
|
76
|
+
this.#chatCtx = chatCtx;
|
|
74
77
|
this.#fncCtx = fncCtx;
|
|
75
78
|
}
|
|
76
79
|
|
|
@@ -83,6 +86,7 @@ export class MultimodalAgent extends EventEmitter {
|
|
|
83
86
|
#logger = log();
|
|
84
87
|
#session: RealtimeSession | null = null;
|
|
85
88
|
#fncCtx: llm.FunctionContext | undefined = undefined;
|
|
89
|
+
#chatCtx: llm.ChatContext | undefined = undefined;
|
|
86
90
|
|
|
87
91
|
#_started: boolean = false;
|
|
88
92
|
#_pendingFunctionCalls: Set<string> = new Set();
|
|
@@ -143,10 +147,19 @@ export class MultimodalAgent extends EventEmitter {
|
|
|
143
147
|
}
|
|
144
148
|
this.#linkParticipant(participant.identity);
|
|
145
149
|
});
|
|
146
|
-
room.on(
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
+
room.on(
|
|
151
|
+
RoomEvent.TrackPublished,
|
|
152
|
+
(trackPublication: RemoteTrackPublication, participant: RemoteParticipant) => {
|
|
153
|
+
if (
|
|
154
|
+
this.linkedParticipant &&
|
|
155
|
+
participant.identity === this.linkedParticipant.identity &&
|
|
156
|
+
trackPublication.source === TrackSource.SOURCE_MICROPHONE &&
|
|
157
|
+
!trackPublication.subscribed
|
|
158
|
+
) {
|
|
159
|
+
trackPublication.setSubscribed(true);
|
|
160
|
+
}
|
|
161
|
+
},
|
|
162
|
+
);
|
|
150
163
|
room.on(RoomEvent.TrackSubscribed, this.#handleTrackSubscription.bind(this));
|
|
151
164
|
|
|
152
165
|
this.room = room;
|
|
@@ -200,7 +213,7 @@ export class MultimodalAgent extends EventEmitter {
|
|
|
200
213
|
}
|
|
201
214
|
}
|
|
202
215
|
|
|
203
|
-
this.#session = this.model.session({ fncCtx: this.#fncCtx });
|
|
216
|
+
this.#session = this.model.session({ fncCtx: this.#fncCtx, chatCtx: this.#chatCtx });
|
|
204
217
|
this.#started = true;
|
|
205
218
|
|
|
206
219
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
|
|
2
|
+
//
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import type { AudioFrame } from '@livekit/rtc-node';
|
|
5
|
+
import { log } from '../log.js';
|
|
6
|
+
import { SynthesizeStream, type TTS } from '../tts/index.js';
|
|
7
|
+
import { AsyncIterableQueue, CancellablePromise, Future, gracefullyCancel } from '../utils.js';
|
|
8
|
+
import type { AgentPlayout, PlayoutHandle } from './agent_playout.js';
|
|
9
|
+
|
|
10
|
+
/** Text to synthesize: a plain string, a promise of one, or an async stream of string chunks. */
export type SpeechSource = string | Promise<string> | AsyncIterable<string>;
|
|
11
|
+
|
|
12
|
+
/**
 * Handle for one synthesis job: carries the text source, the queue that receives the
 * synthesized audio frames, and the interruption state.
 */
export class SynthesisHandle {
  /** Marker placed on `queue` after the last audio frame of a synthesis. */
  static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');

  #speechId: string;
  ttsSource: SpeechSource;
  #agentPlayout: AgentPlayout;
  tts: TTS;
  queue = new AsyncIterableQueue<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>();
  #playHandle?: PlayoutHandle;
  /** Resolved by {@link SynthesisHandle.interrupt}. */
  intFut = new Future();
  #logger = log();

  /**
   * @param speechId - Identifier used for playout and logging.
   * @param ttsSource - Text to synthesize.
   * @param agentPlayout - Playout that {@link SynthesisHandle.play} hands the queue to.
   * @param tts - TTS instance used by the synthesis tasks.
   */
  constructor(speechId: string, ttsSource: SpeechSource, agentPlayout: AgentPlayout, tts: TTS) {
    this.#speechId = speechId;
    this.ttsSource = ttsSource;
    this.#agentPlayout = agentPlayout;
    this.tts = tts;
  }

  /** Identifier of the speech being synthesized. */
  get speechId(): string {
    return this.#speechId;
  }

  /** Whether {@link SynthesisHandle.play} has already produced a playout handle. */
  get validated(): boolean {
    return Boolean(this.#playHandle);
  }

  /** Whether the interruption future has completed. */
  get interrupted(): boolean {
    return this.intFut.done;
  }

  /** Playout handle created by {@link SynthesisHandle.play}, if any. */
  get playHandle(): PlayoutHandle | undefined {
    return this.#playHandle;
  }

  /**
   * Validates the speech for playout by handing the frame queue to the agent playout.
   *
   * @returns The playout handle for this speech.
   * @throws Error if the synthesis was already interrupted.
   */
  play(): PlayoutHandle {
    if (this.interrupted) {
      throw new Error('synthesis was interrupted');
    }

    const playHandle = this.#agentPlayout.play(this.#speechId, this.queue);
    this.#playHandle = playHandle;
    return playHandle;
  }

  /** Interrupts synthesis and any started playout; does nothing if already interrupted. */
  interrupt() {
    if (!this.interrupted) {
      this.#logger.child({ speechId: this.#speechId }).debug('interrupting synthesis/playout');
      this.#playHandle?.interrupt();
      this.intFut.resolve();
    }
  }
}
|
|
68
|
+
|
|
69
|
+
/**
 * Creates synthesis jobs for an {@link AgentPlayout} / TTS pair and keeps track of the
 * jobs that are still running so they can be cancelled on close.
 */
export class AgentOutput {
  #agentPlayout: AgentPlayout;
  #tts: TTS;
  /** Synthesis tasks that have been started and have not settled yet. */
  #tasks: CancellablePromise<void>[] = [];

  /**
   * @param agentPlayout - Playout passed to every created {@link SynthesisHandle}.
   * @param tts - TTS instance passed to every created {@link SynthesisHandle}.
   */
  constructor(agentPlayout: AgentPlayout, tts: TTS) {
    this.#agentPlayout = agentPlayout;
    this.#tts = tts;
  }

  /** The playout this output feeds. */
  get playout(): AgentPlayout {
    return this.#agentPlayout;
  }

  /** Cancels every tracked synthesis task and waits for all of them. */
  async close() {
    this.#tasks.forEach((task) => task.cancel());
    await Promise.all(this.#tasks);
  }

  /**
   * Starts synthesizing `ttsSource` and returns the handle controlling it.
   *
   * @param speechId - Identifier of the speech.
   * @param ttsSource - Text to synthesize.
   * @returns The handle whose queue receives the synthesized frames.
   */
  synthesize(speechId: string, ttsSource: SpeechSource): SynthesisHandle {
    const handle = new SynthesisHandle(speechId, ttsSource, this.#agentPlayout, this.#tts);
    const task = this.#synthesize(handle);
    this.#tasks.push(task);
    task.finally(() => {
      // Remove exactly this task. `splice(index)` without a delete count would also drop
      // every task started after it, and `splice(-1)` would drop an unrelated one.
      const index = this.#tasks.indexOf(task);
      if (index !== -1) {
        this.#tasks.splice(index, 1);
      }
    });
    return handle;
  }

  /**
   * Runs the string or stream synthesis task for `handle` until the task finishes or the
   * handle is interrupted, whichever comes first.
   */
  #synthesize(handle: SynthesisHandle): CancellablePromise<void> {
    // eslint-disable-next-line @typescript-eslint/no-unused-vars
    return new CancellablePromise(async (resolve, _, onCancel) => {
      // The source may be a promise; a plain string or an async iterable passes through.
      const ttsSource = await handle.ttsSource;
      let task: CancellablePromise<void>;
      if (typeof ttsSource === 'string') {
        task = stringSynthesisTask(ttsSource, handle);
      } else {
        task = streamSynthesisTask(ttsSource, handle);
      }

      onCancel(() => {
        gracefullyCancel(task);
      });

      try {
        await Promise.any([task, handle.intFut.await]);
      } finally {
        // An interruption won the race: stop the still-running synthesis task.
        if (handle.intFut.done) {
          gracefullyCancel(task);
        }
      }

      resolve();
    });
  }
}
|
|
123
|
+
|
|
124
|
+
/**
 * Synthesizes a complete string through a new TTS stream and forwards each produced frame
 * to `handle.queue`, followed by {@link SynthesisHandle.FLUSH_SENTINEL}.
 *
 * Forwarding stops early once the task is cancelled or the stream yields
 * `SynthesizeStream.END_OF_STREAM`; the sentinel is queued in either case.
 */
const stringSynthesisTask = (text: string, handle: SynthesisHandle): CancellablePromise<void> => {
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  return new CancellablePromise<void>(async (resolve, _, onCancel) => {
    let isCancelled = false;
    onCancel(() => {
      isCancelled = true;
    });

    // The whole input is known up front, so push it and close the input immediately.
    const synthStream = handle.tts.stream();
    synthStream.pushText(text);
    synthStream.flush();
    synthStream.endInput();

    for await (const chunk of synthStream) {
      if (isCancelled) {
        break;
      }
      if (chunk === SynthesizeStream.END_OF_STREAM) {
        break;
      }
      handle.queue.put(chunk.frame);
    }
    handle.queue.put(SynthesisHandle.FLUSH_SENTINEL);

    resolve();
  });
};
|
|
145
|
+
|
|
146
|
+
/**
 * Synthesizes an async stream of text chunks through a new TTS stream.
 *
 * Text is pushed to the TTS stream as it arrives while a concurrent reader forwards the
 * produced frames to `handle.queue`. The reader always finishes by queueing
 * {@link SynthesisHandle.FLUSH_SENTINEL}, including when reading fails, and a read failure
 * is logged instead of surfacing as an unhandled promise rejection.
 *
 * The returned promise resolves once all text has been pushed and the TTS input is ended;
 * it does not wait for the audio reader.
 */
const streamSynthesisTask = (
  stream: AsyncIterable<string>,
  handle: SynthesisHandle,
): CancellablePromise<void> => {
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  return new CancellablePromise<void>(async (resolve, _, onCancel) => {
    let cancelled = false;
    onCancel(() => {
      cancelled = true;
    });

    const ttsStream = handle.tts.stream();
    const readGeneratedAudio = async () => {
      let started = false;
      try {
        for await (const audio of ttsStream) {
          if (cancelled) break;
          if (audio === SynthesizeStream.END_OF_STREAM) {
            // An end-of-stream marker seen before any audio is skipped; only one that
            // follows real audio ends the read loop.
            if (started) {
              break;
            } else {
              continue;
            }
          }
          handle.queue.put(audio.frame);
          started = true;
        }
      } finally {
        // Queue the sentinel on failure too, so a consumer of the queue is not left
        // waiting for frames that will never arrive.
        handle.queue.put(SynthesisHandle.FLUSH_SENTINEL);
      }
    };
    // Runs concurrently with the text loop below; it is intentionally not awaited, so
    // failures are handled here.
    readGeneratedAudio().catch((error: unknown) => {
      log()
        .child({ speechId: handle.speechId, error })
        .error('failed to read synthesized audio');
    });

    for await (const text of stream) {
      if (cancelled) break;
      ttsStream.pushText(text);
    }
    ttsStream.flush();
    ttsStream.endInput();

    resolve();
  });
};
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
|
|
2
|
+
//
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import type { AudioFrame, AudioSource } from '@livekit/rtc-node';
|
|
5
|
+
import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
|
|
6
|
+
import EventEmitter from 'node:events';
|
|
7
|
+
import { log } from '../log.js';
|
|
8
|
+
import { CancellablePromise, Future, gracefullyCancel } from '../utils.js';
|
|
9
|
+
import { SynthesisHandle } from './agent_output.js';
|
|
10
|
+
|
|
11
|
+
/** Events emitted by {@link AgentPlayout}. */
export enum AgentPlayoutEvent {
  /** Emitted just before the first frame of a playout is captured. */
  PLAYOUT_STARTED = 0,
  /** Emitted when a playout that had started finishes or is cancelled. */
  PLAYOUT_STOPPED = 1,
}
|
|
15
|
+
|
|
16
|
+
/** Listener signatures for each {@link AgentPlayoutEvent}. */
export type AgentPlayoutCallbacks = {
  /** Called without arguments when a playout starts. */
  [AgentPlayoutEvent.PLAYOUT_STARTED]: () => void;
  /** Called when a playout stops; `interrupt` is the handle's interrupted flag. */
  [AgentPlayoutEvent.PLAYOUT_STOPPED]: (interrupt: boolean) => void;
};
|
|
20
|
+
|
|
21
|
+
/**
 * Handle for one playout on an audio source: exposes interruption, completion and
 * played-time bookkeeping for a single speech.
 */
export class PlayoutHandle {
  #speechId: string;
  #audioSource: AudioSource;
  /** Frames to play, terminated by {@link SynthesisHandle.FLUSH_SENTINEL}. */
  playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>;
  /** Final played time, set by the playout task when it finishes or is cancelled. */
  totalPlayedTime?: number;
  #interrupted = false;
  /** Accumulated duration of the frames pushed to the audio source so far. */
  pushedDuration = 0;
  /** Resolved by {@link PlayoutHandle.interrupt}. */
  intFut = new Future();
  /** Resolved by the playout task once the playout is over. */
  doneFut = new Future();

  /**
   * @param speechId - Identifier of the speech being played.
   * @param audioSource - Audio source whose queued duration is used for time accounting.
   * @param playoutSource - Frames to play.
   */
  constructor(
    speechId: string,
    audioSource: AudioSource,
    playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>,
  ) {
    this.#speechId = speechId;
    this.#audioSource = audioSource;
    this.playoutSource = playoutSource;
  }

  /** Identifier of the speech being played. */
  get speechId(): string {
    return this.#speechId;
  }

  /** Whether {@link PlayoutHandle.interrupt} took effect on this handle. */
  get interrupted(): boolean {
    return this.#interrupted;
  }

  /**
   * Played time: the recorded total once the playout task has set it, otherwise the
   * pushed duration minus what is still queued on the audio source.
   */
  get timePlayed(): number {
    // `??` rather than `||`: a recorded total of 0 is a valid final value and must not
    // fall back to the live estimate.
    return this.totalPlayedTime ?? this.pushedDuration - this.#audioSource.queuedDuration;
  }

  /** Whether the playout has finished or was interrupted. */
  get done(): boolean {
    return this.doneFut.done || this.#interrupted;
  }

  /** Interrupts the playout; does nothing if it is already done. */
  interrupt() {
    if (this.done) {
      return;
    }

    this.intFut.resolve();
    this.#interrupted = true;
  }

  /** Returns the future that completes when the playout is over. */
  join(): Future {
    return this.doneFut;
  }
}
|
|
70
|
+
|
|
71
|
+
/**
 * Plays synthesized audio frames on an audio source, one playout at a time, and emits
 * {@link AgentPlayoutEvent}s as playouts start and stop.
 */
export class AgentPlayout extends (EventEmitter as new () => TypedEmitter<AgentPlayoutCallbacks>) {
  #closed = false;
  #audioSource: AudioSource;
  #targetVolume = 1;
  #playoutTask?: CancellablePromise<void>;
  #logger = log();

  /**
   * @param audioSource - Audio source that receives the captured frames.
   */
  constructor(audioSource: AudioSource) {
    super();
    this.#audioSource = audioSource;
  }

  /** Stored target volume; this class only stores and returns the value. */
  get targetVolume(): number {
    return this.#targetVolume;
  }

  set targetVolume(vol: number) {
    this.#targetVolume = vol;
  }

  /**
   * Starts a new playout, replacing the previous one.
   *
   * @param speechId - Identifier of the speech being played.
   * @param playoutSource - Frames to play, terminated by the flush sentinel.
   * @returns The handle controlling the new playout.
   * @throws Error if the playout has been closed.
   */
  play(
    speechId: string,
    playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>,
  ): PlayoutHandle {
    if (this.#closed) {
      throw new Error('source closed');
    }

    const handle = new PlayoutHandle(speechId, this.#audioSource, playoutSource);

    this.#playoutTask = this.#playout(handle, this.#playoutTask);
    return handle;
  }

  /**
   * Cancels `oldTask` (if any), then captures the frames of `handle.playoutSource` on the
   * audio source until the source is exhausted, the handle is interrupted, or this task
   * is cancelled.
   */
  #playout(handle: PlayoutHandle, oldTask?: CancellablePromise<void>): CancellablePromise<void> {
    // eslint-disable-next-line @typescript-eslint/no-unused-vars
    return new CancellablePromise(async (resolve, _, onCancel) => {
      // Declared before `cancel`: cancellation may arrive while the previous task is
      // still being awaited below, before the capture task exists.
      let firstFrame = true;
      let captureTask: CancellablePromise<void> | undefined;
      let finished = false;

      const cancel = () => {
        // Finalize once. This is reachable from both `onCancel` and the `finally` below;
        // a second pass would emit PLAYOUT_STOPPED again and recompute the played time
        // after the queue had been cleared.
        if (finished) {
          return;
        }
        finished = true;

        captureTask?.cancel();
        handle.totalPlayedTime = handle.pushedDuration - this.#audioSource.queuedDuration;

        if (handle.interrupted || captureTask?.error) {
          this.#audioSource.clearQueue(); // make sure to remove any queued frames
        }

        // Only announce a stop for a playout that announced its start.
        if (!firstFrame) {
          this.emit(AgentPlayoutEvent.PLAYOUT_STOPPED, handle.interrupted);
        }

        handle.doneFut.resolve();

        this.#logger
          .child({ speechId: handle.speechId, interrupted: handle.interrupted })
          .debug('playout finished');
      };

      onCancel(() => {
        cancel();
      });

      if (oldTask) {
        await gracefullyCancel(oldTask);
      }

      if (finished) {
        // Cancelled while waiting for the previous playout: nothing was played and the
        // handle is already marked done, so do not start capturing.
        resolve();
        return;
      }

      if (this.#audioSource.queuedDuration > 0) {
        // this should not happen, but log it just in case
        this.#logger
          .child({ speechId: handle.speechId, queuedDuration: this.#audioSource.queuedDuration })
          .warn('new playout while the source is still playing');
      }

      // eslint-disable-next-line @typescript-eslint/no-unused-vars
      const capture = new CancellablePromise<void>(async (resolveCapture, _, onCaptureCancel) => {
        let cancelled = false;
        onCaptureCancel(() => {
          cancelled = true;
        });

        for await (const frame of handle.playoutSource) {
          if (cancelled || frame === SynthesisHandle.FLUSH_SENTINEL) break;
          if (firstFrame) {
            this.#logger
              .child({ speechId: handle.speechId })
              .debug('started playing the first time');
            this.emit(AgentPlayoutEvent.PLAYOUT_STARTED);
            firstFrame = false;
          }
          handle.pushedDuration += (frame.samplesPerChannel / frame.sampleRate) * 1000;
          await this.#audioSource.captureFrame(frame);
          // XXX(nbsp): waiting after every frame replaces a single wait after the loop.
          // this is not the case on python agents, but for some reason too many TTS
          // frames can gunk up the buffer and lead to FFI errors.
          await this.#audioSource.waitForPlayout();
        }

        resolveCapture();
      });
      captureTask = capture;

      try {
        await Promise.any([capture, handle.intFut.await]);
      } finally {
        cancel();
        resolve();
      }
    });
  }

  /** Rejects further playouts and waits for the current one to finish. */
  async close() {
    this.#closed = true;
    await this.#playoutTask;
  }
}
|