@livekit/agents 0.3.4 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +1 -1
- package/CHANGELOG.md +40 -0
- package/dist/audio.js +17 -30
- package/dist/audio.js.map +1 -1
- package/dist/cli.js +3 -14
- package/dist/cli.js.map +1 -1
- package/dist/http_server.d.ts +1 -1
- package/dist/http_server.js +5 -9
- package/dist/http_server.js.map +1 -1
- package/dist/index.d.ts +3 -2
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +14 -2
- package/dist/index.js.map +1 -1
- package/dist/ipc/job_executor.js +3 -5
- package/dist/ipc/job_executor.js.map +1 -1
- package/dist/ipc/job_main.d.ts +1 -1
- package/dist/ipc/proc_job_executor.js +66 -80
- package/dist/ipc/proc_job_executor.js.map +1 -1
- package/dist/ipc/proc_pool.d.ts +3 -3
- package/dist/ipc/proc_pool.d.ts.map +1 -1
- package/dist/ipc/proc_pool.js +16 -11
- package/dist/ipc/proc_pool.js.map +1 -1
- package/dist/job.js +56 -73
- package/dist/job.js.map +1 -1
- package/dist/llm/chat_context.d.ts +66 -0
- package/dist/llm/chat_context.d.ts.map +1 -0
- package/dist/llm/chat_context.js +93 -0
- package/dist/llm/chat_context.js.map +1 -0
- package/dist/llm/function_context.d.ts +19 -1
- package/dist/llm/function_context.d.ts.map +1 -1
- package/dist/llm/function_context.js +54 -18
- package/dist/llm/function_context.js.map +1 -1
- package/dist/llm/function_context.test.d.ts +2 -0
- package/dist/llm/function_context.test.d.ts.map +1 -0
- package/dist/llm/function_context.test.js +218 -0
- package/dist/llm/function_context.test.js.map +1 -0
- package/dist/llm/index.d.ts +3 -2
- package/dist/llm/index.d.ts.map +1 -1
- package/dist/llm/index.js +3 -2
- package/dist/llm/index.js.map +1 -1
- package/dist/llm/llm.d.ts +53 -0
- package/dist/llm/llm.d.ts.map +1 -0
- package/dist/llm/llm.js +45 -0
- package/dist/llm/llm.js.map +1 -0
- package/dist/multimodal/agent_playout.d.ts +1 -1
- package/dist/multimodal/agent_playout.js +116 -153
- package/dist/multimodal/agent_playout.js.map +1 -1
- package/dist/multimodal/multimodal_agent.d.ts +4 -3
- package/dist/multimodal/multimodal_agent.d.ts.map +1 -1
- package/dist/multimodal/multimodal_agent.js +214 -237
- package/dist/multimodal/multimodal_agent.js.map +1 -1
- package/dist/pipeline/agent_output.d.ts +30 -0
- package/dist/pipeline/agent_output.d.ts.map +1 -0
- package/dist/pipeline/agent_output.js +155 -0
- package/dist/pipeline/agent_output.js.map +1 -0
- package/dist/pipeline/agent_playout.d.ts +38 -0
- package/dist/pipeline/agent_playout.d.ts.map +1 -0
- package/dist/pipeline/agent_playout.js +142 -0
- package/dist/pipeline/agent_playout.js.map +1 -0
- package/dist/pipeline/human_input.d.ts +28 -0
- package/dist/pipeline/human_input.d.ts.map +1 -0
- package/dist/pipeline/human_input.js +134 -0
- package/dist/pipeline/human_input.js.map +1 -0
- package/dist/pipeline/index.d.ts +2 -0
- package/dist/pipeline/index.d.ts.map +1 -0
- package/dist/pipeline/index.js +5 -0
- package/dist/pipeline/index.js.map +1 -0
- package/dist/pipeline/pipeline_agent.d.ts +134 -0
- package/dist/pipeline/pipeline_agent.d.ts.map +1 -0
- package/dist/pipeline/pipeline_agent.js +661 -0
- package/dist/pipeline/pipeline_agent.js.map +1 -0
- package/dist/pipeline/speech_handle.d.ts +27 -0
- package/dist/pipeline/speech_handle.d.ts.map +1 -0
- package/dist/pipeline/speech_handle.js +102 -0
- package/dist/pipeline/speech_handle.js.map +1 -0
- package/dist/plugin.js +7 -20
- package/dist/plugin.js.map +1 -1
- package/dist/stt/index.d.ts +1 -2
- package/dist/stt/index.d.ts.map +1 -1
- package/dist/stt/index.js +1 -2
- package/dist/stt/index.js.map +1 -1
- package/dist/stt/stt.d.ts +62 -24
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +77 -27
- package/dist/stt/stt.js.map +1 -1
- package/dist/tokenize/basic/basic.d.ts +16 -0
- package/dist/tokenize/basic/basic.d.ts.map +1 -0
- package/dist/tokenize/basic/basic.js +50 -0
- package/dist/tokenize/basic/basic.js.map +1 -0
- package/dist/tokenize/basic/hyphenator.d.ts +17 -0
- package/dist/tokenize/basic/hyphenator.d.ts.map +1 -0
- package/dist/tokenize/basic/hyphenator.js +420 -0
- package/dist/tokenize/basic/hyphenator.js.map +1 -0
- package/dist/tokenize/basic/index.d.ts +2 -0
- package/dist/tokenize/basic/index.d.ts.map +1 -0
- package/dist/tokenize/basic/index.js +5 -0
- package/dist/tokenize/basic/index.js.map +1 -0
- package/dist/tokenize/basic/paragraph.d.ts +5 -0
- package/dist/tokenize/basic/paragraph.d.ts.map +1 -0
- package/dist/tokenize/basic/paragraph.js +38 -0
- package/dist/tokenize/basic/paragraph.js.map +1 -0
- package/dist/tokenize/basic/sentence.d.ts +5 -0
- package/dist/tokenize/basic/sentence.d.ts.map +1 -0
- package/dist/tokenize/basic/sentence.js +60 -0
- package/dist/tokenize/basic/sentence.js.map +1 -0
- package/dist/tokenize/basic/word.d.ts +5 -0
- package/dist/tokenize/basic/word.d.ts.map +1 -0
- package/dist/tokenize/basic/word.js +23 -0
- package/dist/tokenize/basic/word.js.map +1 -0
- package/dist/tokenize/index.d.ts +5 -0
- package/dist/tokenize/index.d.ts.map +1 -0
- package/dist/tokenize/index.js +8 -0
- package/dist/tokenize/index.js.map +1 -0
- package/dist/tokenize/token_stream.d.ts +36 -0
- package/dist/tokenize/token_stream.d.ts.map +1 -0
- package/dist/tokenize/token_stream.js +136 -0
- package/dist/tokenize/token_stream.js.map +1 -0
- package/dist/tokenize/tokenizer.d.ts +55 -0
- package/dist/tokenize/tokenizer.d.ts.map +1 -0
- package/dist/tokenize/tokenizer.js +117 -0
- package/dist/tokenize/tokenizer.js.map +1 -0
- package/dist/transcription.js +78 -89
- package/dist/transcription.js.map +1 -1
- package/dist/tts/index.d.ts +1 -3
- package/dist/tts/index.d.ts.map +1 -1
- package/dist/tts/index.js +1 -3
- package/dist/tts/index.js.map +1 -1
- package/dist/tts/tts.d.ts +66 -37
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js +79 -74
- package/dist/tts/tts.js.map +1 -1
- package/dist/utils.d.ts +21 -6
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +120 -76
- package/dist/utils.js.map +1 -1
- package/dist/vad.d.ts +43 -39
- package/dist/vad.d.ts.map +1 -1
- package/dist/vad.js +51 -4
- package/dist/vad.js.map +1 -1
- package/dist/worker.d.ts +1 -1
- package/dist/worker.js +257 -247
- package/dist/worker.js.map +1 -1
- package/package.json +4 -3
- package/src/index.ts +16 -2
- package/src/ipc/proc_pool.ts +4 -4
- package/src/llm/chat_context.ts +147 -0
- package/src/llm/function_context.test.ts +248 -0
- package/src/llm/function_context.ts +77 -18
- package/src/llm/index.ts +21 -2
- package/src/llm/llm.ts +102 -0
- package/src/multimodal/multimodal_agent.ts +19 -6
- package/src/pipeline/agent_output.ts +185 -0
- package/src/pipeline/agent_playout.ts +187 -0
- package/src/pipeline/human_input.ts +166 -0
- package/src/pipeline/index.ts +15 -0
- package/src/pipeline/pipeline_agent.ts +917 -0
- package/src/pipeline/speech_handle.ts +136 -0
- package/src/stt/index.ts +8 -2
- package/src/stt/stt.ts +98 -31
- package/src/tokenize/basic/basic.ts +73 -0
- package/src/tokenize/basic/hyphenator.ts +436 -0
- package/src/tokenize/basic/index.ts +5 -0
- package/src/tokenize/basic/paragraph.ts +43 -0
- package/src/tokenize/basic/sentence.ts +69 -0
- package/src/tokenize/basic/word.ts +27 -0
- package/src/tokenize/index.ts +16 -0
- package/src/tokenize/token_stream.ts +163 -0
- package/src/tokenize/tokenizer.ts +152 -0
- package/src/tts/index.ts +1 -20
- package/src/tts/tts.ts +110 -57
- package/src/utils.ts +95 -25
- package/src/vad.ts +86 -45
- package/tsconfig.tsbuildinfo +1 -1
- package/dist/stt/stream_adapter.d.ts +0 -19
- package/dist/stt/stream_adapter.d.ts.map +0 -1
- package/dist/stt/stream_adapter.js +0 -96
- package/dist/stt/stream_adapter.js.map +0 -1
- package/dist/tokenize.d.ts +0 -15
- package/dist/tokenize.d.ts.map +0 -1
- package/dist/tokenize.js +0 -12
- package/dist/tokenize.js.map +0 -1
- package/dist/tts/stream_adapter.d.ts +0 -19
- package/dist/tts/stream_adapter.d.ts.map +0 -1
- package/dist/tts/stream_adapter.js +0 -111
- package/dist/tts/stream_adapter.js.map +0 -1
- package/src/stt/stream_adapter.ts +0 -104
- package/src/tokenize.ts +0 -22
- package/src/tts/stream_adapter.ts +0 -93
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
|
|
2
|
+
//
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import { randomUUID } from 'node:crypto';
|
|
5
|
+
import { AsyncIterableQueue } from '../utils.js';
|
|
6
|
+
import type { TokenData } from './tokenizer.js';
|
|
7
|
+
import { SentenceStream, WordStream } from './tokenizer.js';
|
|
8
|
+
|
|
9
|
+
/**
 * Tokenizer callback consumed by the buffered streams in this file: given the currently buffered
 * text it returns either plain token strings, or tuples whose first element is the token text.
 * The third tuple element is used by `BufferedTokenStream.pushText` as the offset at which the
 * remaining, unconsumed input starts.
 * NOTE(review): the second tuple element is presumably the token's start offset — it is never
 * read in this file, confirm against the tokenizer implementations.
 */
type TokenizeFunc = (x: string) => string[] | [string, number, number][];
|
|
10
|
+
|
|
11
|
+
/**
 * Buffers pushed text, runs it through a tokenizer callback and exposes the resulting tokens as
 * an async iterator of {@link TokenData}.
 *
 * @remarks
 * Text is accumulated until at least `minContextLength` characters are buffered. Tokens are then
 * joined with single spaces until the pending output is at least `minTokenLength` characters
 * long, at which point it is emitted. Every token emitted between two flushes carries the same
 * segment id; a flush that had anything to process rotates the id.
 */
export class BufferedTokenStream implements AsyncIterableIterator<TokenData> {
  protected queue = new AsyncIterableQueue<TokenData>();
  protected closed = false;

  #func: TokenizeFunc;
  #minTokenLength: number;
  #minContextLength: number;
  /** Raw text that has been pushed but not yet consumed by the tokenizer */
  #inBuf = '';
  /** Tokenized text waiting to reach `minTokenLength` before being emitted */
  #outBuf = '';
  #currentSegmentId: string;

  /**
   * @param func - tokenizer callback applied to the buffered input
   * @param minTokenLength - minimum length of an emitted token, in characters
   * @param minContextLength - minimum amount of buffered input before tokenization is attempted
   */
  constructor(func: TokenizeFunc, minTokenLength: number, minContextLength: number) {
    this.#func = func;
    this.#minTokenLength = minTokenLength;
    this.#minContextLength = minContextLength;

    this.#currentSegmentId = randomUUID();
  }

  /**
   * Push a string of text into the token stream
   *
   * @throws Error if the stream has been closed or its input has ended
   */
  pushText(text: string) {
    if (this.closed) {
      throw new Error('Stream is closed');
    }

    this.#inBuf += text;
    if (this.#inBuf.length < this.#minContextLength) return;

    // NOTE(review): this loop consumes every token the tokenizer returns, including the trailing
    // one, which may still be incomplete (more text can follow). Consider keeping the last token
    // buffered until flush() — confirm the intended behaviour before changing it.
    while (true) {
      const tokens = this.#func(this.#inBuf);
      const tok = tokens.shift();
      if (tok === undefined) break;

      if (this.#outBuf) this.#outBuf += ' ';
      this.#outBuf += typeof tok === 'string' ? tok : tok[0];

      if (this.#outBuf.length >= this.#minTokenLength) {
        this.queue.put({ token: this.#outBuf, segmentId: this.#currentSegmentId });
        this.#outBuf = '';
      }

      if (typeof tok !== 'string') {
        // tuple tokens carry the offset at which the unconsumed input starts
        this.#inBuf = this.#inBuf.slice(tok[2]);
      } else {
        // plain tokens: drop everything up to and including the token, plus following whitespace
        this.#inBuf = this.#inBuf
          .slice(Math.max(0, this.#inBuf.indexOf(tok)) + tok.length)
          .trimStart();
      }
    }
  }

  /**
   * Flush the stream, causing it to process all pending text
   *
   * @remarks
   * Whatever is left in the buffers is emitted as a single token regardless of `minTokenLength`,
   * and a new segment id is started.
   *
   * @throws Error if the stream has been closed or its input has ended
   */
  flush() {
    if (this.closed) {
      throw new Error('Stream is closed');
    }

    if (this.#inBuf || this.#outBuf) {
      const texts: string[] = [];
      for (const tok of this.#func(this.#inBuf)) {
        texts.push(typeof tok === 'string' ? tok : tok[0]);
      }

      // Only add the separator when there is something to append; an array is always truthy, so
      // checking the array itself would leave a trailing space on the emitted token.
      if (texts.length > 0) {
        if (this.#outBuf) this.#outBuf += ' ';
        this.#outBuf += texts.join(' ');
      }

      if (this.#outBuf) {
        this.queue.put({ token: this.#outBuf, segmentId: this.#currentSegmentId });
      }

      this.#currentSegmentId = randomUUID();
    }

    this.#inBuf = '';
    this.#outBuf = '';
  }

  /**
   * Mark the input as ended and forbid additional pushes
   *
   * @remarks
   * Pending text is flushed first. The output queue is then closed so that consumers iterating
   * the stream receive the remaining tokens and terminate instead of waiting forever.
   *
   * @throws Error if the stream has already been closed
   */
  endInput() {
    if (this.closed) {
      throw new Error('Stream is closed');
    }
    this.flush();
    this.queue.close();
    this.closed = true;
  }

  /** Resolves with the next emitted token, or with `done` once the output has been closed */
  next(): Promise<IteratorResult<TokenData>> {
    return this.queue.next();
  }

  /** Close both the input and output of the token stream */
  close() {
    this.queue.close();
    this.closed = true;
  }

  [Symbol.asyncIterator](): BufferedTokenStream {
    return this;
  }
}
|
|
120
|
+
|
|
121
|
+
/**
 * A {@link SentenceStream} backed by a {@link BufferedTokenStream}.
 *
 * @remarks
 * All text handling is delegated to the wrapped token stream; the `input` and `queue` members
 * inherited from {@link SentenceStream} are not used to carry data.
 */
export class BufferedSentenceStream extends SentenceStream {
  #stream: BufferedTokenStream;

  /**
   * @param func - tokenizer callback applied to the buffered input
   * @param minTokenLength - minimum length of an emitted token, in characters
   * @param minContextLength - minimum amount of buffered input before tokenization is attempted
   */
  constructor(func: TokenizeFunc, minTokenLength: number, minContextLength: number) {
    super();
    this.#stream = new BufferedTokenStream(func, minTokenLength, minContextLength);
  }

  /** Push a string of text to the underlying token stream */
  pushText(text: string) {
    this.#stream.pushText(text);
  }

  /**
   * Flush the underlying token stream, causing it to process all pending text.
   *
   * @remarks
   * The inherited implementation only enqueues a sentinel on the base `input` queue, which
   * nothing reads in this class, so the flush has to be forwarded explicitly.
   */
  flush() {
    this.#stream.flush();
  }

  /**
   * Mark the input as ended and forbid additional pushes.
   *
   * @remarks
   * Ends the base input (keeping the inherited state checks consistent) and then the wrapped
   * stream, which flushes whatever text is still buffered.
   */
  endInput() {
    super.endInput();
    this.#stream.endInput();
  }

  /** Close both this stream and the underlying token stream */
  close() {
    super.close();
    this.#stream.close();
  }

  /** Resolves with the next token produced by the underlying token stream */
  next(): Promise<IteratorResult<TokenData>> {
    return this.#stream.next();
  }
}
|
|
142
|
+
|
|
143
|
+
/**
 * A {@link WordStream} backed by a {@link BufferedTokenStream}.
 *
 * @remarks
 * All text handling is delegated to the wrapped token stream; the `input` and `queue` members
 * inherited from {@link WordStream} are not used to carry data.
 */
export class BufferedWordStream extends WordStream {
  #stream: BufferedTokenStream;

  /**
   * @param func - tokenizer callback applied to the buffered input
   * @param minTokenLength - minimum length of an emitted token, in characters
   * @param minContextLength - minimum amount of buffered input before tokenization is attempted
   */
  constructor(func: TokenizeFunc, minTokenLength: number, minContextLength: number) {
    super();
    this.#stream = new BufferedTokenStream(func, minTokenLength, minContextLength);
  }

  /** Push a string of text to the underlying token stream */
  pushText(text: string) {
    this.#stream.pushText(text);
  }

  /**
   * Flush the underlying token stream, causing it to process all pending text.
   *
   * @remarks
   * The inherited implementation only enqueues a sentinel on the base `input` queue, which
   * nothing reads in this class, so the flush has to be forwarded explicitly.
   */
  flush() {
    this.#stream.flush();
  }

  /**
   * Mark the input as ended and forbid additional pushes.
   *
   * @remarks
   * Ends the base input (keeping the inherited state checks consistent) and then the wrapped
   * stream, which flushes whatever text is still buffered.
   */
  endInput() {
    super.endInput();
    this.#stream.endInput();
  }

  /** Close both this stream and the underlying token stream */
  close() {
    super.close();
    this.#stream.close();
  }

  /** Resolves with the next token produced by the underlying token stream */
  next(): Promise<IteratorResult<TokenData>> {
    return this.#stream.next();
  }
}
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
|
|
2
|
+
//
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import { AsyncIterableQueue } from '../utils.js';
|
|
5
|
+
|
|
6
|
+
/** Punctuation characters: the ASCII punctuation set followed by a few typographic marks. */
// prettier-ignore
export const PUNCTUATIONS = [
  // ASCII punctuation, in code-point order
  '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
  ':', ';', '<', '=', '>', '?', '@',
  '[', '\\', ']', '^', '_', '`',
  '{', '|', '}', '~',
  // typographic marks
  '±', '—', '‘', '’', '“', '”', '…',
];
|
|
12
|
+
|
|
13
|
+
/** A single token emitted by a tokenizer stream. */
export interface TokenData {
  /** Identifier of the segment this token belongs to */
  segmentId: string;
  /** Text of the token */
  token: string;
}
|
|
17
|
+
|
|
18
|
+
/**
 * Base class for tokenizers that split text into sentences.
 *
 * @remarks
 * This class is abstract; concrete tokenizers provide both the one-shot and streaming variants.
 */
export abstract class SentenceTokenizer {
  /**
   * Splits the given text in one shot.
   *
   * @param text - text to tokenize
   * @param language - optional language hint
   */
  abstract tokenize(text: string, language?: string): string[];

  /**
   * Creates a {@link SentenceStream} into which strings can be pushed and from which smaller
   * segments are received.
   */
  abstract stream(): SentenceStream;
}
|
|
26
|
+
|
|
27
|
+
/**
 * Base class for streaming sentence tokenizers: text is pushed in on one side and
 * {@link TokenData} items are read from the other through async iteration.
 */
export abstract class SentenceStream {
  protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');
  protected input = new AsyncIterableQueue<string | typeof SentenceStream.FLUSH_SENTINEL>();
  protected queue = new AsyncIterableQueue<TokenData>();
  #closed = false;

  /** Whether {@link SentenceStream.close} has been called on this stream */
  get closed(): boolean {
    return this.#closed;
  }

  /** Throws unless both the input side and the stream as a whole are still open */
  #ensureWritable(): void {
    if (this.input.closed) throw new Error('Input is closed');
    if (this.#closed) throw new Error('Stream is closed');
  }

  /** Push a string of text to the tokenizer */
  pushText(text: string) {
    this.#ensureWritable();
    this.input.put(text);
  }

  /** Flush the tokenizer, causing it to process all pending text */
  flush() {
    this.#ensureWritable();
    this.input.put(SentenceStream.FLUSH_SENTINEL);
  }

  /** Mark the input as ended and forbid additional pushes */
  endInput() {
    this.#ensureWritable();
    this.input.close();
  }

  /** Resolves with the next item from the output queue */
  next(): Promise<IteratorResult<TokenData>> {
    return this.queue.next();
  }

  /** Close both the input and output of the tokenizer stream */
  close() {
    this.input.close();
    this.queue.close();
    this.#closed = true;
  }

  [Symbol.asyncIterator](): SentenceStream {
    return this;
  }
}
|
|
85
|
+
|
|
86
|
+
/**
 * Base class for tokenizers that split text into words.
 *
 * @remarks
 * This class is abstract; concrete tokenizers provide both the one-shot and streaming variants.
 */
export abstract class WordTokenizer {
  /**
   * Splits the given text in one shot.
   *
   * @param text - text to tokenize
   * @param language - optional language hint
   */
  abstract tokenize(text: string, language?: string): string[];

  /**
   * Creates a {@link WordStream} into which words can be pushed and from which smaller segments
   * are received.
   */
  abstract stream(): WordStream;
}
|
|
94
|
+
|
|
95
|
+
/**
 * Base class for streaming word tokenizers: text is pushed in on one side and {@link TokenData}
 * items are read from the other through async iteration.
 */
export abstract class WordStream {
  protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');
  protected input = new AsyncIterableQueue<string | typeof WordStream.FLUSH_SENTINEL>();
  protected queue = new AsyncIterableQueue<TokenData>();
  #closed = false;

  /** Whether {@link WordStream.close} has been called on this stream */
  get closed(): boolean {
    return this.#closed;
  }

  /** Throws unless both the input side and the stream as a whole are still open */
  #ensureWritable(): void {
    if (this.input.closed) throw new Error('Input is closed');
    if (this.#closed) throw new Error('Stream is closed');
  }

  /** Push a string of text to the tokenizer */
  pushText(text: string) {
    this.#ensureWritable();
    this.input.put(text);
  }

  /** Flush the tokenizer, causing it to process all pending text */
  flush() {
    this.#ensureWritable();
    this.input.put(WordStream.FLUSH_SENTINEL);
  }

  /** Mark the input as ended and forbid additional pushes */
  endInput() {
    this.#ensureWritable();
    this.input.close();
  }

  /** Resolves with the next item from the output queue */
  next(): Promise<IteratorResult<TokenData>> {
    return this.queue.next();
  }

  /** Close both the input and output of the tokenizer stream */
  close() {
    this.input.close();
    this.queue.close();
    this.#closed = true;
  }

  [Symbol.asyncIterator](): WordStream {
    return this;
  }
}
|
package/src/tts/index.ts
CHANGED
|
@@ -1,23 +1,4 @@
|
|
|
1
1
|
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
-
|
|
5
|
-
import {
|
|
6
|
-
ChunkedStream,
|
|
7
|
-
SynthesisEvent,
|
|
8
|
-
SynthesisEventType,
|
|
9
|
-
SynthesizeStream,
|
|
10
|
-
type SynthesizedAudio,
|
|
11
|
-
TTS,
|
|
12
|
-
} from './tts.js';
|
|
13
|
-
|
|
14
|
-
export {
|
|
15
|
-
TTS,
|
|
16
|
-
SynthesisEvent,
|
|
17
|
-
SynthesisEventType,
|
|
18
|
-
SynthesizedAudio,
|
|
19
|
-
SynthesizeStream,
|
|
20
|
-
StreamAdapter,
|
|
21
|
-
StreamAdapterWrapper,
|
|
22
|
-
ChunkedStream,
|
|
23
|
-
};
|
|
4
|
+
export { type SynthesizedAudio, type TTSCapabilities, TTS, SynthesizeStream } from './tts.js';
|
package/src/tts/tts.ts
CHANGED
|
@@ -2,87 +2,140 @@
|
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
import type { AudioFrame } from '@livekit/rtc-node';
|
|
5
|
-
import {
|
|
5
|
+
import { AsyncIterableQueue } from '../utils.js';
|
|
6
6
|
|
|
7
|
+
/** One packet of synthesized speech, as returned by the TTS. */
export interface SynthesizedAudio {
  /** ID of the request that produced this packet; a segment may span several requests */
  requestId: string;
  /** ID of the segment this packet belongs to; segments are delimited by flushes */
  segmentId: string;
  /** The synthesized audio frame */
  frame: AudioFrame;
  /** Current segment of the synthesized audio */
  deltaText?: string;
}
|
|
11
18
|
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
AUDIO = 1,
|
|
22
|
-
/**
|
|
23
|
-
* Indicate the end of synthesis. Does not necessarily mean stream is done.
|
|
24
|
-
*/
|
|
25
|
-
FINISHED = 2,
|
|
19
|
+
/**
 * Capabilities advertised by a TTS provider.
 *
 * @remarks
 * `streaming` is currently the only capability carried by this interface, and the framework only
 * supports providers that do have a streaming endpoint.
 */
export interface TTSCapabilities {
  /** Whether the provider exposes a streaming endpoint */
  streaming: boolean;
}
|
|
27
29
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
30
|
+
/**
 * Base class of every text-to-speech adapter.
 *
 * @remarks
 * Being abstract, this class cannot be instantiated directly. Use a provider plugin that exports
 * its own TTS subclass, which inherits the accessors defined here.
 */
export abstract class TTS {
  readonly #sampleRate: number;
  readonly #numChannels: number;
  readonly #capabilities: TTSCapabilities;

  /**
   * @param sampleRate - sample rate of the audio frames this TTS produces
   * @param numChannels - channel count of the audio frames this TTS produces
   * @param capabilities - capabilities of the underlying provider
   */
  constructor(sampleRate: number, numChannels: number, capabilities: TTSCapabilities) {
    this.#sampleRate = sampleRate;
    this.#numChannels = numChannels;
    this.#capabilities = capabilities;
  }

  /** Returns this TTS's capabilities */
  get capabilities(): TTSCapabilities {
    return this.#capabilities;
  }

  /** Returns the sample rate of audio frames returned by this TTS */
  get sampleRate(): number {
    return this.#sampleRate;
  }

  /** Returns the channel count of audio frames returned by this TTS */
  get numChannels(): number {
    return this.#numChannels;
  }

  /**
   * Creates a {@link SynthesizeStream} into which text can be pushed and from which audio data is
   * received.
   */
  abstract stream(): SynthesizeStream;
}
|
|
63
68
|
|
|
64
|
-
|
|
65
|
-
|
|
69
|
+
/**
 * A text-to-speech stream, exposed as an asynchronous iterable iterator: text is pushed in and
 * synthesized audio packets are read out.
 *
 * @example Looping through frames
 * ```ts
 * for await (const event of stream) {
 *   if (event === SynthesizeStream.END_OF_STREAM) continue;
 *   await source.captureFrame(event.frame);
 * }
 * ```
 *
 * @remarks
 * Being abstract, this class cannot be instantiated directly. Use a provider plugin that exports
 * its own SynthesizeStream subclass, which inherits the methods defined here.
 */
export abstract class SynthesizeStream
  implements AsyncIterableIterator<SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM>
{
  protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');
  static readonly END_OF_STREAM = Symbol('END_OF_STREAM');
  protected input = new AsyncIterableQueue<string | typeof SynthesizeStream.FLUSH_SENTINEL>();
  protected queue = new AsyncIterableQueue<
    SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM
  >();
  protected closed = false;

  /** Throws unless both the input side and the stream as a whole are still open */
  #ensureWritable(): void {
    if (this.input.closed) throw new Error('Input is closed');
    if (this.closed) throw new Error('Stream is closed');
  }

  /** Push a string of text to the TTS */
  pushText(text: string) {
    this.#ensureWritable();
    this.input.put(text);
  }

  /** Flush the TTS, causing it to process all pending text */
  flush() {
    this.#ensureWritable();
    this.input.put(SynthesizeStream.FLUSH_SENTINEL);
  }

  /** Mark the input as ended and forbid additional pushes */
  endInput() {
    this.#ensureWritable();
    this.input.close();
  }

  /** Resolves with the next item from the output queue */
  next(): Promise<IteratorResult<SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM>> {
    return this.queue.next();
  }

  /** Close both the input and output of the TTS stream */
  close() {
    this.input.close();
    this.queue.close();
    this.closed = true;
  }

  [Symbol.asyncIterator](): SynthesizeStream {
    return this;
  }
}
|
package/src/utils.ts
CHANGED
|
@@ -225,39 +225,109 @@ export async function gracefullyCancel<T>(promise: CancellablePromise<T>): Promi
|
|
|
225
225
|
}
|
|
226
226
|
|
|
227
227
|
/**
 * A queue that can be consumed with `for await`: items are put on one side and awaited on the
 * other until the queue is closed.
 *
 * @internal
 */
export class AsyncIterableQueue<T> implements AsyncIterableIterator<T> {
  private static readonly CLOSE_SENTINEL = Symbol('CLOSE_SENTINEL');
  #queue = new Queue<T | typeof AsyncIterableQueue.CLOSE_SENTINEL>();
  #closed = false;

  /** Whether {@link AsyncIterableQueue.close} has been called */
  get closed(): boolean {
    return this.#closed;
  }

  /**
   * Appends an item to the queue.
   *
   * @throws Error if the queue has been closed
   */
  put(item: T): void {
    if (this.#closed) {
      throw new Error('Queue is closed');
    }
    this.#queue.put(item);
  }

  /** Marks the queue as closed and enqueues a sentinel so that a pending `next()` is released */
  close(): void {
    this.#closed = true;
    this.#queue.put(AsyncIterableQueue.CLOSE_SENTINEL);
  }

  /** Resolves with the next item, or with `done` once the queue is closed and drained */
  async next(): Promise<IteratorResult<T>> {
    const drained = this.#closed && this.#queue.items.length === 0;
    if (!drained) {
      const item = await this.#queue.get();
      const isCloseMarker = item === AsyncIterableQueue.CLOSE_SENTINEL && this.#closed;
      if (!isCloseMarker) {
        return { value: item as T, done: false };
      }
    }
    return { value: undefined, done: true };
  }

  [Symbol.asyncIterator](): AsyncIterableQueue<T> {
    return this;
  }
}
|
|
264
|
+
|
|
265
|
+
/**
 * Exponential smoothing filter with an optional upper bound.
 *
 * @remarks
 * The first sample applied after construction or {@link ExpFilter.reset} is taken as-is; every
 * later sample is blended as `a * previous + (1 - a) * sample` with `a = alpha ** exp`.
 * "Not set" is tracked with `undefined`, so `0` is a valid filtered value, alpha and maximum.
 *
 * @internal
 */
export class ExpFilter {
  #alpha: number;
  #max?: number;
  #filtered?: number = undefined;

  /**
   * @param alpha - smoothing base; raised to `exp` on every {@link ExpFilter.apply}
   * @param max - optional upper bound for the filtered value
   */
  constructor(alpha: number, max?: number) {
    this.#alpha = alpha;
    this.#max = max;
  }

  /**
   * Forgets the filtered value, optionally replacing alpha.
   *
   * @param alpha - new smoothing base; omitted or `undefined` keeps the current one
   */
  reset(alpha?: number) {
    // compare against undefined rather than truthiness so that an alpha of 0 is honoured
    if (alpha !== undefined) {
      this.#alpha = alpha;
    }
    this.#filtered = undefined;
  }

  /**
   * Feeds a sample into the filter.
   *
   * @param exp - exponent applied to alpha for this sample
   * @param sample - new sample value
   * @returns the updated filtered value, clamped to the maximum when one is set
   */
  apply(exp: number, sample: number): number {
    const previous = this.#filtered;
    let next: number;
    // a previous value of 0 is a legitimate state and must still be smoothed
    if (previous === undefined) {
      next = sample;
    } else {
      const a = this.#alpha ** exp;
      next = a * previous + (1 - a) * sample;
    }

    const max = this.#max;
    if (max !== undefined && next > max) {
      next = max;
    }

    this.#filtered = next;
    return next;
  }

  /** The current filtered value, or `undefined` if no sample has been applied since the last reset */
  get filtered(): number | undefined {
    return this.#filtered;
  }

  /** Replaces the smoothing base without touching the filtered value */
  set alpha(alpha: number) {
    this.#alpha = alpha;
  }
}
|
|
306
|
+
|
|
307
|
+
/**
 * Energy gate for audio frames: reports whether a frame is loud enough, or whether the cooldown
 * that follows the last loud frame is still running.
 *
 * @internal
 */
export class AudioEnergyFilter {
  #cooldownSeconds: number;
  #cooldown: number;

  /**
   * @param cooldownSeconds - how long, in seconds of pushed audio, frames keep passing after the
   * last frame above the energy threshold
   */
  constructor(cooldownSeconds = 1) {
    this.#cooldownSeconds = cooldownSeconds;
    this.#cooldown = cooldownSeconds;
  }

  /**
   * Feeds a frame into the filter.
   *
   * @param frame - audio frame; samples are scaled by 1/32768 before the RMS is computed
   * @returns `true` if the frame's RMS is above the threshold or the cooldown has not yet
   * elapsed, `false` otherwise. A frame without samples is treated as silence.
   */
  pushFrame(frame: AudioFrame): boolean {
    const sampleCount = frame.data.length;
    let sumSquares = 0;
    for (const raw of frame.data) {
      const scaled = raw / 32768;
      // squares are rounded to float32 precision before being summed
      sumSquares += Math.fround(scaled * scaled);
    }
    // guard against empty frames: no samples means no energy (and no division by zero)
    const rms = sampleCount > 0 ? (sumSquares / sampleCount) ** 0.5 : 0;

    if (rms > 0.004) {
      this.#cooldown = this.#cooldownSeconds;
      return true;
    }

    // quiet frame: consume its duration from the remaining cooldown
    const durationSeconds = frame.samplesPerChannel / frame.sampleRate;
    this.#cooldown -= durationSeconds;
    return this.#cooldown > 0;
  }
}
|