@livekit/agents 0.7.6 → 0.7.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/audio.d.cts +9 -0
- package/dist/cli.d.cts +14 -0
- package/dist/constants.d.cts +5 -0
- package/dist/generator.d.cts +23 -0
- package/dist/http_server.cjs.map +1 -1
- package/dist/http_server.d.cts +19 -0
- package/dist/http_server.d.ts +1 -0
- package/dist/http_server.d.ts.map +1 -1
- package/dist/http_server.js.map +1 -1
- package/dist/index.d.cts +29 -0
- package/dist/inference_runner.d.cts +12 -0
- package/dist/ipc/index.d.cts +2 -0
- package/dist/ipc/inference_executor.d.cts +4 -0
- package/dist/ipc/inference_proc_executor.cjs +3 -2
- package/dist/ipc/inference_proc_executor.cjs.map +1 -1
- package/dist/ipc/inference_proc_executor.d.cts +23 -0
- package/dist/ipc/inference_proc_executor.js +1 -1
- package/dist/ipc/inference_proc_executor.js.map +1 -1
- package/dist/ipc/inference_proc_lazy_main.d.cts +2 -0
- package/dist/ipc/job_executor.d.cts +18 -0
- package/dist/ipc/job_proc_executor.cjs +3 -2
- package/dist/ipc/job_proc_executor.cjs.map +1 -1
- package/dist/ipc/job_proc_executor.d.cts +19 -0
- package/dist/ipc/job_proc_executor.js +1 -1
- package/dist/ipc/job_proc_executor.js.map +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/job_proc_lazy_main.d.cts +2 -0
- package/dist/ipc/job_proc_lazy_main.js +1 -1
- package/dist/ipc/job_proc_lazy_main.js.map +1 -1
- package/dist/ipc/message.d.cts +58 -0
- package/dist/ipc/proc_pool.d.cts +31 -0
- package/dist/ipc/supervised_proc.d.cts +30 -0
- package/dist/job.d.cts +113 -0
- package/dist/llm/chat_context.d.cts +66 -0
- package/dist/llm/function_context.d.cts +47 -0
- package/dist/llm/index.d.cts +4 -0
- package/dist/llm/llm.cjs +3 -3
- package/dist/llm/llm.cjs.map +1 -1
- package/dist/llm/llm.d.cts +66 -0
- package/dist/llm/llm.js +3 -3
- package/dist/llm/llm.js.map +1 -1
- package/dist/log.d.cts +13 -0
- package/dist/metrics/base.d.cts +96 -0
- package/dist/metrics/index.d.cts +5 -0
- package/dist/metrics/usage_collector.d.cts +14 -0
- package/dist/metrics/utils.d.cts +10 -0
- package/dist/multimodal/agent_playout.d.cts +34 -0
- package/dist/multimodal/index.d.cts +3 -0
- package/dist/multimodal/multimodal_agent.d.cts +48 -0
- package/dist/pipeline/agent_output.d.cts +33 -0
- package/dist/pipeline/agent_playout.d.cts +40 -0
- package/dist/pipeline/human_input.d.cts +30 -0
- package/dist/pipeline/index.d.cts +2 -0
- package/dist/pipeline/pipeline_agent.d.cts +151 -0
- package/dist/pipeline/speech_handle.d.cts +37 -0
- package/dist/plugin.d.cts +10 -0
- package/dist/stt/index.d.cts +3 -0
- package/dist/stt/stream_adapter.d.cts +18 -0
- package/dist/stt/stt.d.cts +124 -0
- package/dist/tokenize/basic/basic.d.cts +18 -0
- package/dist/tokenize/basic/hyphenator.d.cts +17 -0
- package/dist/tokenize/basic/index.d.cts +2 -0
- package/dist/tokenize/basic/paragraph.d.cts +5 -0
- package/dist/tokenize/basic/sentence.d.cts +5 -0
- package/dist/tokenize/basic/word.d.cts +5 -0
- package/dist/tokenize/index.d.cts +5 -0
- package/dist/tokenize/token_stream.d.cts +39 -0
- package/dist/tokenize/tokenizer.d.cts +55 -0
- package/dist/transcription.d.cts +31 -0
- package/dist/tts/index.d.cts +3 -0
- package/dist/tts/stream_adapter.d.cts +17 -0
- package/dist/tts/tts.cjs +4 -4
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.cts +127 -0
- package/dist/tts/tts.js +4 -4
- package/dist/tts/tts.js.map +1 -1
- package/dist/utils.d.cts +72 -0
- package/dist/vad.d.cts +78 -0
- package/dist/version.d.cts +2 -0
- package/dist/worker.cjs +2 -1
- package/dist/worker.cjs.map +1 -1
- package/dist/worker.d.cts +109 -0
- package/dist/worker.d.ts.map +1 -1
- package/dist/worker.js +2 -1
- package/dist/worker.js.map +1 -1
- package/package.json +9 -5
- package/src/http_server.ts +1 -0
- package/src/ipc/inference_proc_executor.ts +1 -1
- package/src/ipc/job_proc_executor.ts +1 -1
- package/src/ipc/job_proc_lazy_main.ts +1 -1
- package/src/llm/llm.ts +3 -3
- package/src/tts/tts.ts +4 -4
- package/src/worker.ts +1 -0
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import { AsyncIterableQueue } from '../utils.js';
|
|
2
|
+
import type { TokenData } from './tokenizer.js';
|
|
3
|
+
import { SentenceStream, WordStream } from './tokenizer.js';
|
|
4
|
+
type TokenizeFunc = (x: string) => string[] | [string, number, number][];
|
|
5
|
+
export declare class BufferedTokenStream implements AsyncIterableIterator<TokenData> {
|
|
6
|
+
#private;
|
|
7
|
+
protected queue: AsyncIterableQueue<TokenData>;
|
|
8
|
+
protected closed: boolean;
|
|
9
|
+
constructor(func: TokenizeFunc, minTokenLength: number, minContextLength: number);
|
|
10
|
+
/** Push a string of text into the token stream */
|
|
11
|
+
pushText(text: string): void;
|
|
12
|
+
/** Flush the stream, causing it to process all pending text */
|
|
13
|
+
flush(): void;
|
|
14
|
+
/** Mark the input as ended and forbid additional pushes */
|
|
15
|
+
endInput(): void;
|
|
16
|
+
next(): Promise<IteratorResult<TokenData>>;
|
|
17
|
+
/** Close both the input and output of the token stream */
|
|
18
|
+
close(): void;
|
|
19
|
+
[Symbol.asyncIterator](): BufferedTokenStream;
|
|
20
|
+
}
|
|
21
|
+
export declare class BufferedSentenceStream extends SentenceStream {
|
|
22
|
+
#private;
|
|
23
|
+
constructor(func: TokenizeFunc, minTokenLength: number, minContextLength: number);
|
|
24
|
+
pushText(text: string): void;
|
|
25
|
+
flush(): void;
|
|
26
|
+
close(): void;
|
|
27
|
+
next(): Promise<IteratorResult<TokenData>>;
|
|
28
|
+
}
|
|
29
|
+
export declare class BufferedWordStream extends WordStream {
|
|
30
|
+
#private;
|
|
31
|
+
constructor(func: TokenizeFunc, minTokenLength: number, minContextLength: number);
|
|
32
|
+
pushText(text: string): void;
|
|
33
|
+
flush(): void;
|
|
34
|
+
endInput(): void;
|
|
35
|
+
close(): void;
|
|
36
|
+
next(): Promise<IteratorResult<TokenData>>;
|
|
37
|
+
}
|
|
38
|
+
export {};
|
|
39
|
+
//# sourceMappingURL=token_stream.d.ts.map
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import { AsyncIterableQueue } from '../utils.js';
|
|
2
|
+
export declare const PUNCTUATIONS: string[];
|
|
3
|
+
export interface TokenData {
|
|
4
|
+
segmentId: string;
|
|
5
|
+
token: string;
|
|
6
|
+
}
|
|
7
|
+
export declare abstract class SentenceTokenizer {
|
|
8
|
+
abstract tokenize(text: string, language?: string): string[];
|
|
9
|
+
/**
|
|
10
|
+
* Returns a {@link SentenceStream} that can be used to push strings and receive smaller segments.
|
|
11
|
+
*/
|
|
12
|
+
abstract stream(): SentenceStream;
|
|
13
|
+
}
|
|
14
|
+
export declare abstract class SentenceStream {
|
|
15
|
+
#private;
|
|
16
|
+
protected static readonly FLUSH_SENTINEL: unique symbol;
|
|
17
|
+
protected input: AsyncIterableQueue<string | typeof SentenceStream.FLUSH_SENTINEL>;
|
|
18
|
+
protected queue: AsyncIterableQueue<TokenData>;
|
|
19
|
+
get closed(): boolean;
|
|
20
|
+
/** Push a string of text to the tokenizer */
|
|
21
|
+
pushText(text: string): void;
|
|
22
|
+
/** Flush the tokenizer, causing it to process all pending text */
|
|
23
|
+
flush(): void;
|
|
24
|
+
/** Mark the input as ended and forbid additional pushes */
|
|
25
|
+
endInput(): void;
|
|
26
|
+
next(): Promise<IteratorResult<TokenData>>;
|
|
27
|
+
/** Close both the input and output of the tokenizer stream */
|
|
28
|
+
close(): void;
|
|
29
|
+
[Symbol.asyncIterator](): SentenceStream;
|
|
30
|
+
}
|
|
31
|
+
export declare abstract class WordTokenizer {
|
|
32
|
+
abstract tokenize(text: string, language?: string): string[];
|
|
33
|
+
/**
|
|
34
|
+
* Returns a {@link WordStream} that can be used to push words and receive smaller segments.
|
|
35
|
+
*/
|
|
36
|
+
abstract stream(): WordStream;
|
|
37
|
+
}
|
|
38
|
+
export declare abstract class WordStream {
|
|
39
|
+
#private;
|
|
40
|
+
protected static readonly FLUSH_SENTINEL: unique symbol;
|
|
41
|
+
protected input: AsyncIterableQueue<string | typeof WordStream.FLUSH_SENTINEL>;
|
|
42
|
+
protected queue: AsyncIterableQueue<TokenData>;
|
|
43
|
+
get closed(): boolean;
|
|
44
|
+
/** Push a string of text to the tokenizer */
|
|
45
|
+
pushText(text: string): void;
|
|
46
|
+
/** Flush the tokenizer, causing it to process all pending text */
|
|
47
|
+
flush(): void;
|
|
48
|
+
/** Mark the input as ended and forbid additional pushes */
|
|
49
|
+
endInput(): void;
|
|
50
|
+
next(): Promise<IteratorResult<TokenData>>;
|
|
51
|
+
/** Close both the input and output of the tokenizer stream */
|
|
52
|
+
close(): void;
|
|
53
|
+
[Symbol.asyncIterator](): WordStream;
|
|
54
|
+
}
|
|
55
|
+
//# sourceMappingURL=tokenizer.d.ts.map
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import { TranscriptionSegment } from '@livekit/protocol';
|
|
2
|
+
import { AudioFrame } from '@livekit/rtc-node';
|
|
3
|
+
import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
|
|
4
|
+
import type { SentenceTokenizer } from './tokenize/tokenizer.js';
|
|
5
|
+
export interface TextSyncOptions {
|
|
6
|
+
language: string;
|
|
7
|
+
speed: number;
|
|
8
|
+
newSentenceDelay: number;
|
|
9
|
+
sentenceTokenizer: SentenceTokenizer;
|
|
10
|
+
hyphenateWord: (word: string) => string[];
|
|
11
|
+
splitWords: (words: string) => [string, number, number][];
|
|
12
|
+
}
|
|
13
|
+
export declare const defaultTextSyncOptions: TextSyncOptions;
|
|
14
|
+
type SyncCallbacks = {
|
|
15
|
+
textUpdated: (text: TranscriptionSegment) => void;
|
|
16
|
+
};
|
|
17
|
+
declare const TextAudioSynchronizer_base: new () => TypedEmitter<SyncCallbacks>;
|
|
18
|
+
export declare class TextAudioSynchronizer extends TextAudioSynchronizer_base {
|
|
19
|
+
#private;
|
|
20
|
+
constructor(opts: TextSyncOptions);
|
|
21
|
+
pushAudio(frame: AudioFrame): void;
|
|
22
|
+
pushText(text: string): void;
|
|
23
|
+
markAudioSegmentEnd(): void;
|
|
24
|
+
markTextSegmentEnd(): void;
|
|
25
|
+
segmentPlayoutStarted(): void;
|
|
26
|
+
segmentPlayoutFinished(): void;
|
|
27
|
+
get playedText(): string;
|
|
28
|
+
close(interrupt: boolean): Promise<void>;
|
|
29
|
+
}
|
|
30
|
+
export {};
|
|
31
|
+
//# sourceMappingURL=transcription.d.ts.map
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import type { SentenceTokenizer } from '../tokenize/index.js';
|
|
2
|
+
import type { ChunkedStream } from './tts.js';
|
|
3
|
+
import { SynthesizeStream, TTS } from './tts.js';
|
|
4
|
+
export declare class StreamAdapter extends TTS {
|
|
5
|
+
#private;
|
|
6
|
+
label: string;
|
|
7
|
+
constructor(tts: TTS, sentenceTokenizer: SentenceTokenizer);
|
|
8
|
+
synthesize(text: string): ChunkedStream;
|
|
9
|
+
stream(): StreamAdapterWrapper;
|
|
10
|
+
}
|
|
11
|
+
export declare class StreamAdapterWrapper extends SynthesizeStream {
|
|
12
|
+
#private;
|
|
13
|
+
label: string;
|
|
14
|
+
constructor(tts: TTS, sentenceTokenizer: SentenceTokenizer);
|
|
15
|
+
monitorMetrics(): Promise<void>;
|
|
16
|
+
}
|
|
17
|
+
//# sourceMappingURL=stream_adapter.d.ts.map
|
package/dist/tts/tts.cjs
CHANGED
|
@@ -79,7 +79,7 @@ class SynthesizeStream {
|
|
|
79
79
|
const metrics = {
|
|
80
80
|
timestamp: Date.now(),
|
|
81
81
|
requestId,
|
|
82
|
-
ttfb: Math.trunc(Number(ttfb / BigInt(1e6))),
|
|
82
|
+
ttfb: Math.trunc(Number((ttfb || BigInt(0)) / BigInt(1e6))),
|
|
83
83
|
duration: Math.trunc(Number(duration / BigInt(1e6))),
|
|
84
84
|
charactersCount: text.length,
|
|
85
85
|
audioDuration,
|
|
@@ -173,12 +173,12 @@ class ChunkedStream {
|
|
|
173
173
|
async monitorMetrics() {
|
|
174
174
|
const startTime = process.hrtime.bigint();
|
|
175
175
|
let audioDuration = 0;
|
|
176
|
-
let ttfb;
|
|
176
|
+
let ttfb = BigInt(-1);
|
|
177
177
|
let requestId = "";
|
|
178
178
|
for await (const audio of this.queue) {
|
|
179
179
|
this.output.put(audio);
|
|
180
180
|
requestId = audio.requestId;
|
|
181
|
-
if (!ttfb) {
|
|
181
|
+
if (ttfb === BigInt(-1)) {
|
|
182
182
|
ttfb = process.hrtime.bigint() - startTime;
|
|
183
183
|
}
|
|
184
184
|
audioDuration += audio.frame.samplesPerChannel / audio.frame.sampleRate;
|
|
@@ -188,7 +188,7 @@ class ChunkedStream {
|
|
|
188
188
|
const metrics = {
|
|
189
189
|
timestamp: Date.now(),
|
|
190
190
|
requestId,
|
|
191
|
-
ttfb: Math.trunc(Number(ttfb / BigInt(1e6))),
|
|
191
|
+
ttfb: ttfb === BigInt(-1) ? -1 : Math.trunc(Number(ttfb / BigInt(1e6))),
|
|
192
192
|
duration: Math.trunc(Number(duration / BigInt(1e6))),
|
|
193
193
|
charactersCount: this.#text.length,
|
|
194
194
|
audioDuration,
|
package/dist/tts/tts.cjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../src/tts/tts.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';\nimport { EventEmitter } from 'node:events';\nimport type { TTSMetrics } from '../metrics/base.js';\nimport { AsyncIterableQueue, mergeFrames } from '../utils.js';\n\n/** SynthesizedAudio is a packet of speech synthesis as returned by the TTS. */\nexport interface SynthesizedAudio {\n /** Request ID (one segment could be made up of multiple requests) */\n requestId: string;\n /** Segment ID, each segment is separated by a flush */\n segmentId: string;\n /** Synthesized audio frame */\n frame: AudioFrame;\n /** Current segment of the synthesized audio */\n deltaText?: string;\n /** Whether this is the last frame of the segment (streaming only) */\n final: boolean;\n}\n\n/**\n * Describes the capabilities of the TTS provider.\n *\n * @remarks\n * At present, only `streaming` is supplied to this interface, and the framework only supports\n * providers that do have a streaming endpoint.\n */\nexport interface TTSCapabilities {\n streaming: boolean;\n}\n\nexport enum TTSEvent {\n METRICS_COLLECTED,\n}\n\nexport type TTSCallbacks = {\n [TTSEvent.METRICS_COLLECTED]: (metrics: TTSMetrics) => void;\n};\n\n/**\n * An instance of a text-to-speech adapter.\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. 
Instead, use a provider plugin that\n * exports its own child TTS class, which inherits this class's methods.\n */\nexport abstract class TTS extends (EventEmitter as new () => TypedEmitter<TTSCallbacks>) {\n #capabilities: TTSCapabilities;\n #sampleRate: number;\n #numChannels: number;\n abstract label: string;\n\n constructor(sampleRate: number, numChannels: number, capabilities: TTSCapabilities) {\n super();\n this.#capabilities = capabilities;\n this.#sampleRate = sampleRate;\n this.#numChannels = numChannels;\n }\n\n /** Returns this TTS's capabilities */\n get capabilities(): TTSCapabilities {\n return this.#capabilities;\n }\n\n /** Returns the sample rate of audio frames returned by this TTS */\n get sampleRate(): number {\n return this.#sampleRate;\n }\n\n /** Returns the channel count of audio frames returned by this TTS */\n get numChannels(): number {\n return this.#numChannels;\n }\n\n /**\n * Receives text and returns synthesis in the form of a {@link ChunkedStream}\n */\n abstract synthesize(text: string): ChunkedStream;\n\n /**\n * Returns a {@link SynthesizeStream} that can be used to push text and receive audio data\n */\n abstract stream(): SynthesizeStream;\n}\n\n/**\n * An instance of a text-to-speech stream, as an asynchronous iterable iterator.\n *\n * @example Looping through frames\n * ```ts\n * for await (const event of stream) {\n * await source.captureFrame(event.frame);\n * }\n * ```\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. 
Instead, use a provider plugin that\n * exports its own child SynthesizeStream class, which inherits this class's methods.\n */\nexport abstract class SynthesizeStream\n implements AsyncIterableIterator<SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM>\n{\n protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');\n static readonly END_OF_STREAM = Symbol('END_OF_STREAM');\n protected input = new AsyncIterableQueue<string | typeof SynthesizeStream.FLUSH_SENTINEL>();\n protected queue = new AsyncIterableQueue<\n SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM\n >();\n protected output = new AsyncIterableQueue<\n SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM\n >();\n protected closed = false;\n abstract label: string;\n #tts: TTS;\n #metricsPendingTexts: string[] = [];\n #metricsText = '';\n #monitorMetricsTask?: Promise<void>;\n\n constructor(tts: TTS) {\n this.#tts = tts;\n }\n\n protected async monitorMetrics() {\n const startTime = process.hrtime.bigint();\n let audioDuration = 0;\n let ttfb: bigint | undefined;\n let requestId = '';\n\n const emit = () => {\n if (this.#metricsPendingTexts.length) {\n const text = this.#metricsPendingTexts.shift()!;\n const duration = process.hrtime.bigint() - startTime;\n const metrics: TTSMetrics = {\n timestamp: Date.now(),\n requestId,\n ttfb: Math.trunc(Number(ttfb! 
/ BigInt(1000000))),\n duration: Math.trunc(Number(duration / BigInt(1000000))),\n charactersCount: text.length,\n audioDuration,\n cancelled: false, // XXX(nbsp)\n label: this.label,\n streamed: false,\n };\n this.#tts.emit(TTSEvent.METRICS_COLLECTED, metrics);\n }\n };\n\n for await (const audio of this.queue) {\n this.output.put(audio);\n if (audio === SynthesizeStream.END_OF_STREAM) continue;\n requestId = audio.requestId;\n if (!ttfb) {\n ttfb = process.hrtime.bigint() - startTime;\n }\n audioDuration += audio.frame.samplesPerChannel / audio.frame.sampleRate;\n if (audio.final) {\n emit();\n }\n }\n\n if (requestId) {\n emit();\n }\n this.output.close();\n }\n\n /** Push a string of text to the TTS */\n pushText(text: string) {\n if (!this.#monitorMetricsTask) {\n this.#monitorMetricsTask = this.monitorMetrics();\n }\n this.#metricsText += text;\n\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(text);\n }\n\n /** Flush the TTS, causing it to process all pending text */\n flush() {\n if (this.#metricsText) {\n this.#metricsPendingTexts.push(this.#metricsText);\n this.#metricsText = '';\n }\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(SynthesizeStream.FLUSH_SENTINEL);\n }\n\n /** Mark the input as ended and forbid additional pushes */\n endInput() {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.close();\n }\n\n next(): Promise<IteratorResult<SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM>> {\n return this.output.next();\n }\n\n /** Close both the input and output of the TTS stream */\n close() {\n this.input.close();\n this.output.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): SynthesizeStream {\n return this;\n }\n}\n\n/**\n * An 
instance of a text-to-speech response, as an asynchronous iterable iterator.\n *\n * @example Looping through frames\n * ```ts\n * for await (const event of stream) {\n * await source.captureFrame(event.frame);\n * }\n * ```\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that\n * exports its own child ChunkedStream class, which inherits this class's methods.\n */\nexport abstract class ChunkedStream implements AsyncIterableIterator<SynthesizedAudio> {\n protected queue = new AsyncIterableQueue<SynthesizedAudio>();\n protected output = new AsyncIterableQueue<SynthesizedAudio>();\n protected closed = false;\n abstract label: string;\n #text: string;\n #tts: TTS;\n\n constructor(text: string, tts: TTS) {\n this.#text = text;\n this.#tts = tts;\n\n this.monitorMetrics();\n }\n\n protected async monitorMetrics() {\n const startTime = process.hrtime.bigint();\n let audioDuration = 0;\n let ttfb: bigint | undefined;\n let requestId = '';\n\n for await (const audio of this.queue) {\n this.output.put(audio);\n requestId = audio.requestId;\n if (!ttfb) {\n ttfb = process.hrtime.bigint() - startTime;\n }\n audioDuration += audio.frame.samplesPerChannel / audio.frame.sampleRate;\n }\n this.output.close();\n\n const duration = process.hrtime.bigint() - startTime;\n const metrics: TTSMetrics = {\n timestamp: Date.now(),\n requestId,\n ttfb: Math.trunc(Number(ttfb! 
/ BigInt(1000000))),\n duration: Math.trunc(Number(duration / BigInt(1000000))),\n charactersCount: this.#text.length,\n audioDuration,\n cancelled: false, // XXX(nbsp)\n label: this.label,\n streamed: false,\n };\n this.#tts.emit(TTSEvent.METRICS_COLLECTED, metrics);\n }\n\n /** Collect every frame into one in a single call */\n async collect(): Promise<AudioFrame> {\n const frames = [];\n for await (const event of this) {\n frames.push(event.frame);\n }\n return mergeFrames(frames);\n }\n\n next(): Promise<IteratorResult<SynthesizedAudio>> {\n return this.output.next();\n }\n\n /** Close both the input and output of the TTS stream */\n close() {\n this.queue.close();\n this.output.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): ChunkedStream {\n return this;\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAKA,yBAA6B;AAE7B,mBAAgD;AA2BzC,IAAK,WAAL,kBAAKA,cAAL;AACL,EAAAA,oBAAA;AADU,SAAAA;AAAA,GAAA;AAeL,MAAe,YAAa,gCAAsD;AAAA,EACvF;AAAA,EACA;AAAA,EACA;AAAA,EAGA,YAAY,YAAoB,aAAqB,cAA+B;AAClF,UAAM;AACN,SAAK,gBAAgB;AACrB,SAAK,cAAc;AACnB,SAAK,eAAe;AAAA,EACtB;AAAA;AAAA,EAGA,IAAI,eAAgC;AAClC,WAAO,KAAK;AAAA,EACd;AAAA;AAAA,EAGA,IAAI,aAAqB;AACvB,WAAO,KAAK;AAAA,EACd;AAAA;AAAA,EAGA,IAAI,cAAsB;AACxB,WAAO,KAAK;AAAA,EACd;AAWF;AAgBO,MAAe,iBAEtB;AAAA,EACE,OAA0B,iBAAiB,OAAO,gBAAgB;AAAA,EAClE,OAAgB,gBAAgB,OAAO,eAAe;AAAA,EAC5C,QAAQ,IAAI,gCAAoE;AAAA,EAChF,QAAQ,IAAI,gCAEpB;AAAA,EACQ,SAAS,IAAI,gCAErB;AAAA,EACQ,SAAS;AAAA,EAEnB;AAAA,EACA,uBAAiC,CAAC;AAAA,EAClC,eAAe;AAAA,EACf;AAAA,EAEA,YAAY,KAAU;AACpB,SAAK,OAAO;AAAA,EACd;AAAA,EAEA,MAAgB,iBAAiB;AAC/B,UAAM,YAAY,QAAQ,OAAO,OAAO;AACxC,QAAI,gBAAgB;AACpB,QAAI;AACJ,QAAI,YAAY;AAEhB,UAAM,OAAO,MAAM;AACjB,UAAI,KAAK,qBAAqB,QAAQ;AACpC,cAAM,OAAO,KAAK,qBAAqB,MAAM;AAC7C,cAAM,WAAW,QAAQ,OAAO,OAAO,IAAI;AAC3C,cAAM,UAAsB;AAAA,UAC1B,WAAW,KAAK,IAAI;AAAA,UACpB;AAAA,UACA,MAAM,KAAK,MAAM,OAAO,OAAQ,OAAO,GAAO,CAAC,CAAC;AAAA,UAChD,UAAU,KAAK,MAAM,OAAO,WAAW,OAAO,GAAO,CAAC,CAAC;AAAA,UACvD,iBAAiB,KAAK;AAAA,UACtB;AAAA,UACA,WAAW;A
AAA;AAAA,UACX,OAAO,KAAK;AAAA,UACZ,UAAU;AAAA,QACZ;AACA,aAAK,KAAK,KAAK,2BAA4B,OAAO;AAAA,MACpD;AAAA,IACF;AAEA,qBAAiB,SAAS,KAAK,OAAO;AACpC,WAAK,OAAO,IAAI,KAAK;AACrB,UAAI,UAAU,iBAAiB,cAAe;AAC9C,kBAAY,MAAM;AAClB,UAAI,CAAC,MAAM;AACT,eAAO,QAAQ,OAAO,OAAO,IAAI;AAAA,MACnC;AACA,uBAAiB,MAAM,MAAM,oBAAoB,MAAM,MAAM;AAC7D,UAAI,MAAM,OAAO;AACf,aAAK;AAAA,MACP;AAAA,IACF;AAEA,QAAI,WAAW;AACb,WAAK;AAAA,IACP;AACA,SAAK,OAAO,MAAM;AAAA,EACpB;AAAA;AAAA,EAGA,SAAS,MAAc;AACrB,QAAI,CAAC,KAAK,qBAAqB;AAC7B,WAAK,sBAAsB,KAAK,eAAe;AAAA,IACjD;AACA,SAAK,gBAAgB;AAErB,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,IAAI;AAAA,EACrB;AAAA;AAAA,EAGA,QAAQ;AACN,QAAI,KAAK,cAAc;AACrB,WAAK,qBAAqB,KAAK,KAAK,YAAY;AAChD,WAAK,eAAe;AAAA,IACtB;AACA,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,iBAAiB,cAAc;AAAA,EAChD;AAAA;AAAA,EAGA,WAAW;AACT,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,MAAM;AAAA,EACnB;AAAA,EAEA,OAA0F;AACxF,WAAO,KAAK,OAAO,KAAK;AAAA,EAC1B;AAAA;AAAA,EAGA,QAAQ;AACN,SAAK,MAAM,MAAM;AACjB,SAAK,OAAO,MAAM;AAClB,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAsB;AACzC,WAAO;AAAA,EACT;AACF;AAgBO,MAAe,cAAiE;AAAA,EAC3E,QAAQ,IAAI,gCAAqC;AAAA,EACjD,SAAS,IAAI,gCAAqC;AAAA,EAClD,SAAS;AAAA,EAEnB;AAAA,EACA;AAAA,EAEA,YAAY,MAAc,KAAU;AAClC,SAAK,QAAQ;AACb,SAAK,OAAO;AAEZ,SAAK,eAAe;AAAA,EACtB;AAAA,EAEA,MAAgB,iBAAiB;AAC/B,UAAM,YAAY,QAAQ,OAAO,OAAO;AACxC,QAAI,gBAAgB;AACpB,QAAI;AACJ,QAAI,YAAY;AAEhB,qBAAiB,SAAS,KAAK,OAAO;AACpC,WAAK,OAAO,IAAI,KAAK;AACrB,kBAAY,MAAM;AAClB,UAAI,CAAC,MAAM;AACT,eAAO,QAAQ,OAAO,OAAO,IAAI;AAAA,MACnC;AACA,uBAAiB,MAAM,MAAM,oBAAoB,MAAM,MAAM;AAAA,IAC/D;AACA,SAAK,OAAO,MAAM;AAElB,UAAM,WAAW,QAAQ,OAAO,OAAO,IAAI;AAC3C,UAAM,UAAsB;AAAA,MAC1B,WAAW,KAAK,IAAI;AAAA,MACpB;AAAA,MACA,MAAM,KAAK,MAAM,OAAO,OAAQ,OAAO,GAAO,CAAC,CAAC;AAAA,MAChD,UAAU,KAAK,MAAM,OAAO,WAAW,OAAO
,GAAO,CAAC,CAAC;AAAA,MACvD,iBAAiB,KAAK,MAAM;AAAA,MAC5B;AAAA,MACA,WAAW;AAAA;AAAA,MACX,OAAO,KAAK;AAAA,MACZ,UAAU;AAAA,IACZ;AACA,SAAK,KAAK,KAAK,2BAA4B,OAAO;AAAA,EACpD;AAAA;AAAA,EAGA,MAAM,UAA+B;AACnC,UAAM,SAAS,CAAC;AAChB,qBAAiB,SAAS,MAAM;AAC9B,aAAO,KAAK,MAAM,KAAK;AAAA,IACzB;AACA,eAAO,0BAAY,MAAM;AAAA,EAC3B;AAAA,EAEA,OAAkD;AAChD,WAAO,KAAK,OAAO,KAAK;AAAA,EAC1B;AAAA;AAAA,EAGA,QAAQ;AACN,SAAK,MAAM,MAAM;AACjB,SAAK,OAAO,MAAM;AAClB,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAmB;AACtC,WAAO;AAAA,EACT;AACF;","names":["TTSEvent"]}
|
|
1
|
+
{"version":3,"sources":["../../src/tts/tts.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';\nimport { EventEmitter } from 'node:events';\nimport type { TTSMetrics } from '../metrics/base.js';\nimport { AsyncIterableQueue, mergeFrames } from '../utils.js';\n\n/** SynthesizedAudio is a packet of speech synthesis as returned by the TTS. */\nexport interface SynthesizedAudio {\n /** Request ID (one segment could be made up of multiple requests) */\n requestId: string;\n /** Segment ID, each segment is separated by a flush */\n segmentId: string;\n /** Synthesized audio frame */\n frame: AudioFrame;\n /** Current segment of the synthesized audio */\n deltaText?: string;\n /** Whether this is the last frame of the segment (streaming only) */\n final: boolean;\n}\n\n/**\n * Describes the capabilities of the TTS provider.\n *\n * @remarks\n * At present, only `streaming` is supplied to this interface, and the framework only supports\n * providers that do have a streaming endpoint.\n */\nexport interface TTSCapabilities {\n streaming: boolean;\n}\n\nexport enum TTSEvent {\n METRICS_COLLECTED,\n}\n\nexport type TTSCallbacks = {\n [TTSEvent.METRICS_COLLECTED]: (metrics: TTSMetrics) => void;\n};\n\n/**\n * An instance of a text-to-speech adapter.\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. 
Instead, use a provider plugin that\n * exports its own child TTS class, which inherits this class's methods.\n */\nexport abstract class TTS extends (EventEmitter as new () => TypedEmitter<TTSCallbacks>) {\n #capabilities: TTSCapabilities;\n #sampleRate: number;\n #numChannels: number;\n abstract label: string;\n\n constructor(sampleRate: number, numChannels: number, capabilities: TTSCapabilities) {\n super();\n this.#capabilities = capabilities;\n this.#sampleRate = sampleRate;\n this.#numChannels = numChannels;\n }\n\n /** Returns this TTS's capabilities */\n get capabilities(): TTSCapabilities {\n return this.#capabilities;\n }\n\n /** Returns the sample rate of audio frames returned by this TTS */\n get sampleRate(): number {\n return this.#sampleRate;\n }\n\n /** Returns the channel count of audio frames returned by this TTS */\n get numChannels(): number {\n return this.#numChannels;\n }\n\n /**\n * Receives text and returns synthesis in the form of a {@link ChunkedStream}\n */\n abstract synthesize(text: string): ChunkedStream;\n\n /**\n * Returns a {@link SynthesizeStream} that can be used to push text and receive audio data\n */\n abstract stream(): SynthesizeStream;\n}\n\n/**\n * An instance of a text-to-speech stream, as an asynchronous iterable iterator.\n *\n * @example Looping through frames\n * ```ts\n * for await (const event of stream) {\n * await source.captureFrame(event.frame);\n * }\n * ```\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. 
Instead, use a provider plugin that\n * exports its own child SynthesizeStream class, which inherits this class's methods.\n */\nexport abstract class SynthesizeStream\n implements AsyncIterableIterator<SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM>\n{\n protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');\n static readonly END_OF_STREAM = Symbol('END_OF_STREAM');\n protected input = new AsyncIterableQueue<string | typeof SynthesizeStream.FLUSH_SENTINEL>();\n protected queue = new AsyncIterableQueue<\n SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM\n >();\n protected output = new AsyncIterableQueue<\n SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM\n >();\n protected closed = false;\n abstract label: string;\n #tts: TTS;\n #metricsPendingTexts: string[] = [];\n #metricsText = '';\n #monitorMetricsTask?: Promise<void>;\n\n constructor(tts: TTS) {\n this.#tts = tts;\n }\n\n protected async monitorMetrics() {\n const startTime = process.hrtime.bigint();\n let audioDuration = 0;\n let ttfb: bigint | undefined;\n let requestId = '';\n\n const emit = () => {\n if (this.#metricsPendingTexts.length) {\n const text = this.#metricsPendingTexts.shift()!;\n const duration = process.hrtime.bigint() - startTime;\n const metrics: TTSMetrics = {\n timestamp: Date.now(),\n requestId,\n ttfb: Math.trunc(Number((ttfb || BigInt(0)) / BigInt(1000000))),\n duration: Math.trunc(Number(duration / BigInt(1000000))),\n charactersCount: text.length,\n audioDuration,\n cancelled: false, // XXX(nbsp)\n label: this.label,\n streamed: false,\n };\n this.#tts.emit(TTSEvent.METRICS_COLLECTED, metrics);\n }\n };\n\n for await (const audio of this.queue) {\n this.output.put(audio);\n if (audio === SynthesizeStream.END_OF_STREAM) continue;\n requestId = audio.requestId;\n if (!ttfb) {\n ttfb = process.hrtime.bigint() - startTime;\n }\n audioDuration += audio.frame.samplesPerChannel / audio.frame.sampleRate;\n if (audio.final) {\n emit();\n }\n }\n\n if 
(requestId) {\n emit();\n }\n this.output.close();\n }\n\n /** Push a string of text to the TTS */\n pushText(text: string) {\n if (!this.#monitorMetricsTask) {\n this.#monitorMetricsTask = this.monitorMetrics();\n }\n this.#metricsText += text;\n\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(text);\n }\n\n /** Flush the TTS, causing it to process all pending text */\n flush() {\n if (this.#metricsText) {\n this.#metricsPendingTexts.push(this.#metricsText);\n this.#metricsText = '';\n }\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(SynthesizeStream.FLUSH_SENTINEL);\n }\n\n /** Mark the input as ended and forbid additional pushes */\n endInput() {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.close();\n }\n\n next(): Promise<IteratorResult<SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM>> {\n return this.output.next();\n }\n\n /** Close both the input and output of the TTS stream */\n close() {\n this.input.close();\n this.output.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): SynthesizeStream {\n return this;\n }\n}\n\n/**\n * An instance of a text-to-speech response, as an asynchronous iterable iterator.\n *\n * @example Looping through frames\n * ```ts\n * for await (const event of stream) {\n * await source.captureFrame(event.frame);\n * }\n * ```\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. 
Instead, use a provider plugin that\n * exports its own child ChunkedStream class, which inherits this class's methods.\n */\nexport abstract class ChunkedStream implements AsyncIterableIterator<SynthesizedAudio> {\n protected queue = new AsyncIterableQueue<SynthesizedAudio>();\n protected output = new AsyncIterableQueue<SynthesizedAudio>();\n protected closed = false;\n abstract label: string;\n #text: string;\n #tts: TTS;\n\n constructor(text: string, tts: TTS) {\n this.#text = text;\n this.#tts = tts;\n\n this.monitorMetrics();\n }\n\n protected async monitorMetrics() {\n const startTime = process.hrtime.bigint();\n let audioDuration = 0;\n let ttfb: bigint = BigInt(-1);\n let requestId = '';\n\n for await (const audio of this.queue) {\n this.output.put(audio);\n requestId = audio.requestId;\n if (ttfb === BigInt(-1)) {\n ttfb = process.hrtime.bigint() - startTime;\n }\n audioDuration += audio.frame.samplesPerChannel / audio.frame.sampleRate;\n }\n this.output.close();\n\n const duration = process.hrtime.bigint() - startTime;\n const metrics: TTSMetrics = {\n timestamp: Date.now(),\n requestId,\n ttfb: ttfb === BigInt(-1) ? 
-1 : Math.trunc(Number(ttfb / BigInt(1000000))),\n duration: Math.trunc(Number(duration / BigInt(1000000))),\n charactersCount: this.#text.length,\n audioDuration,\n cancelled: false, // XXX(nbsp)\n label: this.label,\n streamed: false,\n };\n this.#tts.emit(TTSEvent.METRICS_COLLECTED, metrics);\n }\n\n /** Collect every frame into one in a single call */\n async collect(): Promise<AudioFrame> {\n const frames = [];\n for await (const event of this) {\n frames.push(event.frame);\n }\n return mergeFrames(frames);\n }\n\n next(): Promise<IteratorResult<SynthesizedAudio>> {\n return this.output.next();\n }\n\n /** Close both the input and output of the TTS stream */\n close() {\n this.queue.close();\n this.output.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): ChunkedStream {\n return this;\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAKA,yBAA6B;AAE7B,mBAAgD;AA2BzC,IAAK,WAAL,kBAAKA,cAAL;AACL,EAAAA,oBAAA;AADU,SAAAA;AAAA,GAAA;AAeL,MAAe,YAAa,gCAAsD;AAAA,EACvF;AAAA,EACA;AAAA,EACA;AAAA,EAGA,YAAY,YAAoB,aAAqB,cAA+B;AAClF,UAAM;AACN,SAAK,gBAAgB;AACrB,SAAK,cAAc;AACnB,SAAK,eAAe;AAAA,EACtB;AAAA;AAAA,EAGA,IAAI,eAAgC;AAClC,WAAO,KAAK;AAAA,EACd;AAAA;AAAA,EAGA,IAAI,aAAqB;AACvB,WAAO,KAAK;AAAA,EACd;AAAA;AAAA,EAGA,IAAI,cAAsB;AACxB,WAAO,KAAK;AAAA,EACd;AAWF;AAgBO,MAAe,iBAEtB;AAAA,EACE,OAA0B,iBAAiB,OAAO,gBAAgB;AAAA,EAClE,OAAgB,gBAAgB,OAAO,eAAe;AAAA,EAC5C,QAAQ,IAAI,gCAAoE;AAAA,EAChF,QAAQ,IAAI,gCAEpB;AAAA,EACQ,SAAS,IAAI,gCAErB;AAAA,EACQ,SAAS;AAAA,EAEnB;AAAA,EACA,uBAAiC,CAAC;AAAA,EAClC,eAAe;AAAA,EACf;AAAA,EAEA,YAAY,KAAU;AACpB,SAAK,OAAO;AAAA,EACd;AAAA,EAEA,MAAgB,iBAAiB;AAC/B,UAAM,YAAY,QAAQ,OAAO,OAAO;AACxC,QAAI,gBAAgB;AACpB,QAAI;AACJ,QAAI,YAAY;AAEhB,UAAM,OAAO,MAAM;AACjB,UAAI,KAAK,qBAAqB,QAAQ;AACpC,cAAM,OAAO,KAAK,qBAAqB,MAAM;AAC7C,cAAM,WAAW,QAAQ,OAAO,OAAO,IAAI;AAC3C,cAAM,UAAsB;AAAA,UAC1B,WAAW,KAAK,IAAI;AAAA,UACpB;AAAA,UACA,MAAM,KAAK,MAAM,QAAQ,QAAQ,OAAO,CAAC,KAAK,OAAO,GAAO,CAAC,CAAC;AAAA,UAC9D,UAAU,KAAK,MAAM,OAAO,WAAW,OAAO,GAAO,CAAC,CAAC;AAAA,UA
CvD,iBAAiB,KAAK;AAAA,UACtB;AAAA,UACA,WAAW;AAAA;AAAA,UACX,OAAO,KAAK;AAAA,UACZ,UAAU;AAAA,QACZ;AACA,aAAK,KAAK,KAAK,2BAA4B,OAAO;AAAA,MACpD;AAAA,IACF;AAEA,qBAAiB,SAAS,KAAK,OAAO;AACpC,WAAK,OAAO,IAAI,KAAK;AACrB,UAAI,UAAU,iBAAiB,cAAe;AAC9C,kBAAY,MAAM;AAClB,UAAI,CAAC,MAAM;AACT,eAAO,QAAQ,OAAO,OAAO,IAAI;AAAA,MACnC;AACA,uBAAiB,MAAM,MAAM,oBAAoB,MAAM,MAAM;AAC7D,UAAI,MAAM,OAAO;AACf,aAAK;AAAA,MACP;AAAA,IACF;AAEA,QAAI,WAAW;AACb,WAAK;AAAA,IACP;AACA,SAAK,OAAO,MAAM;AAAA,EACpB;AAAA;AAAA,EAGA,SAAS,MAAc;AACrB,QAAI,CAAC,KAAK,qBAAqB;AAC7B,WAAK,sBAAsB,KAAK,eAAe;AAAA,IACjD;AACA,SAAK,gBAAgB;AAErB,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,IAAI;AAAA,EACrB;AAAA;AAAA,EAGA,QAAQ;AACN,QAAI,KAAK,cAAc;AACrB,WAAK,qBAAqB,KAAK,KAAK,YAAY;AAChD,WAAK,eAAe;AAAA,IACtB;AACA,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,iBAAiB,cAAc;AAAA,EAChD;AAAA;AAAA,EAGA,WAAW;AACT,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,MAAM;AAAA,EACnB;AAAA,EAEA,OAA0F;AACxF,WAAO,KAAK,OAAO,KAAK;AAAA,EAC1B;AAAA;AAAA,EAGA,QAAQ;AACN,SAAK,MAAM,MAAM;AACjB,SAAK,OAAO,MAAM;AAClB,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAsB;AACzC,WAAO;AAAA,EACT;AACF;AAgBO,MAAe,cAAiE;AAAA,EAC3E,QAAQ,IAAI,gCAAqC;AAAA,EACjD,SAAS,IAAI,gCAAqC;AAAA,EAClD,SAAS;AAAA,EAEnB;AAAA,EACA;AAAA,EAEA,YAAY,MAAc,KAAU;AAClC,SAAK,QAAQ;AACb,SAAK,OAAO;AAEZ,SAAK,eAAe;AAAA,EACtB;AAAA,EAEA,MAAgB,iBAAiB;AAC/B,UAAM,YAAY,QAAQ,OAAO,OAAO;AACxC,QAAI,gBAAgB;AACpB,QAAI,OAAe,OAAO,EAAE;AAC5B,QAAI,YAAY;AAEhB,qBAAiB,SAAS,KAAK,OAAO;AACpC,WAAK,OAAO,IAAI,KAAK;AACrB,kBAAY,MAAM;AAClB,UAAI,SAAS,OAAO,EAAE,GAAG;AACvB,eAAO,QAAQ,OAAO,OAAO,IAAI;AAAA,MACnC;AACA,uBAAiB,MAAM,MAAM,oBAAoB,MAAM,MAAM;AAAA,IAC/D;AACA,SAAK,OAAO,MAAM;AAElB,UAAM,WAAW,QAAQ,OAAO,OAAO,IAAI;AAC3C,UAAM,UAAsB;AAAA,MAC1B,WAAW,KAAK,IAAI;AAAA,MACpB;AAAA,MACA,MAAM,SAAS,OAAO,
EAAE,IAAI,KAAK,KAAK,MAAM,OAAO,OAAO,OAAO,GAAO,CAAC,CAAC;AAAA,MAC1E,UAAU,KAAK,MAAM,OAAO,WAAW,OAAO,GAAO,CAAC,CAAC;AAAA,MACvD,iBAAiB,KAAK,MAAM;AAAA,MAC5B;AAAA,MACA,WAAW;AAAA;AAAA,MACX,OAAO,KAAK;AAAA,MACZ,UAAU;AAAA,IACZ;AACA,SAAK,KAAK,KAAK,2BAA4B,OAAO;AAAA,EACpD;AAAA;AAAA,EAGA,MAAM,UAA+B;AACnC,UAAM,SAAS,CAAC;AAChB,qBAAiB,SAAS,MAAM;AAC9B,aAAO,KAAK,MAAM,KAAK;AAAA,IACzB;AACA,eAAO,0BAAY,MAAM;AAAA,EAC3B;AAAA,EAEA,OAAkD;AAChD,WAAO,KAAK,OAAO,KAAK;AAAA,EAC1B;AAAA;AAAA,EAGA,QAAQ;AACN,SAAK,MAAM,MAAM;AACjB,SAAK,OAAO,MAAM;AAClB,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAmB;AACtC,WAAO;AAAA,EACT;AACF;","names":["TTSEvent"]}
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
import type { AudioFrame } from '@livekit/rtc-node';
|
|
2
|
+
import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
|
|
3
|
+
import type { TTSMetrics } from '../metrics/base.js';
|
|
4
|
+
import { AsyncIterableQueue } from '../utils.js';
|
|
5
|
+
/** SynthesizedAudio is a packet of speech synthesis as returned by the TTS. */
|
|
6
|
+
export interface SynthesizedAudio {
|
|
7
|
+
/** Request ID (one segment could be made up of multiple requests) */
|
|
8
|
+
requestId: string;
|
|
9
|
+
/** Segment ID, each segment is separated by a flush */
|
|
10
|
+
segmentId: string;
|
|
11
|
+
/** Synthesized audio frame */
|
|
12
|
+
frame: AudioFrame;
|
|
13
|
+
/** Current segment of the synthesized audio */
|
|
14
|
+
deltaText?: string;
|
|
15
|
+
/** Whether this is the last frame of the segment (streaming only) */
|
|
16
|
+
final: boolean;
|
|
17
|
+
}
|
|
18
|
+
/**
|
|
19
|
+
* Describes the capabilities of the TTS provider.
|
|
20
|
+
*
|
|
21
|
+
* @remarks
|
|
22
|
+
* At present, only `streaming` is supplied to this interface, and the framework only supports
|
|
23
|
+
* providers that do have a streaming endpoint.
|
|
24
|
+
*/
|
|
25
|
+
export interface TTSCapabilities {
|
|
26
|
+
streaming: boolean;
|
|
27
|
+
}
|
|
28
|
+
export declare enum TTSEvent {
|
|
29
|
+
METRICS_COLLECTED = 0
|
|
30
|
+
}
|
|
31
|
+
export type TTSCallbacks = {
|
|
32
|
+
[TTSEvent.METRICS_COLLECTED]: (metrics: TTSMetrics) => void;
|
|
33
|
+
};
|
|
34
|
+
declare const TTS_base: new () => TypedEmitter<TTSCallbacks>;
|
|
35
|
+
/**
|
|
36
|
+
* An instance of a text-to-speech adapter.
|
|
37
|
+
*
|
|
38
|
+
* @remarks
|
|
39
|
+
* This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that
|
|
40
|
+
* exports its own child TTS class, which inherits this class's methods.
|
|
41
|
+
*/
|
|
42
|
+
export declare abstract class TTS extends TTS_base {
|
|
43
|
+
#private;
|
|
44
|
+
abstract label: string;
|
|
45
|
+
constructor(sampleRate: number, numChannels: number, capabilities: TTSCapabilities);
|
|
46
|
+
/** Returns this TTS's capabilities */
|
|
47
|
+
get capabilities(): TTSCapabilities;
|
|
48
|
+
/** Returns the sample rate of audio frames returned by this TTS */
|
|
49
|
+
get sampleRate(): number;
|
|
50
|
+
/** Returns the channel count of audio frames returned by this TTS */
|
|
51
|
+
get numChannels(): number;
|
|
52
|
+
/**
|
|
53
|
+
* Receives text and returns synthesis in the form of a {@link ChunkedStream}
|
|
54
|
+
*/
|
|
55
|
+
abstract synthesize(text: string): ChunkedStream;
|
|
56
|
+
/**
|
|
57
|
+
* Returns a {@link SynthesizeStream} that can be used to push text and receive audio data
|
|
58
|
+
*/
|
|
59
|
+
abstract stream(): SynthesizeStream;
|
|
60
|
+
}
|
|
61
|
+
/**
|
|
62
|
+
* An instance of a text-to-speech stream, as an asynchronous iterable iterator.
|
|
63
|
+
*
|
|
64
|
+
* @example Looping through frames
|
|
65
|
+
* ```ts
|
|
66
|
+
* for await (const event of stream) {
|
|
67
|
+
* await source.captureFrame(event.frame);
|
|
68
|
+
* }
|
|
69
|
+
* ```
|
|
70
|
+
*
|
|
71
|
+
* @remarks
|
|
72
|
+
* This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that
|
|
73
|
+
* exports its own child SynthesizeStream class, which inherits this class's methods.
|
|
74
|
+
*/
|
|
75
|
+
export declare abstract class SynthesizeStream implements AsyncIterableIterator<SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM> {
|
|
76
|
+
#private;
|
|
77
|
+
protected static readonly FLUSH_SENTINEL: unique symbol;
|
|
78
|
+
static readonly END_OF_STREAM: unique symbol;
|
|
79
|
+
protected input: AsyncIterableQueue<string | typeof SynthesizeStream.FLUSH_SENTINEL>;
|
|
80
|
+
protected queue: AsyncIterableQueue<SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM>;
|
|
81
|
+
protected output: AsyncIterableQueue<SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM>;
|
|
82
|
+
protected closed: boolean;
|
|
83
|
+
abstract label: string;
|
|
84
|
+
constructor(tts: TTS);
|
|
85
|
+
protected monitorMetrics(): Promise<void>;
|
|
86
|
+
/** Push a string of text to the TTS */
|
|
87
|
+
pushText(text: string): void;
|
|
88
|
+
/** Flush the TTS, causing it to process all pending text */
|
|
89
|
+
flush(): void;
|
|
90
|
+
/** Mark the input as ended and forbid additional pushes */
|
|
91
|
+
endInput(): void;
|
|
92
|
+
next(): Promise<IteratorResult<SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM>>;
|
|
93
|
+
/** Close both the input and output of the TTS stream */
|
|
94
|
+
close(): void;
|
|
95
|
+
[Symbol.asyncIterator](): SynthesizeStream;
|
|
96
|
+
}
|
|
97
|
+
/**
|
|
98
|
+
* An instance of a text-to-speech response, as an asynchronous iterable iterator.
|
|
99
|
+
*
|
|
100
|
+
* @example Looping through frames
|
|
101
|
+
* ```ts
|
|
102
|
+
* for await (const event of stream) {
|
|
103
|
+
* await source.captureFrame(event.frame);
|
|
104
|
+
* }
|
|
105
|
+
* ```
|
|
106
|
+
*
|
|
107
|
+
* @remarks
|
|
108
|
+
* This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that
|
|
109
|
+
* exports its own child ChunkedStream class, which inherits this class's methods.
|
|
110
|
+
*/
|
|
111
|
+
export declare abstract class ChunkedStream implements AsyncIterableIterator<SynthesizedAudio> {
|
|
112
|
+
#private;
|
|
113
|
+
protected queue: AsyncIterableQueue<SynthesizedAudio>;
|
|
114
|
+
protected output: AsyncIterableQueue<SynthesizedAudio>;
|
|
115
|
+
protected closed: boolean;
|
|
116
|
+
abstract label: string;
|
|
117
|
+
constructor(text: string, tts: TTS);
|
|
118
|
+
protected monitorMetrics(): Promise<void>;
|
|
119
|
+
/** Collect every frame into one in a single call */
|
|
120
|
+
collect(): Promise<AudioFrame>;
|
|
121
|
+
next(): Promise<IteratorResult<SynthesizedAudio>>;
|
|
122
|
+
/** Close both the input and output of the TTS stream */
|
|
123
|
+
close(): void;
|
|
124
|
+
[Symbol.asyncIterator](): ChunkedStream;
|
|
125
|
+
}
|
|
126
|
+
export {};
|
|
127
|
+
//# sourceMappingURL=tts.d.ts.map
|
package/dist/tts/tts.js
CHANGED
|
@@ -53,7 +53,7 @@ class SynthesizeStream {
|
|
|
53
53
|
const metrics = {
|
|
54
54
|
timestamp: Date.now(),
|
|
55
55
|
requestId,
|
|
56
|
-
ttfb: Math.trunc(Number(ttfb / BigInt(1e6))),
|
|
56
|
+
ttfb: Math.trunc(Number((ttfb || BigInt(0)) / BigInt(1e6))),
|
|
57
57
|
duration: Math.trunc(Number(duration / BigInt(1e6))),
|
|
58
58
|
charactersCount: text.length,
|
|
59
59
|
audioDuration,
|
|
@@ -147,12 +147,12 @@ class ChunkedStream {
|
|
|
147
147
|
async monitorMetrics() {
|
|
148
148
|
const startTime = process.hrtime.bigint();
|
|
149
149
|
let audioDuration = 0;
|
|
150
|
-
let ttfb;
|
|
150
|
+
let ttfb = BigInt(-1);
|
|
151
151
|
let requestId = "";
|
|
152
152
|
for await (const audio of this.queue) {
|
|
153
153
|
this.output.put(audio);
|
|
154
154
|
requestId = audio.requestId;
|
|
155
|
-
if (
|
|
155
|
+
if (ttfb === BigInt(-1)) {
|
|
156
156
|
ttfb = process.hrtime.bigint() - startTime;
|
|
157
157
|
}
|
|
158
158
|
audioDuration += audio.frame.samplesPerChannel / audio.frame.sampleRate;
|
|
@@ -162,7 +162,7 @@ class ChunkedStream {
|
|
|
162
162
|
const metrics = {
|
|
163
163
|
timestamp: Date.now(),
|
|
164
164
|
requestId,
|
|
165
|
-
ttfb: Math.trunc(Number(ttfb / BigInt(1e6))),
|
|
165
|
+
ttfb: ttfb === BigInt(-1) ? -1 : Math.trunc(Number(ttfb / BigInt(1e6))),
|
|
166
166
|
duration: Math.trunc(Number(duration / BigInt(1e6))),
|
|
167
167
|
charactersCount: this.#text.length,
|
|
168
168
|
audioDuration,
|
package/dist/tts/tts.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../src/tts/tts.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';\nimport { EventEmitter } from 'node:events';\nimport type { TTSMetrics } from '../metrics/base.js';\nimport { AsyncIterableQueue, mergeFrames } from '../utils.js';\n\n/** SynthesizedAudio is a packet of speech synthesis as returned by the TTS. */\nexport interface SynthesizedAudio {\n /** Request ID (one segment could be made up of multiple requests) */\n requestId: string;\n /** Segment ID, each segment is separated by a flush */\n segmentId: string;\n /** Synthesized audio frame */\n frame: AudioFrame;\n /** Current segment of the synthesized audio */\n deltaText?: string;\n /** Whether this is the last frame of the segment (streaming only) */\n final: boolean;\n}\n\n/**\n * Describes the capabilities of the TTS provider.\n *\n * @remarks\n * At present, only `streaming` is supplied to this interface, and the framework only supports\n * providers that do have a streaming endpoint.\n */\nexport interface TTSCapabilities {\n streaming: boolean;\n}\n\nexport enum TTSEvent {\n METRICS_COLLECTED,\n}\n\nexport type TTSCallbacks = {\n [TTSEvent.METRICS_COLLECTED]: (metrics: TTSMetrics) => void;\n};\n\n/**\n * An instance of a text-to-speech adapter.\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. 
Instead, use a provider plugin that\n * exports its own child TTS class, which inherits this class's methods.\n */\nexport abstract class TTS extends (EventEmitter as new () => TypedEmitter<TTSCallbacks>) {\n #capabilities: TTSCapabilities;\n #sampleRate: number;\n #numChannels: number;\n abstract label: string;\n\n constructor(sampleRate: number, numChannels: number, capabilities: TTSCapabilities) {\n super();\n this.#capabilities = capabilities;\n this.#sampleRate = sampleRate;\n this.#numChannels = numChannels;\n }\n\n /** Returns this TTS's capabilities */\n get capabilities(): TTSCapabilities {\n return this.#capabilities;\n }\n\n /** Returns the sample rate of audio frames returned by this TTS */\n get sampleRate(): number {\n return this.#sampleRate;\n }\n\n /** Returns the channel count of audio frames returned by this TTS */\n get numChannels(): number {\n return this.#numChannels;\n }\n\n /**\n * Receives text and returns synthesis in the form of a {@link ChunkedStream}\n */\n abstract synthesize(text: string): ChunkedStream;\n\n /**\n * Returns a {@link SynthesizeStream} that can be used to push text and receive audio data\n */\n abstract stream(): SynthesizeStream;\n}\n\n/**\n * An instance of a text-to-speech stream, as an asynchronous iterable iterator.\n *\n * @example Looping through frames\n * ```ts\n * for await (const event of stream) {\n * await source.captureFrame(event.frame);\n * }\n * ```\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. 
Instead, use a provider plugin that\n * exports its own child SynthesizeStream class, which inherits this class's methods.\n */\nexport abstract class SynthesizeStream\n implements AsyncIterableIterator<SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM>\n{\n protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');\n static readonly END_OF_STREAM = Symbol('END_OF_STREAM');\n protected input = new AsyncIterableQueue<string | typeof SynthesizeStream.FLUSH_SENTINEL>();\n protected queue = new AsyncIterableQueue<\n SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM\n >();\n protected output = new AsyncIterableQueue<\n SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM\n >();\n protected closed = false;\n abstract label: string;\n #tts: TTS;\n #metricsPendingTexts: string[] = [];\n #metricsText = '';\n #monitorMetricsTask?: Promise<void>;\n\n constructor(tts: TTS) {\n this.#tts = tts;\n }\n\n protected async monitorMetrics() {\n const startTime = process.hrtime.bigint();\n let audioDuration = 0;\n let ttfb: bigint | undefined;\n let requestId = '';\n\n const emit = () => {\n if (this.#metricsPendingTexts.length) {\n const text = this.#metricsPendingTexts.shift()!;\n const duration = process.hrtime.bigint() - startTime;\n const metrics: TTSMetrics = {\n timestamp: Date.now(),\n requestId,\n ttfb: Math.trunc(Number(ttfb! 
/ BigInt(1000000))),\n duration: Math.trunc(Number(duration / BigInt(1000000))),\n charactersCount: text.length,\n audioDuration,\n cancelled: false, // XXX(nbsp)\n label: this.label,\n streamed: false,\n };\n this.#tts.emit(TTSEvent.METRICS_COLLECTED, metrics);\n }\n };\n\n for await (const audio of this.queue) {\n this.output.put(audio);\n if (audio === SynthesizeStream.END_OF_STREAM) continue;\n requestId = audio.requestId;\n if (!ttfb) {\n ttfb = process.hrtime.bigint() - startTime;\n }\n audioDuration += audio.frame.samplesPerChannel / audio.frame.sampleRate;\n if (audio.final) {\n emit();\n }\n }\n\n if (requestId) {\n emit();\n }\n this.output.close();\n }\n\n /** Push a string of text to the TTS */\n pushText(text: string) {\n if (!this.#monitorMetricsTask) {\n this.#monitorMetricsTask = this.monitorMetrics();\n }\n this.#metricsText += text;\n\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(text);\n }\n\n /** Flush the TTS, causing it to process all pending text */\n flush() {\n if (this.#metricsText) {\n this.#metricsPendingTexts.push(this.#metricsText);\n this.#metricsText = '';\n }\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(SynthesizeStream.FLUSH_SENTINEL);\n }\n\n /** Mark the input as ended and forbid additional pushes */\n endInput() {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.close();\n }\n\n next(): Promise<IteratorResult<SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM>> {\n return this.output.next();\n }\n\n /** Close both the input and output of the TTS stream */\n close() {\n this.input.close();\n this.output.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): SynthesizeStream {\n return this;\n }\n}\n\n/**\n * An 
instance of a text-to-speech response, as an asynchronous iterable iterator.\n *\n * @example Looping through frames\n * ```ts\n * for await (const event of stream) {\n * await source.captureFrame(event.frame);\n * }\n * ```\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that\n * exports its own child ChunkedStream class, which inherits this class's methods.\n */\nexport abstract class ChunkedStream implements AsyncIterableIterator<SynthesizedAudio> {\n protected queue = new AsyncIterableQueue<SynthesizedAudio>();\n protected output = new AsyncIterableQueue<SynthesizedAudio>();\n protected closed = false;\n abstract label: string;\n #text: string;\n #tts: TTS;\n\n constructor(text: string, tts: TTS) {\n this.#text = text;\n this.#tts = tts;\n\n this.monitorMetrics();\n }\n\n protected async monitorMetrics() {\n const startTime = process.hrtime.bigint();\n let audioDuration = 0;\n let ttfb: bigint | undefined;\n let requestId = '';\n\n for await (const audio of this.queue) {\n this.output.put(audio);\n requestId = audio.requestId;\n if (!ttfb) {\n ttfb = process.hrtime.bigint() - startTime;\n }\n audioDuration += audio.frame.samplesPerChannel / audio.frame.sampleRate;\n }\n this.output.close();\n\n const duration = process.hrtime.bigint() - startTime;\n const metrics: TTSMetrics = {\n timestamp: Date.now(),\n requestId,\n ttfb: Math.trunc(Number(ttfb! 
/ BigInt(1000000))),\n duration: Math.trunc(Number(duration / BigInt(1000000))),\n charactersCount: this.#text.length,\n audioDuration,\n cancelled: false, // XXX(nbsp)\n label: this.label,\n streamed: false,\n };\n this.#tts.emit(TTSEvent.METRICS_COLLECTED, metrics);\n }\n\n /** Collect every frame into one in a single call */\n async collect(): Promise<AudioFrame> {\n const frames = [];\n for await (const event of this) {\n frames.push(event.frame);\n }\n return mergeFrames(frames);\n }\n\n next(): Promise<IteratorResult<SynthesizedAudio>> {\n return this.output.next();\n }\n\n /** Close both the input and output of the TTS stream */\n close() {\n this.queue.close();\n this.output.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): ChunkedStream {\n return this;\n }\n}\n"],"mappings":"AAKA,SAAS,oBAAoB;AAE7B,SAAS,oBAAoB,mBAAmB;AA2BzC,IAAK,WAAL,kBAAKA,cAAL;AACL,EAAAA,oBAAA;AADU,SAAAA;AAAA,GAAA;AAeL,MAAe,YAAa,aAAsD;AAAA,EACvF;AAAA,EACA;AAAA,EACA;AAAA,EAGA,YAAY,YAAoB,aAAqB,cAA+B;AAClF,UAAM;AACN,SAAK,gBAAgB;AACrB,SAAK,cAAc;AACnB,SAAK,eAAe;AAAA,EACtB;AAAA;AAAA,EAGA,IAAI,eAAgC;AAClC,WAAO,KAAK;AAAA,EACd;AAAA;AAAA,EAGA,IAAI,aAAqB;AACvB,WAAO,KAAK;AAAA,EACd;AAAA;AAAA,EAGA,IAAI,cAAsB;AACxB,WAAO,KAAK;AAAA,EACd;AAWF;AAgBO,MAAe,iBAEtB;AAAA,EACE,OAA0B,iBAAiB,OAAO,gBAAgB;AAAA,EAClE,OAAgB,gBAAgB,OAAO,eAAe;AAAA,EAC5C,QAAQ,IAAI,mBAAoE;AAAA,EAChF,QAAQ,IAAI,mBAEpB;AAAA,EACQ,SAAS,IAAI,mBAErB;AAAA,EACQ,SAAS;AAAA,EAEnB;AAAA,EACA,uBAAiC,CAAC;AAAA,EAClC,eAAe;AAAA,EACf;AAAA,EAEA,YAAY,KAAU;AACpB,SAAK,OAAO;AAAA,EACd;AAAA,EAEA,MAAgB,iBAAiB;AAC/B,UAAM,YAAY,QAAQ,OAAO,OAAO;AACxC,QAAI,gBAAgB;AACpB,QAAI;AACJ,QAAI,YAAY;AAEhB,UAAM,OAAO,MAAM;AACjB,UAAI,KAAK,qBAAqB,QAAQ;AACpC,cAAM,OAAO,KAAK,qBAAqB,MAAM;AAC7C,cAAM,WAAW,QAAQ,OAAO,OAAO,IAAI;AAC3C,cAAM,UAAsB;AAAA,UAC1B,WAAW,KAAK,IAAI;AAAA,UACpB;AAAA,UACA,MAAM,KAAK,MAAM,OAAO,OAAQ,OAAO,GAAO,CAAC,CAAC;AAAA,UAChD,UAAU,KAAK,MAAM,OAAO,WAAW,OAAO,GAAO,CAAC,CAAC;AAAA,UACvD,iBAAiB,KAAK;AAAA,UACtB;AAAA,UACA,WAAW;AAAA;AAAA,UACX,OAAO,KAAK;AAAA,UACZ,UAAU;AAA
A,QACZ;AACA,aAAK,KAAK,KAAK,2BAA4B,OAAO;AAAA,MACpD;AAAA,IACF;AAEA,qBAAiB,SAAS,KAAK,OAAO;AACpC,WAAK,OAAO,IAAI,KAAK;AACrB,UAAI,UAAU,iBAAiB,cAAe;AAC9C,kBAAY,MAAM;AAClB,UAAI,CAAC,MAAM;AACT,eAAO,QAAQ,OAAO,OAAO,IAAI;AAAA,MACnC;AACA,uBAAiB,MAAM,MAAM,oBAAoB,MAAM,MAAM;AAC7D,UAAI,MAAM,OAAO;AACf,aAAK;AAAA,MACP;AAAA,IACF;AAEA,QAAI,WAAW;AACb,WAAK;AAAA,IACP;AACA,SAAK,OAAO,MAAM;AAAA,EACpB;AAAA;AAAA,EAGA,SAAS,MAAc;AACrB,QAAI,CAAC,KAAK,qBAAqB;AAC7B,WAAK,sBAAsB,KAAK,eAAe;AAAA,IACjD;AACA,SAAK,gBAAgB;AAErB,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,IAAI;AAAA,EACrB;AAAA;AAAA,EAGA,QAAQ;AACN,QAAI,KAAK,cAAc;AACrB,WAAK,qBAAqB,KAAK,KAAK,YAAY;AAChD,WAAK,eAAe;AAAA,IACtB;AACA,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,iBAAiB,cAAc;AAAA,EAChD;AAAA;AAAA,EAGA,WAAW;AACT,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,MAAM;AAAA,EACnB;AAAA,EAEA,OAA0F;AACxF,WAAO,KAAK,OAAO,KAAK;AAAA,EAC1B;AAAA;AAAA,EAGA,QAAQ;AACN,SAAK,MAAM,MAAM;AACjB,SAAK,OAAO,MAAM;AAClB,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAsB;AACzC,WAAO;AAAA,EACT;AACF;AAgBO,MAAe,cAAiE;AAAA,EAC3E,QAAQ,IAAI,mBAAqC;AAAA,EACjD,SAAS,IAAI,mBAAqC;AAAA,EAClD,SAAS;AAAA,EAEnB;AAAA,EACA;AAAA,EAEA,YAAY,MAAc,KAAU;AAClC,SAAK,QAAQ;AACb,SAAK,OAAO;AAEZ,SAAK,eAAe;AAAA,EACtB;AAAA,EAEA,MAAgB,iBAAiB;AAC/B,UAAM,YAAY,QAAQ,OAAO,OAAO;AACxC,QAAI,gBAAgB;AACpB,QAAI;AACJ,QAAI,YAAY;AAEhB,qBAAiB,SAAS,KAAK,OAAO;AACpC,WAAK,OAAO,IAAI,KAAK;AACrB,kBAAY,MAAM;AAClB,UAAI,CAAC,MAAM;AACT,eAAO,QAAQ,OAAO,OAAO,IAAI;AAAA,MACnC;AACA,uBAAiB,MAAM,MAAM,oBAAoB,MAAM,MAAM;AAAA,IAC/D;AACA,SAAK,OAAO,MAAM;AAElB,UAAM,WAAW,QAAQ,OAAO,OAAO,IAAI;AAC3C,UAAM,UAAsB;AAAA,MAC1B,WAAW,KAAK,IAAI;AAAA,MACpB;AAAA,MACA,MAAM,KAAK,MAAM,OAAO,OAAQ,OAAO,GAAO,CAAC,CAAC;AAAA,MAChD,UAAU,KAAK,MAAM,OAAO,WAAW,OAAO,GAAO,CAAC,CAAC;AAAA,MACvD,iBAAiB,KAAK,MAA
M;AAAA,MAC5B;AAAA,MACA,WAAW;AAAA;AAAA,MACX,OAAO,KAAK;AAAA,MACZ,UAAU;AAAA,IACZ;AACA,SAAK,KAAK,KAAK,2BAA4B,OAAO;AAAA,EACpD;AAAA;AAAA,EAGA,MAAM,UAA+B;AACnC,UAAM,SAAS,CAAC;AAChB,qBAAiB,SAAS,MAAM;AAC9B,aAAO,KAAK,MAAM,KAAK;AAAA,IACzB;AACA,WAAO,YAAY,MAAM;AAAA,EAC3B;AAAA,EAEA,OAAkD;AAChD,WAAO,KAAK,OAAO,KAAK;AAAA,EAC1B;AAAA;AAAA,EAGA,QAAQ;AACN,SAAK,MAAM,MAAM;AACjB,SAAK,OAAO,MAAM;AAClB,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAmB;AACtC,WAAO;AAAA,EACT;AACF;","names":["TTSEvent"]}
|
|
1
|
+
{"version":3,"sources":["../../src/tts/tts.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';\nimport { EventEmitter } from 'node:events';\nimport type { TTSMetrics } from '../metrics/base.js';\nimport { AsyncIterableQueue, mergeFrames } from '../utils.js';\n\n/** SynthesizedAudio is a packet of speech synthesis as returned by the TTS. */\nexport interface SynthesizedAudio {\n /** Request ID (one segment could be made up of multiple requests) */\n requestId: string;\n /** Segment ID, each segment is separated by a flush */\n segmentId: string;\n /** Synthesized audio frame */\n frame: AudioFrame;\n /** Current segment of the synthesized audio */\n deltaText?: string;\n /** Whether this is the last frame of the segment (streaming only) */\n final: boolean;\n}\n\n/**\n * Describes the capabilities of the TTS provider.\n *\n * @remarks\n * At present, only `streaming` is supplied to this interface, and the framework only supports\n * providers that do have a streaming endpoint.\n */\nexport interface TTSCapabilities {\n streaming: boolean;\n}\n\nexport enum TTSEvent {\n METRICS_COLLECTED,\n}\n\nexport type TTSCallbacks = {\n [TTSEvent.METRICS_COLLECTED]: (metrics: TTSMetrics) => void;\n};\n\n/**\n * An instance of a text-to-speech adapter.\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. 
Instead, use a provider plugin that\n * exports its own child TTS class, which inherits this class's methods.\n */\nexport abstract class TTS extends (EventEmitter as new () => TypedEmitter<TTSCallbacks>) {\n #capabilities: TTSCapabilities;\n #sampleRate: number;\n #numChannels: number;\n abstract label: string;\n\n constructor(sampleRate: number, numChannels: number, capabilities: TTSCapabilities) {\n super();\n this.#capabilities = capabilities;\n this.#sampleRate = sampleRate;\n this.#numChannels = numChannels;\n }\n\n /** Returns this TTS's capabilities */\n get capabilities(): TTSCapabilities {\n return this.#capabilities;\n }\n\n /** Returns the sample rate of audio frames returned by this TTS */\n get sampleRate(): number {\n return this.#sampleRate;\n }\n\n /** Returns the channel count of audio frames returned by this TTS */\n get numChannels(): number {\n return this.#numChannels;\n }\n\n /**\n * Receives text and returns synthesis in the form of a {@link ChunkedStream}\n */\n abstract synthesize(text: string): ChunkedStream;\n\n /**\n * Returns a {@link SynthesizeStream} that can be used to push text and receive audio data\n */\n abstract stream(): SynthesizeStream;\n}\n\n/**\n * An instance of a text-to-speech stream, as an asynchronous iterable iterator.\n *\n * @example Looping through frames\n * ```ts\n * for await (const event of stream) {\n * await source.captureFrame(event.frame);\n * }\n * ```\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. 
Instead, use a provider plugin that\n * exports its own child SynthesizeStream class, which inherits this class's methods.\n */\nexport abstract class SynthesizeStream\n implements AsyncIterableIterator<SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM>\n{\n protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');\n static readonly END_OF_STREAM = Symbol('END_OF_STREAM');\n protected input = new AsyncIterableQueue<string | typeof SynthesizeStream.FLUSH_SENTINEL>();\n protected queue = new AsyncIterableQueue<\n SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM\n >();\n protected output = new AsyncIterableQueue<\n SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM\n >();\n protected closed = false;\n abstract label: string;\n #tts: TTS;\n #metricsPendingTexts: string[] = [];\n #metricsText = '';\n #monitorMetricsTask?: Promise<void>;\n\n constructor(tts: TTS) {\n this.#tts = tts;\n }\n\n protected async monitorMetrics() {\n const startTime = process.hrtime.bigint();\n let audioDuration = 0;\n let ttfb: bigint | undefined;\n let requestId = '';\n\n const emit = () => {\n if (this.#metricsPendingTexts.length) {\n const text = this.#metricsPendingTexts.shift()!;\n const duration = process.hrtime.bigint() - startTime;\n const metrics: TTSMetrics = {\n timestamp: Date.now(),\n requestId,\n ttfb: Math.trunc(Number((ttfb || BigInt(0)) / BigInt(1000000))),\n duration: Math.trunc(Number(duration / BigInt(1000000))),\n charactersCount: text.length,\n audioDuration,\n cancelled: false, // XXX(nbsp)\n label: this.label,\n streamed: false,\n };\n this.#tts.emit(TTSEvent.METRICS_COLLECTED, metrics);\n }\n };\n\n for await (const audio of this.queue) {\n this.output.put(audio);\n if (audio === SynthesizeStream.END_OF_STREAM) continue;\n requestId = audio.requestId;\n if (!ttfb) {\n ttfb = process.hrtime.bigint() - startTime;\n }\n audioDuration += audio.frame.samplesPerChannel / audio.frame.sampleRate;\n if (audio.final) {\n emit();\n }\n }\n\n if 
(requestId) {\n emit();\n }\n this.output.close();\n }\n\n /** Push a string of text to the TTS */\n pushText(text: string) {\n if (!this.#monitorMetricsTask) {\n this.#monitorMetricsTask = this.monitorMetrics();\n }\n this.#metricsText += text;\n\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(text);\n }\n\n /** Flush the TTS, causing it to process all pending text */\n flush() {\n if (this.#metricsText) {\n this.#metricsPendingTexts.push(this.#metricsText);\n this.#metricsText = '';\n }\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(SynthesizeStream.FLUSH_SENTINEL);\n }\n\n /** Mark the input as ended and forbid additional pushes */\n endInput() {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.close();\n }\n\n next(): Promise<IteratorResult<SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM>> {\n return this.output.next();\n }\n\n /** Close both the input and output of the TTS stream */\n close() {\n this.input.close();\n this.output.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): SynthesizeStream {\n return this;\n }\n}\n\n/**\n * An instance of a text-to-speech response, as an asynchronous iterable iterator.\n *\n * @example Looping through frames\n * ```ts\n * for await (const event of stream) {\n * await source.captureFrame(event.frame);\n * }\n * ```\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. 
Instead, use a provider plugin that\n * exports its own child ChunkedStream class, which inherits this class's methods.\n */\nexport abstract class ChunkedStream implements AsyncIterableIterator<SynthesizedAudio> {\n protected queue = new AsyncIterableQueue<SynthesizedAudio>();\n protected output = new AsyncIterableQueue<SynthesizedAudio>();\n protected closed = false;\n abstract label: string;\n #text: string;\n #tts: TTS;\n\n constructor(text: string, tts: TTS) {\n this.#text = text;\n this.#tts = tts;\n\n this.monitorMetrics();\n }\n\n protected async monitorMetrics() {\n const startTime = process.hrtime.bigint();\n let audioDuration = 0;\n let ttfb: bigint = BigInt(-1);\n let requestId = '';\n\n for await (const audio of this.queue) {\n this.output.put(audio);\n requestId = audio.requestId;\n if (ttfb === BigInt(-1)) {\n ttfb = process.hrtime.bigint() - startTime;\n }\n audioDuration += audio.frame.samplesPerChannel / audio.frame.sampleRate;\n }\n this.output.close();\n\n const duration = process.hrtime.bigint() - startTime;\n const metrics: TTSMetrics = {\n timestamp: Date.now(),\n requestId,\n ttfb: ttfb === BigInt(-1) ? 
-1 : Math.trunc(Number(ttfb / BigInt(1000000))),\n duration: Math.trunc(Number(duration / BigInt(1000000))),\n charactersCount: this.#text.length,\n audioDuration,\n cancelled: false, // XXX(nbsp)\n label: this.label,\n streamed: false,\n };\n this.#tts.emit(TTSEvent.METRICS_COLLECTED, metrics);\n }\n\n /** Collect every frame into one in a single call */\n async collect(): Promise<AudioFrame> {\n const frames = [];\n for await (const event of this) {\n frames.push(event.frame);\n }\n return mergeFrames(frames);\n }\n\n next(): Promise<IteratorResult<SynthesizedAudio>> {\n return this.output.next();\n }\n\n /** Close both the input and output of the TTS stream */\n close() {\n this.queue.close();\n this.output.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): ChunkedStream {\n return this;\n }\n}\n"],"mappings":"AAKA,SAAS,oBAAoB;AAE7B,SAAS,oBAAoB,mBAAmB;AA2BzC,IAAK,WAAL,kBAAKA,cAAL;AACL,EAAAA,oBAAA;AADU,SAAAA;AAAA,GAAA;AAeL,MAAe,YAAa,aAAsD;AAAA,EACvF;AAAA,EACA;AAAA,EACA;AAAA,EAGA,YAAY,YAAoB,aAAqB,cAA+B;AAClF,UAAM;AACN,SAAK,gBAAgB;AACrB,SAAK,cAAc;AACnB,SAAK,eAAe;AAAA,EACtB;AAAA;AAAA,EAGA,IAAI,eAAgC;AAClC,WAAO,KAAK;AAAA,EACd;AAAA;AAAA,EAGA,IAAI,aAAqB;AACvB,WAAO,KAAK;AAAA,EACd;AAAA;AAAA,EAGA,IAAI,cAAsB;AACxB,WAAO,KAAK;AAAA,EACd;AAWF;AAgBO,MAAe,iBAEtB;AAAA,EACE,OAA0B,iBAAiB,OAAO,gBAAgB;AAAA,EAClE,OAAgB,gBAAgB,OAAO,eAAe;AAAA,EAC5C,QAAQ,IAAI,mBAAoE;AAAA,EAChF,QAAQ,IAAI,mBAEpB;AAAA,EACQ,SAAS,IAAI,mBAErB;AAAA,EACQ,SAAS;AAAA,EAEnB;AAAA,EACA,uBAAiC,CAAC;AAAA,EAClC,eAAe;AAAA,EACf;AAAA,EAEA,YAAY,KAAU;AACpB,SAAK,OAAO;AAAA,EACd;AAAA,EAEA,MAAgB,iBAAiB;AAC/B,UAAM,YAAY,QAAQ,OAAO,OAAO;AACxC,QAAI,gBAAgB;AACpB,QAAI;AACJ,QAAI,YAAY;AAEhB,UAAM,OAAO,MAAM;AACjB,UAAI,KAAK,qBAAqB,QAAQ;AACpC,cAAM,OAAO,KAAK,qBAAqB,MAAM;AAC7C,cAAM,WAAW,QAAQ,OAAO,OAAO,IAAI;AAC3C,cAAM,UAAsB;AAAA,UAC1B,WAAW,KAAK,IAAI;AAAA,UACpB;AAAA,UACA,MAAM,KAAK,MAAM,QAAQ,QAAQ,OAAO,CAAC,KAAK,OAAO,GAAO,CAAC,CAAC;AAAA,UAC9D,UAAU,KAAK,MAAM,OAAO,WAAW,OAAO,GAAO,CAAC,CAAC;AAAA,UACvD,iBAAiB,KAAK;AAAA,UACtB;AAAA,UACA,WAAW;
AAAA;AAAA,UACX,OAAO,KAAK;AAAA,UACZ,UAAU;AAAA,QACZ;AACA,aAAK,KAAK,KAAK,2BAA4B,OAAO;AAAA,MACpD;AAAA,IACF;AAEA,qBAAiB,SAAS,KAAK,OAAO;AACpC,WAAK,OAAO,IAAI,KAAK;AACrB,UAAI,UAAU,iBAAiB,cAAe;AAC9C,kBAAY,MAAM;AAClB,UAAI,CAAC,MAAM;AACT,eAAO,QAAQ,OAAO,OAAO,IAAI;AAAA,MACnC;AACA,uBAAiB,MAAM,MAAM,oBAAoB,MAAM,MAAM;AAC7D,UAAI,MAAM,OAAO;AACf,aAAK;AAAA,MACP;AAAA,IACF;AAEA,QAAI,WAAW;AACb,WAAK;AAAA,IACP;AACA,SAAK,OAAO,MAAM;AAAA,EACpB;AAAA;AAAA,EAGA,SAAS,MAAc;AACrB,QAAI,CAAC,KAAK,qBAAqB;AAC7B,WAAK,sBAAsB,KAAK,eAAe;AAAA,IACjD;AACA,SAAK,gBAAgB;AAErB,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,IAAI;AAAA,EACrB;AAAA;AAAA,EAGA,QAAQ;AACN,QAAI,KAAK,cAAc;AACrB,WAAK,qBAAqB,KAAK,KAAK,YAAY;AAChD,WAAK,eAAe;AAAA,IACtB;AACA,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,iBAAiB,cAAc;AAAA,EAChD;AAAA;AAAA,EAGA,WAAW;AACT,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,MAAM;AAAA,EACnB;AAAA,EAEA,OAA0F;AACxF,WAAO,KAAK,OAAO,KAAK;AAAA,EAC1B;AAAA;AAAA,EAGA,QAAQ;AACN,SAAK,MAAM,MAAM;AACjB,SAAK,OAAO,MAAM;AAClB,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAsB;AACzC,WAAO;AAAA,EACT;AACF;AAgBO,MAAe,cAAiE;AAAA,EAC3E,QAAQ,IAAI,mBAAqC;AAAA,EACjD,SAAS,IAAI,mBAAqC;AAAA,EAClD,SAAS;AAAA,EAEnB;AAAA,EACA;AAAA,EAEA,YAAY,MAAc,KAAU;AAClC,SAAK,QAAQ;AACb,SAAK,OAAO;AAEZ,SAAK,eAAe;AAAA,EACtB;AAAA,EAEA,MAAgB,iBAAiB;AAC/B,UAAM,YAAY,QAAQ,OAAO,OAAO;AACxC,QAAI,gBAAgB;AACpB,QAAI,OAAe,OAAO,EAAE;AAC5B,QAAI,YAAY;AAEhB,qBAAiB,SAAS,KAAK,OAAO;AACpC,WAAK,OAAO,IAAI,KAAK;AACrB,kBAAY,MAAM;AAClB,UAAI,SAAS,OAAO,EAAE,GAAG;AACvB,eAAO,QAAQ,OAAO,OAAO,IAAI;AAAA,MACnC;AACA,uBAAiB,MAAM,MAAM,oBAAoB,MAAM,MAAM;AAAA,IAC/D;AACA,SAAK,OAAO,MAAM;AAElB,UAAM,WAAW,QAAQ,OAAO,OAAO,IAAI;AAC3C,UAAM,UAAsB;AAAA,MAC1B,WAAW,KAAK,IAAI;AAAA,MACpB;AAAA,MACA,MAAM,SAAS,OAAO,EAAE,IAAI,KAAK,KAAK,MAAM,OAAO,OAAO,OAAO,GA
AO,CAAC,CAAC;AAAA,MAC1E,UAAU,KAAK,MAAM,OAAO,WAAW,OAAO,GAAO,CAAC,CAAC;AAAA,MACvD,iBAAiB,KAAK,MAAM;AAAA,MAC5B;AAAA,MACA,WAAW;AAAA;AAAA,MACX,OAAO,KAAK;AAAA,MACZ,UAAU;AAAA,IACZ;AACA,SAAK,KAAK,KAAK,2BAA4B,OAAO;AAAA,EACpD;AAAA;AAAA,EAGA,MAAM,UAA+B;AACnC,UAAM,SAAS,CAAC;AAChB,qBAAiB,SAAS,MAAM;AAC9B,aAAO,KAAK,MAAM,KAAK;AAAA,IACzB;AACA,WAAO,YAAY,MAAM;AAAA,EAC3B;AAAA,EAEA,OAAkD;AAChD,WAAO,KAAK,OAAO,KAAK;AAAA,EAC1B;AAAA;AAAA,EAGA,QAAQ;AACN,SAAK,MAAM,MAAM;AACjB,SAAK,OAAO,MAAM;AAClB,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAmB;AACtC,WAAO;AAAA,EACT;AACF;","names":["TTSEvent"]}
|
package/dist/utils.d.cts
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import type { Room } from '@livekit/rtc-node';
|
|
2
|
+
import { AudioFrame } from '@livekit/rtc-node';
|
|
3
|
+
/** Either a single {@link AudioFrame} or an array of them; this is the input type of {@link mergeFrames}. */
|
|
4
|
+
export type AudioBuffer = AudioFrame[] | AudioFrame;
|
|
5
|
+
/**
|
|
6
|
+
* Merge one or more {@link AudioFrame}s into a single one.
|
|
7
|
+
*
|
|
8
|
+
* @param buffer Either an {@link AudioFrame} or a list thereof
* @returns A single {@link AudioFrame}
|
|
9
|
+
* @throws
|
|
10
|
+
* {@link https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/TypeError
|
|
11
|
+
* | TypeError} if sample rate or channel count are mismatched
|
|
12
|
+
*/
|
|
13
|
+
export declare const mergeFrames: (buffer: AudioBuffer) => AudioFrame;
|
|
14
|
+
/**
* Returns a string for the given `room` and `identity`.
*
* NOTE(review): judging by the name, this is presumably the ID of the microphone track
* published by the participant with that identity; what happens when no such track exists
* is not visible in this declaration — confirm against the implementation.
*
* @param room The {@link Room} to search
* @param identity Identity string used for the lookup
*/
export declare const findMicroTrackId: (room: Room, identity: string) => string;
|
|
15
|
+
/**
* Generic asynchronous queue of `T` items.
*
* @remarks
* Both `get` and `put` return promises, and the constructor takes an optional `limit`.
* NOTE(review): presumably `limit` caps the number of buffered items and `put` waits while
* the queue is full — confirm against the implementation.
*
* @internal
*/
|
|
16
|
+
export declare class Queue<T> {
|
|
17
|
+
#private;
|
|
18
|
+
/** @internal */
|
|
19
|
+
items: T[];
|
|
20
|
+
constructor(limit?: number);
|
|
21
|
+
/** Returns a promise for one item of type `T`. */
get(): Promise<T>;
|
|
22
|
+
/** Adds `item`; the returned promise carries no value. */
put(item: T): Promise<void>;
|
|
23
|
+
}
|
|
24
|
+
/**
* Wrapper around a `Promise<void>` that is settled from the outside.
*
* @remarks
* The promise is exposed through the `await` getter, and `resolve()` / `reject(error)` are
* the methods offered to settle it.
* NOTE(review): presumably `done` becomes `true` after either `resolve` or `reject` has been
* called — confirm against the implementation.
*
* @internal
*/
|
|
25
|
+
export declare class Future {
|
|
26
|
+
#private;
|
|
27
|
+
constructor();
|
|
28
|
+
/** The underlying promise. */
get await(): Promise<void>;
|
|
29
|
+
get done(): boolean;
|
|
30
|
+
resolve(): void;
|
|
31
|
+
reject(error: Error): void;
|
|
32
|
+
}
|
|
33
|
+
/**
* Promise-like object that additionally offers `cancel()`.
*
* @remarks
* The executor receives the usual `resolve` and `reject` callbacks plus an `onCancel` hook
* through which it can register a cancel function.
* NOTE(review): presumably the registered function is invoked by `cancel()` — confirm
* against the implementation.
*
* `then`, `catch` and `finally` are declared to return plain `Promise`s, not
* `CancellablePromise`s.
*
* @internal
*/
|
|
34
|
+
export declare class CancellablePromise<T> {
|
|
35
|
+
#private;
|
|
36
|
+
constructor(executor: (resolve: (value: T | PromiseLike<T>) => void, reject: (reason?: any) => void, onCancel: (cancelFn: () => void) => void) => void);
|
|
37
|
+
get isCancelled(): boolean;
|
|
38
|
+
get error(): Error | null;
|
|
39
|
+
then<TResult1 = T, TResult2 = never>(onfulfilled?: ((value: T) => TResult1 | Promise<TResult1>) | null, onrejected?: ((reason: any) => TResult2 | Promise<TResult2>) | null): Promise<TResult1 | TResult2>;
|
|
40
|
+
catch<TResult = never>(onrejected?: ((reason: any) => TResult | Promise<TResult>) | null): Promise<T | TResult>;
|
|
41
|
+
finally(onfinally?: (() => void) | null): Promise<T>;
|
|
42
|
+
cancel(): void;
|
|
43
|
+
/** Builds a `CancellablePromise` from an ordinary `Promise`. */
static from<T>(promise: Promise<T>): CancellablePromise<T>;
|
|
44
|
+
}
|
|
45
|
+
/**
* Accepts a {@link CancellablePromise} and returns a `Promise<void>`.
*
* NOTE(review): presumably cancels `promise` and resolves once it has settled — confirm
* against the implementation.
*
* @internal
*/
|
|
46
|
+
export declare function gracefullyCancel<T>(promise: CancellablePromise<T>): Promise<void>;
|
|
47
|
+
/**
* Queue of `T` items that is consumed as an async iterator.
*
* @remarks
* Items are added with `put`, read with `next` (or `for await`), and the queue is ended
* with `close`; `closed` reports whether it has been closed.
* NOTE(review): whether `put` after `close` throws or is ignored is not visible in this
* declaration — confirm against the implementation.
*
* @internal
*/
|
|
48
|
+
export declare class AsyncIterableQueue<T> implements AsyncIterableIterator<T> {
|
|
49
|
+
#private;
|
|
50
|
+
private static readonly CLOSE_SENTINEL;
|
|
51
|
+
get closed(): boolean;
|
|
52
|
+
put(item: T): void;
|
|
53
|
+
close(): void;
|
|
54
|
+
next(): Promise<IteratorResult<T>>;
|
|
55
|
+
[Symbol.asyncIterator](): AsyncIterableQueue<T>;
|
|
56
|
+
}
|
|
57
|
+
/**
* Numeric filter parameterised by `alpha` and an optional `max`.
*
* @remarks
* `apply(exp, sample)` returns a number, `filtered` reads back a number or `undefined`,
* and both `reset` and the `alpha` setter accept a new `alpha`.
* NOTE(review): the name suggests an exponential moving average whose current value is
* `filtered` (`undefined` before the first sample) — confirm against the implementation.
*
* @internal
*/
|
|
58
|
+
export declare class ExpFilter {
|
|
59
|
+
#private;
|
|
60
|
+
constructor(alpha: number, max?: number);
|
|
61
|
+
reset(alpha?: number): void;
|
|
62
|
+
apply(exp: number, sample: number): number;
|
|
63
|
+
get filtered(): number | undefined;
|
|
64
|
+
set alpha(alpha: number);
|
|
65
|
+
}
|
|
66
|
+
/**
* Stateful per-frame filter: `pushFrame` takes an {@link AudioFrame} and returns a boolean.
*
* @remarks
* NOTE(review): presumably the boolean says whether the frame's energy counts as activity,
* and `cooldownSeconds` keeps the result `true` for a while after activity stops — confirm
* against the implementation; neither is visible in this declaration.
*
* @internal
*/
|
|
67
|
+
export declare class AudioEnergyFilter {
|
|
68
|
+
#private;
|
|
69
|
+
constructor(cooldownSeconds?: number);
|
|
70
|
+
pushFrame(frame: AudioFrame): boolean;
|
|
71
|
+
}
|
|
72
|
+
//# sourceMappingURL=utils.d.ts.map
|
package/dist/vad.d.cts
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import type { AudioFrame } from '@livekit/rtc-node';
|
|
2
|
+
import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
|
|
3
|
+
import type { VADMetrics } from './metrics/base.js';
|
|
4
|
+
import { AsyncIterableQueue } from './utils.js';
|
|
5
|
+
/**
* Kinds of VAD event.
*
* @remarks
* A {@link VADEvent} carries one of these in its `type` field, and `METRICS_COLLECTED` is
* the key used by {@link VADCallbacks} for the metrics listener.
*/
export declare enum VADEventType {
|
|
6
|
+
START_OF_SPEECH = 0,
|
|
7
|
+
INFERENCE_DONE = 1,
|
|
8
|
+
END_OF_SPEECH = 2,
|
|
9
|
+
METRICS_COLLECTED = 3
|
|
10
|
+
}
|
|
11
|
+
/**
* A single event yielded by a {@link VADStream}.
*
* @remarks
* Every field is required by the type, including those documented as meaningful only for
* `INFERENCE_DONE` events. NOTE(review): the values those fields hold on other event types
* are not visible in this declaration — confirm against the implementation.
*/
export interface VADEvent {
|
|
12
|
+
/** Type of the VAD event (e.g., start of speech, end of speech, inference done). */
|
|
13
|
+
type: VADEventType;
|
|
14
|
+
/**
|
|
15
|
+
* Index of the audio sample where the event occurred, relative to the inference sample rate.
|
|
16
|
+
*/
|
|
17
|
+
samplesIndex: number;
|
|
18
|
+
/** Timestamp when the event was fired. */
|
|
19
|
+
timestamp: number;
|
|
20
|
+
/** Duration of the speech segment. NOTE(review): units are not stated here — confirm. */
|
|
21
|
+
speechDuration: number;
|
|
22
|
+
/** Duration of the silence segment. NOTE(review): units are not stated here — confirm. */
|
|
23
|
+
silenceDuration: number;
|
|
24
|
+
/**
|
|
25
|
+
* List of audio frames associated with the speech.
|
|
26
|
+
*
|
|
27
|
+
* @remarks
|
|
28
|
+
* - For `start_of_speech` events, this contains the audio chunks that triggered the detection.
|
|
29
|
+
* - For `inference_done` events, this contains the audio chunks that were processed.
|
|
30
|
+
* - For `end_of_speech` events, this contains the complete user speech.
|
|
31
|
+
*/
|
|
32
|
+
frames: AudioFrame[];
|
|
33
|
+
/** Probability that speech is present (only for `INFERENCE_DONE` events). */
|
|
34
|
+
probability: number;
|
|
35
|
+
/** Time taken to perform the inference, in seconds (only for `INFERENCE_DONE` events). */
|
|
36
|
+
inferenceDuration: number;
|
|
37
|
+
/** Indicates whether speech was detected in the frames. */
|
|
38
|
+
speaking: boolean;
|
|
39
|
+
/** Threshold used to detect silence. NOTE(review): the field name suggests an accumulated amount of silence rather than a threshold — confirm against the implementation. */
|
|
40
|
+
rawAccumulatedSilence: number;
|
|
41
|
+
/** Threshold used to detect speech. NOTE(review): the field name suggests an accumulated amount of speech rather than a threshold — confirm against the implementation. */
|
|
42
|
+
rawAccumulatedSpeech: number;
|
|
43
|
+
}
|
|
44
|
+
/**
* Capabilities that a {@link VAD} is constructed with and exposes through its
* `capabilities` getter.
*/
export interface VADCapabilities {
|
|
45
|
+
/** NOTE(review): presumably how often the VAD emits updates; meaning and units are not visible here — confirm. */
updateInterval: number;
|
|
46
|
+
}
|
|
47
|
+
/**
* Listener signatures for the typed emitter that {@link VAD} extends, keyed by event.
* The only key declared is `METRICS_COLLECTED`, whose listener receives a `VADMetrics`.
*/
export type VADCallbacks = {
|
|
48
|
+
[VADEventType.METRICS_COLLECTED]: (metrics: VADMetrics) => void;
|
|
49
|
+
};
|
|
50
|
+
/** Constructor type of the typed emitter (over {@link VADCallbacks}) that {@link VAD} extends. */
declare const VAD_base: new () => TypedEmitter<VADCallbacks>;
|
|
51
|
+
/**
* Abstract base class for voice activity detectors.
*
* @remarks
* It extends a typed emitter over {@link VADCallbacks}, is constructed with a
* {@link VADCapabilities} object, and leaves `label` and `stream()` for subclasses to
* implement.
*/
export declare abstract class VAD extends VAD_base {
|
|
52
|
+
#private;
|
|
53
|
+
abstract label: string;
|
|
54
|
+
constructor(capabilities: VADCapabilities);
|
|
55
|
+
get capabilities(): VADCapabilities;
|
|
56
|
+
/**
|
|
57
|
+
* Returns a {@link VADStream} that can be used to push audio frames and receive VAD events.
|
|
58
|
+
*/
|
|
59
|
+
abstract stream(): VADStream;
|
|
60
|
+
}
|
|
61
|
+
/**
* Abstract async iterator over {@link VADEvent}s, constructed from a {@link VAD}.
*
* @remarks
* Audio frames go in through `pushFrame`; events come out through `next` or `for await`.
* Subclasses have access to three queues: `input` (audio frames or `FLUSH_SENTINEL`),
* `queue` and `output` (both carrying {@link VADEvent}s).
* NOTE(review): presumably subclasses write events to `queue` and `monitorMetrics` forwards
* them to `output`, which `next` reads — confirm against the implementation.
*/
export declare abstract class VADStream implements AsyncIterableIterator<VADEvent> {
|
|
62
|
+
#private;
|
|
63
|
+
/** Marker value that may appear in `input` alongside audio frames. */
protected static readonly FLUSH_SENTINEL: unique symbol;
|
|
64
|
+
protected input: AsyncIterableQueue<AudioFrame | typeof VADStream.FLUSH_SENTINEL>;
|
|
65
|
+
protected queue: AsyncIterableQueue<VADEvent>;
|
|
66
|
+
protected output: AsyncIterableQueue<VADEvent>;
|
|
67
|
+
protected closed: boolean;
|
|
68
|
+
constructor(vad: VAD);
|
|
69
|
+
protected monitorMetrics(): Promise<void>;
|
|
70
|
+
pushFrame(frame: AudioFrame): void;
|
|
71
|
+
/** NOTE(review): presumably enqueues `FLUSH_SENTINEL` on `input` — confirm against the implementation. */
flush(): void;
|
|
72
|
+
/** NOTE(review): presumably closes `input` so no further frames can be pushed — confirm against the implementation. */
endInput(): void;
|
|
73
|
+
next(): Promise<IteratorResult<VADEvent>>;
|
|
74
|
+
close(): void;
|
|
75
|
+
[Symbol.asyncIterator](): VADStream;
|
|
76
|
+
}
|
|
77
|
+
export {};
|
|
78
|
+
//# sourceMappingURL=vad.d.ts.map
|
package/dist/worker.cjs
CHANGED
|
@@ -263,7 +263,8 @@ class Worker {
|
|
|
263
263
|
this.#httpServer = new import_http_server.HTTPServer(opts.host, opts.port, () => ({
|
|
264
264
|
agent_name: opts.agentName,
|
|
265
265
|
worker_type: import_protocol.JobType[opts.workerType],
|
|
266
|
-
active_jobs: this.activeJobs.length
|
|
266
|
+
active_jobs: this.activeJobs.length,
|
|
267
|
+
sdk_version: import_version.version
|
|
267
268
|
}));
|
|
268
269
|
}
|
|
269
270
|
/* @throws {@link WorkerError} if worker failed to connect or already running */
|