@livekit/agents 0.5.2 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +3 -0
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -0
- package/dist/index.js.map +1 -1
- package/dist/llm/index.cjs +2 -0
- package/dist/llm/index.cjs.map +1 -1
- package/dist/llm/index.d.ts +1 -1
- package/dist/llm/index.d.ts.map +1 -1
- package/dist/llm/index.js +2 -0
- package/dist/llm/index.js.map +1 -1
- package/dist/llm/llm.cjs +47 -3
- package/dist/llm/llm.cjs.map +1 -1
- package/dist/llm/llm.d.ts +15 -2
- package/dist/llm/llm.d.ts.map +1 -1
- package/dist/llm/llm.js +46 -3
- package/dist/llm/llm.js.map +1 -1
- package/dist/metrics/base.cjs +44 -0
- package/dist/metrics/base.cjs.map +1 -0
- package/dist/metrics/base.d.ts +96 -0
- package/dist/metrics/base.d.ts.map +1 -0
- package/dist/metrics/base.js +20 -0
- package/dist/metrics/base.js.map +1 -0
- package/dist/metrics/index.cjs +35 -0
- package/dist/metrics/index.cjs.map +1 -0
- package/dist/metrics/index.d.ts +5 -0
- package/dist/metrics/index.d.ts.map +1 -0
- package/dist/metrics/index.js +9 -0
- package/dist/metrics/index.js.map +1 -0
- package/dist/metrics/usage_collector.cjs +53 -0
- package/dist/metrics/usage_collector.cjs.map +1 -0
- package/dist/metrics/usage_collector.d.ts +14 -0
- package/dist/metrics/usage_collector.d.ts.map +1 -0
- package/dist/metrics/usage_collector.js +29 -0
- package/dist/metrics/usage_collector.js.map +1 -0
- package/dist/metrics/utils.cjs +104 -0
- package/dist/metrics/utils.cjs.map +1 -0
- package/dist/metrics/utils.d.ts +10 -0
- package/dist/metrics/utils.d.ts.map +1 -0
- package/dist/metrics/utils.js +73 -0
- package/dist/metrics/utils.js.map +1 -0
- package/dist/multimodal/multimodal_agent.cjs +7 -13
- package/dist/multimodal/multimodal_agent.cjs.map +1 -1
- package/dist/multimodal/multimodal_agent.d.ts +1 -4
- package/dist/multimodal/multimodal_agent.d.ts.map +1 -1
- package/dist/multimodal/multimodal_agent.js +7 -13
- package/dist/multimodal/multimodal_agent.js.map +1 -1
- package/dist/pipeline/index.cjs +2 -0
- package/dist/pipeline/index.cjs.map +1 -1
- package/dist/pipeline/index.d.ts +1 -1
- package/dist/pipeline/index.d.ts.map +1 -1
- package/dist/pipeline/index.js +3 -1
- package/dist/pipeline/index.js.map +1 -1
- package/dist/pipeline/pipeline_agent.cjs +166 -66
- package/dist/pipeline/pipeline_agent.cjs.map +1 -1
- package/dist/pipeline/pipeline_agent.d.ts +10 -4
- package/dist/pipeline/pipeline_agent.d.ts.map +1 -1
- package/dist/pipeline/pipeline_agent.js +169 -69
- package/dist/pipeline/pipeline_agent.js.map +1 -1
- package/dist/pipeline/speech_handle.cjs +49 -1
- package/dist/pipeline/speech_handle.cjs.map +1 -1
- package/dist/pipeline/speech_handle.d.ts +12 -2
- package/dist/pipeline/speech_handle.d.ts.map +1 -1
- package/dist/pipeline/speech_handle.js +50 -2
- package/dist/pipeline/speech_handle.js.map +1 -1
- package/dist/stt/index.cjs.map +1 -1
- package/dist/stt/index.d.ts +1 -1
- package/dist/stt/index.d.ts.map +1 -1
- package/dist/stt/index.js.map +1 -1
- package/dist/stt/stream_adapter.cjs +15 -5
- package/dist/stt/stream_adapter.cjs.map +1 -1
- package/dist/stt/stream_adapter.d.ts +4 -1
- package/dist/stt/stream_adapter.d.ts.map +1 -1
- package/dist/stt/stream_adapter.js +15 -5
- package/dist/stt/stream_adapter.js.map +1 -1
- package/dist/stt/stt.cjs +46 -2
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.ts +25 -3
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +46 -2
- package/dist/stt/stt.js.map +1 -1
- package/dist/tts/index.cjs +4 -2
- package/dist/tts/index.cjs.map +1 -1
- package/dist/tts/index.d.ts +1 -1
- package/dist/tts/index.d.ts.map +1 -1
- package/dist/tts/index.js +3 -1
- package/dist/tts/index.js.map +1 -1
- package/dist/tts/stream_adapter.cjs +14 -3
- package/dist/tts/stream_adapter.cjs.map +1 -1
- package/dist/tts/stream_adapter.d.ts +3 -0
- package/dist/tts/stream_adapter.d.ts.map +1 -1
- package/dist/tts/stream_adapter.js +15 -4
- package/dist/tts/stream_adapter.js.map +1 -1
- package/dist/tts/tts.cjs +109 -6
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.ts +24 -1
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js +107 -5
- package/dist/tts/tts.js.map +1 -1
- package/dist/vad.cjs +43 -2
- package/dist/vad.cjs.map +1 -1
- package/dist/vad.d.ts +21 -4
- package/dist/vad.d.ts.map +1 -1
- package/dist/vad.js +43 -2
- package/dist/vad.js.map +1 -1
- package/package.json +1 -1
- package/src/index.ts +2 -1
- package/src/llm/index.ts +2 -0
- package/src/llm/llm.ts +55 -3
- package/src/metrics/base.ts +127 -0
- package/src/metrics/index.ts +20 -0
- package/src/metrics/usage_collector.ts +40 -0
- package/src/metrics/utils.ts +100 -0
- package/src/multimodal/multimodal_agent.ts +12 -17
- package/src/pipeline/index.ts +1 -1
- package/src/pipeline/pipeline_agent.ts +206 -87
- package/src/pipeline/speech_handle.ts +67 -2
- package/src/stt/index.ts +2 -0
- package/src/stt/stream_adapter.ts +17 -5
- package/src/stt/stt.ts +67 -3
- package/src/tts/index.ts +2 -0
- package/src/tts/stream_adapter.ts +17 -4
- package/src/tts/tts.ts +127 -4
- package/src/vad.ts +61 -4
package/src/stt/stt.ts
CHANGED
|
@@ -2,6 +2,9 @@
|
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
import type { AudioFrame } from '@livekit/rtc-node';
|
|
5
|
+
import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
|
|
6
|
+
import { EventEmitter } from 'node:events';
|
|
7
|
+
import type { STTMetrics } from '../metrics/base.js';
|
|
5
8
|
import type { AudioBuffer } from '../utils.js';
|
|
6
9
|
import { AsyncIterableQueue } from '../utils.js';
|
|
7
10
|
|
|
@@ -27,6 +30,9 @@ export enum SpeechEventType {
|
|
|
27
30
|
* The first alternative is a combination of all the previous FINAL_TRANSCRIPT events.
|
|
28
31
|
*/
|
|
29
32
|
END_OF_SPEECH = 3,
|
|
33
|
+
/** Usage event, emitted periodically to indicate usage metrics. */
|
|
34
|
+
RECOGNITION_USAGE = 4,
|
|
35
|
+
METRICS_COLLECTED = 5,
|
|
30
36
|
}
|
|
31
37
|
|
|
32
38
|
/** SpeechData contains metadata about this {@link SpeechEvent}. */
|
|
@@ -38,10 +44,16 @@ export interface SpeechData {
|
|
|
38
44
|
confidence: number;
|
|
39
45
|
}
|
|
40
46
|
|
|
47
|
+
export interface RecognitionUsage {
|
|
48
|
+
audioDuration: number;
|
|
49
|
+
}
|
|
50
|
+
|
|
41
51
|
/** SpeechEvent is a packet of speech-to-text data. */
|
|
42
52
|
export interface SpeechEvent {
|
|
43
53
|
type: SpeechEventType;
|
|
44
54
|
alternatives?: [SpeechData, ...SpeechData[]];
|
|
55
|
+
requestId?: string;
|
|
56
|
+
recognitionUsage?: RecognitionUsage;
|
|
45
57
|
}
|
|
46
58
|
|
|
47
59
|
/**
|
|
@@ -55,6 +67,10 @@ export interface STTCapabilities {
|
|
|
55
67
|
interimResults: boolean;
|
|
56
68
|
}
|
|
57
69
|
|
|
70
|
+
export type STTCallbacks = {
|
|
71
|
+
[SpeechEventType.METRICS_COLLECTED]: (metrics: STTMetrics) => void;
|
|
72
|
+
};
|
|
73
|
+
|
|
58
74
|
/**
|
|
59
75
|
* An instance of a speech-to-text adapter.
|
|
60
76
|
*
|
|
@@ -62,10 +78,12 @@ export interface STTCapabilities {
|
|
|
62
78
|
* This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that
|
|
63
79
|
* exports its own child STT class, which inherits this class's methods.
|
|
64
80
|
*/
|
|
65
|
-
export abstract class STT {
|
|
81
|
+
export abstract class STT extends (EventEmitter as new () => TypedEmitter<STTCallbacks>) {
|
|
82
|
+
abstract label: string;
|
|
66
83
|
#capabilities: STTCapabilities;
|
|
67
84
|
|
|
68
85
|
constructor(capabilities: STTCapabilities) {
|
|
86
|
+
super();
|
|
69
87
|
this.#capabilities = capabilities;
|
|
70
88
|
}
|
|
71
89
|
|
|
@@ -75,7 +93,24 @@ export abstract class STT {
|
|
|
75
93
|
}
|
|
76
94
|
|
|
77
95
|
/** Receives an audio buffer and returns transcription in the form of a {@link SpeechEvent} */
|
|
78
|
-
|
|
96
|
+
async recognize(frame: AudioBuffer): Promise<SpeechEvent> {
|
|
97
|
+
const startTime = process.hrtime.bigint();
|
|
98
|
+
const event = await this._recognize(frame);
|
|
99
|
+
const duration = Number((process.hrtime.bigint() - startTime) / BigInt(1000000));
|
|
100
|
+
this.emit(SpeechEventType.METRICS_COLLECTED, {
|
|
101
|
+
requestId: event.requestId ?? '',
|
|
102
|
+
timestamp: Date.now(),
|
|
103
|
+
duration,
|
|
104
|
+
label: this.label,
|
|
105
|
+
audioDuration: Array.isArray(frame)
|
|
106
|
+
? frame.reduce((sum, a) => sum + a.samplesPerChannel / a.sampleRate, 0)
|
|
107
|
+
: frame.samplesPerChannel / frame.sampleRate,
|
|
108
|
+
streamed: false,
|
|
109
|
+
});
|
|
110
|
+
return event;
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
protected abstract _recognize(frame: AudioBuffer): Promise<SpeechEvent>;
|
|
79
114
|
|
|
80
115
|
/**
|
|
81
116
|
* Returns a {@link SpeechStream} that can be used to push audio frames and receive
|
|
@@ -103,8 +138,36 @@ export abstract class STT {
|
|
|
103
138
|
export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent> {
|
|
104
139
|
protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');
|
|
105
140
|
protected input = new AsyncIterableQueue<AudioFrame | typeof SpeechStream.FLUSH_SENTINEL>();
|
|
141
|
+
protected output = new AsyncIterableQueue<SpeechEvent>();
|
|
106
142
|
protected queue = new AsyncIterableQueue<SpeechEvent>();
|
|
143
|
+
abstract label: string;
|
|
107
144
|
protected closed = false;
|
|
145
|
+
#stt: STT;
|
|
146
|
+
|
|
147
|
+
constructor(stt: STT) {
|
|
148
|
+
this.#stt = stt;
|
|
149
|
+
this.monitorMetrics();
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
protected async monitorMetrics() {
|
|
153
|
+
const startTime = process.hrtime.bigint();
|
|
154
|
+
|
|
155
|
+
for await (const event of this.queue) {
|
|
156
|
+
this.output.put(event);
|
|
157
|
+
if (event.type !== SpeechEventType.RECOGNITION_USAGE) continue;
|
|
158
|
+
const duration = process.hrtime.bigint() - startTime;
|
|
159
|
+
const metrics: STTMetrics = {
|
|
160
|
+
timestamp: Date.now(),
|
|
161
|
+
requestId: event.requestId!,
|
|
162
|
+
duration: Math.trunc(Number(duration / BigInt(1000000))),
|
|
163
|
+
label: this.label,
|
|
164
|
+
audioDuration: event.recognitionUsage!.audioDuration,
|
|
165
|
+
streamed: true,
|
|
166
|
+
};
|
|
167
|
+
this.#stt.emit(SpeechEventType.METRICS_COLLECTED, metrics);
|
|
168
|
+
}
|
|
169
|
+
this.output.close();
|
|
170
|
+
}
|
|
108
171
|
|
|
109
172
|
/** Push an audio frame to the STT */
|
|
110
173
|
pushFrame(frame: AudioFrame) {
|
|
@@ -140,13 +203,14 @@ export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent>
|
|
|
140
203
|
}
|
|
141
204
|
|
|
142
205
|
next(): Promise<IteratorResult<SpeechEvent>> {
|
|
143
|
-
return this.
|
|
206
|
+
return this.output.next();
|
|
144
207
|
}
|
|
145
208
|
|
|
146
209
|
/** Close both the input and output of the STT stream */
|
|
147
210
|
close() {
|
|
148
211
|
this.input.close();
|
|
149
212
|
this.queue.close();
|
|
213
|
+
this.output.close();
|
|
150
214
|
this.closed = true;
|
|
151
215
|
}
|
|
152
216
|
|
package/src/tts/stream_adapter.ts
CHANGED
|
@@ -3,16 +3,23 @@
|
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
import type { SentenceStream, SentenceTokenizer } from '../tokenize/index.js';
|
|
5
5
|
import type { ChunkedStream } from './tts.js';
|
|
6
|
-
import { SynthesizeStream, TTS } from './tts.js';
|
|
6
|
+
import { SynthesizeStream, TTS, TTSEvent } from './tts.js';
|
|
7
7
|
|
|
8
8
|
export class StreamAdapter extends TTS {
|
|
9
9
|
#tts: TTS;
|
|
10
10
|
#sentenceTokenizer: SentenceTokenizer;
|
|
11
|
+
label: string;
|
|
11
12
|
|
|
12
13
|
constructor(tts: TTS, sentenceTokenizer: SentenceTokenizer) {
|
|
13
14
|
super(tts.sampleRate, tts.numChannels, { streaming: true });
|
|
14
15
|
this.#tts = tts;
|
|
15
16
|
this.#sentenceTokenizer = sentenceTokenizer;
|
|
17
|
+
this.label = this.#tts.label;
|
|
18
|
+
this.label = `tts.StreamAdapter<${this.#tts.label}>`;
|
|
19
|
+
|
|
20
|
+
this.#tts.on(TTSEvent.METRICS_COLLECTED, (metrics) => {
|
|
21
|
+
this.emit(TTSEvent.METRICS_COLLECTED, metrics);
|
|
22
|
+
});
|
|
16
23
|
}
|
|
17
24
|
|
|
18
25
|
synthesize(text: string): ChunkedStream {
|
|
@@ -27,15 +34,21 @@ export class StreamAdapter extends TTS {
|
|
|
27
34
|
export class StreamAdapterWrapper extends SynthesizeStream {
|
|
28
35
|
#tts: TTS;
|
|
29
36
|
#sentenceStream: SentenceStream;
|
|
37
|
+
label: string;
|
|
30
38
|
|
|
31
39
|
constructor(tts: TTS, sentenceTokenizer: SentenceTokenizer) {
|
|
32
|
-
super();
|
|
40
|
+
super(tts);
|
|
33
41
|
this.#tts = tts;
|
|
34
42
|
this.#sentenceStream = sentenceTokenizer.stream();
|
|
43
|
+
this.label = `tts.StreamAdapterWrapper<${this.#tts.label}>`;
|
|
35
44
|
|
|
36
45
|
this.#run();
|
|
37
46
|
}
|
|
38
47
|
|
|
48
|
+
async monitorMetrics() {
|
|
49
|
+
return; // do nothing
|
|
50
|
+
}
|
|
51
|
+
|
|
39
52
|
async #run() {
|
|
40
53
|
const forwardInput = async () => {
|
|
41
54
|
for await (const input of this.input) {
|
|
@@ -52,10 +65,10 @@ export class StreamAdapterWrapper extends SynthesizeStream {
|
|
|
52
65
|
const synthesize = async () => {
|
|
53
66
|
for await (const ev of this.#sentenceStream) {
|
|
54
67
|
for await (const audio of this.#tts.synthesize(ev.token)) {
|
|
55
|
-
this.
|
|
68
|
+
this.output.put(audio);
|
|
56
69
|
}
|
|
57
70
|
}
|
|
58
|
-
this.
|
|
71
|
+
this.output.put(SynthesizeStream.END_OF_STREAM);
|
|
59
72
|
};
|
|
60
73
|
|
|
61
74
|
Promise.all([forwardInput(), synthesize()]);
|
package/src/tts/tts.ts
CHANGED
|
@@ -2,6 +2,9 @@
|
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
import type { AudioFrame } from '@livekit/rtc-node';
|
|
5
|
+
import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
|
|
6
|
+
import { EventEmitter } from 'node:events';
|
|
7
|
+
import type { TTSMetrics } from '../metrics/base.js';
|
|
5
8
|
import { AsyncIterableQueue, mergeFrames } from '../utils.js';
|
|
6
9
|
|
|
7
10
|
/** SynthesizedAudio is a packet of speech synthesis as returned by the TTS. */
|
|
@@ -14,6 +17,8 @@ export interface SynthesizedAudio {
|
|
|
14
17
|
frame: AudioFrame;
|
|
15
18
|
/** Current segment of the synthesized audio */
|
|
16
19
|
deltaText?: string;
|
|
20
|
+
/** Whether this is the last frame of the segment (streaming only) */
|
|
21
|
+
final: boolean;
|
|
17
22
|
}
|
|
18
23
|
|
|
19
24
|
/**
|
|
@@ -27,6 +32,14 @@ export interface TTSCapabilities {
|
|
|
27
32
|
streaming: boolean;
|
|
28
33
|
}
|
|
29
34
|
|
|
35
|
+
export enum TTSEvent {
|
|
36
|
+
METRICS_COLLECTED,
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
export type TTSCallbacks = {
|
|
40
|
+
[TTSEvent.METRICS_COLLECTED]: (metrics: TTSMetrics) => void;
|
|
41
|
+
};
|
|
42
|
+
|
|
30
43
|
/**
|
|
31
44
|
* An instance of a text-to-speech adapter.
|
|
32
45
|
*
|
|
@@ -34,12 +47,14 @@ export interface TTSCapabilities {
|
|
|
34
47
|
* This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that
|
|
35
48
|
* exports its own child TTS class, which inherits this class's methods.
|
|
36
49
|
*/
|
|
37
|
-
export abstract class TTS {
|
|
50
|
+
export abstract class TTS extends (EventEmitter as new () => TypedEmitter<TTSCallbacks>) {
|
|
38
51
|
#capabilities: TTSCapabilities;
|
|
39
52
|
#sampleRate: number;
|
|
40
53
|
#numChannels: number;
|
|
54
|
+
abstract label: string;
|
|
41
55
|
|
|
42
56
|
constructor(sampleRate: number, numChannels: number, capabilities: TTSCapabilities) {
|
|
57
|
+
super();
|
|
43
58
|
this.#capabilities = capabilities;
|
|
44
59
|
this.#sampleRate = sampleRate;
|
|
45
60
|
this.#numChannels = numChannels;
|
|
@@ -94,10 +109,71 @@ export abstract class SynthesizeStream
|
|
|
94
109
|
protected queue = new AsyncIterableQueue<
|
|
95
110
|
SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM
|
|
96
111
|
>();
|
|
112
|
+
protected output = new AsyncIterableQueue<
|
|
113
|
+
SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM
|
|
114
|
+
>();
|
|
97
115
|
protected closed = false;
|
|
116
|
+
abstract label: string;
|
|
117
|
+
#tts: TTS;
|
|
118
|
+
#metricsPendingTexts: string[] = [];
|
|
119
|
+
#metricsText = '';
|
|
120
|
+
#monitorMetricsTask?: Promise<void>;
|
|
121
|
+
|
|
122
|
+
constructor(tts: TTS) {
|
|
123
|
+
this.#tts = tts;
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
protected async monitorMetrics() {
|
|
127
|
+
const startTime = process.hrtime.bigint();
|
|
128
|
+
let audioDuration = 0;
|
|
129
|
+
let ttfb: bigint | undefined;
|
|
130
|
+
let requestId = '';
|
|
131
|
+
|
|
132
|
+
const emit = () => {
|
|
133
|
+
if (this.#metricsPendingTexts.length) {
|
|
134
|
+
const text = this.#metricsPendingTexts.shift()!;
|
|
135
|
+
const duration = process.hrtime.bigint() - startTime;
|
|
136
|
+
const metrics: TTSMetrics = {
|
|
137
|
+
timestamp: Date.now(),
|
|
138
|
+
requestId,
|
|
139
|
+
ttfb: Math.trunc(Number(ttfb! / BigInt(1000000))),
|
|
140
|
+
duration: Math.trunc(Number(duration / BigInt(1000000))),
|
|
141
|
+
charactersCount: text.length,
|
|
142
|
+
audioDuration,
|
|
143
|
+
cancelled: false, // XXX(nbsp)
|
|
144
|
+
label: this.label,
|
|
145
|
+
streamed: false,
|
|
146
|
+
};
|
|
147
|
+
this.#tts.emit(TTSEvent.METRICS_COLLECTED, metrics);
|
|
148
|
+
}
|
|
149
|
+
};
|
|
150
|
+
|
|
151
|
+
for await (const audio of this.queue) {
|
|
152
|
+
this.output.put(audio);
|
|
153
|
+
if (audio === SynthesizeStream.END_OF_STREAM) continue;
|
|
154
|
+
requestId = audio.requestId;
|
|
155
|
+
if (!ttfb) {
|
|
156
|
+
ttfb = process.hrtime.bigint() - startTime;
|
|
157
|
+
}
|
|
158
|
+
audioDuration += audio.frame.samplesPerChannel / audio.frame.sampleRate;
|
|
159
|
+
if (audio.final) {
|
|
160
|
+
emit();
|
|
161
|
+
}
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
if (requestId) {
|
|
165
|
+
emit();
|
|
166
|
+
}
|
|
167
|
+
this.output.close();
|
|
168
|
+
}
|
|
98
169
|
|
|
99
170
|
/** Push a string of text to the TTS */
|
|
100
171
|
pushText(text: string) {
|
|
172
|
+
if (!this.#monitorMetricsTask) {
|
|
173
|
+
this.#monitorMetricsTask = this.monitorMetrics();
|
|
174
|
+
}
|
|
175
|
+
this.#metricsText += text;
|
|
176
|
+
|
|
101
177
|
if (this.input.closed) {
|
|
102
178
|
throw new Error('Input is closed');
|
|
103
179
|
}
|
|
@@ -109,6 +185,10 @@ export abstract class SynthesizeStream
|
|
|
109
185
|
|
|
110
186
|
/** Flush the TTS, causing it to process all pending text */
|
|
111
187
|
flush() {
|
|
188
|
+
if (this.#metricsText) {
|
|
189
|
+
this.#metricsPendingTexts.push(this.#metricsText);
|
|
190
|
+
this.#metricsText = '';
|
|
191
|
+
}
|
|
112
192
|
if (this.input.closed) {
|
|
113
193
|
throw new Error('Input is closed');
|
|
114
194
|
}
|
|
@@ -130,13 +210,13 @@ export abstract class SynthesizeStream
|
|
|
130
210
|
}
|
|
131
211
|
|
|
132
212
|
next(): Promise<IteratorResult<SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM>> {
|
|
133
|
-
return this.
|
|
213
|
+
return this.output.next();
|
|
134
214
|
}
|
|
135
215
|
|
|
136
216
|
/** Close both the input and output of the TTS stream */
|
|
137
217
|
close() {
|
|
138
218
|
this.input.close();
|
|
139
|
-
this.
|
|
219
|
+
this.output.close();
|
|
140
220
|
this.closed = true;
|
|
141
221
|
}
|
|
142
222
|
|
|
@@ -161,7 +241,49 @@ export abstract class SynthesizeStream
|
|
|
161
241
|
*/
|
|
162
242
|
export abstract class ChunkedStream implements AsyncIterableIterator<SynthesizedAudio> {
|
|
163
243
|
protected queue = new AsyncIterableQueue<SynthesizedAudio>();
|
|
244
|
+
protected output = new AsyncIterableQueue<SynthesizedAudio>();
|
|
164
245
|
protected closed = false;
|
|
246
|
+
abstract label: string;
|
|
247
|
+
#text: string;
|
|
248
|
+
#tts: TTS;
|
|
249
|
+
|
|
250
|
+
constructor(text: string, tts: TTS) {
|
|
251
|
+
this.#text = text;
|
|
252
|
+
this.#tts = tts;
|
|
253
|
+
|
|
254
|
+
this.monitorMetrics();
|
|
255
|
+
}
|
|
256
|
+
|
|
257
|
+
protected async monitorMetrics() {
|
|
258
|
+
const startTime = process.hrtime.bigint();
|
|
259
|
+
let audioDuration = 0;
|
|
260
|
+
let ttfb: bigint | undefined;
|
|
261
|
+
let requestId = '';
|
|
262
|
+
|
|
263
|
+
for await (const audio of this.queue) {
|
|
264
|
+
this.output.put(audio);
|
|
265
|
+
requestId = audio.requestId;
|
|
266
|
+
if (!ttfb) {
|
|
267
|
+
ttfb = process.hrtime.bigint() - startTime;
|
|
268
|
+
}
|
|
269
|
+
audioDuration += audio.frame.samplesPerChannel / audio.frame.sampleRate;
|
|
270
|
+
}
|
|
271
|
+
this.output.close();
|
|
272
|
+
|
|
273
|
+
const duration = process.hrtime.bigint() - startTime;
|
|
274
|
+
const metrics: TTSMetrics = {
|
|
275
|
+
timestamp: Date.now(),
|
|
276
|
+
requestId,
|
|
277
|
+
ttfb: Math.trunc(Number(ttfb! / BigInt(1000000))),
|
|
278
|
+
duration: Math.trunc(Number(duration / BigInt(1000000))),
|
|
279
|
+
charactersCount: this.#text.length,
|
|
280
|
+
audioDuration,
|
|
281
|
+
cancelled: false, // XXX(nbsp)
|
|
282
|
+
label: this.label,
|
|
283
|
+
streamed: false,
|
|
284
|
+
};
|
|
285
|
+
this.#tts.emit(TTSEvent.METRICS_COLLECTED, metrics);
|
|
286
|
+
}
|
|
165
287
|
|
|
166
288
|
/** Collect every frame into one in a single call */
|
|
167
289
|
async collect(): Promise<AudioFrame> {
|
|
@@ -173,12 +295,13 @@ export abstract class ChunkedStream implements AsyncIterableIterator<Synthesized
|
|
|
173
295
|
}
|
|
174
296
|
|
|
175
297
|
next(): Promise<IteratorResult<SynthesizedAudio>> {
|
|
176
|
-
return this.
|
|
298
|
+
return this.output.next();
|
|
177
299
|
}
|
|
178
300
|
|
|
179
301
|
/** Close both the input and output of the TTS stream */
|
|
180
302
|
close() {
|
|
181
303
|
this.queue.close();
|
|
304
|
+
this.output.close();
|
|
182
305
|
this.closed = true;
|
|
183
306
|
}
|
|
184
307
|
|
package/src/vad.ts
CHANGED
|
@@ -2,12 +2,16 @@
|
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
import type { AudioFrame } from '@livekit/rtc-node';
|
|
5
|
+
import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
|
|
6
|
+
import { EventEmitter } from 'node:events';
|
|
7
|
+
import type { VADMetrics } from './metrics/base.js';
|
|
5
8
|
import { AsyncIterableQueue } from './utils.js';
|
|
6
9
|
|
|
7
10
|
export enum VADEventType {
|
|
8
11
|
START_OF_SPEECH,
|
|
9
12
|
INFERENCE_DONE,
|
|
10
13
|
END_OF_SPEECH,
|
|
14
|
+
METRICS_COLLECTED,
|
|
11
15
|
}
|
|
12
16
|
|
|
13
17
|
export interface VADEvent {
|
|
@@ -19,9 +23,9 @@ export interface VADEvent {
|
|
|
19
23
|
samplesIndex: number;
|
|
20
24
|
/** Timestamp when the event was fired. */
|
|
21
25
|
timestamp: number;
|
|
22
|
-
/** Duration of the
|
|
26
|
+
/** Duration of the speech segment. */
|
|
23
27
|
speechDuration: number;
|
|
24
|
-
/** Duration of the silence segment
|
|
28
|
+
/** Duration of the silence segment. */
|
|
25
29
|
silenceDuration: number;
|
|
26
30
|
/**
|
|
27
31
|
* List of audio frames associated with the speech.
|
|
@@ -38,15 +42,26 @@ export interface VADEvent {
|
|
|
38
42
|
inferenceDuration: number;
|
|
39
43
|
/** Indicates whether speech was detected in the frames. */
|
|
40
44
|
speaking: boolean;
|
|
45
|
+
/** Threshold used to detect silence. */
|
|
46
|
+
rawAccumulatedSilence: number;
|
|
47
|
+
/** Threshold used to detect speech. */
|
|
48
|
+
rawAccumulatedSpeech: number;
|
|
41
49
|
}
|
|
42
50
|
|
|
43
51
|
export interface VADCapabilities {
|
|
44
52
|
updateInterval: number;
|
|
45
53
|
}
|
|
46
54
|
|
|
47
|
-
export
|
|
55
|
+
export type VADCallbacks = {
|
|
56
|
+
[VADEventType.METRICS_COLLECTED]: (metrics: VADMetrics) => void;
|
|
57
|
+
};
|
|
58
|
+
|
|
59
|
+
export abstract class VAD extends (EventEmitter as new () => TypedEmitter<VADCallbacks>) {
|
|
48
60
|
#capabilities: VADCapabilities;
|
|
61
|
+
abstract label: string;
|
|
62
|
+
|
|
49
63
|
constructor(capabilities: VADCapabilities) {
|
|
64
|
+
super();
|
|
50
65
|
this.#capabilities = capabilities;
|
|
51
66
|
}
|
|
52
67
|
|
|
@@ -64,7 +79,48 @@ export abstract class VADStream implements AsyncIterableIterator<VADEvent> {
|
|
|
64
79
|
protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');
|
|
65
80
|
protected input = new AsyncIterableQueue<AudioFrame | typeof VADStream.FLUSH_SENTINEL>();
|
|
66
81
|
protected queue = new AsyncIterableQueue<VADEvent>();
|
|
82
|
+
protected output = new AsyncIterableQueue<VADEvent>();
|
|
67
83
|
protected closed = false;
|
|
84
|
+
#vad: VAD;
|
|
85
|
+
#lastActivityTime = BigInt(0);
|
|
86
|
+
|
|
87
|
+
constructor(vad: VAD) {
|
|
88
|
+
this.#vad = vad;
|
|
89
|
+
this.monitorMetrics();
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
protected async monitorMetrics() {
|
|
93
|
+
let inferenceDurationTotal = 0;
|
|
94
|
+
let inferenceCount = 0;
|
|
95
|
+
|
|
96
|
+
for await (const event of this.queue) {
|
|
97
|
+
this.output.put(event);
|
|
98
|
+
switch (event.type) {
|
|
99
|
+
case VADEventType.START_OF_SPEECH:
|
|
100
|
+
inferenceCount++;
|
|
101
|
+
if (inferenceCount >= 1 / this.#vad.capabilities.updateInterval) {
|
|
102
|
+
this.#vad.emit(VADEventType.METRICS_COLLECTED, {
|
|
103
|
+
timestamp: Date.now(),
|
|
104
|
+
idleTime: Math.trunc(
|
|
105
|
+
Number((process.hrtime.bigint() - this.#lastActivityTime) / BigInt(1000000)),
|
|
106
|
+
),
|
|
107
|
+
inferenceDurationTotal,
|
|
108
|
+
inferenceCount,
|
|
109
|
+
label: this.#vad.label,
|
|
110
|
+
});
|
|
111
|
+
|
|
112
|
+
inferenceCount = 0;
|
|
113
|
+
inferenceDurationTotal = 0;
|
|
114
|
+
}
|
|
115
|
+
break;
|
|
116
|
+
case VADEventType.INFERENCE_DONE:
|
|
117
|
+
case VADEventType.END_OF_SPEECH:
|
|
118
|
+
this.#lastActivityTime = process.hrtime.bigint();
|
|
119
|
+
break;
|
|
120
|
+
}
|
|
121
|
+
}
|
|
122
|
+
this.output.close();
|
|
123
|
+
}
|
|
68
124
|
|
|
69
125
|
pushFrame(frame: AudioFrame) {
|
|
70
126
|
if (this.input.closed) {
|
|
@@ -97,12 +153,13 @@ export abstract class VADStream implements AsyncIterableIterator<VADEvent> {
|
|
|
97
153
|
}
|
|
98
154
|
|
|
99
155
|
next(): Promise<IteratorResult<VADEvent>> {
|
|
100
|
-
return this.
|
|
156
|
+
return this.output.next();
|
|
101
157
|
}
|
|
102
158
|
|
|
103
159
|
close() {
|
|
104
160
|
this.input.close();
|
|
105
161
|
this.queue.close();
|
|
162
|
+
this.output.close();
|
|
106
163
|
this.closed = true;
|
|
107
164
|
}
|
|
108
165
|
|