@livekit/agents 0.5.2 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +3 -0
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -0
- package/dist/index.js.map +1 -1
- package/dist/llm/index.cjs +2 -0
- package/dist/llm/index.cjs.map +1 -1
- package/dist/llm/index.d.ts +1 -1
- package/dist/llm/index.d.ts.map +1 -1
- package/dist/llm/index.js +2 -0
- package/dist/llm/index.js.map +1 -1
- package/dist/llm/llm.cjs +47 -3
- package/dist/llm/llm.cjs.map +1 -1
- package/dist/llm/llm.d.ts +15 -2
- package/dist/llm/llm.d.ts.map +1 -1
- package/dist/llm/llm.js +46 -3
- package/dist/llm/llm.js.map +1 -1
- package/dist/metrics/base.cjs +44 -0
- package/dist/metrics/base.cjs.map +1 -0
- package/dist/metrics/base.d.ts +96 -0
- package/dist/metrics/base.d.ts.map +1 -0
- package/dist/metrics/base.js +20 -0
- package/dist/metrics/base.js.map +1 -0
- package/dist/metrics/index.cjs +35 -0
- package/dist/metrics/index.cjs.map +1 -0
- package/dist/metrics/index.d.ts +5 -0
- package/dist/metrics/index.d.ts.map +1 -0
- package/dist/metrics/index.js +9 -0
- package/dist/metrics/index.js.map +1 -0
- package/dist/metrics/usage_collector.cjs +53 -0
- package/dist/metrics/usage_collector.cjs.map +1 -0
- package/dist/metrics/usage_collector.d.ts +14 -0
- package/dist/metrics/usage_collector.d.ts.map +1 -0
- package/dist/metrics/usage_collector.js +29 -0
- package/dist/metrics/usage_collector.js.map +1 -0
- package/dist/metrics/utils.cjs +104 -0
- package/dist/metrics/utils.cjs.map +1 -0
- package/dist/metrics/utils.d.ts +10 -0
- package/dist/metrics/utils.d.ts.map +1 -0
- package/dist/metrics/utils.js +73 -0
- package/dist/metrics/utils.js.map +1 -0
- package/dist/multimodal/multimodal_agent.cjs +7 -13
- package/dist/multimodal/multimodal_agent.cjs.map +1 -1
- package/dist/multimodal/multimodal_agent.d.ts +1 -4
- package/dist/multimodal/multimodal_agent.d.ts.map +1 -1
- package/dist/multimodal/multimodal_agent.js +7 -13
- package/dist/multimodal/multimodal_agent.js.map +1 -1
- package/dist/pipeline/index.cjs +2 -0
- package/dist/pipeline/index.cjs.map +1 -1
- package/dist/pipeline/index.d.ts +1 -1
- package/dist/pipeline/index.d.ts.map +1 -1
- package/dist/pipeline/index.js +3 -1
- package/dist/pipeline/index.js.map +1 -1
- package/dist/pipeline/pipeline_agent.cjs +166 -66
- package/dist/pipeline/pipeline_agent.cjs.map +1 -1
- package/dist/pipeline/pipeline_agent.d.ts +10 -4
- package/dist/pipeline/pipeline_agent.d.ts.map +1 -1
- package/dist/pipeline/pipeline_agent.js +169 -69
- package/dist/pipeline/pipeline_agent.js.map +1 -1
- package/dist/pipeline/speech_handle.cjs +49 -1
- package/dist/pipeline/speech_handle.cjs.map +1 -1
- package/dist/pipeline/speech_handle.d.ts +12 -2
- package/dist/pipeline/speech_handle.d.ts.map +1 -1
- package/dist/pipeline/speech_handle.js +50 -2
- package/dist/pipeline/speech_handle.js.map +1 -1
- package/dist/stt/index.cjs.map +1 -1
- package/dist/stt/index.d.ts +1 -1
- package/dist/stt/index.d.ts.map +1 -1
- package/dist/stt/index.js.map +1 -1
- package/dist/stt/stream_adapter.cjs +15 -5
- package/dist/stt/stream_adapter.cjs.map +1 -1
- package/dist/stt/stream_adapter.d.ts +4 -1
- package/dist/stt/stream_adapter.d.ts.map +1 -1
- package/dist/stt/stream_adapter.js +15 -5
- package/dist/stt/stream_adapter.js.map +1 -1
- package/dist/stt/stt.cjs +46 -2
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.ts +25 -3
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +46 -2
- package/dist/stt/stt.js.map +1 -1
- package/dist/tts/index.cjs +4 -2
- package/dist/tts/index.cjs.map +1 -1
- package/dist/tts/index.d.ts +1 -1
- package/dist/tts/index.d.ts.map +1 -1
- package/dist/tts/index.js +3 -1
- package/dist/tts/index.js.map +1 -1
- package/dist/tts/stream_adapter.cjs +14 -3
- package/dist/tts/stream_adapter.cjs.map +1 -1
- package/dist/tts/stream_adapter.d.ts +3 -0
- package/dist/tts/stream_adapter.d.ts.map +1 -1
- package/dist/tts/stream_adapter.js +15 -4
- package/dist/tts/stream_adapter.js.map +1 -1
- package/dist/tts/tts.cjs +109 -6
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.ts +24 -1
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js +107 -5
- package/dist/tts/tts.js.map +1 -1
- package/dist/vad.cjs +43 -2
- package/dist/vad.cjs.map +1 -1
- package/dist/vad.d.ts +21 -4
- package/dist/vad.d.ts.map +1 -1
- package/dist/vad.js +43 -2
- package/dist/vad.js.map +1 -1
- package/package.json +1 -1
- package/src/index.ts +2 -1
- package/src/llm/index.ts +2 -0
- package/src/llm/llm.ts +55 -3
- package/src/metrics/base.ts +127 -0
- package/src/metrics/index.ts +20 -0
- package/src/metrics/usage_collector.ts +40 -0
- package/src/metrics/utils.ts +100 -0
- package/src/multimodal/multimodal_agent.ts +12 -17
- package/src/pipeline/index.ts +1 -1
- package/src/pipeline/pipeline_agent.ts +206 -87
- package/src/pipeline/speech_handle.ts +67 -2
- package/src/stt/index.ts +2 -0
- package/src/stt/stream_adapter.ts +17 -5
- package/src/stt/stt.ts +67 -3
- package/src/tts/index.ts +2 -0
- package/src/tts/stream_adapter.ts +17 -4
- package/src/tts/tts.ts +127 -4
- package/src/vad.ts +61 -4
package/dist/stt/stt.d.ts
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
import type { AudioFrame } from '@livekit/rtc-node';
|
|
2
|
+
import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
|
|
3
|
+
import type { STTMetrics } from '../metrics/base.js';
|
|
2
4
|
import type { AudioBuffer } from '../utils.js';
|
|
3
5
|
import { AsyncIterableQueue } from '../utils.js';
|
|
4
6
|
/** Indicates start/middle/end of speech */
|
|
@@ -22,7 +24,10 @@ export declare enum SpeechEventType {
|
|
|
22
24
|
* Indicate the end of speech, emitted when the user stops speaking.
|
|
23
25
|
* The first alternative is a combination of all the previous FINAL_TRANSCRIPT events.
|
|
24
26
|
*/
|
|
25
|
-
END_OF_SPEECH = 3
|
|
27
|
+
END_OF_SPEECH = 3,
|
|
28
|
+
/** Usage event, emitted periodically to indicate usage metrics. */
|
|
29
|
+
RECOGNITION_USAGE = 4,
|
|
30
|
+
METRICS_COLLECTED = 5
|
|
26
31
|
}
|
|
27
32
|
/** SpeechData contains metadata about this {@link SpeechEvent}. */
|
|
28
33
|
export interface SpeechData {
|
|
@@ -32,10 +37,15 @@ export interface SpeechData {
|
|
|
32
37
|
endTime: number;
|
|
33
38
|
confidence: number;
|
|
34
39
|
}
|
|
40
|
+
export interface RecognitionUsage {
|
|
41
|
+
audioDuration: number;
|
|
42
|
+
}
|
|
35
43
|
/** SpeechEvent is a packet of speech-to-text data. */
|
|
36
44
|
export interface SpeechEvent {
|
|
37
45
|
type: SpeechEventType;
|
|
38
46
|
alternatives?: [SpeechData, ...SpeechData[]];
|
|
47
|
+
requestId?: string;
|
|
48
|
+
recognitionUsage?: RecognitionUsage;
|
|
39
49
|
}
|
|
40
50
|
/**
|
|
41
51
|
* Describes the capabilities of the STT provider.
|
|
@@ -47,6 +57,10 @@ export interface STTCapabilities {
|
|
|
47
57
|
streaming: boolean;
|
|
48
58
|
interimResults: boolean;
|
|
49
59
|
}
|
|
60
|
+
export type STTCallbacks = {
|
|
61
|
+
[SpeechEventType.METRICS_COLLECTED]: (metrics: STTMetrics) => void;
|
|
62
|
+
};
|
|
63
|
+
declare const STT_base: new () => TypedEmitter<STTCallbacks>;
|
|
50
64
|
/**
|
|
51
65
|
* An instance of a speech-to-text adapter.
|
|
52
66
|
*
|
|
@@ -54,13 +68,15 @@ export interface STTCapabilities {
|
|
|
54
68
|
* This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that
|
|
55
69
|
* exports its own child STT class, which inherits this class's methods.
|
|
56
70
|
*/
|
|
57
|
-
export declare abstract class STT {
|
|
71
|
+
export declare abstract class STT extends STT_base {
|
|
58
72
|
#private;
|
|
73
|
+
abstract label: string;
|
|
59
74
|
constructor(capabilities: STTCapabilities);
|
|
60
75
|
/** Returns this STT's capabilities */
|
|
61
76
|
get capabilities(): STTCapabilities;
|
|
62
77
|
/** Receives an audio buffer and returns transcription in the form of a {@link SpeechEvent} */
|
|
63
|
-
|
|
78
|
+
recognize(frame: AudioBuffer): Promise<SpeechEvent>;
|
|
79
|
+
protected abstract _recognize(frame: AudioBuffer): Promise<SpeechEvent>;
|
|
64
80
|
/**
|
|
65
81
|
* Returns a {@link SpeechStream} that can be used to push audio frames and receive
|
|
66
82
|
* transcriptions
|
|
@@ -84,10 +100,15 @@ export declare abstract class STT {
|
|
|
84
100
|
* exports its own child SpeechStream class, which inherits this class's methods.
|
|
85
101
|
*/
|
|
86
102
|
export declare abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent> {
|
|
103
|
+
#private;
|
|
87
104
|
protected static readonly FLUSH_SENTINEL: unique symbol;
|
|
88
105
|
protected input: AsyncIterableQueue<AudioFrame | typeof SpeechStream.FLUSH_SENTINEL>;
|
|
106
|
+
protected output: AsyncIterableQueue<SpeechEvent>;
|
|
89
107
|
protected queue: AsyncIterableQueue<SpeechEvent>;
|
|
108
|
+
abstract label: string;
|
|
90
109
|
protected closed: boolean;
|
|
110
|
+
constructor(stt: STT);
|
|
111
|
+
protected monitorMetrics(): Promise<void>;
|
|
91
112
|
/** Push an audio frame to the STT */
|
|
92
113
|
pushFrame(frame: AudioFrame): void;
|
|
93
114
|
/** Flush the STT, causing it to process all pending text */
|
|
@@ -99,4 +120,5 @@ export declare abstract class SpeechStream implements AsyncIterableIterator<Spee
|
|
|
99
120
|
close(): void;
|
|
100
121
|
[Symbol.asyncIterator](): SpeechStream;
|
|
101
122
|
}
|
|
123
|
+
export {};
|
|
102
124
|
//# sourceMappingURL=stt.d.ts.map
|
package/dist/stt/stt.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"stt.d.ts","sourceRoot":"","sources":["../../src/stt/stt.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AACpD,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAC/C,OAAO,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC;AAEjD,2CAA2C;AAC3C,oBAAY,eAAe;IACzB;;;;OAIG;IACH,eAAe,IAAI;IACnB;;OAEG;IACH,kBAAkB,IAAI;IACtB;;;OAGG;IACH,gBAAgB,IAAI;IACpB;;;OAGG;IACH,aAAa,IAAI;
|
|
1
|
+
{"version":3,"file":"stt.d.ts","sourceRoot":"","sources":["../../src/stt/stt.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AACpD,OAAO,KAAK,EAAE,iBAAiB,IAAI,YAAY,EAAE,MAAM,wBAAwB,CAAC;AAEhF,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,oBAAoB,CAAC;AACrD,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAC/C,OAAO,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC;AAEjD,2CAA2C;AAC3C,oBAAY,eAAe;IACzB;;;;OAIG;IACH,eAAe,IAAI;IACnB;;OAEG;IACH,kBAAkB,IAAI;IACtB;;;OAGG;IACH,gBAAgB,IAAI;IACpB;;;OAGG;IACH,aAAa,IAAI;IACjB,mEAAmE;IACnE,iBAAiB,IAAI;IACrB,iBAAiB,IAAI;CACtB;AAED,mEAAmE;AACnE,MAAM,WAAW,UAAU;IACzB,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,gBAAgB;IAC/B,aAAa,EAAE,MAAM,CAAC;CACvB;AAED,sDAAsD;AACtD,MAAM,WAAW,WAAW;IAC1B,IAAI,EAAE,eAAe,CAAC;IACtB,YAAY,CAAC,EAAE,CAAC,UAAU,EAAE,GAAG,UAAU,EAAE,CAAC,CAAC;IAC7C,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,gBAAgB,CAAC,EAAE,gBAAgB,CAAC;CACrC;AAED;;;;;GAKG;AACH,MAAM,WAAW,eAAe;IAC9B,SAAS,EAAE,OAAO,CAAC;IACnB,cAAc,EAAE,OAAO,CAAC;CACzB;AAED,MAAM,MAAM,YAAY,GAAG;IACzB,CAAC,eAAe,CAAC,iBAAiB,CAAC,EAAE,CAAC,OAAO,EAAE,UAAU,KAAK,IAAI,CAAC;CACpE,CAAC;kCAS2D,aAAa,YAAY,CAAC;AAPvF;;;;;;GAMG;AACH,8BAAsB,GAAI,SAAQ,QAAsD;;IACtF,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;gBAGX,YAAY,EAAE,eAAe;IAKzC,sCAAsC;IACtC,IAAI,YAAY,IAAI,eAAe,CAElC;IAED,8FAA8F;IACxF,SAAS,CAAC,KAAK,EAAE,WAAW,GAAG,OAAO,CAAC,WAAW,CAAC;IAiBzD,SAAS,CAAC,QAAQ,CAAC,UAAU,CAAC,KAAK,EAAE,WAAW,GAAG,OAAO,CAAC,WAAW,CAAC;IAEvE;;;OAGG;IACH,QAAQ,CAAC,MAAM,IAAI,YAAY;CAChC;AAED;;;;;;;;;;;;;;;GAeG;AACH,8BAAsB,YAAa,YAAW,qBAAqB,CAAC,WAAW,CAAC;;IAC9E,SAAS,CAAC,MAAM,CAAC,QAAQ,CAAC,cAAc,gBAA4B;IACpE,SAAS,CAAC,KAAK,sEAA6E;IAC5F,SAAS,CAAC,MAAM,kCAAyC;IACzD,SAAS,CAAC,KAAK,kCAAyC;IACxD,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,SAAS,CAAC,MAAM,UAAS;gBAGb,GAAG,EAAE,GAAG;cAKJ,cAAc;IAoB9B,qCAAqC;IACrC,SAAS,CAAC,KAAK,EAAE,UAAU;IAU3B,4DAA4D;IAC5D,KAAK;IAUL,2DAA2D;IAC3D,QAAQ;IAUR,IAAI,IAAI,OAAO,CAAC,cAAc,CAAC,WAAW,CAAC,CAAC;IAI5C,wDAAwD;IACxD,KAAK;IAOL,CAAC,MAAM,CAAC,aAAa,CAAC,IAAI,YAAY;CAGvC"}
|
package/dist/stt/stt.js
CHANGED
|
@@ -1,26 +1,69 @@
|
|
|
1
|
+
import { EventEmitter } from "node:events";
|
|
1
2
|
import { AsyncIterableQueue } from "../utils.js";
|
|
2
3
|
var SpeechEventType = /* @__PURE__ */ ((SpeechEventType2) => {
|
|
3
4
|
SpeechEventType2[SpeechEventType2["START_OF_SPEECH"] = 0] = "START_OF_SPEECH";
|
|
4
5
|
SpeechEventType2[SpeechEventType2["INTERIM_TRANSCRIPT"] = 1] = "INTERIM_TRANSCRIPT";
|
|
5
6
|
SpeechEventType2[SpeechEventType2["FINAL_TRANSCRIPT"] = 2] = "FINAL_TRANSCRIPT";
|
|
6
7
|
SpeechEventType2[SpeechEventType2["END_OF_SPEECH"] = 3] = "END_OF_SPEECH";
|
|
8
|
+
SpeechEventType2[SpeechEventType2["RECOGNITION_USAGE"] = 4] = "RECOGNITION_USAGE";
|
|
9
|
+
SpeechEventType2[SpeechEventType2["METRICS_COLLECTED"] = 5] = "METRICS_COLLECTED";
|
|
7
10
|
return SpeechEventType2;
|
|
8
11
|
})(SpeechEventType || {});
|
|
9
|
-
class STT {
|
|
12
|
+
class STT extends EventEmitter {
|
|
10
13
|
#capabilities;
|
|
11
14
|
constructor(capabilities) {
|
|
15
|
+
super();
|
|
12
16
|
this.#capabilities = capabilities;
|
|
13
17
|
}
|
|
14
18
|
/** Returns this STT's capabilities */
|
|
15
19
|
get capabilities() {
|
|
16
20
|
return this.#capabilities;
|
|
17
21
|
}
|
|
22
|
+
/** Receives an audio buffer and returns transcription in the form of a {@link SpeechEvent} */
|
|
23
|
+
async recognize(frame) {
|
|
24
|
+
const startTime = process.hrtime.bigint();
|
|
25
|
+
const event = await this._recognize(frame);
|
|
26
|
+
const duration = Number((process.hrtime.bigint() - startTime) / BigInt(1e6));
|
|
27
|
+
this.emit(5 /* METRICS_COLLECTED */, {
|
|
28
|
+
requestId: event.requestId ?? "",
|
|
29
|
+
timestamp: Date.now(),
|
|
30
|
+
duration,
|
|
31
|
+
label: this.label,
|
|
32
|
+
audioDuration: Array.isArray(frame) ? frame.reduce((sum, a) => sum + a.samplesPerChannel / a.sampleRate, 0) : frame.samplesPerChannel / frame.sampleRate,
|
|
33
|
+
streamed: false
|
|
34
|
+
});
|
|
35
|
+
return event;
|
|
36
|
+
}
|
|
18
37
|
}
|
|
19
38
|
class SpeechStream {
|
|
20
39
|
static FLUSH_SENTINEL = Symbol("FLUSH_SENTINEL");
|
|
21
40
|
input = new AsyncIterableQueue();
|
|
41
|
+
output = new AsyncIterableQueue();
|
|
22
42
|
queue = new AsyncIterableQueue();
|
|
23
43
|
closed = false;
|
|
44
|
+
#stt;
|
|
45
|
+
constructor(stt) {
|
|
46
|
+
this.#stt = stt;
|
|
47
|
+
this.monitorMetrics();
|
|
48
|
+
}
|
|
49
|
+
async monitorMetrics() {
|
|
50
|
+
const startTime = process.hrtime.bigint();
|
|
51
|
+
for await (const event of this.queue) {
|
|
52
|
+
this.output.put(event);
|
|
53
|
+
if (event.type !== 4 /* RECOGNITION_USAGE */) continue;
|
|
54
|
+
const duration = process.hrtime.bigint() - startTime;
|
|
55
|
+
const metrics = {
|
|
56
|
+
timestamp: Date.now(),
|
|
57
|
+
requestId: event.requestId,
|
|
58
|
+
duration: Math.trunc(Number(duration / BigInt(1e6))),
|
|
59
|
+
label: this.label,
|
|
60
|
+
audioDuration: event.recognitionUsage.audioDuration,
|
|
61
|
+
streamed: true
|
|
62
|
+
};
|
|
63
|
+
this.#stt.emit(5 /* METRICS_COLLECTED */, metrics);
|
|
64
|
+
}
|
|
65
|
+
this.output.close();
|
|
66
|
+
}
|
|
24
67
|
/** Push an audio frame to the STT */
|
|
25
68
|
pushFrame(frame) {
|
|
26
69
|
if (this.input.closed) {
|
|
@@ -52,12 +95,13 @@ class SpeechStream {
|
|
|
52
95
|
this.input.close();
|
|
53
96
|
}
|
|
54
97
|
next() {
|
|
55
|
-
return this.
|
|
98
|
+
return this.output.next();
|
|
56
99
|
}
|
|
57
100
|
/** Close both the input and output of the STT stream */
|
|
58
101
|
close() {
|
|
59
102
|
this.input.close();
|
|
60
103
|
this.queue.close();
|
|
104
|
+
this.output.close();
|
|
61
105
|
this.closed = true;
|
|
62
106
|
}
|
|
63
107
|
[Symbol.asyncIterator]() {
|
package/dist/stt/stt.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../src/stt/stt.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport type { AudioBuffer } from '../utils.js';\nimport { AsyncIterableQueue } from '../utils.js';\n\n/** Indicates start/middle/end of speech */\nexport enum SpeechEventType {\n /**\n * Indicate the start of speech.\n * If the STT doesn't support this event, this will be emitted at the same time\n * as the first INTERIM_TRANSCRIPT.\n */\n START_OF_SPEECH = 0,\n /**\n * Interim transcript, useful for real-time transcription.\n */\n INTERIM_TRANSCRIPT = 1,\n /**\n * Final transcript, emitted when the STT is confident enough that a certain\n * portion of the speech will not change.\n */\n FINAL_TRANSCRIPT = 2,\n /**\n * Indicate the end of speech, emitted when the user stops speaking.\n * The first alternative is a combination of all the previous FINAL_TRANSCRIPT events.\n */\n END_OF_SPEECH = 3,\n}\n\n/** SpeechData contains metadata about this {@link SpeechEvent}. */\nexport interface SpeechData {\n language: string;\n text: string;\n startTime: number;\n endTime: number;\n confidence: number;\n}\n\n/** SpeechEvent is a packet of speech-to-text data. */\nexport interface SpeechEvent {\n type: SpeechEventType;\n alternatives?: [SpeechData, ...SpeechData[]];\n}\n\n/**\n * Describes the capabilities of the STT provider.\n *\n * @remarks\n * At present, the framework only supports providers that have a streaming endpoint.\n */\nexport interface STTCapabilities {\n streaming: boolean;\n interimResults: boolean;\n}\n\n/**\n * An instance of a speech-to-text adapter.\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that\n * exports its own child STT class, which inherits this class's methods.\n */\nexport abstract class STT {\n #capabilities: STTCapabilities;\n\n constructor(capabilities: STTCapabilities) {\n this.#capabilities = capabilities;\n }\n\n /** Returns this STT's capabilities */\n get capabilities(): STTCapabilities {\n return this.#capabilities;\n }\n\n /** Receives an audio buffer and returns transcription in the form of a {@link SpeechEvent} */\n
|
|
1
|
+
{"version":3,"sources":["../../src/stt/stt.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';\nimport { EventEmitter } from 'node:events';\nimport type { STTMetrics } from '../metrics/base.js';\nimport type { AudioBuffer } from '../utils.js';\nimport { AsyncIterableQueue } from '../utils.js';\n\n/** Indicates start/middle/end of speech */\nexport enum SpeechEventType {\n /**\n * Indicate the start of speech.\n * If the STT doesn't support this event, this will be emitted at the same time\n * as the first INTERIM_TRANSCRIPT.\n */\n START_OF_SPEECH = 0,\n /**\n * Interim transcript, useful for real-time transcription.\n */\n INTERIM_TRANSCRIPT = 1,\n /**\n * Final transcript, emitted when the STT is confident enough that a certain\n * portion of the speech will not change.\n */\n FINAL_TRANSCRIPT = 2,\n /**\n * Indicate the end of speech, emitted when the user stops speaking.\n * The first alternative is a combination of all the previous FINAL_TRANSCRIPT events.\n */\n END_OF_SPEECH = 3,\n /** Usage event, emitted periodically to indicate usage metrics. */\n RECOGNITION_USAGE = 4,\n METRICS_COLLECTED = 5,\n}\n\n/** SpeechData contains metadata about this {@link SpeechEvent}. */\nexport interface SpeechData {\n language: string;\n text: string;\n startTime: number;\n endTime: number;\n confidence: number;\n}\n\nexport interface RecognitionUsage {\n audioDuration: number;\n}\n\n/** SpeechEvent is a packet of speech-to-text data. */\nexport interface SpeechEvent {\n type: SpeechEventType;\n alternatives?: [SpeechData, ...SpeechData[]];\n requestId?: string;\n recognitionUsage?: RecognitionUsage;\n}\n\n/**\n * Describes the capabilities of the STT provider.\n *\n * @remarks\n * At present, the framework only supports providers that have a streaming endpoint.\n */\nexport interface STTCapabilities {\n streaming: boolean;\n interimResults: boolean;\n}\n\nexport type STTCallbacks = {\n [SpeechEventType.METRICS_COLLECTED]: (metrics: STTMetrics) => void;\n};\n\n/**\n * An instance of a speech-to-text adapter.\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that\n * exports its own child STT class, which inherits this class's methods.\n */\nexport abstract class STT extends (EventEmitter as new () => TypedEmitter<STTCallbacks>) {\n abstract label: string;\n #capabilities: STTCapabilities;\n\n constructor(capabilities: STTCapabilities) {\n super();\n this.#capabilities = capabilities;\n }\n\n /** Returns this STT's capabilities */\n get capabilities(): STTCapabilities {\n return this.#capabilities;\n }\n\n /** Receives an audio buffer and returns transcription in the form of a {@link SpeechEvent} */\n async recognize(frame: AudioBuffer): Promise<SpeechEvent> {\n const startTime = process.hrtime.bigint();\n const event = await this._recognize(frame);\n const duration = Number((process.hrtime.bigint() - startTime) / BigInt(1000000));\n this.emit(SpeechEventType.METRICS_COLLECTED, {\n requestId: event.requestId ?? '',\n timestamp: Date.now(),\n duration,\n label: this.label,\n audioDuration: Array.isArray(frame)\n ? frame.reduce((sum, a) => sum + a.samplesPerChannel / a.sampleRate, 0)\n : frame.samplesPerChannel / frame.sampleRate,\n streamed: false,\n });\n return event;\n }\n\n protected abstract _recognize(frame: AudioBuffer): Promise<SpeechEvent>;\n\n /**\n * Returns a {@link SpeechStream} that can be used to push audio frames and receive\n * transcriptions\n */\n abstract stream(): SpeechStream;\n}\n\n/**\n * An instance of a speech-to-text stream, as an asynchronous iterable iterator.\n *\n * @example Looping through frames\n * ```ts\n * for await (const event of stream) {\n * if (event.type === SpeechEventType.FINAL_TRANSCRIPT) {\n * console.log(event.alternatives[0].text)\n * }\n * }\n * ```\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that\n * exports its own child SpeechStream class, which inherits this class's methods.\n */\nexport abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent> {\n protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');\n protected input = new AsyncIterableQueue<AudioFrame | typeof SpeechStream.FLUSH_SENTINEL>();\n protected output = new AsyncIterableQueue<SpeechEvent>();\n protected queue = new AsyncIterableQueue<SpeechEvent>();\n abstract label: string;\n protected closed = false;\n #stt: STT;\n\n constructor(stt: STT) {\n this.#stt = stt;\n this.monitorMetrics();\n }\n\n protected async monitorMetrics() {\n const startTime = process.hrtime.bigint();\n\n for await (const event of this.queue) {\n this.output.put(event);\n if (event.type !== SpeechEventType.RECOGNITION_USAGE) continue;\n const duration = process.hrtime.bigint() - startTime;\n const metrics: STTMetrics = {\n timestamp: Date.now(),\n requestId: event.requestId!,\n duration: Math.trunc(Number(duration / BigInt(1000000))),\n label: this.label,\n audioDuration: event.recognitionUsage!.audioDuration,\n streamed: true,\n };\n this.#stt.emit(SpeechEventType.METRICS_COLLECTED, metrics);\n }\n this.output.close();\n }\n\n /** Push an audio frame to the STT */\n pushFrame(frame: AudioFrame) {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(frame);\n }\n\n /** Flush the STT, causing it to process all pending text */\n flush() {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(SpeechStream.FLUSH_SENTINEL);\n }\n\n /** Mark the input as ended and forbid additional pushes */\n endInput() {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.close();\n }\n\n next(): Promise<IteratorResult<SpeechEvent>> {\n return this.output.next();\n }\n\n /** Close both the input and output of the STT stream */\n close() {\n this.input.close();\n this.queue.close();\n this.output.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): SpeechStream {\n return this;\n }\n}\n"],"mappings":"AAKA,SAAS,oBAAoB;AAG7B,SAAS,0BAA0B;AAG5B,IAAK,kBAAL,kBAAKA,qBAAL;AAML,EAAAA,kCAAA,qBAAkB,KAAlB;AAIA,EAAAA,kCAAA,wBAAqB,KAArB;AAKA,EAAAA,kCAAA,sBAAmB,KAAnB;AAKA,EAAAA,kCAAA,mBAAgB,KAAhB;AAEA,EAAAA,kCAAA,uBAAoB,KAApB;AACA,EAAAA,kCAAA,uBAAoB,KAApB;AAvBU,SAAAA;AAAA,GAAA;AAqEL,MAAe,YAAa,aAAsD;AAAA,EAEvF;AAAA,EAEA,YAAY,cAA+B;AACzC,UAAM;AACN,SAAK,gBAAgB;AAAA,EACvB;AAAA;AAAA,EAGA,IAAI,eAAgC;AAClC,WAAO,KAAK;AAAA,EACd;AAAA;AAAA,EAGA,MAAM,UAAU,OAA0C;AACxD,UAAM,YAAY,QAAQ,OAAO,OAAO;AACxC,UAAM,QAAQ,MAAM,KAAK,WAAW,KAAK;AACzC,UAAM,WAAW,QAAQ,QAAQ,OAAO,OAAO,IAAI,aAAa,OAAO,GAAO,CAAC;AAC/E,SAAK,KAAK,2BAAmC;AAAA,MAC3C,WAAW,MAAM,aAAa;AAAA,MAC9B,WAAW,KAAK,IAAI;AAAA,MACpB;AAAA,MACA,OAAO,KAAK;AAAA,MACZ,eAAe,MAAM,QAAQ,KAAK,IAC9B,MAAM,OAAO,CAAC,KAAK,MAAM,MAAM,EAAE,oBAAoB,EAAE,YAAY,CAAC,IACpE,MAAM,oBAAoB,MAAM;AAAA,MACpC,UAAU;AAAA,IACZ,CAAC;AACD,WAAO;AAAA,EACT;AASF;AAkBO,MAAe,aAA2D;AAAA,EAC/E,OAA0B,iBAAiB,OAAO,gBAAgB;AAAA,EACxD,QAAQ,IAAI,mBAAoE;AAAA,EAChF,SAAS,IAAI,mBAAgC;AAAA,EAC7C,QAAQ,IAAI,mBAAgC;AAAA,EAE5C,SAAS;AAAA,EACnB;AAAA,EAEA,YAAY,KAAU;AACpB,SAAK,OAAO;AACZ,SAAK,eAAe;AAAA,EACtB;AAAA,EAEA,MAAgB,iBAAiB;AAC/B,UAAM,YAAY,QAAQ,OAAO,OAAO;AAExC,qBAAiB,SAAS,KAAK,OAAO;AACpC,WAAK,OAAO,IAAI,KAAK;AACrB,UAAI,MAAM,SAAS,0BAAmC;AACtD,YAAM,WAAW,QAAQ,OAAO,OAAO,IAAI;AAC3C,YAAM,UAAsB;AAAA,QAC1B,WAAW,KAAK,IAAI;AAAA,QACpB,WAAW,MAAM;AAAA,QACjB,UAAU,KAAK,MAAM,OAAO,WAAW,OAAO,GAAO,CAAC,CAAC;AAAA,QACvD,OAAO,KAAK;AAAA,QACZ,eAAe,MAAM,iBAAkB;AAAA,QACvC,UAAU;AAAA,MACZ;AACA,WAAK,KAAK,KAAK,2BAAmC,OAAO;AAAA,IAC3D;AACA,SAAK,OAAO,MAAM;AAAA,EACpB;AAAA;AAAA,EAGA,UAAU,OAAmB;AAC3B,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,KAAK;AAAA,EACtB;AAAA;AAAA,EAGA,QAAQ;AACN,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,aAAa,cAAc;AAAA,EAC5C;AAAA;AAAA,EAGA,WAAW;AACT,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,MAAM;AAAA,EACnB;AAAA,EAEA,OAA6C;AAC3C,WAAO,KAAK,OAAO,KAAK;AAAA,EAC1B;AAAA;AAAA,EAGA,QAAQ;AACN,SAAK,MAAM,MAAM;AACjB,SAAK,MAAM,MAAM;AACjB,SAAK,OAAO,MAAM;AAClB,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAkB;AACrC,WAAO;AAAA,EACT;AACF;","names":["SpeechEventType"]}
|
package/dist/tts/index.cjs
CHANGED
|
@@ -22,7 +22,8 @@ __export(tts_exports, {
|
|
|
22
22
|
StreamAdapter: () => import_stream_adapter.StreamAdapter,
|
|
23
23
|
StreamAdapterWrapper: () => import_stream_adapter.StreamAdapterWrapper,
|
|
24
24
|
SynthesizeStream: () => import_tts.SynthesizeStream,
|
|
25
|
-
TTS: () => import_tts.TTS
|
|
25
|
+
TTS: () => import_tts.TTS,
|
|
26
|
+
TTSEvent: () => import_tts.TTSEvent
|
|
26
27
|
});
|
|
27
28
|
module.exports = __toCommonJS(tts_exports);
|
|
28
29
|
var import_tts = require("./tts.cjs");
|
|
@@ -33,6 +34,7 @@ var import_stream_adapter = require("./stream_adapter.cjs");
|
|
|
33
34
|
StreamAdapter,
|
|
34
35
|
StreamAdapterWrapper,
|
|
35
36
|
SynthesizeStream,
|
|
36
|
-
TTS
|
|
37
|
+
TTS,
|
|
38
|
+
TTSEvent
|
|
37
39
|
});
|
|
38
40
|
//# sourceMappingURL=index.cjs.map
|
package/dist/tts/index.cjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../src/tts/index.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nexport {\n type SynthesizedAudio,\n type TTSCapabilities,\n TTS,\n SynthesizeStream,\n ChunkedStream,\n} from './tts.js';\nexport { StreamAdapter, StreamAdapterWrapper } from './stream_adapter.js';\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAGA,
|
|
1
|
+
{"version":3,"sources":["../../src/tts/index.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nexport {\n type SynthesizedAudio,\n type TTSCapabilities,\n type TTSCallbacks,\n TTS,\n TTSEvent,\n SynthesizeStream,\n ChunkedStream,\n} from './tts.js';\nexport { StreamAdapter, StreamAdapterWrapper } from './stream_adapter.js';\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAGA,iBAQO;AACP,4BAAoD;","names":[]}
|
package/dist/tts/index.d.ts
CHANGED
|
@@ -1,3 +1,3 @@
|
|
|
1
|
-
export { type SynthesizedAudio, type TTSCapabilities, TTS, SynthesizeStream, ChunkedStream, } from './tts.js';
|
|
1
|
+
export { type SynthesizedAudio, type TTSCapabilities, type TTSCallbacks, TTS, TTSEvent, SynthesizeStream, ChunkedStream, } from './tts.js';
|
|
2
2
|
export { StreamAdapter, StreamAdapterWrapper } from './stream_adapter.js';
|
|
3
3
|
//# sourceMappingURL=index.d.ts.map
|
package/dist/tts/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/tts/index.ts"],"names":[],"mappings":"AAGA,OAAO,EACL,KAAK,gBAAgB,EACrB,KAAK,eAAe,EACpB,GAAG,EACH,gBAAgB,EAChB,aAAa,GACd,MAAM,UAAU,CAAC;AAClB,OAAO,EAAE,aAAa,EAAE,oBAAoB,EAAE,MAAM,qBAAqB,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/tts/index.ts"],"names":[],"mappings":"AAGA,OAAO,EACL,KAAK,gBAAgB,EACrB,KAAK,eAAe,EACpB,KAAK,YAAY,EACjB,GAAG,EACH,QAAQ,EACR,gBAAgB,EAChB,aAAa,GACd,MAAM,UAAU,CAAC;AAClB,OAAO,EAAE,aAAa,EAAE,oBAAoB,EAAE,MAAM,qBAAqB,CAAC"}
|
package/dist/tts/index.js
CHANGED
package/dist/tts/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../src/tts/index.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nexport {\n type SynthesizedAudio,\n type TTSCapabilities,\n TTS,\n SynthesizeStream,\n ChunkedStream,\n} from './tts.js';\nexport { StreamAdapter, StreamAdapterWrapper } from './stream_adapter.js';\n"],"mappings":"AAGA;AAAA,
|
|
1
|
+
{"version":3,"sources":["../../src/tts/index.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nexport {\n type SynthesizedAudio,\n type TTSCapabilities,\n type TTSCallbacks,\n TTS,\n TTSEvent,\n SynthesizeStream,\n ChunkedStream,\n} from './tts.js';\nexport { StreamAdapter, StreamAdapterWrapper } from './stream_adapter.js';\n"],"mappings":"AAGA;AAAA,EAIE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,OACK;AACP,SAAS,eAAe,4BAA4B;","names":[]}
|
|
@@ -26,10 +26,16 @@ var import_tts = require("./tts.cjs");
|
|
|
26
26
|
class StreamAdapter extends import_tts.TTS {
|
|
27
27
|
#tts;
|
|
28
28
|
#sentenceTokenizer;
|
|
29
|
+
label;
|
|
29
30
|
constructor(tts, sentenceTokenizer) {
|
|
30
31
|
super(tts.sampleRate, tts.numChannels, { streaming: true });
|
|
31
32
|
this.#tts = tts;
|
|
32
33
|
this.#sentenceTokenizer = sentenceTokenizer;
|
|
34
|
+
this.label = this.#tts.label;
|
|
35
|
+
this.label = `tts.StreamAdapter<${this.#tts.label}>`;
|
|
36
|
+
this.#tts.on(import_tts.TTSEvent.METRICS_COLLECTED, (metrics) => {
|
|
37
|
+
this.emit(import_tts.TTSEvent.METRICS_COLLECTED, metrics);
|
|
38
|
+
});
|
|
33
39
|
}
|
|
34
40
|
synthesize(text) {
|
|
35
41
|
return this.#tts.synthesize(text);
|
|
@@ -41,12 +47,17 @@ class StreamAdapter extends import_tts.TTS {
|
|
|
41
47
|
class StreamAdapterWrapper extends import_tts.SynthesizeStream {
|
|
42
48
|
#tts;
|
|
43
49
|
#sentenceStream;
|
|
50
|
+
label;
|
|
44
51
|
constructor(tts, sentenceTokenizer) {
|
|
45
|
-
super();
|
|
52
|
+
super(tts);
|
|
46
53
|
this.#tts = tts;
|
|
47
54
|
this.#sentenceStream = sentenceTokenizer.stream();
|
|
55
|
+
this.label = `tts.StreamAdapterWrapper<${this.#tts.label}>`;
|
|
48
56
|
this.#run();
|
|
49
57
|
}
|
|
58
|
+
async monitorMetrics() {
|
|
59
|
+
return;
|
|
60
|
+
}
|
|
50
61
|
async #run() {
|
|
51
62
|
const forwardInput = async () => {
|
|
52
63
|
for await (const input of this.input) {
|
|
@@ -62,10 +73,10 @@ class StreamAdapterWrapper extends import_tts.SynthesizeStream {
|
|
|
62
73
|
const synthesize = async () => {
|
|
63
74
|
for await (const ev of this.#sentenceStream) {
|
|
64
75
|
for await (const audio of this.#tts.synthesize(ev.token)) {
|
|
65
|
-
this.
|
|
76
|
+
this.output.put(audio);
|
|
66
77
|
}
|
|
67
78
|
}
|
|
68
|
-
this.
|
|
79
|
+
this.output.put(import_tts.SynthesizeStream.END_OF_STREAM);
|
|
69
80
|
};
|
|
70
81
|
Promise.all([forwardInput(), synthesize()]);
|
|
71
82
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../src/tts/stream_adapter.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { SentenceStream, SentenceTokenizer } from '../tokenize/index.js';\nimport type { ChunkedStream } from './tts.js';\nimport { SynthesizeStream, TTS } from './tts.js';\n\nexport class StreamAdapter extends TTS {\n #tts: TTS;\n #sentenceTokenizer: SentenceTokenizer;\n\n constructor(tts: TTS, sentenceTokenizer: SentenceTokenizer) {\n super(tts.sampleRate, tts.numChannels, { streaming: true });\n this.#tts = tts;\n this.#sentenceTokenizer = sentenceTokenizer;\n }\n\n synthesize(text: string): ChunkedStream {\n return this.#tts.synthesize(text);\n }\n\n stream(): StreamAdapterWrapper {\n return new StreamAdapterWrapper(this.#tts, this.#sentenceTokenizer);\n }\n}\n\nexport class StreamAdapterWrapper extends SynthesizeStream {\n #tts: TTS;\n #sentenceStream: SentenceStream;\n\n constructor(tts: TTS, sentenceTokenizer: SentenceTokenizer) {\n super();\n this.#tts = tts;\n this.#sentenceStream = sentenceTokenizer.stream();\n\n this.#run();\n }\n\n async #run() {\n const forwardInput = async () => {\n for await (const input of this.input) {\n if (input === SynthesizeStream.FLUSH_SENTINEL) {\n this.#sentenceStream.flush();\n } else {\n this.#sentenceStream.pushText(input);\n }\n }\n this.#sentenceStream.endInput();\n this.#sentenceStream.close();\n };\n\n const synthesize = async () => {\n for await (const ev of this.#sentenceStream) {\n for await (const audio of this.#tts.synthesize(ev.token)) {\n this.
|
|
1
|
+
{"version":3,"sources":["../../src/tts/stream_adapter.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { SentenceStream, SentenceTokenizer } from '../tokenize/index.js';\nimport type { ChunkedStream } from './tts.js';\nimport { SynthesizeStream, TTS, TTSEvent } from './tts.js';\n\nexport class StreamAdapter extends TTS {\n #tts: TTS;\n #sentenceTokenizer: SentenceTokenizer;\n label: string;\n\n constructor(tts: TTS, sentenceTokenizer: SentenceTokenizer) {\n super(tts.sampleRate, tts.numChannels, { streaming: true });\n this.#tts = tts;\n this.#sentenceTokenizer = sentenceTokenizer;\n this.label = this.#tts.label;\n this.label = `tts.StreamAdapter<${this.#tts.label}>`;\n\n this.#tts.on(TTSEvent.METRICS_COLLECTED, (metrics) => {\n this.emit(TTSEvent.METRICS_COLLECTED, metrics);\n });\n }\n\n synthesize(text: string): ChunkedStream {\n return this.#tts.synthesize(text);\n }\n\n stream(): StreamAdapterWrapper {\n return new StreamAdapterWrapper(this.#tts, this.#sentenceTokenizer);\n }\n}\n\nexport class StreamAdapterWrapper extends SynthesizeStream {\n #tts: TTS;\n #sentenceStream: SentenceStream;\n label: string;\n\n constructor(tts: TTS, sentenceTokenizer: SentenceTokenizer) {\n super(tts);\n this.#tts = tts;\n this.#sentenceStream = sentenceTokenizer.stream();\n this.label = `tts.StreamAdapterWrapper<${this.#tts.label}>`;\n\n this.#run();\n }\n\n async monitorMetrics() {\n return; // do nothing\n }\n\n async #run() {\n const forwardInput = async () => {\n for await (const input of this.input) {\n if (input === SynthesizeStream.FLUSH_SENTINEL) {\n this.#sentenceStream.flush();\n } else {\n this.#sentenceStream.pushText(input);\n }\n }\n this.#sentenceStream.endInput();\n this.#sentenceStream.close();\n };\n\n const synthesize = async () => {\n for await (const ev of this.#sentenceStream) {\n for await (const audio of this.#tts.synthesize(ev.token)) {\n this.output.put(audio);\n }\n }\n this.output.put(SynthesizeStream.END_OF_STREAM);\n };\n\n Promise.all([forwardInput(), synthesize()]);\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAKA,iBAAgD;AAEzC,MAAM,sBAAsB,eAAI;AAAA,EACrC;AAAA,EACA;AAAA,EACA;AAAA,EAEA,YAAY,KAAU,mBAAsC;AAC1D,UAAM,IAAI,YAAY,IAAI,aAAa,EAAE,WAAW,KAAK,CAAC;AAC1D,SAAK,OAAO;AACZ,SAAK,qBAAqB;AAC1B,SAAK,QAAQ,KAAK,KAAK;AACvB,SAAK,QAAQ,qBAAqB,KAAK,KAAK,KAAK;AAEjD,SAAK,KAAK,GAAG,oBAAS,mBAAmB,CAAC,YAAY;AACpD,WAAK,KAAK,oBAAS,mBAAmB,OAAO;AAAA,IAC/C,CAAC;AAAA,EACH;AAAA,EAEA,WAAW,MAA6B;AACtC,WAAO,KAAK,KAAK,WAAW,IAAI;AAAA,EAClC;AAAA,EAEA,SAA+B;AAC7B,WAAO,IAAI,qBAAqB,KAAK,MAAM,KAAK,kBAAkB;AAAA,EACpE;AACF;AAEO,MAAM,6BAA6B,4BAAiB;AAAA,EACzD;AAAA,EACA;AAAA,EACA;AAAA,EAEA,YAAY,KAAU,mBAAsC;AAC1D,UAAM,GAAG;AACT,SAAK,OAAO;AACZ,SAAK,kBAAkB,kBAAkB,OAAO;AAChD,SAAK,QAAQ,4BAA4B,KAAK,KAAK,KAAK;AAExD,SAAK,KAAK;AAAA,EACZ;AAAA,EAEA,MAAM,iBAAiB;AACrB;AAAA,EACF;AAAA,EAEA,MAAM,OAAO;AACX,UAAM,eAAe,YAAY;AAC/B,uBAAiB,SAAS,KAAK,OAAO;AACpC,YAAI,UAAU,4BAAiB,gBAAgB;AAC7C,eAAK,gBAAgB,MAAM;AAAA,QAC7B,OAAO;AACL,eAAK,gBAAgB,SAAS,KAAK;AAAA,QACrC;AAAA,MACF;AACA,WAAK,gBAAgB,SAAS;AAC9B,WAAK,gBAAgB,MAAM;AAAA,IAC7B;AAEA,UAAM,aAAa,YAAY;AAC7B,uBAAiB,MAAM,KAAK,iBAAiB;AAC3C,yBAAiB,SAAS,KAAK,KAAK,WAAW,GAAG,KAAK,GAAG;AACxD,eAAK,OAAO,IAAI,KAAK;AAAA,QACvB;AAAA,MACF;AACA,WAAK,OAAO,IAAI,4BAAiB,aAAa;AAAA,IAChD;AAEA,YAAQ,IAAI,CAAC,aAAa,GAAG,WAAW,CAAC,CAAC;AAAA,EAC5C;AACF;","names":[]}
|
|
@@ -3,12 +3,15 @@ import type { ChunkedStream } from './tts.js';
|
|
|
3
3
|
import { SynthesizeStream, TTS } from './tts.js';
|
|
4
4
|
export declare class StreamAdapter extends TTS {
|
|
5
5
|
#private;
|
|
6
|
+
label: string;
|
|
6
7
|
constructor(tts: TTS, sentenceTokenizer: SentenceTokenizer);
|
|
7
8
|
synthesize(text: string): ChunkedStream;
|
|
8
9
|
stream(): StreamAdapterWrapper;
|
|
9
10
|
}
|
|
10
11
|
export declare class StreamAdapterWrapper extends SynthesizeStream {
|
|
11
12
|
#private;
|
|
13
|
+
label: string;
|
|
12
14
|
constructor(tts: TTS, sentenceTokenizer: SentenceTokenizer);
|
|
15
|
+
monitorMetrics(): Promise<void>;
|
|
13
16
|
}
|
|
14
17
|
//# sourceMappingURL=stream_adapter.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"stream_adapter.d.ts","sourceRoot":"","sources":["../../src/tts/stream_adapter.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAkB,iBAAiB,EAAE,MAAM,sBAAsB,CAAC;AAC9E,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC;AAC9C,OAAO,EAAE,gBAAgB,EAAE,GAAG,
|
|
1
|
+
{"version":3,"file":"stream_adapter.d.ts","sourceRoot":"","sources":["../../src/tts/stream_adapter.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAkB,iBAAiB,EAAE,MAAM,sBAAsB,CAAC;AAC9E,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC;AAC9C,OAAO,EAAE,gBAAgB,EAAE,GAAG,EAAY,MAAM,UAAU,CAAC;AAE3D,qBAAa,aAAc,SAAQ,GAAG;;IAGpC,KAAK,EAAE,MAAM,CAAC;gBAEF,GAAG,EAAE,GAAG,EAAE,iBAAiB,EAAE,iBAAiB;IAY1D,UAAU,CAAC,IAAI,EAAE,MAAM,GAAG,aAAa;IAIvC,MAAM,IAAI,oBAAoB;CAG/B;AAED,qBAAa,oBAAqB,SAAQ,gBAAgB;;IAGxD,KAAK,EAAE,MAAM,CAAC;gBAEF,GAAG,EAAE,GAAG,EAAE,iBAAiB,EAAE,iBAAiB;IASpD,cAAc;CA4BrB"}
|
|
@@ -1,11 +1,17 @@
|
|
|
1
|
-
import { SynthesizeStream, TTS } from "./tts.js";
|
|
1
|
+
import { SynthesizeStream, TTS, TTSEvent } from "./tts.js";
|
|
2
2
|
class StreamAdapter extends TTS {
|
|
3
3
|
#tts;
|
|
4
4
|
#sentenceTokenizer;
|
|
5
|
+
label;
|
|
5
6
|
constructor(tts, sentenceTokenizer) {
|
|
6
7
|
super(tts.sampleRate, tts.numChannels, { streaming: true });
|
|
7
8
|
this.#tts = tts;
|
|
8
9
|
this.#sentenceTokenizer = sentenceTokenizer;
|
|
10
|
+
this.label = this.#tts.label;
|
|
11
|
+
this.label = `tts.StreamAdapter<${this.#tts.label}>`;
|
|
12
|
+
this.#tts.on(TTSEvent.METRICS_COLLECTED, (metrics) => {
|
|
13
|
+
this.emit(TTSEvent.METRICS_COLLECTED, metrics);
|
|
14
|
+
});
|
|
9
15
|
}
|
|
10
16
|
synthesize(text) {
|
|
11
17
|
return this.#tts.synthesize(text);
|
|
@@ -17,12 +23,17 @@ class StreamAdapter extends TTS {
|
|
|
17
23
|
class StreamAdapterWrapper extends SynthesizeStream {
|
|
18
24
|
#tts;
|
|
19
25
|
#sentenceStream;
|
|
26
|
+
label;
|
|
20
27
|
constructor(tts, sentenceTokenizer) {
|
|
21
|
-
super();
|
|
28
|
+
super(tts);
|
|
22
29
|
this.#tts = tts;
|
|
23
30
|
this.#sentenceStream = sentenceTokenizer.stream();
|
|
31
|
+
this.label = `tts.StreamAdapterWrapper<${this.#tts.label}>`;
|
|
24
32
|
this.#run();
|
|
25
33
|
}
|
|
34
|
+
async monitorMetrics() {
|
|
35
|
+
return;
|
|
36
|
+
}
|
|
26
37
|
async #run() {
|
|
27
38
|
const forwardInput = async () => {
|
|
28
39
|
for await (const input of this.input) {
|
|
@@ -38,10 +49,10 @@ class StreamAdapterWrapper extends SynthesizeStream {
|
|
|
38
49
|
const synthesize = async () => {
|
|
39
50
|
for await (const ev of this.#sentenceStream) {
|
|
40
51
|
for await (const audio of this.#tts.synthesize(ev.token)) {
|
|
41
|
-
this.
|
|
52
|
+
this.output.put(audio);
|
|
42
53
|
}
|
|
43
54
|
}
|
|
44
|
-
this.
|
|
55
|
+
this.output.put(SynthesizeStream.END_OF_STREAM);
|
|
45
56
|
};
|
|
46
57
|
Promise.all([forwardInput(), synthesize()]);
|
|
47
58
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../src/tts/stream_adapter.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { SentenceStream, SentenceTokenizer } from '../tokenize/index.js';\nimport type { ChunkedStream } from './tts.js';\nimport { SynthesizeStream, TTS } from './tts.js';\n\nexport class StreamAdapter extends TTS {\n #tts: TTS;\n #sentenceTokenizer: SentenceTokenizer;\n\n constructor(tts: TTS, sentenceTokenizer: SentenceTokenizer) {\n super(tts.sampleRate, tts.numChannels, { streaming: true });\n this.#tts = tts;\n this.#sentenceTokenizer = sentenceTokenizer;\n }\n\n synthesize(text: string): ChunkedStream {\n return this.#tts.synthesize(text);\n }\n\n stream(): StreamAdapterWrapper {\n return new StreamAdapterWrapper(this.#tts, this.#sentenceTokenizer);\n }\n}\n\nexport class StreamAdapterWrapper extends SynthesizeStream {\n #tts: TTS;\n #sentenceStream: SentenceStream;\n\n constructor(tts: TTS, sentenceTokenizer: SentenceTokenizer) {\n super();\n this.#tts = tts;\n this.#sentenceStream = sentenceTokenizer.stream();\n\n this.#run();\n }\n\n async #run() {\n const forwardInput = async () => {\n for await (const input of this.input) {\n if (input === SynthesizeStream.FLUSH_SENTINEL) {\n this.#sentenceStream.flush();\n } else {\n this.#sentenceStream.pushText(input);\n }\n }\n this.#sentenceStream.endInput();\n this.#sentenceStream.close();\n };\n\n const synthesize = async () => {\n for await (const ev of this.#sentenceStream) {\n for await (const audio of this.#tts.synthesize(ev.token)) {\n this.
|
|
1
|
+
{"version":3,"sources":["../../src/tts/stream_adapter.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { SentenceStream, SentenceTokenizer } from '../tokenize/index.js';\nimport type { ChunkedStream } from './tts.js';\nimport { SynthesizeStream, TTS, TTSEvent } from './tts.js';\n\nexport class StreamAdapter extends TTS {\n #tts: TTS;\n #sentenceTokenizer: SentenceTokenizer;\n label: string;\n\n constructor(tts: TTS, sentenceTokenizer: SentenceTokenizer) {\n super(tts.sampleRate, tts.numChannels, { streaming: true });\n this.#tts = tts;\n this.#sentenceTokenizer = sentenceTokenizer;\n this.label = this.#tts.label;\n this.label = `tts.StreamAdapter<${this.#tts.label}>`;\n\n this.#tts.on(TTSEvent.METRICS_COLLECTED, (metrics) => {\n this.emit(TTSEvent.METRICS_COLLECTED, metrics);\n });\n }\n\n synthesize(text: string): ChunkedStream {\n return this.#tts.synthesize(text);\n }\n\n stream(): StreamAdapterWrapper {\n return new StreamAdapterWrapper(this.#tts, this.#sentenceTokenizer);\n }\n}\n\nexport class StreamAdapterWrapper extends SynthesizeStream {\n #tts: TTS;\n #sentenceStream: SentenceStream;\n label: string;\n\n constructor(tts: TTS, sentenceTokenizer: SentenceTokenizer) {\n super(tts);\n this.#tts = tts;\n this.#sentenceStream = sentenceTokenizer.stream();\n this.label = `tts.StreamAdapterWrapper<${this.#tts.label}>`;\n\n this.#run();\n }\n\n async monitorMetrics() {\n return; // do nothing\n }\n\n async #run() {\n const forwardInput = async () => {\n for await (const input of this.input) {\n if (input === SynthesizeStream.FLUSH_SENTINEL) {\n this.#sentenceStream.flush();\n } else {\n this.#sentenceStream.pushText(input);\n }\n }\n this.#sentenceStream.endInput();\n this.#sentenceStream.close();\n };\n\n const synthesize = async () => {\n for await (const ev of this.#sentenceStream) {\n for await (const audio of this.#tts.synthesize(ev.token)) {\n this.output.put(audio);\n }\n }\n this.output.put(SynthesizeStream.END_OF_STREAM);\n };\n\n Promise.all([forwardInput(), synthesize()]);\n }\n}\n"],"mappings":"AAKA,SAAS,kBAAkB,KAAK,gBAAgB;AAEzC,MAAM,sBAAsB,IAAI;AAAA,EACrC;AAAA,EACA;AAAA,EACA;AAAA,EAEA,YAAY,KAAU,mBAAsC;AAC1D,UAAM,IAAI,YAAY,IAAI,aAAa,EAAE,WAAW,KAAK,CAAC;AAC1D,SAAK,OAAO;AACZ,SAAK,qBAAqB;AAC1B,SAAK,QAAQ,KAAK,KAAK;AACvB,SAAK,QAAQ,qBAAqB,KAAK,KAAK,KAAK;AAEjD,SAAK,KAAK,GAAG,SAAS,mBAAmB,CAAC,YAAY;AACpD,WAAK,KAAK,SAAS,mBAAmB,OAAO;AAAA,IAC/C,CAAC;AAAA,EACH;AAAA,EAEA,WAAW,MAA6B;AACtC,WAAO,KAAK,KAAK,WAAW,IAAI;AAAA,EAClC;AAAA,EAEA,SAA+B;AAC7B,WAAO,IAAI,qBAAqB,KAAK,MAAM,KAAK,kBAAkB;AAAA,EACpE;AACF;AAEO,MAAM,6BAA6B,iBAAiB;AAAA,EACzD;AAAA,EACA;AAAA,EACA;AAAA,EAEA,YAAY,KAAU,mBAAsC;AAC1D,UAAM,GAAG;AACT,SAAK,OAAO;AACZ,SAAK,kBAAkB,kBAAkB,OAAO;AAChD,SAAK,QAAQ,4BAA4B,KAAK,KAAK,KAAK;AAExD,SAAK,KAAK;AAAA,EACZ;AAAA,EAEA,MAAM,iBAAiB;AACrB;AAAA,EACF;AAAA,EAEA,MAAM,OAAO;AACX,UAAM,eAAe,YAAY;AAC/B,uBAAiB,SAAS,KAAK,OAAO;AACpC,YAAI,UAAU,iBAAiB,gBAAgB;AAC7C,eAAK,gBAAgB,MAAM;AAAA,QAC7B,OAAO;AACL,eAAK,gBAAgB,SAAS,KAAK;AAAA,QACrC;AAAA,MACF;AACA,WAAK,gBAAgB,SAAS;AAC9B,WAAK,gBAAgB,MAAM;AAAA,IAC7B;AAEA,UAAM,aAAa,YAAY;AAC7B,uBAAiB,MAAM,KAAK,iBAAiB;AAC3C,yBAAiB,SAAS,KAAK,KAAK,WAAW,GAAG,KAAK,GAAG;AACxD,eAAK,OAAO,IAAI,KAAK;AAAA,QACvB;AAAA,MACF;AACA,WAAK,OAAO,IAAI,iBAAiB,aAAa;AAAA,IAChD;AAEA,YAAQ,IAAI,CAAC,aAAa,GAAG,WAAW,CAAC,CAAC;AAAA,EAC5C;AACF;","names":[]}
|
package/dist/tts/tts.cjs
CHANGED
|
@@ -20,15 +20,22 @@ var tts_exports = {};
|
|
|
20
20
|
__export(tts_exports, {
|
|
21
21
|
ChunkedStream: () => ChunkedStream,
|
|
22
22
|
SynthesizeStream: () => SynthesizeStream,
|
|
23
|
-
TTS: () => TTS
|
|
23
|
+
TTS: () => TTS,
|
|
24
|
+
TTSEvent: () => TTSEvent
|
|
24
25
|
});
|
|
25
26
|
module.exports = __toCommonJS(tts_exports);
|
|
27
|
+
var import_node_events = require("node:events");
|
|
26
28
|
var import_utils = require("../utils.cjs");
|
|
27
|
-
|
|
29
|
+
var TTSEvent = /* @__PURE__ */ ((TTSEvent2) => {
|
|
30
|
+
TTSEvent2[TTSEvent2["METRICS_COLLECTED"] = 0] = "METRICS_COLLECTED";
|
|
31
|
+
return TTSEvent2;
|
|
32
|
+
})(TTSEvent || {});
|
|
33
|
+
class TTS extends import_node_events.EventEmitter {
|
|
28
34
|
#capabilities;
|
|
29
35
|
#sampleRate;
|
|
30
36
|
#numChannels;
|
|
31
37
|
constructor(sampleRate, numChannels, capabilities) {
|
|
38
|
+
super();
|
|
32
39
|
this.#capabilities = capabilities;
|
|
33
40
|
this.#sampleRate = sampleRate;
|
|
34
41
|
this.#numChannels = numChannels;
|
|
@@ -51,9 +58,62 @@ class SynthesizeStream {
|
|
|
51
58
|
static END_OF_STREAM = Symbol("END_OF_STREAM");
|
|
52
59
|
input = new import_utils.AsyncIterableQueue();
|
|
53
60
|
queue = new import_utils.AsyncIterableQueue();
|
|
61
|
+
output = new import_utils.AsyncIterableQueue();
|
|
54
62
|
closed = false;
|
|
63
|
+
#tts;
|
|
64
|
+
#metricsPendingTexts = [];
|
|
65
|
+
#metricsText = "";
|
|
66
|
+
#monitorMetricsTask;
|
|
67
|
+
constructor(tts) {
|
|
68
|
+
this.#tts = tts;
|
|
69
|
+
}
|
|
70
|
+
async monitorMetrics() {
|
|
71
|
+
const startTime = process.hrtime.bigint();
|
|
72
|
+
let audioDuration = 0;
|
|
73
|
+
let ttfb;
|
|
74
|
+
let requestId = "";
|
|
75
|
+
const emit = () => {
|
|
76
|
+
if (this.#metricsPendingTexts.length) {
|
|
77
|
+
const text = this.#metricsPendingTexts.shift();
|
|
78
|
+
const duration = process.hrtime.bigint() - startTime;
|
|
79
|
+
const metrics = {
|
|
80
|
+
timestamp: Date.now(),
|
|
81
|
+
requestId,
|
|
82
|
+
ttfb: Math.trunc(Number(ttfb / BigInt(1e6))),
|
|
83
|
+
duration: Math.trunc(Number(duration / BigInt(1e6))),
|
|
84
|
+
charactersCount: text.length,
|
|
85
|
+
audioDuration,
|
|
86
|
+
cancelled: false,
|
|
87
|
+
// XXX(nbsp)
|
|
88
|
+
label: this.label,
|
|
89
|
+
streamed: false
|
|
90
|
+
};
|
|
91
|
+
this.#tts.emit(0 /* METRICS_COLLECTED */, metrics);
|
|
92
|
+
}
|
|
93
|
+
};
|
|
94
|
+
for await (const audio of this.queue) {
|
|
95
|
+
this.output.put(audio);
|
|
96
|
+
if (audio === SynthesizeStream.END_OF_STREAM) continue;
|
|
97
|
+
requestId = audio.requestId;
|
|
98
|
+
if (!ttfb) {
|
|
99
|
+
ttfb = process.hrtime.bigint() - startTime;
|
|
100
|
+
}
|
|
101
|
+
audioDuration += audio.frame.samplesPerChannel / audio.frame.sampleRate;
|
|
102
|
+
if (audio.final) {
|
|
103
|
+
emit();
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
if (requestId) {
|
|
107
|
+
emit();
|
|
108
|
+
}
|
|
109
|
+
this.output.close();
|
|
110
|
+
}
|
|
55
111
|
/** Push a string of text to the TTS */
|
|
56
112
|
pushText(text) {
|
|
113
|
+
if (!this.#monitorMetricsTask) {
|
|
114
|
+
this.#monitorMetricsTask = this.monitorMetrics();
|
|
115
|
+
}
|
|
116
|
+
this.#metricsText += text;
|
|
57
117
|
if (this.input.closed) {
|
|
58
118
|
throw new Error("Input is closed");
|
|
59
119
|
}
|
|
@@ -64,6 +124,10 @@ class SynthesizeStream {
|
|
|
64
124
|
}
|
|
65
125
|
/** Flush the TTS, causing it to process all pending text */
|
|
66
126
|
flush() {
|
|
127
|
+
if (this.#metricsText) {
|
|
128
|
+
this.#metricsPendingTexts.push(this.#metricsText);
|
|
129
|
+
this.#metricsText = "";
|
|
130
|
+
}
|
|
67
131
|
if (this.input.closed) {
|
|
68
132
|
throw new Error("Input is closed");
|
|
69
133
|
}
|
|
@@ -83,12 +147,12 @@ class SynthesizeStream {
|
|
|
83
147
|
this.input.close();
|
|
84
148
|
}
|
|
85
149
|
next() {
|
|
86
|
-
return this.
|
|
150
|
+
return this.output.next();
|
|
87
151
|
}
|
|
88
152
|
/** Close both the input and output of the TTS stream */
|
|
89
153
|
close() {
|
|
90
154
|
this.input.close();
|
|
91
|
-
this.
|
|
155
|
+
this.output.close();
|
|
92
156
|
this.closed = true;
|
|
93
157
|
}
|
|
94
158
|
[Symbol.asyncIterator]() {
|
|
@@ -97,7 +161,44 @@ class SynthesizeStream {
|
|
|
97
161
|
}
|
|
98
162
|
class ChunkedStream {
|
|
99
163
|
queue = new import_utils.AsyncIterableQueue();
|
|
164
|
+
output = new import_utils.AsyncIterableQueue();
|
|
100
165
|
closed = false;
|
|
166
|
+
#text;
|
|
167
|
+
#tts;
|
|
168
|
+
constructor(text, tts) {
|
|
169
|
+
this.#text = text;
|
|
170
|
+
this.#tts = tts;
|
|
171
|
+
this.monitorMetrics();
|
|
172
|
+
}
|
|
173
|
+
async monitorMetrics() {
|
|
174
|
+
const startTime = process.hrtime.bigint();
|
|
175
|
+
let audioDuration = 0;
|
|
176
|
+
let ttfb;
|
|
177
|
+
let requestId = "";
|
|
178
|
+
for await (const audio of this.queue) {
|
|
179
|
+
this.output.put(audio);
|
|
180
|
+
requestId = audio.requestId;
|
|
181
|
+
if (!ttfb) {
|
|
182
|
+
ttfb = process.hrtime.bigint() - startTime;
|
|
183
|
+
}
|
|
184
|
+
audioDuration += audio.frame.samplesPerChannel / audio.frame.sampleRate;
|
|
185
|
+
}
|
|
186
|
+
this.output.close();
|
|
187
|
+
const duration = process.hrtime.bigint() - startTime;
|
|
188
|
+
const metrics = {
|
|
189
|
+
timestamp: Date.now(),
|
|
190
|
+
requestId,
|
|
191
|
+
ttfb: Math.trunc(Number(ttfb / BigInt(1e6))),
|
|
192
|
+
duration: Math.trunc(Number(duration / BigInt(1e6))),
|
|
193
|
+
charactersCount: this.#text.length,
|
|
194
|
+
audioDuration,
|
|
195
|
+
cancelled: false,
|
|
196
|
+
// XXX(nbsp)
|
|
197
|
+
label: this.label,
|
|
198
|
+
streamed: false
|
|
199
|
+
};
|
|
200
|
+
this.#tts.emit(0 /* METRICS_COLLECTED */, metrics);
|
|
201
|
+
}
|
|
101
202
|
/** Collect every frame into one in a single call */
|
|
102
203
|
async collect() {
|
|
103
204
|
const frames = [];
|
|
@@ -107,11 +208,12 @@ class ChunkedStream {
|
|
|
107
208
|
return (0, import_utils.mergeFrames)(frames);
|
|
108
209
|
}
|
|
109
210
|
next() {
|
|
110
|
-
return this.
|
|
211
|
+
return this.output.next();
|
|
111
212
|
}
|
|
112
213
|
/** Close both the input and output of the TTS stream */
|
|
113
214
|
close() {
|
|
114
215
|
this.queue.close();
|
|
216
|
+
this.output.close();
|
|
115
217
|
this.closed = true;
|
|
116
218
|
}
|
|
117
219
|
[Symbol.asyncIterator]() {
|
|
@@ -122,6 +224,7 @@ class ChunkedStream {
|
|
|
122
224
|
0 && (module.exports = {
|
|
123
225
|
ChunkedStream,
|
|
124
226
|
SynthesizeStream,
|
|
125
|
-
TTS
|
|
227
|
+
TTS,
|
|
228
|
+
TTSEvent
|
|
126
229
|
});
|
|
127
230
|
//# sourceMappingURL=tts.cjs.map
|