@livekit/agents 0.4.3 → 0.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +1 -1
- package/CHANGELOG.md +12 -0
- package/dist/pipeline/agent_output.js +3 -9
- package/dist/pipeline/agent_output.js.map +1 -1
- package/dist/pipeline/agent_playout.d.ts.map +1 -1
- package/dist/pipeline/agent_playout.js +2 -1
- package/dist/pipeline/agent_playout.js.map +1 -1
- package/dist/pipeline/index.d.ts +1 -1
- package/dist/pipeline/index.d.ts.map +1 -1
- package/dist/pipeline/index.js +1 -1
- package/dist/pipeline/index.js.map +1 -1
- package/dist/pipeline/pipeline_agent.d.ts +1 -1
- package/dist/pipeline/pipeline_agent.d.ts.map +1 -1
- package/dist/pipeline/pipeline_agent.js +11 -2
- package/dist/pipeline/pipeline_agent.js.map +1 -1
- package/dist/stt/index.d.ts +1 -0
- package/dist/stt/index.d.ts.map +1 -1
- package/dist/stt/index.js +1 -0
- package/dist/stt/index.js.map +1 -1
- package/dist/stt/stream_adapter.d.ts +15 -0
- package/dist/stt/stream_adapter.d.ts.map +1 -0
- package/dist/stt/stream_adapter.js +59 -0
- package/dist/stt/stream_adapter.js.map +1 -0
- package/dist/stt/stt.d.ts +3 -0
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js.map +1 -1
- package/dist/tokenize/token_stream.d.ts +1 -0
- package/dist/tokenize/token_stream.d.ts.map +1 -1
- package/dist/tokenize/token_stream.js +4 -1
- package/dist/tokenize/token_stream.js.map +1 -1
- package/dist/tts/index.d.ts +2 -1
- package/dist/tts/index.d.ts.map +1 -1
- package/dist/tts/index.js +2 -1
- package/dist/tts/index.js.map +1 -1
- package/dist/tts/stream_adapter.d.ts +14 -0
- package/dist/tts/stream_adapter.d.ts.map +1 -0
- package/dist/tts/stream_adapter.js +50 -0
- package/dist/tts/stream_adapter.js.map +1 -0
- package/dist/tts/tts.d.ts +28 -0
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js +38 -1
- package/dist/tts/tts.js.map +1 -1
- package/package.json +1 -1
- package/src/pipeline/agent_output.ts +4 -8
- package/src/pipeline/agent_playout.ts +3 -1
- package/src/pipeline/index.ts +1 -1
- package/src/pipeline/pipeline_agent.ts +13 -3
- package/src/stt/index.ts +1 -0
- package/src/stt/stream_adapter.ts +75 -0
- package/src/stt/stt.ts +4 -0
- package/src/tokenize/token_stream.ts +5 -1
- package/src/tts/index.ts +8 -1
- package/src/tts/stream_adapter.ts +63 -0
- package/src/tts/tts.ts +48 -1
- package/tsconfig.tsbuildinfo +1 -1
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import { SynthesizeStream, TTS } from './tts.js';
|
|
2
|
+
export class StreamAdapter extends TTS {
|
|
3
|
+
#tts;
|
|
4
|
+
#sentenceTokenizer;
|
|
5
|
+
constructor(tts, sentenceTokenizer) {
|
|
6
|
+
super(tts.sampleRate, tts.numChannels, { streaming: true });
|
|
7
|
+
this.#tts = tts;
|
|
8
|
+
this.#sentenceTokenizer = sentenceTokenizer;
|
|
9
|
+
}
|
|
10
|
+
synthesize(text) {
|
|
11
|
+
return this.#tts.synthesize(text);
|
|
12
|
+
}
|
|
13
|
+
stream() {
|
|
14
|
+
return new StreamAdapterWrapper(this.#tts, this.#sentenceTokenizer);
|
|
15
|
+
}
|
|
16
|
+
}
|
|
17
|
+
export class StreamAdapterWrapper extends SynthesizeStream {
|
|
18
|
+
#tts;
|
|
19
|
+
#sentenceStream;
|
|
20
|
+
constructor(tts, sentenceTokenizer) {
|
|
21
|
+
super();
|
|
22
|
+
this.#tts = tts;
|
|
23
|
+
this.#sentenceStream = sentenceTokenizer.stream();
|
|
24
|
+
this.#run();
|
|
25
|
+
}
|
|
26
|
+
async #run() {
|
|
27
|
+
const forwardInput = async () => {
|
|
28
|
+
for await (const input of this.input) {
|
|
29
|
+
if (input === SynthesizeStream.FLUSH_SENTINEL) {
|
|
30
|
+
this.#sentenceStream.flush();
|
|
31
|
+
}
|
|
32
|
+
else {
|
|
33
|
+
this.#sentenceStream.pushText(input);
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
this.#sentenceStream.endInput();
|
|
37
|
+
this.#sentenceStream.close();
|
|
38
|
+
};
|
|
39
|
+
const synthesize = async () => {
|
|
40
|
+
for await (const ev of this.#sentenceStream) {
|
|
41
|
+
for await (const audio of this.#tts.synthesize(ev.token)) {
|
|
42
|
+
this.queue.put(audio);
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
this.queue.put(SynthesizeStream.END_OF_STREAM);
|
|
46
|
+
};
|
|
47
|
+
Promise.all([forwardInput(), synthesize()]);
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
//# sourceMappingURL=stream_adapter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"stream_adapter.js","sourceRoot":"","sources":["../../src/tts/stream_adapter.ts"],"names":[],"mappings":"AAKA,OAAO,EAAE,gBAAgB,EAAE,GAAG,EAAE,MAAM,UAAU,CAAC;AAEjD,MAAM,OAAO,aAAc,SAAQ,GAAG;IACpC,IAAI,CAAM;IACV,kBAAkB,CAAoB;IAEtC,YAAY,GAAQ,EAAE,iBAAoC;QACxD,KAAK,CAAC,GAAG,CAAC,UAAU,EAAE,GAAG,CAAC,WAAW,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QAC5D,IAAI,CAAC,IAAI,GAAG,GAAG,CAAC;QAChB,IAAI,CAAC,kBAAkB,GAAG,iBAAiB,CAAC;IAC9C,CAAC;IAED,UAAU,CAAC,IAAY;QACrB,OAAO,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;IACpC,CAAC;IAED,MAAM;QACJ,OAAO,IAAI,oBAAoB,CAAC,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,kBAAkB,CAAC,CAAC;IACtE,CAAC;CACF;AAED,MAAM,OAAO,oBAAqB,SAAQ,gBAAgB;IACxD,IAAI,CAAM;IACV,eAAe,CAAiB;IAEhC,YAAY,GAAQ,EAAE,iBAAoC;QACxD,KAAK,EAAE,CAAC;QACR,IAAI,CAAC,IAAI,GAAG,GAAG,CAAC;QAChB,IAAI,CAAC,eAAe,GAAG,iBAAiB,CAAC,MAAM,EAAE,CAAC;QAElD,IAAI,CAAC,IAAI,EAAE,CAAC;IACd,CAAC;IAED,KAAK,CAAC,IAAI;QACR,MAAM,YAAY,GAAG,KAAK,IAAI,EAAE;YAC9B,IAAI,KAAK,EAAE,MAAM,KAAK,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;gBACrC,IAAI,KAAK,KAAK,gBAAgB,CAAC,cAAc,EAAE,CAAC;oBAC9C,IAAI,CAAC,eAAe,CAAC,KAAK,EAAE,CAAC;gBAC/B,CAAC;qBAAM,CAAC;oBACN,IAAI,CAAC,eAAe,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC;gBACvC,CAAC;YACH,CAAC;YACD,IAAI,CAAC,eAAe,CAAC,QAAQ,EAAE,CAAC;YAChC,IAAI,CAAC,eAAe,CAAC,KAAK,EAAE,CAAC;QAC/B,CAAC,CAAC;QAEF,MAAM,UAAU,GAAG,KAAK,IAAI,EAAE;YAC5B,IAAI,KAAK,EAAE,MAAM,EAAE,IAAI,IAAI,CAAC,eAAe,EAAE,CAAC;gBAC5C,IAAI,KAAK,EAAE,MAAM,KAAK,IAAI,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,EAAE,CAAC,KAAK,CAAC,EAAE,CAAC;oBACzD,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;gBACxB,CAAC;YACH,CAAC;YACD,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,gBAAgB,CAAC,aAAa,CAAC,CAAC;QACjD,CAAC,CAAC;QAEF,OAAO,CAAC,GAAG,CAAC,CAAC,YAAY,EAAE,EAAE,UAAU,EAAE,CAAC,CAAC,CAAC;IAC9C,CAAC;CACF"}
|
package/dist/tts/tts.d.ts
CHANGED
|
@@ -37,6 +37,10 @@ export declare abstract class TTS {
|
|
|
37
37
|
get sampleRate(): number;
|
|
38
38
|
/** Returns the channel count of audio frames returned by this TTS */
|
|
39
39
|
get numChannels(): number;
|
|
40
|
+
/**
|
|
41
|
+
* Receives text and returns synthesis in the form of a {@link ChunkedStream}
|
|
42
|
+
*/
|
|
43
|
+
abstract synthesize(text: string): ChunkedStream;
|
|
40
44
|
/**
|
|
41
45
|
* Returns a {@link SynthesizeStream} that can be used to push text and receive audio data
|
|
42
46
|
*/
|
|
@@ -73,4 +77,28 @@ export declare abstract class SynthesizeStream implements AsyncIterableIterator<
|
|
|
73
77
|
close(): void;
|
|
74
78
|
[Symbol.asyncIterator](): SynthesizeStream;
|
|
75
79
|
}
|
|
80
|
+
/**
|
|
81
|
+
* An instance of a text-to-speech response, as an asynchronous iterable iterator.
|
|
82
|
+
*
|
|
83
|
+
* @example Looping through frames
|
|
84
|
+
* ```ts
|
|
85
|
+
* for await (const event of stream) {
|
|
86
|
+
* await source.captureFrame(event.frame);
|
|
87
|
+
* }
|
|
88
|
+
* ```
|
|
89
|
+
*
|
|
90
|
+
* @remarks
|
|
91
|
+
* This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that
|
|
92
|
+
* exports its own child ChunkedStream class, which inherits this class's methods.
|
|
93
|
+
*/
|
|
94
|
+
export declare abstract class ChunkedStream implements AsyncIterableIterator<SynthesizedAudio> {
|
|
95
|
+
protected queue: AsyncIterableQueue<SynthesizedAudio>;
|
|
96
|
+
protected closed: boolean;
|
|
97
|
+
/** Collect every frame into one in a single call */
|
|
98
|
+
collect(): Promise<AudioFrame>;
|
|
99
|
+
next(): Promise<IteratorResult<SynthesizedAudio>>;
|
|
100
|
+
/** Close both the input and output of the TTS stream */
|
|
101
|
+
close(): void;
|
|
102
|
+
[Symbol.asyncIterator](): ChunkedStream;
|
|
103
|
+
}
|
|
76
104
|
//# sourceMappingURL=tts.d.ts.map
|
package/dist/tts/tts.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"tts.d.ts","sourceRoot":"","sources":["../../src/tts/tts.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AACpD,OAAO,EAAE,kBAAkB,
|
|
1
|
+
{"version":3,"file":"tts.d.ts","sourceRoot":"","sources":["../../src/tts/tts.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AACpD,OAAO,EAAE,kBAAkB,EAAe,MAAM,aAAa,CAAC;AAE9D,+EAA+E;AAC/E,MAAM,WAAW,gBAAgB;IAC/B,qEAAqE;IACrE,SAAS,EAAE,MAAM,CAAC;IAClB,uDAAuD;IACvD,SAAS,EAAE,MAAM,CAAC;IAClB,8BAA8B;IAC9B,KAAK,EAAE,UAAU,CAAC;IAClB,+CAA+C;IAC/C,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED;;;;;;GAMG;AACH,MAAM,WAAW,eAAe;IAC9B,SAAS,EAAE,OAAO,CAAC;CACpB;AAED;;;;;;GAMG;AACH,8BAAsB,GAAG;;gBAKX,UAAU,EAAE,MAAM,EAAE,WAAW,EAAE,MAAM,EAAE,YAAY,EAAE,eAAe;IAMlF,sCAAsC;IACtC,IAAI,YAAY,IAAI,eAAe,CAElC;IAED,mEAAmE;IACnE,IAAI,UAAU,IAAI,MAAM,CAEvB;IAED,qEAAqE;IACrE,IAAI,WAAW,IAAI,MAAM,CAExB;IAED;;OAEG;IACH,QAAQ,CAAC,UAAU,CAAC,IAAI,EAAE,MAAM,GAAG,aAAa;IAEhD;;OAEG;IACH,QAAQ,CAAC,MAAM,IAAI,gBAAgB;CACpC;AAED;;;;;;;;;;;;;GAaG;AACH,8BAAsB,gBACpB,YAAW,qBAAqB,CAAC,gBAAgB,GAAG,OAAO,gBAAgB,CAAC,aAAa,CAAC;IAE1F,SAAS,CAAC,MAAM,CAAC,QAAQ,CAAC,cAAc,gBAA4B;IACpE,MAAM,CAAC,QAAQ,CAAC,aAAa,gBAA2B;IACxD,SAAS,CAAC,KAAK,sEAA6E;IAC5F,SAAS,CAAC,KAAK,+EAEX;IACJ,SAAS,CAAC,MAAM,UAAS;IAEzB,uCAAuC;IACvC,QAAQ,CAAC,IAAI,EAAE,MAAM;IAUrB,4DAA4D;IAC5D,KAAK;IAUL,2DAA2D;IAC3D,QAAQ;IAUR,IAAI,IAAI,OAAO,CAAC,cAAc,CAAC,gBAAgB,GAAG,OAAO,gBAAgB,CAAC,aAAa,CAAC,CAAC;IAIzF,wDAAwD;IACxD,KAAK;IAML,CAAC,MAAM,CAAC,aAAa,CAAC,IAAI,gBAAgB;CAG3C;AAED;;;;;;;;;;;;;GAaG;AACH,8BAAsB,aAAc,YAAW,qBAAqB,CAAC,gBAAgB,CAAC;IACpF,SAAS,CAAC,KAAK,uCAA8C;IAC7D,SAAS,CAAC,MAAM,UAAS;IAEzB,oDAAoD;IAC9C,OAAO,IAAI,OAAO,CAAC,UAAU,CAAC;IAQpC,IAAI,IAAI,OAAO,CAAC,cAAc,CAAC,gBAAgB,CAAC,CAAC;IAIjD,wDAAwD;IACxD,KAAK;IAKL,CAAC,MAAM,CAAC,aAAa,CAAC,IAAI,aAAa;CAGxC"}
|
package/dist/tts/tts.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { AsyncIterableQueue } from '../utils.js';
|
|
1
|
+
import { AsyncIterableQueue, mergeFrames } from '../utils.js';
|
|
2
2
|
/**
|
|
3
3
|
* An instance of a text-to-speech adapter.
|
|
4
4
|
*
|
|
@@ -91,4 +91,41 @@ export class SynthesizeStream {
|
|
|
91
91
|
return this;
|
|
92
92
|
}
|
|
93
93
|
}
|
|
94
|
+
/**
|
|
95
|
+
* An instance of a text-to-speech response, as an asynchronous iterable iterator.
|
|
96
|
+
*
|
|
97
|
+
* @example Looping through frames
|
|
98
|
+
* ```ts
|
|
99
|
+
* for await (const event of stream) {
|
|
100
|
+
* await source.captureFrame(event.frame);
|
|
101
|
+
* }
|
|
102
|
+
* ```
|
|
103
|
+
*
|
|
104
|
+
* @remarks
|
|
105
|
+
* This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that
|
|
106
|
+
* exports its own child ChunkedStream class, which inherits this class's methods.
|
|
107
|
+
*/
|
|
108
|
+
export class ChunkedStream {
|
|
109
|
+
queue = new AsyncIterableQueue();
|
|
110
|
+
closed = false;
|
|
111
|
+
/** Collect every frame into one in a single call */
|
|
112
|
+
async collect() {
|
|
113
|
+
const frames = [];
|
|
114
|
+
for await (const event of this) {
|
|
115
|
+
frames.push(event.frame);
|
|
116
|
+
}
|
|
117
|
+
return mergeFrames(frames);
|
|
118
|
+
}
|
|
119
|
+
next() {
|
|
120
|
+
return this.queue.next();
|
|
121
|
+
}
|
|
122
|
+
/** Close both the input and output of the TTS stream */
|
|
123
|
+
close() {
|
|
124
|
+
this.queue.close();
|
|
125
|
+
this.closed = true;
|
|
126
|
+
}
|
|
127
|
+
[Symbol.asyncIterator]() {
|
|
128
|
+
return this;
|
|
129
|
+
}
|
|
130
|
+
}
|
|
94
131
|
//# sourceMappingURL=tts.js.map
|
package/dist/tts/tts.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"tts.js","sourceRoot":"","sources":["../../src/tts/tts.ts"],"names":[],"mappings":"AAIA,OAAO,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC;
|
|
1
|
+
{"version":3,"file":"tts.js","sourceRoot":"","sources":["../../src/tts/tts.ts"],"names":[],"mappings":"AAIA,OAAO,EAAE,kBAAkB,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAyB9D;;;;;;GAMG;AACH,MAAM,OAAgB,GAAG;IACvB,aAAa,CAAkB;IAC/B,WAAW,CAAS;IACpB,YAAY,CAAS;IAErB,YAAY,UAAkB,EAAE,WAAmB,EAAE,YAA6B;QAChF,IAAI,CAAC,aAAa,GAAG,YAAY,CAAC;QAClC,IAAI,CAAC,WAAW,GAAG,UAAU,CAAC;QAC9B,IAAI,CAAC,YAAY,GAAG,WAAW,CAAC;IAClC,CAAC;IAED,sCAAsC;IACtC,IAAI,YAAY;QACd,OAAO,IAAI,CAAC,aAAa,CAAC;IAC5B,CAAC;IAED,mEAAmE;IACnE,IAAI,UAAU;QACZ,OAAO,IAAI,CAAC,WAAW,CAAC;IAC1B,CAAC;IAED,qEAAqE;IACrE,IAAI,WAAW;QACb,OAAO,IAAI,CAAC,YAAY,CAAC;IAC3B,CAAC;CAWF;AAED;;;;;;;;;;;;;GAaG;AACH,MAAM,OAAgB,gBAAgB;IAG1B,MAAM,CAAU,cAAc,GAAG,MAAM,CAAC,gBAAgB,CAAC,CAAC;IACpE,MAAM,CAAU,aAAa,GAAG,MAAM,CAAC,eAAe,CAAC,CAAC;IAC9C,KAAK,GAAG,IAAI,kBAAkB,EAAmD,CAAC;IAClF,KAAK,GAAG,IAAI,kBAAkB,EAErC,CAAC;IACM,MAAM,GAAG,KAAK,CAAC;IAEzB,uCAAuC;IACvC,QAAQ,CAAC,IAAY;QACnB,IAAI,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC;YACtB,MAAM,IAAI,KAAK,CAAC,iBAAiB,CAAC,CAAC;QACrC,CAAC;QACD,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC;YAChB,MAAM,IAAI,KAAK,CAAC,kBAAkB,CAAC,CAAC;QACtC,CAAC;QACD,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;IACvB,CAAC;IAED,4DAA4D;IAC5D,KAAK;QACH,IAAI,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC;YACtB,MAAM,IAAI,KAAK,CAAC,iBAAiB,CAAC,CAAC;QACrC,CAAC;QACD,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC;YAChB,MAAM,IAAI,KAAK,CAAC,kBAAkB,CAAC,CAAC;QACtC,CAAC;QACD,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,gBAAgB,CAAC,cAAc,CAAC,CAAC;IAClD,CAAC;IAED,2DAA2D;IAC3D,QAAQ;QACN,IAAI,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC;YACtB,MAAM,IAAI,KAAK,CAAC,iBAAiB,CAAC,CAAC;QACrC,CAAC;QACD,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC;YAChB,MAAM,IAAI,KAAK,CAAC,kBAAkB,CAAC,CAAC;QACtC,CAAC;QACD,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC;IACrB,CAAC;IAED,IAAI;QACF,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;IAC3B,CAAC;IAED,wDAAwD;IACxD,KAAK;QACH,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC;QACnB,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC;QACnB,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC;IACrB,CAAC;IAED,CAAC,MAAM,CAAC,aAAa,CAAC;QACpB,OAAO,IAAI,CAAC;IACd,CAAC;;AAGH;;;;;;;;;;;;;GAaG;AACH,MAAM,OAAgB,aAAa;IACvB,KAAK,GAAG,IAAI,kBAAkB,EAAoB,CAAC;IACnD,MAAM,GAAG,KAAK,CAAC;IAEzB,oDAAoD;IACpD,KAAK,CAAC,OAAO;QACX,MAAM,MAAM,GAAG,EAAE,CAAC;QAClB,IAAI,KAAK,EAAE,MAAM,KAAK,IAAI,IAAI,EAAE,CAAC;YAC/B,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;QAC3B,CAAC;QACD,OAAO,WAAW,CAAC,MAAM,CAAC,CAAC;IAC7B,CAAC;IAED,IAAI;QACF,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;IAC3B,CAAC;IAED,wDAAwD;IACxD,KAAK;QACH,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC;QACnB,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC;IACrB,CAAC;IAED,CAAC,MAAM,CAAC,aAAa,CAAC;QACpB,OAAO,IAAI,CAAC;IACd,CAAC;CACF"}
|
package/package.json
CHANGED
|
@@ -134,7 +134,9 @@ const stringSynthesisTask = (text: string, handle: SynthesisHandle): Cancellable
|
|
|
134
134
|
ttsStream.flush();
|
|
135
135
|
ttsStream.endInput();
|
|
136
136
|
for await (const audio of ttsStream) {
|
|
137
|
-
if (cancelled || audio === SynthesizeStream.END_OF_STREAM)
|
|
137
|
+
if (cancelled || audio === SynthesizeStream.END_OF_STREAM) {
|
|
138
|
+
break;
|
|
139
|
+
}
|
|
138
140
|
handle.queue.put(audio.frame);
|
|
139
141
|
}
|
|
140
142
|
handle.queue.put(SynthesisHandle.FLUSH_SENTINEL);
|
|
@@ -156,18 +158,12 @@ const streamSynthesisTask = (
|
|
|
156
158
|
|
|
157
159
|
const ttsStream = handle.tts.stream();
|
|
158
160
|
const readGeneratedAudio = async () => {
|
|
159
|
-
let started = false;
|
|
160
161
|
for await (const audio of ttsStream) {
|
|
161
162
|
if (cancelled) break;
|
|
162
163
|
if (audio === SynthesizeStream.END_OF_STREAM) {
|
|
163
|
-
|
|
164
|
-
break;
|
|
165
|
-
} else {
|
|
166
|
-
continue;
|
|
167
|
-
}
|
|
164
|
+
break;
|
|
168
165
|
}
|
|
169
166
|
handle.queue.put(audio.frame);
|
|
170
|
-
started = true;
|
|
171
167
|
}
|
|
172
168
|
handle.queue.put(SynthesisHandle.FLUSH_SENTINEL);
|
|
173
169
|
};
|
|
@@ -148,7 +148,9 @@ export class AgentPlayout extends (EventEmitter as new () => TypedEmitter<AgentP
|
|
|
148
148
|
});
|
|
149
149
|
|
|
150
150
|
for await (const frame of handle.playoutSource) {
|
|
151
|
-
if (cancelled || frame === SynthesisHandle.FLUSH_SENTINEL)
|
|
151
|
+
if (cancelled || frame === SynthesisHandle.FLUSH_SENTINEL) {
|
|
152
|
+
break;
|
|
153
|
+
}
|
|
152
154
|
if (firstFrame) {
|
|
153
155
|
this.#logger
|
|
154
156
|
.child({ speechId: handle.speechId })
|
package/src/pipeline/index.ts
CHANGED
|
@@ -6,10 +6,10 @@ export {
|
|
|
6
6
|
type AgentState,
|
|
7
7
|
type BeforeTTSCallback,
|
|
8
8
|
type BeforeLLMCallback,
|
|
9
|
-
type VPAEvent,
|
|
10
9
|
type VPACallbacks,
|
|
11
10
|
type AgentCallContext,
|
|
12
11
|
type AgentTranscriptionOptions,
|
|
13
12
|
type VPAOptions,
|
|
13
|
+
VPAEvent,
|
|
14
14
|
VoicePipelineAgent,
|
|
15
15
|
} from './pipeline_agent.js';
|
|
@@ -20,7 +20,7 @@ import type {
|
|
|
20
20
|
import { LLMStream } from '../llm/index.js';
|
|
21
21
|
import { ChatContext, ChatMessage, ChatRole } from '../llm/index.js';
|
|
22
22
|
import { log } from '../log.js';
|
|
23
|
-
import type
|
|
23
|
+
import { type STT, StreamAdapter as STTStreamAdapter } from '../stt/index.js';
|
|
24
24
|
import {
|
|
25
25
|
SentenceTokenizer as BasicSentenceTokenizer,
|
|
26
26
|
WordTokenizer as BasicWordTokenizer,
|
|
@@ -28,6 +28,7 @@ import {
|
|
|
28
28
|
} from '../tokenize/basic/index.js';
|
|
29
29
|
import type { SentenceTokenizer, WordTokenizer } from '../tokenize/tokenizer.js';
|
|
30
30
|
import type { TTS } from '../tts/index.js';
|
|
31
|
+
import { StreamAdapter as TTSStreamAdapter } from '../tts/index.js';
|
|
31
32
|
import { AsyncIterableQueue, CancellablePromise, Future, gracefullyCancel } from '../utils.js';
|
|
32
33
|
import type { VAD, VADEvent } from '../vad.js';
|
|
33
34
|
import type { SpeechSource, SynthesisHandle } from './agent_output.js';
|
|
@@ -252,6 +253,14 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
|
|
|
252
253
|
|
|
253
254
|
this.#opts = { ...defaultVPAOptions, ...opts };
|
|
254
255
|
|
|
256
|
+
if (!stt.capabilities.streaming) {
|
|
257
|
+
stt = new STTStreamAdapter(stt, vad);
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
if (!tts.capabilities.streaming) {
|
|
261
|
+
tts = new TTSStreamAdapter(tts, new BasicSentenceTokenizer());
|
|
262
|
+
}
|
|
263
|
+
|
|
255
264
|
this.#vad = vad;
|
|
256
265
|
this.#stt = stt;
|
|
257
266
|
this.#llm = llm;
|
|
@@ -593,6 +602,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
|
|
|
593
602
|
this.emit(VPAEvent.USER_SPEECH_COMMITTED, userMsg);
|
|
594
603
|
|
|
595
604
|
this.#transcribedText = this.#transcribedText.slice(userQuestion.length);
|
|
605
|
+
handle.markUserCommitted();
|
|
596
606
|
};
|
|
597
607
|
|
|
598
608
|
// wait for the playHandle to finish and check every 1s if user question should be committed
|
|
@@ -618,7 +628,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
|
|
|
618
628
|
// if the answer is using tools, execute the functions and automatically generate
|
|
619
629
|
// a response to the user question from the returned values
|
|
620
630
|
if (isUsingTools && !interrupted) {
|
|
621
|
-
if (!userQuestion || handle.userCommitted) {
|
|
631
|
+
if (!userQuestion || !handle.userCommitted) {
|
|
622
632
|
throw new Error('user speech should have been committed before using tools');
|
|
623
633
|
}
|
|
624
634
|
const llmStream = handle.source;
|
|
@@ -823,7 +833,7 @@ async function* llmStreamToStringIterable(
|
|
|
823
833
|
if (firstFrame) {
|
|
824
834
|
firstFrame = false;
|
|
825
835
|
log()
|
|
826
|
-
.child({ speechId, elapsed: Math.round(Date.now()
|
|
836
|
+
.child({ speechId, elapsed: Math.round(Date.now() - startTime) })
|
|
827
837
|
.debug('received first LLM token');
|
|
828
838
|
}
|
|
829
839
|
yield content;
|
package/src/stt/index.ts
CHANGED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
|
|
2
|
+
//
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import type { AudioFrame } from '@livekit/rtc-node';
|
|
5
|
+
import type { VAD, VADStream } from '../vad.js';
|
|
6
|
+
import { VADEventType } from '../vad.js';
|
|
7
|
+
import type { SpeechEvent } from './stt.js';
|
|
8
|
+
import { STT, SpeechEventType, SpeechStream } from './stt.js';
|
|
9
|
+
|
|
10
|
+
export class StreamAdapter extends STT {
|
|
11
|
+
#stt: STT;
|
|
12
|
+
#vad: VAD;
|
|
13
|
+
|
|
14
|
+
constructor(stt: STT, vad: VAD) {
|
|
15
|
+
super({ streaming: true, interimResults: false });
|
|
16
|
+
this.#stt = stt;
|
|
17
|
+
this.#vad = vad;
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
recognize(frame: AudioFrame): Promise<SpeechEvent> {
|
|
21
|
+
return this.#stt.recognize(frame);
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
stream(): StreamAdapterWrapper {
|
|
25
|
+
return new StreamAdapterWrapper(this.#stt, this.#vad);
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
export class StreamAdapterWrapper extends SpeechStream {
|
|
30
|
+
#stt: STT;
|
|
31
|
+
#vadStream: VADStream;
|
|
32
|
+
|
|
33
|
+
constructor(stt: STT, vad: VAD) {
|
|
34
|
+
super();
|
|
35
|
+
this.#stt = stt;
|
|
36
|
+
this.#vadStream = vad.stream();
|
|
37
|
+
|
|
38
|
+
this.#run();
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
async #run() {
|
|
42
|
+
const forwardInput = async () => {
|
|
43
|
+
for await (const input of this.input) {
|
|
44
|
+
if (input === SpeechStream.FLUSH_SENTINEL) {
|
|
45
|
+
this.#vadStream.flush();
|
|
46
|
+
} else {
|
|
47
|
+
this.#vadStream.pushFrame(input);
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
this.#vadStream.endInput();
|
|
51
|
+
};
|
|
52
|
+
|
|
53
|
+
const recognize = async () => {
|
|
54
|
+
for await (const ev of this.#vadStream) {
|
|
55
|
+
switch (ev.type) {
|
|
56
|
+
case VADEventType.START_OF_SPEECH:
|
|
57
|
+
this.queue.put({ type: SpeechEventType.START_OF_SPEECH, alternatives: [] });
|
|
58
|
+
break;
|
|
59
|
+
case VADEventType.END_OF_SPEECH:
|
|
60
|
+
this.queue.put({ type: SpeechEventType.END_OF_SPEECH, alternatives: [] });
|
|
61
|
+
|
|
62
|
+
const event = await this.#stt.recognize(ev.frames);
|
|
63
|
+
if (!event.alternatives.length || !event.alternatives[0].text) {
|
|
64
|
+
continue;
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
this.queue.put(event);
|
|
68
|
+
break;
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
};
|
|
72
|
+
|
|
73
|
+
Promise.all([forwardInput(), recognize()]);
|
|
74
|
+
}
|
|
75
|
+
}
|
package/src/stt/stt.ts
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
import type { AudioFrame } from '@livekit/rtc-node';
|
|
5
|
+
import type { AudioBuffer } from '../utils.js';
|
|
5
6
|
import { AsyncIterableQueue } from '../utils.js';
|
|
6
7
|
|
|
7
8
|
/** Indicates start/middle/end of speech */
|
|
@@ -73,6 +74,9 @@ export abstract class STT {
|
|
|
73
74
|
return this.#capabilities;
|
|
74
75
|
}
|
|
75
76
|
|
|
77
|
+
/** Receives an audio buffer and returns transcription in the form of a {@link SpeechEvent} */
|
|
78
|
+
abstract recognize(frame: AudioBuffer): Promise<SpeechEvent>;
|
|
79
|
+
|
|
76
80
|
/**
|
|
77
81
|
* Returns a {@link SpeechStream} that can be used to push audio frames and receive
|
|
78
82
|
* transcriptions
|
|
@@ -39,7 +39,7 @@ export class BufferedTokenStream implements AsyncIterableIterator<TokenData> {
|
|
|
39
39
|
|
|
40
40
|
while (true) {
|
|
41
41
|
const tokens = this.#func(this.#inBuf);
|
|
42
|
-
if (tokens.length
|
|
42
|
+
if (tokens.length <= 1) break;
|
|
43
43
|
|
|
44
44
|
if (this.#outBuf) this.#outBuf += ' ';
|
|
45
45
|
|
|
@@ -130,6 +130,10 @@ export class BufferedSentenceStream extends SentenceStream {
|
|
|
130
130
|
this.#stream.pushText(text);
|
|
131
131
|
}
|
|
132
132
|
|
|
133
|
+
flush() {
|
|
134
|
+
this.#stream.flush();
|
|
135
|
+
}
|
|
136
|
+
|
|
133
137
|
close() {
|
|
134
138
|
super.close();
|
|
135
139
|
this.#stream.close();
|
package/src/tts/index.ts
CHANGED
|
@@ -1,4 +1,11 @@
|
|
|
1
1
|
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
-
export {
|
|
4
|
+
export {
|
|
5
|
+
type SynthesizedAudio,
|
|
6
|
+
type TTSCapabilities,
|
|
7
|
+
TTS,
|
|
8
|
+
SynthesizeStream,
|
|
9
|
+
ChunkedStream,
|
|
10
|
+
} from './tts.js';
|
|
11
|
+
export { StreamAdapter, StreamAdapterWrapper } from './stream_adapter.js';
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
|
|
2
|
+
//
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import type { SentenceStream, SentenceTokenizer } from '../tokenize/index.js';
|
|
5
|
+
import type { ChunkedStream } from './tts.js';
|
|
6
|
+
import { SynthesizeStream, TTS } from './tts.js';
|
|
7
|
+
|
|
8
|
+
export class StreamAdapter extends TTS {
|
|
9
|
+
#tts: TTS;
|
|
10
|
+
#sentenceTokenizer: SentenceTokenizer;
|
|
11
|
+
|
|
12
|
+
constructor(tts: TTS, sentenceTokenizer: SentenceTokenizer) {
|
|
13
|
+
super(tts.sampleRate, tts.numChannels, { streaming: true });
|
|
14
|
+
this.#tts = tts;
|
|
15
|
+
this.#sentenceTokenizer = sentenceTokenizer;
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
synthesize(text: string): ChunkedStream {
|
|
19
|
+
return this.#tts.synthesize(text);
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
stream(): StreamAdapterWrapper {
|
|
23
|
+
return new StreamAdapterWrapper(this.#tts, this.#sentenceTokenizer);
|
|
24
|
+
}
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
export class StreamAdapterWrapper extends SynthesizeStream {
|
|
28
|
+
#tts: TTS;
|
|
29
|
+
#sentenceStream: SentenceStream;
|
|
30
|
+
|
|
31
|
+
constructor(tts: TTS, sentenceTokenizer: SentenceTokenizer) {
|
|
32
|
+
super();
|
|
33
|
+
this.#tts = tts;
|
|
34
|
+
this.#sentenceStream = sentenceTokenizer.stream();
|
|
35
|
+
|
|
36
|
+
this.#run();
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
async #run() {
|
|
40
|
+
const forwardInput = async () => {
|
|
41
|
+
for await (const input of this.input) {
|
|
42
|
+
if (input === SynthesizeStream.FLUSH_SENTINEL) {
|
|
43
|
+
this.#sentenceStream.flush();
|
|
44
|
+
} else {
|
|
45
|
+
this.#sentenceStream.pushText(input);
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
this.#sentenceStream.endInput();
|
|
49
|
+
this.#sentenceStream.close();
|
|
50
|
+
};
|
|
51
|
+
|
|
52
|
+
const synthesize = async () => {
|
|
53
|
+
for await (const ev of this.#sentenceStream) {
|
|
54
|
+
for await (const audio of this.#tts.synthesize(ev.token)) {
|
|
55
|
+
this.queue.put(audio);
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
this.queue.put(SynthesizeStream.END_OF_STREAM);
|
|
59
|
+
};
|
|
60
|
+
|
|
61
|
+
Promise.all([forwardInput(), synthesize()]);
|
|
62
|
+
}
|
|
63
|
+
}
|
package/src/tts/tts.ts
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
import type { AudioFrame } from '@livekit/rtc-node';
|
|
5
|
-
import { AsyncIterableQueue } from '../utils.js';
|
|
5
|
+
import { AsyncIterableQueue, mergeFrames } from '../utils.js';
|
|
6
6
|
|
|
7
7
|
/** SynthesizedAudio is a packet of speech synthesis as returned by the TTS. */
|
|
8
8
|
export interface SynthesizedAudio {
|
|
@@ -60,6 +60,11 @@ export abstract class TTS {
|
|
|
60
60
|
return this.#numChannels;
|
|
61
61
|
}
|
|
62
62
|
|
|
63
|
+
/**
|
|
64
|
+
* Receives text and returns synthesis in the form of a {@link ChunkedStream}
|
|
65
|
+
*/
|
|
66
|
+
abstract synthesize(text: string): ChunkedStream;
|
|
67
|
+
|
|
63
68
|
/**
|
|
64
69
|
* Returns a {@link SynthesizeStream} that can be used to push text and receive audio data
|
|
65
70
|
*/
|
|
@@ -139,3 +144,45 @@ export abstract class SynthesizeStream
|
|
|
139
144
|
return this;
|
|
140
145
|
}
|
|
141
146
|
}
|
|
147
|
+
|
|
148
|
+
/**
|
|
149
|
+
* An instance of a text-to-speech response, as an asynchronous iterable iterator.
|
|
150
|
+
*
|
|
151
|
+
* @example Looping through frames
|
|
152
|
+
* ```ts
|
|
153
|
+
* for await (const event of stream) {
|
|
154
|
+
* await source.captureFrame(event.frame);
|
|
155
|
+
* }
|
|
156
|
+
* ```
|
|
157
|
+
*
|
|
158
|
+
* @remarks
|
|
159
|
+
* This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that
|
|
160
|
+
* exports its own child ChunkedStream class, which inherits this class's methods.
|
|
161
|
+
*/
|
|
162
|
+
export abstract class ChunkedStream implements AsyncIterableIterator<SynthesizedAudio> {
|
|
163
|
+
protected queue = new AsyncIterableQueue<SynthesizedAudio>();
|
|
164
|
+
protected closed = false;
|
|
165
|
+
|
|
166
|
+
/** Collect every frame into one in a single call */
|
|
167
|
+
async collect(): Promise<AudioFrame> {
|
|
168
|
+
const frames = [];
|
|
169
|
+
for await (const event of this) {
|
|
170
|
+
frames.push(event.frame);
|
|
171
|
+
}
|
|
172
|
+
return mergeFrames(frames);
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
next(): Promise<IteratorResult<SynthesizedAudio>> {
|
|
176
|
+
return this.queue.next();
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
/** Close both the input and output of the TTS stream */
|
|
180
|
+
close() {
|
|
181
|
+
this.queue.close();
|
|
182
|
+
this.closed = true;
|
|
183
|
+
}
|
|
184
|
+
|
|
185
|
+
[Symbol.asyncIterator](): ChunkedStream {
|
|
186
|
+
return this;
|
|
187
|
+
}
|
|
188
|
+
}
|