@livekit/agents 0.3.5 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188) hide show
  1. package/.turbo/turbo-build.log +1 -1
  2. package/CHANGELOG.md +36 -0
  3. package/dist/audio.js +17 -30
  4. package/dist/audio.js.map +1 -1
  5. package/dist/cli.js +3 -14
  6. package/dist/cli.js.map +1 -1
  7. package/dist/http_server.d.ts +1 -1
  8. package/dist/http_server.js +5 -9
  9. package/dist/http_server.js.map +1 -1
  10. package/dist/index.d.ts +3 -2
  11. package/dist/index.d.ts.map +1 -1
  12. package/dist/index.js +14 -2
  13. package/dist/index.js.map +1 -1
  14. package/dist/ipc/job_executor.js +3 -5
  15. package/dist/ipc/job_executor.js.map +1 -1
  16. package/dist/ipc/job_main.d.ts +1 -1
  17. package/dist/ipc/proc_job_executor.js +66 -80
  18. package/dist/ipc/proc_job_executor.js.map +1 -1
  19. package/dist/ipc/proc_pool.d.ts +3 -3
  20. package/dist/ipc/proc_pool.d.ts.map +1 -1
  21. package/dist/ipc/proc_pool.js +38 -20
  22. package/dist/ipc/proc_pool.js.map +1 -1
  23. package/dist/job.js +56 -73
  24. package/dist/job.js.map +1 -1
  25. package/dist/llm/chat_context.d.ts +66 -0
  26. package/dist/llm/chat_context.d.ts.map +1 -0
  27. package/dist/llm/chat_context.js +93 -0
  28. package/dist/llm/chat_context.js.map +1 -0
  29. package/dist/llm/function_context.d.ts +19 -1
  30. package/dist/llm/function_context.d.ts.map +1 -1
  31. package/dist/llm/function_context.js +54 -18
  32. package/dist/llm/function_context.js.map +1 -1
  33. package/dist/llm/function_context.test.d.ts +2 -0
  34. package/dist/llm/function_context.test.d.ts.map +1 -0
  35. package/dist/llm/function_context.test.js +218 -0
  36. package/dist/llm/function_context.test.js.map +1 -0
  37. package/dist/llm/index.d.ts +3 -2
  38. package/dist/llm/index.d.ts.map +1 -1
  39. package/dist/llm/index.js +3 -2
  40. package/dist/llm/index.js.map +1 -1
  41. package/dist/llm/llm.d.ts +53 -0
  42. package/dist/llm/llm.d.ts.map +1 -0
  43. package/dist/llm/llm.js +45 -0
  44. package/dist/llm/llm.js.map +1 -0
  45. package/dist/multimodal/agent_playout.d.ts +1 -1
  46. package/dist/multimodal/agent_playout.js +116 -153
  47. package/dist/multimodal/agent_playout.js.map +1 -1
  48. package/dist/multimodal/multimodal_agent.d.ts +4 -3
  49. package/dist/multimodal/multimodal_agent.d.ts.map +1 -1
  50. package/dist/multimodal/multimodal_agent.js +207 -234
  51. package/dist/multimodal/multimodal_agent.js.map +1 -1
  52. package/dist/pipeline/agent_output.d.ts +30 -0
  53. package/dist/pipeline/agent_output.d.ts.map +1 -0
  54. package/dist/pipeline/agent_output.js +155 -0
  55. package/dist/pipeline/agent_output.js.map +1 -0
  56. package/dist/pipeline/agent_playout.d.ts +38 -0
  57. package/dist/pipeline/agent_playout.d.ts.map +1 -0
  58. package/dist/pipeline/agent_playout.js +142 -0
  59. package/dist/pipeline/agent_playout.js.map +1 -0
  60. package/dist/pipeline/human_input.d.ts +28 -0
  61. package/dist/pipeline/human_input.d.ts.map +1 -0
  62. package/dist/pipeline/human_input.js +134 -0
  63. package/dist/pipeline/human_input.js.map +1 -0
  64. package/dist/pipeline/index.d.ts +2 -0
  65. package/dist/pipeline/index.d.ts.map +1 -0
  66. package/dist/pipeline/index.js +5 -0
  67. package/dist/pipeline/index.js.map +1 -0
  68. package/dist/pipeline/pipeline_agent.d.ts +134 -0
  69. package/dist/pipeline/pipeline_agent.d.ts.map +1 -0
  70. package/dist/pipeline/pipeline_agent.js +661 -0
  71. package/dist/pipeline/pipeline_agent.js.map +1 -0
  72. package/dist/pipeline/speech_handle.d.ts +27 -0
  73. package/dist/pipeline/speech_handle.d.ts.map +1 -0
  74. package/dist/pipeline/speech_handle.js +102 -0
  75. package/dist/pipeline/speech_handle.js.map +1 -0
  76. package/dist/plugin.js +7 -20
  77. package/dist/plugin.js.map +1 -1
  78. package/dist/stt/index.d.ts +1 -2
  79. package/dist/stt/index.d.ts.map +1 -1
  80. package/dist/stt/index.js +1 -2
  81. package/dist/stt/index.js.map +1 -1
  82. package/dist/stt/stt.d.ts +62 -24
  83. package/dist/stt/stt.d.ts.map +1 -1
  84. package/dist/stt/stt.js +77 -27
  85. package/dist/stt/stt.js.map +1 -1
  86. package/dist/tokenize/basic/basic.d.ts +16 -0
  87. package/dist/tokenize/basic/basic.d.ts.map +1 -0
  88. package/dist/tokenize/basic/basic.js +50 -0
  89. package/dist/tokenize/basic/basic.js.map +1 -0
  90. package/dist/tokenize/basic/hyphenator.d.ts +17 -0
  91. package/dist/tokenize/basic/hyphenator.d.ts.map +1 -0
  92. package/dist/tokenize/basic/hyphenator.js +420 -0
  93. package/dist/tokenize/basic/hyphenator.js.map +1 -0
  94. package/dist/tokenize/basic/index.d.ts +2 -0
  95. package/dist/tokenize/basic/index.d.ts.map +1 -0
  96. package/dist/tokenize/basic/index.js +5 -0
  97. package/dist/tokenize/basic/index.js.map +1 -0
  98. package/dist/tokenize/basic/paragraph.d.ts +5 -0
  99. package/dist/tokenize/basic/paragraph.d.ts.map +1 -0
  100. package/dist/tokenize/basic/paragraph.js +38 -0
  101. package/dist/tokenize/basic/paragraph.js.map +1 -0
  102. package/dist/tokenize/basic/sentence.d.ts +5 -0
  103. package/dist/tokenize/basic/sentence.d.ts.map +1 -0
  104. package/dist/tokenize/basic/sentence.js +60 -0
  105. package/dist/tokenize/basic/sentence.js.map +1 -0
  106. package/dist/tokenize/basic/word.d.ts +5 -0
  107. package/dist/tokenize/basic/word.d.ts.map +1 -0
  108. package/dist/tokenize/basic/word.js +23 -0
  109. package/dist/tokenize/basic/word.js.map +1 -0
  110. package/dist/tokenize/index.d.ts +5 -0
  111. package/dist/tokenize/index.d.ts.map +1 -0
  112. package/dist/tokenize/index.js +8 -0
  113. package/dist/tokenize/index.js.map +1 -0
  114. package/dist/tokenize/token_stream.d.ts +36 -0
  115. package/dist/tokenize/token_stream.d.ts.map +1 -0
  116. package/dist/tokenize/token_stream.js +136 -0
  117. package/dist/tokenize/token_stream.js.map +1 -0
  118. package/dist/tokenize/tokenizer.d.ts +55 -0
  119. package/dist/tokenize/tokenizer.d.ts.map +1 -0
  120. package/dist/tokenize/tokenizer.js +117 -0
  121. package/dist/tokenize/tokenizer.js.map +1 -0
  122. package/dist/transcription.js +78 -89
  123. package/dist/transcription.js.map +1 -1
  124. package/dist/tts/index.d.ts +1 -3
  125. package/dist/tts/index.d.ts.map +1 -1
  126. package/dist/tts/index.js +1 -3
  127. package/dist/tts/index.js.map +1 -1
  128. package/dist/tts/tts.d.ts +66 -37
  129. package/dist/tts/tts.d.ts.map +1 -1
  130. package/dist/tts/tts.js +79 -74
  131. package/dist/tts/tts.js.map +1 -1
  132. package/dist/utils.d.ts +21 -6
  133. package/dist/utils.d.ts.map +1 -1
  134. package/dist/utils.js +120 -76
  135. package/dist/utils.js.map +1 -1
  136. package/dist/vad.d.ts +43 -39
  137. package/dist/vad.d.ts.map +1 -1
  138. package/dist/vad.js +51 -4
  139. package/dist/vad.js.map +1 -1
  140. package/dist/worker.d.ts +1 -1
  141. package/dist/worker.js +257 -247
  142. package/dist/worker.js.map +1 -1
  143. package/package.json +4 -3
  144. package/src/index.ts +16 -2
  145. package/src/ipc/proc_pool.ts +25 -13
  146. package/src/llm/chat_context.ts +147 -0
  147. package/src/llm/function_context.test.ts +248 -0
  148. package/src/llm/function_context.ts +77 -18
  149. package/src/llm/index.ts +21 -2
  150. package/src/llm/llm.ts +102 -0
  151. package/src/multimodal/multimodal_agent.ts +6 -2
  152. package/src/pipeline/agent_output.ts +185 -0
  153. package/src/pipeline/agent_playout.ts +187 -0
  154. package/src/pipeline/human_input.ts +166 -0
  155. package/src/pipeline/index.ts +15 -0
  156. package/src/pipeline/pipeline_agent.ts +917 -0
  157. package/src/pipeline/speech_handle.ts +136 -0
  158. package/src/stt/index.ts +8 -2
  159. package/src/stt/stt.ts +98 -31
  160. package/src/tokenize/basic/basic.ts +73 -0
  161. package/src/tokenize/basic/hyphenator.ts +436 -0
  162. package/src/tokenize/basic/index.ts +5 -0
  163. package/src/tokenize/basic/paragraph.ts +43 -0
  164. package/src/tokenize/basic/sentence.ts +69 -0
  165. package/src/tokenize/basic/word.ts +27 -0
  166. package/src/tokenize/index.ts +16 -0
  167. package/src/tokenize/token_stream.ts +163 -0
  168. package/src/tokenize/tokenizer.ts +152 -0
  169. package/src/tts/index.ts +1 -20
  170. package/src/tts/tts.ts +110 -57
  171. package/src/utils.ts +95 -25
  172. package/src/vad.ts +86 -45
  173. package/tsconfig.tsbuildinfo +1 -1
  174. package/dist/stt/stream_adapter.d.ts +0 -19
  175. package/dist/stt/stream_adapter.d.ts.map +0 -1
  176. package/dist/stt/stream_adapter.js +0 -96
  177. package/dist/stt/stream_adapter.js.map +0 -1
  178. package/dist/tokenize.d.ts +0 -15
  179. package/dist/tokenize.d.ts.map +0 -1
  180. package/dist/tokenize.js +0 -12
  181. package/dist/tokenize.js.map +0 -1
  182. package/dist/tts/stream_adapter.d.ts +0 -19
  183. package/dist/tts/stream_adapter.d.ts.map +0 -1
  184. package/dist/tts/stream_adapter.js +0 -111
  185. package/dist/tts/stream_adapter.js.map +0 -1
  186. package/src/stt/stream_adapter.ts +0 -104
  187. package/src/tokenize.ts +0 -22
  188. package/src/tts/stream_adapter.ts +0 -93
@@ -0,0 +1,163 @@
1
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import { randomUUID } from 'node:crypto';
5
+ import { AsyncIterableQueue } from '../utils.js';
6
+ import type { TokenData } from './tokenizer.js';
7
+ import { SentenceStream, WordStream } from './tokenizer.js';
8
+
9
+ type TokenizeFunc = (x: string) => string[] | [string, number, number][];
10
+
11
/**
 * Buffers pushed text, runs the supplied tokenize function over it and queues the resulting
 * tokens for asynchronous consumption.
 *
 * @remarks
 * Tokens are merged (joined with a single space) until the merged text is at least
 * `minTokenLength` characters long, and tokenization only starts once at least
 * `minContextLength` characters are buffered. Every emitted token carries a segment id, which
 * is renewed by {@link BufferedTokenStream.flush} whenever there was buffered text.
 */
export class BufferedTokenStream implements AsyncIterableIterator<TokenData> {
  protected queue = new AsyncIterableQueue<TokenData>();
  protected closed = false;

  #func: TokenizeFunc;
  #minTokenLength: number;
  #minContextLength: number;
  #inBuf = '';
  #outBuf = '';
  #currentSegmentId: string;

  /**
   * @param func - tokenizer returning either plain strings or `[text, start, end]` tuples
   * @param minTokenLength - minimum length of an emitted (possibly merged) token
   * @param minContextLength - minimum amount of buffered text before tokenizing starts
   */
  constructor(func: TokenizeFunc, minTokenLength: number, minContextLength: number) {
    this.#func = func;
    this.#minTokenLength = minTokenLength;
    this.#minContextLength = minContextLength;

    this.#currentSegmentId = randomUUID();
  }

  /**
   * Push a string of text into the token stream
   *
   * @throws Error if the stream has been closed
   */
  pushText(text: string) {
    if (this.closed) {
      throw new Error('Stream is closed');
    }

    this.#inBuf += text;
    if (this.#inBuf.length < this.#minContextLength) return;

    while (true) {
      const tokens = this.#func(this.#inBuf);
      // The last token may still be extended by text that has not arrived yet, so it stays in
      // the input buffer until the next push or flush. Stopping at one remaining token also
      // prevents spinning forever on tokenizers that return `['']` for an empty buffer.
      if (tokens.length <= 1) break;

      const tok: string | [string, number, number] = tokens[0]!;
      const tokText = typeof tok === 'string' ? tok : tok[0];

      if (this.#outBuf) this.#outBuf += ' ';
      this.#outBuf += tokText;
      if (this.#outBuf.length >= this.#minTokenLength) {
        this.queue.put({ token: this.#outBuf, segmentId: this.#currentSegmentId });
        this.#outBuf = '';
      }

      if (typeof tok === 'string') {
        // plain-string tokens carry no offsets: drop everything up to the end of the token
        this.#inBuf = this.#inBuf
          .slice(Math.max(0, this.#inBuf.indexOf(tok)) + tok.length)
          .trimStart();
      } else {
        // tuple tokens carry their end offset in the third element
        this.#inBuf = this.#inBuf.slice(tok[2]);
      }
    }
  }

  /**
   * Flush the stream, causing it to process all pending text
   *
   * @throws Error if the stream has been closed
   */
  flush() {
    if (this.closed) {
      throw new Error('Stream is closed');
    }

    if (this.#inBuf || this.#outBuf) {
      const tokens = this.#func(this.#inBuf) as (string | [string, number, number])[];
      const pending = tokens.map((tok) => (typeof tok === 'string' ? tok : tok[0])).join(' ');
      // only add the separator when there is something to append, so an empty input buffer
      // does not leave a trailing space on the emitted token
      if (pending) {
        if (this.#outBuf) this.#outBuf += ' ';
        this.#outBuf += pending;
      }

      if (this.#outBuf) {
        this.queue.put({ token: this.#outBuf, segmentId: this.#currentSegmentId });
      }

      // text pushed after a flush belongs to a new segment
      this.#currentSegmentId = randomUUID();
    }

    this.#inBuf = '';
    this.#outBuf = '';
  }

  /**
   * Mark the input as ended and forbid additional pushes
   *
   * @remarks
   * Pending text is flushed first, then the output queue is closed so that consumers iterating
   * over the stream finish once the queued tokens have been read.
   *
   * @throws Error if the stream has been closed
   */
  endInput() {
    if (this.closed) {
      throw new Error('Stream is closed');
    }
    this.flush();
    this.close();
  }

  /** Resolves with the next queued token */
  next(): Promise<IteratorResult<TokenData>> {
    return this.queue.next();
  }

  /** Close both the input and output of the token stream */
  close() {
    this.queue.close();
    this.closed = true;
  }

  [Symbol.asyncIterator](): BufferedTokenStream {
    return this;
  }
}
120
+
121
/**
 * {@link SentenceStream} backed by a {@link BufferedTokenStream}.
 *
 * @remarks
 * Every operation is forwarded to the inner buffered stream; the `input` and `queue` queues
 * inherited from the base class are not used for tokenization.
 */
export class BufferedSentenceStream extends SentenceStream {
  #stream: BufferedTokenStream;

  /**
   * @param func - tokenizer handed to the inner {@link BufferedTokenStream}
   * @param minTokenLength - forwarded to the inner stream
   * @param minContextLength - forwarded to the inner stream
   */
  constructor(func: TokenizeFunc, minTokenLength: number, minContextLength: number) {
    super();
    this.#stream = new BufferedTokenStream(func, minTokenLength, minContextLength);
  }

  /** Push a string of text to the inner buffered stream */
  pushText(text: string) {
    this.#stream.pushText(text);
  }

  /**
   * Flush the inner buffered stream.
   *
   * Without this override the base implementation would only enqueue a sentinel on the unused
   * base-class input queue and the buffered text would never be emitted.
   */
  flush() {
    this.#stream.flush();
  }

  /** Mark the input of the inner buffered stream as ended */
  endInput() {
    this.#stream.endInput();
  }

  /** Close the base-class queues and the inner buffered stream */
  close() {
    super.close();
    this.#stream.close();
  }

  /** Resolves with the next token produced by the inner buffered stream */
  next(): Promise<IteratorResult<TokenData>> {
    return this.#stream.next();
  }
}
142
+
143
/**
 * {@link WordStream} backed by a {@link BufferedTokenStream}.
 *
 * @remarks
 * Every operation is forwarded to the inner buffered stream; the `input` and `queue` queues
 * inherited from the base class are not used for tokenization.
 */
export class BufferedWordStream extends WordStream {
  #stream: BufferedTokenStream;

  /**
   * @param func - tokenizer handed to the inner {@link BufferedTokenStream}
   * @param minTokenLength - forwarded to the inner stream
   * @param minContextLength - forwarded to the inner stream
   */
  constructor(func: TokenizeFunc, minTokenLength: number, minContextLength: number) {
    super();
    this.#stream = new BufferedTokenStream(func, minTokenLength, minContextLength);
  }

  /** Push a string of text to the inner buffered stream */
  pushText(text: string) {
    this.#stream.pushText(text);
  }

  /**
   * Flush the inner buffered stream.
   *
   * Without this override the base implementation would only enqueue a sentinel on the unused
   * base-class input queue and the buffered text would never be emitted.
   */
  flush() {
    this.#stream.flush();
  }

  /** Mark the input of the inner buffered stream as ended */
  endInput() {
    this.#stream.endInput();
  }

  /** Close the base-class queues and the inner buffered stream */
  close() {
    super.close();
    this.#stream.close();
  }

  /** Resolves with the next token produced by the inner buffered stream */
  next(): Promise<IteratorResult<TokenData>> {
    return this.#stream.next();
  }
}
@@ -0,0 +1,152 @@
1
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import { AsyncIterableQueue } from '../utils.js';
5
+
6
+ // prettier-ignore
7
+ export const PUNCTUATIONS = [
8
+ '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=',
9
+ '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '±', '—', '‘', '’', '“', '”',
10
+ '…',
11
+ ]
12
+
13
+ export interface TokenData {
14
+ segmentId: string;
15
+ token: string;
16
+ }
17
+
18
+ export abstract class SentenceTokenizer {
19
+ abstract tokenize(text: string, language?: string): string[];
20
+
21
+ /**
22
+ * Returns a {@link SentenceStream} that can be used to push strings and receive smaller segments.
23
+ */
24
+ abstract stream(): SentenceStream;
25
+ }
26
+
27
/**
 * Base class of streaming sentence tokenizers.
 *
 * @remarks
 * Text and flush markers are written to the `input` queue, while tokens are read from the
 * `queue` queue. Subclasses are expected to connect the two.
 */
export abstract class SentenceStream {
  protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');
  protected input = new AsyncIterableQueue<string | typeof SentenceStream.FLUSH_SENTINEL>();
  protected queue = new AsyncIterableQueue<TokenData>();
  #closed = false;

  /** Whether {@link SentenceStream.close} has been called on this stream */
  get closed(): boolean {
    return this.#closed;
  }

  /**
   * Shared guard for every writing operation. The input check runs first so that its message
   * takes precedence when both the input and the stream are closed.
   */
  #ensureWritable(): void {
    if (this.input.closed) {
      throw new Error('Input is closed');
    }
    if (this.#closed) {
      throw new Error('Stream is closed');
    }
  }

  /** Push a string of text to the tokenizer */
  pushText(text: string) {
    this.#ensureWritable();
    this.input.put(text);
  }

  /** Flush the tokenizer, causing it to process all pending text */
  flush() {
    this.#ensureWritable();
    this.input.put(SentenceStream.FLUSH_SENTINEL);
  }

  /** Mark the input as ended and forbid additional pushes */
  endInput() {
    this.#ensureWritable();
    this.input.close();
  }

  /** Resolves with the next token taken from the output queue */
  next(): Promise<IteratorResult<TokenData>> {
    return this.queue.next();
  }

  /** Close both the input and output of the tokenizer stream */
  close() {
    this.input.close();
    this.queue.close();
    this.#closed = true;
  }

  [Symbol.asyncIterator](): SentenceStream {
    return this;
  }
}
85
+
86
/**
 * Contract implemented by word tokenizers: a one-shot `tokenize` call plus a streaming variant
 * obtained through `stream`.
 */
export abstract class WordTokenizer {
  /**
   * Returns a {@link WordStream} that can be used to push words and receive smaller segments.
   */
  public abstract stream(): WordStream;

  /**
   * Tokenizes a complete string in a single call.
   *
   * @param text - text to tokenize
   * @param language - optional language hint; how it is used is up to the implementation
   * @returns the tokens as plain strings
   */
  public abstract tokenize(text: string, language?: string): string[];
}
94
+
95
/**
 * Base class of streaming word tokenizers.
 *
 * @remarks
 * Text and flush markers are written to the `input` queue, while tokens are read from the
 * `queue` queue. Subclasses are expected to connect the two.
 */
export abstract class WordStream {
  protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');
  protected input = new AsyncIterableQueue<string | typeof WordStream.FLUSH_SENTINEL>();
  protected queue = new AsyncIterableQueue<TokenData>();
  #closed = false;

  /** Whether {@link WordStream.close} has been called on this stream */
  get closed(): boolean {
    return this.#closed;
  }

  /**
   * Shared guard for every writing operation. The input check runs first so that its message
   * takes precedence when both the input and the stream are closed.
   */
  #ensureWritable(): void {
    if (this.input.closed) {
      throw new Error('Input is closed');
    }
    if (this.#closed) {
      throw new Error('Stream is closed');
    }
  }

  /** Push a string of text to the tokenizer */
  pushText(text: string) {
    this.#ensureWritable();
    this.input.put(text);
  }

  /** Flush the tokenizer, causing it to process all pending text */
  flush() {
    this.#ensureWritable();
    this.input.put(WordStream.FLUSH_SENTINEL);
  }

  /** Mark the input as ended and forbid additional pushes */
  endInput() {
    this.#ensureWritable();
    this.input.close();
  }

  /** Resolves with the next token taken from the output queue */
  next(): Promise<IteratorResult<TokenData>> {
    return this.queue.next();
  }

  /** Close both the input and output of the tokenizer stream */
  close() {
    this.input.close();
    this.queue.close();
    this.#closed = true;
  }

  [Symbol.asyncIterator](): WordStream {
    return this;
  }
}
package/src/tts/index.ts CHANGED
@@ -1,23 +1,4 @@
1
1
  // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
- import { StreamAdapter, StreamAdapterWrapper } from './stream_adapter.js';
5
- import {
6
- ChunkedStream,
7
- SynthesisEvent,
8
- SynthesisEventType,
9
- SynthesizeStream,
10
- type SynthesizedAudio,
11
- TTS,
12
- } from './tts.js';
13
-
14
- export {
15
- TTS,
16
- SynthesisEvent,
17
- SynthesisEventType,
18
- SynthesizedAudio,
19
- SynthesizeStream,
20
- StreamAdapter,
21
- StreamAdapterWrapper,
22
- ChunkedStream,
23
- };
4
+ export { type SynthesizedAudio, type TTSCapabilities, TTS, SynthesizeStream } from './tts.js';
package/src/tts/tts.ts CHANGED
@@ -2,87 +2,140 @@
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
  import type { AudioFrame } from '@livekit/rtc-node';
5
- import { mergeFrames } from '../utils.js';
5
+ import { AsyncIterableQueue } from '../utils.js';
6
6
 
7
+ /** SynthesizedAudio is a packet of speech synthesis as returned by the TTS. */
7
8
  export interface SynthesizedAudio {
8
- text: string;
9
- data: AudioFrame;
9
+ /** Request ID (one segment could be made up of multiple requests) */
10
+ requestId: string;
11
+ /** Segment ID, each segment is separated by a flush */
12
+ segmentId: string;
13
+ /** Synthesized audio frame */
14
+ frame: AudioFrame;
15
+ /** Current segment of the synthesized audio */
16
+ deltaText?: string;
10
17
  }
11
18
 
12
- export enum SynthesisEventType {
13
- /**
14
- * Indicate the start of synthesis.
15
- * Retriggered after FINISHED.
16
- */
17
- STARTED = 0,
18
- /**
19
- * Indicate that audio data is available.
20
- */
21
- AUDIO = 1,
22
- /**
23
- * Indicate the end of synthesis. Does not necessarily mean stream is done.
24
- */
25
- FINISHED = 2,
19
+ /**
20
+ * Describes the capabilities of the TTS provider.
21
+ *
22
+ * @remarks
23
+ * At present, only `streaming` is supplied to this interface, and the framework only supports
24
+ * providers that do have a streaming endpoint.
25
+ */
26
+ export interface TTSCapabilities {
27
+ streaming: boolean;
26
28
  }
27
29
 
28
- export class SynthesisEvent {
29
- type: SynthesisEventType;
30
- audio?: SynthesizedAudio;
31
-
32
- constructor(type: SynthesisEventType, audio: SynthesizedAudio | undefined = undefined) {
33
- this.type = type;
34
- this.audio = audio;
30
+ /**
31
+ * An instance of a text-to-speech adapter.
32
+ *
33
+ * @remarks
34
+ * This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that
35
+ * exports its own child TTS class, which inherits this class's methods.
36
+ */
37
+ export abstract class TTS {
38
+ #capabilities: TTSCapabilities;
39
+ #sampleRate: number;
40
+ #numChannels: number;
41
+
42
+ constructor(sampleRate: number, numChannels: number, capabilities: TTSCapabilities) {
43
+ this.#capabilities = capabilities;
44
+ this.#sampleRate = sampleRate;
45
+ this.#numChannels = numChannels;
35
46
  }
36
- }
37
-
38
- export abstract class SynthesizeStream implements IterableIterator<SynthesisEvent> {
39
- abstract pushText(token?: string): void;
40
47
 
41
- markSegmentEnd() {
42
- this.pushText(undefined);
48
+ /** Returns this TTS's capabilities */
49
+ get capabilities(): TTSCapabilities {
50
+ return this.#capabilities;
43
51
  }
44
52
 
45
- abstract close(wait: boolean): Promise<void>;
46
- abstract next(): IteratorResult<SynthesisEvent>;
47
-
48
- [Symbol.iterator](): SynthesizeStream {
49
- return this;
53
+ /** Returns the sample rate of audio frames returned by this TTS */
54
+ get sampleRate(): number {
55
+ return this.#sampleRate;
50
56
  }
51
- }
52
-
53
- export abstract class TTS {
54
- #streamingSupported: boolean;
55
57
 
56
- constructor(streamingSupported: boolean) {
57
- this.#streamingSupported = streamingSupported;
58
+ /** Returns the channel count of audio frames returned by this TTS */
59
+ get numChannels(): number {
60
+ return this.#numChannels;
58
61
  }
59
62
 
60
- abstract synthesize(text: string): Promise<ChunkedStream>;
61
-
63
+ /**
64
+ * Returns a {@link SynthesizeStream} that can be used to push text and receive audio data
65
+ */
62
66
  abstract stream(): SynthesizeStream;
67
+ }
63
68
 
64
- get streamingSupported(): boolean {
65
- return this.#streamingSupported;
69
+ /**
70
+ * An instance of a text-to-speech stream, as an asynchronous iterable iterator.
71
+ *
72
+ * @example Looping through frames
73
+ * ```ts
74
+ * for await (const event of stream) {
75
+ * await source.captureFrame(event.frame);
76
+ * }
77
+ * ```
78
+ *
79
+ * @remarks
80
+ * This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that
81
+ * exports its own child SynthesizeStream class, which inherits this class's methods.
82
+ */
83
+ export abstract class SynthesizeStream
84
+ implements AsyncIterableIterator<SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM>
85
+ {
86
+ protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');
87
+ static readonly END_OF_STREAM = Symbol('END_OF_STREAM');
88
+ protected input = new AsyncIterableQueue<string | typeof SynthesizeStream.FLUSH_SENTINEL>();
89
+ protected queue = new AsyncIterableQueue<
90
+ SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM
91
+ >();
92
+ protected closed = false;
93
+
94
+ /** Push a string of text to the TTS */
95
+ pushText(text: string) {
96
+ if (this.input.closed) {
97
+ throw new Error('Input is closed');
98
+ }
99
+ if (this.closed) {
100
+ throw new Error('Stream is closed');
101
+ }
102
+ this.input.put(text);
66
103
  }
67
- }
68
104
 
69
- export abstract class ChunkedStream implements AsyncIterableIterator<SynthesizedAudio> {
70
- async collect(): Promise<AudioFrame> {
71
- const frames = [];
72
- for await (const ev of this) {
73
- frames.push(ev.data);
105
+ /** Flush the TTS, causing it to process all pending text */
106
+ flush() {
107
+ if (this.input.closed) {
108
+ throw new Error('Input is closed');
74
109
  }
75
- return mergeFrames(frames);
110
+ if (this.closed) {
111
+ throw new Error('Stream is closed');
112
+ }
113
+ this.input.put(SynthesizeStream.FLUSH_SENTINEL);
76
114
  }
77
115
 
78
- abstract close(): Promise<void>;
79
- abstract next(): Promise<IteratorResult<SynthesizedAudio>>;
116
+ /** Mark the input as ended and forbid additional pushes */
117
+ endInput() {
118
+ if (this.input.closed) {
119
+ throw new Error('Input is closed');
120
+ }
121
+ if (this.closed) {
122
+ throw new Error('Stream is closed');
123
+ }
124
+ this.input.close();
125
+ }
80
126
 
81
- [Symbol.iterator](): ChunkedStream {
82
- return this;
127
+ next(): Promise<IteratorResult<SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM>> {
128
+ return this.queue.next();
129
+ }
130
+
131
+ /** Close both the input and output of the TTS stream */
132
+ close() {
133
+ this.input.close();
134
+ this.queue.close();
135
+ this.closed = true;
83
136
  }
84
137
 
85
- [Symbol.asyncIterator](): ChunkedStream {
138
+ [Symbol.asyncIterator](): SynthesizeStream {
86
139
  return this;
87
140
  }
88
141
  }
package/src/utils.ts CHANGED
@@ -225,39 +225,109 @@ export async function gracefullyCancel<T>(promise: CancellablePromise<T>): Promi
225
225
  }
226
226
 
227
227
  /** @internal */
228
- export class AsyncIterableQueue<T> implements AsyncIterable<T> {
229
- private queue: Queue<T | typeof AsyncIterableQueue.QUEUE_END_MARKER>;
230
- private closed = false;
231
- private static readonly QUEUE_END_MARKER = Symbol('QUEUE_END_MARKER');
228
+ export class AsyncIterableQueue<T> implements AsyncIterableIterator<T> {
229
+ private static readonly CLOSE_SENTINEL = Symbol('CLOSE_SENTINEL');
230
+ #queue = new Queue<T | typeof AsyncIterableQueue.CLOSE_SENTINEL>();
231
+ #closed = false;
232
232
 
233
- constructor() {
234
- this.queue = new Queue<T | typeof AsyncIterableQueue.QUEUE_END_MARKER>();
233
+ get closed(): boolean {
234
+ return this.#closed;
235
235
  }
236
236
 
237
237
  put(item: T): void {
238
- if (this.closed) {
238
+ if (this.#closed) {
239
239
  throw new Error('Queue is closed');
240
240
  }
241
- this.queue.put(item);
241
+ this.#queue.put(item);
242
242
  }
243
243
 
244
244
  close(): void {
245
- this.closed = true;
246
- this.queue.put(AsyncIterableQueue.QUEUE_END_MARKER);
247
- }
248
-
249
- [Symbol.asyncIterator](): AsyncIterator<T> {
250
- return {
251
- next: async (): Promise<IteratorResult<T>> => {
252
- if (this.closed && this.queue.items.length === 0) {
253
- return { value: undefined, done: true };
254
- }
255
- const item = await this.queue.get();
256
- if (item === AsyncIterableQueue.QUEUE_END_MARKER && this.closed) {
257
- return { value: undefined, done: true };
258
- }
259
- return { value: item as T, done: false };
260
- },
261
- };
245
+ this.#closed = true;
246
+ this.#queue.put(AsyncIterableQueue.CLOSE_SENTINEL);
247
+ }
248
+
249
+ async next(): Promise<IteratorResult<T>> {
250
+ if (this.#closed && this.#queue.items.length === 0) {
251
+ return { value: undefined, done: true };
252
+ }
253
+ const item = await this.#queue.get();
254
+ if (item === AsyncIterableQueue.CLOSE_SENTINEL && this.#closed) {
255
+ return { value: undefined, done: true };
256
+ }
257
+ return { value: item as T, done: false };
258
+ }
259
+
260
+ [Symbol.asyncIterator](): AsyncIterableQueue<T> {
261
+ return this;
262
+ }
263
+ }
264
+
265
+ /** @internal */
266
/**
 * Exponential smoothing filter with an optional upper bound.
 *
 * @remarks
 * The first sample after construction or {@link ExpFilter.reset} is taken as-is. Every later
 * sample is blended as `a * previous + (1 - a) * sample` with `a = alpha ** exp`.
 */
export class ExpFilter {
  #alpha: number;
  #max?: number;
  #filtered?: number = undefined;

  /**
   * @param alpha - smoothing base; the effective weight of the previous value is `alpha ** exp`
   * @param max - optional upper bound applied to the filtered value
   */
  constructor(alpha: number, max?: number) {
    this.#alpha = alpha;
    this.#max = max;
  }

  /**
   * Forgets the filtered value, optionally replacing alpha.
   *
   * @param alpha - new smoothing base; omitted means "keep the current one"
   */
  reset(alpha?: number) {
    // compare against undefined so that an explicit 0 is honoured
    if (alpha !== undefined) {
      this.#alpha = alpha;
    }
    this.#filtered = undefined;
  }

  /**
   * Feeds one sample into the filter.
   *
   * @param exp - exponent applied to alpha for this sample
   * @param sample - new observation
   * @returns the updated filtered value, clamped to `max` when one was given
   */
  apply(exp: number, sample: number): number {
    // `undefined` marks "no sample yet"; a filtered value of 0 is a real state and must be
    // smoothed like any other value
    if (this.#filtered === undefined) {
      this.#filtered = sample;
    } else {
      const a = this.#alpha ** exp;
      this.#filtered = a * this.#filtered + (1 - a) * sample;
    }

    // an upper bound of 0 is a valid bound, so test for presence rather than truthiness
    if (this.#max !== undefined && this.#filtered > this.#max) {
      this.#filtered = this.#max;
    }

    return this.#filtered;
  }

  /** Current filtered value, or `undefined` when no sample has been applied since the last reset */
  get filtered(): number | undefined {
    return this.#filtered;
  }

  /** Replaces the smoothing base without touching the filtered value */
  set alpha(alpha: number) {
    this.#alpha = alpha;
  }
}
306
+
307
+ /** @internal */
308
/**
 * Energy gate for audio frames: reports `true` while frames are loud, and keeps reporting
 * `true` for a cooldown period after the last loud frame.
 */
export class AudioEnergyFilter {
  #cooldownSeconds: number;
  #cooldown: number;

  /**
   * @param cooldownSeconds - how long quiet audio is still reported as active after the last
   * loud frame
   */
  constructor(cooldownSeconds = 1) {
    this.#cooldownSeconds = cooldownSeconds;
    this.#cooldown = cooldownSeconds;
  }

  /**
   * Feeds one frame into the filter.
   *
   * @param frame - audio frame; samples are scaled by 1/32768 (presumably 16-bit PCM, confirm
   * against the AudioFrame definition)
   * @returns `true` when the frame's RMS exceeds 0.004 or the cooldown has not elapsed yet
   */
  pushFrame(frame: AudioFrame): boolean {
    const arr = Float32Array.from(frame.data, (x) => x / 32768);
    // Seed the reduction with 0 and guard the division: an empty frame is treated as silence
    // instead of throwing "Reduce of empty array with no initial value".
    const sumSquares = arr.reduce((acc, x) => acc + x ** 2, 0);
    const rms = arr.length > 0 ? (sumSquares / arr.length) ** 0.5 : 0;
    if (rms > 0.004) {
      // loud frame: restart the cooldown window
      this.#cooldown = this.#cooldownSeconds;
      return true;
    }

    // quiet frame: consume the cooldown by the frame's duration
    const durationSeconds = frame.samplesPerChannel / frame.sampleRate;
    this.#cooldown -= durationSeconds;
    if (this.#cooldown > 0) {
      return true;
    }

    return false;
  }
}