@livekit/agents 0.3.5 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188) hide show
  1. package/.turbo/turbo-build.log +1 -1
  2. package/CHANGELOG.md +36 -0
  3. package/dist/audio.js +17 -30
  4. package/dist/audio.js.map +1 -1
  5. package/dist/cli.js +3 -14
  6. package/dist/cli.js.map +1 -1
  7. package/dist/http_server.d.ts +1 -1
  8. package/dist/http_server.js +5 -9
  9. package/dist/http_server.js.map +1 -1
  10. package/dist/index.d.ts +3 -2
  11. package/dist/index.d.ts.map +1 -1
  12. package/dist/index.js +14 -2
  13. package/dist/index.js.map +1 -1
  14. package/dist/ipc/job_executor.js +3 -5
  15. package/dist/ipc/job_executor.js.map +1 -1
  16. package/dist/ipc/job_main.d.ts +1 -1
  17. package/dist/ipc/proc_job_executor.js +66 -80
  18. package/dist/ipc/proc_job_executor.js.map +1 -1
  19. package/dist/ipc/proc_pool.d.ts +3 -3
  20. package/dist/ipc/proc_pool.d.ts.map +1 -1
  21. package/dist/ipc/proc_pool.js +38 -20
  22. package/dist/ipc/proc_pool.js.map +1 -1
  23. package/dist/job.js +56 -73
  24. package/dist/job.js.map +1 -1
  25. package/dist/llm/chat_context.d.ts +66 -0
  26. package/dist/llm/chat_context.d.ts.map +1 -0
  27. package/dist/llm/chat_context.js +93 -0
  28. package/dist/llm/chat_context.js.map +1 -0
  29. package/dist/llm/function_context.d.ts +19 -1
  30. package/dist/llm/function_context.d.ts.map +1 -1
  31. package/dist/llm/function_context.js +54 -18
  32. package/dist/llm/function_context.js.map +1 -1
  33. package/dist/llm/function_context.test.d.ts +2 -0
  34. package/dist/llm/function_context.test.d.ts.map +1 -0
  35. package/dist/llm/function_context.test.js +218 -0
  36. package/dist/llm/function_context.test.js.map +1 -0
  37. package/dist/llm/index.d.ts +3 -2
  38. package/dist/llm/index.d.ts.map +1 -1
  39. package/dist/llm/index.js +3 -2
  40. package/dist/llm/index.js.map +1 -1
  41. package/dist/llm/llm.d.ts +53 -0
  42. package/dist/llm/llm.d.ts.map +1 -0
  43. package/dist/llm/llm.js +45 -0
  44. package/dist/llm/llm.js.map +1 -0
  45. package/dist/multimodal/agent_playout.d.ts +1 -1
  46. package/dist/multimodal/agent_playout.js +116 -153
  47. package/dist/multimodal/agent_playout.js.map +1 -1
  48. package/dist/multimodal/multimodal_agent.d.ts +4 -3
  49. package/dist/multimodal/multimodal_agent.d.ts.map +1 -1
  50. package/dist/multimodal/multimodal_agent.js +207 -234
  51. package/dist/multimodal/multimodal_agent.js.map +1 -1
  52. package/dist/pipeline/agent_output.d.ts +30 -0
  53. package/dist/pipeline/agent_output.d.ts.map +1 -0
  54. package/dist/pipeline/agent_output.js +155 -0
  55. package/dist/pipeline/agent_output.js.map +1 -0
  56. package/dist/pipeline/agent_playout.d.ts +38 -0
  57. package/dist/pipeline/agent_playout.d.ts.map +1 -0
  58. package/dist/pipeline/agent_playout.js +142 -0
  59. package/dist/pipeline/agent_playout.js.map +1 -0
  60. package/dist/pipeline/human_input.d.ts +28 -0
  61. package/dist/pipeline/human_input.d.ts.map +1 -0
  62. package/dist/pipeline/human_input.js +134 -0
  63. package/dist/pipeline/human_input.js.map +1 -0
  64. package/dist/pipeline/index.d.ts +2 -0
  65. package/dist/pipeline/index.d.ts.map +1 -0
  66. package/dist/pipeline/index.js +5 -0
  67. package/dist/pipeline/index.js.map +1 -0
  68. package/dist/pipeline/pipeline_agent.d.ts +134 -0
  69. package/dist/pipeline/pipeline_agent.d.ts.map +1 -0
  70. package/dist/pipeline/pipeline_agent.js +661 -0
  71. package/dist/pipeline/pipeline_agent.js.map +1 -0
  72. package/dist/pipeline/speech_handle.d.ts +27 -0
  73. package/dist/pipeline/speech_handle.d.ts.map +1 -0
  74. package/dist/pipeline/speech_handle.js +102 -0
  75. package/dist/pipeline/speech_handle.js.map +1 -0
  76. package/dist/plugin.js +7 -20
  77. package/dist/plugin.js.map +1 -1
  78. package/dist/stt/index.d.ts +1 -2
  79. package/dist/stt/index.d.ts.map +1 -1
  80. package/dist/stt/index.js +1 -2
  81. package/dist/stt/index.js.map +1 -1
  82. package/dist/stt/stt.d.ts +62 -24
  83. package/dist/stt/stt.d.ts.map +1 -1
  84. package/dist/stt/stt.js +77 -27
  85. package/dist/stt/stt.js.map +1 -1
  86. package/dist/tokenize/basic/basic.d.ts +16 -0
  87. package/dist/tokenize/basic/basic.d.ts.map +1 -0
  88. package/dist/tokenize/basic/basic.js +50 -0
  89. package/dist/tokenize/basic/basic.js.map +1 -0
  90. package/dist/tokenize/basic/hyphenator.d.ts +17 -0
  91. package/dist/tokenize/basic/hyphenator.d.ts.map +1 -0
  92. package/dist/tokenize/basic/hyphenator.js +420 -0
  93. package/dist/tokenize/basic/hyphenator.js.map +1 -0
  94. package/dist/tokenize/basic/index.d.ts +2 -0
  95. package/dist/tokenize/basic/index.d.ts.map +1 -0
  96. package/dist/tokenize/basic/index.js +5 -0
  97. package/dist/tokenize/basic/index.js.map +1 -0
  98. package/dist/tokenize/basic/paragraph.d.ts +5 -0
  99. package/dist/tokenize/basic/paragraph.d.ts.map +1 -0
  100. package/dist/tokenize/basic/paragraph.js +38 -0
  101. package/dist/tokenize/basic/paragraph.js.map +1 -0
  102. package/dist/tokenize/basic/sentence.d.ts +5 -0
  103. package/dist/tokenize/basic/sentence.d.ts.map +1 -0
  104. package/dist/tokenize/basic/sentence.js +60 -0
  105. package/dist/tokenize/basic/sentence.js.map +1 -0
  106. package/dist/tokenize/basic/word.d.ts +5 -0
  107. package/dist/tokenize/basic/word.d.ts.map +1 -0
  108. package/dist/tokenize/basic/word.js +23 -0
  109. package/dist/tokenize/basic/word.js.map +1 -0
  110. package/dist/tokenize/index.d.ts +5 -0
  111. package/dist/tokenize/index.d.ts.map +1 -0
  112. package/dist/tokenize/index.js +8 -0
  113. package/dist/tokenize/index.js.map +1 -0
  114. package/dist/tokenize/token_stream.d.ts +36 -0
  115. package/dist/tokenize/token_stream.d.ts.map +1 -0
  116. package/dist/tokenize/token_stream.js +136 -0
  117. package/dist/tokenize/token_stream.js.map +1 -0
  118. package/dist/tokenize/tokenizer.d.ts +55 -0
  119. package/dist/tokenize/tokenizer.d.ts.map +1 -0
  120. package/dist/tokenize/tokenizer.js +117 -0
  121. package/dist/tokenize/tokenizer.js.map +1 -0
  122. package/dist/transcription.js +78 -89
  123. package/dist/transcription.js.map +1 -1
  124. package/dist/tts/index.d.ts +1 -3
  125. package/dist/tts/index.d.ts.map +1 -1
  126. package/dist/tts/index.js +1 -3
  127. package/dist/tts/index.js.map +1 -1
  128. package/dist/tts/tts.d.ts +66 -37
  129. package/dist/tts/tts.d.ts.map +1 -1
  130. package/dist/tts/tts.js +79 -74
  131. package/dist/tts/tts.js.map +1 -1
  132. package/dist/utils.d.ts +21 -6
  133. package/dist/utils.d.ts.map +1 -1
  134. package/dist/utils.js +120 -76
  135. package/dist/utils.js.map +1 -1
  136. package/dist/vad.d.ts +43 -39
  137. package/dist/vad.d.ts.map +1 -1
  138. package/dist/vad.js +51 -4
  139. package/dist/vad.js.map +1 -1
  140. package/dist/worker.d.ts +1 -1
  141. package/dist/worker.js +257 -247
  142. package/dist/worker.js.map +1 -1
  143. package/package.json +4 -3
  144. package/src/index.ts +16 -2
  145. package/src/ipc/proc_pool.ts +25 -13
  146. package/src/llm/chat_context.ts +147 -0
  147. package/src/llm/function_context.test.ts +248 -0
  148. package/src/llm/function_context.ts +77 -18
  149. package/src/llm/index.ts +21 -2
  150. package/src/llm/llm.ts +102 -0
  151. package/src/multimodal/multimodal_agent.ts +6 -2
  152. package/src/pipeline/agent_output.ts +185 -0
  153. package/src/pipeline/agent_playout.ts +187 -0
  154. package/src/pipeline/human_input.ts +166 -0
  155. package/src/pipeline/index.ts +15 -0
  156. package/src/pipeline/pipeline_agent.ts +917 -0
  157. package/src/pipeline/speech_handle.ts +136 -0
  158. package/src/stt/index.ts +8 -2
  159. package/src/stt/stt.ts +98 -31
  160. package/src/tokenize/basic/basic.ts +73 -0
  161. package/src/tokenize/basic/hyphenator.ts +436 -0
  162. package/src/tokenize/basic/index.ts +5 -0
  163. package/src/tokenize/basic/paragraph.ts +43 -0
  164. package/src/tokenize/basic/sentence.ts +69 -0
  165. package/src/tokenize/basic/word.ts +27 -0
  166. package/src/tokenize/index.ts +16 -0
  167. package/src/tokenize/token_stream.ts +163 -0
  168. package/src/tokenize/tokenizer.ts +152 -0
  169. package/src/tts/index.ts +1 -20
  170. package/src/tts/tts.ts +110 -57
  171. package/src/utils.ts +95 -25
  172. package/src/vad.ts +86 -45
  173. package/tsconfig.tsbuildinfo +1 -1
  174. package/dist/stt/stream_adapter.d.ts +0 -19
  175. package/dist/stt/stream_adapter.d.ts.map +0 -1
  176. package/dist/stt/stream_adapter.js +0 -96
  177. package/dist/stt/stream_adapter.js.map +0 -1
  178. package/dist/tokenize.d.ts +0 -15
  179. package/dist/tokenize.d.ts.map +0 -1
  180. package/dist/tokenize.js +0 -12
  181. package/dist/tokenize.js.map +0 -1
  182. package/dist/tts/stream_adapter.d.ts +0 -19
  183. package/dist/tts/stream_adapter.d.ts.map +0 -1
  184. package/dist/tts/stream_adapter.js +0 -111
  185. package/dist/tts/stream_adapter.js.map +0 -1
  186. package/src/stt/stream_adapter.ts +0 -104
  187. package/src/tokenize.ts +0 -22
  188. package/src/tts/stream_adapter.ts +0 -93
@@ -18,6 +18,24 @@ export interface CallableFunction<P extends z.ZodTypeAny = any, R = any> {
18
18
  execute: (args: inferParameters<P>) => PromiseLike<R>;
19
19
  }
20
20
 
21
+ /** A function that has been called but is not yet running */
22
+ export interface FunctionCallInfo<P extends z.ZodTypeAny = any, R = any> {
23
+ name: string;
24
+ func: CallableFunction<P, R>;
25
+ toolCallId: string;
26
+ rawParams: string;
27
+ params: inferParameters<P>;
28
+ task?: PromiseLike<CallableFunctionResult>;
29
+ }
30
+
31
+ /** The result of an executed FunctionCallInfo. */
32
+ export interface CallableFunctionResult {
33
+ name: string;
34
+ toolCallId: string;
35
+ result?: any;
36
+ error?: any;
37
+ }
38
+
21
39
  /** An object containing callable functions and their names */
22
40
  export type FunctionContext = {
23
41
  [name: string]: CallableFunction;
@@ -26,29 +44,50 @@ export type FunctionContext = {
26
44
  /** @internal */
27
45
  export const oaiParams = (p: z.AnyZodObject) => {
28
46
  const properties: Record<string, any> = {};
29
- const required_properties: string[] = [];
47
+ const requiredProperties: string[] = [];
30
48
 
31
- for (const key in p.shape) {
32
- const field = p.shape[key];
33
- const description = field._def.description || undefined;
34
- let type: string;
35
- let enumValues: any[] | undefined;
49
+ const processZodType = (field: z.ZodTypeAny): any => {
50
+ const isOptional = field instanceof z.ZodOptional;
51
+ const nestedField = isOptional ? field._def.innerType : field;
52
+ const description = field._def.description;
36
53
 
37
- if (field instanceof z.ZodEnum) {
38
- enumValues = field._def.values;
39
- type = typeof enumValues![0];
54
+ if (nestedField instanceof z.ZodEnum) {
55
+ return {
56
+ type: typeof nestedField._def.values[0],
57
+ ...(description && { description }),
58
+ enum: nestedField._def.values,
59
+ };
60
+ } else if (nestedField instanceof z.ZodArray) {
61
+ const elementType = nestedField._def.type;
62
+ return {
63
+ type: 'array',
64
+ ...(description && { description }),
65
+ items: processZodType(elementType),
66
+ };
67
+ } else if (nestedField instanceof z.ZodObject) {
68
+ const { properties, required } = oaiParams(nestedField);
69
+ return {
70
+ type: 'object',
71
+ ...(description && { description }),
72
+ properties,
73
+ required,
74
+ };
40
75
  } else {
41
- type = field._def.typeName.toLowerCase();
76
+ let type = nestedField._def.typeName.toLowerCase();
77
+ type = type.includes('zod') ? type.substring(3) : type;
78
+ return {
79
+ type,
80
+ ...(description && { description }),
81
+ };
42
82
  }
83
+ };
43
84
 
44
- properties[key] = {
45
- type: type.includes('zod') ? type.substring(3) : type,
46
- description,
47
- enum: enumValues,
48
- };
85
+ for (const key in p.shape) {
86
+ const field = p.shape[key];
87
+ properties[key] = processZodType(field);
49
88
 
50
- if (!field._def.defaultValue) {
51
- required_properties.push(key);
89
+ if (!(field instanceof z.ZodOptional)) {
90
+ requiredProperties.push(key);
52
91
  }
53
92
  }
54
93
 
@@ -56,6 +95,26 @@ export const oaiParams = (p: z.AnyZodObject) => {
56
95
  return {
57
96
  type,
58
97
  properties,
59
- required_properties,
98
+ required: requiredProperties,
99
+ };
100
+ };
101
+
102
/**
 * Builds the {@link FunctionCallInfo} describing a tool call requested by the model.
 *
 * @param fncCtx - the registered functions, keyed by name
 * @param toolCallId - identifier of the tool call being described
 * @param fncName - name of the function to look up in `fncCtx`
 * @param rawArgs - JSON-encoded arguments for the call
 * @returns the call description; `task` is left unset
 * @throws Error when `fncName` is not an own key of `fncCtx`
 * @throws SyntaxError when `rawArgs` is not valid JSON (raised by `JSON.parse`)
 *
 * @internal
 */
export const oaiBuildFunctionInfo = (
  fncCtx: FunctionContext,
  toolCallId: string,
  fncName: string,
  rawArgs: string,
): FunctionCallInfo => {
  // Own-property lookup only: a plain index would also match inherited keys such as
  // `constructor` or `toString`, which are not registered functions.
  const func = Object.prototype.hasOwnProperty.call(fncCtx, fncName) ? fncCtx[fncName] : undefined;
  if (!func) {
    throw new Error(`AI function ${fncName} not found`);
  }

  return {
    name: fncName,
    func,
    toolCallId,
    rawParams: rawArgs,
    params: JSON.parse(rawArgs),
  };
};
package/src/llm/index.ts CHANGED
@@ -1,11 +1,30 @@
1
1
  // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
- import {
4
+ export {
5
5
  type CallableFunction,
6
+ type FunctionCallInfo,
7
+ type CallableFunctionResult,
6
8
  type FunctionContext,
7
9
  type inferParameters,
8
10
  oaiParams,
11
+ oaiBuildFunctionInfo,
9
12
  } from './function_context.js';
10
13
 
11
- export { CallableFunction, FunctionContext, inferParameters, oaiParams };
14
+ export {
15
+ type ChatImage,
16
+ type ChatAudio,
17
+ type ChatContent,
18
+ ChatRole,
19
+ ChatMessage,
20
+ ChatContext,
21
+ } from './chat_context.js';
22
+
23
+ export {
24
+ type ChoiceDelta,
25
+ type CompletionUsage,
26
+ type Choice,
27
+ type ChatChunk,
28
+ LLM,
29
+ LLMStream,
30
+ } from './llm.js';
package/src/llm/llm.ts ADDED
@@ -0,0 +1,102 @@
1
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import { AsyncIterableQueue } from '../utils.js';
5
+ import type { ChatContext, ChatRole } from './chat_context.js';
6
+ import type { FunctionCallInfo, FunctionContext } from './function_context.js';
7
+
8
+ export interface ChoiceDelta {
9
+ role: ChatRole;
10
+ content?: string;
11
+ toolCalls?: FunctionCallInfo[];
12
+ }
13
+
14
+ export interface CompletionUsage {
15
+ completionTokens: number;
16
+ promptTokens: number;
17
+ totalTokens: number;
18
+ }
19
+
20
+ export interface Choice {
21
+ delta: ChoiceDelta;
22
+ index: number;
23
+ }
24
+
25
+ export interface ChatChunk {
26
+ requestId: string;
27
+ choices: Choice[];
28
+ usage?: CompletionUsage;
29
+ }
30
+
31
+ export abstract class LLM {
32
+ /**
33
+ * Returns a {@link LLMStream} that can be used to push text and receive LLM responses.
34
+ */
35
+ abstract chat({
36
+ chatCtx,
37
+ fncCtx,
38
+ temperature,
39
+ n,
40
+ parallelToolCalls,
41
+ }: {
42
+ chatCtx: ChatContext;
43
+ fncCtx?: FunctionContext;
44
+ temperature?: number;
45
+ n?: number;
46
+ parallelToolCalls?: boolean;
47
+ }): LLMStream;
48
+ }
49
+
50
/**
 * Base class for a streamed LLM response.
 *
 * Iterating the stream yields the {@link ChatChunk}s that have been put on `queue`.
 * Function calls collected in `_functionCalls` are only started when
 * {@link LLMStream.executeFunctions} is called.
 */
export abstract class LLMStream implements AsyncIterableIterator<ChatChunk> {
  protected queue = new AsyncIterableQueue<ChatChunk>();
  protected closed = false;
  protected _functionCalls: FunctionCallInfo[] = [];

  #chatCtx: ChatContext;
  #fncCtx?: FunctionContext;

  /**
   * @param chatCtx - the chat context this stream was created from
   * @param fncCtx - the functions available to the model, if any
   */
  constructor(chatCtx: ChatContext, fncCtx?: FunctionContext) {
    this.#chatCtx = chatCtx;
    this.#fncCtx = fncCtx;
  }

  /** List of called functions from this stream. */
  get functionCalls(): FunctionCallInfo[] {
    return this._functionCalls;
  }

  /** The function context of this stream. */
  get fncCtx(): FunctionContext | undefined {
    return this.#fncCtx;
  }

  /** The initial chat context of this stream. */
  get chatCtx(): ChatContext {
    return this.#chatCtx;
  }

  /**
   * Execute all deferred functions of this stream concurrently.
   *
   * Every call gets a `task` that always fulfills with a {@link CallableFunctionResult}:
   * the outcome is carried in `result` on success and in `error` on failure.
   *
   * @returns the same list as {@link LLMStream.functionCalls}, with `task` set on each entry
   */
  executeFunctions(): FunctionCallInfo[] {
    for (const call of this._functionCalls) {
      const { name, toolCallId } = call;

      let pending: PromiseLike<unknown>;
      try {
        pending = call.func.execute(call.params);
      } catch (error) {
        // A synchronous throw must not stop the remaining calls from being started;
        // report it through this call's own task instead.
        pending = Promise.reject(error);
      }

      call.task = pending.then(
        (result) => ({ name, toolCallId, result }),
        (error) => ({ name, toolCallId, error }),
      );
    }
    return this._functionCalls;
  }

  /** Returns the next queued chunk. */
  next(): Promise<IteratorResult<ChatChunk>> {
    return this.queue.next();
  }

  /** Closes the underlying queue and marks the stream as closed. */
  close() {
    this.queue.close();
    this.closed = true;
  }

  [Symbol.asyncIterator](): LLMStream {
    return this;
  }
}
@@ -64,13 +64,16 @@ export class MultimodalAgent extends EventEmitter {
64
64
 
65
65
  constructor({
66
66
  model,
67
+ chatCtx,
67
68
  fncCtx,
68
69
  }: {
69
70
  model: RealtimeModel;
70
- fncCtx?: llm.FunctionContext | undefined;
71
+ chatCtx?: llm.ChatContext;
72
+ fncCtx?: llm.FunctionContext;
71
73
  }) {
72
74
  super();
73
75
  this.model = model;
76
+ this.#chatCtx = chatCtx;
74
77
  this.#fncCtx = fncCtx;
75
78
  }
76
79
 
@@ -83,6 +86,7 @@ export class MultimodalAgent extends EventEmitter {
83
86
  #logger = log();
84
87
  #session: RealtimeSession | null = null;
85
88
  #fncCtx: llm.FunctionContext | undefined = undefined;
89
+ #chatCtx: llm.ChatContext | undefined = undefined;
86
90
 
87
91
  #_started: boolean = false;
88
92
  #_pendingFunctionCalls: Set<string> = new Set();
@@ -209,7 +213,7 @@ export class MultimodalAgent extends EventEmitter {
209
213
  }
210
214
  }
211
215
 
212
- this.#session = this.model.session({ fncCtx: this.#fncCtx });
216
+ this.#session = this.model.session({ fncCtx: this.#fncCtx, chatCtx: this.#chatCtx });
213
217
  this.#started = true;
214
218
 
215
219
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
@@ -0,0 +1,185 @@
1
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import type { AudioFrame } from '@livekit/rtc-node';
5
+ import { log } from '../log.js';
6
+ import { SynthesizeStream, type TTS } from '../tts/index.js';
7
+ import { AsyncIterableQueue, CancellablePromise, Future, gracefullyCancel } from '../utils.js';
8
+ import type { AgentPlayout, PlayoutHandle } from './agent_playout.js';
9
+
10
+ export type SpeechSource = AsyncIterable<string> | string | Promise<string>;
11
+
12
/**
 * Handle for one speech being synthesized.
 *
 * Synthesized frames are put on `queue`; calling {@link SynthesisHandle.play} hands that
 * queue to the playout, and {@link SynthesisHandle.interrupt} resolves `intFut`.
 */
export class SynthesisHandle {
  /** Marker put on `queue` after the last frame of a synthesis. */
  static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');

  #speechId: string;
  ttsSource: SpeechSource;
  #agentPlayout: AgentPlayout;
  tts: TTS;
  queue = new AsyncIterableQueue<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>();
  #playHandle?: PlayoutHandle;
  /** Resolved once the speech has been interrupted. */
  intFut = new Future();
  #logger = log();

  constructor(speechId: string, ttsSource: SpeechSource, agentPlayout: AgentPlayout, tts: TTS) {
    this.#speechId = speechId;
    this.ttsSource = ttsSource;
    this.#agentPlayout = agentPlayout;
    this.tts = tts;
  }

  /** Identifier of the speech this handle belongs to. */
  get speechId(): string {
    return this.#speechId;
  }

  /** Whether {@link SynthesisHandle.play} has produced a playout handle. */
  get validated(): boolean {
    return Boolean(this.#playHandle);
  }

  /** Whether `intFut` is done. */
  get interrupted(): boolean {
    return this.intFut.done;
  }

  /** The playout handle created by {@link SynthesisHandle.play}, if any. */
  get playHandle(): PlayoutHandle | undefined {
    return this.#playHandle;
  }

  /**
   * Validate the speech for playout.
   *
   * @throws Error if the synthesis was already interrupted
   */
  play(): PlayoutHandle {
    if (this.interrupted) {
      throw new Error('synthesis was interrupted');
    }

    const playout = this.#agentPlayout.play(this.#speechId, this.queue);
    this.#playHandle = playout;
    return playout;
  }

  /** Interrupt the speech. Does nothing if it was already interrupted. */
  interrupt() {
    if (!this.interrupted) {
      this.#logger.child({ speechId: this.#speechId }).debug('interrupting synthesis/playout');
      this.#playHandle?.interrupt();
      this.intFut.resolve();
    }
  }
}
68
+
69
/**
 * Starts TTS synthesis tasks for an {@link AgentPlayout} and keeps track of the ones
 * still running so they can be cancelled on {@link AgentOutput.close}.
 */
export class AgentOutput {
  #agentPlayout: AgentPlayout;
  #tts: TTS;
  /** Synthesis tasks that have not settled yet. */
  #tasks: CancellablePromise<void>[] = [];

  constructor(agentPlayout: AgentPlayout, tts: TTS) {
    this.#agentPlayout = agentPlayout;
    this.#tts = tts;
  }

  /** The playout that synthesized speech is played through. */
  get playout(): AgentPlayout {
    return this.#agentPlayout;
  }

  /** Cancels every tracked synthesis task and waits for them. */
  async close() {
    this.#tasks.forEach((task) => task.cancel());
    await Promise.all(this.#tasks);
  }

  /**
   * Starts synthesizing `ttsSource` and returns the handle for it.
   *
   * @param speechId - identifier of the speech
   * @param ttsSource - text, a promise of text, or a stream of text to synthesize
   */
  synthesize(speechId: string, ttsSource: SpeechSource): SynthesisHandle {
    const handle = new SynthesisHandle(speechId, ttsSource, this.#agentPlayout, this.#tts);
    const task = this.#synthesize(handle);
    this.#tasks.push(task);
    task.finally(() => {
      // Remove exactly this task. `splice(index)` without a delete count drops everything
      // from `index` to the end, and an index of -1 would drop the last tracked task.
      const index = this.#tasks.indexOf(task);
      if (index !== -1) {
        this.#tasks.splice(index, 1);
      }
    });
    return handle;
  }

  /**
   * Runs the synthesis for `handle` until it completes or `handle.intFut` resolves.
   * Cancelling the returned promise cancels the inner synthesis task.
   */
  #synthesize(handle: SynthesisHandle): CancellablePromise<void> {
    // eslint-disable-next-line @typescript-eslint/no-unused-vars
    return new CancellablePromise(async (resolve, _, onCancel) => {
      const ttsSource = await handle.ttsSource;
      let task: CancellablePromise<void>;
      if (typeof ttsSource === 'string') {
        task = stringSynthesisTask(ttsSource, handle);
      } else {
        task = streamSynthesisTask(ttsSource, handle);
      }

      onCancel(() => {
        gracefullyCancel(task);
      });

      try {
        await Promise.any([task, handle.intFut.await]);
      } finally {
        // The wait may have ended because of an interruption; stop synthesizing then.
        if (handle.intFut.done) {
          gracefullyCancel(task);
        }
      }

      resolve();
    });
  }
}
123
+
124
/**
 * Synthesizes a complete string: pushes `text` into a fresh TTS stream, forwards the
 * produced frames to `handle.queue`, then puts the flush sentinel on the queue.
 */
const stringSynthesisTask = (text: string, handle: SynthesisHandle): CancellablePromise<void> => {
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  return new CancellablePromise<void>(async (resolve, _, onCancel) => {
    let aborted = false;
    onCancel(() => {
      aborted = true;
    });

    const synthesis = handle.tts.stream();
    synthesis.pushText(text);
    synthesis.flush();
    synthesis.endInput();

    for await (const chunk of synthesis) {
      if (aborted) {
        break;
      }
      if (chunk === SynthesizeStream.END_OF_STREAM) {
        break;
      }
      handle.queue.put(chunk.frame);
    }
    // Always mark the end of this speech's audio, even when stopped early.
    handle.queue.put(SynthesisHandle.FLUSH_SENTINEL);

    resolve();
  });
};
145
+
146
/**
 * Synthesizes a stream of text: each piece is pushed into a fresh TTS stream while a
 * concurrent reader forwards the produced frames to `handle.queue`.
 *
 * The returned promise resolves once all text has been pushed and the input ended; the
 * reader is not awaited.
 */
const streamSynthesisTask = (
  stream: AsyncIterable<string>,
  handle: SynthesisHandle,
): CancellablePromise<void> => {
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  return new CancellablePromise<void>(async (resolve, _, onCancel) => {
    let aborted = false;
    onCancel(() => {
      aborted = true;
    });

    const synthesis = handle.tts.stream();

    const forwardAudio = async () => {
      let sawFrame = false;
      for await (const chunk of synthesis) {
        if (aborted) {
          break;
        }
        if (chunk === SynthesizeStream.END_OF_STREAM) {
          // End-of-stream markers seen before any frame are skipped.
          if (sawFrame) {
            break;
          }
          continue;
        }
        handle.queue.put(chunk.frame);
        sawFrame = true;
      }
      handle.queue.put(SynthesisHandle.FLUSH_SENTINEL);
    };
    // Runs alongside the text pump below; intentionally not awaited.
    void forwardAudio();

    for await (const piece of stream) {
      if (aborted) {
        break;
      }
      synthesis.pushText(piece);
    }
    synthesis.flush();
    synthesis.endInput();

    resolve();
  });
};
@@ -0,0 +1,187 @@
1
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import type { AudioFrame, AudioSource } from '@livekit/rtc-node';
5
+ import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
6
+ import EventEmitter from 'node:events';
7
+ import { log } from '../log.js';
8
+ import { CancellablePromise, Future, gracefullyCancel } from '../utils.js';
9
+ import { SynthesisHandle } from './agent_output.js';
10
+
11
+ export enum AgentPlayoutEvent {
12
+ PLAYOUT_STARTED,
13
+ PLAYOUT_STOPPED,
14
+ }
15
+
16
+ export type AgentPlayoutCallbacks = {
17
+ [AgentPlayoutEvent.PLAYOUT_STARTED]: () => void;
18
+ [AgentPlayoutEvent.PLAYOUT_STOPPED]: (interrupt: boolean) => void;
19
+ };
20
+
21
/**
 * Handle for one playout of synthesized audio.
 *
 * `intFut` resolves when the playout is interrupted and `doneFut` when it has finished.
 */
export class PlayoutHandle {
  #speechId: string;
  #audioSource: AudioSource;
  playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>;
  /** Final played time, recorded when the playout finishes. */
  totalPlayedTime?: number;
  #interrupted = false;
  /** Total duration of the frames pushed so far; same unit as `AudioSource.queuedDuration`. */
  pushedDuration = 0;
  intFut = new Future();
  doneFut = new Future();

  constructor(
    speechId: string,
    audioSource: AudioSource,
    playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>,
  ) {
    this.#speechId = speechId;
    this.#audioSource = audioSource;
    this.playoutSource = playoutSource;
  }

  /** Identifier of the speech being played. */
  get speechId(): string {
    return this.#speechId;
  }

  /** Whether {@link PlayoutHandle.interrupt} took effect on this playout. */
  get interrupted(): boolean {
    return this.#interrupted;
  }

  /**
   * Time played so far: the recorded `totalPlayedTime` once the playout has finished,
   * otherwise the pushed duration minus what is still queued on the audio source.
   */
  get timePlayed(): number {
    // `??` rather than `||`: a recorded total of 0 is a valid final value and must not
    // be replaced by a fresh reading of the shared audio source's queue.
    return this.totalPlayedTime ?? this.pushedDuration - this.#audioSource.queuedDuration;
  }

  /** Whether the playout has finished or was interrupted. */
  get done(): boolean {
    return this.doneFut.done || this.#interrupted;
  }

  /** Interrupts the playout. Does nothing if it is already done. */
  interrupt() {
    if (this.done) {
      return;
    }

    this.intFut.resolve();
    this.#interrupted = true;
  }

  /** Returns the future that resolves when the playout has finished. */
  join(): Future {
    return this.doneFut;
  }
}
70
+
71
/**
 * Plays synthesized audio into an {@link AudioSource}, one {@link PlayoutHandle} at a time.
 *
 * Emits `PLAYOUT_STARTED` when the first frame of a playout is captured and
 * `PLAYOUT_STOPPED` when a playout that captured at least one frame finishes.
 */
export class AgentPlayout extends (EventEmitter as new () => TypedEmitter<AgentPlayoutCallbacks>) {
  #closed = false;
  #audioSource: AudioSource;
  #targetVolume = 1;
  #playoutTask?: CancellablePromise<void>;
  #logger = log();

  constructor(audioSource: AudioSource) {
    super();
    this.#audioSource = audioSource;
  }

  get targetVolume(): number {
    return this.#targetVolume;
  }

  set targetVolume(vol: number) {
    this.#targetVolume = vol;
  }

  /**
   * Starts a new playout; the previous playout task, if any, is cancelled first.
   *
   * @param speechId - identifier of the speech being played
   * @param playoutSource - frames to play, ended by the flush sentinel
   * @throws Error if {@link AgentPlayout.close} was already called
   */
  play(
    speechId: string,
    playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>,
  ): PlayoutHandle {
    if (this.#closed) {
      throw new Error('source closed');
    }

    const handle = new PlayoutHandle(speechId, this.#audioSource, playoutSource);

    this.#playoutTask = this.#playout(handle, this.#playoutTask);
    return handle;
  }

  /**
   * Cancels `oldTask`, then captures the frames of `handle.playoutSource` into the audio
   * source until the source ends, the flush sentinel is seen, the handle is interrupted,
   * or the returned promise is cancelled.
   */
  #playout(handle: PlayoutHandle, oldTask?: CancellablePromise<void>): CancellablePromise<void> {
    return new CancellablePromise(async (resolve, _, onCancel) => {
      // Declared before `cancel` so that a cancellation arriving while the previous task
      // is still being awaited does not hit uninitialized bindings.
      let firstFrame = true;
      let captureTask: CancellablePromise<void> | undefined;
      let cancelRequested = false;

      const cancel = () => {
        captureTask?.cancel();
        handle.totalPlayedTime = handle.pushedDuration - this.#audioSource.queuedDuration;

        if (handle.interrupted || captureTask?.error) {
          this.#audioSource.clearQueue(); // make sure to remove any queued frames
        }

        // Only announce a stop for playouts that announced a start.
        if (!firstFrame) {
          this.emit(AgentPlayoutEvent.PLAYOUT_STOPPED, handle.interrupted);
        }

        handle.doneFut.resolve();

        this.#logger
          .child({ speechId: handle.speechId, interrupted: handle.interrupted })
          .debug('playout finished');
      };

      onCancel(() => {
        cancelRequested = true;
        cancel();
      });

      if (oldTask) {
        await gracefullyCancel(oldTask);
      }

      if (cancelRequested) {
        // Cancelled before capture began: `cancel` already finalized the handle, so do
        // not start pushing audio for it.
        resolve();
        return;
      }

      if (this.#audioSource.queuedDuration > 0) {
        // this should not happen, but log it just in case
        this.#logger
          .child({ speechId: handle.speechId, queuedDuration: this.#audioSource.queuedDuration })
          .warn('new playout while the source is still playing');
      }

      // eslint-disable-next-line @typescript-eslint/no-unused-vars
      captureTask = new CancellablePromise<void>(async (resolve, _, onCancel) => {
        let cancelled = false;
        onCancel(() => {
          cancelled = true;
        });

        for await (const frame of handle.playoutSource) {
          if (cancelled || frame === SynthesisHandle.FLUSH_SENTINEL) break;
          if (firstFrame) {
            this.#logger
              .child({ speechId: handle.speechId })
              .debug('started playing the first time');
            this.emit(AgentPlayoutEvent.PLAYOUT_STARTED);
            firstFrame = false;
          }
          handle.pushedDuration += (frame.samplesPerChannel / frame.sampleRate) * 1000;
          await this.#audioSource.captureFrame(frame);
          // XXX(nbsp): waiting after every frame replaces a single wait for the queue to
          // drain after the loop. this is not the case on python agents, but for some
          // reason too many TTS frames can gunk up the buffer and lead to FFI errors.
          await this.#audioSource.waitForPlayout();
        }

        resolve();
      });

      try {
        await Promise.any([captureTask, handle.intFut.await]);
      } finally {
        cancel();
        resolve();
      }
    });
  }

  /** Rejects further playouts and waits for the current playout task to settle. */
  async close() {
    this.#closed = true;
    await this.#playoutTask;
  }
}