@livekit/agents 0.3.5 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188) hide show
  1. package/.turbo/turbo-build.log +1 -1
  2. package/CHANGELOG.md +36 -0
  3. package/dist/audio.js +17 -30
  4. package/dist/audio.js.map +1 -1
  5. package/dist/cli.js +3 -14
  6. package/dist/cli.js.map +1 -1
  7. package/dist/http_server.d.ts +1 -1
  8. package/dist/http_server.js +5 -9
  9. package/dist/http_server.js.map +1 -1
  10. package/dist/index.d.ts +3 -2
  11. package/dist/index.d.ts.map +1 -1
  12. package/dist/index.js +14 -2
  13. package/dist/index.js.map +1 -1
  14. package/dist/ipc/job_executor.js +3 -5
  15. package/dist/ipc/job_executor.js.map +1 -1
  16. package/dist/ipc/job_main.d.ts +1 -1
  17. package/dist/ipc/proc_job_executor.js +66 -80
  18. package/dist/ipc/proc_job_executor.js.map +1 -1
  19. package/dist/ipc/proc_pool.d.ts +3 -3
  20. package/dist/ipc/proc_pool.d.ts.map +1 -1
  21. package/dist/ipc/proc_pool.js +38 -20
  22. package/dist/ipc/proc_pool.js.map +1 -1
  23. package/dist/job.js +56 -73
  24. package/dist/job.js.map +1 -1
  25. package/dist/llm/chat_context.d.ts +66 -0
  26. package/dist/llm/chat_context.d.ts.map +1 -0
  27. package/dist/llm/chat_context.js +93 -0
  28. package/dist/llm/chat_context.js.map +1 -0
  29. package/dist/llm/function_context.d.ts +19 -1
  30. package/dist/llm/function_context.d.ts.map +1 -1
  31. package/dist/llm/function_context.js +54 -18
  32. package/dist/llm/function_context.js.map +1 -1
  33. package/dist/llm/function_context.test.d.ts +2 -0
  34. package/dist/llm/function_context.test.d.ts.map +1 -0
  35. package/dist/llm/function_context.test.js +218 -0
  36. package/dist/llm/function_context.test.js.map +1 -0
  37. package/dist/llm/index.d.ts +3 -2
  38. package/dist/llm/index.d.ts.map +1 -1
  39. package/dist/llm/index.js +3 -2
  40. package/dist/llm/index.js.map +1 -1
  41. package/dist/llm/llm.d.ts +53 -0
  42. package/dist/llm/llm.d.ts.map +1 -0
  43. package/dist/llm/llm.js +45 -0
  44. package/dist/llm/llm.js.map +1 -0
  45. package/dist/multimodal/agent_playout.d.ts +1 -1
  46. package/dist/multimodal/agent_playout.js +116 -153
  47. package/dist/multimodal/agent_playout.js.map +1 -1
  48. package/dist/multimodal/multimodal_agent.d.ts +4 -3
  49. package/dist/multimodal/multimodal_agent.d.ts.map +1 -1
  50. package/dist/multimodal/multimodal_agent.js +207 -234
  51. package/dist/multimodal/multimodal_agent.js.map +1 -1
  52. package/dist/pipeline/agent_output.d.ts +30 -0
  53. package/dist/pipeline/agent_output.d.ts.map +1 -0
  54. package/dist/pipeline/agent_output.js +155 -0
  55. package/dist/pipeline/agent_output.js.map +1 -0
  56. package/dist/pipeline/agent_playout.d.ts +38 -0
  57. package/dist/pipeline/agent_playout.d.ts.map +1 -0
  58. package/dist/pipeline/agent_playout.js +142 -0
  59. package/dist/pipeline/agent_playout.js.map +1 -0
  60. package/dist/pipeline/human_input.d.ts +28 -0
  61. package/dist/pipeline/human_input.d.ts.map +1 -0
  62. package/dist/pipeline/human_input.js +134 -0
  63. package/dist/pipeline/human_input.js.map +1 -0
  64. package/dist/pipeline/index.d.ts +2 -0
  65. package/dist/pipeline/index.d.ts.map +1 -0
  66. package/dist/pipeline/index.js +5 -0
  67. package/dist/pipeline/index.js.map +1 -0
  68. package/dist/pipeline/pipeline_agent.d.ts +134 -0
  69. package/dist/pipeline/pipeline_agent.d.ts.map +1 -0
  70. package/dist/pipeline/pipeline_agent.js +661 -0
  71. package/dist/pipeline/pipeline_agent.js.map +1 -0
  72. package/dist/pipeline/speech_handle.d.ts +27 -0
  73. package/dist/pipeline/speech_handle.d.ts.map +1 -0
  74. package/dist/pipeline/speech_handle.js +102 -0
  75. package/dist/pipeline/speech_handle.js.map +1 -0
  76. package/dist/plugin.js +7 -20
  77. package/dist/plugin.js.map +1 -1
  78. package/dist/stt/index.d.ts +1 -2
  79. package/dist/stt/index.d.ts.map +1 -1
  80. package/dist/stt/index.js +1 -2
  81. package/dist/stt/index.js.map +1 -1
  82. package/dist/stt/stt.d.ts +62 -24
  83. package/dist/stt/stt.d.ts.map +1 -1
  84. package/dist/stt/stt.js +77 -27
  85. package/dist/stt/stt.js.map +1 -1
  86. package/dist/tokenize/basic/basic.d.ts +16 -0
  87. package/dist/tokenize/basic/basic.d.ts.map +1 -0
  88. package/dist/tokenize/basic/basic.js +50 -0
  89. package/dist/tokenize/basic/basic.js.map +1 -0
  90. package/dist/tokenize/basic/hyphenator.d.ts +17 -0
  91. package/dist/tokenize/basic/hyphenator.d.ts.map +1 -0
  92. package/dist/tokenize/basic/hyphenator.js +420 -0
  93. package/dist/tokenize/basic/hyphenator.js.map +1 -0
  94. package/dist/tokenize/basic/index.d.ts +2 -0
  95. package/dist/tokenize/basic/index.d.ts.map +1 -0
  96. package/dist/tokenize/basic/index.js +5 -0
  97. package/dist/tokenize/basic/index.js.map +1 -0
  98. package/dist/tokenize/basic/paragraph.d.ts +5 -0
  99. package/dist/tokenize/basic/paragraph.d.ts.map +1 -0
  100. package/dist/tokenize/basic/paragraph.js +38 -0
  101. package/dist/tokenize/basic/paragraph.js.map +1 -0
  102. package/dist/tokenize/basic/sentence.d.ts +5 -0
  103. package/dist/tokenize/basic/sentence.d.ts.map +1 -0
  104. package/dist/tokenize/basic/sentence.js +60 -0
  105. package/dist/tokenize/basic/sentence.js.map +1 -0
  106. package/dist/tokenize/basic/word.d.ts +5 -0
  107. package/dist/tokenize/basic/word.d.ts.map +1 -0
  108. package/dist/tokenize/basic/word.js +23 -0
  109. package/dist/tokenize/basic/word.js.map +1 -0
  110. package/dist/tokenize/index.d.ts +5 -0
  111. package/dist/tokenize/index.d.ts.map +1 -0
  112. package/dist/tokenize/index.js +8 -0
  113. package/dist/tokenize/index.js.map +1 -0
  114. package/dist/tokenize/token_stream.d.ts +36 -0
  115. package/dist/tokenize/token_stream.d.ts.map +1 -0
  116. package/dist/tokenize/token_stream.js +136 -0
  117. package/dist/tokenize/token_stream.js.map +1 -0
  118. package/dist/tokenize/tokenizer.d.ts +55 -0
  119. package/dist/tokenize/tokenizer.d.ts.map +1 -0
  120. package/dist/tokenize/tokenizer.js +117 -0
  121. package/dist/tokenize/tokenizer.js.map +1 -0
  122. package/dist/transcription.js +78 -89
  123. package/dist/transcription.js.map +1 -1
  124. package/dist/tts/index.d.ts +1 -3
  125. package/dist/tts/index.d.ts.map +1 -1
  126. package/dist/tts/index.js +1 -3
  127. package/dist/tts/index.js.map +1 -1
  128. package/dist/tts/tts.d.ts +66 -37
  129. package/dist/tts/tts.d.ts.map +1 -1
  130. package/dist/tts/tts.js +79 -74
  131. package/dist/tts/tts.js.map +1 -1
  132. package/dist/utils.d.ts +21 -6
  133. package/dist/utils.d.ts.map +1 -1
  134. package/dist/utils.js +120 -76
  135. package/dist/utils.js.map +1 -1
  136. package/dist/vad.d.ts +43 -39
  137. package/dist/vad.d.ts.map +1 -1
  138. package/dist/vad.js +51 -4
  139. package/dist/vad.js.map +1 -1
  140. package/dist/worker.d.ts +1 -1
  141. package/dist/worker.js +257 -247
  142. package/dist/worker.js.map +1 -1
  143. package/package.json +4 -3
  144. package/src/index.ts +16 -2
  145. package/src/ipc/proc_pool.ts +25 -13
  146. package/src/llm/chat_context.ts +147 -0
  147. package/src/llm/function_context.test.ts +248 -0
  148. package/src/llm/function_context.ts +77 -18
  149. package/src/llm/index.ts +21 -2
  150. package/src/llm/llm.ts +102 -0
  151. package/src/multimodal/multimodal_agent.ts +6 -2
  152. package/src/pipeline/agent_output.ts +185 -0
  153. package/src/pipeline/agent_playout.ts +187 -0
  154. package/src/pipeline/human_input.ts +166 -0
  155. package/src/pipeline/index.ts +15 -0
  156. package/src/pipeline/pipeline_agent.ts +917 -0
  157. package/src/pipeline/speech_handle.ts +136 -0
  158. package/src/stt/index.ts +8 -2
  159. package/src/stt/stt.ts +98 -31
  160. package/src/tokenize/basic/basic.ts +73 -0
  161. package/src/tokenize/basic/hyphenator.ts +436 -0
  162. package/src/tokenize/basic/index.ts +5 -0
  163. package/src/tokenize/basic/paragraph.ts +43 -0
  164. package/src/tokenize/basic/sentence.ts +69 -0
  165. package/src/tokenize/basic/word.ts +27 -0
  166. package/src/tokenize/index.ts +16 -0
  167. package/src/tokenize/token_stream.ts +163 -0
  168. package/src/tokenize/tokenizer.ts +152 -0
  169. package/src/tts/index.ts +1 -20
  170. package/src/tts/tts.ts +110 -57
  171. package/src/utils.ts +95 -25
  172. package/src/vad.ts +86 -45
  173. package/tsconfig.tsbuildinfo +1 -1
  174. package/dist/stt/stream_adapter.d.ts +0 -19
  175. package/dist/stt/stream_adapter.d.ts.map +0 -1
  176. package/dist/stt/stream_adapter.js +0 -96
  177. package/dist/stt/stream_adapter.js.map +0 -1
  178. package/dist/tokenize.d.ts +0 -15
  179. package/dist/tokenize.d.ts.map +0 -1
  180. package/dist/tokenize.js +0 -12
  181. package/dist/tokenize.js.map +0 -1
  182. package/dist/tts/stream_adapter.d.ts +0 -19
  183. package/dist/tts/stream_adapter.d.ts.map +0 -1
  184. package/dist/tts/stream_adapter.js +0 -111
  185. package/dist/tts/stream_adapter.js.map +0 -1
  186. package/src/stt/stream_adapter.ts +0 -104
  187. package/src/tokenize.ts +0 -22
  188. package/src/tts/stream_adapter.ts +0 -93
@@ -0,0 +1,917 @@
1
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import type { LocalTrackPublication, RemoteParticipant, Room } from '@livekit/rtc-node';
5
+ import {
6
+ AudioSource,
7
+ LocalAudioTrack,
8
+ RoomEvent,
9
+ TrackPublishOptions,
10
+ TrackSource,
11
+ } from '@livekit/rtc-node';
12
+ import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
13
+ import EventEmitter from 'node:events';
14
+ import type {
15
+ CallableFunctionResult,
16
+ FunctionCallInfo,
17
+ FunctionContext,
18
+ LLM,
19
+ } from '../llm/index.js';
20
+ import { LLMStream } from '../llm/index.js';
21
+ import { ChatContext, ChatMessage, ChatRole } from '../llm/index.js';
22
+ import { log } from '../log.js';
23
+ import type { STT } from '../stt/index.js';
24
+ import {
25
+ SentenceTokenizer as BasicSentenceTokenizer,
26
+ WordTokenizer as BasicWordTokenizer,
27
+ hyphenateWord,
28
+ } from '../tokenize/basic/index.js';
29
+ import type { SentenceTokenizer, WordTokenizer } from '../tokenize/tokenizer.js';
30
+ import type { TTS } from '../tts/index.js';
31
+ import { AsyncIterableQueue, CancellablePromise, Future, gracefullyCancel } from '../utils.js';
32
+ import type { VAD, VADEvent } from '../vad.js';
33
+ import type { SpeechSource, SynthesisHandle } from './agent_output.js';
34
+ import { AgentOutput } from './agent_output.js';
35
+ import { AgentPlayout, AgentPlayoutEvent } from './agent_playout.js';
36
+ import { HumanInput, HumanInputEvent } from './human_input.js';
37
+ import { SpeechHandle } from './speech_handle.js';
38
+
39
/** Lifecycle states the agent publishes while running. */
export type AgentState = 'initializing' | 'thinking' | 'listening' | 'speaking';

/**
 * Called before each LLM inference for a reply.
 *
 * @remarks
 * Return an {@link LLMStream} to use it, `false` to cancel the reply,
 * or void to fall back to the default stream (`agent.llm.chat(...)`).
 */
export type BeforeLLMCallback = (
  agent: VoicePipelineAgent,
  chatCtx: ChatContext,
) => LLMStream | false | void | Promise<LLMStream | false | void>;

/**
 * Called before text is handed to TTS; may transform the text
 * (e.g. to adjust pronunciation) before synthesis.
 */
export type BeforeTTSCallback = (
  agent: VoicePipelineAgent,
  source: string | AsyncIterable<string>,
) => SpeechSource;
50
+
51
/** Events emitted by {@link VoicePipelineAgent}. */
export enum VPAEvent {
  USER_STARTED_SPEAKING,
  USER_STOPPED_SPEAKING,
  AGENT_STARTED_SPEAKING,
  AGENT_STOPPED_SPEAKING,
  /** The user's transcript was added to the chat context. */
  USER_SPEECH_COMMITTED,
  /** The agent's reply was fully played and added to the chat context. */
  AGENT_SPEECH_COMMITTED,
  /** The agent's reply was interrupted before completion. */
  AGENT_SPEECH_INTERRUPTED,
  /** The LLM requested one or more tool calls. */
  FUNCTION_CALLS_COLLECTED,
  /** All requested tool calls have finished executing. */
  FUNCTION_CALLS_FINISHED,
}

/** Listener signatures for each {@link VPAEvent}. */
export type VPACallbacks = {
  [VPAEvent.USER_STARTED_SPEAKING]: () => void;
  [VPAEvent.USER_STOPPED_SPEAKING]: () => void;
  [VPAEvent.AGENT_STARTED_SPEAKING]: () => void;
  [VPAEvent.AGENT_STOPPED_SPEAKING]: () => void;
  [VPAEvent.USER_SPEECH_COMMITTED]: (msg: ChatMessage) => void;
  [VPAEvent.AGENT_SPEECH_COMMITTED]: (msg: ChatMessage) => void;
  [VPAEvent.AGENT_SPEECH_INTERRUPTED]: (msg: ChatMessage) => void;
  [VPAEvent.FUNCTION_CALLS_COLLECTED]: (funcs: FunctionCallInfo[]) => void;
  [VPAEvent.FUNCTION_CALLS_FINISHED]: (funcs: CallableFunctionResult[]) => void;
};
74
+
75
+ export class AgentCallContext {
76
+ #agent: VoicePipelineAgent;
77
+ #llmStream: LLMStream;
78
+ #metadata = new Map<string, any>();
79
+ static #current: AgentCallContext;
80
+
81
+ constructor(agent: VoicePipelineAgent, llmStream: LLMStream) {
82
+ this.#agent = agent;
83
+ this.#llmStream = llmStream;
84
+ AgentCallContext.#current = this;
85
+ }
86
+
87
+ static getCurrent(): AgentCallContext {
88
+ return AgentCallContext.#current;
89
+ }
90
+
91
+ get agent(): VoicePipelineAgent {
92
+ return this.#agent;
93
+ }
94
+
95
+ storeMetadata(key: string, value: any) {
96
+ this.#metadata.set(key, value);
97
+ }
98
+
99
+ getMetadata(key: string, orDefault: any = undefined) {
100
+ return this.#metadata.get(key) || orDefault;
101
+ }
102
+
103
+ get llmStream(): LLMStream {
104
+ return this.#llmStream;
105
+ }
106
+ }
107
+
108
+ const defaultBeforeLLMCallback: BeforeLLMCallback = (
109
+ agent: VoicePipelineAgent,
110
+ chatCtx: ChatContext,
111
+ ): LLMStream => {
112
+ return agent.llm.chat({ chatCtx, fncCtx: agent.fncCtx });
113
+ };
114
+
115
+ const defaultBeforeTTSCallback: BeforeTTSCallback = (
116
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
117
+ _: VoicePipelineAgent,
118
+ text: string | AsyncIterable<string>,
119
+ ): string | AsyncIterable<string> => {
120
+ return text;
121
+ };
122
+
123
/** Options controlling how user and agent transcriptions are forwarded to the client. */
export interface AgentTranscriptionOptions {
  /** Whether to forward the user transcription to the client */
  userTranscription: boolean;
  /** Whether to forward the agent transcription to the client */
  agentTranscription: boolean;
  /**
   * The speed at which the agent's speech transcription is forwarded to the client.
   * We try to mimic the agent's speech speed by adjusting the transcription speed.
   */
  agentTranscriptionSpeech: number;
  /**
   * The tokenizer used to split the speech into sentences.
   * This is used to decide when to mark a transcript as final for the agent transcription.
   */
  sentenceTokenizer: SentenceTokenizer;
  /**
   * The tokenizer used to split the speech into words.
   * This is used to simulate the "interim results" of the agent transcription.
   */
  wordTokenizer: WordTokenizer;
  /**
   * A function that takes a string (word) as input and returns a list of strings,
   * representing the hyphenated parts of the word.
   */
  hyphenateWord: (word: string) => string[];
}
149
+
150
/**
 * Default transcription options: forward both user and agent transcripts at 1x
 * speed, using the basic sentence/word tokenizers and hyphenator.
 */
const defaultAgentTranscriptionOptions: AgentTranscriptionOptions = {
  userTranscription: true,
  agentTranscription: true,
  agentTranscriptionSpeech: 1,
  sentenceTokenizer: new BasicSentenceTokenizer(),
  // `false` — presumably disables punctuation retention in the word tokenizer; TODO confirm
  wordTokenizer: new BasicWordTokenizer(false),
  hyphenateWord: hyphenateWord,
};
158
+
159
/** Configuration options for {@link VoicePipelineAgent}. */
export interface VPAOptions {
  /** Chat context for the assistant. */
  chatCtx?: ChatContext;
  /** Function context for the assistant. */
  fncCtx?: FunctionContext;
  /** Whether to allow the user to interrupt the assistant. */
  allowInterruptions: boolean;
  /** Minimum duration of speech to consider for interruption. */
  interruptSpeechDuration: number;
  /** Minimum number of words to consider for interruption. This may increase latency. */
  interruptMinWords: number;
  /** Delay to wait before considering the user speech done. */
  minEndpointingDelay: number;
  /** Maximum number of nested tool-call rounds before giving the final answer. */
  maxRecursiveFncCalls: number;
  /** Whether to preemptively synthesize responses. */
  preemptiveSynthesis: boolean;
  /**
   * Callback called when the assistant is about to synthesize a reply.
   *
   * @remarks
   * Returning void will create a default LLM stream.
   * You can also return your own LLM stream by calling `llm.chat()`.
   * Returning `false` will cancel the synthesis of the reply.
   */
  beforeLLMCallback: BeforeLLMCallback;
  /**
   * Callback called when the assistant is about to synthesize speech.
   *
   * @remarks
   * This can be used to customize text before synthesis
   * (e.g. editing the pronunciation of a word).
   */
  beforeTTSCallback: BeforeTTSCallback;
  /** Options for assistant transcription. */
  transcription: AgentTranscriptionOptions;
}
195
+
196
/** Default {@link VPAOptions} applied when the caller omits a field. */
const defaultVPAOptions: VPAOptions = {
  chatCtx: new ChatContext(),
  allowInterruptions: true,
  // compared against VADEvent.speechDuration in #linkParticipant —
  // presumably milliseconds; TODO confirm units against the VAD implementation
  interruptSpeechDuration: 50,
  // 0 disables the word-count gate on interruptions
  interruptMinWords: 0,
  // passed to DeferredReplyValidation — presumably milliseconds; TODO confirm
  minEndpointingDelay: 500,
  maxRecursiveFncCalls: 1,
  preemptiveSynthesis: false,
  beforeLLMCallback: defaultBeforeLLMCallback,
  beforeTTSCallback: defaultBeforeTTSCallback,
  transcription: defaultAgentTranscriptionOptions,
};
208
+
209
/** A pipeline agent (VAD + STT + LLM + TTS) implementation. */
export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<VPACallbacks>) {
  /** Minimum time played for the user speech to be committed to the chat context. */
  readonly MIN_TIME_PLAYED_FOR_COMMIT = 1.5;
  // queue marker telling the playout loop to stop draining and reopen the queue
  protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');

  #vad: VAD;
  #stt: STT;
  #llm: LLM;
  #tts: TTS;
  // effective options (defaults merged with caller overrides in the constructor)
  #opts: VPAOptions;
  // created once a participant is linked (see #linkParticipant)
  #humanInput?: HumanInput;
  // created once the agent audio track is published (see #run)
  #agentOutput?: AgentOutput;
  // resolved when the agent audio track has been published
  #trackPublishedFut = new Future();
  #pendingAgentReply?: SpeechHandle;
  #agentReplyTask?: CancellablePromise<void>;
  // speech currently being played out, if any
  #playingSpeech?: SpeechHandle;
  // accumulated final user transcript not yet committed to the chat context
  #transcribedText = '';
  #transcribedInterimText = '';
  // reopened by the playout loop after each queue flush
  #speechQueueOpen = new Future();
  #speechQueue = new AsyncIterableQueue<SpeechHandle | typeof VoicePipelineAgent.FLUSH_SENTINEL>();
  // Date.now() timestamp of the last END_OF_SPEECH event
  #lastEndOfSpeechTime?: number;
  // pending delayed state update, cancelled when a newer one arrives
  #updateStateTask?: CancellablePromise<void>;
  #started = false;
  #room?: Room;
  #participant: RemoteParticipant | string | null = null;
  // debounces end-of-turn detection before validating a pending reply
  #deferredValidation: DeferredReplyValidation;
  #logger = log();
  #agentPublication?: LocalTrackPublication;
238
+
239
  /**
   * Creates a new voice pipeline agent.
   * The agent does nothing until {@link VoicePipelineAgent.start} is called.
   */
  constructor(
    /** Voice Activity Detection instance. */
    vad: VAD,
    /** Speech-to-Text instance. */
    stt: STT,
    /** Large Language Model instance. */
    llm: LLM,
    /** Text-to-Speech instance. */
    tts: TTS,
    /** Additional VoicePipelineAgent options. */
    opts: Partial<VPAOptions> = defaultVPAOptions,
  ) {
    super();

    // merge caller overrides over the defaults
    this.#opts = { ...defaultVPAOptions, ...opts };

    this.#vad = vad;
    this.#stt = stt;
    this.#llm = llm;
    this.#tts = tts;

    // debounce end-of-turn: validates a pending reply after minEndpointingDelay
    this.#deferredValidation = new DeferredReplyValidation(
      this.#validateReplyIfPossible.bind(this),
      this.#opts.minEndpointingDelay,
    );
  }
265
+
266
  /** Function context used for LLM tool calling, if any. */
  get fncCtx(): FunctionContext | undefined {
    return this.#opts.fncCtx;
  }

  set fncCtx(ctx: FunctionContext) {
    this.#opts.fncCtx = ctx;
  }

  /** Chat context holding the conversation history. */
  get chatCtx(): ChatContext {
    // always present: defaultVPAOptions provides one when the caller omits it
    return this.#opts.chatCtx!;
  }

  get llm(): LLM {
    return this.#llm;
  }

  get tts(): TTS {
    return this.#tts;
  }

  get stt(): STT {
    return this.#stt;
  }

  get vad(): VAD {
    return this.#vad;
  }
293
+
294
+ /** Start the voice assistant. */
295
+ start(
296
+ /** The room to connect to. */
297
+ room: Room,
298
+ /**
299
+ * The participant to listen to.
300
+ *
301
+ * @remarks
302
+ * Can be a participant or an identity.
303
+ * If omitted, the first participant in the room will be selected.
304
+ */
305
+ participant: RemoteParticipant | string | null = null,
306
+ ) {
307
+ if (this.#started) {
308
+ throw new Error('voice assistant already started');
309
+ }
310
+ room.on(RoomEvent.ParticipantConnected, (participant: RemoteParticipant) => {
311
+ // automatically link to the first participant that connects, if not already linked
312
+ if (this.#participant) {
313
+ return;
314
+ }
315
+ this.#linkParticipant.call(this, participant.identity);
316
+ });
317
+
318
+ this.#room = room;
319
+ this.#participant = participant;
320
+
321
+ if (participant) {
322
+ if (typeof participant === 'string') {
323
+ this.#linkParticipant(participant);
324
+ } else {
325
+ this.#linkParticipant(participant.identity);
326
+ }
327
+ }
328
+
329
+ this.#run();
330
+ }
331
+
332
  /** Play a speech source through the voice assistant. */
  async say(
    source: string | LLMStream | AsyncIterable<string>,
    allowInterruptions = true,
    addToChatCtx = true,
  ) {
    // wait until the agent audio track has been published (see #run)
    await this.#trackPublishedFut.await;
    const newHandle = SpeechHandle.createAssistantSpeech(allowInterruptions, addToChatCtx);
    const synthesisHandle = this.#synthesizeAgentSpeech(newHandle.id, source);
    newHandle.initialize(source, synthesisHandle);
    this.#addSpeechForPlayout(newHandle);
  }
344
+
345
  /**
   * Publish the agent state as a local participant attribute, optionally after
   * `delay` ms. A newer call cancels any still-pending delayed update.
   */
  #updateState(state: AgentState, delay = 0) {
    const runTask = (delay: number): CancellablePromise<void> => {
      return new CancellablePromise(async (resolve, _, onCancel) => {
        let cancelled = false;
        onCancel(() => {
          cancelled = true;
        });
        await new Promise((resolve) => setTimeout(resolve, delay));
        if (this.#room?.isConnected) {
          if (!cancelled) {
            // NOTE(review): this sets the literal attribute key 'ATTRIBUTE_AGENT_STATE';
            // a computed key from a shared constant ([ATTRIBUTE_AGENT_STATE]: state)
            // looks intended — confirm against the client-side attribute name.
            await this.#room.localParticipant?.setAttributes({ ATTRIBUTE_AGENT_STATE: state });
          }
        }
        resolve();
      });
    };

    if (this.#updateStateTask) {
      this.#updateStateTask.cancel();
    }

    this.#updateStateTask = runTask(delay);
  }
368
+
369
+ #linkParticipant(participantIdentity: string): void {
370
+ if (!this.#room) {
371
+ this.#logger.error('Room is not set');
372
+ return;
373
+ }
374
+
375
+ this.#participant = this.#room.remoteParticipants.get(participantIdentity) || null;
376
+ if (!this.#participant) {
377
+ this.#logger.error(`Participant with identity ${participantIdentity} not found`);
378
+ return;
379
+ }
380
+
381
+ this.#humanInput = new HumanInput(this.#room, this.#vad, this.#stt, this.#participant);
382
+ this.#humanInput.on(HumanInputEvent.START_OF_SPEECH, (event) => {
383
+ this.emit(VPAEvent.USER_STARTED_SPEAKING);
384
+ this.#deferredValidation.onHumanStartOfSpeech(event);
385
+ });
386
+ this.#humanInput.on(HumanInputEvent.VAD_INFERENCE_DONE, (event) => {
387
+ if (!this.#trackPublishedFut.done) {
388
+ return;
389
+ }
390
+ if (!this.#agentOutput) {
391
+ throw new Error('agent output is undefined');
392
+ }
393
+
394
+ let tv = 1;
395
+ if (this.#opts.allowInterruptions) {
396
+ tv = Math.max(0, 1 - event.probability);
397
+ this.#agentOutput.playout.targetVolume = tv;
398
+ }
399
+
400
+ if (event.speechDuration >= this.#opts.interruptSpeechDuration) {
401
+ this.#interruptIfPossible();
402
+ }
403
+ });
404
+ this.#humanInput.on(HumanInputEvent.END_OF_SPEECH, (event) => {
405
+ this.emit(VPAEvent.USER_STARTED_SPEAKING);
406
+ this.#deferredValidation.onHumanEndOfSpeech(event);
407
+ this.#lastEndOfSpeechTime = Date.now();
408
+ });
409
+ this.#humanInput.on(HumanInputEvent.INTERIM_TRANSCRIPT, (event) => {
410
+ this.#transcribedInterimText = event.alternatives[0].text;
411
+ });
412
+ this.#humanInput.on(HumanInputEvent.FINAL_TRANSCRIPT, (event) => {
413
+ const newTranscript = event.alternatives[0].text;
414
+ if (!newTranscript) return;
415
+
416
+ this.#logger.child({ userTranscript: newTranscript }).debug('received user transcript');
417
+ this.#transcribedText += (this.#transcribedText ? ' ' : '') + newTranscript;
418
+
419
+ if (
420
+ this.#opts.preemptiveSynthesis &&
421
+ (!this.#playingSpeech || this.#playingSpeech.allowInterruptions)
422
+ ) {
423
+ this.#synthesizeAgentReply();
424
+ }
425
+
426
+ this.#deferredValidation.onHumanFinalTranscript(newTranscript);
427
+
428
+ const words = this.#opts.transcription.wordTokenizer.tokenize(newTranscript);
429
+ if (words.length >= 3) {
430
+ // VAD can sometimes not detect that the human is speaking.
431
+ // to make the interruption more reliable, we also interrupt on the final transcript.
432
+ this.#interruptIfPossible();
433
+ }
434
+ });
435
+ }
436
+
437
  /**
   * Main loop: publish the agent audio track, wire playout events, then drain
   * the speech queue forever. Runs for the lifetime of the agent.
   */
  async #run() {
    this.#updateState('initializing');
    const audioSource = new AudioSource(this.#tts.sampleRate, this.#tts.numChannels);
    const track = LocalAudioTrack.createAudioTrack('assistant_voice', audioSource);
    this.#agentPublication = await this.#room?.localParticipant?.publishTrack(
      track,
      new TrackPublishOptions({ source: TrackSource.SOURCE_MICROPHONE }),
    );

    const agentPlayout = new AgentPlayout(audioSource);
    this.#agentOutput = new AgentOutput(agentPlayout, this.#tts);

    agentPlayout.on(AgentPlayoutEvent.PLAYOUT_STARTED, () => {
      this.emit(VPAEvent.AGENT_STARTED_SPEAKING);
      this.#updateState('speaking');
    });
    // eslint-disable-next-line @typescript-eslint/no-unused-vars
    agentPlayout.on(AgentPlayoutEvent.PLAYOUT_STOPPED, (_) => {
      this.emit(VPAEvent.AGENT_STOPPED_SPEAKING);
      this.#updateState('listening');
    });

    // unblock say()/playout: the track is now published
    this.#trackPublishedFut.resolve();

    // drain queued speech; a FLUSH_SENTINEL closes the current batch and the
    // queue is reopened for the next one
    while (true) {
      await this.#speechQueueOpen.await;
      for await (const speech of this.#speechQueue) {
        if (speech === VoicePipelineAgent.FLUSH_SENTINEL) break;
        this.#playingSpeech = speech;
        await this.#playSpeech(speech);
        this.#playingSpeech = undefined;
      }
      this.#speechQueueOpen = new Future();
    }
  }
472
+
473
  /**
   * Cancel any pending reply and start synthesizing a new one from the
   * accumulated user transcript.
   */
  #synthesizeAgentReply() {
    this.#pendingAgentReply?.cancel();
    if (this.#humanInput && this.#humanInput.speaking) {
      // show "thinking" only if the user is still talking after 200ms
      this.#updateState('thinking', 200);
    }

    this.#pendingAgentReply = SpeechHandle.createAssistantReply(
      this.#opts.allowInterruptions,
      true, // addToChatCtx
      this.#transcribedText,
    );
    const newHandle = this.#pendingAgentReply;
    // chain onto the previous task so only one answer synthesis runs at a time
    this.#agentReplyTask = this.#synthesizeAnswerTask(this.#agentReplyTask, newHandle);
  }
487
+
488
+ #synthesizeAnswerTask(
489
+ oldTask: CancellablePromise<void> | undefined,
490
+ handle?: SpeechHandle,
491
+ ): CancellablePromise<void> {
492
+ return new CancellablePromise(async (resolve, _, onCancel) => {
493
+ let cancelled = false;
494
+ onCancel(() => {
495
+ cancelled = true;
496
+ });
497
+
498
+ if (oldTask) {
499
+ await gracefullyCancel(oldTask);
500
+ }
501
+
502
+ const copiedCtx = this.chatCtx.copy();
503
+ const playingSpeech = this.#playingSpeech;
504
+ if (playingSpeech && playingSpeech.initialized) {
505
+ if (
506
+ (!playingSpeech.userQuestion || playingSpeech.userCommitted) &&
507
+ !playingSpeech.speechCommitted
508
+ ) {
509
+ // the speech is playing but not committed yet,
510
+ // add it to the chat context for this new reply synthesis
511
+ copiedCtx.messages.push(
512
+ ChatMessage.create({
513
+ // TODO(nbsp): uhhh unsure where to get the played text here
514
+ // text: playingSpeech.synthesisHandle.(theres no ttsForwarder here)
515
+ role: ChatRole.ASSISTANT,
516
+ }),
517
+ );
518
+ }
519
+ }
520
+
521
+ copiedCtx.messages.push(
522
+ ChatMessage.create({
523
+ text: handle?.userQuestion,
524
+ role: ChatRole.USER,
525
+ }),
526
+ );
527
+
528
+ if (cancelled) resolve();
529
+ let llmStream = await this.#opts.beforeLLMCallback(this, copiedCtx);
530
+ if (llmStream === false) {
531
+ handle?.cancel();
532
+ return;
533
+ }
534
+
535
+ if (cancelled) resolve();
536
+ // fallback to default impl if no custom/user stream is returned
537
+ if (!(llmStream instanceof LLMStream)) {
538
+ llmStream = (await defaultBeforeLLMCallback(this, copiedCtx)) as LLMStream;
539
+ }
540
+
541
+ if (handle!.interrupted) {
542
+ return;
543
+ }
544
+
545
+ const synthesisHandle = this.#synthesizeAgentSpeech(handle!.id, llmStream);
546
+ handle!.initialize(llmStream, synthesisHandle);
547
+
548
+ // TODO(theomonnom): find a more reliable way to get the elapsed time from the last EOS
549
+ // (VAD could not have detected any speech — maybe unlikely?)
550
+ const elapsed = !!this.#lastEndOfSpeechTime
551
+ ? Math.round((Date.now() - this.#lastEndOfSpeechTime) * 1000) / 1000
552
+ : -1;
553
+
554
+ this.#logger.child({ speechId: handle!.id, elapsed }).debug('synthesizing agent reply');
555
+ resolve();
556
+ });
557
+ }
558
+
559
+ async #playSpeech(handle: SpeechHandle) {
560
+ try {
561
+ await handle.waitForInitialization();
562
+ } catch {
563
+ return;
564
+ }
565
+ await this.#agentPublication!.waitForSubscription();
566
+ const synthesisHandle = handle.synthesisHandle;
567
+ if (synthesisHandle.interrupted) return;
568
+
569
+ const userQuestion = handle.userQuestion;
570
+ const playHandle = synthesisHandle.play();
571
+ const joinFut = playHandle.join();
572
+
573
+ const commitUserQuestionIfNeeded = () => {
574
+ if (!userQuestion || synthesisHandle.interrupted || handle.userCommitted) return;
575
+ const isUsingTools =
576
+ handle.source instanceof LLMStream && !!handle.source.functionCalls.length;
577
+
578
+ // make sure at least some speech was played before committing the user message
579
+ // since we try to validate as fast as possible it is possible the agent gets interrupted
580
+ // really quickly (barely audible), we don't want to mark this question as "answered".
581
+ if (
582
+ handle.allowInterruptions &&
583
+ !isUsingTools &&
584
+ playHandle.timePlayed < this.MIN_TIME_PLAYED_FOR_COMMIT &&
585
+ !joinFut.done
586
+ ) {
587
+ return;
588
+ }
589
+
590
+ this.#logger.child({ userTranscript: userQuestion }).debug('committed user transcript');
591
+ const userMsg = ChatMessage.create({ text: userQuestion, role: ChatRole.USER });
592
+ this.chatCtx.messages.push(userMsg);
593
+ this.emit(VPAEvent.USER_SPEECH_COMMITTED, userMsg);
594
+
595
+ this.#transcribedText = this.#transcribedText.slice(userQuestion.length);
596
+ };
597
+
598
+ // wait for the playHandle to finish and check every 1s if user question should be committed
599
+ commitUserQuestionIfNeeded();
600
+
601
+ while (!joinFut.done) {
602
+ await new Promise<void>(async (resolve) => {
603
+ setTimeout(resolve, 500);
604
+ await joinFut.await;
605
+ resolve();
606
+ });
607
+ commitUserQuestionIfNeeded();
608
+ if (handle.interrupted) break;
609
+ }
610
+ commitUserQuestionIfNeeded();
611
+
612
+ // TODO(nbsp): what goes here
613
+ let collectedText = '';
614
+ const isUsingTools = handle.source instanceof LLMStream && !!handle.source.functionCalls.length;
615
+ const extraToolsMessages = []; // additional messages from the functions to add to the context
616
+ let interrupted = handle.interrupted;
617
+
618
+ // if the answer is using tools, execute the functions and automatically generate
619
+ // a response to the user question from the returned values
620
+ if (isUsingTools && !interrupted) {
621
+ if (!userQuestion || handle.userCommitted) {
622
+ throw new Error('user speech should have been committed before using tools');
623
+ }
624
+ const llmStream = handle.source;
625
+ let newFunctionCalls = llmStream.functionCalls;
626
+
627
+ for (let i = 0; i < this.#opts.maxRecursiveFncCalls; i++) {
628
+ this.emit(VPAEvent.FUNCTION_CALLS_COLLECTED, newFunctionCalls);
629
+ const calledFuncs: FunctionCallInfo[] = [];
630
+ for (const func of newFunctionCalls) {
631
+ const task = func.func.execute(func.params).then(
632
+ (result) => ({ name: func.name, toolCallId: func.toolCallId, result }),
633
+ (error) => ({ name: func.name, toolCallId: func.toolCallId, error }),
634
+ );
635
+ calledFuncs.push({ ...func, task });
636
+ this.#logger
637
+ .child({ function: func.name, speechId: handle.id })
638
+ .debug('executing AI function');
639
+ try {
640
+ await task;
641
+ } catch {
642
+ this.#logger
643
+ .child({ function: func.name, speechId: handle.id })
644
+ .error('error executing AI function');
645
+ }
646
+ }
647
+
648
+ const toolCallsInfo = [];
649
+ const toolCallsResults = [];
650
+ for (const fnc of calledFuncs) {
651
+ // ignore the function calls that return void
652
+ const task = await fnc.task;
653
+ if (!task || task.result === undefined) continue;
654
+ toolCallsInfo.push(fnc);
655
+ toolCallsResults.push(ChatMessage.createToolFromFunctionResult(task));
656
+ }
657
+
658
+ if (!toolCallsInfo.length) break;
659
+
660
+ // generate an answer from the tool calls
661
+ extraToolsMessages.push(ChatMessage.createToolCalls(toolCallsInfo, collectedText));
662
+ extraToolsMessages.push(...toolCallsResults);
663
+
664
+ const chatCtx = handle.source.chatCtx.copy();
665
+ chatCtx.messages.push(...extraToolsMessages);
666
+
667
+ const answerLLMStream = this.llm.chat({
668
+ chatCtx,
669
+ fncCtx: this.fncCtx,
670
+ });
671
+ const answerSynthesis = this.#synthesizeAgentSpeech(handle.id, answerLLMStream);
672
+ // replace the synthesis handle with the new one to allow interruption
673
+ handle.synthesisHandle = answerSynthesis;
674
+ const playHandle = answerSynthesis.play();
675
+ await playHandle.join().await;
676
+
677
+ // TODO(nbsp): what text goes here
678
+ collectedText = '';
679
+ interrupted = answerSynthesis.interrupted;
680
+ newFunctionCalls = answerLLMStream.functionCalls;
681
+
682
+ this.emit(VPAEvent.FUNCTION_CALLS_FINISHED, calledFuncs);
683
+ if (!newFunctionCalls) break;
684
+ }
685
+
686
+ if (handle.addToChatCtx && (!userQuestion || handle.userCommitted)) {
687
+ this.chatCtx.messages.push(...extraToolsMessages);
688
+ if (interrupted) {
689
+ collectedText + '…';
690
+ }
691
+
692
+ const msg = ChatMessage.create({ text: collectedText, role: ChatRole.ASSISTANT });
693
+ this.chatCtx.messages.push(msg);
694
+
695
+ handle.markSpeechCommitted();
696
+ if (interrupted) {
697
+ this.emit(VPAEvent.AGENT_SPEECH_INTERRUPTED, msg);
698
+ } else {
699
+ this.emit(VPAEvent.AGENT_SPEECH_COMMITTED, msg);
700
+ }
701
+
702
+ this.#logger
703
+ .child({
704
+ agentTranscript: collectedText,
705
+ interrupted,
706
+ speechId: handle.id,
707
+ })
708
+ .debug('committed agent speech');
709
+ }
710
+ }
711
+ }
712
+
713
#synthesizeAgentSpeech(
  speechId: string,
  source: string | LLMStream | AsyncIterable<string>,
): SynthesisHandle {
  // The agent output sink is created when the assistant starts; synthesizing
  // before that point is a programming error.
  if (!this.#agentOutput) {
    throw new Error('agent output should be initialized when ready');
  }

  // Flatten an LLM stream into a plain async stream of text tokens first.
  if (source instanceof LLMStream) {
    source = llmStreamToStringIterable(speechId, source);
  }

  const ogSource = source;
  if (typeof source !== 'string') {
    // TODO(nbsp): itertools.tee
  }

  // Let the user-provided hook rewrite the text before it reaches TTS.
  const ttsSource = this.#opts.beforeTTSCallback(this, ogSource);
  if (!ttsSource) {
    throw new Error('beforeTTSCallback must return string or AsyncIterable<string>');
  }

  return this.#agentOutput.synthesize(speechId, ttsSource);
}
737
+
738
+ async #validateReplyIfPossible() {
739
+ if (this.#playingSpeech && this.#playingSpeech.allowInterruptions) {
740
+ this.#logger
741
+ .child({ speechId: this.#playingSpeech.id })
742
+ .debug('skipping validation, agent is speaking and does not allow interruptions');
743
+ return;
744
+ }
745
+
746
+ if (!this.#pendingAgentReply) {
747
+ if (this.#opts.preemptiveSynthesis || !this.#transcribedText) {
748
+ return;
749
+ }
750
+ this.#synthesizeAgentReply();
751
+ }
752
+
753
+ if (!this.#pendingAgentReply) {
754
+ throw new Error('pending agent reply is undefined');
755
+ }
756
+
757
+ // in some bad timimg, we could end up with two pushed agent replies inside the speech queue.
758
+ // so make sure we directly interrupt every reply when validating a new one
759
+ if (this.#speechQueueOpen.done) {
760
+ for await (const speech of this.#speechQueue) {
761
+ if (speech === VoicePipelineAgent.FLUSH_SENTINEL) break;
762
+ if (!speech.isReply) continue;
763
+ if (!speech.allowInterruptions) speech.interrupt();
764
+ }
765
+ }
766
+
767
+ this.#logger.child({ speechId: this.#pendingAgentReply.id }).debug('validated agent reply');
768
+
769
+ this.#addSpeechForPlayout(this.#pendingAgentReply);
770
+ this.#pendingAgentReply = undefined;
771
+ this.#transcribedInterimText = '';
772
+ }
773
+
774
#interruptIfPossible() {
  // Only a currently playing, interruptible, not-yet-interrupted speech can
  // be cut off.
  const playing = this.#playingSpeech;
  if (!playing || !playing.allowInterruptions || playing.interrupted) {
    return;
  }

  if (this.#opts.interruptMinWords !== 0) {
    // check the final/interim transcribed text for the minimum word count
    // to interrupt the agent speech
    const interimWords = this.#opts.transcription.wordTokenizer.tokenize(
      this.#transcribedInterimText,
    );
    if (interimWords.length < this.#opts.interruptMinWords) {
      return;
    }
  }

  playing.interrupt();
}
795
+
796
#addSpeechForPlayout(handle: SpeechHandle) {
  // Enqueue the speech followed by a flush marker so the playout loop knows
  // where this batch ends, then unblock any reader waiting on the queue.
  this.#speechQueue.put(handle);
  this.#speechQueue.put(VoicePipelineAgent.FLUSH_SENTINEL);
  this.#speechQueueOpen.resolve();
}
801
+
802
+ /** Close the voice assistant. */
803
+ async close() {
804
+ if (!this.#started) {
805
+ return;
806
+ }
807
+
808
+ this.#room?.removeAllListeners(RoomEvent.ParticipantConnected);
809
+ // TODO(nbsp): await this.#deferredValidation.close()
810
+ }
811
+ }
812
+
813
+ async function* llmStreamToStringIterable(
814
+ speechId: string,
815
+ stream: LLMStream,
816
+ ): AsyncIterable<string> {
817
+ const startTime = Date.now();
818
+ let firstFrame = true;
819
+ for await (const chunk of stream) {
820
+ const content = chunk.choices[0].delta.content;
821
+ if (!content) continue;
822
+
823
+ if (firstFrame) {
824
+ firstFrame = false;
825
+ log()
826
+ .child({ speechId, elapsed: Math.round(Date.now() * 1000 - startTime) / 1000 })
827
+ .debug('received first LLM token');
828
+ }
829
+ yield content;
830
+ }
831
+ }
832
+
833
+ /** This class is used to try to find the best time to validate the agent reply. */
834
+ class DeferredReplyValidation {
835
+ // if the STT gives us punctuation, we can try to validate the reply faster.
836
+ readonly PUNCTUATION = '.!?';
837
+ readonly PUNCTUATION_REDUCE_FACTOR = 0.75;
838
+ readonly LATE_TRANSCRIPT_TOLERANCE = 1.5; // late compared to end of speech
839
+
840
+ #validateFunc: () => Promise<void>;
841
+ #validatingPromise?: Promise<void>;
842
+ #validatingFuture = new Future();
843
+ #lastFinalTranscript = '';
844
+ #lastRecvEndOfSpeechTime = 0;
845
+ #speaking = false;
846
+ #endOfSpeechDelay: number;
847
+ #finalTranscriptDelay: number;
848
+
849
+ constructor(validateFunc: () => Promise<void>, minEndpointingDelay: number) {
850
+ this.#validateFunc = validateFunc;
851
+ this.#endOfSpeechDelay = minEndpointingDelay;
852
+ this.#finalTranscriptDelay = minEndpointingDelay;
853
+ }
854
+
855
+ get validating(): boolean {
856
+ return !this.#validatingFuture.done;
857
+ }
858
+
859
+ onHumanFinalTranscript(transcript: string) {
860
+ this.#lastFinalTranscript = transcript.trim();
861
+ if (this.#speaking) return;
862
+
863
+ const hasRecentEndOfSpeech =
864
+ Date.now() - this.#lastRecvEndOfSpeechTime < this.LATE_TRANSCRIPT_TOLERANCE;
865
+ let delay = hasRecentEndOfSpeech ? this.#endOfSpeechDelay : this.#finalTranscriptDelay;
866
+ delay = this.#endWithPunctuation() ? delay * this.PUNCTUATION_REDUCE_FACTOR : 1;
867
+
868
+ this.#run(delay);
869
+ }
870
+
871
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
872
+ onHumanStartOfSpeech(_: VADEvent) {
873
+ this.#speaking = true;
874
+ // TODO(nbsp):
875
+ // if (this.validating) {
876
+ // this.#validatingPromise.cancel()
877
+ // }
878
+ }
879
+
880
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
881
+ onHumanEndOfSpeech(_: VADEvent) {
882
+ this.#speaking = false;
883
+ this.#lastRecvEndOfSpeechTime = Date.now();
884
+
885
+ if (this.#lastFinalTranscript) {
886
+ const delay = this.#endWithPunctuation()
887
+ ? this.#endOfSpeechDelay * this.PUNCTUATION_REDUCE_FACTOR
888
+ : 1;
889
+ this.#run(delay);
890
+ }
891
+ }
892
+
893
+ // TODO(nbsp): aclose
894
+
895
+ #endWithPunctuation(): boolean {
896
+ return (
897
+ this.#lastFinalTranscript.length > 0 &&
898
+ this.PUNCTUATION.includes(this.#lastFinalTranscript[this.#lastFinalTranscript.length - 1])
899
+ );
900
+ }
901
+
902
+ #resetStates() {
903
+ this.#lastFinalTranscript = '';
904
+ this.#lastRecvEndOfSpeechTime = 0;
905
+ }
906
+
907
+ #run(delay: number) {
908
+ const runTask = async (delay: number) => {
909
+ await new Promise((resolve) => setTimeout(resolve, delay));
910
+ this.#resetStates();
911
+ await this.#validateFunc();
912
+ };
913
+
914
+ this.#validatingFuture = new Future();
915
+ this.#validatingPromise = runTask(delay);
916
+ }
917
+ }