@livekit/agents 1.0.3 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. package/dist/index.cjs +2 -5
  2. package/dist/index.cjs.map +1 -1
  3. package/dist/index.d.cts +2 -3
  4. package/dist/index.d.ts +2 -3
  5. package/dist/index.d.ts.map +1 -1
  6. package/dist/index.js +1 -3
  7. package/dist/index.js.map +1 -1
  8. package/dist/tokenize/basic/hyphenator.cjs.map +1 -1
  9. package/dist/tokenize/basic/hyphenator.js.map +1 -1
  10. package/dist/utils.cjs +77 -0
  11. package/dist/utils.cjs.map +1 -1
  12. package/dist/utils.d.cts +21 -0
  13. package/dist/utils.d.ts +21 -0
  14. package/dist/utils.d.ts.map +1 -1
  15. package/dist/utils.js +76 -1
  16. package/dist/utils.js.map +1 -1
  17. package/dist/voice/agent_activity.cjs +107 -71
  18. package/dist/voice/agent_activity.cjs.map +1 -1
  19. package/dist/voice/agent_activity.d.ts.map +1 -1
  20. package/dist/voice/agent_activity.js +107 -71
  21. package/dist/voice/agent_activity.js.map +1 -1
  22. package/dist/voice/avatar/datastream_io.cjs +204 -0
  23. package/dist/voice/avatar/datastream_io.cjs.map +1 -0
  24. package/dist/voice/avatar/datastream_io.d.cts +37 -0
  25. package/dist/voice/avatar/datastream_io.d.ts +37 -0
  26. package/dist/voice/avatar/datastream_io.d.ts.map +1 -0
  27. package/dist/voice/avatar/datastream_io.js +188 -0
  28. package/dist/voice/avatar/datastream_io.js.map +1 -0
  29. package/dist/{multimodal → voice/avatar}/index.cjs +4 -4
  30. package/dist/voice/avatar/index.cjs.map +1 -0
  31. package/dist/voice/avatar/index.d.cts +2 -0
  32. package/dist/voice/avatar/index.d.ts +2 -0
  33. package/dist/voice/avatar/index.d.ts.map +1 -0
  34. package/dist/voice/avatar/index.js +2 -0
  35. package/dist/voice/avatar/index.js.map +1 -0
  36. package/dist/voice/index.cjs +2 -0
  37. package/dist/voice/index.cjs.map +1 -1
  38. package/dist/voice/index.d.cts +1 -0
  39. package/dist/voice/index.d.ts +1 -0
  40. package/dist/voice/index.d.ts.map +1 -1
  41. package/dist/voice/index.js +1 -0
  42. package/dist/voice/index.js.map +1 -1
  43. package/dist/voice/io.cjs.map +1 -1
  44. package/dist/voice/io.d.cts +1 -1
  45. package/dist/voice/io.d.ts +1 -1
  46. package/dist/voice/io.d.ts.map +1 -1
  47. package/dist/voice/io.js.map +1 -1
  48. package/dist/voice/room_io/_input.cjs +3 -1
  49. package/dist/voice/room_io/_input.cjs.map +1 -1
  50. package/dist/voice/room_io/_input.d.ts.map +1 -1
  51. package/dist/voice/room_io/_input.js +3 -1
  52. package/dist/voice/room_io/_input.js.map +1 -1
  53. package/dist/voice/run_context.cjs +13 -0
  54. package/dist/voice/run_context.cjs.map +1 -1
  55. package/dist/voice/run_context.d.cts +10 -0
  56. package/dist/voice/run_context.d.ts +10 -0
  57. package/dist/voice/run_context.d.ts.map +1 -1
  58. package/dist/voice/run_context.js +13 -0
  59. package/dist/voice/run_context.js.map +1 -1
  60. package/dist/voice/speech_handle.cjs +152 -30
  61. package/dist/voice/speech_handle.cjs.map +1 -1
  62. package/dist/voice/speech_handle.d.cts +67 -16
  63. package/dist/voice/speech_handle.d.ts +67 -16
  64. package/dist/voice/speech_handle.d.ts.map +1 -1
  65. package/dist/voice/speech_handle.js +153 -31
  66. package/dist/voice/speech_handle.js.map +1 -1
  67. package/dist/worker.cjs +4 -1
  68. package/dist/worker.cjs.map +1 -1
  69. package/dist/worker.d.ts.map +1 -1
  70. package/dist/worker.js +4 -1
  71. package/dist/worker.js.map +1 -1
  72. package/package.json +2 -2
  73. package/src/index.ts +2 -3
  74. package/src/tokenize/basic/hyphenator.ts +1 -1
  75. package/src/utils.ts +121 -1
  76. package/src/voice/agent_activity.ts +122 -78
  77. package/src/voice/avatar/datastream_io.ts +247 -0
  78. package/src/voice/avatar/index.ts +4 -0
  79. package/src/voice/index.ts +2 -0
  80. package/src/voice/io.ts +1 -1
  81. package/src/voice/room_io/_input.ts +9 -3
  82. package/src/voice/run_context.ts +16 -2
  83. package/src/voice/speech_handle.ts +183 -38
  84. package/src/worker.ts +5 -1
  85. package/dist/multimodal/agent_playout.cjs +0 -233
  86. package/dist/multimodal/agent_playout.cjs.map +0 -1
  87. package/dist/multimodal/agent_playout.d.cts +0 -34
  88. package/dist/multimodal/agent_playout.d.ts +0 -34
  89. package/dist/multimodal/agent_playout.d.ts.map +0 -1
  90. package/dist/multimodal/agent_playout.js +0 -207
  91. package/dist/multimodal/agent_playout.js.map +0 -1
  92. package/dist/multimodal/index.cjs.map +0 -1
  93. package/dist/multimodal/index.d.cts +0 -2
  94. package/dist/multimodal/index.d.ts +0 -2
  95. package/dist/multimodal/index.d.ts.map +0 -1
  96. package/dist/multimodal/index.js +0 -2
  97. package/dist/multimodal/index.js.map +0 -1
  98. package/src/multimodal/agent_playout.ts +0 -266
  99. package/src/multimodal/index.ts +0 -4
@@ -82,7 +82,7 @@ export class AgentActivity implements RecognitionHooks {
82
82
  private _currentSpeech?: SpeechHandle;
83
83
  private speechQueue: Heap<[number, number, SpeechHandle]>; // [priority, timestamp, speechHandle]
84
84
  private q_updated: Future;
85
- private speechTasks: Set<Promise<unknown>> = new Set();
85
+ private speechTasks: Set<Task<void>> = new Set();
86
86
  private lock = new Mutex();
87
87
  private audioStream = new DeferredReadableStream<AudioFrame>();
88
88
  // default to null as None, which maps to the default provider tool choice value
@@ -269,7 +269,7 @@ export class AgentActivity implements RecognitionHooks {
269
269
 
270
270
  this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
271
271
  this.createSpeechTask({
272
- promise: this.agent.onEnter(),
272
+ task: Task.from(() => this.agent.onEnter()),
273
273
  name: 'AgentActivity_onEnter',
274
274
  });
275
275
  } finally {
@@ -441,9 +441,10 @@ export class AgentActivity implements RecognitionHooks {
441
441
  speechHandle: handle,
442
442
  }),
443
443
  );
444
-
445
444
  const task = this.createSpeechTask({
446
- promise: this.ttsTask(handle, text, addToChatCtx, {}, audio),
445
+ task: Task.from((abortController: AbortController) =>
446
+ this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio),
447
+ ),
447
448
  ownedSpeechHandle: handle,
448
449
  name: 'AgentActivity.say_tts',
449
450
  });
@@ -572,7 +573,9 @@ export class AgentActivity implements RecognitionHooks {
572
573
  this.logger.info({ speech_id: handle.id }, 'Creating speech handle');
573
574
 
574
575
  this.createSpeechTask({
575
- promise: this.realtimeGenerationTask(handle, ev, {}),
576
+ task: Task.from((abortController: AbortController) =>
577
+ this.realtimeGenerationTask(handle, ev, {}, abortController),
578
+ ),
576
579
  ownedSpeechHandle: handle,
577
580
  name: 'AgentActivity.realtimeGeneration',
578
581
  });
@@ -659,26 +662,32 @@ export class AgentActivity implements RecognitionHooks {
659
662
  );
660
663
  }
661
664
 
662
- private createSpeechTask<T>(options: {
663
- promise: Promise<T>;
665
+ private createSpeechTask(options: {
666
+ task: Task<void>;
664
667
  ownedSpeechHandle?: SpeechHandle;
665
668
  name?: string;
666
- }): Promise<T> {
667
- const { promise, ownedSpeechHandle } = options;
668
-
669
- this.speechTasks.add(promise);
669
+ }): Promise<void> {
670
+ const { task, ownedSpeechHandle } = options;
670
671
 
671
- promise.finally(() => {
672
- this.speechTasks.delete(promise);
672
+ this.speechTasks.add(task);
673
+ task.addDoneCallback(() => {
674
+ this.speechTasks.delete(task);
675
+ });
673
676
 
674
- if (ownedSpeechHandle) {
675
- ownedSpeechHandle._markPlayoutDone();
676
- }
677
+ if (ownedSpeechHandle) {
678
+ ownedSpeechHandle._tasks.push(task);
679
+ task.addDoneCallback(() => {
680
+ if (ownedSpeechHandle._tasks.every((t) => t.done)) {
681
+ ownedSpeechHandle._markDone();
682
+ }
683
+ });
684
+ }
677
685
 
686
+ task.addDoneCallback(() => {
678
687
  this.wakeupMainTask();
679
688
  });
680
689
 
681
- return promise;
690
+ return task.result;
682
691
  }
683
692
 
684
693
  async onEndOfTurn(info: EndOfTurnInfo): Promise<boolean> {
@@ -705,7 +714,7 @@ export class AgentActivity implements RecognitionHooks {
705
714
 
706
715
  const oldTask = this._userTurnCompletedTask;
707
716
  this._userTurnCompletedTask = this.createSpeechTask({
708
- promise: this.userTurnCompleted(info, oldTask),
717
+ task: Task.from(() => this.userTurnCompleted(info, oldTask)),
709
718
  name: 'AgentActivity.userTurnCompleted',
710
719
  });
711
720
  return true;
@@ -736,8 +745,8 @@ export class AgentActivity implements RecognitionHooks {
736
745
  }
737
746
  const speechHandle = heapItem[2];
738
747
  this._currentSpeech = speechHandle;
739
- speechHandle._authorizePlayout();
740
- await speechHandle.waitForPlayout();
748
+ speechHandle._authorizeGeneration();
749
+ await speechHandle._waitForGeneration();
741
750
  this._currentSpeech = undefined;
742
751
  }
743
752
 
@@ -815,16 +824,19 @@ export class AgentActivity implements RecognitionHooks {
815
824
 
816
825
  if (this.llm instanceof RealtimeModel) {
817
826
  this.createSpeechTask({
818
- promise: this.realtimeReplyTask({
819
- speechHandle: handle,
820
- // TODO(brian): support llm.ChatMessage for the realtime model
821
- userInput: userMessage?.textContent,
822
- instructions,
823
- modelSettings: {
824
- // isGiven(toolChoice) = toolChoice !== undefined
825
- toolChoice: toOaiToolChoice(toolChoice !== undefined ? toolChoice : this.toolChoice),
826
- },
827
- }),
827
+ task: Task.from((abortController: AbortController) =>
828
+ this.realtimeReplyTask({
829
+ speechHandle: handle,
830
+ // TODO(brian): support llm.ChatMessage for the realtime model
831
+ userInput: userMessage?.textContent,
832
+ instructions,
833
+ modelSettings: {
834
+ // isGiven(toolChoice) = toolChoice !== undefined
835
+ toolChoice: toOaiToolChoice(toolChoice !== undefined ? toolChoice : this.toolChoice),
836
+ },
837
+ abortController,
838
+ }),
839
+ ),
828
840
  ownedSpeechHandle: handle,
829
841
  name: 'AgentActivity.realtimeReply',
830
842
  });
@@ -837,13 +849,18 @@ export class AgentActivity implements RecognitionHooks {
837
849
  }
838
850
 
839
851
  const task = this.createSpeechTask({
840
- promise: this.pipelineReplyTask(
841
- handle,
842
- chatCtx ?? this.agent.chatCtx,
843
- this.agent.toolCtx,
844
- { toolChoice: toOaiToolChoice(toolChoice !== undefined ? toolChoice : this.toolChoice) },
845
- instructions ? `${this.agent.instructions}\n${instructions}` : instructions,
846
- userMessage,
852
+ task: Task.from((abortController: AbortController) =>
853
+ this.pipelineReplyTask(
854
+ handle,
855
+ chatCtx ?? this.agent.chatCtx,
856
+ this.agent.toolCtx,
857
+ {
858
+ toolChoice: toOaiToolChoice(toolChoice !== undefined ? toolChoice : this.toolChoice),
859
+ },
860
+ abortController,
861
+ instructions ? `${this.agent.instructions}\n${instructions}` : instructions,
862
+ userMessage,
863
+ ),
847
864
  ),
848
865
  ownedSpeechHandle: handle,
849
866
  name: 'AgentActivity.pipelineReply',
@@ -860,6 +877,8 @@ export class AgentActivity implements RecognitionHooks {
860
877
  const future = new Future<void>();
861
878
  const currentSpeech = this._currentSpeech;
862
879
 
880
+ //TODO(AJS-273): add interrupt for background speeches
881
+
863
882
  currentSpeech?.interrupt();
864
883
 
865
884
  for (const [_, __, speech] of this.speechQueue) {
@@ -871,7 +890,7 @@ export class AgentActivity implements RecognitionHooks {
871
890
  if (currentSpeech === undefined) {
872
891
  future.resolve();
873
892
  } else {
874
- currentSpeech.then(() => {
893
+ currentSpeech.addDoneCallback(() => {
875
894
  if (future.done) return;
876
895
  future.resolve();
877
896
  });
@@ -881,7 +900,7 @@ export class AgentActivity implements RecognitionHooks {
881
900
  }
882
901
 
883
902
  private onPipelineReplyDone(): void {
884
- if (!this.speechQueue.peek() && (!this._currentSpeech || this._currentSpeech.done)) {
903
+ if (!this.speechQueue.peek() && (!this._currentSpeech || this._currentSpeech.done())) {
885
904
  this.agentSession._updateAgentState('listening');
886
905
  }
887
906
  }
@@ -980,6 +999,7 @@ export class AgentActivity implements RecognitionHooks {
980
999
  text: string | ReadableStream<string>,
981
1000
  addToChatCtx: boolean,
982
1001
  modelSettings: ModelSettings,
1002
+ replyAbortController: AbortController,
983
1003
  audio?: ReadableStream<AudioFrame> | null,
984
1004
  ): Promise<void> {
985
1005
  speechHandleStorage.enterWith(speechHandle);
@@ -992,7 +1012,6 @@ export class AgentActivity implements RecognitionHooks {
992
1012
  ? this.agentSession.output.audio
993
1013
  : null;
994
1014
 
995
- const replyAbortController = new AbortController();
996
1015
  await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
997
1016
 
998
1017
  if (speechHandle.interrupted) {
@@ -1102,14 +1121,13 @@ export class AgentActivity implements RecognitionHooks {
1102
1121
  chatCtx: ChatContext,
1103
1122
  toolCtx: ToolContext,
1104
1123
  modelSettings: ModelSettings,
1124
+ replyAbortController: AbortController,
1105
1125
  instructions?: string,
1106
1126
  newMessage?: ChatMessage,
1107
1127
  toolsMessages?: ChatItem[],
1108
1128
  ): Promise<void> {
1109
1129
  speechHandleStorage.enterWith(speechHandle);
1110
1130
 
1111
- const replyAbortController = new AbortController();
1112
-
1113
1131
  const audioOutput = this.agentSession.output.audioEnabled
1114
1132
  ? this.agentSession.output.audio
1115
1133
  : null;
@@ -1163,13 +1181,19 @@ export class AgentActivity implements RecognitionHooks {
1163
1181
  tasks.push(ttsTask);
1164
1182
  }
1165
1183
 
1166
- await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
1184
+ await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
1185
+
1167
1186
  if (speechHandle.interrupted) {
1168
1187
  replyAbortController.abort();
1169
1188
  await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
1170
1189
  return;
1171
1190
  }
1172
1191
 
1192
+ this.agentSession._updateAgentState('thinking');
1193
+
1194
+ await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
1195
+ speechHandle._clearAuthorization();
1196
+
1173
1197
  const replyStartedAt = Date.now();
1174
1198
  const trNodeResult = await this.agent.transcriptionNode(llmOutput, modelSettings);
1175
1199
  let textOut: _TextOut | null = null;
@@ -1205,6 +1229,9 @@ export class AgentActivity implements RecognitionHooks {
1205
1229
  textOut?.firstTextFut.await.finally(onFirstFrame);
1206
1230
  }
1207
1231
 
1232
+ //TODO(AJS-272): before executing tools, make sure we generated all the text
1233
+ // (this ensure everything is kept ordered)
1234
+
1208
1235
  const onToolExecutionStarted = (_: FunctionCall) => {
1209
1236
  // TODO(brian): handle speech_handle item_added
1210
1237
  };
@@ -1223,7 +1250,6 @@ export class AgentActivity implements RecognitionHooks {
1223
1250
  onToolExecutionStarted,
1224
1251
  onToolExecutionCompleted,
1225
1252
  });
1226
- tasks.push(executeToolsTask);
1227
1253
 
1228
1254
  await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
1229
1255
 
@@ -1290,7 +1316,7 @@ export class AgentActivity implements RecognitionHooks {
1290
1316
  'playout completed with interrupt',
1291
1317
  );
1292
1318
  // TODO(shubhra) add chat message to speech handle
1293
- speechHandle._markPlayoutDone();
1319
+ speechHandle._markGenerationDone();
1294
1320
  await executeToolsTask.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
1295
1321
  return;
1296
1322
  }
@@ -1318,14 +1344,15 @@ export class AgentActivity implements RecognitionHooks {
1318
1344
  this.agentSession._updateAgentState('listening');
1319
1345
  }
1320
1346
 
1321
- speechHandle._markPlayoutDone();
1347
+ // mark the playout done before waiting for the tool execution
1348
+ speechHandle._markGenerationDone();
1322
1349
  await executeToolsTask.result;
1323
1350
 
1324
1351
  if (toolOutput.output.length === 0) return;
1325
1352
 
1326
1353
  // important: no agent output should be used after this point
1327
1354
  const { maxToolSteps } = this.agentSession.options;
1328
- if (speechHandle.stepIndex >= maxToolSteps) {
1355
+ if (speechHandle.numSteps >= maxToolSteps) {
1329
1356
  this.logger.warn(
1330
1357
  { speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
1331
1358
  'maximum number of function calls steps reached',
@@ -1390,7 +1417,7 @@ export class AgentActivity implements RecognitionHooks {
1390
1417
 
1391
1418
  const handle = SpeechHandle.create({
1392
1419
  allowInterruptions: speechHandle.allowInterruptions,
1393
- stepIndex: speechHandle.stepIndex + 1,
1420
+ stepIndex: speechHandle._stepIndex + 1,
1394
1421
  parent: speechHandle,
1395
1422
  });
1396
1423
  this.agentSession.emit(
@@ -1407,14 +1434,17 @@ export class AgentActivity implements RecognitionHooks {
1407
1434
  const respondToolChoice = draining || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
1408
1435
 
1409
1436
  const toolResponseTask = this.createSpeechTask({
1410
- promise: this.pipelineReplyTask(
1411
- handle,
1412
- chatCtx,
1413
- toolCtx,
1414
- { toolChoice: respondToolChoice },
1415
- instructions,
1416
- undefined,
1417
- toolMessages,
1437
+ task: Task.from(() =>
1438
+ this.pipelineReplyTask(
1439
+ handle,
1440
+ chatCtx,
1441
+ toolCtx,
1442
+ { toolChoice: respondToolChoice },
1443
+ replyAbortController,
1444
+ instructions,
1445
+ undefined,
1446
+ toolMessages,
1447
+ ),
1418
1448
  ),
1419
1449
  ownedSpeechHandle: handle,
1420
1450
  name: 'AgentActivity.pipelineReply',
@@ -1435,6 +1465,7 @@ export class AgentActivity implements RecognitionHooks {
1435
1465
  speechHandle: SpeechHandle,
1436
1466
  ev: GenerationCreatedEvent,
1437
1467
  modelSettings: ModelSettings,
1468
+ replyAbortController: AbortController,
1438
1469
  ): Promise<void> {
1439
1470
  speechHandleStorage.enterWith(speechHandle);
1440
1471
 
@@ -1446,7 +1477,7 @@ export class AgentActivity implements RecognitionHooks {
1446
1477
  }
1447
1478
 
1448
1479
  this.logger.debug(
1449
- { speech_id: speechHandle.id, stepIndex: speechHandle.stepIndex },
1480
+ { speech_id: speechHandle.id, stepIndex: speechHandle.numSteps },
1450
1481
  'realtime generation started',
1451
1482
  );
1452
1483
 
@@ -1459,6 +1490,7 @@ export class AgentActivity implements RecognitionHooks {
1459
1490
  const toolCtx = this.realtimeSession.tools;
1460
1491
 
1461
1492
  await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
1493
+ speechHandle._clearAuthorization();
1462
1494
 
1463
1495
  if (speechHandle.interrupted) {
1464
1496
  return;
@@ -1468,8 +1500,6 @@ export class AgentActivity implements RecognitionHooks {
1468
1500
  this.agentSession._updateAgentState('speaking');
1469
1501
  };
1470
1502
 
1471
- const replyAbortController = new AbortController();
1472
-
1473
1503
  const readMessages = async (
1474
1504
  abortController: AbortController,
1475
1505
  outputs: Array<[string, _TextOut | null, _AudioOut | null]>,
@@ -1566,12 +1596,14 @@ export class AgentActivity implements RecognitionHooks {
1566
1596
  ),
1567
1597
  );
1568
1598
 
1569
- const onToolExecutionStarted = (_: FunctionCall) => {
1570
- // TODO(brian): handle speech_handle item_added
1599
+ const onToolExecutionStarted = (f: FunctionCall) => {
1600
+ speechHandle._itemAdded([f]);
1571
1601
  };
1572
1602
 
1573
- const onToolExecutionCompleted = (_: ToolExecutionOutput) => {
1574
- // TODO(brian): handle speech_handle item_added
1603
+ const onToolExecutionCompleted = (out: ToolExecutionOutput) => {
1604
+ if (out.toolCallOutput) {
1605
+ speechHandle._itemAdded([out.toolCallOutput]);
1606
+ }
1575
1607
  };
1576
1608
 
1577
1609
  const [executeToolsTask, toolOutput] = performToolExecutions({
@@ -1640,7 +1672,7 @@ export class AgentActivity implements RecognitionHooks {
1640
1672
  interrupted: true,
1641
1673
  });
1642
1674
  this.agent._chatCtx.insert(message);
1643
- speechHandle._setChatMessage(message);
1675
+ speechHandle._itemAdded([message]);
1644
1676
  this.agentSession._conversationItemAdded(message);
1645
1677
 
1646
1678
  // TODO(brian): add tracing span
@@ -1650,8 +1682,7 @@ export class AgentActivity implements RecognitionHooks {
1650
1682
  'playout completed with interrupt',
1651
1683
  );
1652
1684
  }
1653
- // TODO(shubhra) add chat message to speech handle
1654
- speechHandle._markPlayoutDone();
1685
+ speechHandle._markGenerationDone();
1655
1686
  await executeToolsTask.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
1656
1687
 
1657
1688
  // TODO(brian): close tees
@@ -1668,13 +1699,13 @@ export class AgentActivity implements RecognitionHooks {
1668
1699
  interrupted: false,
1669
1700
  });
1670
1701
  this.agent._chatCtx.insert(message);
1671
- speechHandle._setChatMessage(message);
1702
+ speechHandle._itemAdded([message]);
1672
1703
  this.agentSession._conversationItemAdded(message); // mark the playout done before waiting for the tool execution\
1673
1704
  // TODO(brian): add tracing span
1674
1705
  }
1675
1706
 
1676
1707
  // mark the playout done before waiting for the tool execution
1677
- speechHandle._markPlayoutDone();
1708
+ speechHandle._markGenerationDone();
1678
1709
  // TODO(brian): close tees
1679
1710
 
1680
1711
  toolOutput.firstToolStartedFuture.await.finally(() => {
@@ -1687,7 +1718,7 @@ export class AgentActivity implements RecognitionHooks {
1687
1718
 
1688
1719
  // important: no agent ouput should be used after this point
1689
1720
  const { maxToolSteps } = this.agentSession.options;
1690
- if (speechHandle.stepIndex >= maxToolSteps) {
1721
+ if (speechHandle.numSteps >= maxToolSteps) {
1691
1722
  this.logger.warn(
1692
1723
  { speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
1693
1724
  'maximum number of function calls steps reached',
@@ -1763,7 +1794,7 @@ export class AgentActivity implements RecognitionHooks {
1763
1794
 
1764
1795
  const replySpeechHandle = SpeechHandle.create({
1765
1796
  allowInterruptions: speechHandle.allowInterruptions,
1766
- stepIndex: speechHandle.stepIndex + 1,
1797
+ stepIndex: speechHandle.numSteps + 1,
1767
1798
  parent: speechHandle,
1768
1799
  });
1769
1800
  this.agentSession.emit(
@@ -1777,10 +1808,13 @@ export class AgentActivity implements RecognitionHooks {
1777
1808
 
1778
1809
  const toolChoice = draining || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
1779
1810
  this.createSpeechTask({
1780
- promise: this.realtimeReplyTask({
1781
- speechHandle: replySpeechHandle,
1782
- modelSettings: { toolChoice },
1783
- }),
1811
+ task: Task.from((abortController: AbortController) =>
1812
+ this.realtimeReplyTask({
1813
+ speechHandle: replySpeechHandle,
1814
+ modelSettings: { toolChoice },
1815
+ abortController,
1816
+ }),
1817
+ ),
1784
1818
  ownedSpeechHandle: replySpeechHandle,
1785
1819
  name: 'AgentActivity.realtime_reply',
1786
1820
  });
@@ -1793,9 +1827,11 @@ export class AgentActivity implements RecognitionHooks {
1793
1827
  modelSettings: { toolChoice },
1794
1828
  userInput,
1795
1829
  instructions,
1830
+ abortController,
1796
1831
  }: {
1797
1832
  speechHandle: SpeechHandle;
1798
1833
  modelSettings: ModelSettings;
1834
+ abortController: AbortController;
1799
1835
  userInput?: string;
1800
1836
  instructions?: string;
1801
1837
  }): Promise<void> {
@@ -1825,7 +1861,12 @@ export class AgentActivity implements RecognitionHooks {
1825
1861
 
1826
1862
  try {
1827
1863
  const generationEvent = await this.realtimeSession.generateReply(instructions);
1828
- await this.realtimeGenerationTask(speechHandle, generationEvent, { toolChoice });
1864
+ await this.realtimeGenerationTask(
1865
+ speechHandle,
1866
+ generationEvent,
1867
+ { toolChoice },
1868
+ abortController,
1869
+ );
1829
1870
  } finally {
1830
1871
  // reset toolChoice value
1831
1872
  if (toolChoice !== undefined && toolChoice !== originalToolChoice) {
@@ -1837,14 +1878,17 @@ export class AgentActivity implements RecognitionHooks {
1837
1878
  private scheduleSpeech(
1838
1879
  speechHandle: SpeechHandle,
1839
1880
  priority: number,
1840
- bypassDraining: boolean = false,
1881
+ force: boolean = false,
1841
1882
  ): void {
1842
- if (this.draining && !bypassDraining) {
1883
+ // when force=true, we allow tool responses to bypass draining
1884
+ // This allows for tool responses to be generated before the AgentActivity is finalized
1885
+ if (this.draining && !force) {
1843
1886
  throw new Error('cannot schedule new speech, the agent is draining');
1844
1887
  }
1845
1888
 
1846
1889
  // Monotonic time to avoid near 0 collisions
1847
1890
  this.speechQueue.push([priority, Number(process.hrtime.bigint()), speechHandle]);
1891
+ speechHandle._markScheduled();
1848
1892
  this.wakeupMainTask();
1849
1893
  }
1850
1894
 
@@ -1854,7 +1898,7 @@ export class AgentActivity implements RecognitionHooks {
1854
1898
  if (this._draining) return;
1855
1899
 
1856
1900
  this.createSpeechTask({
1857
- promise: this.agent.onExit(),
1901
+ task: Task.from(() => this.agent.onExit()),
1858
1902
  name: 'AgentActivity_onExit',
1859
1903
  });
1860
1904