npm - @livekit/agents - Versions diffs - 1.0.38 → 1.0.40 - Mend

@livekit/agents 1.0.38 → 1.0.40

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (80)

package/dist/http_server.cjs +9 -6
package/dist/http_server.cjs.map +1 -1
package/dist/http_server.d.cts +5 -1
package/dist/http_server.d.ts +5 -1
package/dist/http_server.d.ts.map +1 -1
package/dist/http_server.js +9 -6
package/dist/http_server.js.map +1 -1
package/dist/inference/llm.cjs +7 -3
package/dist/inference/llm.cjs.map +1 -1
package/dist/inference/llm.d.cts +5 -6
package/dist/inference/llm.d.ts +5 -6
package/dist/inference/llm.d.ts.map +1 -1
package/dist/inference/llm.js +7 -3
package/dist/inference/llm.js.map +1 -1
package/dist/inference/stt.cjs.map +1 -1
package/dist/inference/stt.d.cts +5 -4
package/dist/inference/stt.d.ts +5 -4
package/dist/inference/stt.d.ts.map +1 -1
package/dist/inference/stt.js.map +1 -1
package/dist/inference/tts.cjs.map +1 -1
package/dist/inference/tts.d.cts +10 -7
package/dist/inference/tts.d.ts +10 -7
package/dist/inference/tts.d.ts.map +1 -1
package/dist/inference/tts.js.map +1 -1
package/dist/ipc/supervised_proc.cjs +4 -0
package/dist/ipc/supervised_proc.cjs.map +1 -1
package/dist/ipc/supervised_proc.d.cts +1 -0
package/dist/ipc/supervised_proc.d.ts +1 -0
package/dist/ipc/supervised_proc.d.ts.map +1 -1
package/dist/ipc/supervised_proc.js +4 -0
package/dist/ipc/supervised_proc.js.map +1 -1
package/dist/stt/stream_adapter.cjs +9 -1
package/dist/stt/stream_adapter.cjs.map +1 -1
package/dist/stt/stream_adapter.d.ts.map +1 -1
package/dist/stt/stream_adapter.js +9 -1
package/dist/stt/stream_adapter.js.map +1 -1
package/dist/tokenize/basic/sentence.cjs +3 -3
package/dist/tokenize/basic/sentence.cjs.map +1 -1
package/dist/tokenize/basic/sentence.js +3 -3
package/dist/tokenize/basic/sentence.js.map +1 -1
package/dist/tokenize/tokenizer.test.cjs +3 -1
package/dist/tokenize/tokenizer.test.cjs.map +1 -1
package/dist/tokenize/tokenizer.test.js +3 -1
package/dist/tokenize/tokenizer.test.js.map +1 -1
package/dist/utils.cjs +5 -0
package/dist/utils.cjs.map +1 -1
package/dist/utils.d.cts +8 -0
package/dist/utils.d.ts +8 -0
package/dist/utils.d.ts.map +1 -1
package/dist/utils.js +4 -0
package/dist/utils.js.map +1 -1
package/dist/voice/agent.cjs +1 -2
package/dist/voice/agent.cjs.map +1 -1
package/dist/voice/agent.js +1 -2
package/dist/voice/agent.js.map +1 -1
package/dist/voice/agent_activity.cjs +23 -14
package/dist/voice/agent_activity.cjs.map +1 -1
package/dist/voice/agent_activity.d.cts +1 -0
package/dist/voice/agent_activity.d.ts +1 -0
package/dist/voice/agent_activity.d.ts.map +1 -1
package/dist/voice/agent_activity.js +23 -14
package/dist/voice/agent_activity.js.map +1 -1
package/dist/worker.cjs +12 -2
package/dist/worker.cjs.map +1 -1
package/dist/worker.d.ts.map +1 -1
package/dist/worker.js +12 -2
package/dist/worker.js.map +1 -1
package/package.json +2 -2
package/src/http_server.ts +18 -6
package/src/inference/llm.ts +20 -15
package/src/inference/stt.ts +9 -7
package/src/inference/tts.ts +36 -16
package/src/ipc/supervised_proc.ts +4 -0
package/src/stt/stream_adapter.ts +12 -1
package/src/tokenize/basic/sentence.ts +3 -3
package/src/tokenize/tokenizer.test.ts +4 -0
package/src/utils.ts +14 -0
package/src/voice/agent.ts +2 -2
package/src/voice/agent_activity.ts +36 -15
package/src/worker.ts +24 -2

package/src/inference/tts.ts CHANGED Viewed

@@ -23,22 +23,27 @@ import {
 import { type AnyString, connectWs, createAccessToken } from './utils.js';
 export type CartesiaModels =
-  | 'cartesia'
-  | 'cartesia/sonic'
+  | 'cartesia/sonic-3'
   | 'cartesia/sonic-2'
-  | 'cartesia/sonic-turbo';
+  | 'cartesia/sonic-turbo'
+  | 'cartesia/sonic';
+export type DeepgramTTSModels = 'deepgram/aura' | 'deepgram/aura-2';
 export type ElevenlabsModels =
-  | 'elevenlabs'
   | 'elevenlabs/eleven_flash_v2'
   | 'elevenlabs/eleven_flash_v2_5'
   | 'elevenlabs/eleven_turbo_v2'
   | 'elevenlabs/eleven_turbo_v2_5'
   | 'elevenlabs/eleven_multilingual_v2';
-export type RimeModels = 'rime' | 'rime/mist' | 'rime/mistv2' | 'rime/arcana';
+export type InworldModels =
+  | 'inworld/inworld-tts-1.5-max'
+  | 'inworld/inworld-tts-1.5-mini'
+  | 'inworld/inworld-tts-1-max'
+  | 'inworld/inworld-tts-1';
-export type InworldModels = 'inworld' | 'inworld/inworld-tts-1';
+export type RimeModels = 'rime/arcana' | 'rime/mistv2';
 export interface CartesiaOptions {
   duration?: number; // max duration of audio in seconds
@@ -50,25 +55,40 @@ export interface ElevenlabsOptions {
   apply_text_normalization?: 'auto' | 'off' | 'on'; // default: "auto"
 }
+export interface DeepgramTTSOptions {}
 export interface RimeOptions {}
 export interface InworldOptions {}
-type _TTSModels = CartesiaModels | ElevenlabsModels | RimeModels | InworldModels;
-export type TTSModels = CartesiaModels | ElevenlabsModels | RimeModels | InworldModels | AnyString;
+type _TTSModels =
+  | CartesiaModels
+  | DeepgramTTSModels
+  | ElevenlabsModels
+  | RimeModels
+  | InworldModels;
+export type TTSModels =
+  | CartesiaModels
+  | DeepgramTTSModels
+  | ElevenlabsModels
+  | RimeModels
+  | InworldModels
+  | AnyString;
 export type ModelWithVoice = `${_TTSModels}:${string}` | TTSModels;
 export type TTSOptions<TModel extends TTSModels> = TModel extends CartesiaModels
   ? CartesiaOptions
-  : TModel extends ElevenlabsModels
-    ? ElevenlabsOptions
-    : TModel extends RimeOptions
-      ? RimeOptions
-      : TModel extends InworldOptions
-        ? InworldOptions
-        : Record<string, unknown>;
+  : TModel extends DeepgramTTSModels
+    ? DeepgramTTSOptions
+    : TModel extends ElevenlabsModels
+      ? ElevenlabsOptions
+      : TModel extends RimeModels
+        ? RimeOptions
+        : TModel extends InworldModels
+          ? InworldOptions
+          : Record<string, unknown>;
 type TTSEncoding = 'pcm_s16le';

package/src/ipc/supervised_proc.ts CHANGED Viewed

@@ -59,6 +59,10 @@ export abstract class SupervisedProc {
     return this.#started;
   }
+  get isAlive(): boolean {
+    return this.#started && !this.#closing && !!this.proc?.connected;
+  }
   get runningJob(): RunningJobInfo | undefined {
     return this.#runningJob;
   }

package/src/stt/stream_adapter.ts CHANGED Viewed

@@ -4,6 +4,7 @@
 import type { AudioFrame } from '@livekit/rtc-node';
 import { log } from '../log.js';
 import type { APIConnectOptions } from '../types.js';
+import { isStreamClosedError } from '../utils.js';
 import type { VAD, VADStream } from '../vad.js';
 import { VADEventType } from '../vad.js';
 import type { SpeechEvent } from './stt.js';
@@ -68,7 +69,17 @@ export class StreamAdapterWrapper extends SpeechStream {
           this.#vadStream.pushFrame(input);
         }
       }
-      this.#vadStream.endInput();
+      // Guard against calling endInput() on already-closed stream
+      // This happens during handover when close() is called while forwardInput is running
+      try {
+        this.#vadStream.endInput();
+      } catch (e) {
+        if (isStreamClosedError(e)) {
+          return;
+        }
+        throw e;
+      }
     };
     const recognize = async () => {

package/src/tokenize/basic/sentence.ts CHANGED Viewed

@@ -16,7 +16,7 @@ export const splitSentences = (
   const starters =
     /(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)/g;
   const acronyms = /([A-Z][.][A-Z][.](?:[A-Z][.])?)/g;
-  const websites = /[.](com|net|org|io|gov|edu|me)/g;
+  const websites = /(\w+\.)+(com|net|org|io|gov|edu|me)/g;
   const digits = /([0-9])/g;
   const dots = /\.{2,}/g;
@@ -27,7 +27,7 @@ export const splitSentences = (
   }
   text = text.replaceAll(prefixes, '$1<prd>');
-  text = text.replaceAll(websites, '<prd>$2');
+  text = text.replace(websites, (match) => match.replaceAll('.', '<prd>'));
   text = text.replaceAll(new RegExp(`${digits.source}[.]${digits.source}`, 'g'), '$1<prd>$2');
   text = text.replaceAll(dots, (match) => '<prd>'.repeat(match.length));
   text = text.replaceAll('Ph.D.', 'Ph<prd>D<prd>');
@@ -51,7 +51,7 @@ export const splitSentences = (
   text = text.replaceAll('."', '".');
   text = text.replaceAll('!"', '"!');
   text = text.replaceAll('?"', '"?');
-  text = text.replaceAll('.', '.<stop>');
+  text = text.replace(/\.(?=\s|$)/g, '.<stop>');
   text = text.replaceAll('?', '?<stop>');
   text = text.replaceAll('!', '!<stop>');
   text = text.replaceAll('<prd>', '.');

package/src/tokenize/tokenizer.test.ts CHANGED Viewed

@@ -13,6 +13,8 @@ const TEXT =
   'This is a test. Another test. ' +
   'A short sentence. ' +
   'A longer sentence that is longer than the previous sentence. ' +
+  'Find additional resources on livekit.com. ' +
+  'Find additional resources on docs.livekit.com. ' +
   'f(x) = x * 2.54 + 42. ' +
   'Hey! Hi! Hello! ';
@@ -22,6 +24,8 @@ const EXPECTED_MIN_20 = [
   'Mr. Theo is testing the sentence tokenizer.',
   'This is a test. Another test.',
   'A short sentence. A longer sentence that is longer than the previous sentence.',
+  'Find additional resources on livekit.com.',
+  'Find additional resources on docs.livekit.com.',
   'f(x) = x * 2.54 + 42.',
   'Hey! Hi! Hello!',
 ];

package/src/utils.ts CHANGED Viewed

@@ -675,6 +675,20 @@ export class InvalidErrorType extends Error {
   }
 }
+/**
+ * Check if an error is a stream closed error that can be safely ignored during cleanup.
+ * This happens during handover/cleanup when close() is called while operations are still running.
+ *
+ * @param error - The error to check.
+ * @returns True if the error is a stream closed error.
+ */
+export function isStreamClosedError(error: unknown): boolean {
+  return (
+    error instanceof Error &&
+    (error.message === 'Stream is closed' || error.message === 'Input is closed')
+  );
+}
 /**
  * In JS an error can be any arbitrary value.
  * This function converts an unknown error to an Error and stores the original value in the error object.

package/src/voice/agent.ts CHANGED Viewed

@@ -325,16 +325,16 @@ export class Agent<UserData = any> {
         );
       }
-      // TODO(brian): make parallelToolCalls configurable
       const { toolChoice } = modelSettings;
       const connOptions = activity.agentSession.connOptions.llmConnOptions;
+      // parallelToolCalls is not passed here - it will use the value from LLM's modelOptions
+      // This allows users to configure it via: new inference.LLM({ modelOptions: { parallel_tool_calls: false } })
       const stream = activity.llm.chat({
         chatCtx,
         toolCtx,
         toolChoice,
         connOptions,
-        parallelToolCalls: true,
       });
       let cleaned = false;

package/src/voice/agent_activity.ts CHANGED Viewed

@@ -194,12 +194,13 @@ export class AgentActivity implements RecognitionHooks {
     if (
       !this.vad &&
       this.stt &&
+      !this.stt.capabilities.streaming &&
       this.llm instanceof LLM &&
       this.allowInterruptions &&
       this.turnDetectionMode === undefined
     ) {
       this.logger.warn(
-        'VAD is not set. Enabling VAD is recommended when using LLM and STT ' +
+        'VAD is not set. Enabling VAD is recommended when using LLM and non-streaming STT ' +
           'for more responsive interruption handling.',
       );
     }
@@ -659,12 +660,14 @@ export class AgentActivity implements RecognitionHooks {
       return;
     }
-    if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
-      // skip speech handle interruption if server side turn detection is enabled
-      return;
+    if (ev.speechDuration >= this.agentSession.options.minInterruptionDuration) {
+      this.interruptByAudioActivity();
     }
+  }
-    if (ev.speechDuration < this.agentSession.options.minInterruptionDuration) {
+  private interruptByAudioActivity(): void {
+    if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
+      // skip speech handle interruption if server side turn detection is enabled
       return;
     }
@@ -694,7 +697,10 @@ export class AgentActivity implements RecognitionHooks {
       !this._currentSpeech.interrupted &&
       this._currentSpeech.allowInterruptions
     ) {
-      this.logger.info({ 'speech id': this._currentSpeech.id }, 'speech interrupted by VAD');
+      this.logger.info(
+        { 'speech id': this._currentSpeech.id },
+        'speech interrupted by audio activity',
+      );
       this.realtimeSession?.interrupt();
       this._currentSpeech.interrupt();
     }
@@ -715,6 +721,10 @@ export class AgentActivity implements RecognitionHooks {
         // TODO(AJS-106): add multi participant support
       }),
     );
+    if (ev.alternatives![0].text) {
+      this.interruptByAudioActivity();
+    }
   }
   onFinalTranscript(ev: SpeechEvent): void {
@@ -732,6 +742,20 @@ export class AgentActivity implements RecognitionHooks {
         // TODO(AJS-106): add multi participant support
       }),
     );
+    // agent speech might not be interrupted if VAD failed and a final transcript is received
+    // we call interruptByAudioActivity (idempotent) to pause the speech, if possible
+    if (
+      this.audioRecognition &&
+      this.turnDetection !== 'manual' &&
+      this.turnDetection !== 'realtime_llm'
+    ) {
+      this.interruptByAudioActivity();
+      // TODO: resume false interruption - schedule a resume timer if interrupted after end_of_speech
+    }
+    // TODO: resume false interruption - start interrupt paused speech task
   }
   onPreemptiveGeneration(info: PreemptiveGenerationInfo): void {
@@ -1982,7 +2006,6 @@ export class AgentActivity implements RecognitionHooks {
     if (audioOutput) {
       await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
-      this.agentSession._updateAgentState('listening');
     }
     if (speechHandle.interrupted) {
@@ -2069,17 +2092,15 @@ export class AgentActivity implements RecognitionHooks {
     speechHandle._markGenerationDone();
     // TODO(brian): close tees
-    toolOutput.firstToolStartedFuture.await.finally(() => {
-      this.agentSession._updateAgentState('thinking');
-    });
     await executeToolsTask.result;
+    if (toolOutput.output.length > 0) {
+      this.agentSession._updateAgentState('thinking');
+    } else if (this.agentSession.agentState === 'speaking') {
+      this.agentSession._updateAgentState('listening');
+    }
     if (toolOutput.output.length === 0) {
-      // return to listening state for thinking-only turns (no audio output, no tools)
-      if (!speechHandle.interrupted) {
-        this.agentSession._updateAgentState('listening');
-      }
       return;
     }

package/src/worker.ts CHANGED Viewed

@@ -339,13 +339,35 @@ export class AgentServer {
     );
     this.#opts = opts;
-    this.#httpServer = new HTTPServer(opts.host, opts.port, () => ({
+    const healthCheck = () => {
+      // Check if inference executor exists and is not alive
+      if (this.#inferenceExecutor && !this.#inferenceExecutor.isAlive) {
+        return { healthy: false, message: 'inference process not running' };
+      }
+      // Only healthy when fully connected with an active WebSocket
+      if (
+        this.#closed ||
+        this.#connecting ||
+        !this.#session ||
+        this.#session.readyState !== WebSocket.OPEN
+      ) {
+        return { healthy: false, message: 'not connected to livekit' };
+      }
+      return { healthy: true, message: 'OK' };
+    };
+    const getWorkerInfo = () => ({
       agent_name: opts.agentName,
       worker_type: JobType[opts.serverType],
       active_jobs: this.activeJobs.length,
       sdk_version: version,
       project_type: PROJECT_TYPE,
-    }));
+    });
+    this.#httpServer = new HTTPServer(opts.host, opts.port, healthCheck, getWorkerInfo);
   }
   /** @throws {@link WorkerError} if worker failed to connect or already running */