@livekit/agents 1.0.41 → 1.0.43

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. package/dist/inference/index.cjs +8 -0
  2. package/dist/inference/index.cjs.map +1 -1
  3. package/dist/inference/index.d.cts +2 -2
  4. package/dist/inference/index.d.ts +2 -2
  5. package/dist/inference/index.d.ts.map +1 -1
  6. package/dist/inference/index.js +8 -0
  7. package/dist/inference/index.js.map +1 -1
  8. package/dist/inference/stt.cjs +51 -10
  9. package/dist/inference/stt.cjs.map +1 -1
  10. package/dist/inference/stt.d.cts +33 -0
  11. package/dist/inference/stt.d.ts +33 -0
  12. package/dist/inference/stt.d.ts.map +1 -1
  13. package/dist/inference/stt.js +48 -9
  14. package/dist/inference/stt.js.map +1 -1
  15. package/dist/inference/stt.test.cjs +204 -0
  16. package/dist/inference/stt.test.cjs.map +1 -0
  17. package/dist/inference/stt.test.js +203 -0
  18. package/dist/inference/stt.test.js.map +1 -0
  19. package/dist/inference/tts.cjs +52 -10
  20. package/dist/inference/tts.cjs.map +1 -1
  21. package/dist/inference/tts.d.cts +22 -0
  22. package/dist/inference/tts.d.ts +22 -0
  23. package/dist/inference/tts.d.ts.map +1 -1
  24. package/dist/inference/tts.js +49 -9
  25. package/dist/inference/tts.js.map +1 -1
  26. package/dist/inference/tts.test.cjs +223 -0
  27. package/dist/inference/tts.test.cjs.map +1 -0
  28. package/dist/inference/tts.test.js +222 -0
  29. package/dist/inference/tts.test.js.map +1 -0
  30. package/dist/ipc/inference_proc_lazy_main.cjs +13 -1
  31. package/dist/ipc/inference_proc_lazy_main.cjs.map +1 -1
  32. package/dist/ipc/inference_proc_lazy_main.js +13 -1
  33. package/dist/ipc/inference_proc_lazy_main.js.map +1 -1
  34. package/dist/ipc/job_proc_lazy_main.cjs +2 -1
  35. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  36. package/dist/ipc/job_proc_lazy_main.js +2 -1
  37. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  38. package/dist/ipc/supervised_proc.cjs.map +1 -1
  39. package/dist/ipc/supervised_proc.d.cts +7 -0
  40. package/dist/ipc/supervised_proc.d.ts +7 -0
  41. package/dist/ipc/supervised_proc.d.ts.map +1 -1
  42. package/dist/ipc/supervised_proc.js.map +1 -1
  43. package/dist/stt/stt.cjs.map +1 -1
  44. package/dist/stt/stt.d.cts +7 -0
  45. package/dist/stt/stt.d.ts +7 -0
  46. package/dist/stt/stt.d.ts.map +1 -1
  47. package/dist/stt/stt.js.map +1 -1
  48. package/dist/transcription.cjs.map +1 -1
  49. package/dist/transcription.d.cts +6 -0
  50. package/dist/transcription.d.ts +6 -0
  51. package/dist/transcription.d.ts.map +1 -1
  52. package/dist/transcription.js.map +1 -1
  53. package/dist/vad.cjs +1 -1
  54. package/dist/vad.cjs.map +1 -1
  55. package/dist/vad.d.cts +3 -2
  56. package/dist/vad.d.ts +3 -2
  57. package/dist/vad.d.ts.map +1 -1
  58. package/dist/vad.js +1 -1
  59. package/dist/vad.js.map +1 -1
  60. package/dist/voice/agent_activity.cjs +1 -2
  61. package/dist/voice/agent_activity.cjs.map +1 -1
  62. package/dist/voice/agent_activity.js +1 -2
  63. package/dist/voice/agent_activity.js.map +1 -1
  64. package/dist/voice/audio_recognition.cjs +1 -1
  65. package/dist/voice/audio_recognition.cjs.map +1 -1
  66. package/dist/voice/audio_recognition.d.cts +14 -0
  67. package/dist/voice/audio_recognition.d.ts +14 -0
  68. package/dist/voice/audio_recognition.d.ts.map +1 -1
  69. package/dist/voice/audio_recognition.js +1 -1
  70. package/dist/voice/audio_recognition.js.map +1 -1
  71. package/package.json +1 -1
  72. package/src/inference/index.ts +8 -0
  73. package/src/inference/stt.test.ts +236 -0
  74. package/src/inference/stt.ts +95 -17
  75. package/src/inference/tts.test.ts +255 -0
  76. package/src/inference/tts.ts +81 -15
  77. package/src/ipc/inference_proc_lazy_main.ts +13 -1
  78. package/src/ipc/job_proc_lazy_main.ts +5 -1
  79. package/src/ipc/supervised_proc.ts +7 -0
  80. package/src/stt/stt.ts +7 -0
  81. package/src/transcription.ts +6 -0
  82. package/src/vad.ts +4 -3
  83. package/src/voice/agent_activity.ts +1 -1
  84. package/src/voice/audio_recognition.ts +16 -1
package/src/inference/tts.ts CHANGED
@@ -16,7 +16,6 @@ import { Event, Future, Task, cancelAndWait, combineSignals, shortuuid } from '.
 import {
   type TtsClientEvent,
   type TtsServerEvent,
-  type TtsSessionCreateEvent,
   ttsClientEventSchema,
   ttsServerEventSchema,
 } from './api_protos.js';
@@ -46,13 +45,17 @@ export type InworldModels =
 export type RimeModels = 'rime/arcana' | 'rime/mistv2';
 
 export interface CartesiaOptions {
-  duration?: number; // max duration of audio in seconds
-  speed?: 'slow' | 'normal' | 'fast'; // default: not specified
+  /** Maximum duration of audio in seconds. */
+  duration?: number;
+  /** Speech speed. Default: not specified. */
+  speed?: 'slow' | 'normal' | 'fast';
 }
 
 export interface ElevenlabsOptions {
-  inactivity_timeout?: number; // default: 60
-  apply_text_normalization?: 'auto' | 'off' | 'on'; // default: "auto"
+  /** Inactivity timeout in seconds. Default: 60. */
+  inactivity_timeout?: number;
+  /** Text normalization mode. Default: "auto". */
+  apply_text_normalization?: 'auto' | 'off' | 'on';
 }
 
 export interface DeepgramTTSOptions {}
@@ -90,6 +93,45 @@ export type TTSOptions<TModel extends TTSModels> = TModel extends CartesiaModels
     ? InworldOptions
     : Record<string, unknown>;
 
+/** Parse a model string into [model, voice]. Voice is undefined if not specified. */
+export function parseTTSModelString(model: string): [string, string | undefined] {
+  const idx = model.lastIndexOf(':');
+  if (idx !== -1) {
+    return [model.slice(0, idx), model.slice(idx + 1)];
+  }
+  return [model, undefined];
+}
+
+/** A fallback model with optional extra configuration. Extra fields are passed through to the provider. */
+export interface TTSFallbackModel {
+  /** Model name (e.g. "cartesia/sonic", "elevenlabs/eleven_flash_v2", "rime/arcana"). */
+  model: string;
+  /** Voice to use for the model. */
+  voice: string;
+  /** Extra configuration for the model. */
+  extraKwargs?: Record<string, unknown>;
+}
+
+export type TTSFallbackModelType = TTSFallbackModel | string;
+
+/** Normalize a single or list of FallbackModelType into TTSFallbackModel[]. */
+export function normalizeTTSFallback(
+  fallback: TTSFallbackModelType | TTSFallbackModelType[],
+): TTSFallbackModel[] {
+  const makeFallback = (model: TTSFallbackModelType): TTSFallbackModel => {
+    if (typeof model === 'string') {
+      const [name, voice] = parseTTSModelString(model);
+      return { model: name, voice: voice ?? '' };
+    }
+    return model;
+  };
+
+  if (Array.isArray(fallback)) {
+    return fallback.map(makeFallback);
+  }
+  return [makeFallback(fallback)];
+}
+
 type TTSEncoding = 'pcm_s16le';
 
 const DEFAULT_ENCODING: TTSEncoding = 'pcm_s16le';
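
The two new exported helpers centralize model-string handling. A minimal behavioral sketch, derived directly from the code in this hunk (the voice names 'luna' and 'kiara' are invented for illustration):

    // Splits on the last colon; without a colon, the voice is undefined.
    parseTTSModelString('rime/arcana:luna'); // → ['rime/arcana', 'luna']
    parseTTSModelString('rime/arcana');      // → ['rime/arcana', undefined]

    // Strings and objects normalize to the same TTSFallbackModel shape:
    normalizeTTSFallback([
      'rime/arcana:luna',
      { model: 'cartesia/sonic', voice: 'kiara', extraKwargs: { speed: 'fast' } },
    ]);
    // → [
    //     { model: 'rime/arcana', voice: 'luna' },
    //     { model: 'cartesia/sonic', voice: 'kiara', extraKwargs: { speed: 'fast' } },
    //   ]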
@@ -108,6 +150,8 @@ export interface InferenceTTSOptions<TModel extends TTSModels> {
   apiKey: string;
   apiSecret: string;
   modelOptions: TTSOptions<TModel>;
+  fallback?: TTSFallbackModel[];
+  connOptions?: APIConnectOptions;
 }
 
 /**
@@ -130,6 +174,8 @@ export class TTS<TModel extends TTSModels> extends BaseTTS {
     apiKey?: string;
     apiSecret?: string;
     modelOptions?: TTSOptions<TModel>;
+    fallback?: TTSFallbackModelType | TTSFallbackModelType[];
+    connOptions?: APIConnectOptions;
   }) {
     const sampleRate = opts?.sampleRate ?? DEFAULT_SAMPLE_RATE;
     super(sampleRate, 1, { streaming: true });
@@ -143,6 +189,8 @@ export class TTS<TModel extends TTSModels> extends BaseTTS {
       apiKey,
       apiSecret,
       modelOptions = {} as TTSOptions<TModel>,
+      fallback,
+      connOptions,
     } = opts || {};
 
     const lkBaseURL = baseURL || process.env.LIVEKIT_INFERENCE_URL || DEFAULT_BASE_URL;
@@ -176,6 +224,8 @@ export class TTS<TModel extends TTSModels> extends BaseTTS {
       }
     }
 
+    const normalizedFallback = fallback ? normalizeTTSFallback(fallback) : undefined;
+
     this.opts = {
       model: nextModel,
       voice: nextVoice,
@@ -186,6 +236,8 @@ export class TTS<TModel extends TTSModels> extends BaseTTS {
      apiKey: lkApiKey,
      apiSecret: lkApiSecret,
      modelOptions,
+     fallback: normalizedFallback,
+     connOptions: connOptions ?? DEFAULT_API_CONNECT_OPTIONS,
    };
 
    // Initialize connection pool
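
With fallback and connOptions threaded through the constructor, the fallback chain and connection policy can now be configured once at construction time rather than per stream() call. A hedged usage sketch (voice names invented; only the timeoutMs and maxRetry fields of APIConnectOptions are shown, because those are the ones this diff reads):

    const tts = new TTS({
      model: 'cartesia/sonic',
      voice: 'kiara',                  // invented voice name
      fallback: ['rime/arcana:luna'],  // string form goes through parseTTSModelString
      connOptions: { timeoutMs: 10_000, maxRetry: 3 } as APIConnectOptions,
    });
    const stream = tts.stream(); // now defaults to the constructor's connOptions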
@@ -203,11 +255,8 @@ export class TTS<TModel extends TTSModels> extends BaseTTS {
   }
 
   static fromModelString(modelString: string): TTS<AnyString> {
-    if (modelString.includes(':')) {
-      const [model, voice] = modelString.split(':') as [TTSModels, string];
-      return new TTS({ model, voice });
-    }
-    return new TTS({ model: modelString });
+    const [model, voice] = parseTTSModelString(modelString);
+    return new TTS({ model, voice: voice || undefined });
   }
 
   updateOptions(opts: Partial<Pick<InferenceTTSOptions<TModel>, 'model' | 'voice' | 'language'>>) {
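
Besides reusing the shared helper, this changes behavior for model strings containing more than one colon: the old two-element destructuring of split(':') silently dropped trailing segments, while lastIndexOf keeps them in the model name. A worked example with a hypothetical input:

    // old: 'a/b:c:d'.split(':') destructured as [model, voice] → model 'a/b', voice 'c' ('d' lost)
    // new: parseTTSModelString('a/b:c:d')                      → model 'a/b:c', voice 'd'

A trailing colon now also yields an undefined voice (via voice || undefined) instead of an empty string.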
@@ -222,7 +271,7 @@ export class TTS<TModel extends TTSModels> extends BaseTTS {
   }
 
   stream(options?: { connOptions?: APIConnectOptions }): SynthesizeStream<TModel> {
-    const { connOptions = DEFAULT_API_CONNECT_OPTIONS } = options || {};
+    const { connOptions = this.opts.connOptions ?? DEFAULT_API_CONNECT_OPTIONS } = options || {};
     const stream = new SynthesizeStream(this, { ...this.opts }, connOptions);
     this.streams.add(stream);
     return stream;
@@ -243,11 +292,28 @@ export class TTS<TModel extends TTSModels> extends BaseTTS {
      sample_rate: String(this.opts.sampleRate),
      encoding: this.opts.encoding,
      extra: this.opts.modelOptions,
-    } as TtsSessionCreateEvent;
+    } as Record<string, unknown>;
+
+    if (this.opts.voice) (params as Record<string, unknown>).voice = this.opts.voice;
+    if (this.opts.model) (params as Record<string, unknown>).model = this.opts.model;
+    if (this.opts.language) (params as Record<string, unknown>).language = this.opts.language;
+
+    if (this.opts.fallback?.length) {
+      params.fallback = {
+        models: this.opts.fallback.map((m) => ({
+          model: m.model,
+          voice: m.voice,
+          extra: m.extraKwargs ?? {},
+        })),
+      };
+    }
 
-    if (this.opts.voice) params.voice = this.opts.voice;
-    if (this.opts.model) params.model = this.opts.model;
-    if (this.opts.language) params.language = this.opts.language;
+    if (this.opts.connOptions) {
+      params.connection = {
+        timeout: this.opts.connOptions.timeoutMs / 1000,
+        retries: this.opts.connOptions.maxRetry,
+      };
+    }
 
     this.#logger.debug({ url }, 'inference.TTS creating new websocket connection (pool miss)');
     const socket = await connectWs(url, headers, timeout);
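
Putting the mapping above together, the session-create params sent over the websocket now look roughly like this (values illustrative, not taken from the package; note the timeout is converted from milliseconds to seconds):

    {
      sample_rate: '24000',   // String(this.opts.sampleRate); actual default not shown in this diff
      encoding: 'pcm_s16le',
      extra: { /* modelOptions */ },
      model: 'cartesia/sonic',
      voice: 'kiara',         // invented
      fallback: { models: [{ model: 'rime/arcana', voice: 'luna', extra: {} }] },
      connection: { timeout: 10, retries: 3 }, // from timeoutMs: 10_000, maxRetry: 3
    }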
package/src/ipc/inference_proc_lazy_main.ts CHANGED
@@ -36,7 +36,19 @@ const ORPHANED_TIMEOUT = 15 * 1000;
 
 const runners: { [id: string]: InferenceRunner } = await Promise.all(
   Object.entries(JSON.parse(process.argv[2]!)).map(async ([k, v]) => {
-    return [k, await import(v as string).then((m) => new m.default())];
+    return [
+      k,
+      await import(v as string).then((m) => {
+        // Handle both ESM (m.default is the class) and CJS (m.default.default is the class)
+        const Runner = typeof m.default === 'function' ? m.default : m.default?.default;
+        if (typeof Runner !== 'function') {
+          throw new Error(
+            `Unable to load inference runner: Missing or invalid default export in ${v}`,
+          );
+        }
+        return new Runner();
+      }),
+    ];
   }),
 ).then(Object.fromEntries);
 
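The double default unwrapping is needed because TypeScript's CommonJS output assigns a default export to exports.default, and a dynamic import() of that build from ESM nests it one level deeper. A sketch of the two module shapes this code accepts:

    // ESM build: export default class MyRunner {}  → m.default === MyRunner
    // CJS build: exports.default = MyRunner        → import() yields
    //            m.default = { __esModule: true, default: MyRunner },
    //            so m.default.default === MyRunner
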
package/src/ipc/job_proc_lazy_main.ts CHANGED
@@ -156,7 +156,11 @@ const startJob = (
   // [2] import.meta.filename of function containing entry file
   const moduleFile = process.argv[2];
   const agent: Agent = await import(pathToFileURL(moduleFile!).pathname).then((module) => {
-    const agent = module.default;
+    // Handle both ESM (module.default is the agent) and CJS (module.default.default is the agent)
+    const agent =
+      typeof module.default === 'function' || isAgent(module.default)
+        ? module.default
+        : module.default?.default;
     if (agent === undefined || !isAgent(agent)) {
       throw new Error(`Unable to load agent: Missing or invalid default export in ${moduleFile}`);
     }
package/src/ipc/supervised_proc.ts CHANGED
@@ -10,12 +10,19 @@ import { Future } from '../utils.js';
 import type { IPCMessage } from './message.js';
 
 export interface ProcOpts {
+  /** Timeout for process initialization in milliseconds. */
   initializeTimeout: number;
+  /** Timeout for process shutdown in milliseconds. */
   closeTimeout: number;
+  /** Memory usage warning threshold in megabytes. */
   memoryWarnMB: number;
+  /** Memory usage limit in megabytes. */
   memoryLimitMB: number;
+  /** Interval for health check pings in milliseconds. */
   pingInterval: number;
+  /** Timeout waiting for pong response in milliseconds. */
   pingTimeout: number;
+  /** Threshold for warning about unresponsive processes in milliseconds. */
   highPingThreshold: number;
 }
 
package/src/stt/stt.ts CHANGED
@@ -49,15 +49,22 @@ export enum SpeechEventType {
 
 /** SpeechData contains metadata about this {@link SpeechEvent}. */
 export interface SpeechData {
+  /** Language code of the speech. */
   language: string;
+  /** Transcribed text. */
   text: string;
+  /** Start time of the speech segment in seconds. */
   startTime: number;
+  /** End time of the speech segment in seconds. */
   endTime: number;
+  /** Confidence score of the transcription (0-1). */
   confidence: number;
+  /** Word-level timing information. */
   words?: TimedString[];
 }
 
 export interface RecognitionUsage {
+  /** Duration of the audio that was recognized in seconds. */
   audioDuration: number;
 }
 
package/src/transcription.ts CHANGED
@@ -13,11 +13,17 @@ import { AsyncIterableQueue, Future, shortuuid } from './utils.js';
 const STANDARD_SPEECH_RATE = 3830;
 
 export interface TextSyncOptions {
+  /** Language code for transcription. */
   language: string;
+  /** Speech speed multiplier. */
   speed: number;
+  /** Delay between sentences in milliseconds. */
   newSentenceDelay: number;
+  /** Tokenizer for splitting text into sentences. */
   sentenceTokenizer: SentenceTokenizer;
+  /** Function to hyphenate words. */
   hyphenateWord: (word: string) => string[];
+  /** Function to split text into words with positions. */
   splitWords: (words: string) => [string, number, number][];
 }
 
package/src/vad.ts CHANGED
@@ -30,9 +30,9 @@ export interface VADEvent {
   samplesIndex: number;
   /** Timestamp when the event was fired. */
   timestamp: number;
-  /** Duration of the speech segment. */
+  /** Duration of the speech segment in seconds. */
   speechDuration: number;
-  /** Duration of the silence segment. */
+  /** Duration of the silence segment in seconds. */
   silenceDuration: number;
   /**
    * List of audio frames associated with the speech.
@@ -56,6 +56,7 @@ export interface VADEvent {
 }
 
 export interface VADCapabilities {
+  /** Duration of each VAD inference window in milliseconds. Used to batch metrics emissions to roughly once per second. */
   updateInterval: number;
 }
 
@@ -154,7 +155,7 @@ export abstract class VADStream implements AsyncIterableIterator<VADEvent> {
         switch (value.type) {
           case VADEventType.START_OF_SPEECH:
             inferenceCount++;
-            if (inferenceCount >= 1 / this.#vad.capabilities.updateInterval) {
+            if (inferenceCount >= 1000 / this.#vad.capabilities.updateInterval) {
              this.#vad.emit('metrics_collected', {
                type: 'vad_metrics',
                timestamp: Date.now(),
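
The fix follows from updateInterval being expressed in milliseconds (see the new VADCapabilities doc comment above): emitting metrics roughly once per second requires accumulating 1000 / updateInterval inferences. Worked numbers for an assumed 32 ms window:

    // old: inferenceCount >= 1 / 32     ≈ 0.03  → threshold crossed on every inference
    // new: inferenceCount >= 1000 / 32  ≈ 31.25 → crossed after 32 inferences ≈ once per second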
package/src/voice/agent_activity.ts CHANGED
@@ -1023,7 +1023,7 @@ export class AgentActivity implements RecognitionHooks {
             toolChoice: toOaiToolChoice(toolChoice !== undefined ? toolChoice : this.toolChoice),
           },
           abortController,
-          instructions ? `${this.agent.instructions}\n${instructions}` : instructions,
+          instructions,
           userMessage,
         ),
       ),
package/src/voice/audio_recognition.ts CHANGED
@@ -18,11 +18,17 @@ import type { TurnDetectionMode } from './agent_session.js';
 import type { STTNode } from './io.js';
 
 export interface EndOfTurnInfo {
+  /** The new transcript text from the user's speech. */
   newTranscript: string;
+  /** Confidence score of the transcript (0-1). */
   transcriptConfidence: number;
+  /** Delay from speech stop to final transcription in milliseconds. */
   transcriptionDelay: number;
+  /** Delay from speech stop to end of utterance detection in milliseconds. */
   endOfUtteranceDelay: number;
+  /** Timestamp when user started speaking (milliseconds since epoch). */
   startedSpeakingAt: number | undefined;
+  /** Timestamp when user stopped speaking (milliseconds since epoch). */
   stoppedSpeakingAt: number | undefined;
 }
 
@@ -50,13 +56,21 @@ export interface _TurnDetector {
 }
 
 export interface AudioRecognitionOptions {
+  /** Hooks for recognition events. */
   recognitionHooks: RecognitionHooks;
+  /** Speech-to-text node. */
   stt?: STTNode;
+  /** Voice activity detection. */
   vad?: VAD;
+  /** Turn detector for end-of-turn prediction. */
   turnDetector?: _TurnDetector;
+  /** Turn detection mode. */
   turnDetectionMode?: Exclude<TurnDetectionMode, _TurnDetector>;
+  /** Minimum endpointing delay in milliseconds. */
   minEndpointingDelay: number;
+  /** Maximum endpointing delay in milliseconds. */
   maxEndpointingDelay: number;
+  /** Root span context for tracing. */
   rootSpanContext?: Context;
 }
 
@@ -161,7 +175,6 @@ export class AudioRecognition {
 
       switch (ev.type) {
        case SpeechEventType.FINAL_TRANSCRIPT:
-          this.hooks.onFinalTranscript(ev);
          const transcript = ev.alternatives?.[0]?.text;
          const confidence = ev.alternatives?.[0]?.confidence ?? 0;
          this.lastLanguage = ev.alternatives?.[0]?.language;
@@ -171,6 +184,8 @@ export class AudioRecognition {
            return;
          }
 
+          this.hooks.onFinalTranscript(ev);
+
          this.logger.debug(
            {
              user_transcript: transcript,