npm - @livekit/agents - Versions diffs - 1.0.5 → 1.1.0 - Mend

@livekit/agents 1.0.5 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (97) hide show

package/dist/index.cjs +3 -0
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +2 -1
package/dist/index.d.ts +2 -1
package/dist/index.d.ts.map +1 -1
package/dist/index.js +2 -0
package/dist/index.js.map +1 -1
package/dist/inference/api_protos.cjs +104 -0
package/dist/inference/api_protos.cjs.map +1 -0
package/dist/inference/api_protos.d.cts +222 -0
package/dist/inference/api_protos.d.ts +222 -0
package/dist/inference/api_protos.d.ts.map +1 -0
package/dist/inference/api_protos.js +70 -0
package/dist/inference/api_protos.js.map +1 -0
package/dist/inference/index.cjs +56 -0
package/dist/inference/index.cjs.map +1 -0
package/dist/inference/index.d.cts +9 -0
package/dist/inference/index.d.ts +9 -0
package/dist/inference/index.d.ts.map +1 -0
package/dist/inference/index.js +16 -0
package/dist/inference/index.js.map +1 -0
package/dist/inference/llm.cjs +315 -0
package/dist/inference/llm.cjs.map +1 -0
package/dist/inference/llm.d.cts +92 -0
package/dist/inference/llm.d.ts +92 -0
package/dist/inference/llm.d.ts.map +1 -0
package/dist/inference/llm.js +286 -0
package/dist/inference/llm.js.map +1 -0
package/dist/inference/stt.cjs +305 -0
package/dist/inference/stt.cjs.map +1 -0
package/dist/inference/stt.d.cts +79 -0
package/dist/inference/stt.d.ts +79 -0
package/dist/inference/stt.d.ts.map +1 -0
package/dist/inference/stt.js +284 -0
package/dist/inference/stt.js.map +1 -0
package/dist/inference/tts.cjs +317 -0
package/dist/inference/tts.cjs.map +1 -0
package/dist/inference/tts.d.cts +75 -0
package/dist/inference/tts.d.ts +75 -0
package/dist/inference/tts.d.ts.map +1 -0
package/dist/inference/tts.js +299 -0
package/dist/inference/tts.js.map +1 -0
package/dist/inference/utils.cjs +76 -0
package/dist/inference/utils.cjs.map +1 -0
package/dist/inference/utils.d.cts +5 -0
package/dist/inference/utils.d.ts +5 -0
package/dist/inference/utils.d.ts.map +1 -0
package/dist/inference/utils.js +51 -0
package/dist/inference/utils.js.map +1 -0
package/dist/tts/tts.cjs +1 -1
package/dist/tts/tts.cjs.map +1 -1
package/dist/tts/tts.js +1 -1
package/dist/tts/tts.js.map +1 -1
package/dist/utils.cjs +11 -0
package/dist/utils.cjs.map +1 -1
package/dist/utils.d.cts +1 -0
package/dist/utils.d.ts +1 -0
package/dist/utils.d.ts.map +1 -1
package/dist/utils.js +10 -0
package/dist/utils.js.map +1 -1
package/dist/voice/agent.cjs +16 -3
package/dist/voice/agent.cjs.map +1 -1
package/dist/voice/agent.d.cts +4 -3
package/dist/voice/agent.d.ts +4 -3
package/dist/voice/agent.d.ts.map +1 -1
package/dist/voice/agent.js +20 -3
package/dist/voice/agent.js.map +1 -1
package/dist/voice/agent_session.cjs +16 -3
package/dist/voice/agent_session.cjs.map +1 -1
package/dist/voice/agent_session.d.cts +4 -3
package/dist/voice/agent_session.d.ts +4 -3
package/dist/voice/agent_session.d.ts.map +1 -1
package/dist/voice/agent_session.js +20 -3
package/dist/voice/agent_session.js.map +1 -1
package/dist/voice/room_io/_input.cjs +9 -0
package/dist/voice/room_io/_input.cjs.map +1 -1
package/dist/voice/room_io/_input.d.ts.map +1 -1
package/dist/voice/room_io/_input.js +10 -0
package/dist/voice/room_io/_input.js.map +1 -1
package/dist/worker.cjs.map +1 -1
package/dist/worker.d.ts.map +1 -1
package/dist/worker.js +1 -1
package/dist/worker.js.map +1 -1
package/package.json +3 -2
package/src/index.ts +2 -1
package/src/inference/api_protos.ts +82 -0
package/src/inference/index.ts +12 -0
package/src/inference/llm.ts +485 -0
package/src/inference/stt.ts +414 -0
package/src/inference/tts.ts +421 -0
package/src/inference/utils.ts +66 -0
package/src/tts/tts.ts +1 -1
package/src/utils.ts +11 -0
package/src/voice/agent.ts +30 -6
package/src/voice/agent_session.ts +29 -6
package/src/voice/room_io/_input.ts +12 -1
package/src/worker.ts +2 -7

package/src/inference/tts.ts ADDED Viewed

@@ -0,0 +1,421 @@
+// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import type { AudioFrame } from '@livekit/rtc-node';
+import { WebSocket } from 'ws';
+import { APIError, APIStatusError } from '../_exceptions.js';
+import { AudioByteStream } from '../audio.js';
+import { log } from '../log.js';
+import { createStreamChannel } from '../stream/stream_channel.js';
+import { basic as tokenizeBasic } from '../tokenize/index.js';
+import {
+  SynthesizeStream as BaseSynthesizeStream,
+  TTS as BaseTTS,
+  ChunkedStream,
+} from '../tts/index.js';
+import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';
+import { shortuuid } from '../utils.js';
+import {
+  type TtsClientEvent,
+  type TtsServerEvent,
+  type TtsSessionCreateEvent,
+  ttsClientEventSchema,
+  ttsServerEventSchema,
+} from './api_protos.js';
+import { type AnyModels, connectWs, createAccessToken } from './utils.js';
+/** Cartesia model ids: the bare provider name or `provider/model`. */
+type _CartesiaModels = 'cartesia' | 'cartesia/sonic' | 'cartesia/sonic-2' | 'cartesia/sonic-turbo';
+/** Cartesia model id, optionally suffixed with `:<voice_id>`. */
+export type CartesiaModels = _CartesiaModels | `${_CartesiaModels}:${string}`;
+/** ElevenLabs model ids: the bare provider name or `provider/model`. */
+type _ElevenlabsModels =
+  | 'elevenlabs'
+  | 'elevenlabs/eleven_flash_v2'
+  | 'elevenlabs/eleven_flash_v2_5'
+  | 'elevenlabs/eleven_turbo_v2'
+  | 'elevenlabs/eleven_turbo_v2_5'
+  | 'elevenlabs/eleven_multilingual_v2';
+/** ElevenLabs model id, optionally suffixed with `:<voice_id>`. */
+export type ElevenlabsModels = _ElevenlabsModels | `${_ElevenlabsModels}:${string}`;
+/** Rime model ids: the bare provider name or `provider/model`. */
+export type _RimeModels = 'rime' | 'rime/mist' | 'rime/mistv2' | 'rime/arcana';
+/** Rime model id, optionally suffixed with `:<voice_id>`. */
+export type RimeModels = _RimeModels | `${_RimeModels}:${string}`;
+/** Inworld model ids: the bare provider name or `provider/model`. */
+export type _InworldModels = 'inworld' | 'inworld/inworld-tts-1';
+/** Inworld model id, optionally suffixed with `:<voice_id>`. */
+export type InworldModels = _InworldModels | `${_InworldModels}:${string}`;
+/** Provider-specific extras accepted for Cartesia models. */
+export interface CartesiaOptions {
+  duration?: number; // max duration of audio in seconds
+  speed?: 'slow' | 'normal' | 'fast'; // default: not specified
+}
+/** Provider-specific extras accepted for ElevenLabs models. */
+export interface ElevenlabsOptions {
+  inactivity_timeout?: number; // default: 60
+  apply_text_normalization?: 'auto' | 'off' | 'on'; // default: "auto"
+}
+/** Provider-specific extras for Rime models (none declared). */
+export interface RimeOptions {}
+/** Provider-specific extras for Inworld models (none declared). */
+export interface InworldOptions {}
+/** Every model string the TTS accepts: the known provider ids plus any other string. */
+export type TTSModels = CartesiaModels | ElevenlabsModels | RimeModels | InworldModels | AnyModels;
+/**
+ * Maps a model id to the extras type of its provider.
+ *
+ * Each branch must test against the provider's *model* union. Testing against an empty
+ * options interface (`{}`) matches every string, which would make the later branches and
+ * the `Record<string, unknown>` fallback unreachable.
+ */
+export type TTSOptions<TModel extends TTSModels> = TModel extends CartesiaModels
+  ? CartesiaOptions
+  : TModel extends ElevenlabsModels
+    ? ElevenlabsOptions
+    : TModel extends RimeModels
+      ? RimeOptions
+      : TModel extends InworldModels
+        ? InworldOptions
+        : Record<string, unknown>;
+/** Audio encodings this client requests from the gateway. */
+type TTSEncoding = 'pcm_s16le';
+const DEFAULT_ENCODING: TTSEncoding = 'pcm_s16le';
+const DEFAULT_SAMPLE_RATE = 16000;
+const DEFAULT_BASE_URL = 'https://agent-gateway.livekit.cloud/v1';
+const NUM_CHANNELS = 1;
+const DEFAULT_LANGUAGE = 'en';
+/** Fully resolved configuration held by `TTS` and copied into each `SynthesizeStream`. */
+export interface InferenceTTSOptions<TModel extends TTSModels> {
+  model?: TModel;
+  voice?: string;
+  language?: string;
+  encoding: TTSEncoding;
+  sampleRate: number;
+  baseURL: string;
+  apiKey: string;
+  apiSecret: string;
+  extraKwargs: TTSOptions<TModel>;
+}
+/**
+ * Streaming text-to-speech client for the LiveKit inference gateway.
+ *
+ * Audio is produced through `stream()`; one-shot `synthesize()` is not implemented.
+ */
+export class TTS<TModel extends TTSModels> extends BaseTTS {
+  private opts: InferenceTTSOptions<TModel>;
+  // Streams handed out by stream(); kept so updateOptions() and close() can reach them.
+  // NOTE(review): entries are only removed by close() — confirm whether finished streams
+  // should be pruned earlier for long-lived TTS instances.
+  private streams: Set<SynthesizeStream<TModel>> = new Set();
+  #logger = log();
+  /**
+   * @param opts.model - model id, optionally carrying a voice as `provider/model:voice_id`
+   * @param opts.voice - voice id; takes precedence over a voice embedded in `model`
+   * @param opts.language - defaults to `DEFAULT_LANGUAGE`
+   * @param opts.baseURL - falls back to `LIVEKIT_INFERENCE_URL`, then `DEFAULT_BASE_URL`
+   * @param opts.apiKey - falls back to `LIVEKIT_INFERENCE_API_KEY`, then `LIVEKIT_API_KEY`
+   * @param opts.apiSecret - falls back to `LIVEKIT_INFERENCE_API_SECRET`, then `LIVEKIT_API_SECRET`
+   * @throws Error when no API key or API secret can be resolved
+   */
+  constructor(opts: {
+    model: TModel;
+    voice?: string;
+    language?: string;
+    baseURL?: string;
+    encoding?: TTSEncoding;
+    sampleRate?: number;
+    apiKey?: string;
+    apiSecret?: string;
+    extraKwargs?: TTSOptions<TModel>;
+  }) {
+    const sampleRate = opts?.sampleRate ?? DEFAULT_SAMPLE_RATE;
+    super(sampleRate, NUM_CHANNELS, { streaming: true });
+    const {
+      model,
+      voice,
+      language = DEFAULT_LANGUAGE,
+      baseURL,
+      encoding = DEFAULT_ENCODING,
+      apiKey,
+      apiSecret,
+      extraKwargs = {} as TTSOptions<TModel>,
+    } = opts || {};
+    const lkBaseURL = baseURL || process.env.LIVEKIT_INFERENCE_URL || DEFAULT_BASE_URL;
+    const lkApiKey = apiKey || process.env.LIVEKIT_INFERENCE_API_KEY || process.env.LIVEKIT_API_KEY;
+    if (!lkApiKey) {
+      throw new Error('apiKey is required: pass apiKey or set LIVEKIT_API_KEY');
+    }
+    const lkApiSecret =
+      apiSecret || process.env.LIVEKIT_INFERENCE_API_SECRET || process.env.LIVEKIT_API_SECRET;
+    if (!lkApiSecret) {
+      throw new Error('apiSecret is required: pass apiSecret or set LIVEKIT_API_SECRET');
+    }
+    // read voice id from the model if provided: "provider/model:voice_id"
+    let nextModel = model;
+    let nextVoice = voice;
+    if (typeof nextModel === 'string') {
+      const idx = nextModel.lastIndexOf(':');
+      if (idx !== -1) {
+        const voiceFromModel = nextModel.slice(idx + 1);
+        if (nextVoice && nextVoice !== voiceFromModel) {
+          // Context object goes first so the structured logger keeps it (same order as the
+          // other structured log calls in this file).
+          this.#logger.warn(
+            { voice: nextVoice, model: nextModel },
+            '`voice` is provided via both argument and model, using the one from the argument',
+          );
+        } else {
+          nextVoice = voiceFromModel;
+        }
+        // The gateway receives model and voice as separate fields, so strip the suffix.
+        nextModel = nextModel.slice(0, idx) as TModel;
+      }
+    }
+    this.opts = {
+      model: nextModel,
+      voice: nextVoice,
+      language,
+      encoding,
+      sampleRate,
+      baseURL: lkBaseURL,
+      apiKey: lkApiKey,
+      apiSecret: lkApiSecret,
+      extraKwargs,
+    };
+  }
+  get label() {
+    return 'inference.TTS';
+  }
+  /** Merges `opts` into this instance's options and forwards them to every tracked stream. */
+  updateOptions(opts: Partial<Pick<InferenceTTSOptions<TModel>, 'model' | 'voice' | 'language'>>) {
+    this.opts = { ...this.opts, ...opts };
+    for (const stream of this.streams) {
+      stream.updateOptions(opts);
+    }
+  }
+  /** Not supported by this implementation; always throws. */
+  synthesize(_: string): ChunkedStream {
+    throw new Error('ChunkedStream is not implemented');
+  }
+  /** Creates a synthesis stream with a snapshot of the current options and tracks it. */
+  stream(options?: { connOptions?: APIConnectOptions }): SynthesizeStream<TModel> {
+    const { connOptions = DEFAULT_API_CONNECT_OPTIONS } = options || {};
+    const stream = new SynthesizeStream(this, { ...this.opts }, connOptions);
+    this.streams.add(stream);
+    return stream;
+  }
+  /**
+   * Opens an authenticated WebSocket to `<baseURL>/tts` and sends the `session.create` event.
+   *
+   * @param timeout - connection timeout forwarded to `connectWs`
+   * @returns the open socket, with `session.create` already sent
+   */
+  async connectWs(timeout: number): Promise<WebSocket> {
+    let baseURL = this.opts.baseURL;
+    if (baseURL.startsWith('http://') || baseURL.startsWith('https://')) {
+      // Only the leading scheme is rewritten: http -> ws, https -> wss.
+      baseURL = baseURL.replace('http', 'ws');
+    }
+    const token = await createAccessToken(this.opts.apiKey, this.opts.apiSecret);
+    const url = `${baseURL}/tts`;
+    const headers = { Authorization: `Bearer ${token}` } as Record<string, string>;
+    const params = {
+      type: 'session.create',
+      sample_rate: String(this.opts.sampleRate),
+      encoding: this.opts.encoding,
+      extra: this.opts.extraKwargs,
+    } as TtsSessionCreateEvent;
+    // Optional fields are only sent when set.
+    if (this.opts.voice) params.voice = this.opts.voice;
+    if (this.opts.model) params.model = this.opts.model;
+    if (this.opts.language) params.language = this.opts.language;
+    const socket = await connectWs(url, headers, timeout);
+    socket.send(JSON.stringify(params));
+    return socket;
+  }
+  async closeWs(ws: WebSocket) {
+    await ws.close();
+  }
+  /**
+   * Closes every tracked stream and forgets them.
+   *
+   * All streams are attempted even if one fails, and the set is always cleared; the first
+   * failure (if any) is rethrown afterwards so callers still observe it.
+   */
+  async close() {
+    let failed = false;
+    let firstError: unknown;
+    try {
+      for (const stream of this.streams) {
+        try {
+          await stream.close();
+        } catch (e) {
+          if (!failed) {
+            failed = true;
+            firstError = e;
+          }
+        }
+      }
+    } finally {
+      this.streams.clear();
+    }
+    if (failed) throw firstError;
+  }
+}
+/**
+ * One streaming synthesis session against the LiveKit inference gateway.
+ *
+ * `run()` splits pushed text into sentences, sends them over a WebSocket obtained from the
+ * owning `TTS`, and turns `output_audio` events into audio frames on the output queue.
+ */
+export class SynthesizeStream<TModel extends TTSModels> extends BaseSynthesizeStream {
+  private opts: InferenceTTSOptions<TModel>;
+  private tts: TTS<TModel>;
+  private connOptions: APIConnectOptions;
+  #logger = log();
+  constructor(tts: TTS<TModel>, opts: InferenceTTSOptions<TModel>, connOptions: APIConnectOptions) {
+    super(tts, connOptions);
+    this.opts = opts;
+    this.tts = tts;
+    this.connOptions = connOptions;
+  }
+  get label() {
+    return 'inference.SynthesizeStream';
+  }
+  /** Merges `opts` into this stream's local copy of the options. */
+  updateOptions(opts: Partial<Pick<InferenceTTSOptions<TModel>, 'model' | 'voice' | 'language'>>) {
+    this.opts = { ...this.opts, ...opts };
+  }
+  /**
+   * Drives the session: connects, then runs four cooperating tasks (input reader, sentence
+   * sender, socket listener, event consumer) until they all finish or one fails.
+   *
+   * Failures are logged rather than rethrown, and resources are always released.
+   */
+  protected async run(): Promise<void> {
+    let ws: WebSocket | null = null;
+    let closing = false;
+    let finalReceived = false;
+    // Frames are emitted one step late so the last one can be flagged `final`.
+    let lastFrame: AudioFrame | undefined;
+    // Installed by the listener task so that a cleanup started on our side can settle it;
+    // the socket listeners are detached during cleanup, so no 'close' event would do it.
+    let releaseListenerTask: (() => void) | undefined;
+    const sendTokenizerStream = new tokenizeBasic.SentenceTokenizer().stream();
+    const eventChannel = createStreamChannel<TtsServerEvent>();
+    const requestId = shortuuid('tts_request_');
+    // Idempotent teardown of the tokenizer, the event channel and the socket.
+    const resourceCleanup = () => {
+      if (closing) return;
+      closing = true;
+      sendTokenizerStream.close();
+      eventChannel.close();
+      ws?.removeAllListeners();
+      ws?.close();
+      releaseListenerTask?.();
+    };
+    // Validates an outgoing event and sends it if the socket is still open.
+    const sendClientEvent = async (event: TtsClientEvent) => {
+      const validatedEvent = await ttsClientEventSchema.parseAsync(event);
+      if (!ws || ws.readyState !== WebSocket.OPEN) {
+        this.#logger.warn('Trying to send client TTS event to a closed WebSocket');
+        return;
+      }
+      ws.send(JSON.stringify(validatedEvent));
+    };
+    const sendLastFrame = (segmentId: string, final: boolean) => {
+      if (lastFrame) {
+        this.queue.put({ requestId, segmentId, frame: lastFrame, final });
+        lastFrame = undefined;
+      }
+    };
+    // Feeds pushed text (and flush markers) into the sentence tokenizer.
+    const createInputTask = async () => {
+      for await (const data of this.input) {
+        if (this.abortController.signal.aborted) break;
+        if (data === SynthesizeStream.FLUSH_SENTINEL) {
+          sendTokenizerStream.flush();
+          continue;
+        }
+        sendTokenizerStream.pushText(data);
+      }
+      sendTokenizerStream.endInput();
+    };
+    // Sends each tokenized sentence, then asks the gateway to flush.
+    // Sends are awaited so a validation failure surfaces through Promise.all instead of
+    // becoming an unhandled rejection.
+    const createSentenceStreamTask = async () => {
+      for await (const ev of sendTokenizerStream) {
+        if (this.abortController.signal.aborted) break;
+        await sendClientEvent({
+          type: 'input_transcript',
+          transcript: ev.token + ' ',
+        });
+      }
+      await sendClientEvent({ type: 'session.flush' });
+    };
+    // Bridges socket events into `eventChannel` and settles when the socket is done.
+    // Every handler settles the promise BEFORE calling resourceCleanup(), because cleanup
+    // resolves the task through `releaseListenerTask` and the first settlement wins.
+    const createWsListenerTask = async (socket: WebSocket) => {
+      return new Promise<void>((resolve, reject) => {
+        if (closing) {
+          resolve();
+          return;
+        }
+        releaseListenerTask = () => resolve();
+        const signal = this.abortController.signal;
+        const onAbort = () => {
+          reject(new Error('WebSocket connection aborted'));
+          resourceCleanup();
+        };
+        // An already-aborted signal never fires 'abort' again, so handle it up front.
+        if (signal.aborted) {
+          onAbort();
+          return;
+        }
+        signal.addEventListener('abort', onAbort, { once: true });
+        socket.on('message', async (data) => {
+          try {
+            const eventJson = JSON.parse(data.toString()) as Record<string, unknown>;
+            const validatedEvent = ttsServerEventSchema.parse(eventJson);
+            await eventChannel.write(validatedEvent);
+          } catch (e) {
+            // Failures while we are already shutting down are expected noise.
+            if (closing) return;
+            this.#logger.error({ error: e }, 'Failed to handle TTS server message');
+            reject(e);
+            resourceCleanup();
+          }
+        });
+        socket.on('error', (e) => {
+          this.#logger.error({ error: e }, 'WebSocket error');
+          reject(e);
+          resourceCleanup();
+        });
+        // Only reached for closes we did not start: cleanup detaches this listener before
+        // it closes the socket.
+        socket.on('close', () => {
+          if (finalReceived) {
+            resolve();
+          } else {
+            this.#logger.error('WebSocket closed unexpectedly');
+            reject(
+              new APIStatusError({
+                message: 'Gateway connection closed unexpectedly',
+                options: { requestId },
+              }),
+            );
+          }
+          resourceCleanup();
+        });
+      });
+    };
+    // Consumes validated server events and converts audio payloads into queued frames.
+    const createRecvTask = async () => {
+      let currentSessionId: string | null = null;
+      const bstream = new AudioByteStream(this.opts.sampleRate, NUM_CHANNELS);
+      const serverEventStream = eventChannel.stream();
+      const reader = serverEventStream.getReader();
+      try {
+        while (!this.closed && !this.abortController.signal.aborted) {
+          const result = await reader.read();
+          if (this.abortController.signal.aborted) return;
+          if (result.done) return;
+          const serverEvent = result.value;
+          switch (serverEvent.type) {
+            case 'session.created':
+              currentSessionId = serverEvent.session_id;
+              break;
+            case 'output_audio': {
+              // Copying into a typed array yields an ArrayBuffer holding exactly the audio.
+              const audioBytes = new Int8Array(Buffer.from(serverEvent.audio, 'base64'));
+              for (const frame of bstream.write(audioBytes.buffer)) {
+                sendLastFrame(currentSessionId!, false);
+                lastFrame = frame;
+              }
+              break;
+            }
+            case 'done':
+              finalReceived = true;
+              for (const frame of bstream.flush()) {
+                sendLastFrame(currentSessionId!, false);
+                lastFrame = frame;
+              }
+              sendLastFrame(currentSessionId!, true);
+              this.queue.put(SynthesizeStream.END_OF_STREAM);
+              break;
+            case 'session.closed':
+              resourceCleanup();
+              break;
+            case 'error':
+              this.#logger.error(
+                { serverEvent },
+                'Received error message from LiveKit TTS WebSocket',
+              );
+              resourceCleanup();
+              throw new APIError(`LiveKit TTS returned error: ${serverEvent.message}`);
+            default:
+              this.#logger.warn({ serverEvent }, 'Unexpected message');
+              break;
+          }
+        }
+      } finally {
+        reader.releaseLock();
+        try {
+          await serverEventStream.cancel();
+        } catch (e) {
+          this.#logger.debug(
+            { error: e },
+            'Error cancelling serverEventStream (may already be cancelled)',
+          );
+        }
+      }
+    };
+    try {
+      ws = await this.tts.connectWs(this.connOptions.timeoutMs);
+      await Promise.all([
+        createInputTask(),
+        createSentenceStreamTask(),
+        createWsListenerTask(ws),
+        createRecvTask(),
+      ]);
+    } catch (e) {
+      // NOTE(review): errors are logged and not rethrown, as before — confirm the base
+      // class does not rely on run() rejecting.
+      this.#logger.error({ error: e }, 'Error in SynthesizeStream');
+    } finally {
+      resourceCleanup();
+    }
+  }
+}

package/src/inference/utils.ts ADDED Viewed

@@ -0,0 +1,66 @@
+// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import { AccessToken } from 'livekit-server-sdk';
+import { WebSocket } from 'ws';
+import { APIConnectionError, APIStatusError } from '../index.js';
+export type AnyModels = string & NonNullable<unknown>; // any string; the intersection presumably keeps literal unions from collapsing to `string` — confirm
+/**
+ * Issues a signed JWT for the identity `agent` that carries the inference `perform` grant.
+ *
+ * @param apiKey - API key the token is issued for
+ * @param apiSecret - secret used to sign the token
+ * @param ttl - token lifetime passed to `AccessToken` (presumably seconds — confirm against the SDK)
+ * @returns the encoded JWT
+ */
+export async function createAccessToken(
+  apiKey: string,
+  apiSecret: string,
+  ttl: number = 600,
+): Promise<string> {
+  const tokenOptions = { identity: 'agent', ttl };
+  const accessToken = new AccessToken(apiKey, apiSecret, tokenOptions);
+  accessToken.addInferenceGrant({ perform: true });
+  const jwt = await accessToken.toJwt();
+  return jwt;
+}
+/**
+ * Opens a WebSocket and resolves once it is connected.
+ *
+ * @param url - WebSocket endpoint
+ * @param headers - headers sent with the handshake request
+ * @param timeoutMs - how long to wait for the connection before giving up
+ * @returns the open socket
+ * @throws APIConnectionError on timeout, on a connection error, or when the socket closes
+ *   with a code other than 1000 before opening
+ * @throws APIStatusError (statusCode 429) when the error object carries `code === 429`
+ */
+export async function connectWs(
+  url: string,
+  headers: Record<string, string>,
+  timeoutMs: number,
+): Promise<WebSocket> {
+  return new Promise<WebSocket>((resolve, reject) => {
+    const socket = new WebSocket(url, { headers });
+    const timeout = setTimeout(() => {
+      reject(new APIConnectionError({ message: 'Timeout connecting to LiveKit WebSocket' }));
+      // Abandon the handshake; otherwise the socket could still open later with nobody
+      // holding a reference to close it. Any error this emits is absorbed by `onError`,
+      // whose reject is a no-op once the promise has settled.
+      socket.terminate();
+    }, timeoutMs);
+    const onOpen = () => {
+      clearTimeout(timeout);
+      resolve(socket);
+    };
+    const onError = (err: unknown) => {
+      clearTimeout(timeout);
+      if (err && typeof err === 'object' && 'code' in err && (err as any).code === 429) {
+        reject(
+          new APIStatusError({
+            message: 'LiveKit gateway quota exceeded',
+            options: { statusCode: 429 },
+          }),
+        );
+      } else {
+        reject(new APIConnectionError({ message: 'Error connecting to LiveKit WebSocket' }));
+      }
+    };
+    const onClose = (code: number) => {
+      clearTimeout(timeout);
+      if (code !== 1000) {
+        reject(
+          new APIConnectionError({
+            message: 'Connection closed unexpectedly',
+          }),
+        );
+      }
+    };
+    // The error/close listeners are intentionally left attached after 'open': they cover
+    // the gap until the caller installs its own handlers, and settling twice is harmless.
+    socket.once('open', onOpen);
+    socket.once('error', onError);
+    socket.once('close', onClose);
+  });
+}

package/src/tts/tts.ts CHANGED Viewed

@@ -443,7 +443,7 @@ export abstract class ChunkedStream implements AsyncIterableIterator<Synthesized
     for await (const audio of this.queue) {
       this.output.put(audio);
       requestId = audio.requestId;
-      if (!ttfb) {
+      if (ttfb === BigInt(-1)) {
         ttfb = process.hrtime.bigint() - startTime;
       }
       audioDuration += audio.frame.samplesPerChannel / audio.frame.sampleRate;

package/src/utils.ts CHANGED Viewed

@@ -817,3 +817,14 @@ export async function waitForTrackPublication({
     room.off(RoomEvent.TrackPublished, onTrackPublished);
   }
 }
+/**
+ * Resolves once `signal` is aborted.
+ *
+ * Resolves immediately when the signal is already aborted: an aborted signal never
+ * dispatches 'abort' again, so waiting for the event alone would hang forever.
+ *
+ * @param signal - the abort signal to wait on
+ */
+export async function waitForAbort(signal: AbortSignal) {
+  if (signal.aborted) return;
+  await new Promise<void>((resolve) => {
+    // `once` detaches the listener after it fires, so no manual removal is needed.
+    signal.addEventListener('abort', () => resolve(), { once: true });
+  });
+}

package/src/voice/agent.ts CHANGED Viewed

@@ -4,6 +4,14 @@
 import type { AudioFrame } from '@livekit/rtc-node';
 import { AsyncLocalStorage } from 'node:async_hooks';
 import { ReadableStream } from 'node:stream/web';
+import {
+  LLM as InferenceLLM,
+  STT as InferenceSTT,
+  TTS as InferenceTTS,
+  type LLMModels,
+  type STTModels,
+  type TTSModels,
+} from '../inference/index.js';
 import { ReadonlyChatContext } from '../llm/chat_context.js';
 import type { ChatMessage, FunctionCall, RealtimeModel } from '../llm/index.js';
 import {
@@ -55,10 +63,10 @@ export interface AgentOptions<UserData> {
   chatCtx?: ChatContext;
   tools?: ToolContext<UserData>;
   turnDetection?: TurnDetectionMode;
-  stt?: STT;
+  stt?: STT | STTModels;
   vad?: VAD;
-  llm?: LLM | RealtimeModel;
-  tts?: TTS;
+  llm?: LLM | RealtimeModel | LLMModels;
+  tts?: TTS | TTSModels;
   allowInterruptions?: boolean;
   minConsecutiveSpeechDelay?: number;
 }
@@ -101,10 +109,26 @@ export class Agent<UserData = any> {
       : ChatContext.empty();
     this.turnDetection = turnDetection;
-    this._stt = stt;
     this._vad = vad;
-    this._llm = llm;
-    this._tts = tts;
+    if (typeof stt === 'string') {
+      this._stt = new InferenceSTT({ model: stt });
+    } else {
+      this._stt = stt;
+    }
+    if (typeof llm === 'string') {
+      this._llm = new InferenceLLM({ model: llm });
+    } else {
+      this._llm = llm;
+    }
+    if (typeof tts === 'string') {
+      this._tts = new InferenceTTS({ model: tts });
+    } else {
+      this._tts = tts;
+    }
     this._agentActivity = undefined;
   }

package/src/voice/agent_session.ts CHANGED Viewed

@@ -5,6 +5,14 @@ import type { AudioFrame, Room } from '@livekit/rtc-node';
 import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
 import { EventEmitter } from 'node:events';
 import type { ReadableStream } from 'node:stream/web';
+import {
+  LLM as InferenceLLM,
+  STT as InferenceSTT,
+  TTS as InferenceTTS,
+  type LLMModels,
+  type STTModels,
+  type TTSModels,
+} from '../inference/index.js';
 import { getJobContext } from '../job.js';
 import { ChatContext, ChatMessage } from '../llm/chat_context.js';
 import type { LLM, RealtimeModel, RealtimeModelError, ToolChoice } from '../llm/index.js';
@@ -77,10 +85,10 @@ export type AgentSessionCallbacks = {
 export type AgentSessionOptions<UserData = UnknownUserData> = {
   turnDetection?: TurnDetectionMode;
-  stt?: STT;
+  stt?: STT | STTModels;
   vad?: VAD;
-  llm?: LLM | RealtimeModel;
-  tts?: TTS;
+  llm?: LLM | RealtimeModel | LLMModels;
+  tts?: TTS | TTSModels;
   userData?: UserData;
   voiceOptions?: Partial<VoiceOptions>;
 };
@@ -128,9 +136,24 @@ export class AgentSession<
     } = opts;
     this.vad = vad;
-    this.stt = stt;
-    this.llm = llm;
-    this.tts = tts;
+    if (typeof stt === 'string') {
+      this.stt = new InferenceSTT({ model: stt });
+    } else {
+      this.stt = stt;
+    }
+    if (typeof llm === 'string') {
+      this.llm = new InferenceLLM({ model: llm });
+    } else {
+      this.llm = llm;
+    }
+    if (typeof tts === 'string') {
+      this.tts = new InferenceTTS({ model: tts });
+    } else {
+      this.tts = tts;
+    }
     this.turnDetection = turnDetection;
     this._userData = userData;

package/src/voice/room_io/_input.ts CHANGED Viewed

@@ -1,8 +1,8 @@
 // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
-import type { AudioFrame } from '@livekit/rtc-node';
 import {
+  AudioFrame,
   AudioStream,
   type NoiseCancellationOptions,
   RemoteParticipant,
@@ -66,6 +66,17 @@ export class ParticipantAudioInputStream extends AudioInput {
         ? participant
         : this.room.remoteParticipants.get(participantIdentity);
+    // Convert Map iterator to array for Pino serialization
+    const trackPublicationsArray = Array.from(participantValue?.trackPublications.values() ?? []);
+    this.logger.info(
+      {
+        participantValue: participantValue?.identity,
+        trackPublications: trackPublicationsArray,
+        lengthOfTrackPublications: trackPublicationsArray.length,
+      },
+      'participantValue.trackPublications',
+    );
     // We need to check if the participant has a microphone track and subscribe to it
     // in case we miss the tracksubscribed event
     if (participantValue) {

package/src/worker.ts CHANGED Viewed

@@ -1,12 +1,7 @@
 // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
-import type {
-  JobAssignment,
-  JobTermination,
-  ParticipantInfo,
-  TrackSource,
-} from '@livekit/protocol';
+import type { JobAssignment, JobTermination, TrackSource } from '@livekit/protocol';
 import {
   type AvailabilityRequest,
   JobType,
@@ -15,7 +10,7 @@ import {
   WorkerMessage,
   WorkerStatus,
 } from '@livekit/protocol';
-import { AccessToken, RoomServiceClient } from 'livekit-server-sdk';
+import { AccessToken, ParticipantInfo, RoomServiceClient } from 'livekit-server-sdk';
 import { EventEmitter } from 'node:events';
 import os from 'node:os';
 import { WebSocket } from 'ws';