@livekit/agents 1.0.6 → 1.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149)
  1. package/dist/index.cjs +3 -0
  2. package/dist/index.cjs.map +1 -1
  3. package/dist/index.d.cts +2 -1
  4. package/dist/index.d.ts +2 -1
  5. package/dist/index.d.ts.map +1 -1
  6. package/dist/index.js +2 -0
  7. package/dist/index.js.map +1 -1
  8. package/dist/inference/api_protos.cjs +104 -0
  9. package/dist/inference/api_protos.cjs.map +1 -0
  10. package/dist/inference/api_protos.d.cts +222 -0
  11. package/dist/inference/api_protos.d.ts +222 -0
  12. package/dist/inference/api_protos.d.ts.map +1 -0
  13. package/dist/inference/api_protos.js +70 -0
  14. package/dist/inference/api_protos.js.map +1 -0
  15. package/dist/inference/index.cjs +56 -0
  16. package/dist/inference/index.cjs.map +1 -0
  17. package/dist/inference/index.d.cts +8 -0
  18. package/dist/inference/index.d.ts +8 -0
  19. package/dist/inference/index.d.ts.map +1 -0
  20. package/dist/inference/index.js +23 -0
  21. package/dist/inference/index.js.map +1 -0
  22. package/dist/inference/llm.cjs +301 -0
  23. package/dist/inference/llm.cjs.map +1 -0
  24. package/dist/inference/llm.d.cts +107 -0
  25. package/dist/inference/llm.d.ts +107 -0
  26. package/dist/inference/llm.d.ts.map +1 -0
  27. package/dist/inference/llm.js +272 -0
  28. package/dist/inference/llm.js.map +1 -0
  29. package/dist/inference/stt.cjs +313 -0
  30. package/dist/inference/stt.cjs.map +1 -0
  31. package/dist/inference/stt.d.cts +87 -0
  32. package/dist/inference/stt.d.ts +87 -0
  33. package/dist/inference/stt.d.ts.map +1 -0
  34. package/dist/inference/stt.js +292 -0
  35. package/dist/inference/stt.js.map +1 -0
  36. package/dist/inference/tts.cjs +324 -0
  37. package/dist/inference/tts.cjs.map +1 -0
  38. package/dist/inference/tts.d.cts +77 -0
  39. package/dist/inference/tts.d.ts +77 -0
  40. package/dist/inference/tts.d.ts.map +1 -0
  41. package/dist/inference/tts.js +306 -0
  42. package/dist/inference/tts.js.map +1 -0
  43. package/dist/inference/utils.cjs +76 -0
  44. package/dist/inference/utils.cjs.map +1 -0
  45. package/dist/inference/utils.d.cts +5 -0
  46. package/dist/inference/utils.d.ts +5 -0
  47. package/dist/inference/utils.d.ts.map +1 -0
  48. package/dist/inference/utils.js +51 -0
  49. package/dist/inference/utils.js.map +1 -0
  50. package/dist/llm/remote_chat_context.cjs.map +1 -1
  51. package/dist/llm/remote_chat_context.d.cts +2 -0
  52. package/dist/llm/remote_chat_context.d.ts +2 -0
  53. package/dist/llm/remote_chat_context.d.ts.map +1 -1
  54. package/dist/llm/remote_chat_context.js.map +1 -1
  55. package/dist/tts/tts.cjs +1 -1
  56. package/dist/tts/tts.cjs.map +1 -1
  57. package/dist/tts/tts.js +1 -1
  58. package/dist/tts/tts.js.map +1 -1
  59. package/dist/utils.cjs +11 -0
  60. package/dist/utils.cjs.map +1 -1
  61. package/dist/utils.d.cts +1 -0
  62. package/dist/utils.d.ts +1 -0
  63. package/dist/utils.d.ts.map +1 -1
  64. package/dist/utils.js +10 -0
  65. package/dist/utils.js.map +1 -1
  66. package/dist/voice/agent.cjs +16 -3
  67. package/dist/voice/agent.cjs.map +1 -1
  68. package/dist/voice/agent.d.cts +5 -3
  69. package/dist/voice/agent.d.ts +5 -3
  70. package/dist/voice/agent.d.ts.map +1 -1
  71. package/dist/voice/agent.js +20 -3
  72. package/dist/voice/agent.js.map +1 -1
  73. package/dist/voice/agent_activity.cjs +4 -2
  74. package/dist/voice/agent_activity.cjs.map +1 -1
  75. package/dist/voice/agent_activity.d.ts.map +1 -1
  76. package/dist/voice/agent_activity.js +4 -2
  77. package/dist/voice/agent_activity.js.map +1 -1
  78. package/dist/voice/agent_session.cjs +16 -3
  79. package/dist/voice/agent_session.cjs.map +1 -1
  80. package/dist/voice/agent_session.d.cts +4 -3
  81. package/dist/voice/agent_session.d.ts +4 -3
  82. package/dist/voice/agent_session.d.ts.map +1 -1
  83. package/dist/voice/agent_session.js +20 -3
  84. package/dist/voice/agent_session.js.map +1 -1
  85. package/dist/voice/events.cjs +2 -0
  86. package/dist/voice/events.cjs.map +1 -1
  87. package/dist/voice/events.d.cts +4 -1
  88. package/dist/voice/events.d.ts +4 -1
  89. package/dist/voice/events.d.ts.map +1 -1
  90. package/dist/voice/events.js +2 -0
  91. package/dist/voice/events.js.map +1 -1
  92. package/dist/voice/generation.cjs.map +1 -1
  93. package/dist/voice/generation.d.cts +1 -0
  94. package/dist/voice/generation.d.ts +1 -0
  95. package/dist/voice/generation.d.ts.map +1 -1
  96. package/dist/voice/generation.js.map +1 -1
  97. package/dist/voice/room_io/_input.cjs.map +1 -1
  98. package/dist/voice/room_io/_input.d.ts.map +1 -1
  99. package/dist/voice/room_io/_input.js +1 -0
  100. package/dist/voice/room_io/_input.js.map +1 -1
  101. package/dist/voice/room_io/_output.cjs +1 -1
  102. package/dist/voice/room_io/_output.cjs.map +1 -1
  103. package/dist/voice/room_io/_output.d.cts +1 -0
  104. package/dist/voice/room_io/_output.d.ts +1 -0
  105. package/dist/voice/room_io/_output.d.ts.map +1 -1
  106. package/dist/voice/room_io/_output.js +1 -1
  107. package/dist/voice/room_io/_output.js.map +1 -1
  108. package/dist/voice/room_io/room_io.cjs +1 -1
  109. package/dist/voice/room_io/room_io.cjs.map +1 -1
  110. package/dist/voice/room_io/room_io.d.cts +20 -0
  111. package/dist/voice/room_io/room_io.d.ts +20 -0
  112. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  113. package/dist/voice/room_io/room_io.js +1 -1
  114. package/dist/voice/room_io/room_io.js.map +1 -1
  115. package/dist/voice/transcription/synchronizer.cjs +1 -1
  116. package/dist/voice/transcription/synchronizer.cjs.map +1 -1
  117. package/dist/voice/transcription/synchronizer.d.cts +1 -0
  118. package/dist/voice/transcription/synchronizer.d.ts +1 -0
  119. package/dist/voice/transcription/synchronizer.d.ts.map +1 -1
  120. package/dist/voice/transcription/synchronizer.js +1 -1
  121. package/dist/voice/transcription/synchronizer.js.map +1 -1
  122. package/dist/worker.cjs +3 -3
  123. package/dist/worker.cjs.map +1 -1
  124. package/dist/worker.d.cts +3 -0
  125. package/dist/worker.d.ts +3 -0
  126. package/dist/worker.d.ts.map +1 -1
  127. package/dist/worker.js +4 -4
  128. package/dist/worker.js.map +1 -1
  129. package/package.json +3 -2
  130. package/src/index.ts +2 -1
  131. package/src/inference/api_protos.ts +82 -0
  132. package/src/inference/index.ts +32 -0
  133. package/src/inference/llm.ts +464 -0
  134. package/src/inference/stt.ts +444 -0
  135. package/src/inference/tts.ts +432 -0
  136. package/src/inference/utils.ts +66 -0
  137. package/src/llm/remote_chat_context.ts +2 -2
  138. package/src/tts/tts.ts +1 -1
  139. package/src/utils.ts +11 -0
  140. package/src/voice/agent.ts +31 -7
  141. package/src/voice/agent_activity.ts +2 -0
  142. package/src/voice/agent_session.ts +30 -6
  143. package/src/voice/events.ts +6 -0
  144. package/src/voice/generation.ts +1 -1
  145. package/src/voice/room_io/_input.ts +1 -1
  146. package/src/voice/room_io/_output.ts +1 -1
  147. package/src/voice/room_io/room_io.ts +21 -2
  148. package/src/voice/transcription/synchronizer.ts +1 -1
  149. package/src/worker.ts +5 -10
@@ -0,0 +1,432 @@
1
+ // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import type { AudioFrame } from '@livekit/rtc-node';
5
+ import { WebSocket } from 'ws';
6
+ import { APIError, APIStatusError } from '../_exceptions.js';
7
+ import { AudioByteStream } from '../audio.js';
8
+ import { log } from '../log.js';
9
+ import { createStreamChannel } from '../stream/stream_channel.js';
10
+ import { basic as tokenizeBasic } from '../tokenize/index.js';
11
+ import {
12
+ SynthesizeStream as BaseSynthesizeStream,
13
+ TTS as BaseTTS,
14
+ ChunkedStream,
15
+ } from '../tts/index.js';
16
+ import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';
17
+ import { shortuuid } from '../utils.js';
18
+ import {
19
+ type TtsClientEvent,
20
+ type TtsServerEvent,
21
+ type TtsSessionCreateEvent,
22
+ ttsClientEventSchema,
23
+ ttsServerEventSchema,
24
+ } from './api_protos.js';
25
+ import { type AnyString, connectWs, createAccessToken } from './utils.js';
26
+
27
/** Cartesia model identifiers accepted by the gateway (bare provider or provider/model). */
export type CartesiaModels =
  | 'cartesia'
  | 'cartesia/sonic'
  | 'cartesia/sonic-2'
  | 'cartesia/sonic-turbo';

/** ElevenLabs model identifiers accepted by the gateway. */
export type ElevenlabsModels =
  | 'elevenlabs'
  | 'elevenlabs/eleven_flash_v2'
  | 'elevenlabs/eleven_flash_v2_5'
  | 'elevenlabs/eleven_turbo_v2'
  | 'elevenlabs/eleven_turbo_v2_5'
  | 'elevenlabs/eleven_multilingual_v2';

/** Rime model identifiers accepted by the gateway. */
export type RimeModels = 'rime' | 'rime/mist' | 'rime/mistv2' | 'rime/arcana';

/** Inworld model identifiers accepted by the gateway. */
export type InworldModels = 'inworld' | 'inworld/inworld-tts-1';

/** Provider-specific options for Cartesia models. */
export interface CartesiaOptions {
  duration?: number; // max duration of audio in seconds
  speed?: 'slow' | 'normal' | 'fast'; // default: not specified
}

/** Provider-specific options for ElevenLabs models. */
export interface ElevenlabsOptions {
  inactivity_timeout?: number; // default: 60
  apply_text_normalization?: 'auto' | 'off' | 'on'; // default: "auto"
}

/** Provider-specific options for Rime models (none defined yet). */
export interface RimeOptions {}

/** Provider-specific options for Inworld models (none defined yet). */
export interface InworldOptions {}

// Closed union of the known model strings (no AnyString escape hatch);
// used to build the template-literal `model:voice` form below.
type _TTSModels = CartesiaModels | ElevenlabsModels | RimeModels | InworldModels;

/** Any known model string, or an arbitrary string (autocomplete preserved via AnyString). */
export type TTSModels = CartesiaModels | ElevenlabsModels | RimeModels | InworldModels | AnyString;

/** A model string optionally suffixed with a voice id: "provider/model:voice_id". */
export type ModelWithVoice = `${_TTSModels}:${string}` | TTSModels;
64
+
65
+ export type TTSOptions<TModel extends TTSModels> = TModel extends CartesiaModels
66
+ ? CartesiaOptions
67
+ : TModel extends ElevenlabsModels
68
+ ? ElevenlabsOptions
69
+ : TModel extends RimeOptions
70
+ ? RimeOptions
71
+ : TModel extends InworldOptions
72
+ ? InworldOptions
73
+ : Record<string, unknown>;
74
+
75
// Only raw 16-bit little-endian PCM is supported by the gateway today.
type TTSEncoding = 'pcm_s16le';

const DEFAULT_ENCODING: TTSEncoding = 'pcm_s16le';
const DEFAULT_SAMPLE_RATE = 16000;
const DEFAULT_BASE_URL = 'https://agent-gateway.livekit.cloud/v1';
const NUM_CHANNELS = 1; // gateway streams mono audio
const DEFAULT_LANGUAGE = 'en';

/**
 * Fully-resolved configuration held by the TTS instance and passed to each
 * SynthesizeStream. Unlike the constructor options, credentials and transport
 * fields here are required (they have been resolved from args/env by then).
 */
export interface InferenceTTSOptions<TModel extends TTSModels> {
  model?: TModel;
  voice?: string;
  language?: string;
  encoding: TTSEncoding;
  sampleRate: number;
  baseURL: string;
  apiKey: string;
  apiSecret: string;
  modelOptions: TTSOptions<TModel>;
}
94
+
95
+ /**
96
+ * Livekit Cloud Inference TTS
97
+ */
98
+ export class TTS<TModel extends TTSModels> extends BaseTTS {
99
+ private opts: InferenceTTSOptions<TModel>;
100
+ private streams: Set<SynthesizeStream<TModel>> = new Set();
101
+
102
+ #logger = log();
103
+
104
+ constructor(opts: {
105
+ model: TModel;
106
+ voice?: string;
107
+ language?: string;
108
+ baseURL?: string;
109
+ encoding?: TTSEncoding;
110
+ sampleRate?: number;
111
+ apiKey?: string;
112
+ apiSecret?: string;
113
+ modelOptions?: TTSOptions<TModel>;
114
+ }) {
115
+ const sampleRate = opts?.sampleRate ?? DEFAULT_SAMPLE_RATE;
116
+ super(sampleRate, 1, { streaming: true });
117
+
118
+ const {
119
+ model,
120
+ voice,
121
+ language = DEFAULT_LANGUAGE,
122
+ baseURL,
123
+ encoding = DEFAULT_ENCODING,
124
+ apiKey,
125
+ apiSecret,
126
+ modelOptions = {} as TTSOptions<TModel>,
127
+ } = opts || {};
128
+
129
+ const lkBaseURL = baseURL || process.env.LIVEKIT_INFERENCE_URL || DEFAULT_BASE_URL;
130
+ const lkApiKey = apiKey || process.env.LIVEKIT_INFERENCE_API_KEY || process.env.LIVEKIT_API_KEY;
131
+ if (!lkApiKey) {
132
+ throw new Error('apiKey is required: pass apiKey or set LIVEKIT_API_KEY');
133
+ }
134
+
135
+ const lkApiSecret =
136
+ apiSecret || process.env.LIVEKIT_INFERENCE_API_SECRET || process.env.LIVEKIT_API_SECRET;
137
+ if (!lkApiSecret) {
138
+ throw new Error('apiSecret is required: pass apiSecret or set LIVEKIT_API_SECRET');
139
+ }
140
+
141
+ // read voice id from the model if provided: "provider/model:voice_id"
142
+ let nextModel = model;
143
+ let nextVoice = voice;
144
+ if (typeof nextModel === 'string') {
145
+ const idx = nextModel.lastIndexOf(':');
146
+ if (idx !== -1) {
147
+ const voiceFromModel = nextModel.slice(idx + 1);
148
+ if (nextVoice && nextVoice !== voiceFromModel) {
149
+ this.#logger.warn(
150
+ '`voice` is provided via both argument and model, using the one from the argument',
151
+ { voice: nextVoice, model: nextModel },
152
+ );
153
+ } else {
154
+ nextVoice = voiceFromModel;
155
+ }
156
+ nextModel = nextModel.slice(0, idx) as TModel;
157
+ }
158
+ }
159
+
160
+ this.opts = {
161
+ model: nextModel,
162
+ voice: nextVoice,
163
+ language,
164
+ encoding,
165
+ sampleRate,
166
+ baseURL: lkBaseURL,
167
+ apiKey: lkApiKey,
168
+ apiSecret: lkApiSecret,
169
+ modelOptions,
170
+ };
171
+ }
172
+
173
+ get label() {
174
+ return 'inference.TTS';
175
+ }
176
+
177
+ static fromModelString(modelString: string): TTS<AnyString> {
178
+ if (modelString.includes(':')) {
179
+ const [model, voice] = modelString.split(':') as [TTSModels, string];
180
+ return new TTS({ model, voice });
181
+ }
182
+ return new TTS({ model: modelString });
183
+ }
184
+
185
+ updateOptions(opts: Partial<Pick<InferenceTTSOptions<TModel>, 'model' | 'voice' | 'language'>>) {
186
+ this.opts = { ...this.opts, ...opts };
187
+ for (const stream of this.streams) {
188
+ stream.updateOptions(opts);
189
+ }
190
+ }
191
+
192
+ synthesize(_: string): ChunkedStream {
193
+ throw new Error('ChunkedStream is not implemented');
194
+ }
195
+
196
+ stream(options?: { connOptions?: APIConnectOptions }): SynthesizeStream<TModel> {
197
+ const { connOptions = DEFAULT_API_CONNECT_OPTIONS } = options || {};
198
+ const stream = new SynthesizeStream(this, { ...this.opts }, connOptions);
199
+ this.streams.add(stream);
200
+ return stream;
201
+ }
202
+
203
+ async connectWs(timeout: number): Promise<WebSocket> {
204
+ let baseURL = this.opts.baseURL;
205
+ if (baseURL.startsWith('http://') || baseURL.startsWith('https://')) {
206
+ baseURL = baseURL.replace('http', 'ws');
207
+ }
208
+
209
+ const token = await createAccessToken(this.opts.apiKey, this.opts.apiSecret);
210
+ const url = `${baseURL}/tts`;
211
+ const headers = { Authorization: `Bearer ${token}` } as Record<string, string>;
212
+
213
+ const params = {
214
+ type: 'session.create',
215
+ sample_rate: String(this.opts.sampleRate),
216
+ encoding: this.opts.encoding,
217
+ extra: this.opts.modelOptions,
218
+ } as TtsSessionCreateEvent;
219
+
220
+ if (this.opts.voice) params.voice = this.opts.voice;
221
+ if (this.opts.model) params.model = this.opts.model;
222
+ if (this.opts.language) params.language = this.opts.language;
223
+
224
+ const socket = await connectWs(url, headers, timeout);
225
+ socket.send(JSON.stringify(params));
226
+ return socket;
227
+ }
228
+
229
+ async closeWs(ws: WebSocket) {
230
+ await ws.close();
231
+ }
232
+
233
+ async close() {
234
+ for (const stream of this.streams) {
235
+ await stream.close();
236
+ }
237
+ this.streams.clear();
238
+ }
239
+ }
240
+
241
+ export class SynthesizeStream<TModel extends TTSModels> extends BaseSynthesizeStream {
242
+ private opts: InferenceTTSOptions<TModel>;
243
+ private tts: TTS<TModel>;
244
+ private connOptions: APIConnectOptions;
245
+
246
+ #logger = log();
247
+
248
+ constructor(tts: TTS<TModel>, opts: InferenceTTSOptions<TModel>, connOptions: APIConnectOptions) {
249
+ super(tts, connOptions);
250
+ this.opts = opts;
251
+ this.tts = tts;
252
+ this.connOptions = connOptions;
253
+ }
254
+
255
+ get label() {
256
+ return 'inference.SynthesizeStream';
257
+ }
258
+
259
+ updateOptions(opts: Partial<Pick<InferenceTTSOptions<TModel>, 'model' | 'voice' | 'language'>>) {
260
+ this.opts = { ...this.opts, ...opts };
261
+ }
262
+
263
+ protected async run(): Promise<void> {
264
+ let ws: WebSocket | null = null;
265
+ let closing = false;
266
+ let finalReceived = false;
267
+ let lastFrame: AudioFrame | undefined;
268
+
269
+ const sendTokenizerStream = new tokenizeBasic.SentenceTokenizer().stream();
270
+ const eventChannel = createStreamChannel<TtsServerEvent>();
271
+ const requestId = shortuuid('tts_request_');
272
+
273
+ const resourceCleanup = () => {
274
+ if (closing) return;
275
+ closing = true;
276
+ sendTokenizerStream.close();
277
+ eventChannel.close();
278
+ ws?.removeAllListeners();
279
+ ws?.close();
280
+ };
281
+
282
+ const sendClientEvent = async (event: TtsClientEvent) => {
283
+ const validatedEvent = await ttsClientEventSchema.parseAsync(event);
284
+ if (!ws || ws.readyState !== WebSocket.OPEN) {
285
+ this.#logger.warn('Trying to send client TTS event to a closed WebSocket');
286
+ return;
287
+ }
288
+ ws.send(JSON.stringify(validatedEvent));
289
+ };
290
+
291
+ const sendLastFrame = (segmentId: string, final: boolean) => {
292
+ if (lastFrame) {
293
+ this.queue.put({ requestId, segmentId, frame: lastFrame, final });
294
+ lastFrame = undefined;
295
+ }
296
+ };
297
+
298
+ const createInputTask = async () => {
299
+ for await (const data of this.input) {
300
+ if (this.abortController.signal.aborted) break;
301
+ if (data === SynthesizeStream.FLUSH_SENTINEL) {
302
+ sendTokenizerStream.flush();
303
+ continue;
304
+ }
305
+ sendTokenizerStream.pushText(data);
306
+ }
307
+ sendTokenizerStream.endInput();
308
+ };
309
+
310
+ const createSentenceStreamTask = async () => {
311
+ for await (const ev of sendTokenizerStream) {
312
+ if (this.abortController.signal.aborted) break;
313
+
314
+ sendClientEvent({
315
+ type: 'input_transcript',
316
+ transcript: ev.token + ' ',
317
+ });
318
+ }
319
+
320
+ sendClientEvent({ type: 'session.flush' });
321
+ };
322
+
323
+ const createWsListenerTask = async (ws: WebSocket) => {
324
+ return new Promise<void>((resolve, reject) => {
325
+ this.abortController.signal.addEventListener('abort', () => {
326
+ resourceCleanup();
327
+ reject(new Error('WebSocket connection aborted'));
328
+ });
329
+
330
+ ws.on('message', async (data) => {
331
+ const eventJson = JSON.parse(data.toString()) as Record<string, unknown>;
332
+ const validatedEvent = ttsServerEventSchema.parse(eventJson);
333
+ eventChannel.write(validatedEvent);
334
+ });
335
+
336
+ ws.on('error', (e) => {
337
+ this.#logger.error({ error: e }, 'WebSocket error');
338
+ resourceCleanup();
339
+ reject(e);
340
+ });
341
+
342
+ ws.on('close', () => {
343
+ resourceCleanup();
344
+
345
+ if (!closing) return this.#logger.error('WebSocket closed unexpectedly');
346
+ if (finalReceived) return resolve();
347
+
348
+ reject(
349
+ new APIStatusError({
350
+ message: 'Gateway connection closed unexpectedly',
351
+ options: { requestId },
352
+ }),
353
+ );
354
+ });
355
+ });
356
+ };
357
+
358
+ const createRecvTask = async () => {
359
+ let currentSessionId: string | null = null;
360
+
361
+ const bstream = new AudioByteStream(this.opts.sampleRate, NUM_CHANNELS);
362
+ const serverEventStream = eventChannel.stream();
363
+ const reader = serverEventStream.getReader();
364
+
365
+ try {
366
+ while (!this.closed && !this.abortController.signal.aborted) {
367
+ const result = await reader.read();
368
+ if (this.abortController.signal.aborted) return;
369
+ if (result.done) return;
370
+
371
+ const serverEvent = result.value;
372
+ switch (serverEvent.type) {
373
+ case 'session.created':
374
+ currentSessionId = serverEvent.session_id;
375
+ break;
376
+ case 'output_audio':
377
+ const base64Data = new Int8Array(Buffer.from(serverEvent.audio, 'base64'));
378
+ for (const frame of bstream.write(base64Data.buffer)) {
379
+ sendLastFrame(currentSessionId!, false);
380
+ lastFrame = frame;
381
+ }
382
+ break;
383
+ case 'done':
384
+ finalReceived = true;
385
+ for (const frame of bstream.flush()) {
386
+ sendLastFrame(currentSessionId!, false);
387
+ lastFrame = frame;
388
+ }
389
+ sendLastFrame(currentSessionId!, true);
390
+ this.queue.put(SynthesizeStream.END_OF_STREAM);
391
+ break;
392
+ case 'session.closed':
393
+ resourceCleanup();
394
+ break;
395
+ case 'error':
396
+ this.#logger.error(
397
+ { serverEvent },
398
+ 'Received error message from LiveKit TTS WebSocket',
399
+ );
400
+ resourceCleanup();
401
+ throw new APIError(`LiveKit TTS returned error: ${serverEvent.message}`);
402
+ default:
403
+ this.#logger.warn('Unexpected message %s', serverEvent);
404
+ break;
405
+ }
406
+ }
407
+ } finally {
408
+ reader.releaseLock();
409
+ try {
410
+ await serverEventStream.cancel();
411
+ } catch (e) {
412
+ this.#logger.debug('Error cancelling serverEventStream (may already be cancelled):', e);
413
+ }
414
+ }
415
+ };
416
+
417
+ try {
418
+ ws = await this.tts.connectWs(this.connOptions.timeoutMs);
419
+
420
+ await Promise.all([
421
+ createInputTask(),
422
+ createSentenceStreamTask(),
423
+ createWsListenerTask(ws),
424
+ createRecvTask(),
425
+ ]);
426
+ } catch (e) {
427
+ this.#logger.error('Error in SynthesizeStream', { error: e });
428
+ } finally {
429
+ resourceCleanup();
430
+ }
431
+ }
432
+ }
@@ -0,0 +1,66 @@
1
+ // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import { AccessToken } from 'livekit-server-sdk';
5
+ import { WebSocket } from 'ws';
6
+ import { APIConnectionError, APIStatusError } from '../index.js';
7
+
8
// The "LiteralUnion" trick: `string & NonNullable<unknown>` accepts any string
// while keeping editor autocomplete for the literal members it is unioned with.
export type AnyString = string & NonNullable<unknown>;
9
+
10
+ export async function createAccessToken(
11
+ apiKey: string,
12
+ apiSecret: string,
13
+ ttl: number = 600,
14
+ ): Promise<string> {
15
+ const token = new AccessToken(apiKey, apiSecret, { identity: 'agent', ttl });
16
+ token.addInferenceGrant({ perform: true });
17
+
18
+ return await token.toJwt();
19
+ }
20
+
21
+ export async function connectWs(
22
+ url: string,
23
+ headers: Record<string, string>,
24
+ timeoutMs: number,
25
+ ): Promise<WebSocket> {
26
+ return new Promise<WebSocket>((resolve, reject) => {
27
+ const socket = new WebSocket(url, { headers: headers });
28
+
29
+ const timeout = setTimeout(() => {
30
+ reject(new APIConnectionError({ message: 'Timeout connecting to LiveKit WebSocket' }));
31
+ }, timeoutMs);
32
+
33
+ const onOpen = () => {
34
+ clearTimeout(timeout);
35
+ resolve(socket);
36
+ };
37
+
38
+ const onError = (err: unknown) => {
39
+ clearTimeout(timeout);
40
+ if (err && typeof err === 'object' && 'code' in err && (err as any).code === 429) {
41
+ reject(
42
+ new APIStatusError({
43
+ message: 'LiveKit gateway quota exceeded',
44
+ options: { statusCode: 429 },
45
+ }),
46
+ );
47
+ } else {
48
+ reject(new APIConnectionError({ message: 'Error connecting to LiveKit WebSocket' }));
49
+ }
50
+ };
51
+
52
+ const onClose = (code: number) => {
53
+ clearTimeout(timeout);
54
+ if (code !== 1000) {
55
+ reject(
56
+ new APIConnectionError({
57
+ message: 'Connection closed unexpectedly',
58
+ }),
59
+ );
60
+ }
61
+ };
62
+ socket.once('open', onOpen);
63
+ socket.once('error', onError);
64
+ socket.once('close', onClose);
65
+ });
66
+ }
@@ -6,9 +6,9 @@ import type { ChatItem } from './chat_context.js';
6
6
 
7
7
  export interface RemoteChatItem {
8
8
  item: ChatItem;
9
- /* @internal */
9
+ /** @internal */
10
10
  _prev?: RemoteChatItem | null;
11
- /* @internal */
11
+ /** @internal */
12
12
  _next?: RemoteChatItem | null;
13
13
  }
14
14
 
package/src/tts/tts.ts CHANGED
@@ -443,7 +443,7 @@ export abstract class ChunkedStream implements AsyncIterableIterator<Synthesized
443
443
  for await (const audio of this.queue) {
444
444
  this.output.put(audio);
445
445
  requestId = audio.requestId;
446
- if (!ttfb) {
446
+ if (ttfb === BigInt(-1)) {
447
447
  ttfb = process.hrtime.bigint() - startTime;
448
448
  }
449
449
  audioDuration += audio.frame.samplesPerChannel / audio.frame.sampleRate;
package/src/utils.ts CHANGED
@@ -817,3 +817,14 @@ export async function waitForTrackPublication({
817
817
  room.off(RoomEvent.TrackPublished, onTrackPublished);
818
818
  }
819
819
  }
820
+
821
+ export async function waitForAbort(signal: AbortSignal) {
822
+ const abortFuture = new Future<void>();
823
+ const handler = () => {
824
+ abortFuture.resolve();
825
+ signal.removeEventListener('abort', handler);
826
+ };
827
+
828
+ signal.addEventListener('abort', handler, { once: true });
829
+ return await abortFuture.await;
830
+ }
@@ -4,6 +4,14 @@
4
4
  import type { AudioFrame } from '@livekit/rtc-node';
5
5
  import { AsyncLocalStorage } from 'node:async_hooks';
6
6
  import { ReadableStream } from 'node:stream/web';
7
+ import {
8
+ LLM as InferenceLLM,
9
+ STT as InferenceSTT,
10
+ TTS as InferenceTTS,
11
+ type LLMModels,
12
+ type STTModelString,
13
+ type TTSModelString,
14
+ } from '../inference/index.js';
7
15
  import { ReadonlyChatContext } from '../llm/chat_context.js';
8
16
  import type { ChatMessage, FunctionCall, RealtimeModel } from '../llm/index.js';
9
17
  import {
@@ -46,7 +54,7 @@ export function isStopResponse(value: unknown): value is StopResponse {
46
54
  }
47
55
 
48
56
  export interface ModelSettings {
49
- /* The tool choice to use when calling the LLM. */
57
+ /** The tool choice to use when calling the LLM. */
50
58
  toolChoice?: ToolChoice;
51
59
  }
52
60
 
@@ -55,10 +63,10 @@ export interface AgentOptions<UserData> {
55
63
  chatCtx?: ChatContext;
56
64
  tools?: ToolContext<UserData>;
57
65
  turnDetection?: TurnDetectionMode;
58
- stt?: STT;
66
+ stt?: STT | STTModelString;
59
67
  vad?: VAD;
60
- llm?: LLM | RealtimeModel;
61
- tts?: TTS;
68
+ llm?: LLM | RealtimeModel | LLMModels;
69
+ tts?: TTS | TTSModelString;
62
70
  allowInterruptions?: boolean;
63
71
  minConsecutiveSpeechDelay?: number;
64
72
  }
@@ -101,10 +109,26 @@ export class Agent<UserData = any> {
101
109
  : ChatContext.empty();
102
110
 
103
111
  this.turnDetection = turnDetection;
104
- this._stt = stt;
105
112
  this._vad = vad;
106
- this._llm = llm;
107
- this._tts = tts;
113
+
114
+ if (typeof stt === 'string') {
115
+ this._stt = InferenceSTT.fromModelString(stt);
116
+ } else {
117
+ this._stt = stt;
118
+ }
119
+
120
+ if (typeof llm === 'string') {
121
+ this._llm = InferenceLLM.fromModelString(llm);
122
+ } else {
123
+ this._llm = llm;
124
+ }
125
+
126
+ if (typeof tts === 'string') {
127
+ this._tts = InferenceTTS.fromModelString(tts);
128
+ } else {
129
+ this._tts = tts;
130
+ }
131
+
108
132
  this._agentActivity = undefined;
109
133
  }
110
134
 
@@ -641,6 +641,7 @@ export class AgentActivity implements RecognitionHooks {
641
641
  createUserInputTranscribedEvent({
642
642
  transcript: ev.alternatives![0].text,
643
643
  isFinal: false,
644
+ language: ev.alternatives![0].language,
644
645
  // TODO(AJS-106): add multi participant support
645
646
  }),
646
647
  );
@@ -657,6 +658,7 @@ export class AgentActivity implements RecognitionHooks {
657
658
  createUserInputTranscribedEvent({
658
659
  transcript: ev.alternatives![0].text,
659
660
  isFinal: true,
661
+ language: ev.alternatives![0].language,
660
662
  // TODO(AJS-106): add multi participant support
661
663
  }),
662
664
  );
@@ -5,6 +5,14 @@ import type { AudioFrame, Room } from '@livekit/rtc-node';
5
5
  import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
6
6
  import { EventEmitter } from 'node:events';
7
7
  import type { ReadableStream } from 'node:stream/web';
8
+ import {
9
+ LLM as InferenceLLM,
10
+ STT as InferenceSTT,
11
+ TTS as InferenceTTS,
12
+ type LLMModels,
13
+ type STTModelString,
14
+ type TTSModelString,
15
+ } from '../inference/index.js';
8
16
  import { getJobContext } from '../job.js';
9
17
  import { ChatContext, ChatMessage } from '../llm/chat_context.js';
10
18
  import type { LLM, RealtimeModel, RealtimeModelError, ToolChoice } from '../llm/index.js';
@@ -77,10 +85,10 @@ export type AgentSessionCallbacks = {
77
85
 
78
86
  export type AgentSessionOptions<UserData = UnknownUserData> = {
79
87
  turnDetection?: TurnDetectionMode;
80
- stt?: STT;
88
+ stt?: STT | STTModelString;
81
89
  vad?: VAD;
82
- llm?: LLM | RealtimeModel;
83
- tts?: TTS;
90
+ llm?: LLM | RealtimeModel | LLMModels;
91
+ tts?: TTS | TTSModelString;
84
92
  userData?: UserData;
85
93
  voiceOptions?: Partial<VoiceOptions>;
86
94
  };
@@ -128,9 +136,25 @@ export class AgentSession<
128
136
  } = opts;
129
137
 
130
138
  this.vad = vad;
131
- this.stt = stt;
132
- this.llm = llm;
133
- this.tts = tts;
139
+
140
+ if (typeof stt === 'string') {
141
+ this.stt = InferenceSTT.fromModelString(stt);
142
+ } else {
143
+ this.stt = stt;
144
+ }
145
+
146
+ if (typeof llm === 'string') {
147
+ this.llm = InferenceLLM.fromModelString(llm);
148
+ } else {
149
+ this.llm = llm;
150
+ }
151
+
152
+ if (typeof tts === 'string') {
153
+ this.tts = InferenceTTS.fromModelString(tts);
154
+ } else {
155
+ this.tts = tts;
156
+ }
157
+
134
158
  this.turnDetection = turnDetection;
135
159
  this._userData = userData;
136
160