@livekit/agents 1.0.41 → 1.0.43

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. package/dist/inference/index.cjs +8 -0
  2. package/dist/inference/index.cjs.map +1 -1
  3. package/dist/inference/index.d.cts +2 -2
  4. package/dist/inference/index.d.ts +2 -2
  5. package/dist/inference/index.d.ts.map +1 -1
  6. package/dist/inference/index.js +8 -0
  7. package/dist/inference/index.js.map +1 -1
  8. package/dist/inference/stt.cjs +51 -10
  9. package/dist/inference/stt.cjs.map +1 -1
  10. package/dist/inference/stt.d.cts +33 -0
  11. package/dist/inference/stt.d.ts +33 -0
  12. package/dist/inference/stt.d.ts.map +1 -1
  13. package/dist/inference/stt.js +48 -9
  14. package/dist/inference/stt.js.map +1 -1
  15. package/dist/inference/stt.test.cjs +204 -0
  16. package/dist/inference/stt.test.cjs.map +1 -0
  17. package/dist/inference/stt.test.js +203 -0
  18. package/dist/inference/stt.test.js.map +1 -0
  19. package/dist/inference/tts.cjs +52 -10
  20. package/dist/inference/tts.cjs.map +1 -1
  21. package/dist/inference/tts.d.cts +22 -0
  22. package/dist/inference/tts.d.ts +22 -0
  23. package/dist/inference/tts.d.ts.map +1 -1
  24. package/dist/inference/tts.js +49 -9
  25. package/dist/inference/tts.js.map +1 -1
  26. package/dist/inference/tts.test.cjs +223 -0
  27. package/dist/inference/tts.test.cjs.map +1 -0
  28. package/dist/inference/tts.test.js +222 -0
  29. package/dist/inference/tts.test.js.map +1 -0
  30. package/dist/ipc/inference_proc_lazy_main.cjs +13 -1
  31. package/dist/ipc/inference_proc_lazy_main.cjs.map +1 -1
  32. package/dist/ipc/inference_proc_lazy_main.js +13 -1
  33. package/dist/ipc/inference_proc_lazy_main.js.map +1 -1
  34. package/dist/ipc/job_proc_lazy_main.cjs +2 -1
  35. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  36. package/dist/ipc/job_proc_lazy_main.js +2 -1
  37. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  38. package/dist/ipc/supervised_proc.cjs.map +1 -1
  39. package/dist/ipc/supervised_proc.d.cts +7 -0
  40. package/dist/ipc/supervised_proc.d.ts +7 -0
  41. package/dist/ipc/supervised_proc.d.ts.map +1 -1
  42. package/dist/ipc/supervised_proc.js.map +1 -1
  43. package/dist/stt/stt.cjs.map +1 -1
  44. package/dist/stt/stt.d.cts +7 -0
  45. package/dist/stt/stt.d.ts +7 -0
  46. package/dist/stt/stt.d.ts.map +1 -1
  47. package/dist/stt/stt.js.map +1 -1
  48. package/dist/transcription.cjs.map +1 -1
  49. package/dist/transcription.d.cts +6 -0
  50. package/dist/transcription.d.ts +6 -0
  51. package/dist/transcription.d.ts.map +1 -1
  52. package/dist/transcription.js.map +1 -1
  53. package/dist/vad.cjs +1 -1
  54. package/dist/vad.cjs.map +1 -1
  55. package/dist/vad.d.cts +3 -2
  56. package/dist/vad.d.ts +3 -2
  57. package/dist/vad.d.ts.map +1 -1
  58. package/dist/vad.js +1 -1
  59. package/dist/vad.js.map +1 -1
  60. package/dist/voice/agent_activity.cjs +1 -2
  61. package/dist/voice/agent_activity.cjs.map +1 -1
  62. package/dist/voice/agent_activity.js +1 -2
  63. package/dist/voice/agent_activity.js.map +1 -1
  64. package/dist/voice/audio_recognition.cjs +1 -1
  65. package/dist/voice/audio_recognition.cjs.map +1 -1
  66. package/dist/voice/audio_recognition.d.cts +14 -0
  67. package/dist/voice/audio_recognition.d.ts +14 -0
  68. package/dist/voice/audio_recognition.d.ts.map +1 -1
  69. package/dist/voice/audio_recognition.js +1 -1
  70. package/dist/voice/audio_recognition.js.map +1 -1
  71. package/package.json +1 -1
  72. package/src/inference/index.ts +8 -0
  73. package/src/inference/stt.test.ts +236 -0
  74. package/src/inference/stt.ts +95 -17
  75. package/src/inference/tts.test.ts +255 -0
  76. package/src/inference/tts.ts +81 -15
  77. package/src/ipc/inference_proc_lazy_main.ts +13 -1
  78. package/src/ipc/job_proc_lazy_main.ts +5 -1
  79. package/src/ipc/supervised_proc.ts +7 -0
  80. package/src/stt/stt.ts +7 -0
  81. package/src/transcription.ts +6 -0
  82. package/src/vad.ts +4 -3
  83. package/src/voice/agent_activity.ts +1 -1
  84. package/src/voice/audio_recognition.ts +16 -1
package/src/inference/tts.ts CHANGED
@@ -16,7 +16,6 @@ import { Event, Future, Task, cancelAndWait, combineSignals, shortuuid } from '.
 import {
   type TtsClientEvent,
   type TtsServerEvent,
-  type TtsSessionCreateEvent,
   ttsClientEventSchema,
   ttsServerEventSchema,
 } from './api_protos.js';
@@ -46,13 +45,17 @@ export type InworldModels =
 export type RimeModels = 'rime/arcana' | 'rime/mistv2';
 
 export interface CartesiaOptions {
-  duration?: number; // max duration of audio in seconds
-  speed?: 'slow' | 'normal' | 'fast'; // default: not specified
+  /** Maximum duration of audio in seconds. */
+  duration?: number;
+  /** Speech speed. Default: not specified. */
+  speed?: 'slow' | 'normal' | 'fast';
 }
 
 export interface ElevenlabsOptions {
-  inactivity_timeout?: number; // default: 60
-  apply_text_normalization?: 'auto' | 'off' | 'on'; // default: "auto"
+  /** Inactivity timeout in seconds. Default: 60. */
+  inactivity_timeout?: number;
+  /** Text normalization mode. Default: "auto". */
+  apply_text_normalization?: 'auto' | 'off' | 'on';
 }
 
 export interface DeepgramTTSOptions {}
@@ -90,6 +93,45 @@ export type TTSOptions<TModel extends TTSModels> = TModel extends CartesiaModels
     ? InworldOptions
     : Record<string, unknown>;
 
+/** Parse a model string into [model, voice]. Voice is undefined if not specified. */
+export function parseTTSModelString(model: string): [string, string | undefined] {
+  const idx = model.lastIndexOf(':');
+  if (idx !== -1) {
+    return [model.slice(0, idx), model.slice(idx + 1)];
+  }
+  return [model, undefined];
+}
+
+/** A fallback model with optional extra configuration. Extra fields are passed through to the provider. */
+export interface TTSFallbackModel {
+  /** Model name (e.g. "cartesia/sonic", "elevenlabs/eleven_flash_v2", "rime/arcana"). */
+  model: string;
+  /** Voice to use for the model. */
+  voice: string;
+  /** Extra configuration for the model. */
+  extraKwargs?: Record<string, unknown>;
+}
+
+export type TTSFallbackModelType = TTSFallbackModel | string;
+
+/** Normalize a single or list of FallbackModelType into TTSFallbackModel[]. */
+export function normalizeTTSFallback(
+  fallback: TTSFallbackModelType | TTSFallbackModelType[],
+): TTSFallbackModel[] {
+  const makeFallback = (model: TTSFallbackModelType): TTSFallbackModel => {
+    if (typeof model === 'string') {
+      const [name, voice] = parseTTSModelString(model);
+      return { model: name, voice: voice ?? '' };
+    }
+    return model;
+  };
+
+  if (Array.isArray(fallback)) {
+    return fallback.map(makeFallback);
+  }
+  return [makeFallback(fallback)];
+}
+
 type TTSEncoding = 'pcm_s16le';
 
 const DEFAULT_ENCODING: TTSEncoding = 'pcm_s16le';
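
The two new exported helpers centralize model-string handling. A minimal behavioral sketch, derived directly from the code in this hunk (the voice names 'luna' and 'kiara' are invented for illustration):

    // Splits on the last colon; without a colon, the voice is undefined.
    parseTTSModelString('rime/arcana:luna'); // → ['rime/arcana', 'luna']
    parseTTSModelString('rime/arcana');      // → ['rime/arcana', undefined]

    // Strings and objects normalize to the same TTSFallbackModel shape:
    normalizeTTSFallback([
      'rime/arcana:luna',
      { model: 'cartesia/sonic', voice: 'kiara', extraKwargs: { speed: 'fast' } },
    ]);
    // → [
    //     { model: 'rime/arcana', voice: 'luna' },
    //     { model: 'cartesia/sonic', voice: 'kiara', extraKwargs: { speed: 'fast' } },
    //   ]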
@@ -108,6 +150,8 @@ export interface InferenceTTSOptions<TModel extends TTSModels> {
   apiKey: string;
   apiSecret: string;
   modelOptions: TTSOptions<TModel>;
+  fallback?: TTSFallbackModel[];
+  connOptions?: APIConnectOptions;
 }
 
 /**
@@ -130,6 +174,8 @@ export class TTS<TModel extends TTSModels> extends BaseTTS {
     apiKey?: string;
     apiSecret?: string;
     modelOptions?: TTSOptions<TModel>;
+    fallback?: TTSFallbackModelType | TTSFallbackModelType[];
+    connOptions?: APIConnectOptions;
   }) {
     const sampleRate = opts?.sampleRate ?? DEFAULT_SAMPLE_RATE;
     super(sampleRate, 1, { streaming: true });
@@ -143,6 +189,8 @@ export class TTS<TModel extends TTSModels> extends BaseTTS {
       apiKey,
       apiSecret,
       modelOptions = {} as TTSOptions<TModel>,
+      fallback,
+      connOptions,
     } = opts || {};
 
     const lkBaseURL = baseURL || process.env.LIVEKIT_INFERENCE_URL || DEFAULT_BASE_URL;
@@ -176,6 +224,8 @@ export class TTS<TModel extends TTSModels> extends BaseTTS {
       }
     }
 
+    const normalizedFallback = fallback ? normalizeTTSFallback(fallback) : undefined;
+
     this.opts = {
       model: nextModel,
       voice: nextVoice,
@@ -186,6 +236,8 @@ export class TTS<TModel extends TTSModels> extends BaseTTS {
      apiKey: lkApiKey,
      apiSecret: lkApiSecret,
      modelOptions,
+     fallback: normalizedFallback,
+     connOptions: connOptions ?? DEFAULT_API_CONNECT_OPTIONS,
    };
 
    // Initialize connection pool
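
With fallback and connOptions threaded through the constructor, the fallback chain and connection policy can now be configured once at construction time rather than per stream() call. A hedged usage sketch (voice names invented; only the timeoutMs and maxRetry fields of APIConnectOptions are shown, because those are the ones this diff reads):

    const tts = new TTS({
      model: 'cartesia/sonic',
      voice: 'kiara',                  // invented voice name
      fallback: ['rime/arcana:luna'],  // string form goes through parseTTSModelString
      connOptions: { timeoutMs: 10_000, maxRetry: 3 } as APIConnectOptions,
    });
    const stream = tts.stream(); // now defaults to the constructor's connOptions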
@@ -203,11 +255,8 @@ export class TTS<TModel extends TTSModels> extends BaseTTS {
   }
 
   static fromModelString(modelString: string): TTS<AnyString> {
-    if (modelString.includes(':')) {
-      const [model, voice] = modelString.split(':') as [TTSModels, string];
-      return new TTS({ model, voice });
-    }
-    return new TTS({ model: modelString });
+    const [model, voice] = parseTTSModelString(modelString);
+    return new TTS({ model, voice: voice || undefined });
   }
 
   updateOptions(opts: Partial<Pick<InferenceTTSOptions<TModel>, 'model' | 'voice' | 'language'>>) {
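
Besides reusing the shared helper, this changes behavior for model strings containing more than one colon: the old two-element destructuring of split(':') silently dropped trailing segments, while lastIndexOf keeps them in the model name. A worked example with a hypothetical input:

    // old: 'a/b:c:d'.split(':') destructured as [model, voice] → model 'a/b', voice 'c' ('d' lost)
    // new: parseTTSModelString('a/b:c:d')                      → model 'a/b:c', voice 'd'

A trailing colon now also yields an undefined voice (via voice || undefined) instead of an empty string.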
@@ -222,7 +271,7 @@ export class TTS<TModel extends TTSModels> extends BaseTTS {
   }
 
   stream(options?: { connOptions?: APIConnectOptions }): SynthesizeStream<TModel> {
-    const { connOptions = DEFAULT_API_CONNECT_OPTIONS } = options || {};
+    const { connOptions = this.opts.connOptions ?? DEFAULT_API_CONNECT_OPTIONS } = options || {};
     const stream = new SynthesizeStream(this, { ...this.opts }, connOptions);
     this.streams.add(stream);
     return stream;
@@ -243,11 +292,28 @@ export class TTS<TModel extends TTSModels> extends BaseTTS {
      sample_rate: String(this.opts.sampleRate),
      encoding: this.opts.encoding,
      extra: this.opts.modelOptions,
-    } as TtsSessionCreateEvent;
+    } as Record<string, unknown>;
+
+    if (this.opts.voice) (params as Record<string, unknown>).voice = this.opts.voice;
+    if (this.opts.model) (params as Record<string, unknown>).model = this.opts.model;
+    if (this.opts.language) (params as Record<string, unknown>).language = this.opts.language;
+
+    if (this.opts.fallback?.length) {
+      params.fallback = {
+        models: this.opts.fallback.map((m) => ({
+          model: m.model,
+          voice: m.voice,
+          extra: m.extraKwargs ?? {},
+        })),
+      };
+    }
 
-    if (this.opts.voice) params.voice = this.opts.voice;
-    if (this.opts.model) params.model = this.opts.model;
-    if (this.opts.language) params.language = this.opts.language;
+    if (this.opts.connOptions) {
+      params.connection = {
+        timeout: this.opts.connOptions.timeoutMs / 1000,
+        retries: this.opts.connOptions.maxRetry,
+      };
+    }
 
     this.#logger.debug({ url }, 'inference.TTS creating new websocket connection (pool miss)');
     const socket = await connectWs(url, headers, timeout);
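
Putting the mapping above together, the session-create params sent over the websocket now look roughly like this (values illustrative, not taken from the package; note the timeout is converted from milliseconds to seconds):

    {
      sample_rate: '24000',   // String(this.opts.sampleRate); actual default not shown in this diff
      encoding: 'pcm_s16le',
      extra: { /* modelOptions */ },
      model: 'cartesia/sonic',
      voice: 'kiara',         // invented
      fallback: { models: [{ model: 'rime/arcana', voice: 'luna', extra: {} }] },
      connection: { timeout: 10, retries: 3 }, // from timeoutMs: 10_000, maxRetry: 3
    }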
package/src/ipc/inference_proc_lazy_main.ts CHANGED
@@ -36,7 +36,19 @@ const ORPHANED_TIMEOUT = 15 * 1000;
 
 const runners: { [id: string]: InferenceRunner } = await Promise.all(
   Object.entries(JSON.parse(process.argv[2]!)).map(async ([k, v]) => {
-    return [k, await import(v as string).then((m) => new m.default())];
+    return [
+      k,
+      await import(v as string).then((m) => {
+        // Handle both ESM (m.default is the class) and CJS (m.default.default is the class)
+        const Runner = typeof m.default === 'function' ? m.default : m.default?.default;
+        if (typeof Runner !== 'function') {
+          throw new Error(
+            `Unable to load inference runner: Missing or invalid default export in ${v}`,
+          );
+        }
+        return new Runner();
+      }),
+    ];
   }),
 ).then(Object.fromEntries);
 
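The double default unwrapping is needed because TypeScript's CommonJS output assigns a default export to exports.default, and a dynamic import() of that build from ESM nests it one level deeper. A sketch of the two module shapes this code accepts:

    // ESM build: export default class MyRunner {}  → m.default === MyRunner
    // CJS build: exports.default = MyRunner        → import() yields
    //            m.default = { __esModule: true, default: MyRunner },
    //            so m.default.default === MyRunner
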
package/src/ipc/job_proc_lazy_main.ts CHANGED
@@ -156,7 +156,11 @@ const startJob = (
   // [2] import.meta.filename of function containing entry file
   const moduleFile = process.argv[2];
   const agent: Agent = await import(pathToFileURL(moduleFile!).pathname).then((module) => {
-    const agent = module.default;
+    // Handle both ESM (module.default is the agent) and CJS (module.default.default is the agent)
+    const agent =
+      typeof module.default === 'function' || isAgent(module.default)
+        ? module.default
+        : module.default?.default;
     if (agent === undefined || !isAgent(agent)) {
       throw new Error(`Unable to load agent: Missing or invalid default export in ${moduleFile}`);
     }
package/src/ipc/supervised_proc.ts CHANGED
@@ -10,12 +10,19 @@ import { Future } from '../utils.js';
 import type { IPCMessage } from './message.js';
 
 export interface ProcOpts {
+  /** Timeout for process initialization in milliseconds. */
   initializeTimeout: number;
+  /** Timeout for process shutdown in milliseconds. */
   closeTimeout: number;
+  /** Memory usage warning threshold in megabytes. */
   memoryWarnMB: number;
+  /** Memory usage limit in megabytes. */
   memoryLimitMB: number;
+  /** Interval for health check pings in milliseconds. */
   pingInterval: number;
+  /** Timeout waiting for pong response in milliseconds. */
   pingTimeout: number;
+  /** Threshold for warning about unresponsive processes in milliseconds. */
   highPingThreshold: number;
 }
 
package/src/stt/stt.ts CHANGED
@@ -49,15 +49,22 @@ export enum SpeechEventType {
 
 /** SpeechData contains metadata about this {@link SpeechEvent}. */
 export interface SpeechData {
+  /** Language code of the speech. */
   language: string;
+  /** Transcribed text. */
   text: string;
+  /** Start time of the speech segment in seconds. */
   startTime: number;
+  /** End time of the speech segment in seconds. */
   endTime: number;
+  /** Confidence score of the transcription (0-1). */
   confidence: number;
+  /** Word-level timing information. */
   words?: TimedString[];
 }
 
 export interface RecognitionUsage {
+  /** Duration of the audio that was recognized in seconds. */
   audioDuration: number;
 }
 
package/src/transcription.ts CHANGED
@@ -13,11 +13,17 @@ import { AsyncIterableQueue, Future, shortuuid } from './utils.js';
 const STANDARD_SPEECH_RATE = 3830;
 
 export interface TextSyncOptions {
+  /** Language code for transcription. */
   language: string;
+  /** Speech speed multiplier. */
   speed: number;
+  /** Delay between sentences in milliseconds. */
   newSentenceDelay: number;
+  /** Tokenizer for splitting text into sentences. */
   sentenceTokenizer: SentenceTokenizer;
+  /** Function to hyphenate words. */
   hyphenateWord: (word: string) => string[];
+  /** Function to split text into words with positions. */
   splitWords: (words: string) => [string, number, number][];
 }
 
package/src/vad.ts CHANGED
@@ -30,9 +30,9 @@ export interface VADEvent {
   samplesIndex: number;
   /** Timestamp when the event was fired. */
   timestamp: number;
-  /** Duration of the speech segment. */
+  /** Duration of the speech segment in seconds. */
   speechDuration: number;
-  /** Duration of the silence segment. */
+  /** Duration of the silence segment in seconds. */
   silenceDuration: number;
   /**
    * List of audio frames associated with the speech.
@@ -56,6 +56,7 @@ export interface VADEvent {
 }
 
 export interface VADCapabilities {
+  /** Duration of each VAD inference window in milliseconds. Used to batch metrics emissions to roughly once per second. */
   updateInterval: number;
 }
 
@@ -154,7 +155,7 @@ export abstract class VADStream implements AsyncIterableIterator<VADEvent> {
         switch (value.type) {
           case VADEventType.START_OF_SPEECH:
             inferenceCount++;
-            if (inferenceCount >= 1 / this.#vad.capabilities.updateInterval) {
+            if (inferenceCount >= 1000 / this.#vad.capabilities.updateInterval) {
              this.#vad.emit('metrics_collected', {
                type: 'vad_metrics',
                timestamp: Date.now(),
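
The fix follows from updateInterval being expressed in milliseconds (see the new VADCapabilities doc comment above): emitting metrics roughly once per second requires accumulating 1000 / updateInterval inferences. Worked numbers for an assumed 32 ms window:

    // old: inferenceCount >= 1 / 32     ≈ 0.03  → threshold crossed on every inference
    // new: inferenceCount >= 1000 / 32  ≈ 31.25 → crossed after 32 inferences ≈ once per second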
package/src/voice/agent_activity.ts CHANGED
@@ -1023,7 +1023,7 @@ export class AgentActivity implements RecognitionHooks {
             toolChoice: toOaiToolChoice(toolChoice !== undefined ? toolChoice : this.toolChoice),
           },
           abortController,
-          instructions ? `${this.agent.instructions}\n${instructions}` : instructions,
+          instructions,
           userMessage,
         ),
       ),
package/src/voice/audio_recognition.ts CHANGED
@@ -18,11 +18,17 @@ import type { TurnDetectionMode } from './agent_session.js';
 import type { STTNode } from './io.js';
 
 export interface EndOfTurnInfo {
+  /** The new transcript text from the user's speech. */
   newTranscript: string;
+  /** Confidence score of the transcript (0-1). */
   transcriptConfidence: number;
+  /** Delay from speech stop to final transcription in milliseconds. */
   transcriptionDelay: number;
+  /** Delay from speech stop to end of utterance detection in milliseconds. */
   endOfUtteranceDelay: number;
+  /** Timestamp when user started speaking (milliseconds since epoch). */
   startedSpeakingAt: number | undefined;
+  /** Timestamp when user stopped speaking (milliseconds since epoch). */
   stoppedSpeakingAt: number | undefined;
 }
 
@@ -50,13 +56,21 @@ export interface _TurnDetector {
 }
 
 export interface AudioRecognitionOptions {
+  /** Hooks for recognition events. */
   recognitionHooks: RecognitionHooks;
+  /** Speech-to-text node. */
   stt?: STTNode;
+  /** Voice activity detection. */
   vad?: VAD;
+  /** Turn detector for end-of-turn prediction. */
   turnDetector?: _TurnDetector;
+  /** Turn detection mode. */
   turnDetectionMode?: Exclude<TurnDetectionMode, _TurnDetector>;
+  /** Minimum endpointing delay in milliseconds. */
   minEndpointingDelay: number;
+  /** Maximum endpointing delay in milliseconds. */
   maxEndpointingDelay: number;
+  /** Root span context for tracing. */
   rootSpanContext?: Context;
 }
 
@@ -161,7 +175,6 @@ export class AudioRecognition {
 
       switch (ev.type) {
        case SpeechEventType.FINAL_TRANSCRIPT:
-          this.hooks.onFinalTranscript(ev);
          const transcript = ev.alternatives?.[0]?.text;
          const confidence = ev.alternatives?.[0]?.confidence ?? 0;
          this.lastLanguage = ev.alternatives?.[0]?.language;
@@ -171,6 +184,8 @@ export class AudioRecognition {
            return;
          }
 
+          this.hooks.onFinalTranscript(ev);
+
          this.logger.debug(
            {
              user_transcript: transcript,