@livekit/agents 1.0.41 → 1.0.43

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. package/dist/inference/index.cjs +8 -0
  2. package/dist/inference/index.cjs.map +1 -1
  3. package/dist/inference/index.d.cts +2 -2
  4. package/dist/inference/index.d.ts +2 -2
  5. package/dist/inference/index.d.ts.map +1 -1
  6. package/dist/inference/index.js +8 -0
  7. package/dist/inference/index.js.map +1 -1
  8. package/dist/inference/stt.cjs +51 -10
  9. package/dist/inference/stt.cjs.map +1 -1
  10. package/dist/inference/stt.d.cts +33 -0
  11. package/dist/inference/stt.d.ts +33 -0
  12. package/dist/inference/stt.d.ts.map +1 -1
  13. package/dist/inference/stt.js +48 -9
  14. package/dist/inference/stt.js.map +1 -1
  15. package/dist/inference/stt.test.cjs +204 -0
  16. package/dist/inference/stt.test.cjs.map +1 -0
  17. package/dist/inference/stt.test.js +203 -0
  18. package/dist/inference/stt.test.js.map +1 -0
  19. package/dist/inference/tts.cjs +52 -10
  20. package/dist/inference/tts.cjs.map +1 -1
  21. package/dist/inference/tts.d.cts +22 -0
  22. package/dist/inference/tts.d.ts +22 -0
  23. package/dist/inference/tts.d.ts.map +1 -1
  24. package/dist/inference/tts.js +49 -9
  25. package/dist/inference/tts.js.map +1 -1
  26. package/dist/inference/tts.test.cjs +223 -0
  27. package/dist/inference/tts.test.cjs.map +1 -0
  28. package/dist/inference/tts.test.js +222 -0
  29. package/dist/inference/tts.test.js.map +1 -0
  30. package/dist/ipc/inference_proc_lazy_main.cjs +13 -1
  31. package/dist/ipc/inference_proc_lazy_main.cjs.map +1 -1
  32. package/dist/ipc/inference_proc_lazy_main.js +13 -1
  33. package/dist/ipc/inference_proc_lazy_main.js.map +1 -1
  34. package/dist/ipc/job_proc_lazy_main.cjs +2 -1
  35. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  36. package/dist/ipc/job_proc_lazy_main.js +2 -1
  37. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  38. package/dist/ipc/supervised_proc.cjs.map +1 -1
  39. package/dist/ipc/supervised_proc.d.cts +7 -0
  40. package/dist/ipc/supervised_proc.d.ts +7 -0
  41. package/dist/ipc/supervised_proc.d.ts.map +1 -1
  42. package/dist/ipc/supervised_proc.js.map +1 -1
  43. package/dist/stt/stt.cjs.map +1 -1
  44. package/dist/stt/stt.d.cts +7 -0
  45. package/dist/stt/stt.d.ts +7 -0
  46. package/dist/stt/stt.d.ts.map +1 -1
  47. package/dist/stt/stt.js.map +1 -1
  48. package/dist/transcription.cjs.map +1 -1
  49. package/dist/transcription.d.cts +6 -0
  50. package/dist/transcription.d.ts +6 -0
  51. package/dist/transcription.d.ts.map +1 -1
  52. package/dist/transcription.js.map +1 -1
  53. package/dist/vad.cjs +1 -1
  54. package/dist/vad.cjs.map +1 -1
  55. package/dist/vad.d.cts +3 -2
  56. package/dist/vad.d.ts +3 -2
  57. package/dist/vad.d.ts.map +1 -1
  58. package/dist/vad.js +1 -1
  59. package/dist/vad.js.map +1 -1
  60. package/dist/voice/agent_activity.cjs +1 -2
  61. package/dist/voice/agent_activity.cjs.map +1 -1
  62. package/dist/voice/agent_activity.js +1 -2
  63. package/dist/voice/agent_activity.js.map +1 -1
  64. package/dist/voice/audio_recognition.cjs +1 -1
  65. package/dist/voice/audio_recognition.cjs.map +1 -1
  66. package/dist/voice/audio_recognition.d.cts +14 -0
  67. package/dist/voice/audio_recognition.d.ts +14 -0
  68. package/dist/voice/audio_recognition.d.ts.map +1 -1
  69. package/dist/voice/audio_recognition.js +1 -1
  70. package/dist/voice/audio_recognition.js.map +1 -1
  71. package/package.json +1 -1
  72. package/src/inference/index.ts +8 -0
  73. package/src/inference/stt.test.ts +236 -0
  74. package/src/inference/stt.ts +95 -17
  75. package/src/inference/tts.test.ts +255 -0
  76. package/src/inference/tts.ts +81 -15
  77. package/src/ipc/inference_proc_lazy_main.ts +13 -1
  78. package/src/ipc/job_proc_lazy_main.ts +5 -1
  79. package/src/ipc/supervised_proc.ts +7 -0
  80. package/src/stt/stt.ts +7 -0
  81. package/src/transcription.ts +6 -0
  82. package/src/vad.ts +4 -3
  83. package/src/voice/agent_activity.ts +1 -1
  84. package/src/voice/audio_recognition.ts +16 -1
@@ -0,0 +1,236 @@
1
+ // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import { beforeAll, describe, expect, it } from 'vitest';
5
+ import { initializeLogger } from '../log.js';
6
+ import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';
7
+ import { STT, type STTFallbackModel, normalizeSTTFallback, parseSTTModelString } from './stt.js';
8
+
9
+ beforeAll(() => {
10
+ initializeLogger({ level: 'silent', pretty: false });
11
+ });
12
+
13
/**
 * Build an STT instance for the tests in this file.
 *
 * Placeholder values for model, credentials and base URL are supplied first, so any key
 * present in `overrides` replaces the placeholder of the same name.
 */
function makeStt(overrides: Record<string, unknown> = {}) {
  return new STT({
    model: 'deepgram' as const,
    apiKey: 'test-key',
    apiSecret: 'test-secret',
    baseURL: 'https://example.livekit.cloud',
    // Spread last so caller-provided values win over the placeholders above.
    ...overrides,
  });
}
23
+
24
+ describe('parseSTTModelString', () => {
25
+ it('simple model without language', () => {
26
+ const [model, language] = parseSTTModelString('deepgram');
27
+ expect(model).toBe('deepgram');
28
+ expect(language).toBeUndefined();
29
+ });
30
+
31
+ it('model with language suffix', () => {
32
+ const [model, language] = parseSTTModelString('deepgram:en');
33
+ expect(model).toBe('deepgram');
34
+ expect(language).toBe('en');
35
+ });
36
+
37
+ it('provider/model format without language', () => {
38
+ const [model, language] = parseSTTModelString('deepgram/nova-3');
39
+ expect(model).toBe('deepgram/nova-3');
40
+ expect(language).toBeUndefined();
41
+ });
42
+
43
+ it('provider/model format with language', () => {
44
+ const [model, language] = parseSTTModelString('deepgram/nova-3:en');
45
+ expect(model).toBe('deepgram/nova-3');
46
+ expect(language).toBe('en');
47
+ });
48
+
49
+ it.each([
50
+ ['cartesia/ink-whisper:de', 'cartesia/ink-whisper', 'de'],
51
+ ['assemblyai:es', 'assemblyai', 'es'],
52
+ ['deepgram/nova-2-medical:ja', 'deepgram/nova-2-medical', 'ja'],
53
+ ['deepgram/nova-3:multi', 'deepgram/nova-3', 'multi'],
54
+ ['cartesia:zh', 'cartesia', 'zh'],
55
+ ])('various providers and languages: %s', (modelStr, expectedModel, expectedLang) => {
56
+ const [model, language] = parseSTTModelString(modelStr);
57
+ expect(model).toBe(expectedModel);
58
+ expect(language).toBe(expectedLang);
59
+ });
60
+
61
+ it('auto model without language', () => {
62
+ const [model, language] = parseSTTModelString('auto');
63
+ expect(model).toBe('auto');
64
+ expect(language).toBeUndefined();
65
+ });
66
+
67
+ it('auto model with language', () => {
68
+ const [model, language] = parseSTTModelString('auto:pt');
69
+ expect(model).toBe('auto');
70
+ expect(language).toBe('pt');
71
+ });
72
+ });
73
+
74
+ describe('normalizeSTTFallback', () => {
75
+ it('single string model', () => {
76
+ const result = normalizeSTTFallback('deepgram/nova-3');
77
+ expect(result).toEqual([{ model: 'deepgram/nova-3' }]);
78
+ });
79
+
80
+ it('single FallbackModel dict', () => {
81
+ const fallback: STTFallbackModel = { model: 'deepgram/nova-3' };
82
+ const result = normalizeSTTFallback(fallback);
83
+ expect(result).toEqual([{ model: 'deepgram/nova-3' }]);
84
+ });
85
+
86
+ it('list of string models', () => {
87
+ const result = normalizeSTTFallback(['deepgram/nova-3', 'cartesia/ink-whisper']);
88
+ expect(result).toEqual([{ model: 'deepgram/nova-3' }, { model: 'cartesia/ink-whisper' }]);
89
+ });
90
+
91
+ it('list of FallbackModel dicts', () => {
92
+ const fallbacks: STTFallbackModel[] = [{ model: 'deepgram/nova-3' }, { model: 'assemblyai' }];
93
+ const result = normalizeSTTFallback(fallbacks);
94
+ expect(result).toEqual([{ model: 'deepgram/nova-3' }, { model: 'assemblyai' }]);
95
+ });
96
+
97
+ it('mixed list of strings and dicts', () => {
98
+ const result = normalizeSTTFallback([
99
+ 'deepgram/nova-3',
100
+ { model: 'cartesia/ink-whisper' } as STTFallbackModel,
101
+ 'assemblyai',
102
+ ]);
103
+ expect(result).toEqual([
104
+ { model: 'deepgram/nova-3' },
105
+ { model: 'cartesia/ink-whisper' },
106
+ { model: 'assemblyai' },
107
+ ]);
108
+ });
109
+
110
+ it('string with language suffix discards language', () => {
111
+ const result = normalizeSTTFallback('deepgram/nova-3:en');
112
+ expect(result).toEqual([{ model: 'deepgram/nova-3' }]);
113
+ });
114
+
115
+ it('FallbackModel with extraKwargs is preserved', () => {
116
+ const fallback: STTFallbackModel = {
117
+ model: 'deepgram/nova-3',
118
+ extraKwargs: { keywords: [['livekit', 1.5]], punctuate: true },
119
+ };
120
+ const result = normalizeSTTFallback(fallback);
121
+ expect(result).toEqual([
122
+ {
123
+ model: 'deepgram/nova-3',
124
+ extraKwargs: { keywords: [['livekit', 1.5]], punctuate: true },
125
+ },
126
+ ]);
127
+ });
128
+
129
+ it('list with extraKwargs preserved', () => {
130
+ const result = normalizeSTTFallback([
131
+ { model: 'deepgram/nova-3', extraKwargs: { punctuate: true } } as STTFallbackModel,
132
+ 'cartesia/ink-whisper',
133
+ { model: 'assemblyai', extraKwargs: { format_turns: true } } as STTFallbackModel,
134
+ ]);
135
+ expect(result).toEqual([
136
+ { model: 'deepgram/nova-3', extraKwargs: { punctuate: true } },
137
+ { model: 'cartesia/ink-whisper' },
138
+ { model: 'assemblyai', extraKwargs: { format_turns: true } },
139
+ ]);
140
+ });
141
+
142
+ it('empty list returns empty list', () => {
143
+ const result = normalizeSTTFallback([]);
144
+ expect(result).toEqual([]);
145
+ });
146
+
147
+ it('multiple colons in model string splits on last', () => {
148
+ const result = normalizeSTTFallback('some:model:part:fr');
149
+ expect(result).toEqual([{ model: 'some:model:part' }]);
150
+ });
151
+ });
152
+
153
+ describe('STT constructor fallback and connOptions', () => {
154
+ it('fallback not given defaults to undefined', () => {
155
+ const stt = makeStt();
156
+ expect(stt['opts'].fallback).toBeUndefined();
157
+ });
158
+
159
+ it('fallback single string is normalized', () => {
160
+ const stt = makeStt({ fallback: 'cartesia/ink-whisper' });
161
+ expect(stt['opts'].fallback).toEqual([{ model: 'cartesia/ink-whisper' }]);
162
+ });
163
+
164
+ it('fallback list of strings is normalized', () => {
165
+ const stt = makeStt({ fallback: ['deepgram/nova-3', 'assemblyai'] });
166
+ expect(stt['opts'].fallback).toEqual([{ model: 'deepgram/nova-3' }, { model: 'assemblyai' }]);
167
+ });
168
+
169
+ it('fallback single FallbackModel is normalized to list', () => {
170
+ const stt = makeStt({ fallback: { model: 'deepgram/nova-3' } });
171
+ expect(stt['opts'].fallback).toEqual([{ model: 'deepgram/nova-3' }]);
172
+ });
173
+
174
+ it('fallback with extraKwargs is preserved', () => {
175
+ const stt = makeStt({
176
+ fallback: {
177
+ model: 'deepgram/nova-3',
178
+ extraKwargs: { punctuate: true, keywords: [['livekit', 1.5]] },
179
+ },
180
+ });
181
+ expect(stt['opts'].fallback).toEqual([
182
+ {
183
+ model: 'deepgram/nova-3',
184
+ extraKwargs: { punctuate: true, keywords: [['livekit', 1.5]] },
185
+ },
186
+ ]);
187
+ });
188
+
189
+ it('fallback mixed list is normalized', () => {
190
+ const stt = makeStt({
191
+ fallback: [
192
+ 'deepgram/nova-3',
193
+ { model: 'cartesia', extraKwargs: { min_volume: 0.5 } },
194
+ 'assemblyai',
195
+ ],
196
+ });
197
+ expect(stt['opts'].fallback).toEqual([
198
+ { model: 'deepgram/nova-3' },
199
+ { model: 'cartesia', extraKwargs: { min_volume: 0.5 } },
200
+ { model: 'assemblyai' },
201
+ ]);
202
+ });
203
+
204
+ it('fallback string with language discards language', () => {
205
+ const stt = makeStt({ fallback: 'deepgram/nova-3:en' });
206
+ expect(stt['opts'].fallback).toEqual([{ model: 'deepgram/nova-3' }]);
207
+ });
208
+
209
+ it('connOptions not given uses default', () => {
210
+ const stt = makeStt();
211
+ expect(stt['opts'].connOptions).toEqual(DEFAULT_API_CONNECT_OPTIONS);
212
+ });
213
+
214
+ it('connOptions custom timeout', () => {
215
+ const custom: APIConnectOptions = { timeoutMs: 30000, maxRetry: 3, retryIntervalMs: 2000 };
216
+ const stt = makeStt({ connOptions: custom });
217
+ expect(stt['opts'].connOptions).toEqual(custom);
218
+ expect(stt['opts'].connOptions!.timeoutMs).toBe(30000);
219
+ });
220
+
221
+ it('connOptions custom maxRetry', () => {
222
+ const custom: APIConnectOptions = { timeoutMs: 10000, maxRetry: 5, retryIntervalMs: 2000 };
223
+ const stt = makeStt({ connOptions: custom });
224
+ expect(stt['opts'].connOptions).toEqual(custom);
225
+ expect(stt['opts'].connOptions!.maxRetry).toBe(5);
226
+ });
227
+
228
+ it('connOptions full custom', () => {
229
+ const custom: APIConnectOptions = { timeoutMs: 60000, maxRetry: 10, retryIntervalMs: 2000 };
230
+ const stt = makeStt({ connOptions: custom });
231
+ expect(stt['opts'].connOptions).toEqual(custom);
232
+ expect(stt['opts'].connOptions!.timeoutMs).toBe(60000);
233
+ expect(stt['opts'].connOptions!.maxRetry).toBe(10);
234
+ expect(stt['opts'].connOptions!.retryIntervalMs).toBe(2000);
235
+ });
236
+ });
@@ -42,29 +42,46 @@ export type AssemblyaiModels =
42
42
  export type ElevenlabsSTTModels = 'elevenlabs/scribe_v2_realtime';
43
43
 
44
44
  export interface CartesiaOptions {
45
- min_volume?: number; // default: not specified
46
- max_silence_duration_secs?: number; // default: not specified
45
+ /** Minimum volume threshold. Default: not specified. */
46
+ min_volume?: number;
47
+ /** Maximum silence duration in seconds. Default: not specified. */
48
+ max_silence_duration_secs?: number;
47
49
  }
48
50
 
49
51
  export interface DeepgramOptions {
50
- filler_words?: boolean; // default: true
51
- interim_results?: boolean; // default: true
52
- endpointing?: number; // default: 25 (ms)
53
- punctuate?: boolean; // default: false
52
+ /** Enable filler words. Default: true. */
53
+ filler_words?: boolean;
54
+ /** Enable interim results. Default: true. */
55
+ interim_results?: boolean;
56
+ /** Endpointing timeout in milliseconds. Default: 25. */
57
+ endpointing?: number;
58
+ /** Enable punctuation. Default: false. */
59
+ punctuate?: boolean;
60
+ /** Enable smart formatting. */
54
61
  smart_format?: boolean;
62
+ /** Keywords with boost values. */
55
63
  keywords?: Array<[string, number]>;
64
+ /** Key terms for recognition. */
56
65
  keyterms?: string[];
66
+ /** Enable profanity filter. */
57
67
  profanity_filter?: boolean;
68
+ /** Convert spoken numbers to numerals. */
58
69
  numerals?: boolean;
70
+ /** Opt out of model improvement program. */
59
71
  mip_opt_out?: boolean;
60
72
  }
61
73
 
62
74
  export interface AssemblyAIOptions {
63
- format_turns?: boolean; // default: false
64
- end_of_turn_confidence_threshold?: number; // default: 0.01
65
- min_end_of_turn_silence_when_confident?: number; // default: 0
66
- max_turn_silence?: number; // default: not specified
67
- keyterms_prompt?: string[]; // default: not specified
75
+ /** Enable turn formatting. Default: false. */
76
+ format_turns?: boolean;
77
+ /** End of turn confidence threshold. Default: 0.01. */
78
+ end_of_turn_confidence_threshold?: number;
79
+ /** Minimum silence duration in milliseconds when confident about end of turn. Default: 0. */
80
+ min_end_of_turn_silence_when_confident?: number;
81
+ /** Maximum turn silence in milliseconds. Default: not specified. */
82
+ max_turn_silence?: number;
83
+ /** Key terms prompt for recognition. Default: not specified. */
84
+ keyterms_prompt?: string[];
68
85
  }
69
86
 
70
87
  export type STTLanguages =
@@ -93,6 +110,43 @@ export type STTOptions<TModel extends STTModels> = TModel extends DeepgramModels
93
110
  ? AssemblyAIOptions
94
111
  : Record<string, unknown>;
95
112
 
113
/**
 * Describes one fallback STT model together with optional provider-specific settings.
 */
export interface STTFallbackModel {
  /**
   * Name of the model, for example "deepgram/nova-3", "assemblyai/universal-streaming"
   * or "cartesia/ink-whisper".
   */
  model: string;
  /** Additional configuration for this model; omitted when there is none. */
  extraKwargs?: Record<string, unknown>;
}

/** A fallback entry given either as a full {@link STTFallbackModel} or as a bare model string. */
export type STTFallbackModelType = STTFallbackModel | string;
122
+
123
/**
 * Split a model string of the form `model[:language]` into its two parts.
 *
 * The split is made at the LAST `:` in the string, so any earlier colons remain part of
 * the model name.
 *
 * @param model - Model string such as `"deepgram"` or `"deepgram/nova-3:en"`.
 * @returns A `[model, language]` tuple. `language` is `undefined` when the input contains
 *   no `:`; otherwise it is whatever follows the last `:` (possibly an empty string).
 */
export function parseSTTModelString(model: string): [string, string | undefined] {
  const separatorAt = model.lastIndexOf(':');
  return separatorAt === -1
    ? [model, undefined]
    : [model.slice(0, separatorAt), model.slice(separatorAt + 1)];
}
131
+
132
/**
 * Coerce a fallback specification into a list of {@link STTFallbackModel} entries.
 *
 * @param fallback - A single entry or an array of entries; each entry is either a model
 *   string or an `STTFallbackModel` object.
 * @returns One `STTFallbackModel` per input entry, in input order. String entries are run
 *   through `parseSTTModelString` and only the model part is kept (a `:language` suffix is
 *   dropped). Object entries are returned as the same object, not copied.
 */
export function normalizeSTTFallback(
  fallback: STTFallbackModelType | STTFallbackModelType[],
): STTFallbackModel[] {
  // Treat the single-entry form as a one-element list so both shapes share one code path.
  const entries = Array.isArray(fallback) ? fallback : [fallback];

  return entries.map((entry): STTFallbackModel => {
    if (typeof entry !== 'string') {
      return entry;
    }
    const [name] = parseSTTModelString(entry);
    return { model: name };
  });
}
149
+
96
150
  export type STTEncoding = 'pcm_s16le';
97
151
 
98
152
  const DEFAULT_ENCODING: STTEncoding = 'pcm_s16le';
@@ -109,6 +163,8 @@ export interface InferenceSTTOptions<TModel extends STTModels> {
109
163
  apiKey: string;
110
164
  apiSecret: string;
111
165
  modelOptions: STTOptions<TModel>;
166
+ fallback?: STTFallbackModel[];
167
+ connOptions?: APIConnectOptions;
112
168
  }
113
169
 
114
170
  /**
@@ -129,6 +185,8 @@ export class STT<TModel extends STTModels> extends BaseSTT {
129
185
  apiKey?: string;
130
186
  apiSecret?: string;
131
187
  modelOptions?: STTOptions<TModel>;
188
+ fallback?: STTFallbackModelType | STTFallbackModelType[];
189
+ connOptions?: APIConnectOptions;
132
190
  }) {
133
191
  super({ streaming: true, interimResults: true, alignedTranscript: 'word' });
134
192
 
@@ -141,6 +199,8 @@ export class STT<TModel extends STTModels> extends BaseSTT {
141
199
  apiKey,
142
200
  apiSecret,
143
201
  modelOptions = {} as STTOptions<TModel>,
202
+ fallback,
203
+ connOptions,
144
204
  } = opts || {};
145
205
 
146
206
  const lkBaseURL = baseURL || process.env.LIVEKIT_INFERENCE_URL || DEFAULT_BASE_URL;
@@ -155,6 +215,8 @@ export class STT<TModel extends STTModels> extends BaseSTT {
155
215
  throw new Error('apiSecret is required: pass apiSecret or set LIVEKIT_API_SECRET');
156
216
  }
157
217
 
218
+ const normalizedFallback = fallback ? normalizeSTTFallback(fallback) : undefined;
219
+
158
220
  this.opts = {
159
221
  model,
160
222
  language,
@@ -164,6 +226,8 @@ export class STT<TModel extends STTModels> extends BaseSTT {
164
226
  apiKey: lkApiKey,
165
227
  apiSecret: lkApiSecret,
166
228
  modelOptions,
229
+ fallback: normalizedFallback,
230
+ connOptions: connOptions ?? DEFAULT_API_CONNECT_OPTIONS,
167
231
  };
168
232
  }
169
233
 
@@ -172,11 +236,8 @@ export class STT<TModel extends STTModels> extends BaseSTT {
172
236
  }
173
237
 
174
238
  static fromModelString(modelString: string): STT<AnyString> {
175
- if (modelString.includes(':')) {
176
- const [model, language] = modelString.split(':') as [AnyString, STTLanguages];
177
- return new STT({ model, language });
178
- }
179
- return new STT({ model: modelString });
239
+ const [model, language] = parseSTTModelString(modelString);
240
+ return new STT({ model, language });
180
241
  }
181
242
 
182
243
  protected async _recognize(_: AudioBuffer): Promise<SpeechEvent> {
@@ -195,7 +256,8 @@ export class STT<TModel extends STTModels> extends BaseSTT {
195
256
  language?: STTLanguages | string;
196
257
  connOptions?: APIConnectOptions;
197
258
  }): SpeechStream<TModel> {
198
- const { language, connOptions = DEFAULT_API_CONNECT_OPTIONS } = options || {};
259
+ const { language, connOptions = this.opts.connOptions ?? DEFAULT_API_CONNECT_OPTIONS } =
260
+ options || {};
199
261
  const streamOpts = {
200
262
  ...this.opts,
201
263
  language: language ?? this.opts.language,
@@ -224,6 +286,22 @@ export class STT<TModel extends STTModels> extends BaseSTT {
224
286
  (params.settings as Record<string, unknown>).language = this.opts.language;
225
287
  }
226
288
 
289
+ if (this.opts.fallback?.length) {
290
+ params.fallback = {
291
+ models: this.opts.fallback.map((m) => ({
292
+ model: m.model,
293
+ extra: m.extraKwargs ?? {},
294
+ })),
295
+ };
296
+ }
297
+
298
+ if (this.opts.connOptions) {
299
+ params.connection = {
300
+ timeout: this.opts.connOptions.timeoutMs / 1000,
301
+ retries: this.opts.connOptions.maxRetry,
302
+ };
303
+ }
304
+
227
305
  let baseURL = this.opts.baseURL;
228
306
  if (baseURL.startsWith('http://') || baseURL.startsWith('https://')) {
229
307
  baseURL = baseURL.replace('http', 'ws');
@@ -0,0 +1,255 @@
1
+ // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import { beforeAll, describe, expect, it } from 'vitest';
5
+ import { initializeLogger } from '../log.js';
6
+ import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';
7
+ import { TTS, type TTSFallbackModel, normalizeTTSFallback, parseTTSModelString } from './tts.js';
8
+
9
+ beforeAll(() => {
10
+ initializeLogger({ level: 'silent', pretty: false });
11
+ });
12
+
13
/**
 * Build a TTS instance for the tests in this file.
 *
 * Placeholder values for model, credentials and base URL are supplied first, so any key
 * present in `overrides` replaces the placeholder of the same name.
 */
function makeTts(overrides: Record<string, unknown> = {}) {
  return new TTS({
    model: 'cartesia/sonic' as const,
    apiKey: 'test-key',
    apiSecret: 'test-secret',
    baseURL: 'https://example.livekit.cloud',
    // Spread last so caller-provided values win over the placeholders above.
    ...overrides,
  });
}
23
+
24
+ describe('parseTTSModelString', () => {
25
+ it('simple model without voice', () => {
26
+ const [model, voice] = parseTTSModelString('cartesia');
27
+ expect(model).toBe('cartesia');
28
+ expect(voice).toBeUndefined();
29
+ });
30
+
31
+ it('model with voice suffix', () => {
32
+ const [model, voice] = parseTTSModelString('cartesia:my-voice-id');
33
+ expect(model).toBe('cartesia');
34
+ expect(voice).toBe('my-voice-id');
35
+ });
36
+
37
+ it('provider/model format without voice', () => {
38
+ const [model, voice] = parseTTSModelString('cartesia/sonic');
39
+ expect(model).toBe('cartesia/sonic');
40
+ expect(voice).toBeUndefined();
41
+ });
42
+
43
+ it('provider/model format with voice', () => {
44
+ const [model, voice] = parseTTSModelString('cartesia/sonic:my-voice-id');
45
+ expect(model).toBe('cartesia/sonic');
46
+ expect(voice).toBe('my-voice-id');
47
+ });
48
+
49
+ it.each([
50
+ ['elevenlabs/eleven_flash_v2:voice123', 'elevenlabs/eleven_flash_v2', 'voice123'],
51
+ ['rime:speaker-a', 'rime', 'speaker-a'],
52
+ ['rime/mist:narrator', 'rime/mist', 'narrator'],
53
+ ['inworld/inworld-tts-1:character', 'inworld/inworld-tts-1', 'character'],
54
+ ['cartesia/sonic-turbo:deep-voice', 'cartesia/sonic-turbo', 'deep-voice'],
55
+ ])('various providers and voices: %s', (modelStr, expectedModel, expectedVoice) => {
56
+ const [model, voice] = parseTTSModelString(modelStr);
57
+ expect(model).toBe(expectedModel);
58
+ expect(voice).toBe(expectedVoice);
59
+ });
60
+
61
+ it('empty voice after colon', () => {
62
+ const [model, voice] = parseTTSModelString('cartesia/sonic:');
63
+ expect(model).toBe('cartesia/sonic');
64
+ expect(voice).toBe('');
65
+ });
66
+ });
67
+
68
+ describe('normalizeTTSFallback', () => {
69
+ it('single string model', () => {
70
+ const result = normalizeTTSFallback('cartesia/sonic');
71
+ expect(result).toEqual([{ model: 'cartesia/sonic', voice: '' }]);
72
+ });
73
+
74
+ it('single string model with voice', () => {
75
+ const result = normalizeTTSFallback('cartesia/sonic:my-voice');
76
+ expect(result).toEqual([{ model: 'cartesia/sonic', voice: 'my-voice' }]);
77
+ });
78
+
79
+ it('single FallbackModel dict', () => {
80
+ const fallback: TTSFallbackModel = { model: 'cartesia/sonic', voice: 'narrator' };
81
+ const result = normalizeTTSFallback(fallback);
82
+ expect(result).toEqual([{ model: 'cartesia/sonic', voice: 'narrator' }]);
83
+ });
84
+
85
+ it('list of string models', () => {
86
+ const result = normalizeTTSFallback(['cartesia/sonic', 'elevenlabs/eleven_flash_v2']);
87
+ expect(result).toEqual([
88
+ { model: 'cartesia/sonic', voice: '' },
89
+ { model: 'elevenlabs/eleven_flash_v2', voice: '' },
90
+ ]);
91
+ });
92
+
93
+ it('list of string models with voices', () => {
94
+ const result = normalizeTTSFallback(['cartesia/sonic:voice1', 'elevenlabs:voice2']);
95
+ expect(result).toEqual([
96
+ { model: 'cartesia/sonic', voice: 'voice1' },
97
+ { model: 'elevenlabs', voice: 'voice2' },
98
+ ]);
99
+ });
100
+
101
+ it('list of FallbackModel dicts', () => {
102
+ const fallbacks: TTSFallbackModel[] = [
103
+ { model: 'cartesia/sonic', voice: 'narrator' },
104
+ { model: 'elevenlabs', voice: '' },
105
+ ];
106
+ const result = normalizeTTSFallback(fallbacks);
107
+ expect(result).toEqual([
108
+ { model: 'cartesia/sonic', voice: 'narrator' },
109
+ { model: 'elevenlabs', voice: '' },
110
+ ]);
111
+ });
112
+
113
+ it('mixed list of strings and dicts', () => {
114
+ const result = normalizeTTSFallback([
115
+ 'cartesia/sonic:voice1',
116
+ { model: 'elevenlabs/eleven_flash_v2', voice: 'custom' } as TTSFallbackModel,
117
+ 'rime/mist',
118
+ ]);
119
+ expect(result).toEqual([
120
+ { model: 'cartesia/sonic', voice: 'voice1' },
121
+ { model: 'elevenlabs/eleven_flash_v2', voice: 'custom' },
122
+ { model: 'rime/mist', voice: '' },
123
+ ]);
124
+ });
125
+
126
+ it('FallbackModel with extraKwargs is preserved', () => {
127
+ const fallback: TTSFallbackModel = {
128
+ model: 'cartesia/sonic',
129
+ voice: 'narrator',
130
+ extraKwargs: { duration: 30.0, speed: 'fast' },
131
+ };
132
+ const result = normalizeTTSFallback(fallback);
133
+ expect(result).toEqual([
134
+ {
135
+ model: 'cartesia/sonic',
136
+ voice: 'narrator',
137
+ extraKwargs: { duration: 30.0, speed: 'fast' },
138
+ },
139
+ ]);
140
+ });
141
+
142
+ it('list with extraKwargs preserved', () => {
143
+ const result = normalizeTTSFallback([
144
+ { model: 'cartesia/sonic', voice: 'v1', extraKwargs: { speed: 'slow' } } as TTSFallbackModel,
145
+ 'elevenlabs:voice2',
146
+ { model: 'rime/mist', voice: '', extraKwargs: { custom: true } } as TTSFallbackModel,
147
+ ]);
148
+ expect(result).toEqual([
149
+ { model: 'cartesia/sonic', voice: 'v1', extraKwargs: { speed: 'slow' } },
150
+ { model: 'elevenlabs', voice: 'voice2' },
151
+ { model: 'rime/mist', voice: '', extraKwargs: { custom: true } },
152
+ ]);
153
+ });
154
+
155
+ it('empty list returns empty list', () => {
156
+ const result = normalizeTTSFallback([]);
157
+ expect(result).toEqual([]);
158
+ });
159
+
160
+ it('FallbackModel with empty voice', () => {
161
+ const fallback: TTSFallbackModel = { model: 'cartesia/sonic', voice: '' };
162
+ const result = normalizeTTSFallback(fallback);
163
+ expect(result).toEqual([{ model: 'cartesia/sonic', voice: '' }]);
164
+ });
165
+ });
166
+
167
+ describe('TTS constructor fallback and connOptions', () => {
168
+ it('fallback not given defaults to undefined', () => {
169
+ const tts = makeTts();
170
+ expect(tts['opts'].fallback).toBeUndefined();
171
+ });
172
+
173
+ it('fallback single string is normalized', () => {
174
+ const tts = makeTts({ fallback: 'elevenlabs/eleven_flash_v2' });
175
+ expect(tts['opts'].fallback).toEqual([{ model: 'elevenlabs/eleven_flash_v2', voice: '' }]);
176
+ });
177
+
178
+ it('fallback single string with voice is normalized', () => {
179
+ const tts = makeTts({ fallback: 'cartesia/sonic:my-voice' });
180
+ expect(tts['opts'].fallback).toEqual([{ model: 'cartesia/sonic', voice: 'my-voice' }]);
181
+ });
182
+
183
+ it('fallback list of strings is normalized', () => {
184
+ const tts = makeTts({ fallback: ['cartesia/sonic', 'elevenlabs'] });
185
+ expect(tts['opts'].fallback).toEqual([
186
+ { model: 'cartesia/sonic', voice: '' },
187
+ { model: 'elevenlabs', voice: '' },
188
+ ]);
189
+ });
190
+
191
+ it('fallback single FallbackModel is normalized to list', () => {
192
+ const tts = makeTts({ fallback: { model: 'cartesia/sonic', voice: 'narrator' } });
193
+ expect(tts['opts'].fallback).toEqual([{ model: 'cartesia/sonic', voice: 'narrator' }]);
194
+ });
195
+
196
+ it('fallback with extraKwargs is preserved', () => {
197
+ const tts = makeTts({
198
+ fallback: {
199
+ model: 'cartesia/sonic',
200
+ voice: 'narrator',
201
+ extraKwargs: { duration: 30.0, speed: 'fast' },
202
+ },
203
+ });
204
+ expect(tts['opts'].fallback).toEqual([
205
+ {
206
+ model: 'cartesia/sonic',
207
+ voice: 'narrator',
208
+ extraKwargs: { duration: 30.0, speed: 'fast' },
209
+ },
210
+ ]);
211
+ });
212
+
213
+ it('fallback mixed list is normalized', () => {
214
+ const tts = makeTts({
215
+ fallback: [
216
+ 'cartesia/sonic:voice1',
217
+ { model: 'elevenlabs', voice: 'custom', extraKwargs: { speed: 'slow' } },
218
+ 'rime/mist',
219
+ ],
220
+ });
221
+ expect(tts['opts'].fallback).toEqual([
222
+ { model: 'cartesia/sonic', voice: 'voice1' },
223
+ { model: 'elevenlabs', voice: 'custom', extraKwargs: { speed: 'slow' } },
224
+ { model: 'rime/mist', voice: '' },
225
+ ]);
226
+ });
227
+
228
+ it('connOptions not given uses default', () => {
229
+ const tts = makeTts();
230
+ expect(tts['opts'].connOptions).toEqual(DEFAULT_API_CONNECT_OPTIONS);
231
+ });
232
+
233
+ it('connOptions custom timeout', () => {
234
+ const custom: APIConnectOptions = { timeoutMs: 30000, maxRetry: 3, retryIntervalMs: 2000 };
235
+ const tts = makeTts({ connOptions: custom });
236
+ expect(tts['opts'].connOptions).toEqual(custom);
237
+ expect(tts['opts'].connOptions!.timeoutMs).toBe(30000);
238
+ });
239
+
240
+ it('connOptions custom maxRetry', () => {
241
+ const custom: APIConnectOptions = { timeoutMs: 10000, maxRetry: 5, retryIntervalMs: 2000 };
242
+ const tts = makeTts({ connOptions: custom });
243
+ expect(tts['opts'].connOptions).toEqual(custom);
244
+ expect(tts['opts'].connOptions!.maxRetry).toBe(5);
245
+ });
246
+
247
+ it('connOptions full custom', () => {
248
+ const custom: APIConnectOptions = { timeoutMs: 60000, maxRetry: 10, retryIntervalMs: 2000 };
249
+ const tts = makeTts({ connOptions: custom });
250
+ expect(tts['opts'].connOptions).toEqual(custom);
251
+ expect(tts['opts'].connOptions!.timeoutMs).toBe(60000);
252
+ expect(tts['opts'].connOptions!.maxRetry).toBe(10);
253
+ expect(tts['opts'].connOptions!.retryIntervalMs).toBe(2000);
254
+ });
255
+ });