@livekit/agents 1.0.42 → 1.0.44

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. package/dist/inference/index.cjs +8 -0
  2. package/dist/inference/index.cjs.map +1 -1
  3. package/dist/inference/index.d.cts +2 -2
  4. package/dist/inference/index.d.ts +2 -2
  5. package/dist/inference/index.d.ts.map +1 -1
  6. package/dist/inference/index.js +8 -0
  7. package/dist/inference/index.js.map +1 -1
  8. package/dist/inference/stt.cjs +70 -12
  9. package/dist/inference/stt.cjs.map +1 -1
  10. package/dist/inference/stt.d.cts +34 -1
  11. package/dist/inference/stt.d.ts +34 -1
  12. package/dist/inference/stt.d.ts.map +1 -1
  13. package/dist/inference/stt.js +67 -11
  14. package/dist/inference/stt.js.map +1 -1
  15. package/dist/inference/stt.test.cjs +204 -0
  16. package/dist/inference/stt.test.cjs.map +1 -0
  17. package/dist/inference/stt.test.js +203 -0
  18. package/dist/inference/stt.test.js.map +1 -0
  19. package/dist/inference/tts.cjs +52 -10
  20. package/dist/inference/tts.cjs.map +1 -1
  21. package/dist/inference/tts.d.cts +22 -0
  22. package/dist/inference/tts.d.ts +22 -0
  23. package/dist/inference/tts.d.ts.map +1 -1
  24. package/dist/inference/tts.js +49 -9
  25. package/dist/inference/tts.js.map +1 -1
  26. package/dist/inference/tts.test.cjs +223 -0
  27. package/dist/inference/tts.test.cjs.map +1 -0
  28. package/dist/inference/tts.test.js +222 -0
  29. package/dist/inference/tts.test.js.map +1 -0
  30. package/dist/ipc/inference_proc_lazy_main.cjs +13 -1
  31. package/dist/ipc/inference_proc_lazy_main.cjs.map +1 -1
  32. package/dist/ipc/inference_proc_lazy_main.js +13 -1
  33. package/dist/ipc/inference_proc_lazy_main.js.map +1 -1
  34. package/dist/ipc/job_proc_lazy_main.cjs +8 -1
  35. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  36. package/dist/ipc/job_proc_lazy_main.js +9 -2
  37. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  38. package/dist/ipc/supervised_proc.cjs.map +1 -1
  39. package/dist/ipc/supervised_proc.d.cts +7 -0
  40. package/dist/ipc/supervised_proc.d.ts +7 -0
  41. package/dist/ipc/supervised_proc.d.ts.map +1 -1
  42. package/dist/ipc/supervised_proc.js.map +1 -1
  43. package/dist/stt/stt.cjs +4 -0
  44. package/dist/stt/stt.cjs.map +1 -1
  45. package/dist/stt/stt.d.cts +7 -0
  46. package/dist/stt/stt.d.ts +7 -0
  47. package/dist/stt/stt.d.ts.map +1 -1
  48. package/dist/stt/stt.js +4 -0
  49. package/dist/stt/stt.js.map +1 -1
  50. package/dist/transcription.cjs.map +1 -1
  51. package/dist/transcription.d.cts +6 -0
  52. package/dist/transcription.d.ts +6 -0
  53. package/dist/transcription.d.ts.map +1 -1
  54. package/dist/transcription.js.map +1 -1
  55. package/dist/utils.cjs +10 -2
  56. package/dist/utils.cjs.map +1 -1
  57. package/dist/utils.d.ts.map +1 -1
  58. package/dist/utils.js +10 -2
  59. package/dist/utils.js.map +1 -1
  60. package/dist/vad.cjs +1 -1
  61. package/dist/vad.cjs.map +1 -1
  62. package/dist/vad.d.cts +3 -2
  63. package/dist/vad.d.ts +3 -2
  64. package/dist/vad.d.ts.map +1 -1
  65. package/dist/vad.js +1 -1
  66. package/dist/vad.js.map +1 -1
  67. package/dist/voice/agent_activity.cjs +1 -2
  68. package/dist/voice/agent_activity.cjs.map +1 -1
  69. package/dist/voice/agent_activity.js +1 -2
  70. package/dist/voice/agent_activity.js.map +1 -1
  71. package/dist/voice/audio_recognition.cjs.map +1 -1
  72. package/dist/voice/audio_recognition.d.cts +14 -0
  73. package/dist/voice/audio_recognition.d.ts +14 -0
  74. package/dist/voice/audio_recognition.d.ts.map +1 -1
  75. package/dist/voice/audio_recognition.js.map +1 -1
  76. package/package.json +1 -1
  77. package/src/inference/index.ts +8 -0
  78. package/src/inference/stt.test.ts +236 -0
  79. package/src/inference/stt.ts +116 -20
  80. package/src/inference/tts.test.ts +255 -0
  81. package/src/inference/tts.ts +81 -15
  82. package/src/ipc/inference_proc_lazy_main.ts +13 -1
  83. package/src/ipc/job_proc_lazy_main.ts +18 -2
  84. package/src/ipc/supervised_proc.ts +7 -0
  85. package/src/stt/stt.ts +12 -0
  86. package/src/transcription.ts +6 -0
  87. package/src/utils.ts +10 -2
  88. package/src/vad.ts +4 -3
  89. package/src/voice/agent_activity.ts +1 -1
  90. package/src/voice/audio_recognition.ts +14 -0
@@ -0,0 +1,236 @@
1
+ // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import { beforeAll, describe, expect, it } from 'vitest';
5
+ import { initializeLogger } from '../log.js';
6
+ import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';
7
+ import { STT, type STTFallbackModel, normalizeSTTFallback, parseSTTModelString } from './stt.js';
8
+
// Configure the shared logger once for the whole spec file, at 'silent' level.
beforeAll(() => {
  initializeLogger({ pretty: false, level: 'silent' });
});

/**
 * Build an STT instance pre-filled with the credentials and base URL used by
 * these specs.
 *
 * @param overrides - Constructor options laid over the defaults; any key given
 *   here wins over the default with the same name.
 */
function makeStt(overrides: Record<string, unknown> = {}) {
  return new STT({
    model: 'deepgram' as const,
    apiKey: 'test-key',
    apiSecret: 'test-secret',
    baseURL: 'https://example.livekit.cloud',
    ...overrides,
  });
}
23
+
// Specs for splitting "provider/model:language" strings into [model, language].
describe('parseSTTModelString', () => {
  it('simple model without language', () => {
    const parsed = parseSTTModelString('deepgram');
    expect(parsed[0]).toBe('deepgram');
    expect(parsed[1]).toBeUndefined();
  });

  it('model with language suffix', () => {
    const parsed = parseSTTModelString('deepgram:en');
    expect(parsed[0]).toBe('deepgram');
    expect(parsed[1]).toBe('en');
  });

  it('provider/model format without language', () => {
    const parsed = parseSTTModelString('deepgram/nova-3');
    expect(parsed[0]).toBe('deepgram/nova-3');
    expect(parsed[1]).toBeUndefined();
  });

  it('provider/model format with language', () => {
    const parsed = parseSTTModelString('deepgram/nova-3:en');
    expect(parsed[0]).toBe('deepgram/nova-3');
    expect(parsed[1]).toBe('en');
  });

  // Each row: input string, expected model, expected language.
  it.each([
    ['cartesia/ink-whisper:de', 'cartesia/ink-whisper', 'de'],
    ['assemblyai:es', 'assemblyai', 'es'],
    ['deepgram/nova-2-medical:ja', 'deepgram/nova-2-medical', 'ja'],
    ['deepgram/nova-3:multi', 'deepgram/nova-3', 'multi'],
    ['cartesia:zh', 'cartesia', 'zh'],
  ])('various providers and languages: %s', (modelStr, expectedModel, expectedLang) => {
    const parsed = parseSTTModelString(modelStr);
    expect(parsed[0]).toBe(expectedModel);
    expect(parsed[1]).toBe(expectedLang);
  });

  it('auto model without language', () => {
    const parsed = parseSTTModelString('auto');
    expect(parsed[0]).toBe('auto');
    expect(parsed[1]).toBeUndefined();
  });

  it('auto model with language', () => {
    const parsed = parseSTTModelString('auto:pt');
    expect(parsed[0]).toBe('auto');
    expect(parsed[1]).toBe('pt');
  });
});
73
+
// Specs for turning a string / object / mixed list into STTFallbackModel[].
describe('normalizeSTTFallback', () => {
  it('single string model', () => {
    expect(normalizeSTTFallback('deepgram/nova-3')).toEqual([{ model: 'deepgram/nova-3' }]);
  });

  it('single FallbackModel dict', () => {
    const single: STTFallbackModel = { model: 'deepgram/nova-3' };
    expect(normalizeSTTFallback(single)).toEqual([{ model: 'deepgram/nova-3' }]);
  });

  it('list of string models', () => {
    const normalized = normalizeSTTFallback(['deepgram/nova-3', 'cartesia/ink-whisper']);
    expect(normalized).toEqual([{ model: 'deepgram/nova-3' }, { model: 'cartesia/ink-whisper' }]);
  });

  it('list of FallbackModel dicts', () => {
    const models: STTFallbackModel[] = [{ model: 'deepgram/nova-3' }, { model: 'assemblyai' }];
    expect(normalizeSTTFallback(models)).toEqual([
      { model: 'deepgram/nova-3' },
      { model: 'assemblyai' },
    ]);
  });

  it('mixed list of strings and dicts', () => {
    const cartesia: STTFallbackModel = { model: 'cartesia/ink-whisper' };
    const normalized = normalizeSTTFallback(['deepgram/nova-3', cartesia, 'assemblyai']);
    expect(normalized).toEqual([
      { model: 'deepgram/nova-3' },
      { model: 'cartesia/ink-whisper' },
      { model: 'assemblyai' },
    ]);
  });

  it('string with language suffix discards language', () => {
    expect(normalizeSTTFallback('deepgram/nova-3:en')).toEqual([{ model: 'deepgram/nova-3' }]);
  });

  it('FallbackModel with extraKwargs is preserved', () => {
    const withKwargs: STTFallbackModel = {
      model: 'deepgram/nova-3',
      extraKwargs: { keywords: [['livekit', 1.5]], punctuate: true },
    };
    expect(normalizeSTTFallback(withKwargs)).toEqual([
      {
        model: 'deepgram/nova-3',
        extraKwargs: { keywords: [['livekit', 1.5]], punctuate: true },
      },
    ]);
  });

  it('list with extraKwargs preserved', () => {
    const deepgram: STTFallbackModel = {
      model: 'deepgram/nova-3',
      extraKwargs: { punctuate: true },
    };
    const assemblyai: STTFallbackModel = {
      model: 'assemblyai',
      extraKwargs: { format_turns: true },
    };
    const normalized = normalizeSTTFallback([deepgram, 'cartesia/ink-whisper', assemblyai]);
    expect(normalized).toEqual([
      { model: 'deepgram/nova-3', extraKwargs: { punctuate: true } },
      { model: 'cartesia/ink-whisper' },
      { model: 'assemblyai', extraKwargs: { format_turns: true } },
    ]);
  });

  it('empty list returns empty list', () => {
    expect(normalizeSTTFallback([])).toEqual([]);
  });

  // Only the text after the final colon is treated as the language suffix.
  it('multiple colons in model string splits on last', () => {
    expect(normalizeSTTFallback('some:model:part:fr')).toEqual([{ model: 'some:model:part' }]);
  });
});
152
+
// Specs for how the STT constructor stores `fallback` and `connOptions` on its
// internal `opts` (read through bracket access).
describe('STT constructor fallback and connOptions', () => {
  it('fallback not given defaults to undefined', () => {
    const { fallback } = makeStt()['opts'];
    expect(fallback).toBeUndefined();
  });

  it('fallback single string is normalized', () => {
    const { fallback } = makeStt({ fallback: 'cartesia/ink-whisper' })['opts'];
    expect(fallback).toEqual([{ model: 'cartesia/ink-whisper' }]);
  });

  it('fallback list of strings is normalized', () => {
    const { fallback } = makeStt({ fallback: ['deepgram/nova-3', 'assemblyai'] })['opts'];
    expect(fallback).toEqual([{ model: 'deepgram/nova-3' }, { model: 'assemblyai' }]);
  });

  it('fallback single FallbackModel is normalized to list', () => {
    const { fallback } = makeStt({ fallback: { model: 'deepgram/nova-3' } })['opts'];
    expect(fallback).toEqual([{ model: 'deepgram/nova-3' }]);
  });

  it('fallback with extraKwargs is preserved', () => {
    const given = {
      model: 'deepgram/nova-3',
      extraKwargs: { punctuate: true, keywords: [['livekit', 1.5]] },
    };
    const { fallback } = makeStt({ fallback: given })['opts'];
    expect(fallback).toEqual([
      {
        model: 'deepgram/nova-3',
        extraKwargs: { punctuate: true, keywords: [['livekit', 1.5]] },
      },
    ]);
  });

  it('fallback mixed list is normalized', () => {
    const given = [
      'deepgram/nova-3',
      { model: 'cartesia', extraKwargs: { min_volume: 0.5 } },
      'assemblyai',
    ];
    const { fallback } = makeStt({ fallback: given })['opts'];
    expect(fallback).toEqual([
      { model: 'deepgram/nova-3' },
      { model: 'cartesia', extraKwargs: { min_volume: 0.5 } },
      { model: 'assemblyai' },
    ]);
  });

  it('fallback string with language discards language', () => {
    const { fallback } = makeStt({ fallback: 'deepgram/nova-3:en' })['opts'];
    expect(fallback).toEqual([{ model: 'deepgram/nova-3' }]);
  });

  it('connOptions not given uses default', () => {
    const { connOptions } = makeStt()['opts'];
    expect(connOptions).toEqual(DEFAULT_API_CONNECT_OPTIONS);
  });

  it('connOptions custom timeout', () => {
    const custom: APIConnectOptions = { timeoutMs: 30000, maxRetry: 3, retryIntervalMs: 2000 };
    const { connOptions } = makeStt({ connOptions: custom })['opts'];
    expect(connOptions).toEqual(custom);
    expect(connOptions?.timeoutMs).toBe(30000);
  });

  it('connOptions custom maxRetry', () => {
    const custom: APIConnectOptions = { timeoutMs: 10000, maxRetry: 5, retryIntervalMs: 2000 };
    const { connOptions } = makeStt({ connOptions: custom })['opts'];
    expect(connOptions).toEqual(custom);
    expect(connOptions?.maxRetry).toBe(5);
  });

  it('connOptions full custom', () => {
    const custom: APIConnectOptions = { timeoutMs: 60000, maxRetry: 10, retryIntervalMs: 2000 };
    const { connOptions } = makeStt({ connOptions: custom })['opts'];
    expect(connOptions).toEqual(custom);
    expect(connOptions?.timeoutMs).toBe(60000);
    expect(connOptions?.maxRetry).toBe(10);
    expect(connOptions?.retryIntervalMs).toBe(2000);
  });
});
@@ -42,29 +42,46 @@ export type AssemblyaiModels =
42
42
  export type ElevenlabsSTTModels = 'elevenlabs/scribe_v2_realtime';
43
43
 
/** Provider-specific settings accepted by Cartesia STT models. */
export interface CartesiaOptions {
  /**
   * Minimum volume threshold.
   * Default: not specified.
   */
  min_volume?: number;
  /**
   * Maximum silence duration, in seconds.
   * Default: not specified.
   */
  max_silence_duration_secs?: number;
}
48
50
 
/** Provider-specific settings accepted by Deepgram STT models. */
export interface DeepgramOptions {
  /**
   * Enable filler words.
   * Default: true.
   */
  filler_words?: boolean;
  /**
   * Enable interim results.
   * Default: true.
   */
  interim_results?: boolean;
  /**
   * Endpointing timeout, in milliseconds.
   * Default: 25.
   */
  endpointing?: number;
  /**
   * Enable punctuation.
   * Default: false.
   */
  punctuate?: boolean;
  /** Enable smart formatting. */
  smart_format?: boolean;
  /** Keywords, each paired with a boost value. */
  keywords?: Array<[string, number]>;
  /** Key terms for recognition. */
  keyterms?: string[];
  /** Enable the profanity filter. */
  profanity_filter?: boolean;
  /** Convert spoken numbers to numerals. */
  numerals?: boolean;
  /** Opt out of the model improvement program. */
  mip_opt_out?: boolean;
}
61
73
 
/** Provider-specific settings accepted by AssemblyAI STT models. */
export interface AssemblyAIOptions {
  /**
   * Enable turn formatting.
   * Default: false.
   */
  format_turns?: boolean;
  /**
   * End-of-turn confidence threshold.
   * Default: 0.01.
   */
  end_of_turn_confidence_threshold?: number;
  /**
   * Minimum silence duration, in milliseconds, when confident about end of turn.
   * Default: 0.
   */
  min_end_of_turn_silence_when_confident?: number;
  /**
   * Maximum turn silence, in milliseconds.
   * Default: not specified.
   */
  max_turn_silence?: number;
  /**
   * Key terms prompt for recognition.
   * Default: not specified.
   */
  keyterms_prompt?: string[];
}
69
86
 
70
87
  export type STTLanguages =
@@ -93,6 +110,43 @@ export type STTOptions<TModel extends STTModels> = TModel extends DeepgramModels
93
110
  ? AssemblyAIOptions
94
111
  : Record<string, unknown>;
95
112
 
/**
 * One fallback model entry, optionally carrying extra configuration.
 * Extra fields are passed through to the provider.
 */
export interface STTFallbackModel {
  /**
   * Model name, e.g. "deepgram/nova-3", "assemblyai/universal-streaming"
   * or "cartesia/ink-whisper".
   */
  model: string;
  /** Extra configuration for the model. */
  extraKwargs?: Record<string, unknown>;
}

/** A fallback given either as a bare model string or as a full entry. */
export type STTFallbackModelType = string | STTFallbackModel;
122
+
/**
 * Split a model string of the form `"provider/model:language"` into its parts.
 *
 * The split happens at the LAST colon, so everything before it (including any
 * earlier colons) is kept as the model name.
 *
 * @param model - Model string, optionally suffixed with `:language`.
 * @returns A `[model, language]` tuple. `language` is `undefined` when the
 *   string has no colon or when the suffix after the last colon is empty
 *   (e.g. `"deepgram:"`).
 */
export function parseSTTModelString(model: string): [string, string | undefined] {
  const idx = model.lastIndexOf(':');
  if (idx === -1) {
    return [model, undefined];
  }
  // An empty suffix means no language was actually specified; report it as
  // undefined so callers never receive '' in place of a language code.
  const language = model.slice(idx + 1);
  return [model.slice(0, idx), language === '' ? undefined : language];
}
131
+
/**
 * Coerce a fallback specification into a uniform `STTFallbackModel[]`.
 *
 * String entries are parsed with `parseSTTModelString` and only the model part
 * is kept (any `:language` suffix is dropped). Object entries are returned
 * as-is, so the same object reference appears in the result.
 *
 * @param fallback - A single entry or an array of entries, each a model string
 *   or an `STTFallbackModel`.
 * @returns A new array with one `STTFallbackModel` per input entry, in order.
 */
export function normalizeSTTFallback(
  fallback: STTFallbackModelType | STTFallbackModelType[],
): STTFallbackModel[] {
  const entries = Array.isArray(fallback) ? fallback : [fallback];
  return entries.map((entry) =>
    typeof entry === 'string' ? { model: parseSTTModelString(entry)[0] } : entry,
  );
}
149
+
96
150
  export type STTEncoding = 'pcm_s16le';
97
151
 
98
152
  const DEFAULT_ENCODING: STTEncoding = 'pcm_s16le';
@@ -109,6 +163,8 @@ export interface InferenceSTTOptions<TModel extends STTModels> {
109
163
  apiKey: string;
110
164
  apiSecret: string;
111
165
  modelOptions: STTOptions<TModel>;
166
+ fallback?: STTFallbackModel[];
167
+ connOptions?: APIConnectOptions;
112
168
  }
113
169
 
114
170
  /**
@@ -121,7 +177,7 @@ export class STT<TModel extends STTModels> extends BaseSTT {
121
177
  #logger = log();
122
178
 
123
179
  constructor(opts?: {
124
- model?: TModel;
180
+ model?: ModelWithLanguage;
125
181
  language?: STTLanguages;
126
182
  baseURL?: string;
127
183
  encoding?: STTEncoding;
@@ -129,6 +185,8 @@ export class STT<TModel extends STTModels> extends BaseSTT {
129
185
  apiKey?: string;
130
186
  apiSecret?: string;
131
187
  modelOptions?: STTOptions<TModel>;
188
+ fallback?: STTFallbackModelType | STTFallbackModelType[];
189
+ connOptions?: APIConnectOptions;
132
190
  }) {
133
191
  super({ streaming: true, interimResults: true, alignedTranscript: 'word' });
134
192
 
@@ -141,6 +199,8 @@ export class STT<TModel extends STTModels> extends BaseSTT {
141
199
  apiKey,
142
200
  apiSecret,
143
201
  modelOptions = {} as STTOptions<TModel>,
202
+ fallback,
203
+ connOptions,
144
204
  } = opts || {};
145
205
 
146
206
  const lkBaseURL = baseURL || process.env.LIVEKIT_INFERENCE_URL || DEFAULT_BASE_URL;
@@ -155,15 +215,37 @@ export class STT<TModel extends STTModels> extends BaseSTT {
155
215
  throw new Error('apiSecret is required: pass apiSecret or set LIVEKIT_API_SECRET');
156
216
  }
157
217
 
218
+ // Parse language from model string if provided: "provider/model:language"
219
+ let nextModel = model;
220
+ let nextLanguage = language;
221
+ if (typeof nextModel === 'string') {
222
+ const idx = nextModel.lastIndexOf(':');
223
+ if (idx !== -1) {
224
+ const languageFromModel = nextModel.slice(idx + 1) as STTLanguages;
225
+ if (nextLanguage && nextLanguage !== languageFromModel) {
226
+ this.#logger.warn(
227
+ '`language` is provided via both argument and model, using the one from the argument',
228
+ { language: nextLanguage, model: nextModel },
229
+ );
230
+ } else {
231
+ nextLanguage = languageFromModel;
232
+ }
233
+ nextModel = nextModel.slice(0, idx) as TModel;
234
+ }
235
+ }
236
+ const normalizedFallback = fallback ? normalizeSTTFallback(fallback) : undefined;
237
+
158
238
  this.opts = {
159
- model,
160
- language,
239
+ model: nextModel as TModel,
240
+ language: nextLanguage,
161
241
  encoding,
162
242
  sampleRate,
163
243
  baseURL: lkBaseURL,
164
244
  apiKey: lkApiKey,
165
245
  apiSecret: lkApiSecret,
166
246
  modelOptions,
247
+ fallback: normalizedFallback,
248
+ connOptions: connOptions ?? DEFAULT_API_CONNECT_OPTIONS,
167
249
  };
168
250
  }
169
251
 
@@ -172,11 +254,8 @@ export class STT<TModel extends STTModels> extends BaseSTT {
172
254
  }
173
255
 
174
256
  static fromModelString(modelString: string): STT<AnyString> {
175
- if (modelString.includes(':')) {
176
- const [model, language] = modelString.split(':') as [AnyString, STTLanguages];
177
- return new STT({ model, language });
178
- }
179
- return new STT({ model: modelString });
257
+ const [model, language] = parseSTTModelString(modelString);
258
+ return new STT({ model, language });
180
259
  }
181
260
 
182
261
  protected async _recognize(_: AudioBuffer): Promise<SpeechEvent> {
@@ -195,7 +274,8 @@ export class STT<TModel extends STTModels> extends BaseSTT {
195
274
  language?: STTLanguages | string;
196
275
  connOptions?: APIConnectOptions;
197
276
  }): SpeechStream<TModel> {
198
- const { language, connOptions = DEFAULT_API_CONNECT_OPTIONS } = options || {};
277
+ const { language, connOptions = this.opts.connOptions ?? DEFAULT_API_CONNECT_OPTIONS } =
278
+ options || {};
199
279
  const streamOpts = {
200
280
  ...this.opts,
201
281
  language: language ?? this.opts.language,
@@ -224,6 +304,22 @@ export class STT<TModel extends STTModels> extends BaseSTT {
224
304
  (params.settings as Record<string, unknown>).language = this.opts.language;
225
305
  }
226
306
 
307
+ if (this.opts.fallback?.length) {
308
+ params.fallback = {
309
+ models: this.opts.fallback.map((m) => ({
310
+ model: m.model,
311
+ extra: m.extraKwargs ?? {},
312
+ })),
313
+ };
314
+ }
315
+
316
+ if (this.opts.connOptions) {
317
+ params.connection = {
318
+ timeout: this.opts.connOptions.timeoutMs / 1000,
319
+ retries: this.opts.connOptions.maxRetry,
320
+ };
321
+ }
322
+
227
323
  let baseURL = this.opts.baseURL;
228
324
  if (baseURL.startsWith('http://') || baseURL.startsWith('https://')) {
229
325
  baseURL = baseURL.replace('http', 'ws');