@livekit/agents-plugin-sarvam 1.0.45

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +110 -0
  3. package/dist/index.cjs +52 -0
  4. package/dist/index.cjs.map +1 -0
  5. package/dist/index.d.cts +4 -0
  6. package/dist/index.d.ts +4 -0
  7. package/dist/index.d.ts.map +1 -0
  8. package/dist/index.js +29 -0
  9. package/dist/index.js.map +1 -0
  10. package/dist/models.cjs +17 -0
  11. package/dist/models.cjs.map +1 -0
  12. package/dist/models.d.cts +36 -0
  13. package/dist/models.d.ts +36 -0
  14. package/dist/models.d.ts.map +1 -0
  15. package/dist/models.js +1 -0
  16. package/dist/models.js.map +1 -0
  17. package/dist/stt.cjs +499 -0
  18. package/dist/stt.cjs.map +1 -0
  19. package/dist/stt.d.cts +104 -0
  20. package/dist/stt.d.ts +104 -0
  21. package/dist/stt.d.ts.map +1 -0
  22. package/dist/stt.js +483 -0
  23. package/dist/stt.js.map +1 -0
  24. package/dist/stt.test.cjs +18 -0
  25. package/dist/stt.test.cjs.map +1 -0
  26. package/dist/stt.test.d.cts +2 -0
  27. package/dist/stt.test.d.ts +2 -0
  28. package/dist/stt.test.d.ts.map +1 -0
  29. package/dist/stt.test.js +17 -0
  30. package/dist/stt.test.js.map +1 -0
  31. package/dist/tts.cjs +405 -0
  32. package/dist/tts.cjs.map +1 -0
  33. package/dist/tts.d.cts +111 -0
  34. package/dist/tts.d.ts +111 -0
  35. package/dist/tts.d.ts.map +1 -0
  36. package/dist/tts.js +385 -0
  37. package/dist/tts.js.map +1 -0
  38. package/dist/tts.test.cjs +17 -0
  39. package/dist/tts.test.cjs.map +1 -0
  40. package/dist/tts.test.d.cts +2 -0
  41. package/dist/tts.test.d.ts +2 -0
  42. package/dist/tts.test.d.ts.map +1 -0
  43. package/dist/tts.test.js +16 -0
  44. package/dist/tts.test.js.map +1 -0
  45. package/package.json +54 -0
  46. package/src/index.ts +34 -0
  47. package/src/models.ts +135 -0
  48. package/src/stt.test.ts +23 -0
  49. package/src/stt.ts +770 -0
  50. package/src/tts.test.ts +22 -0
  51. package/src/tts.ts +571 -0
package/src/stt.ts ADDED
@@ -0,0 +1,770 @@
1
+ // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import {
5
+ type APIConnectOptions,
6
+ type AudioBuffer,
7
+ AudioByteStream,
8
+ AudioEnergyFilter,
9
+ Future,
10
+ Task,
11
+ log,
12
+ mergeFrames,
13
+ stt,
14
+ waitForAbort,
15
+ } from '@livekit/agents';
16
+ import type { AudioFrame } from '@livekit/rtc-node';
17
+ import { type RawData, WebSocket } from 'ws';
18
+ import type {
19
+ STTLanguages,
20
+ STTModels,
21
+ STTModes,
22
+ STTV2Languages,
23
+ STTV3Languages,
24
+ } from './models.js';
25
+
26
// ---------------------------------------------------------------------------
// Endpoint URLs
// ---------------------------------------------------------------------------

// REST endpoints: one-shot transcription vs. dedicated Indic-to-English translate.
const SARVAM_STT_REST_URL = 'https://api.sarvam.ai/speech-to-text';
const SARVAM_STT_TRANSLATE_REST_URL = 'https://api.sarvam.ai/speech-to-text-translate';
// WebSocket endpoints mirror the REST split.
const SARVAM_STT_WS_URL = 'wss://api.sarvam.ai/speech-to-text/ws';
const SARVAM_STT_TRANSLATE_WS_URL = 'wss://api.sarvam.ai/speech-to-text-translate/ws';

// Audio format this plugin sends to Sarvam: 16 kHz mono 16-bit PCM
// (see `sample_rate`/`input_audio_codec` WS params and the WAV encoder below).
const SAMPLE_RATE = 16000;
const NUM_CHANNELS = 1;
37
+
38
// ---------------------------------------------------------------------------
// Model-specific option types
// ---------------------------------------------------------------------------

/** Options shared by every Sarvam STT model. */
interface STTBaseOptions {
  /** Sarvam API key. Defaults to $SARVAM_API_KEY */
  apiKey?: string;
  /**
   * Whether to use native WebSocket streaming for `stream()`.
   * Set to `false` to prefer non-streaming REST recognition (used by Agent via StreamAdapter + VAD).
   * Default: `true`.
   */
  streaming?: boolean;
  /** Increase VAD sensitivity (WS only). Maps to `high_vad_sensitivity` query param. */
  highVadSensitivity?: boolean;
  /** Enable flush signal events from server (WS only). Maps to `flush_signal` query param. */
  flushSignal?: boolean;
}

/**
 * Options specific to saarika:v2.5.
 * saarika:v2.5 will be deprecated soon — prefer {@link STTV3Options} with `saaras:v3` for new integrations.
 * All v2.5 language codes are also supported by v3.
 * @see {@link https://docs.sarvam.ai/api-reference-docs/speech-to-text/transcribe | Sarvam STT API docs}
 */
export interface STTV2Options extends STTBaseOptions {
  model: 'saarika:v2.5';
  /** Language code (BCP-47). Default: 'en-IN'. Set to 'unknown' for auto-detection. */
  languageCode?: STTV2Languages | string;
  /** Return chunk-level timestamps in REST response */
  withTimestamps?: boolean;
}

/**
 * Options specific to saaras:v2.5 (dedicated translate endpoint).
 * Uses the `/speech-to-text-translate` endpoint for Indic-to-English translation.
 * Auto-detects the source language; does not accept language codes or timestamps.
 * @see {@link https://docs.sarvam.ai/api-reference-docs/speech-to-text-translate/translate | Sarvam STT Translate docs}
 */
export interface STTTranslateOptions extends STTBaseOptions {
  model: 'saaras:v2.5';
  /** Conversation context to boost model accuracy */
  prompt?: string;
  /** Mode for translate WS. Default: 'translate'. */
  mode?: STTModes | string;
}

/**
 * Options specific to saaras:v3 (recommended).
 * @see {@link https://docs.sarvam.ai/api-reference-docs/speech-to-text/transcribe | Sarvam STT API docs}
 */
export interface STTV3Options extends STTBaseOptions {
  model?: 'saaras:v3';
  /** Language code (BCP-47). Default: 'en-IN'. Set to 'unknown' for auto-detection. */
  languageCode?: STTV3Languages | string;
  /** Transcription mode (v3 only). Default: 'transcribe' */
  mode?: STTModes | string;
  /** Conversation context to boost model accuracy */
  prompt?: string;
  /** Return chunk-level timestamps in REST response */
  withTimestamps?: boolean;
}

/** Combined options — discriminated by `model` field */
export type STTOptions = STTV2Options | STTTranslateOptions | STTV3Options;
103
+
104
// ---------------------------------------------------------------------------
// Resolved (internal) options — flat union of all fields
// ---------------------------------------------------------------------------

/**
 * Internal, fully-defaulted option struct produced by `resolveOptions`.
 * Unlike the public discriminated union, this is flat: fields irrelevant to
 * the selected model are simply left `undefined`.
 */
interface ResolvedSTTOptions {
  apiKey: string;
  model: STTModels;
  streaming: boolean;
  // saarika:v2.5 and saaras:v3 only — not used by saaras:v2.5 (translate auto-detects)
  languageCode?: STTLanguages | string;
  // saaras:v3 and saaras:v2.5 (translate)
  mode?: STTModes | string;
  // saaras:v2.5 (translate) and saaras:v3
  prompt?: string;
  // saarika:v2.5 and saaras:v3 (/speech-to-text only, not translate)
  withTimestamps?: boolean;
  // WS-only flags
  highVadSensitivity?: boolean;
  flushSignal?: boolean;
}
124
+
125
+ // ---------------------------------------------------------------------------
126
+ // Defaults per model
127
+ // ---------------------------------------------------------------------------
128
+
129
+ const SAARIKA_DEFAULTS = {
130
+ languageCode: 'en-IN',
131
+ };
132
+
133
+ const SAARAS_V3_DEFAULTS = {
134
+ languageCode: 'en-IN',
135
+ mode: 'transcribe',
136
+ };
137
+
138
+ const SAARAS_TRANSLATE_DEFAULTS = {
139
+ mode: 'translate',
140
+ };
141
+
142
+ /** Runtime set of languages supported by saarika:v2.5 (for validation on model switch) */
143
+ const STTV2_LANGUAGE_SET: ReadonlySet<string> = new Set<STTV2Languages>([
144
+ 'unknown',
145
+ 'hi-IN',
146
+ 'bn-IN',
147
+ 'kn-IN',
148
+ 'ml-IN',
149
+ 'mr-IN',
150
+ 'od-IN',
151
+ 'pa-IN',
152
+ 'ta-IN',
153
+ 'te-IN',
154
+ 'en-IN',
155
+ 'gu-IN',
156
+ ]);
157
+
158
+ // ---------------------------------------------------------------------------
159
+ // Resolve caller options into a fully-populated internal struct
160
+ // ---------------------------------------------------------------------------
161
+
162
+ function resolveOptions(opts: Partial<STTOptions>): ResolvedSTTOptions {
163
+ const apiKey = opts.apiKey ?? process.env.SARVAM_API_KEY;
164
+ if (!apiKey) {
165
+ throw new Error('Sarvam API key is required, whether as an argument or as $SARVAM_API_KEY');
166
+ }
167
+
168
+ const model: STTModels = opts.model ?? 'saaras:v3';
169
+
170
+ const base: ResolvedSTTOptions = {
171
+ apiKey,
172
+ model,
173
+ streaming: opts.streaming ?? true,
174
+ highVadSensitivity: opts.highVadSensitivity,
175
+ flushSignal: opts.flushSignal,
176
+ };
177
+
178
+ if (model === 'saaras:v2.5') {
179
+ const translateOpts = opts as STTTranslateOptions;
180
+ base.prompt = translateOpts.prompt;
181
+ base.mode = translateOpts.mode ?? SAARAS_TRANSLATE_DEFAULTS.mode;
182
+ } else if (model === 'saaras:v3') {
183
+ const v3Opts = opts as STTV3Options;
184
+ base.languageCode = v3Opts.languageCode ?? SAARAS_V3_DEFAULTS.languageCode;
185
+ base.mode = v3Opts.mode ?? SAARAS_V3_DEFAULTS.mode;
186
+ base.prompt = v3Opts.prompt;
187
+ base.withTimestamps = v3Opts.withTimestamps;
188
+ } else {
189
+ // saarika:v2.5
190
+ let languageCode = (opts as STTV2Options).languageCode ?? SAARIKA_DEFAULTS.languageCode;
191
+ if (!STTV2_LANGUAGE_SET.has(languageCode)) {
192
+ languageCode = SAARIKA_DEFAULTS.languageCode;
193
+ }
194
+ base.languageCode = languageCode;
195
+ base.withTimestamps = (opts as STTV2Options).withTimestamps;
196
+ }
197
+
198
+ return base;
199
+ }
200
+
201
+ // ---------------------------------------------------------------------------
202
+ // URL helpers
203
+ // ---------------------------------------------------------------------------
204
+
205
+ function getRestUrl(model: STTModels): string {
206
+ return model === 'saaras:v2.5' ? SARVAM_STT_TRANSLATE_REST_URL : SARVAM_STT_REST_URL;
207
+ }
208
+
209
+ function getWsUrl(model: STTModels): string {
210
+ return model === 'saaras:v2.5' ? SARVAM_STT_TRANSLATE_WS_URL : SARVAM_STT_WS_URL;
211
+ }
212
+
213
+ function buildWsUrl(opts: ResolvedSTTOptions): string {
214
+ const base = getWsUrl(opts.model);
215
+ const params = new URLSearchParams();
216
+ params.set('model', opts.model);
217
+ params.set('vad_signals', 'true');
218
+ params.set('sample_rate', String(SAMPLE_RATE));
219
+ params.set('input_audio_codec', 'pcm_s16le');
220
+
221
+ if (opts.model !== 'saaras:v2.5' && opts.languageCode != null) {
222
+ params.set('language-code', opts.languageCode);
223
+ }
224
+
225
+ // mode: v3 on STT WS, and translate WS (both endpoints support it)
226
+ if (opts.mode != null) {
227
+ params.set('mode', opts.mode);
228
+ }
229
+
230
+ // Optional WS params
231
+ if (opts.highVadSensitivity != null) {
232
+ params.set('high_vad_sensitivity', String(opts.highVadSensitivity));
233
+ }
234
+ if (opts.flushSignal != null) {
235
+ params.set('flush_signal', String(opts.flushSignal));
236
+ }
237
+
238
+ return `${base}?${params.toString()}`;
239
+ }
240
+
241
+ // ---------------------------------------------------------------------------
242
+ // Build the multipart form data (REST) — only sends model-relevant fields
243
+ // ---------------------------------------------------------------------------
244
+
245
+ function buildFormData(wavBlob: Blob, opts: ResolvedSTTOptions): FormData {
246
+ const formData = new FormData();
247
+ formData.append('file', wavBlob, 'audio.wav');
248
+ formData.append('model', opts.model);
249
+
250
+ if (opts.model !== 'saaras:v2.5' && opts.languageCode != null) {
251
+ formData.append('language_code', opts.languageCode);
252
+ }
253
+ if (opts.model === 'saaras:v3' && opts.mode != null) {
254
+ formData.append('mode', opts.mode);
255
+ }
256
+ if ((opts.model === 'saaras:v2.5' || opts.model === 'saaras:v3') && opts.prompt != null) {
257
+ formData.append('prompt', opts.prompt);
258
+ }
259
+ if (opts.model !== 'saaras:v2.5' && opts.withTimestamps) {
260
+ formData.append('with_timestamps', 'true');
261
+ }
262
+
263
+ return formData;
264
+ }
265
+
266
+ // ---------------------------------------------------------------------------
267
+ // WAV encoding helper
268
+ // ---------------------------------------------------------------------------
269
+
270
+ function createWav(frame: AudioFrame): Buffer {
271
+ const bitsPerSample = 16;
272
+ const byteRate = (frame.sampleRate * frame.channels * bitsPerSample) / 8;
273
+ const blockAlign = (frame.channels * bitsPerSample) / 8;
274
+
275
+ const header = Buffer.alloc(44);
276
+ header.write('RIFF', 0);
277
+ header.writeUInt32LE(36 + frame.data.byteLength, 4);
278
+ header.write('WAVE', 8);
279
+ header.write('fmt ', 12);
280
+ header.writeUInt32LE(16, 16);
281
+ header.writeUInt16LE(1, 20);
282
+ header.writeUInt16LE(frame.channels, 22);
283
+ header.writeUInt32LE(frame.sampleRate, 24);
284
+ header.writeUInt32LE(byteRate, 28);
285
+ header.writeUInt16LE(blockAlign, 32);
286
+ header.writeUInt16LE(bitsPerSample, 34);
287
+ header.write('data', 36);
288
+ header.writeUInt32LE(frame.data.byteLength, 40);
289
+
290
+ const pcm = Buffer.from(frame.data.buffer, frame.data.byteOffset, frame.data.byteLength);
291
+ return Buffer.concat([header, pcm]);
292
+ }
293
+
294
// ---------------------------------------------------------------------------
// REST response type
// ---------------------------------------------------------------------------

/** Body of a successful REST /speech-to-text(-translate) response. */
interface SarvamSTTResponse {
  request_id: string | null;
  transcript: string;
  language_code: string | null;
  language_probability?: number | null;
  /** Present only when `with_timestamps` was requested; arrays are index-aligned per word. */
  timestamps?: {
    words: string[];
    start_time_seconds: number[];
    end_time_seconds: number[];
  } | null;
}

// ---------------------------------------------------------------------------
// WS response types (from server Publish messages)
// ---------------------------------------------------------------------------

/** type: "data" — transcript payload from the streaming endpoint. */
interface SarvamWSTranscriptData {
  request_id?: string;
  transcript?: string;
  language_code?: string | null;
  language_probability?: number | null;
  timestamps?: Record<string, unknown> | null;
  diarized_transcript?: Record<string, unknown> | null;
  metrics?: {
    audio_duration?: number;
    processing_latency?: number;
  };
}

/** type: "events" — server-side VAD signals. */
interface SarvamWSEventData {
  event_type?: string;
  timestamp?: string;
  signal_type?: 'START_SPEECH' | 'END_SPEECH';
  // (sic) spelling matches the server payload — do not "correct" to occurred_at
  occured_at?: number;
}

/** type: "error" — server sends data with message and code fields */
interface SarvamWSErrorData {
  message?: string;
  error?: string;
  code?: string;
}
342
+
343
// ---------------------------------------------------------------------------
// STT class — supports both REST (recognize) and WebSocket (stream)
// ---------------------------------------------------------------------------

export class STT extends stt.STT {
  private opts: ResolvedSTTOptions;
  label = 'sarvam.STT';

  /**
   * Create a new instance of Sarvam AI STT.
   *
   * @remarks
   * `apiKey` must be set to your Sarvam API key, either using the argument or by setting the
   * `SARVAM_API_KEY` environment variable.
   *
   * Supported models:
   * - `saaras:v3` (default, recommended) — supports all 22 languages, modes, prompt, timestamps, and uses `/speech-to-text`.
   * - `saaras:v2.5` — Indic-to-English translation via `/speech-to-text-translate`. Auto-detects source language. Supports prompt.
   * - `saarika:v2.5` — will be deprecated soon. Supports timestamps. All its languages are available in `saaras:v3`.
   *
   * @see {@link https://docs.sarvam.ai/api-reference-docs/speech-to-text/transcribe | Sarvam STT API docs}
   * @see {@link https://docs.sarvam.ai/api-reference-docs/speech-to-text-translate/translate | Sarvam STT Translate docs}
   */
  constructor(opts: Partial<STTOptions> = {}) {
    const resolved = resolveOptions(opts);
    super({
      streaming: resolved.streaming,
      interimResults: false,
      alignedTranscript: false,
    });
    this.opts = resolved;
  }

  /**
   * Merge new options into the current configuration.
   *
   * When the model is changing, only fields valid for every model are carried
   * over (apiKey, streaming, WS flags), plus languageCode unless the target is
   * the translate model (saaras:v2.5), which auto-detects the language.
   * Model-specific fields (prompt, mode, withTimestamps) are intentionally
   * dropped on a model switch and must be re-supplied by the caller.
   */
  updateOptions(opts: Partial<STTOptions>) {
    const modelChanging = opts.model != null && opts.model !== this.opts.model;

    const base: Partial<STTOptions> = modelChanging
      ? {
          apiKey: this.opts.apiKey,
          streaming: this.opts.streaming,
          ...(this.opts.highVadSensitivity != null
            ? { highVadSensitivity: this.opts.highVadSensitivity }
            : {}),
          ...(this.opts.flushSignal != null ? { flushSignal: this.opts.flushSignal } : {}),
          ...(this.opts.languageCode != null && opts.model !== 'saaras:v2.5'
            ? { languageCode: this.opts.languageCode as STTV3Languages }
            : {}),
        }
      : ({ ...this.opts } as Partial<STTOptions>);

    this.opts = resolveOptions({ ...base, ...opts } as STTOptions);
  }

  /**
   * One-shot REST recognition of a buffered utterance.
   *
   * Merges the buffer into a single frame, WAV-encodes it, and POSTs it as
   * multipart form data to the model's REST endpoint.
   *
   * @throws Error on a non-2xx HTTP response (body text included in message).
   */
  async _recognize(buffer: AudioBuffer, abortSignal?: AbortSignal): Promise<stt.SpeechEvent> {
    const frame = mergeFrames(buffer);
    const wavBuffer = createWav(frame);
    const wavBlob = new Blob([new Uint8Array(wavBuffer)], { type: 'audio/wav' });

    const formData = buildFormData(wavBlob, this.opts);

    const response = await fetch(getRestUrl(this.opts.model), {
      method: 'POST',
      headers: {
        'api-subscription-key': this.opts.apiKey,
      },
      body: formData,
      signal: abortSignal ?? null,
    });

    if (!response.ok) {
      const errorBody = await response.text();
      throw new Error(`Sarvam STT API error ${response.status}: ${errorBody}`);
    }

    const data = (await response.json()) as SarvamSTTResponse;

    // Derive an overall [start, end] window from per-word timestamps when the
    // server returned them (only with `withTimestamps`); otherwise both stay 0.
    let startTime = 0;
    let endTime = 0;
    if (data.timestamps) {
      const starts = data.timestamps.start_time_seconds;
      const ends = data.timestamps.end_time_seconds;
      if (starts.length > 0) startTime = starts[0] ?? 0;
      if (ends.length > 0) endTime = ends[ends.length - 1] ?? 0;
    }

    return {
      type: stt.SpeechEventType.FINAL_TRANSCRIPT,
      requestId: data.request_id ?? undefined,
      alternatives: [
        {
          text: data.transcript || '',
          language: data.language_code ?? this.opts.languageCode ?? 'unknown',
          startTime,
          endTime,
          // NOTE(review): language_probability is the language-ID confidence,
          // not transcript confidence — best available proxy from the API.
          confidence: data.language_probability ?? 0,
        },
      ],
    };
  }

  /**
   * Open a native WebSocket streaming session.
   * Only valid when the instance was constructed with `streaming: true` (default).
   */
  stream(options?: { connOptions?: APIConnectOptions }): SpeechStream {
    if (!this.capabilities.streaming) {
      throw new Error(
        'Sarvam STT streaming is disabled (`streaming: false`). Use recognize() for REST or wrap with stt.StreamAdapter + VAD for streaming behavior.',
      );
    }
    return new SpeechStream(this, this.opts, options?.connOptions);
  }
}
452
+
453
// ---------------------------------------------------------------------------
// WebSocket streaming SpeechStream
// ---------------------------------------------------------------------------

export class SpeechStream extends stt.SpeechStream {
  #opts: ResolvedSTTOptions;
  #audioEnergyFilter: AudioEnergyFilter;
  #logger = log();
  // Tracks whether a START_OF_SPEECH event has been emitted without a matching END_OF_SPEECH.
  #speaking = false;
  // Resolved by updateOptions() to force the current WS session to tear down and reconnect.
  #resetWS = new Future();
  // Last request_id seen from the server (kept for diagnostics; not read within this file).
  #requestId = '';
  label = 'sarvam.SpeechStream';

  constructor(sttInstance: STT, opts: ResolvedSTTOptions, connOptions?: APIConnectOptions) {
    super(sttInstance, SAMPLE_RATE, connOptions);
    this.#opts = opts;
    this.closed = false;
    this.#audioEnergyFilter = new AudioEnergyFilter();
  }

  /**
   * Merge new options and trigger a WS reconnect so they take effect.
   * Same model-switch carry-over rules as STT.updateOptions (minus `streaming`,
   * which is fixed for the lifetime of a stream).
   */
  updateOptions(opts: Partial<STTOptions>) {
    const modelChanging = opts.model != null && opts.model !== this.#opts.model;

    const base: Partial<STTOptions> = modelChanging
      ? {
          apiKey: this.#opts.apiKey,
          ...(this.#opts.highVadSensitivity != null
            ? { highVadSensitivity: this.#opts.highVadSensitivity }
            : {}),
          ...(this.#opts.flushSignal != null ? { flushSignal: this.#opts.flushSignal } : {}),
          ...(this.#opts.languageCode != null && opts.model !== 'saaras:v2.5'
            ? { languageCode: this.#opts.languageCode as STTV3Languages }
            : {}),
        }
      : ({ ...this.#opts } as Partial<STTOptions>);

    this.#opts = resolveOptions({ ...base, ...opts } as STTOptions);
    // Wake the run loop: the current session's Promise.race resolves and #runWS
    // is re-entered with the new options (new URL/query params).
    this.#resetWS.resolve();
  }

  /**
   * Outer connection loop: connect, run a session, and reconnect on failure
   * with linear backoff (0s, 5s, then capped at 10s), up to 32 consecutive
   * failed attempts. Sessions that lived >5s reset the retry counter, so the
   * server's expected idle-timeout disconnects (~20s) never hit the cap.
   */
  protected async run() {
    const maxRetry = 32;
    let retries = 0;

    while (!this.input.closed && !this.closed) {
      const wsUrl = buildWsUrl(this.#opts);
      this.#logger.info(`Sarvam STT connecting to: ${wsUrl}`);
      const ws = new WebSocket(wsUrl, {
        headers: { 'api-subscription-key': this.#opts.apiKey },
      });

      let sessionStart = 0;
      try {
        // Wait for the socket to open; error/close before open rejects.
        await new Promise<void>((resolve, reject) => {
          ws.once('open', () => resolve());
          ws.once('error', (err: Error) => reject(err));
          ws.once('close', (code: number) =>
            reject(new Error(`WebSocket closed with code ${code}`)),
          );
        });

        sessionStart = Date.now();
        await this.#runWS(ws);
        retries = 0;
      } catch (e) {
        // Clean up the WebSocket on failure to prevent listener leaks
        ws.removeAllListeners();
        ws.close();

        if (!this.closed && !this.input.closed) {
          // If the session ran for a meaningful duration (>5s), this was a working
          // session that ended normally (e.g. server idle timeout ~20s). Reset retries
          // so expected idle-timeout reconnections don't accumulate toward the fatal limit.
          if (sessionStart > 0 && Date.now() - sessionStart > 5000) {
            retries = 0;
          }
          if (retries >= maxRetry) {
            throw new Error(`Failed to connect to Sarvam STT after ${retries} attempts: ${e}`);
          }
          const delay = Math.min(retries * 5, 10);
          retries++;
          this.#logger.warn(
            `Failed to connect to Sarvam STT, retrying in ${delay}s: ${e} (${retries}/${maxRetry})`,
          );
          await new Promise((resolve) => setTimeout(resolve, delay * 1000));
        } else {
          this.#logger.warn(
            `Sarvam STT disconnected, connection is closed: ${e} (inputClosed: ${this.input.closed}, isClosed: ${this.closed})`,
          );
        }
      }
    }

    this.closed = true;
  }

  /**
   * Run one WebSocket session on an already-open socket: pump input audio to
   * the server (sendTask), surface server messages as speech events
   * (listenTask), and watch for unexpected socket closure (wsMonitor).
   * Returns when the input ends, the stream is reset via updateOptions, or
   * any task fails — cleanup happens in the finally block.
   */
  async #runWS(ws: WebSocket) {
    this.#resetWS = new Future();
    this.#speaking = false;
    let closing = false;
    // Session-scoped controller: aborted in finally to cancel sendTask on WS reset
    const sessionController = new AbortController();

    // Config message: only supported on translate WS endpoint (saaras:v2.5)
    // @see https://docs.sarvam.ai/api-reference-docs/speech-to-text-translate/translate/ws
    if (this.#opts.model === 'saaras:v2.5' && this.#opts.prompt != null) {
      ws.send(JSON.stringify({ type: 'config', prompt: this.#opts.prompt }));
    }

    // No keepalive — Sarvam rejects messages without 'audio' field, and sending
    // silent audio could confuse server-side VAD. On idle timeout (~20s), the
    // server closes the connection and the outer retry loop in run() reconnects.
    // This matches the Python SDK's approach.

    const wsMonitor = Task.from(async (controller) => {
      const closed = new Promise<void>((_, reject) => {
        ws.once('close', (code: number, reason: Buffer) => {
          if (!closing) {
            this.#logger.error(`WebSocket closed with code ${code}: ${reason}`);
            reject(new Error('WebSocket closed'));
          }
        });
      });
      await Promise.race([closed, waitForAbort(controller.signal)]);
    });

    const sendTask = async () => {
      const samples50Ms = Math.floor(SAMPLE_RATE / 20); // 50ms chunks
      const stream = new AudioByteStream(SAMPLE_RATE, NUM_CHANNELS, samples50Ms);
      const abortPromise = waitForAbort(this.abortSignal);
      const sessionAbort = waitForAbort(sessionController.signal);

      try {
        while (!this.closed) {
          const result = await Promise.race([this.input.next(), abortPromise, sessionAbort]);
          if (result === undefined) return; // aborted
          if (result.done) break;

          const data = result.value;

          let frames: AudioFrame[];
          if (data === SpeechStream.FLUSH_SENTINEL) {
            frames = stream.flush();
          } else if (data.sampleRate !== SAMPLE_RATE || data.channels !== NUM_CHANNELS) {
            throw new Error(
              `Expected ${SAMPLE_RATE}Hz/${NUM_CHANNELS}ch, got ${data.sampleRate}Hz/${data.channels}ch`,
            );
          } else {
            frames = stream.write(
              data.data.buffer.slice(
                data.data.byteOffset,
                data.data.byteOffset + data.data.byteLength,
              ) as ArrayBuffer,
            );
          }

          for (const frame of frames) {
            // Energy gate: skip near-silent frames entirely.
            if (this.#audioEnergyFilter.pushFrame(frame)) {
              // Sarvam expects base64-encoded PCM in a JSON message
              const pcmBuffer = Buffer.from(
                frame.data.buffer,
                frame.data.byteOffset,
                frame.data.byteLength,
              );
              const base64Audio = pcmBuffer.toString('base64');
              ws.send(
                JSON.stringify({
                  audio: {
                    data: base64Audio,
                    encoding: 'audio/wav',
                    sample_rate: SAMPLE_RATE,
                  },
                }),
              );
            }
          }

          // Send flush message on FLUSH_SENTINEL (VAD end of speech)
          if (data === SpeechStream.FLUSH_SENTINEL) {
            ws.send(JSON.stringify({ type: 'flush' }));
          }
        }
      } finally {
        closing = true;
        // Match Python: end_of_stream includes an empty audio field to avoid
        // "audio must not be None" rejection from the server
        try {
          ws.send(
            JSON.stringify({
              type: 'end_of_stream',
              audio: { data: '', encoding: 'audio/wav', sample_rate: SAMPLE_RATE },
            }),
          );
        } catch {
          // ws may already be closed
        }
        wsMonitor.cancel();
      }
    };

    const listenTask = Task.from(async (controller) => {
      const putMessage = (event: stt.SpeechEvent) => {
        if (!this.queue.closed) {
          try {
            this.queue.put(event);
          } catch {
            // ignore
          }
        }
      };

      const listenMessage = new Promise<void>((resolve, reject) => {
        ws.once('close', () => resolve());
        ws.on('message', (msg: RawData) => {
          try {
            const raw = msg.toString();
            this.#logger.debug(`Sarvam STT raw WS message: ${raw.substring(0, 500)}`);
            const json = JSON.parse(raw);
            const msgType: string = json['type'] ?? '';

            if (msgType === 'events') {
              // Server-side VAD signals → START/END_OF_SPEECH events,
              // deduplicated via #speaking.
              const eventData = (json['data'] as SarvamWSEventData | undefined) ?? {};
              const signalType = eventData.signal_type;

              if (signalType === 'START_SPEECH') {
                if (!this.#speaking) {
                  this.#speaking = true;
                  putMessage({ type: stt.SpeechEventType.START_OF_SPEECH });
                }
              } else if (signalType === 'END_SPEECH') {
                if (this.#speaking) {
                  this.#speaking = false;
                  putMessage({ type: stt.SpeechEventType.END_OF_SPEECH });
                }
              }
            } else if (msgType === 'data') {
              const td = (json['data'] as SarvamWSTranscriptData | undefined) ?? {};
              const transcript = td.transcript ?? '';
              const language = td.language_code ?? this.#opts.languageCode ?? 'unknown';
              const requestId = td.request_id ?? '';
              const confidence = td.language_probability ?? 0;
              this.#requestId = requestId;

              // Log metrics when available
              if (td.metrics) {
                this.#logger.debug(
                  `Sarvam STT metrics: audio_duration=${td.metrics.audio_duration}s, latency=${td.metrics.processing_latency}s`,
                );
              }

              if (transcript) {
                // A transcript can arrive without a preceding START_SPEECH
                // signal — synthesize the start event so consumers see a
                // well-formed start/final sequence.
                if (!this.#speaking) {
                  this.#speaking = true;
                  putMessage({ type: stt.SpeechEventType.START_OF_SPEECH });
                }

                putMessage({
                  type: stt.SpeechEventType.FINAL_TRANSCRIPT,
                  requestId,
                  alternatives: [
                    {
                      text: transcript,
                      language,
                      startTime: 0,
                      endTime: td.metrics?.audio_duration ?? 0,
                      confidence,
                    },
                  ],
                });
              }
            } else if (msgType === 'error') {
              // Server format: { type: "error", data: { message: "...", code: "..." } }
              // Also check top-level and 'error' field as fallback
              const nested = json['data'] as SarvamWSErrorData | undefined;
              const errorInfo =
                nested?.message ??
                nested?.error ??
                json['error'] ??
                json['message'] ??
                'Unknown error';
              const errorCode = nested?.code ?? json['code'] ?? '';
              this.#logger.error(`Sarvam STT WebSocket error [${errorCode}]: ${errorInfo}`);
              reject(new Error(`Sarvam STT API error [${errorCode}]: ${errorInfo}`));
              return;
            }

            if (this.closed || closing) {
              resolve();
            }
          } catch (err) {
            this.#logger.error(`Error processing Sarvam STT message: ${msg}`);
            reject(err);
          }
        });
      });

      await Promise.race([listenMessage, waitForAbort(controller.signal)]);
    }, this.abortController);

    try {
      await Promise.race([
        this.#resetWS.await,
        Promise.all([sendTask(), listenTask.result, wsMonitor.result]),
      ]);
    } finally {
      closing = true;
      sessionController.abort();
      // Do NOT call listenTask.cancel() — it would abort this.abortController
      // (passed to Task.from) and permanently break the stream. Instead, ws.close()
      // triggers the ws.once('close') handler inside listenMessage, letting listenTask
      // exit naturally. On close(), the parent abort signal handles it directly.
      wsMonitor.cancel();
      ws.close();
      // Suppress unhandled rejection from orphaned listenTask on reconnect
      listenTask.result.catch(() => {});
    }
  }
}