@livekit/agents-plugin-sarvam 1.0.45
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +110 -0
- package/dist/index.cjs +52 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +4 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +29 -0
- package/dist/index.js.map +1 -0
- package/dist/models.cjs +17 -0
- package/dist/models.cjs.map +1 -0
- package/dist/models.d.cts +36 -0
- package/dist/models.d.ts +36 -0
- package/dist/models.d.ts.map +1 -0
- package/dist/models.js +1 -0
- package/dist/models.js.map +1 -0
- package/dist/stt.cjs +499 -0
- package/dist/stt.cjs.map +1 -0
- package/dist/stt.d.cts +104 -0
- package/dist/stt.d.ts +104 -0
- package/dist/stt.d.ts.map +1 -0
- package/dist/stt.js +483 -0
- package/dist/stt.js.map +1 -0
- package/dist/stt.test.cjs +18 -0
- package/dist/stt.test.cjs.map +1 -0
- package/dist/stt.test.d.cts +2 -0
- package/dist/stt.test.d.ts +2 -0
- package/dist/stt.test.d.ts.map +1 -0
- package/dist/stt.test.js +17 -0
- package/dist/stt.test.js.map +1 -0
- package/dist/tts.cjs +405 -0
- package/dist/tts.cjs.map +1 -0
- package/dist/tts.d.cts +111 -0
- package/dist/tts.d.ts +111 -0
- package/dist/tts.d.ts.map +1 -0
- package/dist/tts.js +385 -0
- package/dist/tts.js.map +1 -0
- package/dist/tts.test.cjs +17 -0
- package/dist/tts.test.cjs.map +1 -0
- package/dist/tts.test.d.cts +2 -0
- package/dist/tts.test.d.ts +2 -0
- package/dist/tts.test.d.ts.map +1 -0
- package/dist/tts.test.js +16 -0
- package/dist/tts.test.js.map +1 -0
- package/package.json +54 -0
- package/src/index.ts +34 -0
- package/src/models.ts +135 -0
- package/src/stt.test.ts +23 -0
- package/src/stt.ts +770 -0
- package/src/tts.test.ts +22 -0
- package/src/tts.ts +571 -0
package/src/tts.test.ts
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
|
|
2
|
+
//
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import { STT } from '@livekit/agents-plugin-openai';
|
|
5
|
+
import { tts } from '@livekit/agents-plugins-test';
|
|
6
|
+
import { describe, expect, it } from 'vitest';
|
|
7
|
+
import { TTS } from './tts.js';
|
|
8
|
+
|
|
9
|
+
// True only when the environment provides a real Sarvam key; gates the
// live-API integration test below so CI without credentials still passes.
const hasSarvamApiKey = Boolean(process.env.SARVAM_API_KEY);

describe('Sarvam TTS', () => {
  // Full round-trip against the live Sarvam API (synthesize, then verify via
  // OpenAI STT from the shared test harness). Skipped without $SARVAM_API_KEY.
  it.skipIf(!hasSarvamApiKey)('runs integration suite with real API key', async () => {
    await tts(new TTS({ apiKey: process.env.SARVAM_API_KEY }), new STT(), { streaming: false });
  });

  // Offline test: `streaming: false` must surface in the capabilities flag,
  // and stream() must refuse to run while streaming is disabled.
  it('supports opting into non-streaming mode', () => {
    const nonStreamingTts = new TTS({ apiKey: 'dummy-api-key', streaming: false });

    expect(nonStreamingTts.capabilities.streaming).toBe(false);
    expect(() => nonStreamingTts.stream()).toThrow(/streaming is disabled/i);
  });
});
|
package/src/tts.ts
ADDED
|
@@ -0,0 +1,571 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
|
|
2
|
+
//
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import {
|
|
5
|
+
type APIConnectOptions,
|
|
6
|
+
AudioByteStream,
|
|
7
|
+
log,
|
|
8
|
+
shortuuid,
|
|
9
|
+
tokenize,
|
|
10
|
+
tts,
|
|
11
|
+
} from '@livekit/agents';
|
|
12
|
+
import type { AudioFrame } from '@livekit/rtc-node';
|
|
13
|
+
import { type RawData, WebSocket } from 'ws';
|
|
14
|
+
import type {
|
|
15
|
+
TTSLanguages,
|
|
16
|
+
TTSModels,
|
|
17
|
+
TTSSampleRates,
|
|
18
|
+
TTSSpeakers,
|
|
19
|
+
TTSV2Speakers,
|
|
20
|
+
TTSV3Speakers,
|
|
21
|
+
} from './models.js';
|
|
22
|
+
|
|
23
|
+
// Default PCM sample rate (Hz) requested from Sarvam when the caller does not override it.
const SARVAM_TTS_SAMPLE_RATE = 24000;
// Sarvam returns mono audio.
const SARVAM_TTS_CHANNELS = 1;
// REST API root; overridable per-instance via `baseURL`.
const SARVAM_BASE_URL = 'https://api.sarvam.ai';
// WebSocket path appended to the ws/wss-converted base URL for streaming synthesis.
const SARVAM_WS_URL_PATH = '/text-to-speech/ws';
// Minimum sentence length handed to the default sentence tokenizer so tiny
// fragments are not sent to the API as standalone sentences.
const MIN_SENTENCE_LENGTH = 8;
|
|
28
|
+
|
|
29
|
+
// ---------------------------------------------------------------------------
|
|
30
|
+
// Model-specific option types
|
|
31
|
+
// V2 supports pitch / loudness / enablePreprocessing
|
|
32
|
+
// V3 supports temperature (pitch, loudness, enablePreprocessing are NOT supported)
|
|
33
|
+
// ---------------------------------------------------------------------------
|
|
34
|
+
|
|
35
|
+
/** Options shared by every bulbul model variant. */
interface TTSBaseOptions {
  /** Sarvam API key. Defaults to $SARVAM_API_KEY */
  apiKey?: string;
  /**
   * Whether to use native WebSocket streaming for `stream()`.
   * Set to `false` to prefer non-streaming REST synthesis (used by Agent via TTS StreamAdapter).
   * Default: `true`.
   */
  streaming?: boolean;
  /** Target language code (BCP-47) */
  targetLanguageCode?: TTSLanguages | string;
  /** Speech pace. v2: 0.3–3.0, v3: 0.5–2.0 (default 1.0) */
  pace?: number;
  /** Output sample rate in Hz (default 24000) */
  sampleRate?: TTSSampleRates | number;
  /** Base URL for the Sarvam API */
  baseURL?: string;
  /** Sentence tokenizer for streaming (default: basic sentence tokenizer) */
  sentenceTokenizer?: tokenize.SentenceTokenizer;
}

/** Options specific to bulbul:v2 */
export interface TTSV2Options extends TTSBaseOptions {
  // Optional discriminant: v2 is also the default when `model` is omitted.
  model?: 'bulbul:v2';
  /** Speaker voice (v2 voices). Default: 'anushka' */
  speaker?: TTSV2Speakers | string;
  /** Pitch adjustment, -0.75 to 0.75 (v2 only) */
  pitch?: number;
  /** Loudness, 0.3 to 3.0 (v2 only) */
  loudness?: number;
  /** Enable text preprocessing (v2 only) */
  enablePreprocessing?: boolean;
}

/** Options specific to bulbul:v3 */
export interface TTSV3Options extends TTSBaseOptions {
  // Required discriminant: v3 must be selected explicitly.
  model: 'bulbul:v3';
  /** Speaker voice (v3 voices). Default: 'shubh' */
  speaker?: TTSV3Speakers | string;
  /** Temperature for voice variation, 0.01 to 2.0 (v3 only, default 0.6) */
  temperature?: number;
}

/** Combined options — discriminated by `model` field */
export type TTSOptions = TTSV2Options | TTSV3Options;
|
|
80
|
+
|
|
81
|
+
// ---------------------------------------------------------------------------
|
|
82
|
+
// Resolved (internal) options — flat union of all fields
|
|
83
|
+
// ---------------------------------------------------------------------------
|
|
84
|
+
|
|
85
|
+
/**
 * Fully-populated internal options produced by `resolveOptions`.
 * Flat union of all model fields; the model-specific optionals are only set
 * for the model they belong to.
 */
interface ResolvedTTSOptions {
  apiKey: string;
  streaming: boolean;
  model: TTSModels;
  speaker: TTSSpeakers | string;
  targetLanguageCode: string;
  pace: number;
  sampleRate: number;
  baseURL: string;
  sentenceTokenizer: tokenize.SentenceTokenizer;
  // V2 only
  pitch?: number;
  loudness?: number;
  enablePreprocessing?: boolean;
  // V3 only
  temperature?: number;
}
|
|
102
|
+
|
|
103
|
+
// ---------------------------------------------------------------------------
|
|
104
|
+
// Defaults per model
|
|
105
|
+
// ---------------------------------------------------------------------------
|
|
106
|
+
|
|
107
|
+
// Defaults applied when the caller omits a v2-tunable field.
const V2_DEFAULTS = {
  speaker: 'anushka' as const,
  pitch: 0,
  pace: 1.0,
  loudness: 1.0,
  enablePreprocessing: false,
};

// Defaults applied when the caller omits a v3-tunable field.
const V3_DEFAULTS = {
  speaker: 'shubh' as const,
  pace: 1.0,
  temperature: 0.6,
};
|
|
120
|
+
|
|
121
|
+
// ---------------------------------------------------------------------------
|
|
122
|
+
// Resolve caller options into a fully-populated internal struct
|
|
123
|
+
// ---------------------------------------------------------------------------
|
|
124
|
+
|
|
125
|
+
function resolveOptions(opts: Partial<TTSOptions>): ResolvedTTSOptions {
|
|
126
|
+
const apiKey = opts.apiKey ?? process.env.SARVAM_API_KEY;
|
|
127
|
+
if (!apiKey) {
|
|
128
|
+
throw new Error('Sarvam API key is required, whether as an argument or as $SARVAM_API_KEY');
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
const model: TTSModels = opts.model ?? 'bulbul:v2';
|
|
132
|
+
const isV3 = model === 'bulbul:v3';
|
|
133
|
+
|
|
134
|
+
const base: ResolvedTTSOptions = {
|
|
135
|
+
apiKey,
|
|
136
|
+
streaming: opts.streaming ?? true,
|
|
137
|
+
model,
|
|
138
|
+
speaker: opts.speaker ?? (isV3 ? V3_DEFAULTS.speaker : V2_DEFAULTS.speaker),
|
|
139
|
+
targetLanguageCode: opts.targetLanguageCode ?? 'en-IN',
|
|
140
|
+
pace: opts.pace ?? (isV3 ? V3_DEFAULTS.pace : V2_DEFAULTS.pace),
|
|
141
|
+
sampleRate: opts.sampleRate ?? SARVAM_TTS_SAMPLE_RATE,
|
|
142
|
+
baseURL: opts.baseURL ?? SARVAM_BASE_URL,
|
|
143
|
+
sentenceTokenizer:
|
|
144
|
+
opts.sentenceTokenizer ??
|
|
145
|
+
new tokenize.basic.SentenceTokenizer({ minSentenceLength: MIN_SENTENCE_LENGTH }),
|
|
146
|
+
};
|
|
147
|
+
|
|
148
|
+
if (isV3) {
|
|
149
|
+
base.temperature = (opts as TTSV3Options).temperature ?? V3_DEFAULTS.temperature;
|
|
150
|
+
} else {
|
|
151
|
+
const v2 = opts as TTSV2Options;
|
|
152
|
+
base.pitch = v2.pitch ?? V2_DEFAULTS.pitch;
|
|
153
|
+
base.loudness = v2.loudness ?? V2_DEFAULTS.loudness;
|
|
154
|
+
base.enablePreprocessing = v2.enablePreprocessing ?? V2_DEFAULTS.enablePreprocessing;
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
return base;
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
// ---------------------------------------------------------------------------
|
|
161
|
+
// Build the API request body — only sends model-relevant fields
|
|
162
|
+
// ---------------------------------------------------------------------------
|
|
163
|
+
|
|
164
|
+
function buildRequestBody(text: string, opts: ResolvedTTSOptions): Record<string, unknown> {
|
|
165
|
+
const body: Record<string, unknown> = {
|
|
166
|
+
text,
|
|
167
|
+
target_language_code: opts.targetLanguageCode,
|
|
168
|
+
speaker: opts.speaker,
|
|
169
|
+
model: opts.model,
|
|
170
|
+
pace: opts.pace,
|
|
171
|
+
speech_sample_rate: String(opts.sampleRate),
|
|
172
|
+
// Always request WAV — AudioByteStream requires raw PCM, which we get by
|
|
173
|
+
// stripping the 44-byte WAV header. Other codecs produce compressed audio
|
|
174
|
+
// that cannot be fed into AudioByteStream.
|
|
175
|
+
output_audio_codec: 'wav',
|
|
176
|
+
};
|
|
177
|
+
|
|
178
|
+
if (opts.model === 'bulbul:v3') {
|
|
179
|
+
if (opts.temperature != null) body.temperature = opts.temperature;
|
|
180
|
+
} else {
|
|
181
|
+
if (opts.pitch != null) body.pitch = opts.pitch;
|
|
182
|
+
if (opts.loudness != null) body.loudness = opts.loudness;
|
|
183
|
+
if (opts.enablePreprocessing != null) body.enable_preprocessing = opts.enablePreprocessing;
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
return body;
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
// ---------------------------------------------------------------------------
|
|
190
|
+
// Build WS config message (sent as first message after connection)
|
|
191
|
+
// ---------------------------------------------------------------------------
|
|
192
|
+
|
|
193
|
+
function buildWsConfigMessage(opts: ResolvedTTSOptions): string {
|
|
194
|
+
const data: Record<string, unknown> = {
|
|
195
|
+
target_language_code: opts.targetLanguageCode,
|
|
196
|
+
speaker: opts.speaker,
|
|
197
|
+
model: opts.model,
|
|
198
|
+
pace: opts.pace,
|
|
199
|
+
speech_sample_rate: String(opts.sampleRate),
|
|
200
|
+
output_audio_codec: 'linear16',
|
|
201
|
+
};
|
|
202
|
+
|
|
203
|
+
if (opts.model === 'bulbul:v3') {
|
|
204
|
+
if (opts.temperature != null) data.temperature = opts.temperature;
|
|
205
|
+
} else {
|
|
206
|
+
if (opts.pitch != null) data.pitch = opts.pitch;
|
|
207
|
+
if (opts.loudness != null) data.loudness = opts.loudness;
|
|
208
|
+
if (opts.enablePreprocessing != null) data.enable_preprocessing = opts.enablePreprocessing;
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
return JSON.stringify({ type: 'config', data });
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
// ---------------------------------------------------------------------------
|
|
215
|
+
// TTS class
|
|
216
|
+
// ---------------------------------------------------------------------------
|
|
217
|
+
|
|
218
|
+
export class TTS extends tts.TTS {
|
|
219
|
+
#opts: ResolvedTTSOptions;
|
|
220
|
+
label = 'sarvam.TTS';
|
|
221
|
+
|
|
222
|
+
/**
|
|
223
|
+
* Create a new instance of Sarvam AI TTS.
|
|
224
|
+
*
|
|
225
|
+
* @remarks
|
|
226
|
+
* `apiKey` must be set to your Sarvam API key, either using the argument or by setting the
|
|
227
|
+
* `SARVAM_API_KEY` environment variable.
|
|
228
|
+
*/
|
|
229
|
+
constructor(opts: Partial<TTSOptions> = {}) {
|
|
230
|
+
const resolved = resolveOptions(opts);
|
|
231
|
+
super(resolved.sampleRate, SARVAM_TTS_CHANNELS, { streaming: resolved.streaming });
|
|
232
|
+
this.#opts = resolved;
|
|
233
|
+
}
|
|
234
|
+
|
|
235
|
+
/**
|
|
236
|
+
* Update TTS options after initialization.
|
|
237
|
+
*
|
|
238
|
+
* @remarks
|
|
239
|
+
* When the model changes, only truly shared fields (apiKey,
|
|
240
|
+
* targetLanguageCode, pace, sampleRate, baseURL) carry over.
|
|
241
|
+
* Model-specific fields (speaker, pitch, loudness, temperature,
|
|
242
|
+
* enablePreprocessing) are dropped so resolveOptions re-applies
|
|
243
|
+
* the correct defaults for the new model.
|
|
244
|
+
*/
|
|
245
|
+
updateOptions(opts: Partial<TTSOptions>) {
|
|
246
|
+
const modelChanging = opts.model != null && opts.model !== this.#opts.model;
|
|
247
|
+
|
|
248
|
+
const base: Partial<TTSOptions> = modelChanging
|
|
249
|
+
? {
|
|
250
|
+
apiKey: this.#opts.apiKey,
|
|
251
|
+
streaming: this.#opts.streaming,
|
|
252
|
+
targetLanguageCode: this.#opts.targetLanguageCode as TTSLanguages,
|
|
253
|
+
pace: this.#opts.pace,
|
|
254
|
+
sampleRate: this.#opts.sampleRate as TTSSampleRates,
|
|
255
|
+
baseURL: this.#opts.baseURL,
|
|
256
|
+
sentenceTokenizer: this.#opts.sentenceTokenizer,
|
|
257
|
+
}
|
|
258
|
+
: ({ ...this.#opts } as Partial<TTSOptions>);
|
|
259
|
+
|
|
260
|
+
this.#opts = resolveOptions({ ...base, ...opts } as TTSOptions);
|
|
261
|
+
}
|
|
262
|
+
|
|
263
|
+
/**
|
|
264
|
+
* Synthesize text to audio using Sarvam AI TTS.
|
|
265
|
+
*
|
|
266
|
+
* @param text - Text to synthesize (max 2500 chars for v3, 1500 for v2)
|
|
267
|
+
* @param connOptions - API connection options
|
|
268
|
+
* @param abortSignal - Abort signal for cancellation
|
|
269
|
+
* @returns A chunked stream of synthesized audio
|
|
270
|
+
*/
|
|
271
|
+
synthesize(
|
|
272
|
+
text: string,
|
|
273
|
+
connOptions?: APIConnectOptions,
|
|
274
|
+
abortSignal?: AbortSignal,
|
|
275
|
+
): ChunkedStream {
|
|
276
|
+
return new ChunkedStream(this, text, this.#opts, connOptions, abortSignal);
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
stream(): tts.SynthesizeStream {
|
|
280
|
+
if (!this.capabilities.streaming) {
|
|
281
|
+
throw new Error(
|
|
282
|
+
'Sarvam TTS streaming is disabled (`streaming: false`). Use synthesize() for REST or wrap with tts.StreamAdapter for streaming behavior.',
|
|
283
|
+
);
|
|
284
|
+
}
|
|
285
|
+
return new SynthesizeStream(this, this.#opts);
|
|
286
|
+
}
|
|
287
|
+
}
|
|
288
|
+
|
|
289
|
+
// ---------------------------------------------------------------------------
|
|
290
|
+
// Chunked stream (non-streaming synthesis)
|
|
291
|
+
// ---------------------------------------------------------------------------
|
|
292
|
+
|
|
293
|
+
/** Chunked stream for Sarvam AI TTS that processes a single synthesis request. */
|
|
294
|
+
export class ChunkedStream extends tts.ChunkedStream {
|
|
295
|
+
label = 'sarvam.ChunkedStream';
|
|
296
|
+
private opts: ResolvedTTSOptions;
|
|
297
|
+
|
|
298
|
+
/** @internal */
|
|
299
|
+
constructor(
|
|
300
|
+
tts: TTS,
|
|
301
|
+
text: string,
|
|
302
|
+
opts: ResolvedTTSOptions,
|
|
303
|
+
connOptions?: APIConnectOptions,
|
|
304
|
+
abortSignal?: AbortSignal,
|
|
305
|
+
) {
|
|
306
|
+
super(text, tts, connOptions, abortSignal);
|
|
307
|
+
this.opts = opts;
|
|
308
|
+
}
|
|
309
|
+
|
|
310
|
+
protected async run() {
|
|
311
|
+
const requestId = shortuuid();
|
|
312
|
+
|
|
313
|
+
const response = await fetch(`${this.opts.baseURL}/text-to-speech`, {
|
|
314
|
+
method: 'POST',
|
|
315
|
+
headers: {
|
|
316
|
+
'Content-Type': 'application/json',
|
|
317
|
+
'api-subscription-key': this.opts.apiKey,
|
|
318
|
+
},
|
|
319
|
+
body: JSON.stringify(buildRequestBody(this.inputText, this.opts)),
|
|
320
|
+
signal: this.abortSignal,
|
|
321
|
+
});
|
|
322
|
+
|
|
323
|
+
if (!response.ok) {
|
|
324
|
+
const errorBody = await response.text();
|
|
325
|
+
throw new Error(`Sarvam TTS API error ${response.status}: ${errorBody}`);
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
const data = (await response.json()) as { audios: string[] };
|
|
329
|
+
const audioBase64 = data.audios[0];
|
|
330
|
+
if (!audioBase64) {
|
|
331
|
+
throw new Error('Sarvam TTS returned empty audio');
|
|
332
|
+
}
|
|
333
|
+
|
|
334
|
+
// Decode base64 WAV and strip 44-byte header to get raw PCM
|
|
335
|
+
const raw = Buffer.from(audioBase64, 'base64');
|
|
336
|
+
const pcmData = raw.buffer.slice(raw.byteOffset + 44, raw.byteOffset + raw.byteLength);
|
|
337
|
+
|
|
338
|
+
const audioByteStream = new AudioByteStream(this.opts.sampleRate, SARVAM_TTS_CHANNELS);
|
|
339
|
+
const frames = [...audioByteStream.write(pcmData), ...audioByteStream.flush()];
|
|
340
|
+
|
|
341
|
+
let lastFrame: AudioFrame | undefined;
|
|
342
|
+
const sendLastFrame = (segmentId: string, final: boolean) => {
|
|
343
|
+
if (lastFrame) {
|
|
344
|
+
this.queue.put({ requestId, segmentId, frame: lastFrame, final });
|
|
345
|
+
lastFrame = undefined;
|
|
346
|
+
}
|
|
347
|
+
};
|
|
348
|
+
|
|
349
|
+
for (const frame of frames) {
|
|
350
|
+
sendLastFrame(requestId, false);
|
|
351
|
+
lastFrame = frame;
|
|
352
|
+
}
|
|
353
|
+
sendLastFrame(requestId, true);
|
|
354
|
+
|
|
355
|
+
this.queue.close();
|
|
356
|
+
}
|
|
357
|
+
}
|
|
358
|
+
|
|
359
|
+
// ---------------------------------------------------------------------------
|
|
360
|
+
// WebSocket streaming synthesis
|
|
361
|
+
// ---------------------------------------------------------------------------
|
|
362
|
+
|
|
363
|
+
/**
 * WebSocket streaming synthesis: sentences from the tokenizer are sent as
 * `text` messages; `audio` messages carrying base64 linear16 PCM are decoded
 * and emitted as frames. Completion is signaled either by a `final` event
 * (requested via send_completion_event) or by the socket closing.
 */
export class SynthesizeStream extends tts.SynthesizeStream {
  private opts: ResolvedTTSOptions;
  private tokenizer: tokenize.SentenceStream;
  #logger = log();
  label = 'sarvam.SynthesizeStream';

  constructor(tts: TTS, opts: ResolvedTTSOptions) {
    super(tts);
    this.opts = opts;
    // Per-stream tokenizer instance derived from the shared sentence tokenizer.
    this.tokenizer = opts.sentenceTokenizer.stream();
  }

  /**
   * Best-effort graceful shutdown: send one last flush, wait briefly for any
   * reply, then close the socket. All errors are swallowed/logged — this runs
   * in a finally block and must never mask the primary failure.
   */
  private async closeWebSocket(ws: WebSocket): Promise<void> {
    try {
      if (ws.readyState === WebSocket.OPEN) {
        ws.send(JSON.stringify({ type: 'flush' }));

        try {
          // Wait at most 1s for the server to react; any of message/close/
          // error (or the timeout) releases the wait.
          await new Promise<void>((resolve) => {
            const timeout = setTimeout(() => resolve(), 1000);

            ws.once('message', () => {
              clearTimeout(timeout);
              resolve();
            });
            ws.once('close', () => {
              clearTimeout(timeout);
              resolve();
            });
            ws.once('error', () => {
              clearTimeout(timeout);
              resolve();
            });
          });
        } catch {
          // Ignore timeout or other errors during close sequence
        }
      }
    } catch (e) {
      this.#logger.warn(`Error during WebSocket close sequence: ${e}`);
    } finally {
      if (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING) {
        ws.close();
      }
    }
  }

  protected async run() {
    const requestId = shortuuid();
    const segmentId = shortuuid();

    // Build WS URL: wss://api.sarvam.ai/text-to-speech/ws?model=...&send_completion_event=true
    const wsBaseUrl = this.opts.baseURL.replace(/^http/, 'ws');
    const url = new URL(`${wsBaseUrl}${SARVAM_WS_URL_PATH}`);
    url.searchParams.set('model', this.opts.model);
    // Ask the server to emit a `final` event so recvTask can resolve promptly.
    url.searchParams.set('send_completion_event', 'true');

    const ws = new WebSocket(url, {
      headers: {
        'api-subscription-key': this.opts.apiKey,
      },
    });

    // Await the connection; the one-shot handlers remove each other so a late
    // close/error after open cannot reject this already-settled promise.
    await new Promise<void>((resolve, reject) => {
      const onOpen = () => {
        cleanup();
        resolve();
      };
      const onError = (error: Error) => {
        cleanup();
        reject(new Error(`Sarvam TTS WS connection error: ${error.message}`));
      };
      const onClose = (code: number) => {
        cleanup();
        reject(new Error(`Sarvam TTS WS closed during connect: ${code}`));
      };
      const cleanup = () => {
        ws.removeListener('open', onOpen);
        ws.removeListener('error', onError);
        ws.removeListener('close', onClose);
      };
      ws.on('open', onOpen);
      ws.on('error', onError);
      ws.on('close', onClose);
    });

    // Send config message immediately after connection
    ws.send(buildWsConfigMessage(this.opts));

    // Pump caller text into the sentence tokenizer; FLUSH_SENTINEL forces the
    // tokenizer to release any partial sentence.
    const inputTask = async () => {
      for await (const data of this.input) {
        if (data === SynthesizeStream.FLUSH_SENTINEL) {
          this.tokenizer.flush();
          continue;
        }
        this.tokenizer.pushText(data);
      }
      this.tokenizer.endInput();
      this.tokenizer.close();
    };

    // Forward each tokenized sentence over the socket, then flush so the
    // server synthesizes any buffered remainder.
    const sendTask = async () => {
      for await (const event of this.tokenizer) {
        if (this.abortController.signal.aborted) break;

        const text = event.token;
        ws.send(JSON.stringify({ type: 'text', data: { text } }));
      }

      if (!this.abortController.signal.aborted) {
        ws.send(JSON.stringify({ type: 'flush' }));
      }
    };

    // Receive audio/event/error messages until `final` or socket close.
    const recvTask = async () => {
      const bstream = new AudioByteStream(this.opts.sampleRate, SARVAM_TTS_CHANNELS);
      let finalReceived = false;
      // One frame is held back so the last emitted frame can carry final=true.
      let lastFrame: AudioFrame | undefined;

      const sendLastFrame = (final: boolean) => {
        if (lastFrame && !this.queue.closed) {
          this.queue.put({ requestId, segmentId, frame: lastFrame, final });
          lastFrame = undefined;
        }
      };

      return new Promise<void>((resolve, reject) => {
        ws.on('message', (data: RawData) => {
          let msg: { type: string; data?: Record<string, unknown> };
          try {
            msg = JSON.parse(data.toString());
          } catch {
            this.#logger.warn('Sarvam WS: received non-JSON message');
            return;
          }

          switch (msg.type) {
            case 'audio': {
              // Base64 linear16 PCM — no container header to strip here.
              const audioB64 = (msg.data?.audio as string) ?? '';
              if (!audioB64) break;

              const raw = Buffer.from(audioB64, 'base64');
              const pcm = raw.buffer.slice(raw.byteOffset, raw.byteOffset + raw.byteLength);

              for (const frame of bstream.write(pcm as ArrayBuffer)) {
                sendLastFrame(false);
                lastFrame = frame;
              }
              break;
            }

            case 'event': {
              const eventType = msg.data?.event_type as string | undefined;
              if (eventType === 'final') {
                // Drain the byte stream, mark the last frame final, and end
                // the segment before resolving.
                finalReceived = true;
                for (const frame of bstream.flush()) {
                  sendLastFrame(false);
                  lastFrame = frame;
                }
                sendLastFrame(true);

                if (!this.queue.closed) {
                  this.queue.put(SynthesizeStream.END_OF_STREAM);
                }
                resolve();
              }
              break;
            }

            case 'error': {
              const errMsg = (msg.data?.message as string) ?? 'Unknown Sarvam WS error';
              const errCode = msg.data?.code as number | undefined;
              reject(new Error(`Sarvam WS error ${errCode ?? ''}: ${errMsg}`));
              break;
            }
          }
        });

        // Fallback: if the server closes without sending `final`, emit
        // whatever audio is buffered and end the stream cleanly.
        ws.on('close', () => {
          if (!finalReceived) {
            for (const frame of bstream.flush()) {
              sendLastFrame(false);
              lastFrame = frame;
            }
            sendLastFrame(true);

            if (!this.queue.closed) {
              this.queue.put(SynthesizeStream.END_OF_STREAM);
            }
          }
          resolve();
        });

        ws.on('error', (error) => {
          reject(error);
        });
      });
    };

    try {
      await Promise.all([inputTask(), sendTask(), recvTask()]);
    } catch (e) {
      const msg = e instanceof Error ? e.message : String(e);
      throw new Error(`Sarvam TTS streaming failed: ${msg}`);
    } finally {
      // Always attempt a graceful close, even on error/abort.
      await this.closeWebSocket(ws);
    }
  }
}
|