@ai-sdk/deepgram 2.0.7 → 2.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,498 @@
1
+ import { SpeechModelV3, SharedV3Warning } from '@ai-sdk/provider';
2
+ import {
3
+ combineHeaders,
4
+ createBinaryResponseHandler,
5
+ parseProviderOptions,
6
+ postJsonToApi,
7
+ } from '@ai-sdk/provider-utils';
8
+ import { z } from 'zod/v4';
9
+ import { DeepgramConfig } from './deepgram-config';
10
+ import { deepgramFailedResponseHandler } from './deepgram-error';
11
+ import { DeepgramSpeechModelId } from './deepgram-speech-options';
12
+
13
+ // https://developers.deepgram.com/reference/text-to-speech/speak-request
14
/**
 * Validation schema for the `deepgram` entry of `providerOptions` on speech
 * calls. Every field is optional and may also be `null`.
 */
const deepgramSpeechProviderOptionsSchema = z.object({
  /** Bit rate of the generated audio in bits per second (number or string). */
  bitRate: z.number().or(z.string()).nullish(),

  /** Container (file format wrapper) for the generated audio, e.g. wav or ogg. */
  container: z.string().nullish(),

  /** Audio encoding of the generated audio, e.g. linear16, mulaw, alaw, mp3. */
  encoding: z.string().nullish(),

  /** Sample rate of the generated audio in Hz; valid values depend on the encoding. */
  sampleRate: z.number().nullish(),

  /** URL that should receive the callback request. */
  callback: z.string().url().nullish(),

  /** HTTP method used for the callback request. */
  callbackMethod: z.enum(['POST', 'PUT']).nullish(),

  /** Opts the request out of the Deepgram Model Improvement Program. */
  mipOptOut: z.boolean().nullish(),

  /** One label, or a list of labels, used to identify requests in usage reporting. */
  tag: z.string().or(z.array(z.string())).nullish(),
});
32
+
33
/**
 * Deepgram-specific options accepted by speech calls, derived from
 * `deepgramSpeechProviderOptionsSchema` so the type and the runtime
 * validation cannot drift apart.
 */
export type DeepgramSpeechCallOptions = z.infer<typeof deepgramSpeechProviderOptionsSchema>;
36
+
37
/**
 * Configuration consumed by `DeepgramSpeechModel`: the shared Deepgram
 * provider configuration plus optional internal hooks.
 */
interface DeepgramSpeechModelConfig extends DeepgramConfig {
  /** Internal overrides; not part of the provider configuration itself. */
  _internal?: {
    /** Supplies the timestamp reported on responses instead of `new Date()`. */
    currentDate?: () => Date;
  };
}
42
+
43
/** Every encoding name this model knows how to map onto query parameters. */
const DEEPGRAM_SPEECH_ENCODINGS: readonly string[] = [
  'linear16',
  'mulaw',
  'alaw',
  'mp3',
  'opus',
  'flac',
  'aac',
];

/** Encodings whose sample rate is fixed; `sample_rate` must not be sent for them. */
const DEEPGRAM_FIXED_RATE_ENCODINGS: readonly string[] = ['mp3', 'opus', 'aac'];

/** Encodings for which `bit_rate` must not be sent. */
const DEEPGRAM_NO_BIT_RATE_ENCODINGS: readonly string[] = [
  'linear16',
  'mulaw',
  'alaw',
  'flac',
];

/** Encodings that are delivered in a `wav` container or raw (`none`). */
const DEEPGRAM_WAV_FAMILY_ENCODINGS: readonly string[] = [
  'linear16',
  'mulaw',
  'alaw',
];

/** Containers that are valid for the wav-family encodings. */
const DEEPGRAM_WAV_FAMILY_CONTAINERS: readonly string[] = ['wav', 'none'];

/** Encodings for which `container` must not be sent. */
const DEEPGRAM_CONTAINERLESS_ENCODINGS: readonly string[] = [
  'mp3',
  'flac',
  'aac',
];

/**
 * Sample rates (Hz) accepted per encoding with a configurable rate.
 * A `Map` is used instead of an object literal so that user-supplied strings
 * such as "constructor" can never resolve to `Object.prototype` members.
 */
const DEEPGRAM_CONFIGURABLE_SAMPLE_RATES: ReadonlyMap<
  string,
  readonly number[]
> = new Map<string, readonly number[]>([
  ['linear16', [8000, 16000, 24000, 32000, 48000]],
  ['mulaw', [8000, 16000]],
  ['alaw', [8000, 16000]],
  ['flac', [8000, 16000, 22050, 32000, 48000]],
]);

/**
 * Plain `outputFormat` names and the encoding/container pair they stand for.
 * https://developers.deepgram.com/docs/tts-media-output-settings#audio-format-combinations
 */
const DEEPGRAM_OUTPUT_FORMAT_PRESETS: ReadonlyMap<
  string,
  { encoding: string; container?: string }
> = new Map<string, { encoding: string; container?: string }>([
  ['mp3', { encoding: 'mp3' }],
  ['wav', { encoding: 'linear16', container: 'wav' }],
  ['linear16', { encoding: 'linear16', container: 'wav' }],
  ['mulaw', { encoding: 'mulaw', container: 'wav' }],
  ['alaw', { encoding: 'alaw', container: 'wav' }],
  ['opus', { encoding: 'opus', container: 'ogg' }],
  ['ogg', { encoding: 'opus', container: 'ogg' }],
  ['flac', { encoding: 'flac' }],
  ['aac', { encoding: 'aac' }],
  // Raw audio without a container.
  ['pcm', { encoding: 'linear16', container: 'none' }],
]);

/**
 * Tells whether `sample_rate` may be sent for the given encoding.
 * Encodings this module does not know are passed through unchecked.
 */
function isDeepgramSampleRateAllowed(
  encoding: string,
  sampleRate: number,
): boolean {
  if (DEEPGRAM_FIXED_RATE_ENCODINGS.includes(encoding)) {
    return false;
  }
  const allowedRates = DEEPGRAM_CONFIGURABLE_SAMPLE_RATES.get(encoding);
  return allowedRates === undefined || allowedRates.includes(sampleRate);
}

/** Removes previously set `sample_rate`/`bit_rate` values the encoding cannot take. */
function dropParamsInvalidForEncoding(
  queryParams: URLSearchParams,
  encoding: string,
): void {
  const currentRate = queryParams.get('sample_rate');
  if (
    currentRate !== null &&
    !isDeepgramSampleRateAllowed(encoding, Number(currentRate))
  ) {
    queryParams.delete('sample_rate');
  }
  if (DEEPGRAM_NO_BIT_RATE_ENCODINGS.includes(encoding)) {
    queryParams.delete('bit_rate');
  }
}

/** Makes sure a wav-family encoding ends up with a `wav` or `none` container. */
function ensureWavFamilyContainer(queryParams: URLSearchParams): void {
  const current = queryParams.get('container');
  if (current === null || !DEEPGRAM_WAV_FAMILY_CONTAINERS.includes(current)) {
    queryParams.set('container', 'wav');
  }
}

/**
 * Translates the generic `outputFormat` into `encoding`, `container` and
 * `sample_rate`. Accepts preset names ("mp3", "wav", "pcm", ...) and compound
 * names such as "linear16_24000" or "wav_16000".
 */
function applyDeepgramOutputFormat(
  outputFormat: string,
  queryParams: URLSearchParams,
  warnings: SharedV3Warning[],
): void {
  const format = outputFormat.toLowerCase();

  const preset = DEEPGRAM_OUTPUT_FORMAT_PRESETS.get(format);
  if (preset !== undefined) {
    queryParams.set('encoding', preset.encoding);
    if (preset.container !== undefined) {
      queryParams.set('container', preset.container);
    }
    return;
  }

  const parts = format.split('_');
  const head = parts[0] ?? '';
  const rateSuffix = parts[1];

  let encoding: string | undefined;
  let container: string | undefined;
  if (DEEPGRAM_SPEECH_ENCODINGS.includes(head)) {
    encoding = head;
    if (DEEPGRAM_WAV_FAMILY_ENCODINGS.includes(head)) {
      container = 'wav'; // wav or none are possible; wav is the default
    } else if (head === 'opus') {
      container = 'ogg';
    }
    // mp3, flac and aac are sent without a container
  } else if (head === 'wav') {
    encoding = 'linear16'; // default encoding for wav
    container = 'wav';
  } else if (head === 'ogg') {
    encoding = 'opus'; // default encoding for ogg
    container = 'ogg';
  }

  if (rateSuffix === undefined || encoding === undefined) {
    // Previously such formats were dropped without any feedback.
    warnings.push({
      type: 'unsupported',
      feature: 'outputFormat',
      details: `Output format "${outputFormat}" is not recognized and was ignored.`,
    });
    return;
  }

  queryParams.set('encoding', encoding);
  if (container !== undefined) {
    queryParams.set('container', container);
  }

  const requestedRate = Number.parseInt(rateSuffix, 10);
  if (Number.isNaN(requestedRate)) {
    return;
  }
  if (isDeepgramSampleRateAllowed(encoding, requestedRate)) {
    queryParams.set('sample_rate', String(requestedRate));
  } else if (!DEEPGRAM_FIXED_RATE_ENCODINGS.includes(encoding)) {
    // Fixed-rate encodings drop the suffix silently; for configurable
    // encodings the caller asked for something that cannot be honored.
    warnings.push({
      type: 'unsupported',
      feature: 'outputFormat',
      details: `Encoding "${encoding}" does not support sample rate ${requestedRate} requested by output format "${outputFormat}". The sample rate was ignored.`,
    });
  }
}

/**
 * Applies the `encoding` and `container` provider options on top of whatever
 * `outputFormat` selected, keeping the combination valid.
 */
function applyDeepgramEncodingOptions(
  options: Pick<DeepgramSpeechCallOptions, 'encoding' | 'container'>,
  queryParams: URLSearchParams,
  warnings: SharedV3Warning[],
): void {
  if (options.encoding) {
    const encoding = options.encoding.toLowerCase();
    const requestedContainer = options.container
      ? options.container.toLowerCase()
      : undefined;

    queryParams.set('encoding', encoding);

    if (DEEPGRAM_WAV_FAMILY_ENCODINGS.includes(encoding)) {
      if (
        requestedContainer !== undefined &&
        DEEPGRAM_WAV_FAMILY_CONTAINERS.includes(requestedContainer)
      ) {
        queryParams.set('container', requestedContainer);
      } else {
        if (requestedContainer !== undefined) {
          warnings.push({
            type: 'unsupported',
            feature: 'providerOptions',
            details: `Encoding "${encoding}" only supports containers "wav" or "none". Container "${options.container}" was ignored.`,
          });
        }
        // Also replaces a stale container (e.g. "ogg") left by outputFormat.
        ensureWavFamilyContainer(queryParams);
      }
    } else if (encoding === 'opus') {
      // opus requires the ogg container, whatever was set before
      queryParams.set('container', 'ogg');
    } else if (DEEPGRAM_CONTAINERLESS_ENCODINGS.includes(encoding)) {
      if (requestedContainer !== undefined) {
        warnings.push({
          type: 'unsupported',
          feature: 'providerOptions',
          details: `Encoding "${encoding}" does not support container parameter. Container "${options.container}" was ignored.`,
        });
      }
      queryParams.delete('container');
    }

    dropParamsInvalidForEncoding(queryParams, encoding);
    return;
  }

  if (!options.container) {
    return;
  }

  // Container given without an encoding: keep the current encoding when it
  // is compatible with the container, otherwise fall back to the default.
  const container = options.container.toLowerCase();
  const currentEncoding = queryParams.get('encoding') ?? undefined;
  let encoding: string;

  if (DEEPGRAM_WAV_FAMILY_CONTAINERS.includes(container)) {
    encoding =
      currentEncoding !== undefined &&
      DEEPGRAM_WAV_FAMILY_ENCODINGS.includes(currentEncoding)
        ? currentEncoding // e.g. outputFormat "mulaw" + container "none"
        : 'linear16'; // default encoding for wav / raw audio
  } else if (container === 'ogg') {
    encoding = 'opus'; // default encoding for ogg
  } else {
    warnings.push({
      type: 'unsupported',
      feature: 'providerOptions',
      details: `Container "${options.container}" is not supported. Supported containers are "wav", "ogg" and "none". Container "${options.container}" was ignored.`,
    });
    return;
  }

  queryParams.set('container', container);
  if (encoding !== currentEncoding) {
    queryParams.set('encoding', encoding);
    dropParamsInvalidForEncoding(queryParams, encoding);
  }
}

/** Validates the `sampleRate` provider option against the effective encoding. */
function applyDeepgramSampleRateOption(
  sampleRate: number,
  queryParams: URLSearchParams,
  warnings: SharedV3Warning[],
): void {
  const encoding = queryParams.get('encoding')?.toLowerCase() ?? '';
  const allowedRates = DEEPGRAM_CONFIGURABLE_SAMPLE_RATES.get(encoding);

  if (allowedRates !== undefined) {
    if (allowedRates.includes(sampleRate)) {
      queryParams.set('sample_rate', String(sampleRate));
    } else {
      warnings.push({
        type: 'unsupported',
        feature: 'providerOptions',
        details: `Encoding "${encoding}" only supports sample rates: ${allowedRates.join(', ')}. Sample rate ${sampleRate} was ignored.`,
      });
    }
  } else if (DEEPGRAM_FIXED_RATE_ENCODINGS.includes(encoding)) {
    warnings.push({
      type: 'unsupported',
      feature: 'providerOptions',
      details: `Encoding "${encoding}" has a fixed sample rate and does not support sample_rate parameter. Sample rate ${sampleRate} was ignored.`,
    });
  } else {
    // Unknown or missing encoding: pass the value through to the API.
    queryParams.set('sample_rate', String(sampleRate));
  }
}

/** Validates the `bitRate` provider option against the effective encoding. */
function applyDeepgramBitRateOption(
  bitRate: number | string,
  queryParams: URLSearchParams,
  warnings: SharedV3Warning[],
): void {
  const encoding = queryParams.get('encoding')?.toLowerCase() ?? '';
  const numericBitRate = Number(bitRate);
  let rejection: string | undefined;

  if (encoding === 'mp3') {
    if (![32000, 48000].includes(numericBitRate)) {
      rejection = `Encoding "mp3" only supports bit rates: 32000, 48000. Bit rate ${bitRate} was ignored.`;
    }
  } else if (encoding === 'opus') {
    // Written as a negated range so NaN (non-numeric strings) is rejected too.
    if (!(numericBitRate >= 4000 && numericBitRate <= 650000)) {
      rejection = `Encoding "opus" supports bit rates between 4000 and 650000. Bit rate ${bitRate} was ignored.`;
    }
  } else if (encoding === 'aac') {
    if (!(numericBitRate >= 4000 && numericBitRate <= 192000)) {
      rejection = `Encoding "aac" supports bit rates between 4000 and 192000. Bit rate ${bitRate} was ignored.`;
    }
  } else if (DEEPGRAM_NO_BIT_RATE_ENCODINGS.includes(encoding)) {
    rejection = `Encoding "${encoding}" does not support bit_rate parameter. Bit rate ${bitRate} was ignored.`;
  }

  if (rejection !== undefined) {
    warnings.push({
      type: 'unsupported',
      feature: 'providerOptions',
      details: rejection,
    });
    return;
  }
  // Valid for the encoding, or unknown/missing encoding: pass through.
  queryParams.set('bit_rate', String(bitRate));
}

/**
 * Speech (text-to-speech) model backed by Deepgram's `/v1/speak` REST endpoint.
 *
 * The text is sent as the JSON body; the model, audio format and the
 * remaining options are sent as query parameters. Call options that cannot
 * be honored are reported through `warnings` instead of failing the call.
 */
export class DeepgramSpeechModel implements SpeechModelV3 {
  readonly specificationVersion = 'v3';

  /** Provider name taken from the model configuration. */
  get provider(): string {
    return this.config.provider;
  }

  /**
   * @param modelId Deepgram model ID; it also selects the voice.
   * @param config Provider configuration (URL builder, headers, fetch).
   */
  constructor(
    readonly modelId: DeepgramSpeechModelId,
    private readonly config: DeepgramSpeechModelConfig,
  ) {}

  /**
   * Converts the generic call options into the request body, the query
   * parameters and the list of warnings for options that were ignored.
   *
   * Provider options take precedence over what `outputFormat` selected.
   */
  private async getArgs({
    text,
    voice,
    outputFormat = 'mp3',
    speed,
    language,
    instructions,
    providerOptions,
  }: Parameters<SpeechModelV3['doGenerate']>[0]) {
    const warnings: SharedV3Warning[] = [];

    const deepgramOptions = await parseProviderOptions({
      provider: 'deepgram',
      providerOptions,
      schema: deepgramSpeechProviderOptionsSchema,
    });

    const requestBody = { text };

    // URLSearchParams (rather than a plain record) so that repeatable
    // parameters such as `tag` can be appended more than once.
    const queryParams = new URLSearchParams({ model: this.modelId });

    if (outputFormat) {
      applyDeepgramOutputFormat(outputFormat, queryParams, warnings);
    }

    if (deepgramOptions) {
      applyDeepgramEncodingOptions(deepgramOptions, queryParams, warnings);

      if (deepgramOptions.sampleRate != null) {
        applyDeepgramSampleRateOption(
          deepgramOptions.sampleRate,
          queryParams,
          warnings,
        );
      }
      if (deepgramOptions.bitRate != null) {
        applyDeepgramBitRateOption(
          deepgramOptions.bitRate,
          queryParams,
          warnings,
        );
      }

      // Remaining options map camelCase names to snake_case parameters.
      if (deepgramOptions.callback) {
        queryParams.set('callback', deepgramOptions.callback);
      }
      if (deepgramOptions.callbackMethod) {
        queryParams.set('callback_method', deepgramOptions.callbackMethod);
      }
      if (deepgramOptions.mipOptOut != null) {
        queryParams.set('mip_opt_out', String(deepgramOptions.mipOptOut));
      }
      if (deepgramOptions.tag) {
        // One `tag` parameter per label; joining with "," would turn several
        // labels into a single one.
        const tags = Array.isArray(deepgramOptions.tag)
          ? deepgramOptions.tag
          : [deepgramOptions.tag];
        for (const tag of tags) {
          queryParams.append('tag', tag);
        }
      }
    }

    // Deepgram embeds the voice in the model ID, so a differing voice is ignored.
    if (voice && voice !== this.modelId) {
      warnings.push({
        type: 'unsupported',
        feature: 'voice',
        details: `Deepgram TTS models embed the voice in the model ID. The voice parameter "${voice}" was ignored. Use the model ID to select a voice (e.g., "aura-2-helena-en").`,
      });
    }

    if (speed != null) {
      warnings.push({
        type: 'unsupported',
        feature: 'speed',
        details: `Deepgram TTS REST API does not support speed adjustment. Speed parameter was ignored.`,
      });
    }

    if (language) {
      warnings.push({
        type: 'unsupported',
        feature: 'language',
        details: `Deepgram TTS models are language-specific via the model ID. Language parameter "${language}" was ignored. Select a model with the appropriate language suffix (e.g., "-en" for English).`,
      });
    }

    if (instructions) {
      warnings.push({
        type: 'unsupported',
        feature: 'instructions',
        details: `Deepgram TTS REST API does not support instructions. Instructions parameter was ignored.`,
      });
    }

    return {
      requestBody,
      queryParams,
      warnings,
    };
  }

  /**
   * Generates speech for `options.text` and returns the binary audio together
   * with warnings and request/response metadata.
   */
  async doGenerate(
    options: Parameters<SpeechModelV3['doGenerate']>[0],
  ): Promise<Awaited<ReturnType<SpeechModelV3['doGenerate']>>> {
    // Captured before the request so the timestamp marks the start of the call.
    const currentDate = this.config._internal?.currentDate?.() ?? new Date();
    const { requestBody, queryParams, warnings } = await this.getArgs(options);

    const baseUrl = this.config.url({
      path: '/v1/speak',
      modelId: this.modelId,
    });
    const queryString = queryParams.toString();
    const url = queryString ? `${baseUrl}?${queryString}` : baseUrl;

    const {
      value: audio,
      responseHeaders,
      rawValue: rawResponse,
    } = await postJsonToApi({
      url,
      headers: combineHeaders(this.config.headers(), options.headers),
      body: requestBody,
      failedResponseHandler: deepgramFailedResponseHandler,
      successfulResponseHandler: createBinaryResponseHandler(),
      abortSignal: options.abortSignal,
      fetch: this.config.fetch,
    });

    return {
      audio,
      warnings,
      request: {
        body: JSON.stringify(requestBody),
      },
      response: {
        timestamp: currentDate,
        modelId: this.modelId,
        headers: responseHeaders,
        body: rawResponse,
      },
    };
  }
}
@@ -0,0 +1,10 @@
1
/** Model IDs listed explicitly so editors can offer them as completions. */
type KnownDeepgramSpeechModelId =
  | 'aura-asteria-en'
  | 'aura-2-asteria-en'
  | 'aura-2-thalia-en'
  | 'aura-2-helena-en'
  | 'aura-2-orpheus-en'
  | 'aura-2-zeus-en'
  | 'aura-luna-en'
  | 'aura-stella-en';

/**
 * Identifier of a Deepgram speech model. Any string is accepted; the
 * `string & {}` member keeps the known literals visible to autocompletion
 * instead of collapsing the union to plain `string`.
 */
export type DeepgramSpeechModelId = KnownDeepgramSpeechModelId | (string & {});