sarvam-ai-sdk 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # AI SDK - Sarvam Provider
2
2
 
3
- The **[Sarvam provider](https://ai-sdk.dev/providers/ai-sdk-providers/sarvam)** for the [AI SDK](https://ai-sdk.dev/docs)
3
+ The **[Sarvam provider](https://v4.ai-sdk.dev/providers/ai-sdk-providers/sarvam)** for the [AI SDK](https://v4.ai-sdk.dev/docs)
4
4
  contains language model support for the Sarvam chat completion, Text-to-Speech and Speech-to-Text APIs.
5
5
 
6
6
  ## Setup
@@ -11,6 +11,9 @@ The **[Sarvam](http://sarvam.ai)** provider is available in the `sarvam-ai-sdk`
11
11
  npm i sarvam-ai-sdk
12
12
  ```
13
13
 
14
+ > [!WARNING]
15
+ > This package only works with Vercel AI SDK v4, not the latest v6. Make sure to install `ai@4` in your project.
16
+
14
17
  ## Provider Instance
15
18
 
16
19
  You can import the default provider instance `sarvam` from `sarvam-ai-sdk`:
@@ -31,7 +34,7 @@ import { sarvam } from 'sarvam-ai-sdk';
31
34
  import { generateText } from 'ai';
32
35
 
33
36
  const { text } = await generateText({
34
- model: sarvam("sarvam-m"),
37
+ model: sarvam("sarvam-30b"),
35
38
  prompt: "Translate this to malayalam: 'Keep cooking, guys'",
36
39
  });
37
40
 
@@ -46,7 +49,7 @@ import { experimental_generateSpeech as generateSpeech } from "ai";
46
49
  import { writeFile } from "fs/promises";
47
50
 
48
51
  const { audio } = await generateSpeech({
49
- model: sarvam.speech("bulbul:v2", "ml-IN"),
52
+ model: sarvam.speech("bulbul:v3", "ml-IN"),
50
53
  text: "പാചകം തുടരൂ, സുഹൃത്തുക്കളേ",
51
54
  });
52
55
 
@@ -62,13 +65,26 @@ import { experimental_transcribe as transcribe } from "ai";
62
65
  import { readFile } from "fs/promises";
63
66
 
64
67
  const { text } = await transcribe({
65
- model: sarvam.transcription("saarika:v2", "ml-IN")
68
+ model: sarvam.transcription("saarika:v2.5", "ml-IN"),
66
69
  audio: await readFile("./src/transcript-test.wav"),
67
70
  });
68
71
 
69
72
  console.log(text); // പാചകം തുടരും സുഹൃത്തുക്കളെ
70
73
  ```
71
74
 
75
+ ```ts
76
+ import { sarvam } from "sarvam-ai-sdk";
77
+ import { experimental_transcribe as transcribe } from "ai";
78
+ import { readFile } from "fs/promises";
79
+
80
+ const { text } = await transcribe({
81
+ model: sarvam.transcription("saaras:v3", "en-IN"),
82
+ audio: await readFile("./src/transcript-test.wav"),
83
+ });
84
+
85
+ console.log(text); // Pachakam thudaroo, suhruthukkale.
86
+ ```
87
+
72
88
  ## Speech-to-Text-Translate
73
89
 
74
90
  ```ts
@@ -77,7 +93,7 @@ import { experimental_transcribe as transcribe } from "ai";
77
93
  import { readFile } from "fs/promises";
78
94
 
79
95
  const result = await transcribe({
80
- model: sarvam.speechTranslation("saaras:v2"),
96
+ model: sarvam.speechTranslation("saaras:v2.5"),
81
97
  audio: await readFile("./src/transcript-test.wav"),
82
98
  });
83
99
 
@@ -141,7 +157,7 @@ console.log(result.text); // ml-IN
141
157
  ## Tool Calling
142
158
 
143
159
  > [!WARNING]
144
- > Latest `sarvam-m` model isn't trained on native tool calling feature (aka JSON mode). So we simulate this with prompt engineering technique.
160
+ > The latest `sarvam` models aren't trained for native tool calling (aka JSON mode), so we simulate it with a prompt-engineering technique.
145
161
 
146
162
  ```ts
147
163
  import { z } from "zod";
@@ -150,7 +166,7 @@ import { sarvam } from "sarvam-ai-sdk";
150
166
 
151
167
 
152
168
  const result = await generateText({
153
- model: sarvam("sarvam-m", {
169
+ model: sarvam("sarvam-30b", {
154
170
  simulate: "tool-calling" // ⚠️ important
155
171
  }),
156
172
  tools: {
@@ -174,7 +190,7 @@ console.log(result.toolResults);
174
190
  ## Generate JSON object
175
191
 
176
192
  > [!WARNING]
177
- > Latest `sarvam-m` model isn't trained on native JSON object generation. So we simulate this with prompt engineering technique.
193
+ > The latest `sarvam` models aren't trained for native JSON object generation, so we simulate it with a prompt-engineering technique.
178
194
 
179
195
  ```ts
180
196
  import { z } from "zod";
@@ -182,7 +198,7 @@ import { sarvam } from "sarvam-ai-sdk";
182
198
  import { generateObject } from 'ai';
183
199
 
184
200
  const { object } = await generateObject({
185
- model: sarvam("sarvam-m", {
201
+ model: sarvam("sarvam-30b", {
186
202
  simulate: "json-object" // ⚠️ important
187
203
  }),
188
204
  schema: z.object({
@@ -198,6 +214,35 @@ const { object } = await generateObject({
198
214
  console.log(object);
199
215
  ```
200
216
 
217
+ ## All APIs
218
+
219
+ ```ts
220
+ import { sarvam } from "sarvam-ai-sdk";
221
+
222
+ // Text-to-Text + Chat Completion
223
+ sarvam("sarvam-105b");
224
+ sarvam.languageModel("sarvam-30b");
225
+
226
+ // Text-to-Text + Transliteration
227
+ sarvam.transliterate({ from: "en-IN", to: "ml-IN" });
228
+
229
+ // Text-to-Text + Translation
230
+ sarvam.translation({ from: "en-IN", to: "ml-IN" });
231
+
232
+ // Text-to-Text + Language identification
233
+ sarvam.languageIdentification();
234
+
235
+ // Text-to-Speech
236
+ sarvam.speech("bulbul:v3", "ml-IN");
237
+
238
+ // Speech-to-Text + Transcribe to same language
239
+ sarvam.transcription("saarika:v2.5");
240
+
241
+ // Speech-to-Text + Translate to English
242
+ sarvam.speechTranslation("saaras:v3");
243
+ ```
244
+
245
+
201
246
  ## Documentation
202
247
 
203
- Please check out the **[Sarvam provider documentation](https://ai-sdk.dev/providers/ai-sdk-providers/sarvam)** and **[Sarvam API documentation](https://docs.sarvam.ai)** for more information.
248
+ Please check out the **[Sarvam provider documentation](https://v4.ai-sdk.dev/providers/ai-sdk-providers/sarvam)** and **[Sarvam API documentation](https://docs.sarvam.ai)** for more information.
package/dist/index.d.mts CHANGED
@@ -2,7 +2,15 @@ import { LanguageModelV1, TranscriptionModelV1, SpeechModelV1 } from '@ai-sdk/pr
2
2
  import { FetchFunction } from '@ai-sdk/provider-utils';
3
3
  import { z } from 'zod';
4
4
 
5
- type SarvamChatModelId = "sarvam-m" | (string & {});
5
+ /**
6
+ * @description Product models
7
+ */
8
+ type SarvamChatModelId = "sarvam-30b" | "sarvam-30b-16k" | "sarvam-105b" | "sarvam-105b-32k" | SarvamChatLegacyModelId | (string & {});
9
+ /**
10
+ * @description Legacy models
11
+ * @deprecated
12
+ */
13
+ type SarvamChatLegacyModelId = "sarvam-m";
6
14
  interface SarvamChatSettings {
7
15
  /**
8
16
  * Whether to simulate artificial tool calling or JSON object generation, because Sarvam models don't support native tool calling or JSON Schema.
@@ -42,9 +50,9 @@ interface SarvamChatSettings {
42
50
  type SarvamLanguageCode = z.infer<typeof SarvamLanguageCodeSchema>;
43
51
  declare const SarvamLanguageCodeSchema: z.ZodEnum<["hi-IN", "bn-IN", "kn-IN", "ml-IN", "mr-IN", "od-IN", "pa-IN", "ta-IN", "te-IN", "en-IN", "gu-IN"]>;
44
52
 
45
- type SarvamSpeechModelId = "bulbul:v1" | "bulbul:v2" | (string & {});
53
+ type SarvamSpeechModelId = "bulbul:v2" | "bulbul:v3" | (string & {});
46
54
  type SarvamSpeechVoices = z.infer<typeof SpeakerSchema>;
47
- declare const SpeakerSchema: z.ZodDefault<z.ZodEnum<["meera", "pavithra", "maitreyi", "arvind", "amol", "amartya", "diya", "neel", "misha", "vian", "arjun", "maya", "anushka", "abhilash", "manisha", "vidya", "arya", "karun", "hitesh"]>>;
55
+ declare const SpeakerSchema: z.ZodEnum<["abhilash", "karun", "hitesh", "anushka", "manisha", "vidya", "arya", "shubh", "aditya", "rahul", "rohan", "amit", "dev", "ratan", "varun", "manan", "sumit", "kabir", "aayan", "ashutosh", "advait", "anand", "tarun", "sunny", "mani", "gokul", "vijay", "mohit", "rehan", "soham", "ritu", "priya", "neha", "pooja", "simran", "kavya", "ishita", "shreya", "roopa", "amelia", "sophia", "tanya", "shruti", "suhani", "kavitha", "rupali"]>;
48
56
  /**
49
57
  * Configuration settings for Sarvam Text-to-Speech API.
50
58
  *
@@ -56,10 +64,10 @@ type SarvamSpeechSettings = {
56
64
  /**
57
65
  * The speaker voice to be used for the output audio.
58
66
  *
59
- * @default "meera"
60
- * @example "meera" (Default female voice for bulbul:v1)
61
- * @example "arvind" (Male voice for bulbul:v1)
62
- * @example "anushka" (Female voice for bulbul:v2)
67
+ * @default
68
+ * - "shubh" (Male voice for bulbul:v3)
69
+ * - "anushka" (Female voice for bulbul:v2)
70
+ * - "meera" (Female voice for bulbul:v1)
63
71
  */
64
72
  speaker?: SarvamSpeechVoices;
65
73
  /**
@@ -103,11 +111,74 @@ type SarvamSpeechSettings = {
103
111
  * @example false (Disable preprocessing)
104
112
  */
105
113
  enable_preprocessing?: boolean;
114
+ /**
115
+ * Specifies the audio codec for the output audio file.
116
+ * Different codecs offer various compression and quality characteristics.
117
+ */
118
+ output_audio_codec?: "mp3" | "linear16" | "mulaw" | "alaw" | "opus" | "flac" | "aac" | "wav";
119
+ /**
120
+ * Temperature controls how much randomness and expressiveness the TTS model uses while generating speech.
121
+ * Lower values produce more stable and consistent output,
122
+ * while higher values sound more expressive but may introduce artifacts or errors.
123
+ *
124
+ * Any number between 0.01 and 2 (inclusive).
125
+ * @default 0.6
126
+ *
127
+ * Note: This parameter is only supported for bulbul:v3. It has no effect on bulbul:v2.
128
+ */
129
+ temperature?: number;
130
+ /**
131
+ * The ID of a pronunciation dictionary to apply during synthesis.
132
+ * When provided, matching words in the input text will be replaced with their custom pronunciations before generating speech.
133
+ *
134
+ * Only supported by bulbul:v3.
135
+ */
136
+ dict_id?: string;
137
+ /**
138
+ * Enable caching for the request. When enabled, identical requests will return cached audio instead of regenerating.
139
+ *
140
+ * @default false
141
+ *
142
+ * Currently in beta and only available for bulbul:v1 and bulbul:v2 models.
143
+ */
144
+ enable_cached_responses?: boolean;
106
145
  };
107
146
 
108
- type SarvamTranscriptionModelId = "saarika:v2" | "saarika:v1" | "saarika:flash" | (string & {});
109
- type SarvamSpeechTranslationModelId = "saaras:v1" | "saaras:v2" | "saaras:turbo" | "saaras:flash" | (string & {});
147
+ type SarvamTranscriptionModelId = "saaras:v3" | "saarika:v2.5" | (string & {});
148
+ type SarvamSpeechTranslationModelId = "saaras:v3" | "saaras:v2.5" | (string & {});
149
+ declare const SarvamProviderOptionsSchema: z.ZodObject<{
150
+ mode: z.ZodDefault<z.ZodEnum<["transcribe", "translate", "verbatim", "translit", "codemix"]>>;
151
+ with_timestamps: z.ZodDefault<z.ZodOptional<z.ZodNullable<z.ZodBoolean>>>;
152
+ with_diarization: z.ZodDefault<z.ZodOptional<z.ZodNullable<z.ZodBoolean>>>;
153
+ num_speakers: z.ZodOptional<z.ZodNullable<z.ZodNumber>>;
154
+ }, "strip", z.ZodTypeAny, {
155
+ mode: "transcribe" | "translate" | "verbatim" | "translit" | "codemix";
156
+ with_timestamps: boolean | null;
157
+ with_diarization: boolean | null;
158
+ num_speakers?: number | null | undefined;
159
+ }, {
160
+ mode?: "transcribe" | "translate" | "verbatim" | "translit" | "codemix" | undefined;
161
+ with_timestamps?: boolean | null | undefined;
162
+ with_diarization?: boolean | null | undefined;
163
+ num_speakers?: number | null | undefined;
164
+ }>;
110
165
  type SarvamTranscriptionCallOptions = {
166
+ /**
167
+ * @default "transcribe"
168
+ *
169
+ * @description
170
+ * - `transcribe`: Standard transcription in the original language, `output`: Text in source language
171
+ * - `translate`: Transcribe and translate to English, `output`: English text
172
+ * - `verbatim`: Word-for-word transcription including filler words and repetitions, `output`: Verbatim text in source language
173
+ * - `translit`: Transcribe and transliterate to Roman script, `output`: Romanized text
174
+ * - `codemix`: Transcribe code-mixed speech (e.g., Hindi-English) naturally, `output`: Code-mixed text
175
+ */
176
+ mode?: z.infer<typeof SarvamProviderOptionsSchema.shape.mode>;
177
+ /**
178
+ * - Chunk-level timestamp support
179
+ * - Useful for subtitle alignment and audio navigation
180
+ * - Provides start and end times for each segment of text
181
+ */
111
182
  with_timestamps?: boolean;
112
183
  /**
113
184
  * Enables speaker diarization, which identifies and separates different speakers in the audio.
@@ -240,11 +311,19 @@ interface SarvamProvider {
240
311
  /**
241
312
  * Creates a model for text generation.
242
313
  */
243
- (modelId: SarvamChatModelId, settings?: SarvamChatSettings): LanguageModelV1;
314
+ (
315
+ /**
316
+ * @description Sarvam-M (24B) is now a legacy model; we recommend migrating to Sarvam-30B or Sarvam-105B for improved performance.
317
+ */
318
+ modelId: SarvamChatModelId, settings?: SarvamChatSettings): LanguageModelV1;
244
319
  /**
245
320
  * Creates a Sarvam chat model for text generation.
246
321
  */
247
- languageModel(modelId: SarvamChatModelId, settings?: SarvamChatSettings): LanguageModelV1;
322
+ languageModel(
323
+ /**
324
+ * @description Sarvam-M (24B) is now a legacy model; we recommend migrating to Sarvam-30B or Sarvam-105B for improved performance.
325
+ */
326
+ modelId: SarvamChatModelId, settings?: SarvamChatSettings): LanguageModelV1;
248
327
  /**
249
328
  * Creates a Sarvam model for transcription.
250
329
  */
package/dist/index.d.ts CHANGED
@@ -2,7 +2,15 @@ import { LanguageModelV1, TranscriptionModelV1, SpeechModelV1 } from '@ai-sdk/pr
2
2
  import { FetchFunction } from '@ai-sdk/provider-utils';
3
3
  import { z } from 'zod';
4
4
 
5
- type SarvamChatModelId = "sarvam-m" | (string & {});
5
+ /**
6
+ * @description Product models
7
+ */
8
+ type SarvamChatModelId = "sarvam-30b" | "sarvam-30b-16k" | "sarvam-105b" | "sarvam-105b-32k" | SarvamChatLegacyModelId | (string & {});
9
+ /**
10
+ * @description Legacy models
11
+ * @deprecated
12
+ */
13
+ type SarvamChatLegacyModelId = "sarvam-m";
6
14
  interface SarvamChatSettings {
7
15
  /**
8
16
  * Whether to simulate artificial tool calling or JSON object generation, because Sarvam models don't support native tool calling or JSON Schema.
@@ -42,9 +50,9 @@ interface SarvamChatSettings {
42
50
  type SarvamLanguageCode = z.infer<typeof SarvamLanguageCodeSchema>;
43
51
  declare const SarvamLanguageCodeSchema: z.ZodEnum<["hi-IN", "bn-IN", "kn-IN", "ml-IN", "mr-IN", "od-IN", "pa-IN", "ta-IN", "te-IN", "en-IN", "gu-IN"]>;
44
52
 
45
- type SarvamSpeechModelId = "bulbul:v1" | "bulbul:v2" | (string & {});
53
+ type SarvamSpeechModelId = "bulbul:v2" | "bulbul:v3" | (string & {});
46
54
  type SarvamSpeechVoices = z.infer<typeof SpeakerSchema>;
47
- declare const SpeakerSchema: z.ZodDefault<z.ZodEnum<["meera", "pavithra", "maitreyi", "arvind", "amol", "amartya", "diya", "neel", "misha", "vian", "arjun", "maya", "anushka", "abhilash", "manisha", "vidya", "arya", "karun", "hitesh"]>>;
55
+ declare const SpeakerSchema: z.ZodEnum<["abhilash", "karun", "hitesh", "anushka", "manisha", "vidya", "arya", "shubh", "aditya", "rahul", "rohan", "amit", "dev", "ratan", "varun", "manan", "sumit", "kabir", "aayan", "ashutosh", "advait", "anand", "tarun", "sunny", "mani", "gokul", "vijay", "mohit", "rehan", "soham", "ritu", "priya", "neha", "pooja", "simran", "kavya", "ishita", "shreya", "roopa", "amelia", "sophia", "tanya", "shruti", "suhani", "kavitha", "rupali"]>;
48
56
  /**
49
57
  * Configuration settings for Sarvam Text-to-Speech API.
50
58
  *
@@ -56,10 +64,10 @@ type SarvamSpeechSettings = {
56
64
  /**
57
65
  * The speaker voice to be used for the output audio.
58
66
  *
59
- * @default "meera"
60
- * @example "meera" (Default female voice for bulbul:v1)
61
- * @example "arvind" (Male voice for bulbul:v1)
62
- * @example "anushka" (Female voice for bulbul:v2)
67
+ * @default
68
+ * - "shubh" (Male voice for bulbul:v3)
69
+ * - "anushka" (Female voice for bulbul:v2)
70
+ * - "meera" (Female voice for bulbul:v1)
63
71
  */
64
72
  speaker?: SarvamSpeechVoices;
65
73
  /**
@@ -103,11 +111,74 @@ type SarvamSpeechSettings = {
103
111
  * @example false (Disable preprocessing)
104
112
  */
105
113
  enable_preprocessing?: boolean;
114
+ /**
115
+ * Specifies the audio codec for the output audio file.
116
+ * Different codecs offer various compression and quality characteristics.
117
+ */
118
+ output_audio_codec?: "mp3" | "linear16" | "mulaw" | "alaw" | "opus" | "flac" | "aac" | "wav";
119
+ /**
120
+ * Temperature controls how much randomness and expressiveness the TTS model uses while generating speech.
121
+ * Lower values produce more stable and consistent output,
122
+ * while higher values sound more expressive but may introduce artifacts or errors.
123
+ *
124
+ * Any number between 0.01 and 2 (inclusive).
125
+ * @default 0.6
126
+ *
127
+ * Note: This parameter is only supported for bulbul:v3. It has no effect on bulbul:v2.
128
+ */
129
+ temperature?: number;
130
+ /**
131
+ * The ID of a pronunciation dictionary to apply during synthesis.
132
+ * When provided, matching words in the input text will be replaced with their custom pronunciations before generating speech.
133
+ *
134
+ * Only supported by bulbul:v3.
135
+ */
136
+ dict_id?: string;
137
+ /**
138
+ * Enable caching for the request. When enabled, identical requests will return cached audio instead of regenerating.
139
+ *
140
+ * @default false
141
+ *
142
+ * Currently in beta and only available for bulbul:v1 and bulbul:v2 models.
143
+ */
144
+ enable_cached_responses?: boolean;
106
145
  };
107
146
 
108
- type SarvamTranscriptionModelId = "saarika:v2" | "saarika:v1" | "saarika:flash" | (string & {});
109
- type SarvamSpeechTranslationModelId = "saaras:v1" | "saaras:v2" | "saaras:turbo" | "saaras:flash" | (string & {});
147
+ type SarvamTranscriptionModelId = "saaras:v3" | "saarika:v2.5" | (string & {});
148
+ type SarvamSpeechTranslationModelId = "saaras:v3" | "saaras:v2.5" | (string & {});
149
+ declare const SarvamProviderOptionsSchema: z.ZodObject<{
150
+ mode: z.ZodDefault<z.ZodEnum<["transcribe", "translate", "verbatim", "translit", "codemix"]>>;
151
+ with_timestamps: z.ZodDefault<z.ZodOptional<z.ZodNullable<z.ZodBoolean>>>;
152
+ with_diarization: z.ZodDefault<z.ZodOptional<z.ZodNullable<z.ZodBoolean>>>;
153
+ num_speakers: z.ZodOptional<z.ZodNullable<z.ZodNumber>>;
154
+ }, "strip", z.ZodTypeAny, {
155
+ mode: "transcribe" | "translate" | "verbatim" | "translit" | "codemix";
156
+ with_timestamps: boolean | null;
157
+ with_diarization: boolean | null;
158
+ num_speakers?: number | null | undefined;
159
+ }, {
160
+ mode?: "transcribe" | "translate" | "verbatim" | "translit" | "codemix" | undefined;
161
+ with_timestamps?: boolean | null | undefined;
162
+ with_diarization?: boolean | null | undefined;
163
+ num_speakers?: number | null | undefined;
164
+ }>;
110
165
  type SarvamTranscriptionCallOptions = {
166
+ /**
167
+ * @default "transcribe"
168
+ *
169
+ * @description
170
+ * - `transcribe`: Standard transcription in the original language, `output`: Text in source language
171
+ * - `translate`: Transcribe and translate to English, `output`: English text
172
+ * - `verbatim`: Word-for-word transcription including filler words and repetitions, `output`: Verbatim text in source language
173
+ * - `translit`: Transcribe and transliterate to Roman script, `output`: Romanized text
174
+ * - `codemix`: Transcribe code-mixed speech (e.g., Hindi-English) naturally, `output`: Code-mixed text
175
+ */
176
+ mode?: z.infer<typeof SarvamProviderOptionsSchema.shape.mode>;
177
+ /**
178
+ * - Chunk-level timestamp support
179
+ * - Useful for subtitle alignment and audio navigation
180
+ * - Provides start and end times for each segment of text
181
+ */
111
182
  with_timestamps?: boolean;
112
183
  /**
113
184
  * Enables speaker diarization, which identifies and separates different speakers in the audio.
@@ -240,11 +311,19 @@ interface SarvamProvider {
240
311
  /**
241
312
  * Creates a model for text generation.
242
313
  */
243
- (modelId: SarvamChatModelId, settings?: SarvamChatSettings): LanguageModelV1;
314
+ (
315
+ /**
316
+ * @description Sarvam-M (24B) is now a legacy model; we recommend migrating to Sarvam-30B or Sarvam-105B for improved performance.
317
+ */
318
+ modelId: SarvamChatModelId, settings?: SarvamChatSettings): LanguageModelV1;
244
319
  /**
245
320
  * Creates a Sarvam chat model for text generation.
246
321
  */
247
- languageModel(modelId: SarvamChatModelId, settings?: SarvamChatSettings): LanguageModelV1;
322
+ languageModel(
323
+ /**
324
+ * @description Sarvam-M (24B) is now a legacy model; we recommend migrating to Sarvam-30B or Sarvam-105B for improved performance.
325
+ */
326
+ modelId: SarvamChatModelId, settings?: SarvamChatSettings): LanguageModelV1;
248
327
  /**
249
328
  * Creates a Sarvam model for transcription.
250
329
  */
package/dist/index.js CHANGED
@@ -767,30 +767,72 @@ var sarvamChatChunkSchema = import_zod2.z.union([
767
767
 
768
768
  // src/sarvam-speech-model.ts
769
769
  var import_provider_utils5 = require("@ai-sdk/provider-utils");
770
+ var import_zod4 = require("zod");
770
771
 
771
772
  // src/sarvam-speech-settings.ts
772
773
  var import_zod3 = require("zod");
773
774
  var SpeakerSchema = import_zod3.z.enum([
774
- "meera",
775
- "pavithra",
776
- "maitreyi",
777
- "arvind",
778
- "amol",
779
- "amartya",
780
- "diya",
781
- "neel",
782
- "misha",
783
- "vian",
784
- "arjun",
785
- "maya",
786
- "anushka",
775
+ // male bulbul:v2
787
776
  "abhilash",
777
+ "karun",
778
+ "hitesh",
779
+ // female bulbul:v2
780
+ "anushka",
788
781
  "manisha",
789
782
  "vidya",
790
783
  "arya",
791
- "karun",
792
- "hitesh"
793
- ]).default("meera");
784
+ // male bulbul:v3
785
+ "shubh",
786
+ "aditya",
787
+ "rahul",
788
+ "rohan",
789
+ "amit",
790
+ "dev",
791
+ "ratan",
792
+ "varun",
793
+ "manan",
794
+ "sumit",
795
+ "kabir",
796
+ "aayan",
797
+ "ashutosh",
798
+ "advait",
799
+ "anand",
800
+ "tarun",
801
+ "sunny",
802
+ "mani",
803
+ "gokul",
804
+ "vijay",
805
+ "mohit",
806
+ "rehan",
807
+ "soham",
808
+ // female bulbul:v3
809
+ "ritu",
810
+ "priya",
811
+ "neha",
812
+ "pooja",
813
+ "simran",
814
+ "kavya",
815
+ "ishita",
816
+ "shreya",
817
+ "roopa",
818
+ "amelia",
819
+ "sophia",
820
+ "tanya",
821
+ "shruti",
822
+ "suhani",
823
+ "kavitha",
824
+ "rupali"
825
+ ]);
826
+ var outputAudioCodecSchema = import_zod3.z.enum([
827
+ "mp3",
828
+ "linear16",
829
+ "mulaw",
830
+ "alaw",
831
+ "opus",
832
+ "flac",
833
+ "aac",
834
+ "wav"
835
+ ]);
794
836
  var SarvamProviderOptionsSchema = import_zod3.z.object({
795
837
  speaker: SpeakerSchema,
796
838
  pitch: import_zod3.z.number().min(-0.75).max(0.75).default(0),
@@ -802,11 +844,14 @@ var SarvamProviderOptionsSchema = import_zod3.z.object({
802
844
  import_zod3.z.literal(22050),
803
845
  import_zod3.z.literal(24e3)
804
846
  ]).default(22050),
805
- enable_preprocessing: import_zod3.z.boolean().default(false)
847
+ enable_preprocessing: import_zod3.z.boolean().default(false),
848
+ output_audio_codec: outputAudioCodecSchema.optional(),
849
+ temperature: import_zod3.z.number().min(0.01).max(2).default(0.6),
850
+ dict_id: import_zod3.z.string().optional(),
851
+ enable_cached_responses: import_zod3.z.boolean().default(false)
806
852
  }).partial();
807
853
 
808
854
  // src/sarvam-speech-model.ts
809
- var import_zod4 = require("zod");
810
855
  var SarvamSpeechModel = class {
811
856
  constructor(modelId, languageCode, config) {
812
857
  this.modelId = modelId;
@@ -821,7 +866,7 @@ var SarvamSpeechModel = class {
821
866
  text,
822
867
  voice,
823
868
  outputFormat = "wav",
824
- // speed,
869
+ speed,
825
870
  // instructions,
826
871
  providerOptions
827
872
  }) {
@@ -837,48 +882,42 @@ var SarvamSpeechModel = class {
837
882
  schema: SarvamProviderOptionsSchema
838
883
  });
839
884
  const getSpeaker = () => {
840
- if (sarvamOptions == null ? void 0 : sarvamOptions.speaker) return sarvamOptions.speaker;
841
885
  if (voice) {
842
886
  return SpeakerSchema.parse(voice);
843
887
  }
844
888
  switch (this.modelId) {
845
- case "bulbul:v1":
846
- return "meera";
847
889
  case "bulbul:v2":
848
890
  return "manisha";
891
+ case "bulbul:v3":
892
+ return "shubh";
849
893
  }
850
- return "meera";
894
+ return "shubh";
851
895
  };
852
896
  const requestBody = {
853
897
  model: this.modelId,
854
898
  text,
855
899
  target_language_code: this.languageCode,
856
- speaker: getSpeaker()
857
- // response_format: "wav",
858
- // speed,
859
- // instructions,
900
+ speaker: getSpeaker(),
901
+ pace: speed
860
902
  };
861
903
  if (outputFormat) {
862
- if (["mp3", "opus", "aac", "flac", "wav", "pcm"].includes(
863
- outputFormat
864
- )) {
865
- requestBody.response_format = outputFormat;
904
+ const of = outputAudioCodecSchema.safeParse(outputFormat);
905
+ if (of.success) {
906
+ requestBody.output_audio_codec = of.data;
866
907
  } else {
867
908
  warnings.push({
868
909
  type: "unsupported-setting",
869
910
  setting: "outputFormat",
870
- details: `Unsupported output format: ${outputFormat}. Using mp3 instead.`
911
+ details: `Unsupported output format: ${outputFormat}. Using wav instead.`
871
912
  });
872
913
  }
873
914
  }
874
915
  if (sarvamOptions) {
875
- const speechModelOptions = {};
876
- for (const key in speechModelOptions) {
877
- const value = speechModelOptions[key];
916
+ Object.entries(sarvamOptions).forEach(([key, value]) => {
878
917
  if (value !== void 0) {
879
918
  requestBody[key] = value;
880
919
  }
881
- }
920
+ });
882
921
  }
883
922
  return {
884
923
  requestBody,
@@ -934,6 +973,7 @@ var import_zod6 = require("zod");
934
973
  // src/sarvam-transcription-settings.ts
935
974
  var import_zod5 = require("zod");
936
975
  var SarvamProviderOptionsSchema2 = import_zod5.z.object({
976
+ mode: import_zod5.z.enum(["transcribe", "translate", "verbatim", "translit", "codemix"]).default("transcribe"),
937
977
  with_timestamps: import_zod5.z.boolean().nullish().default(false),
938
978
  with_diarization: import_zod5.z.boolean().nullish().default(false),
939
979
  num_speakers: import_zod5.z.number().int().nullish()
@@ -956,10 +996,6 @@ var SarvamTranscriptionModel = class {
956
996
  providerOptions
957
997
  }) {
958
998
  const warnings = [];
959
- if (this.modelId === "saarika:v1" && this.languageCode === "unknown")
960
- throw new Error(
961
- "Language code unknown is not supported for model saarika:v1"
962
- );
963
999
  const sarvamOptions = (0, import_provider_utils6.parseProviderOptions)({
964
1000
  provider: "sarvam",
965
1001
  providerOptions: {
@@ -974,22 +1010,13 @@ var SarvamTranscriptionModel = class {
974
1010
  const blob = audio instanceof Blob ? audio : new Blob([audio], { type: mediaType });
975
1011
  formData.append("file", blob);
976
1012
  formData.append("model", this.modelId);
1013
+ formData.append("language_code", this.languageCode);
977
1014
  if (sarvamOptions) {
978
- formData.append("language_code", this.languageCode);
979
- formData.append(
980
- "with_timestamps",
981
- sarvamOptions.with_timestamps ? "true" : "false"
982
- );
983
- formData.append(
984
- "with_diarization",
985
- sarvamOptions.with_diarization ? "true" : "false"
986
- );
987
- if (sarvamOptions.num_speakers !== null && sarvamOptions.num_speakers !== void 0) {
988
- formData.append(
989
- "num_speakers",
990
- sarvamOptions.num_speakers.toString()
991
- );
992
- }
1015
+ Object.entries(sarvamOptions).forEach(([key, value]) => {
1016
+ if (value) {
1017
+ formData.append(key, String(value));
1018
+ }
1019
+ });
993
1020
  }
994
1021
  return {
995
1022
  formData,