sarvam-ai-sdk 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +55 -10
- package/dist/index.d.mts +90 -11
- package/dist/index.d.ts +90 -11
- package/dist/index.js +82 -55
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +82 -55
- package/dist/index.mjs.map +1 -1
- package/package.json +3 -3
package/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# AI SDK - Sarvam Provider
|
|
2
2
|
|
|
3
|
-
The **[Sarvam provider](https://ai-sdk.dev/providers/ai-sdk-providers/sarvam)** for the [AI SDK](https://ai-sdk.dev/docs)
|
|
3
|
+
The **[Sarvam provider](https://v4.ai-sdk.dev/providers/ai-sdk-providers/sarvam)** for the [AI SDK](https://v4.ai-sdk.dev/docs)
|
|
4
4
|
contains language model support for the Sarvam chat completion, Text-to-Speech and Speech-to-Text APIs.
|
|
5
5
|
|
|
6
6
|
## Setup
|
|
@@ -11,6 +11,9 @@ The **[Sarvam](http://sarvam.ai)** provider is available in the `sarvam-ai-sdk`
|
|
|
11
11
|
npm i sarvam-ai-sdk
|
|
12
12
|
```
|
|
13
13
|
|
|
14
|
+
> [!WARNING]
|
|
15
|
+
> This package only works with Vercel AI SDK v4, not the latest v6. Make sure to install `ai@4` in your project.
|
|
16
|
+
|
|
14
17
|
## Provider Instance
|
|
15
18
|
|
|
16
19
|
You can import the default provider instance `sarvam` from `sarvam-ai-sdk`:
|
|
@@ -31,7 +34,7 @@ import { sarvam } from 'sarvam-ai-sdk';
|
|
|
31
34
|
import { generateText } from 'ai';
|
|
32
35
|
|
|
33
36
|
const { text } = await generateText({
|
|
34
|
-
|
|
37
|
+
model: sarvam("sarvam-30b"),
|
|
35
38
|
prompt: "Translate this to malayalam: 'Keep cooking, guys'",
|
|
36
39
|
});
|
|
37
40
|
|
|
@@ -46,7 +49,7 @@ import { experimental_generateSpeech as generateSpeech } from "ai";
|
|
|
46
49
|
import { writeFile } from "fs/promises";
|
|
47
50
|
|
|
48
51
|
const { audio } = await generateSpeech({
|
|
49
|
-
model: sarvam.speech("bulbul:
|
|
52
|
+
model: sarvam.speech("bulbul:v3", "ml-IN"),
|
|
50
53
|
text: "പാചകം തുടരൂ, സുഹൃത്തുക്കളേ",
|
|
51
54
|
});
|
|
52
55
|
|
|
@@ -62,13 +65,26 @@ import { experimental_transcribe as transcribe } from "ai";
|
|
|
62
65
|
import { readFile } from "fs/promises";
|
|
63
66
|
|
|
64
67
|
const { text } = await transcribe({
|
|
65
|
-
model: sarvam.transcription("saarika:v2", "ml-IN")
|
|
68
|
+
model: sarvam.transcription("saarika:v2.5", "ml-IN"),
|
|
66
69
|
audio: await readFile("./src/transcript-test.wav"),
|
|
67
70
|
});
|
|
68
71
|
|
|
69
72
|
console.log(text); // പാചകം തുടരും സുഹൃത്തുക്കളെ
|
|
70
73
|
```
|
|
71
74
|
|
|
75
|
+
```ts
|
|
76
|
+
import { sarvam } from "sarvam-ai-sdk";
|
|
77
|
+
import { experimental_transcribe as transcribe } from "ai";
|
|
78
|
+
import { readFile } from "fs/promises";
|
|
79
|
+
|
|
80
|
+
const { text } = await transcribe({
|
|
81
|
+
model: sarvam.transcription("saaras:v3", "en-IN"),
|
|
82
|
+
audio: await readFile("./src/transcript-test.wav"),
|
|
83
|
+
});
|
|
84
|
+
|
|
85
|
+
console.log(text); // Pachakam thudaroo, suhruthukkale.
|
|
86
|
+
```
|
|
87
|
+
|
|
72
88
|
## Speech-to-Text-Translate
|
|
73
89
|
|
|
74
90
|
```ts
|
|
@@ -77,7 +93,7 @@ import { experimental_transcribe as transcribe } from "ai";
|
|
|
77
93
|
import { readFile } from "fs/promises";
|
|
78
94
|
|
|
79
95
|
const result = await transcribe({
|
|
80
|
-
model: sarvam.speechTranslation("saaras:v2"),
|
|
96
|
+
model: sarvam.speechTranslation("saaras:v2.5"),
|
|
81
97
|
audio: await readFile("./src/transcript-test.wav"),
|
|
82
98
|
});
|
|
83
99
|
|
|
@@ -141,7 +157,7 @@ console.log(result.text); // ml-IN
|
|
|
141
157
|
## Tool Calling
|
|
142
158
|
|
|
143
159
|
> [!WARNING]
|
|
144
|
-
> Latest `sarvam
|
|
160
|
+
> The latest `sarvam` models aren't trained on the native tool calling feature (aka JSON mode), so we simulate it with a prompt engineering technique.
|
|
145
161
|
|
|
146
162
|
```ts
|
|
147
163
|
import { z } from "zod";
|
|
@@ -150,7 +166,7 @@ import { sarvam } from "sarvam-ai-sdk";
|
|
|
150
166
|
|
|
151
167
|
|
|
152
168
|
const result = await generateText({
|
|
153
|
-
model: sarvam("sarvam-
|
|
169
|
+
model: sarvam("sarvam-30b", {
|
|
154
170
|
simulate: "tool-calling" // ⚠️ important
|
|
155
171
|
}),
|
|
156
172
|
tools: {
|
|
@@ -174,7 +190,7 @@ console.log(result.toolResults);
|
|
|
174
190
|
## Generate JSON object
|
|
175
191
|
|
|
176
192
|
> [!WARNING]
|
|
177
|
-
> Latest `sarvam
|
|
193
|
+
> The latest `sarvam` models aren't trained on native JSON object generation, so we simulate it with a prompt engineering technique.
|
|
178
194
|
|
|
179
195
|
```ts
|
|
180
196
|
import { z } from "zod";
|
|
@@ -182,7 +198,7 @@ import { sarvam } from "sarvam-ai-sdk";
|
|
|
182
198
|
import { generateObject } from 'ai';
|
|
183
199
|
|
|
184
200
|
const { object } = await generateObject({
|
|
185
|
-
model: sarvam("sarvam-
|
|
201
|
+
model: sarvam("sarvam-30b", {
|
|
186
202
|
simulate: "json-object" // ⚠️ important
|
|
187
203
|
}),
|
|
188
204
|
schema: z.object({
|
|
@@ -198,6 +214,35 @@ const { object } = await generateObject({
|
|
|
198
214
|
console.log(object);
|
|
199
215
|
```
|
|
200
216
|
|
|
217
|
+
## All APIs
|
|
218
|
+
|
|
219
|
+
```ts
|
|
220
|
+
import { sarvam } from "sarvam-ai-sdk";
|
|
221
|
+
|
|
222
|
+
// Text-to-Text + Chat Completion
|
|
223
|
+
sarvam("sarvam-105b");
|
|
224
|
+
sarvam.languageModel("sarvam-30b");
|
|
225
|
+
|
|
226
|
+
// Text-to-Text + Transliteration
|
|
227
|
+
sarvam.transliterate({ from: "en-IN", to: "ml-IN" });
|
|
228
|
+
|
|
229
|
+
// Text-to-Text + Translation
|
|
230
|
+
sarvam.translation({ from: "en-IN", to: "ml-IN" });
|
|
231
|
+
|
|
232
|
+
// Text-to-Text + Language identification
|
|
233
|
+
sarvam.languageIdentification();
|
|
234
|
+
|
|
235
|
+
// Text-to-Speech
|
|
236
|
+
sarvam.speech("bulbul:v3", "ml-IN");
|
|
237
|
+
|
|
238
|
+
// Speech-to-Text + Transcribe to same language
|
|
239
|
+
sarvam.transcription("saarika:v2.5");
|
|
240
|
+
|
|
241
|
+
// Speech-to-Text + Translate to English
|
|
242
|
+
sarvam.speechTranslation("saaras:v3");
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
|
|
201
246
|
## Documentation
|
|
202
247
|
|
|
203
|
-
Please check out the **[Sarvam provider documentation](https://ai-sdk.dev/providers/ai-sdk-providers/sarvam)** and **[Sarvam API documentation](https://docs.sarvam.ai)** for more information.
|
|
248
|
+
Please check out the **[Sarvam provider documentation](https://v4.ai-sdk.dev/providers/ai-sdk-providers/sarvam)** and **[Sarvam API documentation](https://docs.sarvam.ai)** for more information.
|
package/dist/index.d.mts
CHANGED
|
@@ -2,7 +2,15 @@ import { LanguageModelV1, TranscriptionModelV1, SpeechModelV1 } from '@ai-sdk/pr
|
|
|
2
2
|
import { FetchFunction } from '@ai-sdk/provider-utils';
|
|
3
3
|
import { z } from 'zod';
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
/**
|
|
6
|
+
* @description Product models
|
|
7
|
+
*/
|
|
8
|
+
type SarvamChatModelId = "sarvam-30b" | "sarvam-30b-16k" | "sarvam-105b" | "sarvam-105b-32k" | SarvamChatLegacyModelId | (string & {});
|
|
9
|
+
/**
|
|
10
|
+
* @description Legacy models
|
|
11
|
+
* @deprecated
|
|
12
|
+
*/
|
|
13
|
+
type SarvamChatLegacyModelId = "sarvam-m";
|
|
6
14
|
interface SarvamChatSettings {
|
|
7
15
|
/**
|
|
8
16
|
* Whether to simulate artificial tool calling or JSON object generation, because Sarvam models don't support native Tool Calling or JSON Schema.
|
|
@@ -42,9 +50,9 @@ interface SarvamChatSettings {
|
|
|
42
50
|
type SarvamLanguageCode = z.infer<typeof SarvamLanguageCodeSchema>;
|
|
43
51
|
declare const SarvamLanguageCodeSchema: z.ZodEnum<["hi-IN", "bn-IN", "kn-IN", "ml-IN", "mr-IN", "od-IN", "pa-IN", "ta-IN", "te-IN", "en-IN", "gu-IN"]>;
|
|
44
52
|
|
|
45
|
-
type SarvamSpeechModelId = "bulbul:
|
|
53
|
+
type SarvamSpeechModelId = "bulbul:v2" | "bulbul:v3" | (string & {});
|
|
46
54
|
type SarvamSpeechVoices = z.infer<typeof SpeakerSchema>;
|
|
47
|
-
declare const SpeakerSchema: z.
|
|
55
|
+
declare const SpeakerSchema: z.ZodEnum<["abhilash", "karun", "hitesh", "anushka", "manisha", "vidya", "arya", "shubh", "aditya", "rahul", "rohan", "amit", "dev", "ratan", "varun", "manan", "sumit", "kabir", "aayan", "ashutosh", "advait", "anand", "tarun", "sunny", "mani", "gokul", "vijay", "mohit", "rehan", "soham", "ritu", "priya", "neha", "pooja", "simran", "kavya", "ishita", "shreya", "roopa", "amelia", "sophia", "tanya", "shruti", "suhani", "kavitha", "rupali"]>;
|
|
48
56
|
/**
|
|
49
57
|
* Configuration settings for Sarvam Text-to-Speech API.
|
|
50
58
|
*
|
|
@@ -56,10 +64,10 @@ type SarvamSpeechSettings = {
|
|
|
56
64
|
/**
|
|
57
65
|
* The speaker voice to be used for the output audio.
|
|
58
66
|
*
|
|
59
|
-
* @default
|
|
60
|
-
*
|
|
61
|
-
*
|
|
62
|
-
*
|
|
67
|
+
* @default
|
|
68
|
+
* - "shubh" (Male voice for bulbul:v3)
|
|
69
|
+
* - "anushka" (Female voice for bulbul:v2)
|
|
70
|
+
* - "meera" (Female voice for bulbul:v1)
|
|
63
71
|
*/
|
|
64
72
|
speaker?: SarvamSpeechVoices;
|
|
65
73
|
/**
|
|
@@ -103,11 +111,74 @@ type SarvamSpeechSettings = {
|
|
|
103
111
|
* @example false (Disable preprocessing)
|
|
104
112
|
*/
|
|
105
113
|
enable_preprocessing?: boolean;
|
|
114
|
+
/**
|
|
115
|
+
* Specifies the audio codec for the output audio file.
|
|
116
|
+
* Different codecs offer various compression and quality characteristics.
|
|
117
|
+
*/
|
|
118
|
+
output_audio_codec?: "mp3" | "linear16" | "mulaw" | "alaw" | "opus" | "flac" | "aac" | "wav";
|
|
119
|
+
/**
|
|
120
|
+
* Temperature controls how much randomness and expressiveness the TTS model uses while generating speech.
|
|
121
|
+
* Lower values produce more stable and consistent output,
|
|
122
|
+
* while higher values sound more expressive but may introduce artifacts or errors.
|
|
123
|
+
*
|
|
124
|
+
* Any number between 0.01 and 2 (inclusive)
|
|
125
|
+
* @default 0.6
|
|
126
|
+
*
|
|
127
|
+
* Note: This parameter is only supported for bulbul:v3. It has no effect on bulbul:v2.
|
|
128
|
+
*/
|
|
129
|
+
temperature?: number;
|
|
130
|
+
/**
|
|
131
|
+
* The ID of a pronunciation dictionary to apply during synthesis.
|
|
132
|
+
* When provided, matching words in the input text will be replaced with their custom pronunciations before generating speech.
|
|
133
|
+
*
|
|
134
|
+
* Only supported by bulbul:v3.
|
|
135
|
+
*/
|
|
136
|
+
dict_id?: string;
|
|
137
|
+
/**
|
|
138
|
+
* Enable caching for the request. When enabled, identical requests will return cached audio instead of regenerating.
|
|
139
|
+
*
|
|
140
|
+
* @default false
|
|
141
|
+
*
|
|
142
|
+
* Currently in beta and only available for bulbul:v1 and bulbul:v2 models.
|
|
143
|
+
*/
|
|
144
|
+
enable_cached_responses?: boolean;
|
|
106
145
|
};
|
|
107
146
|
|
|
108
|
-
type SarvamTranscriptionModelId = "
|
|
109
|
-
type SarvamSpeechTranslationModelId = "saaras:
|
|
147
|
+
type SarvamTranscriptionModelId = "saaras:v3" | "saarika:v2.5" | (string & {});
|
|
148
|
+
type SarvamSpeechTranslationModelId = "saaras:v3" | "saaras:v2.5" | (string & {});
|
|
149
|
+
declare const SarvamProviderOptionsSchema: z.ZodObject<{
|
|
150
|
+
mode: z.ZodDefault<z.ZodEnum<["transcribe", "translate", "verbatim", "translit", "codemix"]>>;
|
|
151
|
+
with_timestamps: z.ZodDefault<z.ZodOptional<z.ZodNullable<z.ZodBoolean>>>;
|
|
152
|
+
with_diarization: z.ZodDefault<z.ZodOptional<z.ZodNullable<z.ZodBoolean>>>;
|
|
153
|
+
num_speakers: z.ZodOptional<z.ZodNullable<z.ZodNumber>>;
|
|
154
|
+
}, "strip", z.ZodTypeAny, {
|
|
155
|
+
mode: "transcribe" | "translate" | "verbatim" | "translit" | "codemix";
|
|
156
|
+
with_timestamps: boolean | null;
|
|
157
|
+
with_diarization: boolean | null;
|
|
158
|
+
num_speakers?: number | null | undefined;
|
|
159
|
+
}, {
|
|
160
|
+
mode?: "transcribe" | "translate" | "verbatim" | "translit" | "codemix" | undefined;
|
|
161
|
+
with_timestamps?: boolean | null | undefined;
|
|
162
|
+
with_diarization?: boolean | null | undefined;
|
|
163
|
+
num_speakers?: number | null | undefined;
|
|
164
|
+
}>;
|
|
110
165
|
type SarvamTranscriptionCallOptions = {
|
|
166
|
+
/**
|
|
167
|
+
* @default "transcribe"
|
|
168
|
+
*
|
|
169
|
+
* @description
|
|
170
|
+
* - `transcribe`: Standard transcription in the original language, `output`: Text in source language
|
|
171
|
+
* - `translate`: Transcribe and translate to English, `output`: English text
|
|
172
|
+
* - `verbatim`: Word-for-word transcription including filler words and repetitions, `output`: Verbatim text in source language
|
|
173
|
+
* - `translit`: Transcribe and transliterate to Roman script, `output`: Romanized text
|
|
174
|
+
* - `codemix`: Transcribe code-mixed speech (e.g., Hindi-English) naturally, `output`: Code-mixed text
|
|
175
|
+
*/
|
|
176
|
+
mode?: z.infer<typeof SarvamProviderOptionsSchema.shape.mode>;
|
|
177
|
+
/**
|
|
178
|
+
* - Chunk-level timestamp support
|
|
179
|
+
* - Useful for subtitle alignment and audio navigation
|
|
180
|
+
* - Provides start and end times for each segment of text
|
|
181
|
+
*/
|
|
111
182
|
with_timestamps?: boolean;
|
|
112
183
|
/**
|
|
113
184
|
* Enables speaker diarization, which identifies and separates different speakers in the audio.
|
|
@@ -240,11 +311,19 @@ interface SarvamProvider {
|
|
|
240
311
|
/**
|
|
241
312
|
* Creates a model for text generation.
|
|
242
313
|
*/
|
|
243
|
-
(
|
|
314
|
+
(
|
|
315
|
+
/**
|
|
316
|
+
* @description Sarvam-M (24B) is now a legacy model. We recommend migrating to Sarvam-30B or Sarvam-105B for improved performance.
|
|
317
|
+
*/
|
|
318
|
+
modelId: SarvamChatModelId, settings?: SarvamChatSettings): LanguageModelV1;
|
|
244
319
|
/**
|
|
245
320
|
* Creates a Sarvam chat model for text generation.
|
|
246
321
|
*/
|
|
247
|
-
languageModel(
|
|
322
|
+
languageModel(
|
|
323
|
+
/**
|
|
324
|
+
* @description Sarvam-M (24B) is now a legacy model. We recommend migrating to Sarvam-30B or Sarvam-105B for improved performance.
|
|
325
|
+
*/
|
|
326
|
+
modelId: SarvamChatModelId, settings?: SarvamChatSettings): LanguageModelV1;
|
|
248
327
|
/**
|
|
249
328
|
* Creates a Sarvam model for transcription.
|
|
250
329
|
*/
|
package/dist/index.d.ts
CHANGED
|
@@ -2,7 +2,15 @@ import { LanguageModelV1, TranscriptionModelV1, SpeechModelV1 } from '@ai-sdk/pr
|
|
|
2
2
|
import { FetchFunction } from '@ai-sdk/provider-utils';
|
|
3
3
|
import { z } from 'zod';
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
/**
|
|
6
|
+
* @description Product models
|
|
7
|
+
*/
|
|
8
|
+
type SarvamChatModelId = "sarvam-30b" | "sarvam-30b-16k" | "sarvam-105b" | "sarvam-105b-32k" | SarvamChatLegacyModelId | (string & {});
|
|
9
|
+
/**
|
|
10
|
+
* @description Legacy models
|
|
11
|
+
* @deprecated
|
|
12
|
+
*/
|
|
13
|
+
type SarvamChatLegacyModelId = "sarvam-m";
|
|
6
14
|
interface SarvamChatSettings {
|
|
7
15
|
/**
|
|
8
16
|
* Whether to simulate artificial tool calling or JSON object generation, because Sarvam models don't support native Tool Calling or JSON Schema.
|
|
@@ -42,9 +50,9 @@ interface SarvamChatSettings {
|
|
|
42
50
|
type SarvamLanguageCode = z.infer<typeof SarvamLanguageCodeSchema>;
|
|
43
51
|
declare const SarvamLanguageCodeSchema: z.ZodEnum<["hi-IN", "bn-IN", "kn-IN", "ml-IN", "mr-IN", "od-IN", "pa-IN", "ta-IN", "te-IN", "en-IN", "gu-IN"]>;
|
|
44
52
|
|
|
45
|
-
type SarvamSpeechModelId = "bulbul:
|
|
53
|
+
type SarvamSpeechModelId = "bulbul:v2" | "bulbul:v3" | (string & {});
|
|
46
54
|
type SarvamSpeechVoices = z.infer<typeof SpeakerSchema>;
|
|
47
|
-
declare const SpeakerSchema: z.
|
|
55
|
+
declare const SpeakerSchema: z.ZodEnum<["abhilash", "karun", "hitesh", "anushka", "manisha", "vidya", "arya", "shubh", "aditya", "rahul", "rohan", "amit", "dev", "ratan", "varun", "manan", "sumit", "kabir", "aayan", "ashutosh", "advait", "anand", "tarun", "sunny", "mani", "gokul", "vijay", "mohit", "rehan", "soham", "ritu", "priya", "neha", "pooja", "simran", "kavya", "ishita", "shreya", "roopa", "amelia", "sophia", "tanya", "shruti", "suhani", "kavitha", "rupali"]>;
|
|
48
56
|
/**
|
|
49
57
|
* Configuration settings for Sarvam Text-to-Speech API.
|
|
50
58
|
*
|
|
@@ -56,10 +64,10 @@ type SarvamSpeechSettings = {
|
|
|
56
64
|
/**
|
|
57
65
|
* The speaker voice to be used for the output audio.
|
|
58
66
|
*
|
|
59
|
-
* @default
|
|
60
|
-
*
|
|
61
|
-
*
|
|
62
|
-
*
|
|
67
|
+
* @default
|
|
68
|
+
* - "shubh" (Male voice for bulbul:v3)
|
|
69
|
+
* - "anushka" (Female voice for bulbul:v2)
|
|
70
|
+
* - "meera" (Female voice for bulbul:v1)
|
|
63
71
|
*/
|
|
64
72
|
speaker?: SarvamSpeechVoices;
|
|
65
73
|
/**
|
|
@@ -103,11 +111,74 @@ type SarvamSpeechSettings = {
|
|
|
103
111
|
* @example false (Disable preprocessing)
|
|
104
112
|
*/
|
|
105
113
|
enable_preprocessing?: boolean;
|
|
114
|
+
/**
|
|
115
|
+
* Specifies the audio codec for the output audio file.
|
|
116
|
+
* Different codecs offer various compression and quality characteristics.
|
|
117
|
+
*/
|
|
118
|
+
output_audio_codec?: "mp3" | "linear16" | "mulaw" | "alaw" | "opus" | "flac" | "aac" | "wav";
|
|
119
|
+
/**
|
|
120
|
+
* Temperature controls how much randomness and expressiveness the TTS model uses while generating speech.
|
|
121
|
+
* Lower values produce more stable and consistent output,
|
|
122
|
+
* while higher values sound more expressive but may introduce artifacts or errors.
|
|
123
|
+
*
|
|
124
|
+
* Any number between 0.01 and 2 (inclusive)
|
|
125
|
+
* @default 0.6
|
|
126
|
+
*
|
|
127
|
+
* Note: This parameter is only supported for bulbul:v3. It has no effect on bulbul:v2.
|
|
128
|
+
*/
|
|
129
|
+
temperature?: number;
|
|
130
|
+
/**
|
|
131
|
+
* The ID of a pronunciation dictionary to apply during synthesis.
|
|
132
|
+
* When provided, matching words in the input text will be replaced with their custom pronunciations before generating speech.
|
|
133
|
+
*
|
|
134
|
+
* Only supported by bulbul:v3.
|
|
135
|
+
*/
|
|
136
|
+
dict_id?: string;
|
|
137
|
+
/**
|
|
138
|
+
* Enable caching for the request. When enabled, identical requests will return cached audio instead of regenerating.
|
|
139
|
+
*
|
|
140
|
+
* @default false
|
|
141
|
+
*
|
|
142
|
+
* Currently in beta and only available for bulbul:v1 and bulbul:v2 models.
|
|
143
|
+
*/
|
|
144
|
+
enable_cached_responses?: boolean;
|
|
106
145
|
};
|
|
107
146
|
|
|
108
|
-
type SarvamTranscriptionModelId = "
|
|
109
|
-
type SarvamSpeechTranslationModelId = "saaras:
|
|
147
|
+
type SarvamTranscriptionModelId = "saaras:v3" | "saarika:v2.5" | (string & {});
|
|
148
|
+
type SarvamSpeechTranslationModelId = "saaras:v3" | "saaras:v2.5" | (string & {});
|
|
149
|
+
declare const SarvamProviderOptionsSchema: z.ZodObject<{
|
|
150
|
+
mode: z.ZodDefault<z.ZodEnum<["transcribe", "translate", "verbatim", "translit", "codemix"]>>;
|
|
151
|
+
with_timestamps: z.ZodDefault<z.ZodOptional<z.ZodNullable<z.ZodBoolean>>>;
|
|
152
|
+
with_diarization: z.ZodDefault<z.ZodOptional<z.ZodNullable<z.ZodBoolean>>>;
|
|
153
|
+
num_speakers: z.ZodOptional<z.ZodNullable<z.ZodNumber>>;
|
|
154
|
+
}, "strip", z.ZodTypeAny, {
|
|
155
|
+
mode: "transcribe" | "translate" | "verbatim" | "translit" | "codemix";
|
|
156
|
+
with_timestamps: boolean | null;
|
|
157
|
+
with_diarization: boolean | null;
|
|
158
|
+
num_speakers?: number | null | undefined;
|
|
159
|
+
}, {
|
|
160
|
+
mode?: "transcribe" | "translate" | "verbatim" | "translit" | "codemix" | undefined;
|
|
161
|
+
with_timestamps?: boolean | null | undefined;
|
|
162
|
+
with_diarization?: boolean | null | undefined;
|
|
163
|
+
num_speakers?: number | null | undefined;
|
|
164
|
+
}>;
|
|
110
165
|
type SarvamTranscriptionCallOptions = {
|
|
166
|
+
/**
|
|
167
|
+
* @default "transcribe"
|
|
168
|
+
*
|
|
169
|
+
* @description
|
|
170
|
+
* - `transcribe`: Standard transcription in the original language, `output`: Text in source language
|
|
171
|
+
* - `translate`: Transcribe and translate to English, `output`: English text
|
|
172
|
+
* - `verbatim`: Word-for-word transcription including filler words and repetitions, `output`: Verbatim text in source language
|
|
173
|
+
* - `translit`: Transcribe and transliterate to Roman script, `output`: Romanized text
|
|
174
|
+
* - `codemix`: Transcribe code-mixed speech (e.g., Hindi-English) naturally, `output`: Code-mixed text
|
|
175
|
+
*/
|
|
176
|
+
mode?: z.infer<typeof SarvamProviderOptionsSchema.shape.mode>;
|
|
177
|
+
/**
|
|
178
|
+
* - Chunk-level timestamp support
|
|
179
|
+
* - Useful for subtitle alignment and audio navigation
|
|
180
|
+
* - Provides start and end times for each segment of text
|
|
181
|
+
*/
|
|
111
182
|
with_timestamps?: boolean;
|
|
112
183
|
/**
|
|
113
184
|
* Enables speaker diarization, which identifies and separates different speakers in the audio.
|
|
@@ -240,11 +311,19 @@ interface SarvamProvider {
|
|
|
240
311
|
/**
|
|
241
312
|
* Creates a model for text generation.
|
|
242
313
|
*/
|
|
243
|
-
(
|
|
314
|
+
(
|
|
315
|
+
/**
|
|
316
|
+
* @description Sarvam-M (24B) is now a legacy model. We recommend migrating to Sarvam-30B or Sarvam-105B for improved performance.
|
|
317
|
+
*/
|
|
318
|
+
modelId: SarvamChatModelId, settings?: SarvamChatSettings): LanguageModelV1;
|
|
244
319
|
/**
|
|
245
320
|
* Creates a Sarvam chat model for text generation.
|
|
246
321
|
*/
|
|
247
|
-
languageModel(
|
|
322
|
+
languageModel(
|
|
323
|
+
/**
|
|
324
|
+
* @description Sarvam-M (24B) is now a legacy model. We recommend migrating to Sarvam-30B or Sarvam-105B for improved performance.
|
|
325
|
+
*/
|
|
326
|
+
modelId: SarvamChatModelId, settings?: SarvamChatSettings): LanguageModelV1;
|
|
248
327
|
/**
|
|
249
328
|
* Creates a Sarvam model for transcription.
|
|
250
329
|
*/
|
package/dist/index.js
CHANGED
|
@@ -767,30 +767,72 @@ var sarvamChatChunkSchema = import_zod2.z.union([
|
|
|
767
767
|
|
|
768
768
|
// src/sarvam-speech-model.ts
|
|
769
769
|
var import_provider_utils5 = require("@ai-sdk/provider-utils");
|
|
770
|
+
var import_zod4 = require("zod");
|
|
770
771
|
|
|
771
772
|
// src/sarvam-speech-settings.ts
|
|
772
773
|
var import_zod3 = require("zod");
|
|
773
774
|
var SpeakerSchema = import_zod3.z.enum([
|
|
774
|
-
|
|
775
|
-
"pavithra",
|
|
776
|
-
"maitreyi",
|
|
777
|
-
"arvind",
|
|
778
|
-
"amol",
|
|
779
|
-
"amartya",
|
|
780
|
-
"diya",
|
|
781
|
-
"neel",
|
|
782
|
-
"misha",
|
|
783
|
-
"vian",
|
|
784
|
-
"arjun",
|
|
785
|
-
"maya",
|
|
786
|
-
"anushka",
|
|
775
|
+
// male bulbul:v2
|
|
787
776
|
"abhilash",
|
|
777
|
+
"karun",
|
|
778
|
+
"hitesh",
|
|
779
|
+
// female bulbul:v2
|
|
780
|
+
"anushka",
|
|
788
781
|
"manisha",
|
|
789
782
|
"vidya",
|
|
790
783
|
"arya",
|
|
791
|
-
|
|
792
|
-
"
|
|
793
|
-
|
|
784
|
+
// male bulbul:v3
|
|
785
|
+
"shubh",
|
|
786
|
+
"aditya",
|
|
787
|
+
"rahul",
|
|
788
|
+
"rohan",
|
|
789
|
+
"amit",
|
|
790
|
+
"dev",
|
|
791
|
+
"ratan",
|
|
792
|
+
"varun",
|
|
793
|
+
"manan",
|
|
794
|
+
"sumit",
|
|
795
|
+
"kabir",
|
|
796
|
+
"aayan",
|
|
797
|
+
"ashutosh",
|
|
798
|
+
"advait",
|
|
799
|
+
"anand",
|
|
800
|
+
"tarun",
|
|
801
|
+
"sunny",
|
|
802
|
+
"mani",
|
|
803
|
+
"gokul",
|
|
804
|
+
"vijay",
|
|
805
|
+
"mohit",
|
|
806
|
+
"rehan",
|
|
807
|
+
"soham",
|
|
808
|
+
// female bulbul:v3
|
|
809
|
+
"ritu",
|
|
810
|
+
"priya",
|
|
811
|
+
"neha",
|
|
812
|
+
"pooja",
|
|
813
|
+
"simran",
|
|
814
|
+
"kavya",
|
|
815
|
+
"ishita",
|
|
816
|
+
"shreya",
|
|
817
|
+
"roopa",
|
|
818
|
+
"amelia",
|
|
819
|
+
"sophia",
|
|
820
|
+
"tanya",
|
|
821
|
+
"shruti",
|
|
822
|
+
"suhani",
|
|
823
|
+
"kavitha",
|
|
824
|
+
"rupali"
|
|
825
|
+
]);
|
|
826
|
+
var outputAudioCodecSchema = import_zod3.z.enum([
|
|
827
|
+
"mp3",
|
|
828
|
+
"linear16",
|
|
829
|
+
"mulaw",
|
|
830
|
+
"alaw",
|
|
831
|
+
"opus",
|
|
832
|
+
"flac",
|
|
833
|
+
"aac",
|
|
834
|
+
"wav"
|
|
835
|
+
]);
|
|
794
836
|
var SarvamProviderOptionsSchema = import_zod3.z.object({
|
|
795
837
|
speaker: SpeakerSchema,
|
|
796
838
|
pitch: import_zod3.z.number().min(-0.75).max(0.75).default(0),
|
|
@@ -802,11 +844,14 @@ var SarvamProviderOptionsSchema = import_zod3.z.object({
|
|
|
802
844
|
import_zod3.z.literal(22050),
|
|
803
845
|
import_zod3.z.literal(24e3)
|
|
804
846
|
]).default(22050),
|
|
805
|
-
enable_preprocessing: import_zod3.z.boolean().default(false)
|
|
847
|
+
enable_preprocessing: import_zod3.z.boolean().default(false),
|
|
848
|
+
output_audio_codec: outputAudioCodecSchema.optional(),
|
|
849
|
+
temperature: import_zod3.z.number().min(0.01).max(2).default(0.6),
|
|
850
|
+
dict_id: import_zod3.z.string().optional(),
|
|
851
|
+
enable_cached_responses: import_zod3.z.boolean().default(false)
|
|
806
852
|
}).partial();
|
|
807
853
|
|
|
808
854
|
// src/sarvam-speech-model.ts
|
|
809
|
-
var import_zod4 = require("zod");
|
|
810
855
|
var SarvamSpeechModel = class {
|
|
811
856
|
constructor(modelId, languageCode, config) {
|
|
812
857
|
this.modelId = modelId;
|
|
@@ -821,7 +866,7 @@ var SarvamSpeechModel = class {
|
|
|
821
866
|
text,
|
|
822
867
|
voice,
|
|
823
868
|
outputFormat = "wav",
|
|
824
|
-
|
|
869
|
+
speed,
|
|
825
870
|
// instructions,
|
|
826
871
|
providerOptions
|
|
827
872
|
}) {
|
|
@@ -837,48 +882,42 @@ var SarvamSpeechModel = class {
|
|
|
837
882
|
schema: SarvamProviderOptionsSchema
|
|
838
883
|
});
|
|
839
884
|
const getSpeaker = () => {
|
|
840
|
-
if (sarvamOptions == null ? void 0 : sarvamOptions.speaker) return sarvamOptions.speaker;
|
|
841
885
|
if (voice) {
|
|
842
886
|
return SpeakerSchema.parse(voice);
|
|
843
887
|
}
|
|
844
888
|
switch (this.modelId) {
|
|
845
|
-
case "bulbul:v1":
|
|
846
|
-
return "meera";
|
|
847
889
|
case "bulbul:v2":
|
|
848
890
|
return "manisha";
|
|
891
|
+
case "bulbul:v3":
|
|
892
|
+
return "shubh";
|
|
849
893
|
}
|
|
850
|
-
return "
|
|
894
|
+
return "shubh";
|
|
851
895
|
};
|
|
852
896
|
const requestBody = {
|
|
853
897
|
model: this.modelId,
|
|
854
898
|
text,
|
|
855
899
|
target_language_code: this.languageCode,
|
|
856
|
-
speaker: getSpeaker()
|
|
857
|
-
|
|
858
|
-
// speed,
|
|
859
|
-
// instructions,
|
|
900
|
+
speaker: getSpeaker(),
|
|
901
|
+
pace: speed
|
|
860
902
|
};
|
|
861
903
|
if (outputFormat) {
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
requestBody.response_format = outputFormat;
|
|
904
|
+
const of = outputAudioCodecSchema.safeParse(outputFormat);
|
|
905
|
+
if (of.success) {
|
|
906
|
+
requestBody.output_audio_codec = of.data;
|
|
866
907
|
} else {
|
|
867
908
|
warnings.push({
|
|
868
909
|
type: "unsupported-setting",
|
|
869
910
|
setting: "outputFormat",
|
|
870
|
-
details: `Unsupported output format: ${outputFormat}. Using
|
|
911
|
+
details: `Unsupported output format: ${outputFormat}. Using wav instead.`
|
|
871
912
|
});
|
|
872
913
|
}
|
|
873
914
|
}
|
|
874
915
|
if (sarvamOptions) {
|
|
875
|
-
|
|
876
|
-
for (const key in speechModelOptions) {
|
|
877
|
-
const value = speechModelOptions[key];
|
|
916
|
+
Object.entries(sarvamOptions).forEach(([key, value]) => {
|
|
878
917
|
if (value !== void 0) {
|
|
879
918
|
requestBody[key] = value;
|
|
880
919
|
}
|
|
881
|
-
}
|
|
920
|
+
});
|
|
882
921
|
}
|
|
883
922
|
return {
|
|
884
923
|
requestBody,
|
|
@@ -934,6 +973,7 @@ var import_zod6 = require("zod");
|
|
|
934
973
|
// src/sarvam-transcription-settings.ts
|
|
935
974
|
var import_zod5 = require("zod");
|
|
936
975
|
var SarvamProviderOptionsSchema2 = import_zod5.z.object({
|
|
976
|
+
mode: import_zod5.z.enum(["transcribe", "translate", "verbatim", "translit", "codemix"]).default("transcribe"),
|
|
937
977
|
with_timestamps: import_zod5.z.boolean().nullish().default(false),
|
|
938
978
|
with_diarization: import_zod5.z.boolean().nullish().default(false),
|
|
939
979
|
num_speakers: import_zod5.z.number().int().nullish()
|
|
@@ -956,10 +996,6 @@ var SarvamTranscriptionModel = class {
|
|
|
956
996
|
providerOptions
|
|
957
997
|
}) {
|
|
958
998
|
const warnings = [];
|
|
959
|
-
if (this.modelId === "saarika:v1" && this.languageCode === "unknown")
|
|
960
|
-
throw new Error(
|
|
961
|
-
"Language code unknown is not supported for model saarika:v1"
|
|
962
|
-
);
|
|
963
999
|
const sarvamOptions = (0, import_provider_utils6.parseProviderOptions)({
|
|
964
1000
|
provider: "sarvam",
|
|
965
1001
|
providerOptions: {
|
|
@@ -974,22 +1010,13 @@ var SarvamTranscriptionModel = class {
|
|
|
974
1010
|
const blob = audio instanceof Blob ? audio : new Blob([audio], { type: mediaType });
|
|
975
1011
|
formData.append("file", blob);
|
|
976
1012
|
formData.append("model", this.modelId);
|
|
1013
|
+
formData.append("language_code", this.languageCode);
|
|
977
1014
|
if (sarvamOptions) {
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
);
|
|
983
|
-
formData.append(
|
|
984
|
-
"with_diarization",
|
|
985
|
-
sarvamOptions.with_diarization ? "true" : "false"
|
|
986
|
-
);
|
|
987
|
-
if (sarvamOptions.num_speakers !== null && sarvamOptions.num_speakers !== void 0) {
|
|
988
|
-
formData.append(
|
|
989
|
-
"num_speakers",
|
|
990
|
-
sarvamOptions.num_speakers.toString()
|
|
991
|
-
);
|
|
992
|
-
}
|
|
1015
|
+
Object.entries(sarvamOptions).forEach(([key, value]) => {
|
|
1016
|
+
if (value) {
|
|
1017
|
+
formData.append(key, String(value));
|
|
1018
|
+
}
|
|
1019
|
+
});
|
|
993
1020
|
}
|
|
994
1021
|
return {
|
|
995
1022
|
formData,
|