sarvam-ai-sdk 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +30 -1
- package/dist/index.d.mts +32 -1
- package/dist/index.d.ts +32 -1
- package/dist/index.js +32 -35
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +32 -35
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -65,7 +65,7 @@ import { experimental_transcribe as transcribe } from "ai";
|
|
|
65
65
|
import { readFile } from "fs/promises";
|
|
66
66
|
|
|
67
67
|
const { text } = await transcribe({
|
|
68
|
-
model: sarvam.transcription("saarika:v2.5", "ml-IN")
|
|
68
|
+
model: sarvam.transcription("saarika:v2.5", "ml-IN"),
|
|
69
69
|
audio: await readFile("./src/transcript-test.wav"),
|
|
70
70
|
});
|
|
71
71
|
|
|
@@ -214,6 +214,35 @@ const { object } = await generateObject({
|
|
|
214
214
|
console.log(object);
|
|
215
215
|
```
|
|
216
216
|
|
|
217
|
+
## All APIs
|
|
218
|
+
|
|
219
|
+
```ts
|
|
220
|
+
import { sarvam } from "sarvam-ai-sdk";
|
|
221
|
+
|
|
222
|
+
// Text-to-Text + Chat Completion
|
|
223
|
+
sarvam("sarvam-105b");
|
|
224
|
+
sarvam.languageModel("sarvam-30b");
|
|
225
|
+
|
|
226
|
+
// Text-to-Text + Transliteration
|
|
227
|
+
sarvam.transliterate({ from: "en-IN", to: "ml-IN" });
|
|
228
|
+
|
|
229
|
+
// Text-to-Text + Translation
|
|
230
|
+
sarvam.translation({ from: "en-IN", to: "ml-IN" });
|
|
231
|
+
|
|
232
|
+
// Text-to-Text + Language identification
|
|
233
|
+
sarvam.languageIdentification();
|
|
234
|
+
|
|
235
|
+
// Text-to-Speech
|
|
236
|
+
sarvam.speech("bulbul:v3", "ml-IN");
|
|
237
|
+
|
|
238
|
+
// Speech-to-Text + Transcribe to same language
|
|
239
|
+
sarvam.transcription("saarika:v2.5");
|
|
240
|
+
|
|
241
|
+
// Speech-to-Text + Translate to English
|
|
242
|
+
sarvam.speechTranslation("saaras:v3");
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
|
|
217
246
|
## Documentation
|
|
218
247
|
|
|
219
248
|
Please check out the **[Sarvam provider documentation](https://v4.ai-sdk.dev/providers/ai-sdk-providers/sarvam)** and **[Sarvam API documentation](https://docs.sarvam.ai)** for more information.
|
package/dist/index.d.mts
CHANGED
|
@@ -52,7 +52,7 @@ declare const SarvamLanguageCodeSchema: z.ZodEnum<["hi-IN", "bn-IN", "kn-IN", "m
|
|
|
52
52
|
|
|
53
53
|
type SarvamSpeechModelId = "bulbul:v2" | "bulbul:v3" | (string & {});
|
|
54
54
|
type SarvamSpeechVoices = z.infer<typeof SpeakerSchema>;
|
|
55
|
-
declare const SpeakerSchema: z.
|
|
55
|
+
declare const SpeakerSchema: z.ZodEnum<["abhilash", "karun", "hitesh", "anushka", "manisha", "vidya", "arya", "shubh", "aditya", "rahul", "rohan", "amit", "dev", "ratan", "varun", "manan", "sumit", "kabir", "aayan", "ashutosh", "advait", "anand", "tarun", "sunny", "mani", "gokul", "vijay", "mohit", "rehan", "soham", "ritu", "priya", "neha", "pooja", "simran", "kavya", "ishita", "shreya", "roopa", "amelia", "sophia", "tanya", "shruti", "suhani", "kavitha", "rupali"]>;
|
|
56
56
|
/**
|
|
57
57
|
* Configuration settings for Sarvam Text-to-Speech API.
|
|
58
58
|
*
|
|
@@ -111,6 +111,37 @@ type SarvamSpeechSettings = {
|
|
|
111
111
|
* @example false (Disable preprocessing)
|
|
112
112
|
*/
|
|
113
113
|
enable_preprocessing?: boolean;
|
|
114
|
+
/**
|
|
115
|
+
* Specifies the audio codec for the output audio file.
|
|
116
|
+
* Different codecs offer various compression and quality characteristics.
|
|
117
|
+
*/
|
|
118
|
+
output_audio_codec?: "mp3" | "linear16" | "mulaw" | "alaw" | "opus" | "flac" | "aac" | "wav";
|
|
119
|
+
/**
|
|
120
|
+
* Temperature controls how much randomness and expressiveness the TTS model uses while generating speech.
|
|
121
|
+
* Lower values produce more stable and consistent output,
|
|
122
|
+
* while higher values sound more expressive but may introduce artifacts or errors.
|
|
123
|
+
*
|
|
124
|
+
* Any number between 0.01 and 2 (inclusive).
|
|
125
|
+
* @default 0.6
|
|
126
|
+
*
|
|
127
|
+
* Note: This parameter is only supported for bulbul:v3. It has no effect on bulbul:v2.
|
|
128
|
+
*/
|
|
129
|
+
temperature?: number;
|
|
130
|
+
/**
|
|
131
|
+
* The ID of a pronunciation dictionary to apply during synthesis.
|
|
132
|
+
* When provided, matching words in the input text will be replaced with their custom pronunciations before generating speech.
|
|
133
|
+
*
|
|
134
|
+
* Only supported by bulbul:v3.
|
|
135
|
+
*/
|
|
136
|
+
dict_id?: string;
|
|
137
|
+
/**
|
|
138
|
+
* Enable caching for the request. When enabled, identical requests will return cached audio instead of regenerating.
|
|
139
|
+
*
|
|
140
|
+
* @default false
|
|
141
|
+
*
|
|
142
|
+
* Currently in beta and only available for bulbul:v1 and bulbul:v2 models.
|
|
143
|
+
*/
|
|
144
|
+
enable_cached_responses?: boolean;
|
|
114
145
|
};
|
|
115
146
|
|
|
116
147
|
type SarvamTranscriptionModelId = "saaras:v3" | "saarika:v2.5" | (string & {});
|
package/dist/index.d.ts
CHANGED
|
@@ -52,7 +52,7 @@ declare const SarvamLanguageCodeSchema: z.ZodEnum<["hi-IN", "bn-IN", "kn-IN", "m
|
|
|
52
52
|
|
|
53
53
|
type SarvamSpeechModelId = "bulbul:v2" | "bulbul:v3" | (string & {});
|
|
54
54
|
type SarvamSpeechVoices = z.infer<typeof SpeakerSchema>;
|
|
55
|
-
declare const SpeakerSchema: z.
|
|
55
|
+
declare const SpeakerSchema: z.ZodEnum<["abhilash", "karun", "hitesh", "anushka", "manisha", "vidya", "arya", "shubh", "aditya", "rahul", "rohan", "amit", "dev", "ratan", "varun", "manan", "sumit", "kabir", "aayan", "ashutosh", "advait", "anand", "tarun", "sunny", "mani", "gokul", "vijay", "mohit", "rehan", "soham", "ritu", "priya", "neha", "pooja", "simran", "kavya", "ishita", "shreya", "roopa", "amelia", "sophia", "tanya", "shruti", "suhani", "kavitha", "rupali"]>;
|
|
56
56
|
/**
|
|
57
57
|
* Configuration settings for Sarvam Text-to-Speech API.
|
|
58
58
|
*
|
|
@@ -111,6 +111,37 @@ type SarvamSpeechSettings = {
|
|
|
111
111
|
* @example false (Disable preprocessing)
|
|
112
112
|
*/
|
|
113
113
|
enable_preprocessing?: boolean;
|
|
114
|
+
/**
|
|
115
|
+
* Specifies the audio codec for the output audio file.
|
|
116
|
+
* Different codecs offer various compression and quality characteristics.
|
|
117
|
+
*/
|
|
118
|
+
output_audio_codec?: "mp3" | "linear16" | "mulaw" | "alaw" | "opus" | "flac" | "aac" | "wav";
|
|
119
|
+
/**
|
|
120
|
+
* Temperature controls how much randomness and expressiveness the TTS model uses while generating speech.
|
|
121
|
+
* Lower values produce more stable and consistent output,
|
|
122
|
+
* while higher values sound more expressive but may introduce artifacts or errors.
|
|
123
|
+
*
|
|
124
|
+
* Any number between 0.01 and 2 (inclusive).
|
|
125
|
+
* @default 0.6
|
|
126
|
+
*
|
|
127
|
+
* Note: This parameter is only supported for bulbul:v3. It has no effect on bulbul:v2.
|
|
128
|
+
*/
|
|
129
|
+
temperature?: number;
|
|
130
|
+
/**
|
|
131
|
+
* The ID of a pronunciation dictionary to apply during synthesis.
|
|
132
|
+
* When provided, matching words in the input text will be replaced with their custom pronunciations before generating speech.
|
|
133
|
+
*
|
|
134
|
+
* Only supported by bulbul:v3.
|
|
135
|
+
*/
|
|
136
|
+
dict_id?: string;
|
|
137
|
+
/**
|
|
138
|
+
* Enable caching for the request. When enabled, identical requests will return cached audio instead of regenerating.
|
|
139
|
+
*
|
|
140
|
+
* @default false
|
|
141
|
+
*
|
|
142
|
+
* Currently in beta and only available for bulbul:v1 and bulbul:v2 models.
|
|
143
|
+
*/
|
|
144
|
+
enable_cached_responses?: boolean;
|
|
114
145
|
};
|
|
115
146
|
|
|
116
147
|
type SarvamTranscriptionModelId = "saaras:v3" | "saarika:v2.5" | (string & {});
|
package/dist/index.js
CHANGED
|
@@ -767,6 +767,7 @@ var sarvamChatChunkSchema = import_zod2.z.union([
|
|
|
767
767
|
|
|
768
768
|
// src/sarvam-speech-model.ts
|
|
769
769
|
var import_provider_utils5 = require("@ai-sdk/provider-utils");
|
|
770
|
+
var import_zod4 = require("zod");
|
|
770
771
|
|
|
771
772
|
// src/sarvam-speech-settings.ts
|
|
772
773
|
var import_zod3 = require("zod");
|
|
@@ -821,7 +822,17 @@ var SpeakerSchema = import_zod3.z.enum([
|
|
|
821
822
|
"suhani",
|
|
822
823
|
"kavitha",
|
|
823
824
|
"rupali"
|
|
824
|
-
])
|
|
825
|
+
]);
|
|
826
|
+
var outputAudioCodecSchema = import_zod3.z.enum([
|
|
827
|
+
"mp3",
|
|
828
|
+
"linear16",
|
|
829
|
+
"mulaw",
|
|
830
|
+
"alaw",
|
|
831
|
+
"opus",
|
|
832
|
+
"flac",
|
|
833
|
+
"aac",
|
|
834
|
+
"wav"
|
|
835
|
+
]);
|
|
825
836
|
var SarvamProviderOptionsSchema = import_zod3.z.object({
|
|
826
837
|
speaker: SpeakerSchema,
|
|
827
838
|
pitch: import_zod3.z.number().min(-0.75).max(0.75).default(0),
|
|
@@ -833,11 +844,14 @@ var SarvamProviderOptionsSchema = import_zod3.z.object({
|
|
|
833
844
|
import_zod3.z.literal(22050),
|
|
834
845
|
import_zod3.z.literal(24e3)
|
|
835
846
|
]).default(22050),
|
|
836
|
-
enable_preprocessing: import_zod3.z.boolean().default(false)
|
|
847
|
+
enable_preprocessing: import_zod3.z.boolean().default(false),
|
|
848
|
+
output_audio_codec: outputAudioCodecSchema.optional(),
|
|
849
|
+
temperature: import_zod3.z.number().min(0.01).max(2).default(0.6),
|
|
850
|
+
dict_id: import_zod3.z.string().optional(),
|
|
851
|
+
enable_cached_responses: import_zod3.z.boolean().default(false)
|
|
837
852
|
}).partial();
|
|
838
853
|
|
|
839
854
|
// src/sarvam-speech-model.ts
|
|
840
|
-
var import_zod4 = require("zod");
|
|
841
855
|
var SarvamSpeechModel = class {
|
|
842
856
|
constructor(modelId, languageCode, config) {
|
|
843
857
|
this.modelId = modelId;
|
|
@@ -852,7 +866,7 @@ var SarvamSpeechModel = class {
|
|
|
852
866
|
text,
|
|
853
867
|
voice,
|
|
854
868
|
outputFormat = "wav",
|
|
855
|
-
|
|
869
|
+
speed,
|
|
856
870
|
// instructions,
|
|
857
871
|
providerOptions
|
|
858
872
|
}) {
|
|
@@ -868,7 +882,6 @@ var SarvamSpeechModel = class {
|
|
|
868
882
|
schema: SarvamProviderOptionsSchema
|
|
869
883
|
});
|
|
870
884
|
const getSpeaker = () => {
|
|
871
|
-
if (sarvamOptions == null ? void 0 : sarvamOptions.speaker) return sarvamOptions.speaker;
|
|
872
885
|
if (voice) {
|
|
873
886
|
return SpeakerSchema.parse(voice);
|
|
874
887
|
}
|
|
@@ -884,32 +897,27 @@ var SarvamSpeechModel = class {
|
|
|
884
897
|
model: this.modelId,
|
|
885
898
|
text,
|
|
886
899
|
target_language_code: this.languageCode,
|
|
887
|
-
speaker: getSpeaker()
|
|
888
|
-
|
|
889
|
-
// speed,
|
|
890
|
-
// instructions,
|
|
900
|
+
speaker: getSpeaker(),
|
|
901
|
+
pace: speed
|
|
891
902
|
};
|
|
892
903
|
if (outputFormat) {
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
requestBody.response_format = outputFormat;
|
|
904
|
+
const of = outputAudioCodecSchema.safeParse(outputFormat);
|
|
905
|
+
if (of.success) {
|
|
906
|
+
requestBody.output_audio_codec = of.data;
|
|
897
907
|
} else {
|
|
898
908
|
warnings.push({
|
|
899
909
|
type: "unsupported-setting",
|
|
900
910
|
setting: "outputFormat",
|
|
901
|
-
details: `Unsupported output format: ${outputFormat}. Using
|
|
911
|
+
details: `Unsupported output format: ${outputFormat}. Using wav instead.`
|
|
902
912
|
});
|
|
903
913
|
}
|
|
904
914
|
}
|
|
905
915
|
if (sarvamOptions) {
|
|
906
|
-
|
|
907
|
-
for (const key in speechModelOptions) {
|
|
908
|
-
const value = speechModelOptions[key];
|
|
916
|
+
Object.entries(sarvamOptions).forEach(([key, value]) => {
|
|
909
917
|
if (value !== void 0) {
|
|
910
918
|
requestBody[key] = value;
|
|
911
919
|
}
|
|
912
|
-
}
|
|
920
|
+
});
|
|
913
921
|
}
|
|
914
922
|
return {
|
|
915
923
|
requestBody,
|
|
@@ -987,7 +995,6 @@ var SarvamTranscriptionModel = class {
|
|
|
987
995
|
mediaType,
|
|
988
996
|
providerOptions
|
|
989
997
|
}) {
|
|
990
|
-
var _a;
|
|
991
998
|
const warnings = [];
|
|
992
999
|
const sarvamOptions = (0, import_provider_utils6.parseProviderOptions)({
|
|
993
1000
|
provider: "sarvam",
|
|
@@ -1003,23 +1010,13 @@ var SarvamTranscriptionModel = class {
|
|
|
1003
1010
|
const blob = audio instanceof Blob ? audio : new Blob([audio], { type: mediaType });
|
|
1004
1011
|
formData.append("file", blob);
|
|
1005
1012
|
formData.append("model", this.modelId);
|
|
1013
|
+
formData.append("language_code", this.languageCode);
|
|
1006
1014
|
if (sarvamOptions) {
|
|
1007
|
-
|
|
1008
|
-
|
|
1009
|
-
|
|
1010
|
-
|
|
1011
|
-
|
|
1012
|
-
);
|
|
1013
|
-
formData.append(
|
|
1014
|
-
"with_diarization",
|
|
1015
|
-
sarvamOptions.with_diarization ? "true" : "false"
|
|
1016
|
-
);
|
|
1017
|
-
if (sarvamOptions.num_speakers !== null && sarvamOptions.num_speakers !== void 0) {
|
|
1018
|
-
formData.append(
|
|
1019
|
-
"num_speakers",
|
|
1020
|
-
sarvamOptions.num_speakers.toString()
|
|
1021
|
-
);
|
|
1022
|
-
}
|
|
1015
|
+
Object.entries(sarvamOptions).forEach(([key, value]) => {
|
|
1016
|
+
if (value) {
|
|
1017
|
+
formData.append(key, String(value));
|
|
1018
|
+
}
|
|
1019
|
+
});
|
|
1023
1020
|
}
|
|
1024
1021
|
return {
|
|
1025
1022
|
formData,
|