sarvam-ai-sdk 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -65,7 +65,7 @@ import { experimental_transcribe as transcribe } from "ai";
65
65
  import { readFile } from "fs/promises";
66
66
 
67
67
  const { text } = await transcribe({
68
- model: sarvam.transcription("saarika:v2.5", "ml-IN")
68
+ model: sarvam.transcription("saarika:v2.5", "ml-IN"),
69
69
  audio: await readFile("./src/transcript-test.wav"),
70
70
  });
71
71
 
@@ -214,6 +214,35 @@ const { object } = await generateObject({
214
214
  console.log(object);
215
215
  ```
216
216
 
217
+ ## All APIs
218
+
219
+ ```ts
220
+ import { sarvam } from "sarvam-ai-sdk";
221
+
222
+ // Text-to-Text + Chat Completion
223
+ sarvam("sarvam-105b");
224
+ sarvam.languageModel("sarvam-30b");
225
+
226
+ // Text-to-Text + Transliteration
227
+ sarvam.transliterate({ from: "en-IN", to: "ml-IN" });
228
+
229
+ // Text-to-Text + Translation
230
+ sarvam.translation({ from: "en-IN", to: "ml-IN" });
231
+
232
+ // Text-to-Text + Language identification
233
+ sarvam.languageIdentification();
234
+
235
+ // Text-to-Speech
236
+ sarvam.speech("bulbul:v3", "ml-IN");
237
+
238
+ // Speech-to-Text + Transcribe to same language
239
+ sarvam.transcription("saarika:v2.5");
240
+
241
+ // Speech-to-Text + Translate to English
242
+ sarvam.speechTranslation("saaras:v3");
243
+ ```
244
+
245
+
217
246
  ## Documentation
218
247
 
219
248
  Please check out the **[Sarvam provider documentation](https://v4.ai-sdk.dev/providers/ai-sdk-providers/sarvam)** and **[Sarvam API documentation](https://docs.sarvam.ai)** for more information.
package/dist/index.d.mts CHANGED
@@ -52,7 +52,7 @@ declare const SarvamLanguageCodeSchema: z.ZodEnum<["hi-IN", "bn-IN", "kn-IN", "m
52
52
 
53
53
  type SarvamSpeechModelId = "bulbul:v2" | "bulbul:v3" | (string & {});
54
54
  type SarvamSpeechVoices = z.infer<typeof SpeakerSchema>;
55
- declare const SpeakerSchema: z.ZodDefault<z.ZodEnum<["abhilash", "karun", "hitesh", "anushka", "manisha", "vidya", "arya", "shubh", "aditya", "rahul", "rohan", "amit", "dev", "ratan", "varun", "manan", "sumit", "kabir", "aayan", "ashutosh", "advait", "anand", "tarun", "sunny", "mani", "gokul", "vijay", "mohit", "rehan", "soham", "ritu", "priya", "neha", "pooja", "simran", "kavya", "ishita", "shreya", "roopa", "amelia", "sophia", "tanya", "shruti", "suhani", "kavitha", "rupali"]>>;
55
+ declare const SpeakerSchema: z.ZodEnum<["abhilash", "karun", "hitesh", "anushka", "manisha", "vidya", "arya", "shubh", "aditya", "rahul", "rohan", "amit", "dev", "ratan", "varun", "manan", "sumit", "kabir", "aayan", "ashutosh", "advait", "anand", "tarun", "sunny", "mani", "gokul", "vijay", "mohit", "rehan", "soham", "ritu", "priya", "neha", "pooja", "simran", "kavya", "ishita", "shreya", "roopa", "amelia", "sophia", "tanya", "shruti", "suhani", "kavitha", "rupali"]>;
56
56
  /**
57
57
  * Configuration settings for Sarvam Text-to-Speech API.
58
58
  *
@@ -111,6 +111,37 @@ type SarvamSpeechSettings = {
111
111
  * @example false (Disable preprocessing)
112
112
  */
113
113
  enable_preprocessing?: boolean;
114
+ /**
115
+ * Specifies the audio codec for the output audio file.
116
+ * Different codecs offer various compression and quality characteristics.
117
+ */
118
+ output_audio_codec?: "mp3" | "linear16" | "mulaw" | "alaw" | "opus" | "flac" | "aac" | "wav";
119
+ /**
120
+ * Temperature controls how much randomness and expressiveness the TTS model uses while generating speech.
121
+ * Lower values produce more stable and consistent output,
122
+ * while higher values sound more expressive but may introduce artifacts or errors.
123
+ *
124
+ * Any number between 0.01 and 2 (inclusive).
125
+ * @default 0.6
126
+ *
127
+ * Note: This parameter is only supported for bulbul:v3. It has no effect on bulbul:v2.
128
+ */
129
+ temperature?: number;
130
+ /**
131
+ * The ID of a pronunciation dictionary to apply during synthesis.
132
+ * When provided, matching words in the input text will be replaced with their custom pronunciations before generating speech.
133
+ *
134
+ * Only supported by bulbul:v3.
135
+ */
136
+ dict_id?: string;
137
+ /**
138
+ * Enable caching for the request. When enabled, identical requests will return cached audio instead of regenerating.
139
+ *
140
+ * @default false
141
+ *
142
+ * Currently in beta and only available for bulbul:v1 and bulbul:v2 models.
143
+ */
144
+ enable_cached_responses?: boolean;
114
145
  };
115
146
 
116
147
  type SarvamTranscriptionModelId = "saaras:v3" | "saarika:v2.5" | (string & {});
package/dist/index.d.ts CHANGED
@@ -52,7 +52,7 @@ declare const SarvamLanguageCodeSchema: z.ZodEnum<["hi-IN", "bn-IN", "kn-IN", "m
52
52
 
53
53
  type SarvamSpeechModelId = "bulbul:v2" | "bulbul:v3" | (string & {});
54
54
  type SarvamSpeechVoices = z.infer<typeof SpeakerSchema>;
55
- declare const SpeakerSchema: z.ZodDefault<z.ZodEnum<["abhilash", "karun", "hitesh", "anushka", "manisha", "vidya", "arya", "shubh", "aditya", "rahul", "rohan", "amit", "dev", "ratan", "varun", "manan", "sumit", "kabir", "aayan", "ashutosh", "advait", "anand", "tarun", "sunny", "mani", "gokul", "vijay", "mohit", "rehan", "soham", "ritu", "priya", "neha", "pooja", "simran", "kavya", "ishita", "shreya", "roopa", "amelia", "sophia", "tanya", "shruti", "suhani", "kavitha", "rupali"]>>;
55
+ declare const SpeakerSchema: z.ZodEnum<["abhilash", "karun", "hitesh", "anushka", "manisha", "vidya", "arya", "shubh", "aditya", "rahul", "rohan", "amit", "dev", "ratan", "varun", "manan", "sumit", "kabir", "aayan", "ashutosh", "advait", "anand", "tarun", "sunny", "mani", "gokul", "vijay", "mohit", "rehan", "soham", "ritu", "priya", "neha", "pooja", "simran", "kavya", "ishita", "shreya", "roopa", "amelia", "sophia", "tanya", "shruti", "suhani", "kavitha", "rupali"]>;
56
56
  /**
57
57
  * Configuration settings for Sarvam Text-to-Speech API.
58
58
  *
@@ -111,6 +111,37 @@ type SarvamSpeechSettings = {
111
111
  * @example false (Disable preprocessing)
112
112
  */
113
113
  enable_preprocessing?: boolean;
114
+ /**
115
+ * Specifies the audio codec for the output audio file.
116
+ * Different codecs offer various compression and quality characteristics.
117
+ */
118
+ output_audio_codec?: "mp3" | "linear16" | "mulaw" | "alaw" | "opus" | "flac" | "aac" | "wav";
119
+ /**
120
+ * Temperature controls how much randomness and expressiveness the TTS model uses while generating speech.
121
+ * Lower values produce more stable and consistent output,
122
+ * while higher values sound more expressive but may introduce artifacts or errors.
123
+ *
124
+ * Any number between 0.01 and 2 (inclusive).
125
+ * @default 0.6
126
+ *
127
+ * Note: This parameter is only supported for bulbul:v3. It has no effect on bulbul:v2.
128
+ */
129
+ temperature?: number;
130
+ /**
131
+ * The ID of a pronunciation dictionary to apply during synthesis.
132
+ * When provided, matching words in the input text will be replaced with their custom pronunciations before generating speech.
133
+ *
134
+ * Only supported by bulbul:v3.
135
+ */
136
+ dict_id?: string;
137
+ /**
138
+ * Enable caching for the request. When enabled, identical requests will return cached audio instead of regenerating.
139
+ *
140
+ * @default false
141
+ *
142
+ * Currently in beta and only available for bulbul:v1 and bulbul:v2 models.
143
+ */
144
+ enable_cached_responses?: boolean;
114
145
  };
115
146
 
116
147
  type SarvamTranscriptionModelId = "saaras:v3" | "saarika:v2.5" | (string & {});
package/dist/index.js CHANGED
@@ -767,6 +767,7 @@ var sarvamChatChunkSchema = import_zod2.z.union([
767
767
 
768
768
  // src/sarvam-speech-model.ts
769
769
  var import_provider_utils5 = require("@ai-sdk/provider-utils");
770
+ var import_zod4 = require("zod");
770
771
 
771
772
  // src/sarvam-speech-settings.ts
772
773
  var import_zod3 = require("zod");
@@ -821,7 +822,17 @@ var SpeakerSchema = import_zod3.z.enum([
821
822
  "suhani",
822
823
  "kavitha",
823
824
  "rupali"
824
- ]).default("shubh");
825
+ ]);
826
+ var outputAudioCodecSchema = import_zod3.z.enum([
827
+ "mp3",
828
+ "linear16",
829
+ "mulaw",
830
+ "alaw",
831
+ "opus",
832
+ "flac",
833
+ "aac",
834
+ "wav"
835
+ ]);
825
836
  var SarvamProviderOptionsSchema = import_zod3.z.object({
826
837
  speaker: SpeakerSchema,
827
838
  pitch: import_zod3.z.number().min(-0.75).max(0.75).default(0),
@@ -833,11 +844,14 @@ var SarvamProviderOptionsSchema = import_zod3.z.object({
833
844
  import_zod3.z.literal(22050),
834
845
  import_zod3.z.literal(24e3)
835
846
  ]).default(22050),
836
- enable_preprocessing: import_zod3.z.boolean().default(false)
847
+ enable_preprocessing: import_zod3.z.boolean().default(false),
848
+ output_audio_codec: outputAudioCodecSchema.optional(),
849
+ temperature: import_zod3.z.number().min(0.01).max(2).default(0.6),
850
+ dict_id: import_zod3.z.string().optional(),
851
+ enable_cached_responses: import_zod3.z.boolean().default(false)
837
852
  }).partial();
838
853
 
839
854
  // src/sarvam-speech-model.ts
840
- var import_zod4 = require("zod");
841
855
  var SarvamSpeechModel = class {
842
856
  constructor(modelId, languageCode, config) {
843
857
  this.modelId = modelId;
@@ -852,7 +866,7 @@ var SarvamSpeechModel = class {
852
866
  text,
853
867
  voice,
854
868
  outputFormat = "wav",
855
- // speed,
869
+ speed,
856
870
  // instructions,
857
871
  providerOptions
858
872
  }) {
@@ -868,7 +882,6 @@ var SarvamSpeechModel = class {
868
882
  schema: SarvamProviderOptionsSchema
869
883
  });
870
884
  const getSpeaker = () => {
871
- if (sarvamOptions == null ? void 0 : sarvamOptions.speaker) return sarvamOptions.speaker;
872
885
  if (voice) {
873
886
  return SpeakerSchema.parse(voice);
874
887
  }
@@ -884,32 +897,27 @@ var SarvamSpeechModel = class {
884
897
  model: this.modelId,
885
898
  text,
886
899
  target_language_code: this.languageCode,
887
- speaker: getSpeaker()
888
- // response_format: "wav",
889
- // speed,
890
- // instructions,
900
+ speaker: getSpeaker(),
901
+ pace: speed
891
902
  };
892
903
  if (outputFormat) {
893
- if (["mp3", "opus", "aac", "flac", "wav", "pcm"].includes(
894
- outputFormat
895
- )) {
896
- requestBody.response_format = outputFormat;
904
+ const of = outputAudioCodecSchema.safeParse(outputFormat);
905
+ if (of.success) {
906
+ requestBody.output_audio_codec = of.data;
897
907
  } else {
898
908
  warnings.push({
899
909
  type: "unsupported-setting",
900
910
  setting: "outputFormat",
901
- details: `Unsupported output format: ${outputFormat}. Using mp3 instead.`
911
+ details: `Unsupported output format: ${outputFormat}. Using wav instead.`
902
912
  });
903
913
  }
904
914
  }
905
915
  if (sarvamOptions) {
906
- const speechModelOptions = {};
907
- for (const key in speechModelOptions) {
908
- const value = speechModelOptions[key];
916
+ Object.entries(sarvamOptions).forEach(([key, value]) => {
909
917
  if (value !== void 0) {
910
918
  requestBody[key] = value;
911
919
  }
912
- }
920
+ });
913
921
  }
914
922
  return {
915
923
  requestBody,
@@ -987,7 +995,6 @@ var SarvamTranscriptionModel = class {
987
995
  mediaType,
988
996
  providerOptions
989
997
  }) {
990
- var _a;
991
998
  const warnings = [];
992
999
  const sarvamOptions = (0, import_provider_utils6.parseProviderOptions)({
993
1000
  provider: "sarvam",
@@ -1003,23 +1010,13 @@ var SarvamTranscriptionModel = class {
1003
1010
  const blob = audio instanceof Blob ? audio : new Blob([audio], { type: mediaType });
1004
1011
  formData.append("file", blob);
1005
1012
  formData.append("model", this.modelId);
1013
+ formData.append("language_code", this.languageCode);
1006
1014
  if (sarvamOptions) {
1007
- formData.append("mode", (_a = sarvamOptions.mode) != null ? _a : "transcribe");
1008
- formData.append("language_code", this.languageCode);
1009
- formData.append(
1010
- "with_timestamps",
1011
- sarvamOptions.with_timestamps ? "true" : "false"
1012
- );
1013
- formData.append(
1014
- "with_diarization",
1015
- sarvamOptions.with_diarization ? "true" : "false"
1016
- );
1017
- if (sarvamOptions.num_speakers !== null && sarvamOptions.num_speakers !== void 0) {
1018
- formData.append(
1019
- "num_speakers",
1020
- sarvamOptions.num_speakers.toString()
1021
- );
1022
- }
1015
+ Object.entries(sarvamOptions).forEach(([key, value]) => {
1016
+ if (value) {
1017
+ formData.append(key, String(value));
1018
+ }
1019
+ });
1023
1020
  }
1024
1021
  return {
1025
1022
  formData,