npm - sarvam-ai-sdk - Versions diffs - 0.1.2 → 0.1.3 - Mend

sarvam-ai-sdk 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

package/README.md CHANGED Viewed

@@ -65,7 +65,7 @@ import { experimental_transcribe as transcribe } from "ai";
 import { readFile } from "fs/promises";
 const { text } = await transcribe({
-    model: sarvam.transcription("saarika:v2.5", "ml-IN")
+    model: sarvam.transcription("saarika:v2.5", "ml-IN"),
     audio: await readFile("./src/transcript-test.wav"),
 });
@@ -214,6 +214,35 @@ const { object } = await generateObject({
 console.log(object);
 ```
+## All APIs
+```ts
+import { sarvam } from "sarvam-ai-sdk";
+// Text-to-Text + Chat Completion
+sarvam("sarvam-105b");
+sarvam.languageModel("sarvam-30b");
+// Text-to-Text + Transliteration
+sarvam.transliterate({ from: "en-IN", to: "ml-IN" });
+// Text-to-Text + Translation
+sarvam.translation({ from: "en-IN", to: "ml-IN" });
+// Text-to-Text + Language identification
+sarvam.languageIdentification();
+// Text-to-Speech
+sarvam.speech("bulbul:v3", "ml-IN");
+// Speech-to-Text + Transcribe to same language
+sarvam.transcription("saarika:v2.5");
+// Speech-to-Text + Translate to English
+sarvam.speechTranslation("saaras:v3");
+```
 ## Documentation
 Please check out the **[Sarvam provider documentation](https://v4.ai-sdk.dev/providers/ai-sdk-providers/sarvam)** and **[Sarvam API documentation](https://docs.sarvam.ai)** for more information.

package/dist/index.d.mts CHANGED Viewed

@@ -52,7 +52,7 @@ declare const SarvamLanguageCodeSchema: z.ZodEnum<["hi-IN", "bn-IN", "kn-IN", "m
 type SarvamSpeechModelId = "bulbul:v2" | "bulbul:v3" | (string & {});
 type SarvamSpeechVoices = z.infer<typeof SpeakerSchema>;
-declare const SpeakerSchema: z.ZodDefault<z.ZodEnum<["abhilash", "karun", "hitesh", "anushka", "manisha", "vidya", "arya", "shubh", "aditya", "rahul", "rohan", "amit", "dev", "ratan", "varun", "manan", "sumit", "kabir", "aayan", "ashutosh", "advait", "anand", "tarun", "sunny", "mani", "gokul", "vijay", "mohit", "rehan", "soham", "ritu", "priya", "neha", "pooja", "simran", "kavya", "ishita", "shreya", "roopa", "amelia", "sophia", "tanya", "shruti", "suhani", "kavitha", "rupali"]>>;
+declare const SpeakerSchema: z.ZodEnum<["abhilash", "karun", "hitesh", "anushka", "manisha", "vidya", "arya", "shubh", "aditya", "rahul", "rohan", "amit", "dev", "ratan", "varun", "manan", "sumit", "kabir", "aayan", "ashutosh", "advait", "anand", "tarun", "sunny", "mani", "gokul", "vijay", "mohit", "rehan", "soham", "ritu", "priya", "neha", "pooja", "simran", "kavya", "ishita", "shreya", "roopa", "amelia", "sophia", "tanya", "shruti", "suhani", "kavitha", "rupali"]>;
 /**
  * Configuration settings for Sarvam Text-to-Speech API.
  *
@@ -111,6 +111,37 @@ type SarvamSpeechSettings = {
      * @example false (Disable preprocessing)
      */
     enable_preprocessing?: boolean;
+    /**
+     * Specifies the audio codec for the output audio file.
+     * Different codecs offer various compression and quality characteristics.
+     */
+    output_audio_codec?: "mp3" | "linear16" | "mulaw" | "alaw" | "opus" | "flac" | "aac" | "wav";
+    /**
+     * Temperature controls how much randomness and expressiveness the TTS model uses while generating speech.
+     * Lower values produce more stable and consistent output,
+     * while higher values sound more expressive but may introduce artifacts or errors.
+     *
+     * Any number between 0.01 and 2 (inclusive)
+     * @default 0.6
+     *
+     * Note: This parameter is only supported for bulbul:v3. It has no effect on bulbul:v2.
+     */
+    temperature?: number;
+    /**
+     * The ID of a pronunciation dictionary to apply during synthesis.
+     * When provided, matching words in the input text will be replaced with their custom pronunciations before generating speech.
+     *
+     * Only supported by bulbul:v3.
+     */
+    dict_id?: string;
+    /**
+     * Enable caching for the request. When enabled, identical requests will return cached audio instead of regenerating.
+     *
+     * @default false
+     *
+     * Currently in beta and only available for bulbul:v1 and bulbul:v2 models.
+     */
+    enable_cached_responses?: boolean;
 };
 type SarvamTranscriptionModelId = "saaras:v3" | "saarika:v2.5" | (string & {});

package/dist/index.d.ts CHANGED Viewed

@@ -52,7 +52,7 @@ declare const SarvamLanguageCodeSchema: z.ZodEnum<["hi-IN", "bn-IN", "kn-IN", "m
 type SarvamSpeechModelId = "bulbul:v2" | "bulbul:v3" | (string & {});
 type SarvamSpeechVoices = z.infer<typeof SpeakerSchema>;
-declare const SpeakerSchema: z.ZodDefault<z.ZodEnum<["abhilash", "karun", "hitesh", "anushka", "manisha", "vidya", "arya", "shubh", "aditya", "rahul", "rohan", "amit", "dev", "ratan", "varun", "manan", "sumit", "kabir", "aayan", "ashutosh", "advait", "anand", "tarun", "sunny", "mani", "gokul", "vijay", "mohit", "rehan", "soham", "ritu", "priya", "neha", "pooja", "simran", "kavya", "ishita", "shreya", "roopa", "amelia", "sophia", "tanya", "shruti", "suhani", "kavitha", "rupali"]>>;
+declare const SpeakerSchema: z.ZodEnum<["abhilash", "karun", "hitesh", "anushka", "manisha", "vidya", "arya", "shubh", "aditya", "rahul", "rohan", "amit", "dev", "ratan", "varun", "manan", "sumit", "kabir", "aayan", "ashutosh", "advait", "anand", "tarun", "sunny", "mani", "gokul", "vijay", "mohit", "rehan", "soham", "ritu", "priya", "neha", "pooja", "simran", "kavya", "ishita", "shreya", "roopa", "amelia", "sophia", "tanya", "shruti", "suhani", "kavitha", "rupali"]>;
 /**
  * Configuration settings for Sarvam Text-to-Speech API.
  *
@@ -111,6 +111,37 @@ type SarvamSpeechSettings = {
      * @example false (Disable preprocessing)
      */
     enable_preprocessing?: boolean;
+    /**
+     * Specifies the audio codec for the output audio file.
+     * Different codecs offer various compression and quality characteristics.
+     */
+    output_audio_codec?: "mp3" | "linear16" | "mulaw" | "alaw" | "opus" | "flac" | "aac" | "wav";
+    /**
+     * Temperature controls how much randomness and expressiveness the TTS model uses while generating speech.
+     * Lower values produce more stable and consistent output,
+     * while higher values sound more expressive but may introduce artifacts or errors.
+     *
+     * Any number between 0.01 and 2 (inclusive)
+     * @default 0.6
+     *
+     * Note: This parameter is only supported for bulbul:v3. It has no effect on bulbul:v2.
+     */
+    temperature?: number;
+    /**
+     * The ID of a pronunciation dictionary to apply during synthesis.
+     * When provided, matching words in the input text will be replaced with their custom pronunciations before generating speech.
+     *
+     * Only supported by bulbul:v3.
+     */
+    dict_id?: string;
+    /**
+     * Enable caching for the request. When enabled, identical requests will return cached audio instead of regenerating.
+     *
+     * @default false
+     *
+     * Currently in beta and only available for bulbul:v1 and bulbul:v2 models.
+     */
+    enable_cached_responses?: boolean;
 };
 type SarvamTranscriptionModelId = "saaras:v3" | "saarika:v2.5" | (string & {});

package/dist/index.js CHANGED Viewed

@@ -767,6 +767,7 @@ var sarvamChatChunkSchema = import_zod2.z.union([
 // src/sarvam-speech-model.ts
 var import_provider_utils5 = require("@ai-sdk/provider-utils");
+var import_zod4 = require("zod");
 // src/sarvam-speech-settings.ts
 var import_zod3 = require("zod");
@@ -821,7 +822,17 @@ var SpeakerSchema = import_zod3.z.enum([
   "suhani",
   "kavitha",
   "rupali"
-]).default("shubh");
+]);
+var outputAudioCodecSchema = import_zod3.z.enum([
+  "mp3",
+  "linear16",
+  "mulaw",
+  "alaw",
+  "opus",
+  "flac",
+  "aac",
+  "wav"
+]);
 var SarvamProviderOptionsSchema = import_zod3.z.object({
   speaker: SpeakerSchema,
   pitch: import_zod3.z.number().min(-0.75).max(0.75).default(0),
@@ -833,11 +844,14 @@ var SarvamProviderOptionsSchema = import_zod3.z.object({
     import_zod3.z.literal(22050),
     import_zod3.z.literal(24e3)
   ]).default(22050),
-  enable_preprocessing: import_zod3.z.boolean().default(false)
+  enable_preprocessing: import_zod3.z.boolean().default(false),
+  output_audio_codec: outputAudioCodecSchema.optional(),
+  temperature: import_zod3.z.number().min(0.01).max(2).default(0.6),
+  dict_id: import_zod3.z.string().optional(),
+  enable_cached_responses: import_zod3.z.boolean().default(false)
 }).partial();
 // src/sarvam-speech-model.ts
-var import_zod4 = require("zod");
 var SarvamSpeechModel = class {
   constructor(modelId, languageCode, config) {
     this.modelId = modelId;
@@ -852,7 +866,7 @@ var SarvamSpeechModel = class {
     text,
     voice,
     outputFormat = "wav",
-    // speed,
+    speed,
     // instructions,
     providerOptions
   }) {
@@ -868,7 +882,6 @@ var SarvamSpeechModel = class {
       schema: SarvamProviderOptionsSchema
     });
     const getSpeaker = () => {
-      if (sarvamOptions == null ? void 0 : sarvamOptions.speaker) return sarvamOptions.speaker;
       if (voice) {
         return SpeakerSchema.parse(voice);
       }
@@ -884,32 +897,27 @@ var SarvamSpeechModel = class {
       model: this.modelId,
       text,
       target_language_code: this.languageCode,
-      speaker: getSpeaker()
-      // response_format: "wav",
-      // speed,
-      // instructions,
+      speaker: getSpeaker(),
+      pace: speed
     };
     if (outputFormat) {
-      if (["mp3", "opus", "aac", "flac", "wav", "pcm"].includes(
-        outputFormat
-      )) {
-        requestBody.response_format = outputFormat;
+      const of = outputAudioCodecSchema.safeParse(outputFormat);
+      if (of.success) {
+        requestBody.output_audio_codec = of.data;
       } else {
         warnings.push({
           type: "unsupported-setting",
           setting: "outputFormat",
-          details: `Unsupported output format: ${outputFormat}. Using mp3 instead.`
+          details: `Unsupported output format: ${outputFormat}. Using wav instead.`
         });
       }
     }
     if (sarvamOptions) {
-      const speechModelOptions = {};
-      for (const key in speechModelOptions) {
-        const value = speechModelOptions[key];
+      Object.entries(sarvamOptions).forEach(([key, value]) => {
         if (value !== void 0) {
           requestBody[key] = value;
         }
-      }
+      });
     }
     return {
       requestBody,
@@ -987,7 +995,6 @@ var SarvamTranscriptionModel = class {
     mediaType,
     providerOptions
   }) {
-    var _a;
     const warnings = [];
     const sarvamOptions = (0, import_provider_utils6.parseProviderOptions)({
       provider: "sarvam",
@@ -1003,23 +1010,13 @@ var SarvamTranscriptionModel = class {
     const blob = audio instanceof Blob ? audio : new Blob([audio], { type: mediaType });
     formData.append("file", blob);
     formData.append("model", this.modelId);
+    formData.append("language_code", this.languageCode);
     if (sarvamOptions) {
-      formData.append("mode", (_a = sarvamOptions.mode) != null ? _a : "transcribe");
-      formData.append("language_code", this.languageCode);
-      formData.append(
-        "with_timestamps",
-        sarvamOptions.with_timestamps ? "true" : "false"
-      );
-      formData.append(
-        "with_diarization",
-        sarvamOptions.with_diarization ? "true" : "false"
-      );
-      if (sarvamOptions.num_speakers !== null && sarvamOptions.num_speakers !== void 0) {
-        formData.append(
-          "num_speakers",
-          sarvamOptions.num_speakers.toString()
-        );
-      }
+      Object.entries(sarvamOptions).forEach(([key, value]) => {
+        if (value) {
+          formData.append(key, String(value));
+        }
+      });
     }
     return {
       formData,