npm - sarvam-ai-sdk - Versions diffs - 0.1.1 → 0.1.3 - Mend

sarvam-ai-sdk 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 # AI SDK - Sarvam Provider
-The **[Sarvam provider](https://ai-sdk.dev/providers/ai-sdk-providers/sarvam)** for the [AI SDK](https://ai-sdk.dev/docs)
+The **[Sarvam provider](https://v4.ai-sdk.dev/providers/ai-sdk-providers/sarvam)** for the [AI SDK](https://v4.ai-sdk.dev/docs)
 contains language model support for the Sarvam chat completion, Text-to-Speech and Speech-to-Text APIs.
 ## Setup
@@ -11,6 +11,9 @@ The **[Sarvam](http://sarvam.ai)** provider is available in the `sarvam-ai-sdk`
 npm i sarvam-ai-sdk
 ```
+> [!WARNING]
+> This package only works with Vercel AI SDK v4, not the latest v6. Make sure to install `ai@4` in your project.
 ## Provider Instance
 You can import the default provider instance `sarvam` from `sarvam-ai-sdk`:
@@ -31,7 +34,7 @@ import { sarvam } from 'sarvam-ai-sdk';
 import { generateText } from 'ai';
 const { text } = await generateText({
-    model: sarvam("sarvam-m"),
+	model: sarvam("sarvam-30b"),
     prompt: "Translate this to malayalam: 'Keep cooking, guys'",
 });
@@ -46,7 +49,7 @@ import { experimental_generateSpeech as generateSpeech } from "ai";
 import { writeFile } from "fs/promises";
 const { audio } = await generateSpeech({
-    model: sarvam.speech("bulbul:v2", "ml-IN"),
+    model: sarvam.speech("bulbul:v3", "ml-IN"),
     text: "പാചകം തുടരൂ, സുഹൃത്തുക്കളേ",
 });
@@ -62,13 +65,26 @@ import { experimental_transcribe as transcribe } from "ai";
 import { readFile } from "fs/promises";
 const { text } = await transcribe({
-    model: sarvam.transcription("saarika:v2", "ml-IN")
+    model: sarvam.transcription("saarika:v2.5", "ml-IN"),
     audio: await readFile("./src/transcript-test.wav"),
 });
 console.log(text); // പാചകം തുടരും സുഹൃത്തുക്കളെ
 ```
+```ts
+import { sarvam } from "sarvam-ai-sdk";
+import { experimental_transcribe as transcribe } from "ai";
+import { readFile } from "fs/promises";
+const { text } = await transcribe({
+    model: sarvam.transcription("saaras:v3", "en-IN"),
+    audio: await readFile("./src/transcript-test.wav"),
+});
+console.log(text); // Pachakam thudaroo, suhruthukkale.
+```
 ## Speech-to-Text-Translate
 ```ts
@@ -77,7 +93,7 @@ import { experimental_transcribe as transcribe } from "ai";
 import { readFile } from "fs/promises";
 const result = await transcribe({
-    model: sarvam.speechTranslation("saaras:v2"),
+    model: sarvam.speechTranslation("saaras:v2.5"),
     audio: await readFile("./src/transcript-test.wav"),
 });
@@ -141,7 +157,7 @@ console.log(result.text); // ml-IN
 ## Tool Calling
 > [!WARNING]
-> Latest `sarvam-m` model isn't trained on native tool calling feature (aka JSON mode). So we simulate this with prompt engineering technique.
+> The latest `sarvam` models aren't trained for native tool calling (aka JSON mode), so we simulate it with a prompt-engineering technique.
 ```ts
 import { z } from "zod";
@@ -150,7 +166,7 @@ import { sarvam } from "sarvam-ai-sdk";
 const result = await generateText({
-  model: sarvam("sarvam-m", {
+  model: sarvam("sarvam-30b", {
     simulate: "tool-calling" // ⚠️ important
   }),
   tools: {
@@ -174,7 +190,7 @@ console.log(result.toolResults);
 ## Generate JSON object
 > [!WARNING]
-> Latest `sarvam-m` model isn't trained on native JSON object generation. So we simulate this with prompt engineering technique.
+> The latest `sarvam` models aren't trained for native JSON object generation, so we simulate it with a prompt-engineering technique.
 ```ts
 import { z } from "zod";
@@ -182,7 +198,7 @@ import { sarvam } from "sarvam-ai-sdk";
 import { generateObject } from 'ai';
 const { object } = await generateObject({
-  model: sarvam("sarvam-m", {
+  model: sarvam("sarvam-30b", {
     simulate: "json-object" // ⚠️ important
   }),
   schema: z.object({
@@ -198,6 +214,35 @@ const { object } = await generateObject({
 console.log(object);
 ```
+## All APIs
+```ts
+import { sarvam } from "sarvam-ai-sdk";
+// Text-to-Text + Chat Completion
+sarvam("sarvam-105b");
+sarvam.languageModel("sarvam-30b");
+// Text-to-Text + Transliteration
+sarvam.transliterate({ from: "en-IN", to: "ml-IN" });
+// Text-to-Text + Translation
+sarvam.translation({ from: "en-IN", to: "ml-IN" });
+// Text-to-Text + Language identification
+sarvam.languageIdentification();
+// Text-to-Speech
+sarvam.speech("bulbul:v3", "ml-IN");
+// Speech-to-Text + Transcribe to same language
+sarvam.transcription("saarika:v2.5");
+// Speech-to-Text + Translate to English
+sarvam.speechTranslation("saaras:v3");
+```
 ## Documentation
-Please check out the **[Sarvam provider documentation](https://ai-sdk.dev/providers/ai-sdk-providers/sarvam)** and **[Sarvam API documentation](https://docs.sarvam.ai)** for more information.
+Please check out the **[Sarvam provider documentation](https://v4.ai-sdk.dev/providers/ai-sdk-providers/sarvam)** and **[Sarvam API documentation](https://docs.sarvam.ai)** for more information.

package/dist/index.d.mts CHANGED Viewed

@@ -2,7 +2,15 @@ import { LanguageModelV1, TranscriptionModelV1, SpeechModelV1 } from '@ai-sdk/pr
 import { FetchFunction } from '@ai-sdk/provider-utils';
 import { z } from 'zod';
-type SarvamChatModelId = "sarvam-m" | (string & {});
+/**
+ * @description Product models
+ */
+type SarvamChatModelId = "sarvam-30b" | "sarvam-30b-16k" | "sarvam-105b" | "sarvam-105b-32k" | SarvamChatLegacyModelId | (string & {});
+/**
+ * @description Legacy models
+ * @deprecated
+ */
+type SarvamChatLegacyModelId = "sarvam-m";
 interface SarvamChatSettings {
     /**
     * Whether to simulate artificial tool calling or JSON object generation, because Sarvam models don't support native tool calling or JSON Schema.
@@ -42,9 +50,9 @@ interface SarvamChatSettings {
 type SarvamLanguageCode = z.infer<typeof SarvamLanguageCodeSchema>;
 declare const SarvamLanguageCodeSchema: z.ZodEnum<["hi-IN", "bn-IN", "kn-IN", "ml-IN", "mr-IN", "od-IN", "pa-IN", "ta-IN", "te-IN", "en-IN", "gu-IN"]>;
-type SarvamSpeechModelId = "bulbul:v1" | "bulbul:v2" | (string & {});
+type SarvamSpeechModelId = "bulbul:v2" | "bulbul:v3" | (string & {});
 type SarvamSpeechVoices = z.infer<typeof SpeakerSchema>;
-declare const SpeakerSchema: z.ZodDefault<z.ZodEnum<["meera", "pavithra", "maitreyi", "arvind", "amol", "amartya", "diya", "neel", "misha", "vian", "arjun", "maya", "anushka", "abhilash", "manisha", "vidya", "arya", "karun", "hitesh"]>>;
+declare const SpeakerSchema: z.ZodEnum<["abhilash", "karun", "hitesh", "anushka", "manisha", "vidya", "arya", "shubh", "aditya", "rahul", "rohan", "amit", "dev", "ratan", "varun", "manan", "sumit", "kabir", "aayan", "ashutosh", "advait", "anand", "tarun", "sunny", "mani", "gokul", "vijay", "mohit", "rehan", "soham", "ritu", "priya", "neha", "pooja", "simran", "kavya", "ishita", "shreya", "roopa", "amelia", "sophia", "tanya", "shruti", "suhani", "kavitha", "rupali"]>;
 /**
  * Configuration settings for Sarvam Text-to-Speech API.
  *
@@ -56,10 +64,10 @@ type SarvamSpeechSettings = {
     /**
      * The speaker voice to be used for the output audio.
      *
-     * @default "meera"
-     * @example "meera" (Default female voice for bulbul:v1)
-     * @example "arvind" (Male voice for bulbul:v1)
-     * @example "anushka" (Female voice for bulbul:v2)
+     * @default
+     * - "shubh" (Male voice for bulbul:v3)
+     * - "anushka" (Female voice for bulbul:v2)
+     * - "meera" (Female voice for bulbul:v1)
      */
     speaker?: SarvamSpeechVoices;
     /**
@@ -103,11 +111,74 @@ type SarvamSpeechSettings = {
      * @example false (Disable preprocessing)
      */
     enable_preprocessing?: boolean;
+    /**
+     * Specifies the audio codec for the output audio file.
+     * Different codecs offer various compression and quality characteristics.
+     */
+    output_audio_codec?: "mp3" | "linear16" | "mulaw" | "alaw" | "opus" | "flac" | "aac" | "wav";
+    /**
+     * Temperature controls how much randomness and expressiveness the TTS model uses while generating speech.
+     * Lower values produce more stable and consistent output,
+     * while higher values sound more expressive but may introduce artifacts or errors.
+     *
+     * Any number between 0.01 and 2
+     * @default 0.6
+     *
+     * Note: This parameter is only supported for bulbul:v3. It has no effect on bulbul:v2.
+     */
+    temperature?: number;
+    /**
+     * The ID of a pronunciation dictionary to apply during synthesis.
+     * When provided, matching words in the input text will be replaced with their custom pronunciations before generating speech.
+     *
+     * Only supported by bulbul:v3.
+     */
+    dict_id?: string;
+    /**
+     * Enable caching for the request. When enabled, identical requests will return cached audio instead of regenerating.
+     *
+     * @default false
+     *
+     * Currently in beta and only available for bulbul:v1 and bulbul:v2 models.
+     */
+    enable_cached_responses?: boolean;
 };
-type SarvamTranscriptionModelId = "saarika:v2" | "saarika:v1" | "saarika:flash" | (string & {});
-type SarvamSpeechTranslationModelId = "saaras:v1" | "saaras:v2" | "saaras:turbo" | "saaras:flash" | (string & {});
+type SarvamTranscriptionModelId = "saaras:v3" | "saarika:v2.5" | (string & {});
+type SarvamSpeechTranslationModelId = "saaras:v3" | "saaras:v2.5" | (string & {});
+declare const SarvamProviderOptionsSchema: z.ZodObject<{
+    mode: z.ZodDefault<z.ZodEnum<["transcribe", "translate", "verbatim", "translit", "codemix"]>>;
+    with_timestamps: z.ZodDefault<z.ZodOptional<z.ZodNullable<z.ZodBoolean>>>;
+    with_diarization: z.ZodDefault<z.ZodOptional<z.ZodNullable<z.ZodBoolean>>>;
+    num_speakers: z.ZodOptional<z.ZodNullable<z.ZodNumber>>;
+}, "strip", z.ZodTypeAny, {
+    mode: "transcribe" | "translate" | "verbatim" | "translit" | "codemix";
+    with_timestamps: boolean | null;
+    with_diarization: boolean | null;
+    num_speakers?: number | null | undefined;
+}, {
+    mode?: "transcribe" | "translate" | "verbatim" | "translit" | "codemix" | undefined;
+    with_timestamps?: boolean | null | undefined;
+    with_diarization?: boolean | null | undefined;
+    num_speakers?: number | null | undefined;
+}>;
 type SarvamTranscriptionCallOptions = {
+    /**
+    * @default "transcribe"
+    *
+    * @description
+    * - `transcribe`: Standard transcription in the original language, `output`: Text in source language
+    * - `translate`: Transcribe and translate to English, `output`: English text
+    * - `verbatim`: Word-for-word transcription including filler words and repetitions, `output`: Verbatim text in source language
+    * - `translit`: Transcribe and transliterate to Roman script, `output`: Romanized text
+    * - `codemix`: Transcribe code-mixed speech (e.g., Hindi-English) naturally, `output`: Code-mixed text
+    */
+    mode?: z.infer<typeof SarvamProviderOptionsSchema.shape.mode>;
+    /**
+     * - Chunk-level timestamp support
+     * - Useful for subtitle alignment and audio navigation
+     * - Provides start and end times for each segment of text
+     */
     with_timestamps?: boolean;
     /**
      * Enables speaker diarization, which identifies and separates different speakers in the audio.
@@ -240,11 +311,19 @@ interface SarvamProvider {
     /**
     * Creates a model for text generation.
     */
-    (modelId: SarvamChatModelId, settings?: SarvamChatSettings): LanguageModelV1;
+    (
+    /**
+    * @description Sarvam-M (24B) is now a legacy model; we recommend migrating to Sarvam-30B or Sarvam-105B for improved performance.
+    */
+    modelId: SarvamChatModelId, settings?: SarvamChatSettings): LanguageModelV1;
     /**
     * Creates a Sarvam chat model for text generation.
     */
-    languageModel(modelId: SarvamChatModelId, settings?: SarvamChatSettings): LanguageModelV1;
+    languageModel(
+    /**
+    * @description Sarvam-M (24B) is now a legacy model; we recommend migrating to Sarvam-30B or Sarvam-105B for improved performance.
+    */
+    modelId: SarvamChatModelId, settings?: SarvamChatSettings): LanguageModelV1;
     /**
     * Creates a Sarvam model for transcription.
     */

package/dist/index.d.ts CHANGED Viewed

@@ -2,7 +2,15 @@ import { LanguageModelV1, TranscriptionModelV1, SpeechModelV1 } from '@ai-sdk/pr
 import { FetchFunction } from '@ai-sdk/provider-utils';
 import { z } from 'zod';
-type SarvamChatModelId = "sarvam-m" | (string & {});
+/**
+ * @description Product models
+ */
+type SarvamChatModelId = "sarvam-30b" | "sarvam-30b-16k" | "sarvam-105b" | "sarvam-105b-32k" | SarvamChatLegacyModelId | (string & {});
+/**
+ * @description Legacy models
+ * @deprecated
+ */
+type SarvamChatLegacyModelId = "sarvam-m";
 interface SarvamChatSettings {
     /**
     * Whether to simulate artificial tool calling or JSON object generation, because Sarvam models don't support native tool calling or JSON Schema.
@@ -42,9 +50,9 @@ interface SarvamChatSettings {
 type SarvamLanguageCode = z.infer<typeof SarvamLanguageCodeSchema>;
 declare const SarvamLanguageCodeSchema: z.ZodEnum<["hi-IN", "bn-IN", "kn-IN", "ml-IN", "mr-IN", "od-IN", "pa-IN", "ta-IN", "te-IN", "en-IN", "gu-IN"]>;
-type SarvamSpeechModelId = "bulbul:v1" | "bulbul:v2" | (string & {});
+type SarvamSpeechModelId = "bulbul:v2" | "bulbul:v3" | (string & {});
 type SarvamSpeechVoices = z.infer<typeof SpeakerSchema>;
-declare const SpeakerSchema: z.ZodDefault<z.ZodEnum<["meera", "pavithra", "maitreyi", "arvind", "amol", "amartya", "diya", "neel", "misha", "vian", "arjun", "maya", "anushka", "abhilash", "manisha", "vidya", "arya", "karun", "hitesh"]>>;
+declare const SpeakerSchema: z.ZodEnum<["abhilash", "karun", "hitesh", "anushka", "manisha", "vidya", "arya", "shubh", "aditya", "rahul", "rohan", "amit", "dev", "ratan", "varun", "manan", "sumit", "kabir", "aayan", "ashutosh", "advait", "anand", "tarun", "sunny", "mani", "gokul", "vijay", "mohit", "rehan", "soham", "ritu", "priya", "neha", "pooja", "simran", "kavya", "ishita", "shreya", "roopa", "amelia", "sophia", "tanya", "shruti", "suhani", "kavitha", "rupali"]>;
 /**
  * Configuration settings for Sarvam Text-to-Speech API.
  *
@@ -56,10 +64,10 @@ type SarvamSpeechSettings = {
     /**
      * The speaker voice to be used for the output audio.
      *
-     * @default "meera"
-     * @example "meera" (Default female voice for bulbul:v1)
-     * @example "arvind" (Male voice for bulbul:v1)
-     * @example "anushka" (Female voice for bulbul:v2)
+     * @default
+     * - "shubh" (Male voice for bulbul:v3)
+     * - "anushka" (Female voice for bulbul:v2)
+     * - "meera" (Female voice for bulbul:v1)
      */
     speaker?: SarvamSpeechVoices;
     /**
@@ -103,11 +111,74 @@ type SarvamSpeechSettings = {
      * @example false (Disable preprocessing)
      */
     enable_preprocessing?: boolean;
+    /**
+     * Specifies the audio codec for the output audio file.
+     * Different codecs offer various compression and quality characteristics.
+     */
+    output_audio_codec?: "mp3" | "linear16" | "mulaw" | "alaw" | "opus" | "flac" | "aac" | "wav";
+    /**
+     * Temperature controls how much randomness and expressiveness the TTS model uses while generating speech.
+     * Lower values produce more stable and consistent output,
+     * while higher values sound more expressive but may introduce artifacts or errors.
+     *
+     * Any number between 0.01 and 2
+     * @default 0.6
+     *
+     * Note: This parameter is only supported for bulbul:v3. It has no effect on bulbul:v2.
+     */
+    temperature?: number;
+    /**
+     * The ID of a pronunciation dictionary to apply during synthesis.
+     * When provided, matching words in the input text will be replaced with their custom pronunciations before generating speech.
+     *
+     * Only supported by bulbul:v3.
+     */
+    dict_id?: string;
+    /**
+     * Enable caching for the request. When enabled, identical requests will return cached audio instead of regenerating.
+     *
+     * @default false
+     *
+     * Currently in beta and only available for bulbul:v1 and bulbul:v2 models.
+     */
+    enable_cached_responses?: boolean;
 };
-type SarvamTranscriptionModelId = "saarika:v2" | "saarika:v1" | "saarika:flash" | (string & {});
-type SarvamSpeechTranslationModelId = "saaras:v1" | "saaras:v2" | "saaras:turbo" | "saaras:flash" | (string & {});
+type SarvamTranscriptionModelId = "saaras:v3" | "saarika:v2.5" | (string & {});
+type SarvamSpeechTranslationModelId = "saaras:v3" | "saaras:v2.5" | (string & {});
+declare const SarvamProviderOptionsSchema: z.ZodObject<{
+    mode: z.ZodDefault<z.ZodEnum<["transcribe", "translate", "verbatim", "translit", "codemix"]>>;
+    with_timestamps: z.ZodDefault<z.ZodOptional<z.ZodNullable<z.ZodBoolean>>>;
+    with_diarization: z.ZodDefault<z.ZodOptional<z.ZodNullable<z.ZodBoolean>>>;
+    num_speakers: z.ZodOptional<z.ZodNullable<z.ZodNumber>>;
+}, "strip", z.ZodTypeAny, {
+    mode: "transcribe" | "translate" | "verbatim" | "translit" | "codemix";
+    with_timestamps: boolean | null;
+    with_diarization: boolean | null;
+    num_speakers?: number | null | undefined;
+}, {
+    mode?: "transcribe" | "translate" | "verbatim" | "translit" | "codemix" | undefined;
+    with_timestamps?: boolean | null | undefined;
+    with_diarization?: boolean | null | undefined;
+    num_speakers?: number | null | undefined;
+}>;
 type SarvamTranscriptionCallOptions = {
+    /**
+    * @default "transcribe"
+    *
+    * @description
+    * - `transcribe`: Standard transcription in the original language, `output`: Text in source language
+    * - `translate`: Transcribe and translate to English, `output`: English text
+    * - `verbatim`: Word-for-word transcription including filler words and repetitions, `output`: Verbatim text in source language
+    * - `translit`: Transcribe and transliterate to Roman script, `output`: Romanized text
+    * - `codemix`: Transcribe code-mixed speech (e.g., Hindi-English) naturally, `output`: Code-mixed text
+    */
+    mode?: z.infer<typeof SarvamProviderOptionsSchema.shape.mode>;
+    /**
+     * - Chunk-level timestamp support
+     * - Useful for subtitle alignment and audio navigation
+     * - Provides start and end times for each segment of text
+     */
     with_timestamps?: boolean;
     /**
      * Enables speaker diarization, which identifies and separates different speakers in the audio.
@@ -240,11 +311,19 @@ interface SarvamProvider {
     /**
     * Creates a model for text generation.
     */
-    (modelId: SarvamChatModelId, settings?: SarvamChatSettings): LanguageModelV1;
+    (
+    /**
+    * @description Sarvam-M (24B) is now a legacy model; we recommend migrating to Sarvam-30B or Sarvam-105B for improved performance.
+    */
+    modelId: SarvamChatModelId, settings?: SarvamChatSettings): LanguageModelV1;
     /**
     * Creates a Sarvam chat model for text generation.
     */
-    languageModel(modelId: SarvamChatModelId, settings?: SarvamChatSettings): LanguageModelV1;
+    languageModel(
+    /**
+    * @description Sarvam-M (24B) is now a legacy model; we recommend migrating to Sarvam-30B or Sarvam-105B for improved performance.
+    */
+    modelId: SarvamChatModelId, settings?: SarvamChatSettings): LanguageModelV1;
     /**
     * Creates a Sarvam model for transcription.
     */

package/dist/index.js CHANGED Viewed

@@ -767,30 +767,72 @@ var sarvamChatChunkSchema = import_zod2.z.union([
 // src/sarvam-speech-model.ts
 var import_provider_utils5 = require("@ai-sdk/provider-utils");
+var import_zod4 = require("zod");
 // src/sarvam-speech-settings.ts
 var import_zod3 = require("zod");
 var SpeakerSchema = import_zod3.z.enum([
-  "meera",
-  "pavithra",
-  "maitreyi",
-  "arvind",
-  "amol",
-  "amartya",
-  "diya",
-  "neel",
-  "misha",
-  "vian",
-  "arjun",
-  "maya",
-  "anushka",
+  // male bulbul:v2
   "abhilash",
+  "karun",
+  "hitesh",
+  // female bulbul:v2
+  "anushka",
   "manisha",
   "vidya",
   "arya",
-  "karun",
-  "hitesh"
-]).default("meera");
+  // male bulbul:v3
+  "shubh",
+  "aditya",
+  "rahul",
+  "rohan",
+  "amit",
+  "dev",
+  "ratan",
+  "varun",
+  "manan",
+  "sumit",
+  "kabir",
+  "aayan",
+  "ashutosh",
+  "advait",
+  "anand",
+  "tarun",
+  "sunny",
+  "mani",
+  "gokul",
+  "vijay",
+  "mohit",
+  "rehan",
+  "soham",
+  // female bulbul:v3
+  "ritu",
+  "priya",
+  "neha",
+  "pooja",
+  "simran",
+  "kavya",
+  "ishita",
+  "shreya",
+  "roopa",
+  "amelia",
+  "sophia",
+  "tanya",
+  "shruti",
+  "suhani",
+  "kavitha",
+  "rupali"
+]);
+var outputAudioCodecSchema = import_zod3.z.enum([
+  "mp3",
+  "linear16",
+  "mulaw",
+  "alaw",
+  "opus",
+  "flac",
+  "aac",
+  "wav"
+]);
 var SarvamProviderOptionsSchema = import_zod3.z.object({
   speaker: SpeakerSchema,
   pitch: import_zod3.z.number().min(-0.75).max(0.75).default(0),
@@ -802,11 +844,14 @@ var SarvamProviderOptionsSchema = import_zod3.z.object({
     import_zod3.z.literal(22050),
     import_zod3.z.literal(24e3)
   ]).default(22050),
-  enable_preprocessing: import_zod3.z.boolean().default(false)
+  enable_preprocessing: import_zod3.z.boolean().default(false),
+  output_audio_codec: outputAudioCodecSchema.optional(),
+  temperature: import_zod3.z.number().min(0.01).max(2).default(0.6),
+  dict_id: import_zod3.z.string().optional(),
+  enable_cached_responses: import_zod3.z.boolean().default(false)
 }).partial();
 // src/sarvam-speech-model.ts
-var import_zod4 = require("zod");
 var SarvamSpeechModel = class {
   constructor(modelId, languageCode, config) {
     this.modelId = modelId;
@@ -821,7 +866,7 @@ var SarvamSpeechModel = class {
     text,
     voice,
     outputFormat = "wav",
-    // speed,
+    speed,
     // instructions,
     providerOptions
   }) {
@@ -837,48 +882,42 @@ var SarvamSpeechModel = class {
       schema: SarvamProviderOptionsSchema
     });
     const getSpeaker = () => {
-      if (sarvamOptions == null ? void 0 : sarvamOptions.speaker) return sarvamOptions.speaker;
       if (voice) {
         return SpeakerSchema.parse(voice);
       }
       switch (this.modelId) {
-        case "bulbul:v1":
-          return "meera";
         case "bulbul:v2":
           return "manisha";
+        case "bulbul:v3":
+          return "shubh";
       }
-      return "meera";
+      return "shubh";
     };
     const requestBody = {
       model: this.modelId,
       text,
       target_language_code: this.languageCode,
-      speaker: getSpeaker()
-      // response_format: "wav",
-      // speed,
-      // instructions,
+      speaker: getSpeaker(),
+      pace: speed
     };
     if (outputFormat) {
-      if (["mp3", "opus", "aac", "flac", "wav", "pcm"].includes(
-        outputFormat
-      )) {
-        requestBody.response_format = outputFormat;
+      const of = outputAudioCodecSchema.safeParse(outputFormat);
+      if (of.success) {
+        requestBody.output_audio_codec = of.data;
       } else {
         warnings.push({
           type: "unsupported-setting",
           setting: "outputFormat",
-          details: `Unsupported output format: ${outputFormat}. Using mp3 instead.`
+          details: `Unsupported output format: ${outputFormat}. Using wav instead.`
         });
       }
     }
     if (sarvamOptions) {
-      const speechModelOptions = {};
-      for (const key in speechModelOptions) {
-        const value = speechModelOptions[key];
+      Object.entries(sarvamOptions).forEach(([key, value]) => {
         if (value !== void 0) {
           requestBody[key] = value;
         }
-      }
+      });
     }
     return {
       requestBody,
@@ -934,6 +973,7 @@ var import_zod6 = require("zod");
 // src/sarvam-transcription-settings.ts
 var import_zod5 = require("zod");
 var SarvamProviderOptionsSchema2 = import_zod5.z.object({
+  mode: import_zod5.z.enum(["transcribe", "translate", "verbatim", "translit", "codemix"]).default("transcribe"),
   with_timestamps: import_zod5.z.boolean().nullish().default(false),
   with_diarization: import_zod5.z.boolean().nullish().default(false),
   num_speakers: import_zod5.z.number().int().nullish()
@@ -956,10 +996,6 @@ var SarvamTranscriptionModel = class {
     providerOptions
   }) {
     const warnings = [];
-    if (this.modelId === "saarika:v1" && this.languageCode === "unknown")
-      throw new Error(
-        "Language code unknown is not supported for model saarika:v1"
-      );
     const sarvamOptions = (0, import_provider_utils6.parseProviderOptions)({
       provider: "sarvam",
       providerOptions: {
@@ -974,22 +1010,13 @@ var SarvamTranscriptionModel = class {
     const blob = audio instanceof Blob ? audio : new Blob([audio], { type: mediaType });
     formData.append("file", blob);
     formData.append("model", this.modelId);
+    formData.append("language_code", this.languageCode);
     if (sarvamOptions) {
-      formData.append("language_code", this.languageCode);
-      formData.append(
-        "with_timestamps",
-        sarvamOptions.with_timestamps ? "true" : "false"
-      );
-      formData.append(
-        "with_diarization",
-        sarvamOptions.with_diarization ? "true" : "false"
-      );
-      if (sarvamOptions.num_speakers !== null && sarvamOptions.num_speakers !== void 0) {
-        formData.append(
-          "num_speakers",
-          sarvamOptions.num_speakers.toString()
-        );
-      }
+      Object.entries(sarvamOptions).forEach(([key, value]) => {
+        if (value) {
+          formData.append(key, String(value));
+        }
+      });
     }
     return {
       formData,