PyPI - sarvamai - Versions diffs - 0.1.23a4__tar.gz → 0.1.23a5__tar.gz - Mend

sarvamai 0.1.23a4__tar.gz → 0.1.23a5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (236) hide show

{sarvamai-0.1.23a4 → sarvamai-0.1.23a5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sarvamai
-Version: 0.1.23a4
+Version: 0.1.23a5
 Summary:
 Requires-Python: >=3.8,<4.0
 Classifier: Intended Audience :: Developers

{sarvamai-0.1.23a4 → sarvamai-0.1.23a5}/pyproject.toml RENAMED Viewed

@@ -3,7 +3,7 @@ name = "sarvamai"
 [tool.poetry]
 name = "sarvamai"
-version = "0.1.23a4"
+version = "0.1.23a5"
 description = ""
 readme = "README.md"
 authors = []

{sarvamai-0.1.23a4 → sarvamai-0.1.23a5}/src/sarvamai/core/client_wrapper.py RENAMED Viewed

@@ -23,10 +23,10 @@ class BaseClientWrapper:
     def get_headers(self) -> typing.Dict[str, str]:
         headers: typing.Dict[str, str] = {
-            "User-Agent": "sarvamai/0.1.23a4",
+            "User-Agent": "sarvamai/0.1.23a5",
             "X-Fern-Language": "Python",
             "X-Fern-SDK-Name": "sarvamai",
-            "X-Fern-SDK-Version": "0.1.23a4",
+            "X-Fern-SDK-Version": "0.1.23a5",
             **(self.get_custom_headers() or {}),
         }
         headers["api-subscription-key"] = self.api_subscription_key

sarvamai-0.1.23a5/src/sarvamai/requests/speech_to_text_job_parameters.py ADDED Viewed

@@ -0,0 +1,73 @@
+# This file was auto-generated by Fern from our API Definition.
+import typing_extensions
+from ..types.mode import Mode
+from ..types.speech_to_text_model import SpeechToTextModel
+from ..types.speech_to_text_translate_language import SpeechToTextTranslateLanguage
+class SpeechToTextJobParametersParams(typing_extensions.TypedDict):
+    language_code: typing_extensions.NotRequired[SpeechToTextTranslateLanguage]
+    """
+    Specifies the language of the input audio in BCP-47 format.
+    **Available Options:**
+    - `unknown` (default): Use when the language is not known; the API will auto-detect.
+    - `hi-IN`: Hindi
+    - `bn-IN`: Bengali
+    - `kn-IN`: Kannada
+    - `ml-IN`: Malayalam
+    - `mr-IN`: Marathi
+    - `od-IN`: Odia
+    - `pa-IN`: Punjabi
+    - `ta-IN`: Tamil
+    - `te-IN`: Telugu
+    - `en-IN`: English
+    - `gu-IN`: Gujarati
+    """
+    model: typing_extensions.NotRequired[SpeechToTextModel]
+    """
+    Model to be used for speech to text.
+    - **saarika:v2.5** (default): Transcribes audio in the spoken language.
+    - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
+    """
+    mode: typing_extensions.NotRequired[Mode]
+    """
+    Mode of operation. **Only applicable when using saaras:v3 model.**
+    Example audio: 'मेरा फोन नंबर है 9840950950'
+    - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
+      - Output: `मेरा फोन नंबर है 9840950950`
+    - **translate**: Translates speech from any supported Indic language to English.
+      - Output: `My phone number is 9840950950`
+    - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
+      - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
+    - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
+      - Output: `mera phone number hai 9840950950`
+    - **codemix**: Code-mixed text with English words in English and Indic words in native script.
+      - Output: `मेरा phone number है 9840950950`
+    """
+    with_timestamps: typing_extensions.NotRequired[bool]
+    """
+    Whether to include timestamps in the response
+    """
+    with_diarization: typing_extensions.NotRequired[bool]
+    """
+    Enables speaker diarization, which identifies and separates different speakers in the audio. In beta mode
+    """
+    num_speakers: typing_extensions.NotRequired[int]
+    """
+    Number of speakers to be detected in the audio. This is used when with_diarization is true.
+    """

{sarvamai-0.1.23a4 → sarvamai-0.1.23a5}/src/sarvamai/requests/speech_to_text_translate_job_parameters.py RENAMED Viewed

@@ -12,7 +12,10 @@ class SpeechToTextTranslateJobParametersParams(typing_extensions.TypedDict):
     model: typing_extensions.NotRequired[SpeechToTextTranslateModel]
     """
-    Model to be used for converting speech to text in target language
+    Model to be used for speech to text translation.
+    - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
+      - Example: Hindi audio → English text output
     """
     with_diarization: typing_extensions.NotRequired[bool]

{sarvamai-0.1.23a4 → sarvamai-0.1.23a5}/src/sarvamai/speech_to_text/client.py RENAMED Viewed

@@ -65,23 +65,49 @@ class SpeechToTextClient:
         model : typing.Optional[SpeechToTextModel]
             Specifies the model to use for speech-to-text conversion.
-            - **saarika:v2.5** (default): Standard transcription model
-            - **saarika:v3**: Advanced transcription model
-            - **saaras:v3**: Advanced model with multiple output modes
+            - **saarika:v2.5** (default): Transcribes audio in the spoken language.
+            - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
         mode : typing.Optional[Mode]
             Mode of operation. **Only applicable when using saaras:v3 model.**
-            - **transcribe** (default): Standard transcription
-            - **translate**: Translation to English
-            - **indic-en**: Indic to English translation
-            - **verbatim**: Exact transcription
-            - **translit**: Transliteration to Latin script
-            - **codemix**: Code-mixed output
+            Example audio: 'मेरा फोन नंबर है 9840950950'
+            - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
+              - Output: `मेरा फोन नंबर है 9840950950`
+            - **translate**: Translates speech from any supported Indic language to English.
+              - Output: `My phone number is 9840950950`
+            - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
+              - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
+            - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
+              - Output: `mera phone number hai 9840950950`
+            - **codemix**: Code-mixed text with English words in English and Indic words in native script.
+              - Output: `मेरा phone number है 9840950950`
         language_code : typing.Optional[SpeechToTextLanguage]
-            Specifies the language of the input audio.
-             For the `saarika:v2.5` model, it is optional.
-            `unknown`: Use this when the language is not known; the API will detect it automatically.
+            Specifies the language of the input audio in BCP-47 format.
+            **Note:** This parameter is optional for `saarika:v2.5` model.
+            **Available Options:**
+            - `unknown`: Use when the language is not known; the API will auto-detect.
+            - `hi-IN`: Hindi
+            - `bn-IN`: Bengali
+            - `kn-IN`: Kannada
+            - `ml-IN`: Malayalam
+            - `mr-IN`: Marathi
+            - `od-IN`: Odia
+            - `pa-IN`: Punjabi
+            - `ta-IN`: Tamil
+            - `te-IN`: Telugu
+            - `en-IN`: English
+            - `gu-IN`: Gujarati
         input_audio_codec : typing.Optional[InputAudioCodec]
             Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.
@@ -146,7 +172,10 @@ class SpeechToTextClient:
             Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.
         model : typing.Optional[SpeechToTextTranslateModel]
-            Model to be used for converting speech to text in target language
+            Model to be used for speech to text translation.
+            - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
+              - Example: Hindi audio → English text output
         input_audio_codec : typing.Optional[InputAudioCodec]
             Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.
@@ -221,23 +250,49 @@ class AsyncSpeechToTextClient:
         model : typing.Optional[SpeechToTextModel]
             Specifies the model to use for speech-to-text conversion.
-            - **saarika:v2.5** (default): Standard transcription model
-            - **saarika:v3**: Advanced transcription model
-            - **saaras:v3**: Advanced model with multiple output modes
+            - **saarika:v2.5** (default): Transcribes audio in the spoken language.
+            - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
         mode : typing.Optional[Mode]
             Mode of operation. **Only applicable when using saaras:v3 model.**
-            - **transcribe** (default): Standard transcription
-            - **translate**: Translation to English
-            - **indic-en**: Indic to English translation
-            - **verbatim**: Exact transcription
-            - **translit**: Transliteration to Latin script
-            - **codemix**: Code-mixed output
+            Example audio: 'मेरा फोन नंबर है 9840950950'
+            - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
+              - Output: `मेरा फोन नंबर है 9840950950`
+            - **translate**: Translates speech from any supported Indic language to English.
+              - Output: `My phone number is 9840950950`
+            - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
+              - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
+            - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
+              - Output: `mera phone number hai 9840950950`
+            - **codemix**: Code-mixed text with English words in English and Indic words in native script.
+              - Output: `मेरा phone number है 9840950950`
         language_code : typing.Optional[SpeechToTextLanguage]
-            Specifies the language of the input audio.
-             For the `saarika:v2.5` model, it is optional.
-            `unknown`: Use this when the language is not known; the API will detect it automatically.
+            Specifies the language of the input audio in BCP-47 format.
+            **Note:** This parameter is optional for `saarika:v2.5` model.
+            **Available Options:**
+            - `unknown`: Use when the language is not known; the API will auto-detect.
+            - `hi-IN`: Hindi
+            - `bn-IN`: Bengali
+            - `kn-IN`: Kannada
+            - `ml-IN`: Malayalam
+            - `mr-IN`: Marathi
+            - `od-IN`: Odia
+            - `pa-IN`: Punjabi
+            - `ta-IN`: Tamil
+            - `te-IN`: Telugu
+            - `en-IN`: English
+            - `gu-IN`: Gujarati
         input_audio_codec : typing.Optional[InputAudioCodec]
             Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.
@@ -310,7 +365,10 @@ class AsyncSpeechToTextClient:
             Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.
         model : typing.Optional[SpeechToTextTranslateModel]
-            Model to be used for converting speech to text in target language
+            Model to be used for speech to text translation.
+            - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
+              - Example: Hindi audio → English text output
         input_audio_codec : typing.Optional[InputAudioCodec]
             Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.

{sarvamai-0.1.23a4 → sarvamai-0.1.23a5}/src/sarvamai/speech_to_text/raw_client.py RENAMED Viewed

@@ -63,23 +63,49 @@ class RawSpeechToTextClient:
         model : typing.Optional[SpeechToTextModel]
             Specifies the model to use for speech-to-text conversion.
-            - **saarika:v2.5** (default): Standard transcription model
-            - **saarika:v3**: Advanced transcription model
-            - **saaras:v3**: Advanced model with multiple output modes
+            - **saarika:v2.5** (default): Transcribes audio in the spoken language.
+            - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
         mode : typing.Optional[Mode]
             Mode of operation. **Only applicable when using saaras:v3 model.**
-            - **transcribe** (default): Standard transcription
-            - **translate**: Translation to English
-            - **indic-en**: Indic to English translation
-            - **verbatim**: Exact transcription
-            - **translit**: Transliteration to Latin script
-            - **codemix**: Code-mixed output
+            Example audio: 'मेरा फोन नंबर है 9840950950'
+            - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
+              - Output: `मेरा फोन नंबर है 9840950950`
+            - **translate**: Translates speech from any supported Indic language to English.
+              - Output: `My phone number is 9840950950`
+            - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
+              - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
+            - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
+              - Output: `mera phone number hai 9840950950`
+            - **codemix**: Code-mixed text with English words in English and Indic words in native script.
+              - Output: `मेरा phone number है 9840950950`
         language_code : typing.Optional[SpeechToTextLanguage]
-            Specifies the language of the input audio.
-             For the `saarika:v2.5` model, it is optional.
-            `unknown`: Use this when the language is not known; the API will detect it automatically.
+            Specifies the language of the input audio in BCP-47 format.
+            **Note:** This parameter is optional for `saarika:v2.5` model.
+            **Available Options:**
+            - `unknown`: Use when the language is not known; the API will auto-detect.
+            - `hi-IN`: Hindi
+            - `bn-IN`: Bengali
+            - `kn-IN`: Kannada
+            - `ml-IN`: Malayalam
+            - `mr-IN`: Marathi
+            - `od-IN`: Odia
+            - `pa-IN`: Punjabi
+            - `ta-IN`: Tamil
+            - `te-IN`: Telugu
+            - `en-IN`: English
+            - `gu-IN`: Gujarati
         input_audio_codec : typing.Optional[InputAudioCodec]
             Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.
@@ -223,7 +249,10 @@ class RawSpeechToTextClient:
             Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.
         model : typing.Optional[SpeechToTextTranslateModel]
-            Model to be used for converting speech to text in target language
+            Model to be used for speech to text translation.
+            - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
+              - Example: Hindi audio → English text output
         input_audio_codec : typing.Optional[InputAudioCodec]
             Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.
@@ -370,23 +399,49 @@ class AsyncRawSpeechToTextClient:
         model : typing.Optional[SpeechToTextModel]
             Specifies the model to use for speech-to-text conversion.
-            - **saarika:v2.5** (default): Standard transcription model
-            - **saarika:v3**: Advanced transcription model
-            - **saaras:v3**: Advanced model with multiple output modes
+            - **saarika:v2.5** (default): Transcribes audio in the spoken language.
+            - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
         mode : typing.Optional[Mode]
             Mode of operation. **Only applicable when using saaras:v3 model.**
-            - **transcribe** (default): Standard transcription
-            - **translate**: Translation to English
-            - **indic-en**: Indic to English translation
-            - **verbatim**: Exact transcription
-            - **translit**: Transliteration to Latin script
-            - **codemix**: Code-mixed output
+            Example audio: 'मेरा फोन नंबर है 9840950950'
+            - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
+              - Output: `मेरा फोन नंबर है 9840950950`
+            - **translate**: Translates speech from any supported Indic language to English.
+              - Output: `My phone number is 9840950950`
+            - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
+              - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
+            - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
+              - Output: `mera phone number hai 9840950950`
+            - **codemix**: Code-mixed text with English words in English and Indic words in native script.
+              - Output: `मेरा phone number है 9840950950`
         language_code : typing.Optional[SpeechToTextLanguage]
-            Specifies the language of the input audio.
-             For the `saarika:v2.5` model, it is optional.
-            `unknown`: Use this when the language is not known; the API will detect it automatically.
+            Specifies the language of the input audio in BCP-47 format.
+            **Note:** This parameter is optional for `saarika:v2.5` model.
+            **Available Options:**
+            - `unknown`: Use when the language is not known; the API will auto-detect.
+            - `hi-IN`: Hindi
+            - `bn-IN`: Bengali
+            - `kn-IN`: Kannada
+            - `ml-IN`: Malayalam
+            - `mr-IN`: Marathi
+            - `od-IN`: Odia
+            - `pa-IN`: Punjabi
+            - `ta-IN`: Tamil
+            - `te-IN`: Telugu
+            - `en-IN`: English
+            - `gu-IN`: Gujarati
         input_audio_codec : typing.Optional[InputAudioCodec]
             Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.
@@ -530,7 +585,10 @@ class AsyncRawSpeechToTextClient:
             Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.
         model : typing.Optional[SpeechToTextTranslateModel]
-            Model to be used for converting speech to text in target language
+            Model to be used for speech to text translation.
+            - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
+              - Example: Hindi audio → English text output
         input_audio_codec : typing.Optional[InputAudioCodec]
             Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.

{sarvamai-0.1.23a4 → sarvamai-0.1.23a5}/src/sarvamai/speech_to_text_streaming/client.py RENAMED Viewed

@@ -63,19 +63,47 @@ class SpeechToTextStreamingClient:
         Parameters
         ----------
         language_code : SpeechToTextStreamingLanguageCode
-            Language code for speech recognition (BCP-47 format)
+            Specifies the language of the input audio in BCP-47 format.
+            **Available Options:**
+            - `hi-IN`: Hindi
+            - `bn-IN`: Bengali
+            - `gu-IN`: Gujarati
+            - `kn-IN`: Kannada
+            - `ml-IN`: Malayalam
+            - `mr-IN`: Marathi
+            - `od-IN`: Odia
+            - `pa-IN`: Punjabi
+            - `ta-IN`: Tamil
+            - `te-IN`: Telugu
+            - `en-IN`: English
         model : typing.Optional[SpeechToTextStreamingModel]
-            Speech to text model to use
+            Specifies the model to use for speech-to-text conversion.
+            - **saarika:v2.5** (default): Transcribes audio in the spoken language.
+            - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
         mode : typing.Optional[SpeechToTextStreamingMode]
-            Mode of operation for saaras:v3 model. Only applicable when model is 'saaras:v3'.
-            - transcribe: Standard Whisper transcription
-            - translate: Standard Whisper translation to English
-            - indic-en: Translate Indic languages to English
-            - verbatim: Exact transcription in original script
-            - translit: Transliteration to Latin script
-            - codemix: Code-mixed output (native + English)
+            Mode of operation. **Only applicable when using saaras:v3 model.**
+            Example audio: 'मेरा फोन नंबर है 9840950950'
+            - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
+              - Output: `मेरा फोन नंबर है 9840950950`
+            - **translate**: Translates speech from any supported Indic language to English.
+              - Output: `My phone number is 9840950950`
+            - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
+              - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
+            - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
+              - Output: `mera phone number hai 9840950950`
+            - **codemix**: Code-mixed text with English words in English and Indic words in native script.
+              - Output: `मेरा phone number है 9840950950`
         sample_rate : typing.Optional[str]
             Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
@@ -178,19 +206,47 @@ class AsyncSpeechToTextStreamingClient:
         Parameters
         ----------
         language_code : SpeechToTextStreamingLanguageCode
-            Language code for speech recognition (BCP-47 format)
+            Specifies the language of the input audio in BCP-47 format.
+            **Available Options:**
+            - `hi-IN`: Hindi
+            - `bn-IN`: Bengali
+            - `gu-IN`: Gujarati
+            - `kn-IN`: Kannada
+            - `ml-IN`: Malayalam
+            - `mr-IN`: Marathi
+            - `od-IN`: Odia
+            - `pa-IN`: Punjabi
+            - `ta-IN`: Tamil
+            - `te-IN`: Telugu
+            - `en-IN`: English
         model : typing.Optional[SpeechToTextStreamingModel]
-            Speech to text model to use
+            Specifies the model to use for speech-to-text conversion.
+            - **saarika:v2.5** (default): Transcribes audio in the spoken language.
+            - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
         mode : typing.Optional[SpeechToTextStreamingMode]
-            Mode of operation for saaras:v3 model. Only applicable when model is 'saaras:v3'.
-            - transcribe: Standard Whisper transcription
-            - translate: Standard Whisper translation to English
-            - indic-en: Translate Indic languages to English
-            - verbatim: Exact transcription in original script
-            - translit: Transliteration to Latin script
-            - codemix: Code-mixed output (native + English)
+            Mode of operation. **Only applicable when using saaras:v3 model.**
+            Example audio: 'मेरा फोन नंबर है 9840950950'
+            - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
+              - Output: `मेरा फोन नंबर है 9840950950`
+            - **translate**: Translates speech from any supported Indic language to English.
+              - Output: `My phone number is 9840950950`
+            - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
+              - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
+            - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
+              - Output: `mera phone number hai 9840950950`
+            - **codemix**: Code-mixed text with English words in English and Indic words in native script.
+              - Output: `मेरा phone number है 9840950950`
         sample_rate : typing.Optional[str]
             Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.

{sarvamai-0.1.23a4 → sarvamai-0.1.23a5}/src/sarvamai/speech_to_text_streaming/raw_client.py RENAMED Viewed

@@ -51,19 +51,47 @@ class RawSpeechToTextStreamingClient:
         Parameters
         ----------
         language_code : SpeechToTextStreamingLanguageCode
-            Language code for speech recognition (BCP-47 format)
+            Specifies the language of the input audio in BCP-47 format.
+            **Available Options:**
+            - `hi-IN`: Hindi
+            - `bn-IN`: Bengali
+            - `gu-IN`: Gujarati
+            - `kn-IN`: Kannada
+            - `ml-IN`: Malayalam
+            - `mr-IN`: Marathi
+            - `od-IN`: Odia
+            - `pa-IN`: Punjabi
+            - `ta-IN`: Tamil
+            - `te-IN`: Telugu
+            - `en-IN`: English
         model : typing.Optional[SpeechToTextStreamingModel]
-            Speech to text model to use
+            Specifies the model to use for speech-to-text conversion.
+            - **saarika:v2.5** (default): Transcribes audio in the spoken language.
+            - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
         mode : typing.Optional[SpeechToTextStreamingMode]
-            Mode of operation for saaras:v3 model. Only applicable when model is 'saaras:v3'.
-            - transcribe: Standard Whisper transcription
-            - translate: Standard Whisper translation to English
-            - indic-en: Translate Indic languages to English
-            - verbatim: Exact transcription in original script
-            - translit: Transliteration to Latin script
-            - codemix: Code-mixed output (native + English)
+            Mode of operation. **Only applicable when using saaras:v3 model.**
+            Example audio: 'मेरा फोन नंबर है 9840950950'
+            - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
+              - Output: `मेरा फोन नंबर है 9840950950`
+            - **translate**: Translates speech from any supported Indic language to English.
+              - Output: `My phone number is 9840950950`
+            - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
+              - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
+            - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
+              - Output: `mera phone number hai 9840950950`
+            - **codemix**: Code-mixed text with English words in English and Indic words in native script.
+              - Output: `मेरा phone number है 9840950950`
         sample_rate : typing.Optional[str]
             Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
@@ -155,19 +183,47 @@ class AsyncRawSpeechToTextStreamingClient:
         Parameters
         ----------
         language_code : SpeechToTextStreamingLanguageCode
-            Language code for speech recognition (BCP-47 format)
+            Specifies the language of the input audio in BCP-47 format.
+            **Available Options:**
+            - `hi-IN`: Hindi
+            - `bn-IN`: Bengali
+            - `gu-IN`: Gujarati
+            - `kn-IN`: Kannada
+            - `ml-IN`: Malayalam
+            - `mr-IN`: Marathi
+            - `od-IN`: Odia
+            - `pa-IN`: Punjabi
+            - `ta-IN`: Tamil
+            - `te-IN`: Telugu
+            - `en-IN`: English
         model : typing.Optional[SpeechToTextStreamingModel]
-            Speech to text model to use
+            Specifies the model to use for speech-to-text conversion.
+            - **saarika:v2.5** (default): Transcribes audio in the spoken language.
+            - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
         mode : typing.Optional[SpeechToTextStreamingMode]
-            Mode of operation for saaras:v3 model. Only applicable when model is 'saaras:v3'.
-            - transcribe: Standard Whisper transcription
-            - translate: Standard Whisper translation to English
-            - indic-en: Translate Indic languages to English
-            - verbatim: Exact transcription in original script
-            - translit: Transliteration to Latin script
-            - codemix: Code-mixed output (native + English)
+            Mode of operation. **Only applicable when using saaras:v3 model.**
+            Example audio: 'मेरा फोन नंबर है 9840950950'
+            - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
+              - Output: `मेरा फोन नंबर है 9840950950`
+            - **translate**: Translates speech from any supported Indic language to English.
+              - Output: `My phone number is 9840950950`
+            - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
+              - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
+            - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
+              - Output: `mera phone number hai 9840950950`
+            - **codemix**: Code-mixed text with English words in English and Indic words in native script.
+              - Output: `मेरा phone number है 9840950950`
         sample_rate : typing.Optional[str]
             Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.

sarvamai 0.1.23a4__tar.gz → 0.1.23a5__tar.gz

sarvamai 0.1.23a4__tar.gz → 0.1.23a5__tar.gz