PyPI - sarvamai - Versions diffs - 0.1.23a3__py3-none-any.whl → 0.1.23a5__py3-none-any.whl - Mend

sarvamai: sarvamai-0.1.23a3-py3-none-any.whl → sarvamai-0.1.23a5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (71)

sarvamai/__init__.py +203 -405
sarvamai/chat/raw_client.py +20 -20
sarvamai/client.py +34 -186
sarvamai/core/__init__.py +21 -76
sarvamai/core/client_wrapper.py +3 -19
sarvamai/core/force_multipart.py +2 -4
sarvamai/core/http_client.py +97 -217
sarvamai/core/http_response.py +1 -1
sarvamai/core/jsonable_encoder.py +0 -8
sarvamai/core/pydantic_utilities.py +4 -110
sarvamai/errors/__init__.py +6 -40
sarvamai/errors/bad_request_error.py +1 -1
sarvamai/errors/forbidden_error.py +1 -1
sarvamai/errors/internal_server_error.py +1 -1
sarvamai/errors/service_unavailable_error.py +1 -1
sarvamai/errors/too_many_requests_error.py +1 -1
sarvamai/errors/unprocessable_entity_error.py +1 -1
sarvamai/requests/__init__.py +62 -150
sarvamai/requests/configure_connection.py +4 -0
sarvamai/requests/configure_connection_data.py +40 -11
sarvamai/requests/error_response_data.py +1 -1
sarvamai/requests/file_signed_url_details.py +1 -1
sarvamai/requests/speech_to_text_job_parameters.py +43 -2
sarvamai/requests/speech_to_text_transcription_data.py +2 -2
sarvamai/requests/speech_to_text_translate_job_parameters.py +4 -1
sarvamai/speech_to_text/client.py +95 -10
sarvamai/speech_to_text/raw_client.py +147 -64
sarvamai/speech_to_text_job/client.py +60 -15
sarvamai/speech_to_text_job/raw_client.py +120 -120
sarvamai/speech_to_text_streaming/__init__.py +10 -38
sarvamai/speech_to_text_streaming/client.py +90 -8
sarvamai/speech_to_text_streaming/raw_client.py +90 -8
sarvamai/speech_to_text_streaming/types/__init__.py +8 -36
sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_mode.py +7 -0
sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_model.py +5 -0
sarvamai/speech_to_text_translate_job/raw_client.py +120 -120
sarvamai/speech_to_text_translate_streaming/__init__.py +5 -36
sarvamai/speech_to_text_translate_streaming/client.py +8 -2
sarvamai/speech_to_text_translate_streaming/raw_client.py +8 -2
sarvamai/speech_to_text_translate_streaming/types/__init__.py +3 -36
sarvamai/text/raw_client.py +60 -60
sarvamai/text_to_speech/client.py +100 -16
sarvamai/text_to_speech/raw_client.py +120 -36
sarvamai/text_to_speech_streaming/__init__.py +2 -29
sarvamai/text_to_speech_streaming/client.py +19 -6
sarvamai/text_to_speech_streaming/raw_client.py +19 -6
sarvamai/text_to_speech_streaming/types/__init__.py +3 -31
sarvamai/text_to_speech_streaming/types/text_to_speech_streaming_model.py +5 -0
sarvamai/types/__init__.py +102 -222
sarvamai/types/chat_completion_request_message.py +2 -6
sarvamai/types/configure_connection.py +4 -0
sarvamai/types/configure_connection_data.py +40 -11
sarvamai/types/configure_connection_data_model.py +5 -0
sarvamai/types/configure_connection_data_speaker.py +35 -1
sarvamai/types/error_response_data.py +1 -1
sarvamai/types/file_signed_url_details.py +1 -1
sarvamai/types/mode.py +5 -0
sarvamai/types/speech_to_text_job_parameters.py +43 -2
sarvamai/types/speech_to_text_model.py +1 -1
sarvamai/types/speech_to_text_transcription_data.py +2 -2
sarvamai/types/speech_to_text_translate_job_parameters.py +4 -1
sarvamai/types/text_to_speech_model.py +1 -1
sarvamai/types/text_to_speech_speaker.py +35 -1
{sarvamai-0.1.23a3.dist-info → sarvamai-0.1.23a5.dist-info}/METADATA +1 -2
{sarvamai-0.1.23a3.dist-info → sarvamai-0.1.23a5.dist-info}/RECORD +66 -66
sarvamai/core/http_sse/__init__.py +0 -42
sarvamai/core/http_sse/_api.py +0 -112
sarvamai/core/http_sse/_decoders.py +0 -61
sarvamai/core/http_sse/_exceptions.py +0 -7
sarvamai/core/http_sse/_models.py +0 -17
{sarvamai-0.1.23a3.dist-info → sarvamai-0.1.23a5.dist-info}/WHEEL +0 -0

sarvamai/speech_to_text/client.py CHANGED Viewed

@@ -6,6 +6,7 @@ from .. import core
 from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
 from ..core.request_options import RequestOptions
 from ..types.input_audio_codec import InputAudioCodec
+from ..types.mode import Mode
 from ..types.speech_to_text_language import SpeechToTextLanguage
 from ..types.speech_to_text_model import SpeechToTextModel
 from ..types.speech_to_text_response import SpeechToTextResponse
@@ -37,6 +38,7 @@ class SpeechToTextClient:
         *,
         file: core.File,
         model: typing.Optional[SpeechToTextModel] = OMIT,
+        mode: typing.Optional[Mode] = OMIT,
         language_code: typing.Optional[SpeechToTextLanguage] = OMIT,
         input_audio_codec: typing.Optional[InputAudioCodec] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
@@ -63,12 +65,49 @@ class SpeechToTextClient:
         model : typing.Optional[SpeechToTextModel]
             Specifies the model to use for speech-to-text conversion.
-            Note:- Default model is `saarika:v2.5`
+            - **saarika:v2.5** (default): Transcribes audio in the spoken language.
+            - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
+        mode : typing.Optional[Mode]
+            Mode of operation. **Only applicable when using saaras:v3 model.**
+            Example audio: 'मेरा फोन नंबर है 9840950950'
+            - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
+              - Output: `मेरा फोन नंबर है 9840950950`
+            - **translate**: Translates speech from any supported Indic language to English.
+              - Output: `My phone number is 9840950950`
+            - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
+              - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
+            - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
+              - Output: `mera phone number hai 9840950950`
+            - **codemix**: Code-mixed text with English words in English and Indic words in native script.
+              - Output: `मेरा phone number है 9840950950`
         language_code : typing.Optional[SpeechToTextLanguage]
-            Specifies the language of the input audio.
-             For the `saarika:v2.5` model, it is optional.
-            `unknown`: Use this when the language is not known; the API will detect it automatically.
+            Specifies the language of the input audio in BCP-47 format.
+            **Note:** This parameter is optional for `saarika:v2.5` model.
+            **Available Options:**
+            - `unknown`: Use when the language is not known; the API will auto-detect.
+            - `hi-IN`: Hindi
+            - `bn-IN`: Bengali
+            - `kn-IN`: Kannada
+            - `ml-IN`: Malayalam
+            - `mr-IN`: Marathi
+            - `od-IN`: Odia
+            - `pa-IN`: Punjabi
+            - `ta-IN`: Tamil
+            - `te-IN`: Telugu
+            - `en-IN`: English
+            - `gu-IN`: Gujarati
         input_audio_codec : typing.Optional[InputAudioCodec]
             Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.
@@ -93,6 +132,7 @@ class SpeechToTextClient:
         _response = self._raw_client.transcribe(
             file=file,
             model=model,
+            mode=mode,
             language_code=language_code,
             input_audio_codec=input_audio_codec,
             request_options=request_options,
@@ -132,7 +172,10 @@ class SpeechToTextClient:
             Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.
         model : typing.Optional[SpeechToTextTranslateModel]
-            Model to be used for converting speech to text in target language
+            Model to be used for speech to text translation.
+            - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
+              - Example: Hindi audio → English text output
         input_audio_codec : typing.Optional[InputAudioCodec]
             Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.
@@ -180,6 +223,7 @@ class AsyncSpeechToTextClient:
         *,
         file: core.File,
         model: typing.Optional[SpeechToTextModel] = OMIT,
+        mode: typing.Optional[Mode] = OMIT,
         language_code: typing.Optional[SpeechToTextLanguage] = OMIT,
         input_audio_codec: typing.Optional[InputAudioCodec] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
@@ -206,12 +250,49 @@ class AsyncSpeechToTextClient:
         model : typing.Optional[SpeechToTextModel]
             Specifies the model to use for speech-to-text conversion.
-            Note:- Default model is `saarika:v2.5`
+            - **saarika:v2.5** (default): Transcribes audio in the spoken language.
+            - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
+        mode : typing.Optional[Mode]
+            Mode of operation. **Only applicable when using saaras:v3 model.**
+            Example audio: 'मेरा फोन नंबर है 9840950950'
+            - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
+              - Output: `मेरा फोन नंबर है 9840950950`
+            - **translate**: Translates speech from any supported Indic language to English.
+              - Output: `My phone number is 9840950950`
+            - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
+              - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
+            - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
+              - Output: `mera phone number hai 9840950950`
+            - **codemix**: Code-mixed text with English words in English and Indic words in native script.
+              - Output: `मेरा phone number है 9840950950`
         language_code : typing.Optional[SpeechToTextLanguage]
-            Specifies the language of the input audio.
-             For the `saarika:v2.5` model, it is optional.
-            `unknown`: Use this when the language is not known; the API will detect it automatically.
+            Specifies the language of the input audio in BCP-47 format.
+            **Note:** This parameter is optional for `saarika:v2.5` model.
+            **Available Options:**
+            - `unknown`: Use when the language is not known; the API will auto-detect.
+            - `hi-IN`: Hindi
+            - `bn-IN`: Bengali
+            - `kn-IN`: Kannada
+            - `ml-IN`: Malayalam
+            - `mr-IN`: Marathi
+            - `od-IN`: Odia
+            - `pa-IN`: Punjabi
+            - `ta-IN`: Tamil
+            - `te-IN`: Telugu
+            - `en-IN`: English
+            - `gu-IN`: Gujarati
         input_audio_codec : typing.Optional[InputAudioCodec]
             Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.
@@ -244,6 +325,7 @@ class AsyncSpeechToTextClient:
         _response = await self._raw_client.transcribe(
             file=file,
             model=model,
+            mode=mode,
             language_code=language_code,
             input_audio_codec=input_audio_codec,
             request_options=request_options,
@@ -283,7 +365,10 @@ class AsyncSpeechToTextClient:
             Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.
         model : typing.Optional[SpeechToTextTranslateModel]
-            Model to be used for converting speech to text in target language
+            Model to be used for speech to text translation.
+            - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
+              - Example: Hindi audio → English text output
         input_audio_codec : typing.Optional[InputAudioCodec]
             Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.

sarvamai/speech_to_text/raw_client.py CHANGED Viewed

@@ -1,6 +1,5 @@
 # This file was auto-generated by Fern from our API Definition.
-import json
 import typing
 from json.decoder import JSONDecodeError
@@ -8,7 +7,6 @@ from .. import core
 from ..core.api_error import ApiError
 from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
 from ..core.http_response import AsyncHttpResponse, HttpResponse
-from ..core.jsonable_encoder import jsonable_encoder
 from ..core.pydantic_utilities import parse_obj_as
 from ..core.request_options import RequestOptions
 from ..errors.bad_request_error import BadRequestError
@@ -18,6 +16,7 @@ from ..errors.service_unavailable_error import ServiceUnavailableError
 from ..errors.too_many_requests_error import TooManyRequestsError
 from ..errors.unprocessable_entity_error import UnprocessableEntityError
 from ..types.input_audio_codec import InputAudioCodec
+from ..types.mode import Mode
 from ..types.speech_to_text_language import SpeechToTextLanguage
 from ..types.speech_to_text_model import SpeechToTextModel
 from ..types.speech_to_text_response import SpeechToTextResponse
@@ -37,6 +36,7 @@ class RawSpeechToTextClient:
         *,
         file: core.File,
         model: typing.Optional[SpeechToTextModel] = OMIT,
+        mode: typing.Optional[Mode] = OMIT,
         language_code: typing.Optional[SpeechToTextLanguage] = OMIT,
         input_audio_codec: typing.Optional[InputAudioCodec] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
@@ -63,12 +63,49 @@ class RawSpeechToTextClient:
         model : typing.Optional[SpeechToTextModel]
             Specifies the model to use for speech-to-text conversion.
-            Note:- Default model is `saarika:v2.5`
+            - **saarika:v2.5** (default): Transcribes audio in the spoken language.
+            - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
+        mode : typing.Optional[Mode]
+            Mode of operation. **Only applicable when using saaras:v3 model.**
+            Example audio: 'मेरा फोन नंबर है 9840950950'
+            - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
+              - Output: `मेरा फोन नंबर है 9840950950`
+            - **translate**: Translates speech from any supported Indic language to English.
+              - Output: `My phone number is 9840950950`
+            - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
+              - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
+            - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
+              - Output: `mera phone number hai 9840950950`
+            - **codemix**: Code-mixed text with English words in English and Indic words in native script.
+              - Output: `मेरा phone number है 9840950950`
         language_code : typing.Optional[SpeechToTextLanguage]
-            Specifies the language of the input audio.
-             For the `saarika:v2.5` model, it is optional.
-            `unknown`: Use this when the language is not known; the API will detect it automatically.
+            Specifies the language of the input audio in BCP-47 format.
+            **Note:** This parameter is optional for `saarika:v2.5` model.
+            **Available Options:**
+            - `unknown`: Use when the language is not known; the API will auto-detect.
+            - `hi-IN`: Hindi
+            - `bn-IN`: Bengali
+            - `kn-IN`: Kannada
+            - `ml-IN`: Malayalam
+            - `mr-IN`: Marathi
+            - `od-IN`: Odia
+            - `pa-IN`: Punjabi
+            - `ta-IN`: Tamil
+            - `te-IN`: Telugu
+            - `en-IN`: English
+            - `gu-IN`: Gujarati
         input_audio_codec : typing.Optional[InputAudioCodec]
             Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.
@@ -86,7 +123,8 @@ class RawSpeechToTextClient:
             base_url=self._client_wrapper.get_environment().base,
             method="POST",
             data={
-                "model": json.dumps(jsonable_encoder(model)),
+                "model": model,
+                "mode": mode,
                 "language_code": language_code,
                 "input_audio_codec": input_audio_codec,
             },
@@ -111,9 +149,9 @@ class RawSpeechToTextClient:
                 raise BadRequestError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),
@@ -122,9 +160,9 @@ class RawSpeechToTextClient:
                 raise ForbiddenError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),
@@ -133,9 +171,9 @@ class RawSpeechToTextClient:
                 raise UnprocessableEntityError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),
@@ -144,9 +182,9 @@ class RawSpeechToTextClient:
                 raise TooManyRequestsError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),
@@ -155,9 +193,9 @@ class RawSpeechToTextClient:
                 raise InternalServerError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),
@@ -166,9 +204,9 @@ class RawSpeechToTextClient:
                 raise ServiceUnavailableError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),
@@ -211,7 +249,10 @@ class RawSpeechToTextClient:
             Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.
         model : typing.Optional[SpeechToTextTranslateModel]
-            Model to be used for converting speech to text in target language
+            Model to be used for speech to text translation.
+            - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
+              - Example: Hindi audio → English text output
         input_audio_codec : typing.Optional[InputAudioCodec]
             Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.
@@ -230,7 +271,7 @@ class RawSpeechToTextClient:
             method="POST",
             data={
                 "prompt": prompt,
-                "model": json.dumps(jsonable_encoder(model)),
+                "model": model,
                 "input_audio_codec": input_audio_codec,
             },
             files={
@@ -254,9 +295,9 @@ class RawSpeechToTextClient:
                 raise BadRequestError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),
@@ -265,9 +306,9 @@ class RawSpeechToTextClient:
                 raise ForbiddenError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),
@@ -276,9 +317,9 @@ class RawSpeechToTextClient:
                 raise UnprocessableEntityError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),
@@ -287,9 +328,9 @@ class RawSpeechToTextClient:
                 raise TooManyRequestsError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),
@@ -298,9 +339,9 @@ class RawSpeechToTextClient:
                 raise InternalServerError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),
@@ -309,9 +350,9 @@ class RawSpeechToTextClient:
                 raise ServiceUnavailableError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),
@@ -331,6 +372,7 @@ class AsyncRawSpeechToTextClient:
         *,
         file: core.File,
         model: typing.Optional[SpeechToTextModel] = OMIT,
+        mode: typing.Optional[Mode] = OMIT,
         language_code: typing.Optional[SpeechToTextLanguage] = OMIT,
         input_audio_codec: typing.Optional[InputAudioCodec] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
@@ -357,12 +399,49 @@ class AsyncRawSpeechToTextClient:
         model : typing.Optional[SpeechToTextModel]
             Specifies the model to use for speech-to-text conversion.
-            Note:- Default model is `saarika:v2.5`
+            - **saarika:v2.5** (default): Transcribes audio in the spoken language.
+            - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
+        mode : typing.Optional[Mode]
+            Mode of operation. **Only applicable when using saaras:v3 model.**
+            Example audio: 'मेरा फोन नंबर है 9840950950'
+            - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
+              - Output: `मेरा फोन नंबर है 9840950950`
+            - **translate**: Translates speech from any supported Indic language to English.
+              - Output: `My phone number is 9840950950`
+            - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
+              - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
+            - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
+              - Output: `mera phone number hai 9840950950`
+            - **codemix**: Code-mixed text with English words in English and Indic words in native script.
+              - Output: `मेरा phone number है 9840950950`
         language_code : typing.Optional[SpeechToTextLanguage]
-            Specifies the language of the input audio.
-             For the `saarika:v2.5` model, it is optional.
-            `unknown`: Use this when the language is not known; the API will detect it automatically.
+            Specifies the language of the input audio in BCP-47 format.
+            **Note:** This parameter is optional for `saarika:v2.5` model.
+            **Available Options:**
+            - `unknown`: Use when the language is not known; the API will auto-detect.
+            - `hi-IN`: Hindi
+            - `bn-IN`: Bengali
+            - `kn-IN`: Kannada
+            - `ml-IN`: Malayalam
+            - `mr-IN`: Marathi
+            - `od-IN`: Odia
+            - `pa-IN`: Punjabi
+            - `ta-IN`: Tamil
+            - `te-IN`: Telugu
+            - `en-IN`: English
+            - `gu-IN`: Gujarati
         input_audio_codec : typing.Optional[InputAudioCodec]
             Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.
@@ -380,7 +459,8 @@ class AsyncRawSpeechToTextClient:
             base_url=self._client_wrapper.get_environment().base,
             method="POST",
             data={
-                "model": json.dumps(jsonable_encoder(model)),
+                "model": model,
+                "mode": mode,
                 "language_code": language_code,
                 "input_audio_codec": input_audio_codec,
             },
@@ -405,9 +485,9 @@ class AsyncRawSpeechToTextClient:
                 raise BadRequestError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),
@@ -416,9 +496,9 @@ class AsyncRawSpeechToTextClient:
                 raise ForbiddenError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),
@@ -427,9 +507,9 @@ class AsyncRawSpeechToTextClient:
                 raise UnprocessableEntityError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),
@@ -438,9 +518,9 @@ class AsyncRawSpeechToTextClient:
                 raise TooManyRequestsError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),
@@ -449,9 +529,9 @@ class AsyncRawSpeechToTextClient:
                 raise InternalServerError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),
@@ -460,9 +540,9 @@ class AsyncRawSpeechToTextClient:
                 raise ServiceUnavailableError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),
@@ -505,7 +585,10 @@ class AsyncRawSpeechToTextClient:
             Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.
         model : typing.Optional[SpeechToTextTranslateModel]
-            Model to be used for converting speech to text in target language
+            Model to be used for speech to text translation.
+            - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
+              - Example: Hindi audio → English text output
         input_audio_codec : typing.Optional[InputAudioCodec]
             Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.
@@ -524,7 +607,7 @@ class AsyncRawSpeechToTextClient:
             method="POST",
             data={
                 "prompt": prompt,
-                "model": json.dumps(jsonable_encoder(model)),
+                "model": model,
                 "input_audio_codec": input_audio_codec,
             },
             files={
@@ -548,9 +631,9 @@ class AsyncRawSpeechToTextClient:
                 raise BadRequestError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),
@@ -559,9 +642,9 @@ class AsyncRawSpeechToTextClient:
                 raise ForbiddenError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),
@@ -570,9 +653,9 @@ class AsyncRawSpeechToTextClient:
                 raise UnprocessableEntityError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),
@@ -581,9 +664,9 @@ class AsyncRawSpeechToTextClient:
                 raise TooManyRequestsError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),
@@ -592,9 +675,9 @@ class AsyncRawSpeechToTextClient:
                 raise InternalServerError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),
@@ -603,9 +686,9 @@ class AsyncRawSpeechToTextClient:
                 raise ServiceUnavailableError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),

sarvamai 0.1.23a3__py3-none-any.whl → 0.1.23a5__py3-none-any.whl

sarvamai 0.1.23a3py3-none-any.whl → 0.1.23a5py3-none-any.whl