PyPI - sarvamai - Versions diffs - 0.1.22a4__py3-none-any.whl → 0.1.22a8__py3-none-any.whl - Mend

sarvamai 0.1.22a4__py3-none-any.whl → 0.1.22a8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (76)

sarvamai/__init__.py +62 -3
sarvamai/client.py +3 -0
sarvamai/core/client_wrapper.py +2 -2
sarvamai/doc_digitization_job/__init__.py +4 -0
sarvamai/doc_digitization_job/client.py +775 -0
sarvamai/doc_digitization_job/job.py +496 -0
sarvamai/doc_digitization_job/raw_client.py +1176 -0
sarvamai/requests/__init__.py +20 -0
sarvamai/requests/audio_data.py +0 -6
sarvamai/requests/configure_connection.py +4 -0
sarvamai/requests/configure_connection_data.py +40 -11
sarvamai/requests/doc_digitization_create_job_response.py +25 -0
sarvamai/requests/doc_digitization_download_files_response.py +37 -0
sarvamai/requests/doc_digitization_error_details.py +21 -0
sarvamai/requests/doc_digitization_error_message.py +11 -0
sarvamai/requests/doc_digitization_job_detail.py +64 -0
sarvamai/requests/doc_digitization_job_parameters.py +21 -0
sarvamai/requests/doc_digitization_job_status_response.py +65 -0
sarvamai/requests/doc_digitization_page_error.py +24 -0
sarvamai/requests/doc_digitization_upload_files_response.py +34 -0
sarvamai/requests/doc_digitization_webhook_callback.py +19 -0
sarvamai/requests/speech_to_text_job_parameters.py +43 -2
sarvamai/requests/speech_to_text_translate_job_parameters.py +4 -1
sarvamai/speech_to_text/client.py +95 -10
sarvamai/speech_to_text/raw_client.py +95 -10
sarvamai/speech_to_text_job/client.py +60 -15
sarvamai/speech_to_text_streaming/__init__.py +4 -0
sarvamai/speech_to_text_streaming/client.py +102 -18
sarvamai/speech_to_text_streaming/raw_client.py +102 -18
sarvamai/speech_to_text_streaming/types/__init__.py +4 -0
sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_input_audio_codec.py +1 -27
sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_mode.py +7 -0
sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_model.py +5 -0
sarvamai/speech_to_text_translate_streaming/client.py +20 -12
sarvamai/speech_to_text_translate_streaming/raw_client.py +20 -12
sarvamai/speech_to_text_translate_streaming/types/speech_to_text_translate_streaming_input_audio_codec.py +1 -27
sarvamai/text/client.py +0 -12
sarvamai/text/raw_client.py +0 -12
sarvamai/text_to_speech/client.py +116 -14
sarvamai/text_to_speech/raw_client.py +116 -14
sarvamai/text_to_speech_streaming/__init__.py +2 -2
sarvamai/text_to_speech_streaming/client.py +19 -6
sarvamai/text_to_speech_streaming/raw_client.py +19 -6
sarvamai/text_to_speech_streaming/types/__init__.py +2 -1
sarvamai/text_to_speech_streaming/types/text_to_speech_streaming_model.py +5 -0
sarvamai/types/__init__.py +34 -2
sarvamai/types/audio_data.py +0 -6
sarvamai/types/configure_connection.py +4 -0
sarvamai/types/configure_connection_data.py +40 -11
sarvamai/types/configure_connection_data_model.py +5 -0
sarvamai/types/configure_connection_data_speaker.py +35 -1
sarvamai/types/doc_digitization_create_job_response.py +37 -0
sarvamai/types/doc_digitization_download_files_response.py +47 -0
sarvamai/types/doc_digitization_error_code.py +15 -0
sarvamai/types/doc_digitization_error_details.py +33 -0
sarvamai/types/doc_digitization_error_message.py +23 -0
sarvamai/types/doc_digitization_job_detail.py +74 -0
sarvamai/types/doc_digitization_job_detail_state.py +7 -0
sarvamai/types/doc_digitization_job_parameters.py +33 -0
sarvamai/types/doc_digitization_job_state.py +7 -0
sarvamai/types/doc_digitization_job_status_response.py +75 -0
sarvamai/types/doc_digitization_output_format.py +5 -0
sarvamai/types/doc_digitization_page_error.py +36 -0
sarvamai/types/doc_digitization_supported_language.py +32 -0
sarvamai/types/doc_digitization_upload_files_response.py +44 -0
sarvamai/types/doc_digitization_webhook_callback.py +31 -0
sarvamai/types/mode.py +5 -0
sarvamai/types/speech_to_text_job_parameters.py +43 -2
sarvamai/types/speech_to_text_model.py +1 -1
sarvamai/types/speech_to_text_translate_job_parameters.py +4 -1
sarvamai/types/text_to_speech_model.py +1 -1
sarvamai/types/text_to_speech_speaker.py +35 -1
{sarvamai-0.1.22a4.dist-info → sarvamai-0.1.22a8.dist-info}/METADATA +1 -1
{sarvamai-0.1.22a4.dist-info → sarvamai-0.1.22a8.dist-info}/RECORD +75 -42
sarvamai/types/audio_data_input_audio_codec.py +0 -33
{sarvamai-0.1.22a4.dist-info → sarvamai-0.1.22a8.dist-info}/WHEEL +0 -0

sarvamai/speech_to_text/client.py CHANGED Viewed

@@ -6,6 +6,7 @@ from .. import core
 from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
 from ..core.request_options import RequestOptions
 from ..types.input_audio_codec import InputAudioCodec
+from ..types.mode import Mode
 from ..types.speech_to_text_language import SpeechToTextLanguage
 from ..types.speech_to_text_model import SpeechToTextModel
 from ..types.speech_to_text_response import SpeechToTextResponse
@@ -37,6 +38,7 @@ class SpeechToTextClient:
         *,
         file: core.File,
         model: typing.Optional[SpeechToTextModel] = OMIT,
+        mode: typing.Optional[Mode] = OMIT,
         language_code: typing.Optional[SpeechToTextLanguage] = OMIT,
         input_audio_codec: typing.Optional[InputAudioCodec] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
@@ -63,12 +65,49 @@ class SpeechToTextClient:
         model : typing.Optional[SpeechToTextModel]
             Specifies the model to use for speech-to-text conversion.
-            Note:- Default model is `saarika:v2.5`
+            - **saarika:v2.5** (default): Transcribes audio in the spoken language.
+            - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
+        mode : typing.Optional[Mode]
+            Mode of operation. **Only applicable when using saaras:v3 model.**
+            Example audio: 'मेरा फोन नंबर है 9840950950'
+            - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
+              - Output: `मेरा फोन नंबर है 9840950950`
+            - **translate**: Translates speech from any supported Indic language to English.
+              - Output: `My phone number is 9840950950`
+            - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
+              - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
+            - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
+              - Output: `mera phone number hai 9840950950`
+            - **codemix**: Code-mixed text with English words in English and Indic words in native script.
+              - Output: `मेरा phone number है 9840950950`
         language_code : typing.Optional[SpeechToTextLanguage]
-            Specifies the language of the input audio.
-             For the `saarika:v2.5` model, it is optional.
-            `unknown`: Use this when the language is not known; the API will detect it automatically.
+            Specifies the language of the input audio in BCP-47 format.
+            **Note:** This parameter is optional for `saarika:v2.5` model.
+            **Available Options:**
+            - `unknown`: Use when the language is not known; the API will auto-detect.
+            - `hi-IN`: Hindi
+            - `bn-IN`: Bengali
+            - `kn-IN`: Kannada
+            - `ml-IN`: Malayalam
+            - `mr-IN`: Marathi
+            - `od-IN`: Odia
+            - `pa-IN`: Punjabi
+            - `ta-IN`: Tamil
+            - `te-IN`: Telugu
+            - `en-IN`: English
+            - `gu-IN`: Gujarati
         input_audio_codec : typing.Optional[InputAudioCodec]
             Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.
@@ -93,6 +132,7 @@ class SpeechToTextClient:
         _response = self._raw_client.transcribe(
             file=file,
             model=model,
+            mode=mode,
             language_code=language_code,
             input_audio_codec=input_audio_codec,
             request_options=request_options,
@@ -132,7 +172,10 @@ class SpeechToTextClient:
             Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.
         model : typing.Optional[SpeechToTextTranslateModel]
-            Model to be used for converting speech to text in target language
+            Model to be used for speech to text translation.
+            - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
+              - Example: Hindi audio → English text output
         input_audio_codec : typing.Optional[InputAudioCodec]
             Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.
@@ -180,6 +223,7 @@ class AsyncSpeechToTextClient:
         *,
         file: core.File,
         model: typing.Optional[SpeechToTextModel] = OMIT,
+        mode: typing.Optional[Mode] = OMIT,
         language_code: typing.Optional[SpeechToTextLanguage] = OMIT,
         input_audio_codec: typing.Optional[InputAudioCodec] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
@@ -206,12 +250,49 @@ class AsyncSpeechToTextClient:
         model : typing.Optional[SpeechToTextModel]
             Specifies the model to use for speech-to-text conversion.
-            Note:- Default model is `saarika:v2.5`
+            - **saarika:v2.5** (default): Transcribes audio in the spoken language.
+            - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
+        mode : typing.Optional[Mode]
+            Mode of operation. **Only applicable when using saaras:v3 model.**
+            Example audio: 'मेरा फोन नंबर है 9840950950'
+            - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
+              - Output: `मेरा फोन नंबर है 9840950950`
+            - **translate**: Translates speech from any supported Indic language to English.
+              - Output: `My phone number is 9840950950`
+            - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
+              - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
+            - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
+              - Output: `mera phone number hai 9840950950`
+            - **codemix**: Code-mixed text with English words in English and Indic words in native script.
+              - Output: `मेरा phone number है 9840950950`
         language_code : typing.Optional[SpeechToTextLanguage]
-            Specifies the language of the input audio.
-             For the `saarika:v2.5` model, it is optional.
-            `unknown`: Use this when the language is not known; the API will detect it automatically.
+            Specifies the language of the input audio in BCP-47 format.
+            **Note:** This parameter is optional for `saarika:v2.5` model.
+            **Available Options:**
+            - `unknown`: Use when the language is not known; the API will auto-detect.
+            - `hi-IN`: Hindi
+            - `bn-IN`: Bengali
+            - `kn-IN`: Kannada
+            - `ml-IN`: Malayalam
+            - `mr-IN`: Marathi
+            - `od-IN`: Odia
+            - `pa-IN`: Punjabi
+            - `ta-IN`: Tamil
+            - `te-IN`: Telugu
+            - `en-IN`: English
+            - `gu-IN`: Gujarati
         input_audio_codec : typing.Optional[InputAudioCodec]
             Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.
@@ -244,6 +325,7 @@ class AsyncSpeechToTextClient:
         _response = await self._raw_client.transcribe(
             file=file,
             model=model,
+            mode=mode,
             language_code=language_code,
             input_audio_codec=input_audio_codec,
             request_options=request_options,
@@ -283,7 +365,10 @@ class AsyncSpeechToTextClient:
             Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.
         model : typing.Optional[SpeechToTextTranslateModel]
-            Model to be used for converting speech to text in target language
+            Model to be used for speech to text translation.
+            - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
+              - Example: Hindi audio → English text output
         input_audio_codec : typing.Optional[InputAudioCodec]
             Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.

sarvamai/speech_to_text/raw_client.py CHANGED Viewed

@@ -16,6 +16,7 @@ from ..errors.service_unavailable_error import ServiceUnavailableError
 from ..errors.too_many_requests_error import TooManyRequestsError
 from ..errors.unprocessable_entity_error import UnprocessableEntityError
 from ..types.input_audio_codec import InputAudioCodec
+from ..types.mode import Mode
 from ..types.speech_to_text_language import SpeechToTextLanguage
 from ..types.speech_to_text_model import SpeechToTextModel
 from ..types.speech_to_text_response import SpeechToTextResponse
@@ -35,6 +36,7 @@ class RawSpeechToTextClient:
         *,
         file: core.File,
         model: typing.Optional[SpeechToTextModel] = OMIT,
+        mode: typing.Optional[Mode] = OMIT,
         language_code: typing.Optional[SpeechToTextLanguage] = OMIT,
         input_audio_codec: typing.Optional[InputAudioCodec] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
@@ -61,12 +63,49 @@ class RawSpeechToTextClient:
         model : typing.Optional[SpeechToTextModel]
             Specifies the model to use for speech-to-text conversion.
-            Note:- Default model is `saarika:v2.5`
+            - **saarika:v2.5** (default): Transcribes audio in the spoken language.
+            - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
+        mode : typing.Optional[Mode]
+            Mode of operation. **Only applicable when using saaras:v3 model.**
+            Example audio: 'मेरा फोन नंबर है 9840950950'
+            - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
+              - Output: `मेरा फोन नंबर है 9840950950`
+            - **translate**: Translates speech from any supported Indic language to English.
+              - Output: `My phone number is 9840950950`
+            - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
+              - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
+            - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
+              - Output: `mera phone number hai 9840950950`
+            - **codemix**: Code-mixed text with English words in English and Indic words in native script.
+              - Output: `मेरा phone number है 9840950950`
         language_code : typing.Optional[SpeechToTextLanguage]
-            Specifies the language of the input audio.
-             For the `saarika:v2.5` model, it is optional.
-            `unknown`: Use this when the language is not known; the API will detect it automatically.
+            Specifies the language of the input audio in BCP-47 format.
+            **Note:** This parameter is optional for `saarika:v2.5` model.
+            **Available Options:**
+            - `unknown`: Use when the language is not known; the API will auto-detect.
+            - `hi-IN`: Hindi
+            - `bn-IN`: Bengali
+            - `kn-IN`: Kannada
+            - `ml-IN`: Malayalam
+            - `mr-IN`: Marathi
+            - `od-IN`: Odia
+            - `pa-IN`: Punjabi
+            - `ta-IN`: Tamil
+            - `te-IN`: Telugu
+            - `en-IN`: English
+            - `gu-IN`: Gujarati
         input_audio_codec : typing.Optional[InputAudioCodec]
             Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.
@@ -85,6 +124,7 @@ class RawSpeechToTextClient:
             method="POST",
             data={
                 "model": model,
+                "mode": mode,
                 "language_code": language_code,
                 "input_audio_codec": input_audio_codec,
             },
@@ -209,7 +249,10 @@ class RawSpeechToTextClient:
             Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.
         model : typing.Optional[SpeechToTextTranslateModel]
-            Model to be used for converting speech to text in target language
+            Model to be used for speech to text translation.
+            - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
+              - Example: Hindi audio → English text output
         input_audio_codec : typing.Optional[InputAudioCodec]
             Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.
@@ -329,6 +372,7 @@ class AsyncRawSpeechToTextClient:
         *,
         file: core.File,
         model: typing.Optional[SpeechToTextModel] = OMIT,
+        mode: typing.Optional[Mode] = OMIT,
         language_code: typing.Optional[SpeechToTextLanguage] = OMIT,
         input_audio_codec: typing.Optional[InputAudioCodec] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
@@ -355,12 +399,49 @@ class AsyncRawSpeechToTextClient:
         model : typing.Optional[SpeechToTextModel]
             Specifies the model to use for speech-to-text conversion.
-            Note:- Default model is `saarika:v2.5`
+            - **saarika:v2.5** (default): Transcribes audio in the spoken language.
+            - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
+        mode : typing.Optional[Mode]
+            Mode of operation. **Only applicable when using saaras:v3 model.**
+            Example audio: 'मेरा फोन नंबर है 9840950950'
+            - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
+              - Output: `मेरा फोन नंबर है 9840950950`
+            - **translate**: Translates speech from any supported Indic language to English.
+              - Output: `My phone number is 9840950950`
+            - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
+              - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
+            - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
+              - Output: `mera phone number hai 9840950950`
+            - **codemix**: Code-mixed text with English words in English and Indic words in native script.
+              - Output: `मेरा phone number है 9840950950`
         language_code : typing.Optional[SpeechToTextLanguage]
-            Specifies the language of the input audio.
-             For the `saarika:v2.5` model, it is optional.
-            `unknown`: Use this when the language is not known; the API will detect it automatically.
+            Specifies the language of the input audio in BCP-47 format.
+            **Note:** This parameter is optional for `saarika:v2.5` model.
+            **Available Options:**
+            - `unknown`: Use when the language is not known; the API will auto-detect.
+            - `hi-IN`: Hindi
+            - `bn-IN`: Bengali
+            - `kn-IN`: Kannada
+            - `ml-IN`: Malayalam
+            - `mr-IN`: Marathi
+            - `od-IN`: Odia
+            - `pa-IN`: Punjabi
+            - `ta-IN`: Tamil
+            - `te-IN`: Telugu
+            - `en-IN`: English
+            - `gu-IN`: Gujarati
         input_audio_codec : typing.Optional[InputAudioCodec]
             Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.
@@ -379,6 +460,7 @@ class AsyncRawSpeechToTextClient:
             method="POST",
             data={
                 "model": model,
+                "mode": mode,
                 "language_code": language_code,
                 "input_audio_codec": input_audio_codec,
             },
@@ -503,7 +585,10 @@ class AsyncRawSpeechToTextClient:
             Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.
         model : typing.Optional[SpeechToTextTranslateModel]
-            Model to be used for converting speech to text in target language
+            Model to be used for speech to text translation.
+            - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
+              - Example: Hindi audio → English text output
         input_audio_codec : typing.Optional[InputAudioCodec]
             Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.

sarvamai/speech_to_text_job/client.py CHANGED Viewed

@@ -12,6 +12,7 @@ from ..types.files_upload_response import FilesUploadResponse
 from ..types.job_status_v_1_response import JobStatusV1Response
 from ..types.speech_to_text_model import SpeechToTextModel
 from ..types.speech_to_text_language import SpeechToTextLanguage
+from ..types.mode import Mode
 from .raw_client import AsyncRawSpeechToTextJobClient, RawSpeechToTextJobClient
 from .job import AsyncSpeechToTextJob, SpeechToTextJob
@@ -72,7 +73,9 @@ class SpeechToTextJobClient:
         )
         """
         _response = self._raw_client.initialise(
-            job_parameters=job_parameters, callback=callback, request_options=request_options
+            job_parameters=job_parameters,
+            callback=callback,
+            request_options=request_options,
         )
         return _response.data
@@ -145,11 +148,17 @@ class SpeechToTextJobClient:
             job_id="job_id",
         )
         """
-        _response = self._raw_client.start(job_id, ptu_id=ptu_id, request_options=request_options)
+        _response = self._raw_client.start(
+            job_id, ptu_id=ptu_id, request_options=request_options
+        )
         return _response.data
     def get_upload_links(
-        self, *, job_id: str, files: typing.Sequence[str], request_options: typing.Optional[RequestOptions] = None
+        self,
+        *,
+        job_id: str,
+        files: typing.Sequence[str],
+        request_options: typing.Optional[RequestOptions] = None,
     ) -> FilesUploadResponse:
         """
         Start a speech to text bulk job V1
@@ -180,11 +189,17 @@ class SpeechToTextJobClient:
             files=["files"],
         )
         """
-        _response = self._raw_client.get_upload_links(job_id=job_id, files=files, request_options=request_options)
+        _response = self._raw_client.get_upload_links(
+            job_id=job_id, files=files, request_options=request_options
+        )
         return _response.data
     def get_download_links(
-        self, *, job_id: str, files: typing.Sequence[str], request_options: typing.Optional[RequestOptions] = None
+        self,
+        *,
+        job_id: str,
+        files: typing.Sequence[str],
+        request_options: typing.Optional[RequestOptions] = None,
     ) -> FilesDownloadResponse:
         """
         Start a speech to text bulk job V1
@@ -215,12 +230,15 @@ class SpeechToTextJobClient:
             files=["files"],
         )
         """
-        _response = self._raw_client.get_download_links(job_id=job_id, files=files, request_options=request_options)
+        _response = self._raw_client.get_download_links(
+            job_id=job_id, files=files, request_options=request_options
+        )
         return _response.data
     def create_job(
         self,
         model: SpeechToTextModel = "saarika:v2.5",
+        mode: typing.Optional[Mode] = None,
         with_diarization: bool = False,
         with_timestamps: bool = False,
         language_code: typing.Optional[SpeechToTextLanguage] = None,
@@ -236,6 +254,10 @@ class SpeechToTextJobClient:
         model : SpeechToTextModel, default="saarika:v2.5"
             The model to use for transcription.
+        mode : typing.Optional[Mode], default=None
+            Mode of operation. Only applicable for saaras:v3 model.
+            Options: transcribe, translate, indic-en, verbatim, translit, codemix
         with_diarization : typing.Optional[bool], default=False
             Whether to enable speaker diarization (distinguishing who said what).
@@ -244,7 +266,7 @@ class SpeechToTextJobClient:
         language_code : typing.Optional[SpeechToTextLanguage], default=None
             The language code of the input audio (e.g., "hi-IN", "bn-IN").
         num_speakers : typing.Optional[int], default=None
             The number of distinct speakers in the audio, if known.
@@ -263,6 +285,7 @@ class SpeechToTextJobClient:
             job_parameters=SpeechToTextJobParametersParams(
                 language_code=language_code,
                 model=model,
+                mode=mode,  # type: ignore[typeddict-item]
                 num_speakers=num_speakers,  # type: ignore[typeddict-item]
                 with_diarization=with_diarization,
                 with_timestamps=with_timestamps,
@@ -350,7 +373,9 @@ class AsyncSpeechToTextJobClient:
         asyncio.run(main())
         """
         _response = await self._raw_client.initialise(
-            job_parameters=job_parameters, callback=callback, request_options=request_options
+            job_parameters=job_parameters,
+            callback=callback,
+            request_options=request_options,
         )
         return _response.data
@@ -392,7 +417,9 @@ class AsyncSpeechToTextJobClient:
         asyncio.run(main())
         """
-        _response = await self._raw_client.get_status(job_id, request_options=request_options)
+        _response = await self._raw_client.get_status(
+            job_id, request_options=request_options
+        )
         return _response.data
     async def start(
@@ -439,11 +466,17 @@ class AsyncSpeechToTextJobClient:
         asyncio.run(main())
         """
-        _response = await self._raw_client.start(job_id, ptu_id=ptu_id, request_options=request_options)
+        _response = await self._raw_client.start(
+            job_id, ptu_id=ptu_id, request_options=request_options
+        )
         return _response.data
     async def get_upload_links(
-        self, *, job_id: str, files: typing.Sequence[str], request_options: typing.Optional[RequestOptions] = None
+        self,
+        *,
+        job_id: str,
+        files: typing.Sequence[str],
+        request_options: typing.Optional[RequestOptions] = None,
     ) -> FilesUploadResponse:
         """
         Start a speech to text bulk job V1
@@ -482,11 +515,17 @@ class AsyncSpeechToTextJobClient:
         asyncio.run(main())
         """
-        _response = await self._raw_client.get_upload_links(job_id=job_id, files=files, request_options=request_options)
+        _response = await self._raw_client.get_upload_links(
+            job_id=job_id, files=files, request_options=request_options
+        )
         return _response.data
     async def get_download_links(
-        self, *, job_id: str, files: typing.Sequence[str], request_options: typing.Optional[RequestOptions] = None
+        self,
+        *,
+        job_id: str,
+        files: typing.Sequence[str],
+        request_options: typing.Optional[RequestOptions] = None,
     ) -> FilesDownloadResponse:
         """
         Start a speech to text bulk job V1
@@ -533,6 +572,7 @@ class AsyncSpeechToTextJobClient:
     async def create_job(
         self,
         model: SpeechToTextModel = "saarika:v2.5",
+        mode: typing.Optional[Mode] = None,
         with_diarization: bool = False,
         with_timestamps: bool = False,
         language_code: typing.Optional[SpeechToTextLanguage] = None,
@@ -548,6 +588,10 @@ class AsyncSpeechToTextJobClient:
         model : SpeechToTextModel, default="saarika:v2.5"
             The model to use for transcription.
+        mode : typing.Optional[Mode], default=None
+            Mode of operation. Only applicable for saaras:v3 model.
+            Options: transcribe, translate, indic-en, verbatim, translit, codemix
         with_diarization : typing.Optional[bool], default=False
             Whether to enable speaker diarization (distinguishing who said what).
@@ -556,8 +600,8 @@ class AsyncSpeechToTextJobClient:
         language_code : typing.Optional[SpeechToTextLanguage], default=None
             The language code of the input audio (e.g., "hi-IN", "bn-IN").
-        num_speakers : typing.Optional[int], default=None
+        num_speakers : typing.Optional[int] = None
             The number of distinct speakers in the audio, if known.
         callback : typing.Optional[BulkJobCallbackParams], default=OMIT
@@ -575,6 +619,7 @@ class AsyncSpeechToTextJobClient:
             job_parameters=SpeechToTextJobParametersParams(
                 language_code=language_code,
                 model=model,
+                mode=mode,  # type: ignore[typeddict-item]
                 with_diarization=with_diarization,
                 with_timestamps=with_timestamps,
                 num_speakers=num_speakers,  # type: ignore[typeddict-item]

sarvamai/speech_to_text_streaming/__init__.py CHANGED Viewed

@@ -7,6 +7,8 @@ from .types import (
     SpeechToTextStreamingHighVadSensitivity,
     SpeechToTextStreamingInputAudioCodec,
     SpeechToTextStreamingLanguageCode,
+    SpeechToTextStreamingMode,
+    SpeechToTextStreamingModel,
     SpeechToTextStreamingVadSignals,
 )
@@ -15,5 +17,7 @@ __all__ = [
     "SpeechToTextStreamingHighVadSensitivity",
     "SpeechToTextStreamingInputAudioCodec",
     "SpeechToTextStreamingLanguageCode",
+    "SpeechToTextStreamingMode",
+    "SpeechToTextStreamingModel",
     "SpeechToTextStreamingVadSignals",
 ]

sarvamai 0.1.22a4__py3-none-any.whl → 0.1.22a8__py3-none-any.whl

sarvamai 0.1.22a4__py3-none-any.whl → 0.1.22a8__py3-none-any.whl