PyPI - sarvamai - Versions diffs - 0.1.22a3__py3-none-any.whl → 0.1.22a7__py3-none-any.whl - Mend

sarvamai 0.1.22a3__py3-none-any.whl → 0.1.22a7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (90)

sarvamai/__init__.py +62 -9
sarvamai/client.py +3 -0
sarvamai/core/client_wrapper.py +2 -2
sarvamai/doc_digitization_job/__init__.py +4 -0
sarvamai/doc_digitization_job/client.py +776 -0
sarvamai/doc_digitization_job/job.py +496 -0
sarvamai/doc_digitization_job/raw_client.py +1176 -0
sarvamai/requests/__init__.py +20 -0
sarvamai/requests/audio_data.py +0 -6
sarvamai/requests/configure_connection.py +4 -0
sarvamai/requests/configure_connection_data.py +40 -11
sarvamai/requests/doc_digitization_create_job_response.py +25 -0
sarvamai/requests/doc_digitization_download_files_response.py +37 -0
sarvamai/requests/doc_digitization_error_details.py +21 -0
sarvamai/requests/doc_digitization_error_message.py +11 -0
sarvamai/requests/doc_digitization_job_detail.py +64 -0
sarvamai/requests/doc_digitization_job_parameters.py +21 -0
sarvamai/requests/doc_digitization_job_status_response.py +65 -0
sarvamai/requests/doc_digitization_page_error.py +24 -0
sarvamai/requests/doc_digitization_upload_files_response.py +34 -0
sarvamai/requests/doc_digitization_webhook_callback.py +19 -0
sarvamai/requests/speech_to_text_job_parameters.py +43 -2
sarvamai/requests/speech_to_text_transcription_data.py +0 -6
sarvamai/requests/speech_to_text_translate_job_parameters.py +4 -1
sarvamai/requests/speech_to_text_translate_transcription_data.py +0 -6
sarvamai/speech_to_text/client.py +95 -10
sarvamai/speech_to_text/raw_client.py +95 -10
sarvamai/speech_to_text_job/client.py +60 -15
sarvamai/speech_to_text_job/job.py +100 -2
sarvamai/speech_to_text_job/raw_client.py +14 -10
sarvamai/speech_to_text_streaming/__init__.py +4 -2
sarvamai/speech_to_text_streaming/client.py +100 -47
sarvamai/speech_to_text_streaming/raw_client.py +100 -47
sarvamai/speech_to_text_streaming/types/__init__.py +4 -2
sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_input_audio_codec.py +1 -27
sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_mode.py +7 -0
sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_model.py +5 -0
sarvamai/speech_to_text_translate_job/job.py +100 -2
sarvamai/speech_to_text_translate_job/raw_client.py +14 -10
sarvamai/speech_to_text_translate_streaming/__init__.py +0 -2
sarvamai/speech_to_text_translate_streaming/client.py +18 -41
sarvamai/speech_to_text_translate_streaming/raw_client.py +18 -41
sarvamai/speech_to_text_translate_streaming/types/__init__.py +0 -4
sarvamai/speech_to_text_translate_streaming/types/speech_to_text_translate_streaming_input_audio_codec.py +1 -27
sarvamai/text/client.py +0 -12
sarvamai/text/raw_client.py +0 -12
sarvamai/text_to_speech/client.py +116 -14
sarvamai/text_to_speech/raw_client.py +116 -14
sarvamai/text_to_speech_streaming/__init__.py +2 -2
sarvamai/text_to_speech_streaming/client.py +19 -6
sarvamai/text_to_speech_streaming/raw_client.py +19 -6
sarvamai/text_to_speech_streaming/types/__init__.py +2 -1
sarvamai/text_to_speech_streaming/types/text_to_speech_streaming_model.py +5 -0
sarvamai/types/__init__.py +34 -4
sarvamai/types/audio_data.py +0 -6
sarvamai/types/completion_event_flag.py +3 -1
sarvamai/types/configure_connection.py +4 -0
sarvamai/types/configure_connection_data.py +40 -11
sarvamai/types/configure_connection_data_model.py +5 -0
sarvamai/types/configure_connection_data_speaker.py +35 -1
sarvamai/types/doc_digitization_create_job_response.py +37 -0
sarvamai/types/doc_digitization_download_files_response.py +47 -0
sarvamai/types/doc_digitization_error_code.py +15 -0
sarvamai/types/doc_digitization_error_details.py +33 -0
sarvamai/types/doc_digitization_error_message.py +23 -0
sarvamai/types/doc_digitization_job_detail.py +74 -0
sarvamai/types/doc_digitization_job_detail_state.py +7 -0
sarvamai/types/doc_digitization_job_parameters.py +33 -0
sarvamai/types/doc_digitization_job_state.py +7 -0
sarvamai/types/doc_digitization_job_status_response.py +75 -0
sarvamai/types/doc_digitization_output_format.py +5 -0
sarvamai/types/doc_digitization_page_error.py +36 -0
sarvamai/types/doc_digitization_supported_language.py +32 -0
sarvamai/types/doc_digitization_upload_files_response.py +44 -0
sarvamai/types/doc_digitization_webhook_callback.py +31 -0
sarvamai/types/mode.py +5 -0
sarvamai/types/speech_to_text_job_parameters.py +43 -2
sarvamai/types/speech_to_text_model.py +1 -1
sarvamai/types/speech_to_text_transcription_data.py +0 -6
sarvamai/types/speech_to_text_translate_job_parameters.py +4 -1
sarvamai/types/speech_to_text_translate_transcription_data.py +0 -6
sarvamai/types/text_to_speech_model.py +1 -1
sarvamai/types/text_to_speech_speaker.py +35 -1
{sarvamai-0.1.22a3.dist-info → sarvamai-0.1.22a7.dist-info}/METADATA +1 -1
{sarvamai-0.1.22a3.dist-info → sarvamai-0.1.22a7.dist-info}/RECORD +86 -56
sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_stream_ongoing_speech_results.py +0 -5
sarvamai/speech_to_text_translate_streaming/types/speech_to_text_translate_streaming_stream_ongoing_speech_results.py +0 -5
sarvamai/types/audio_data_input_audio_codec.py +0 -33
sarvamai/types/response_speech_state.py +0 -7
{sarvamai-0.1.22a3.dist-info → sarvamai-0.1.22a7.dist-info}/WHEEL +0 -0

sarvamai/speech_to_text_job/raw_client.py CHANGED Viewed

@@ -39,7 +39,7 @@ class RawSpeechToTextJobClient:
         request_options: typing.Optional[RequestOptions] = None,
     ) -> HttpResponse[BulkJobInitResponseV1]:
         """
-        Get a job uuid, and storage folder details for speech to text bulk job v1
+        Create a new speech to text bulk job and receive a job UUID and storage folder details for processing multiple audio files
         Parameters
         ----------
@@ -160,7 +160,9 @@ class RawSpeechToTextJobClient:
         self, job_id: str, *, request_options: typing.Optional[RequestOptions] = None
     ) -> HttpResponse[JobStatusV1Response]:
         """
-        Get the status of a speech to text bulk job V1
+        Retrieve the current status and details of a speech to text bulk job, including progress and file-level information.
+        **Rate Limiting Best Practice:** To prevent rate limit errors and ensure optimal server performance, we recommend implementing a minimum 5-millisecond delay between consecutive status polling requests. This helps maintain system stability while still providing timely status updates.
         Parameters
         ----------
@@ -270,7 +272,7 @@ class RawSpeechToTextJobClient:
         request_options: typing.Optional[RequestOptions] = None,
     ) -> HttpResponse[JobStatusV1Response]:
         """
-        Start a speech to text bulk job V1
+        Start processing a speech to text bulk job after all audio files have been uploaded
         Parameters
         ----------
@@ -381,7 +383,7 @@ class RawSpeechToTextJobClient:
         self, *, job_id: str, files: typing.Sequence[str], request_options: typing.Optional[RequestOptions] = None
     ) -> HttpResponse[FilesUploadResponse]:
         """
-        Start a speech to text bulk job V1
+        Generate presigned upload URLs for audio files that will be processed in a speech to text bulk job
         Parameters
         ----------
@@ -496,7 +498,7 @@ class RawSpeechToTextJobClient:
         self, *, job_id: str, files: typing.Sequence[str], request_options: typing.Optional[RequestOptions] = None
     ) -> HttpResponse[FilesDownloadResponse]:
         """
-        Start a speech to text bulk job V1
+        Generate presigned download URLs for the transcription output files of a completed speech to text bulk job
         Parameters
         ----------
@@ -620,7 +622,7 @@ class AsyncRawSpeechToTextJobClient:
         request_options: typing.Optional[RequestOptions] = None,
     ) -> AsyncHttpResponse[BulkJobInitResponseV1]:
         """
-        Get a job uuid, and storage folder details for speech to text bulk job v1
+        Create a new speech to text bulk job and receive a job UUID and storage folder details for processing multiple audio files
         Parameters
         ----------
@@ -741,7 +743,9 @@ class AsyncRawSpeechToTextJobClient:
         self, job_id: str, *, request_options: typing.Optional[RequestOptions] = None
     ) -> AsyncHttpResponse[JobStatusV1Response]:
         """
-        Get the status of a speech to text bulk job V1
+        Retrieve the current status and details of a speech to text bulk job, including progress and file-level information.
+        **Rate Limiting Best Practice:** To prevent rate limit errors and ensure optimal server performance, we recommend implementing a minimum 5-millisecond delay between consecutive status polling requests. This helps maintain system stability while still providing timely status updates.
         Parameters
         ----------
@@ -851,7 +855,7 @@ class AsyncRawSpeechToTextJobClient:
         request_options: typing.Optional[RequestOptions] = None,
     ) -> AsyncHttpResponse[JobStatusV1Response]:
         """
-        Start a speech to text bulk job V1
+        Start processing a speech to text bulk job after all audio files have been uploaded
         Parameters
         ----------
@@ -962,7 +966,7 @@ class AsyncRawSpeechToTextJobClient:
         self, *, job_id: str, files: typing.Sequence[str], request_options: typing.Optional[RequestOptions] = None
     ) -> AsyncHttpResponse[FilesUploadResponse]:
         """
-        Start a speech to text bulk job V1
+        Generate presigned upload URLs for audio files that will be processed in a speech to text bulk job
         Parameters
         ----------
@@ -1077,7 +1081,7 @@ class AsyncRawSpeechToTextJobClient:
         self, *, job_id: str, files: typing.Sequence[str], request_options: typing.Optional[RequestOptions] = None
     ) -> AsyncHttpResponse[FilesDownloadResponse]:
         """
-        Start a speech to text bulk job V1
+        Generate presigned download URLs for the transcription output files of a completed speech to text bulk job
         Parameters
         ----------

sarvamai/speech_to_text_streaming/__init__.py CHANGED Viewed

@@ -7,7 +7,8 @@ from .types import (
     SpeechToTextStreamingHighVadSensitivity,
     SpeechToTextStreamingInputAudioCodec,
     SpeechToTextStreamingLanguageCode,
-    SpeechToTextStreamingStreamOngoingSpeechResults,
+    SpeechToTextStreamingMode,
+    SpeechToTextStreamingModel,
     SpeechToTextStreamingVadSignals,
 )
@@ -16,6 +17,7 @@ __all__ = [
     "SpeechToTextStreamingHighVadSensitivity",
     "SpeechToTextStreamingInputAudioCodec",
     "SpeechToTextStreamingLanguageCode",
-    "SpeechToTextStreamingStreamOngoingSpeechResults",
+    "SpeechToTextStreamingMode",
+    "SpeechToTextStreamingModel",
     "SpeechToTextStreamingVadSignals",
 ]

sarvamai/speech_to_text_streaming/client.py CHANGED Viewed

@@ -15,9 +15,8 @@ from .types.speech_to_text_streaming_flush_signal import SpeechToTextStreamingFl
 from .types.speech_to_text_streaming_high_vad_sensitivity import SpeechToTextStreamingHighVadSensitivity
 from .types.speech_to_text_streaming_input_audio_codec import SpeechToTextStreamingInputAudioCodec
 from .types.speech_to_text_streaming_language_code import SpeechToTextStreamingLanguageCode
-from .types.speech_to_text_streaming_stream_ongoing_speech_results import (
-    SpeechToTextStreamingStreamOngoingSpeechResults,
-)
+from .types.speech_to_text_streaming_mode import SpeechToTextStreamingMode
+from .types.speech_to_text_streaming_model import SpeechToTextStreamingModel
 from .types.speech_to_text_streaming_vad_signals import SpeechToTextStreamingVadSignals
 try:
@@ -46,14 +45,13 @@ class SpeechToTextStreamingClient:
         self,
         *,
         language_code: SpeechToTextStreamingLanguageCode,
-        model: typing.Optional[typing.Literal["saarika:v2.5"]] = None,
-        input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
+        model: typing.Optional[SpeechToTextStreamingModel] = None,
+        mode: typing.Optional[SpeechToTextStreamingMode] = None,
         sample_rate: typing.Optional[str] = None,
         high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
         vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
         flush_signal: typing.Optional[SpeechToTextStreamingFlushSignal] = None,
-        stream_ongoing_speech_results: typing.Optional[SpeechToTextStreamingStreamOngoingSpeechResults] = None,
-        streaming_ongoing_requests_frame_size: typing.Optional[str] = None,
+        input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
         api_subscription_key: typing.Optional[str] = None,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> typing.Iterator[SpeechToTextStreamingSocketClient]:
@@ -67,13 +65,47 @@ class SpeechToTextStreamingClient:
         Parameters
         ----------
         language_code : SpeechToTextStreamingLanguageCode
-            Language code for speech recognition
+            Specifies the language of the input audio in BCP-47 format.
-        model : typing.Optional[typing.Literal["saarika:v2.5"]]
-            Speech to text model to use
+            **Available Options:**
+            - `hi-IN`: Hindi
+            - `bn-IN`: Bengali
+            - `gu-IN`: Gujarati
+            - `kn-IN`: Kannada
+            - `ml-IN`: Malayalam
+            - `mr-IN`: Marathi
+            - `od-IN`: Odia
+            - `pa-IN`: Punjabi
+            - `ta-IN`: Tamil
+            - `te-IN`: Telugu
+            - `en-IN`: English
-        input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
-            Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files supports sample rate 16000 and 8000.
+        model : typing.Optional[SpeechToTextStreamingModel]
+            Specifies the model to use for speech-to-text conversion.
+            - **saarika:v2.5** (default): Transcribes audio in the spoken language.
+            - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
+        mode : typing.Optional[SpeechToTextStreamingMode]
+            Mode of operation. **Only applicable when using saaras:v3 model.**
+            Example audio: 'मेरा फोन नंबर है 9840950950'
+            - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
+              - Output: `मेरा फोन नंबर है 9840950950`
+            - **translate**: Translates speech from any supported Indic language to English.
+              - Output: `My phone number is 9840950950`
+            - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
+              - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
+            - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
+              - Output: `mera phone number hai 9840950950`
+            - **codemix**: Code-mixed text with English words in English and Indic words in native script.
+              - Output: `मेरा phone number है 9840950950`
         sample_rate : typing.Optional[str]
             Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
@@ -87,11 +119,9 @@ class SpeechToTextStreamingClient:
         flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
             Signal to flush the audio buffer and finalize transcription
-        stream_ongoing_speech_results : typing.Optional[SpeechToTextStreamingStreamOngoingSpeechResults]
-            Enable streaming of ongoing speech results during active speech
-        streaming_ongoing_requests_frame_size : typing.Optional[str]
-            Frame size for streaming ongoing speech results (1-100)
+        input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
+            Audio codec/format of the input stream. Use this when sending raw PCM audio.
+            Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
         api_subscription_key : typing.Optional[str]
             API subscription key for authentication
@@ -109,8 +139,8 @@ class SpeechToTextStreamingClient:
             query_params = query_params.add("language-code", language_code)
         if model is not None:
             query_params = query_params.add("model", model)
-        if input_audio_codec is not None:
-            query_params = query_params.add("input_audio_codec", input_audio_codec)
+        if mode is not None:
+            query_params = query_params.add("mode", mode)
         if sample_rate is not None:
             query_params = query_params.add("sample_rate", sample_rate)
         if high_vad_sensitivity is not None:
@@ -119,12 +149,8 @@ class SpeechToTextStreamingClient:
             query_params = query_params.add("vad_signals", vad_signals)
         if flush_signal is not None:
             query_params = query_params.add("flush_signal", flush_signal)
-        if stream_ongoing_speech_results is not None:
-            query_params = query_params.add("stream_ongoing_speech_results", stream_ongoing_speech_results)
-        if streaming_ongoing_requests_frame_size is not None:
-            query_params = query_params.add(
-                "streaming_ongoing_requests_frame_size", streaming_ongoing_requests_frame_size
-            )
+        if input_audio_codec is not None:
+            query_params = query_params.add("input_audio_codec", input_audio_codec)
         ws_url = ws_url + f"?{query_params}"
         headers = self._raw_client._client_wrapper.get_headers()
         if api_subscription_key is not None:
@@ -169,14 +195,13 @@ class AsyncSpeechToTextStreamingClient:
         self,
         *,
         language_code: SpeechToTextStreamingLanguageCode,
-        model: typing.Optional[typing.Literal["saarika:v2.5"]] = None,
-        input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
+        model: typing.Optional[SpeechToTextStreamingModel] = None,
+        mode: typing.Optional[SpeechToTextStreamingMode] = None,
         sample_rate: typing.Optional[str] = None,
         high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
         vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
         flush_signal: typing.Optional[SpeechToTextStreamingFlushSignal] = None,
-        stream_ongoing_speech_results: typing.Optional[SpeechToTextStreamingStreamOngoingSpeechResults] = None,
-        streaming_ongoing_requests_frame_size: typing.Optional[str] = None,
+        input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
         api_subscription_key: typing.Optional[str] = None,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> typing.AsyncIterator[AsyncSpeechToTextStreamingSocketClient]:
@@ -190,13 +215,47 @@ class AsyncSpeechToTextStreamingClient:
         Parameters
         ----------
         language_code : SpeechToTextStreamingLanguageCode
-            Language code for speech recognition
+            Specifies the language of the input audio in BCP-47 format.
-        model : typing.Optional[typing.Literal["saarika:v2.5"]]
-            Speech to text model to use
+            **Available Options:**
+            - `hi-IN`: Hindi
+            - `bn-IN`: Bengali
+            - `gu-IN`: Gujarati
+            - `kn-IN`: Kannada
+            - `ml-IN`: Malayalam
+            - `mr-IN`: Marathi
+            - `od-IN`: Odia
+            - `pa-IN`: Punjabi
+            - `ta-IN`: Tamil
+            - `te-IN`: Telugu
+            - `en-IN`: English
-        input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
-            Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files supports sample rate 16000 and 8000.
+        model : typing.Optional[SpeechToTextStreamingModel]
+            Specifies the model to use for speech-to-text conversion.
+            - **saarika:v2.5** (default): Transcribes audio in the spoken language.
+            - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
+        mode : typing.Optional[SpeechToTextStreamingMode]
+            Mode of operation. **Only applicable when using saaras:v3 model.**
+            Example audio: 'मेरा फोन नंबर है 9840950950'
+            - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
+              - Output: `मेरा फोन नंबर है 9840950950`
+            - **translate**: Translates speech from any supported Indic language to English.
+              - Output: `My phone number is 9840950950`
+            - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
+              - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
+            - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
+              - Output: `mera phone number hai 9840950950`
+            - **codemix**: Code-mixed text with English words in English and Indic words in native script.
+              - Output: `मेरा phone number है 9840950950`
         sample_rate : typing.Optional[str]
             Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
@@ -210,11 +269,9 @@ class AsyncSpeechToTextStreamingClient:
         flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
             Signal to flush the audio buffer and finalize transcription
-        stream_ongoing_speech_results : typing.Optional[SpeechToTextStreamingStreamOngoingSpeechResults]
-            Enable streaming of ongoing speech results during active speech
-        streaming_ongoing_requests_frame_size : typing.Optional[str]
-            Frame size for streaming ongoing speech results (1-100)
+        input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
+            Audio codec/format of the input stream. Use this when sending raw PCM audio.
+            Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
         api_subscription_key : typing.Optional[str]
             API subscription key for authentication
@@ -232,8 +289,8 @@ class AsyncSpeechToTextStreamingClient:
             query_params = query_params.add("language-code", language_code)
         if model is not None:
             query_params = query_params.add("model", model)
-        if input_audio_codec is not None:
-            query_params = query_params.add("input_audio_codec", input_audio_codec)
+        if mode is not None:
+            query_params = query_params.add("mode", mode)
         if sample_rate is not None:
             query_params = query_params.add("sample_rate", sample_rate)
         if high_vad_sensitivity is not None:
@@ -242,12 +299,8 @@ class AsyncSpeechToTextStreamingClient:
             query_params = query_params.add("vad_signals", vad_signals)
         if flush_signal is not None:
             query_params = query_params.add("flush_signal", flush_signal)
-        if stream_ongoing_speech_results is not None:
-            query_params = query_params.add("stream_ongoing_speech_results", stream_ongoing_speech_results)
-        if streaming_ongoing_requests_frame_size is not None:
-            query_params = query_params.add(
-                "streaming_ongoing_requests_frame_size", streaming_ongoing_requests_frame_size
-            )
+        if input_audio_codec is not None:
+            query_params = query_params.add("input_audio_codec", input_audio_codec)
         ws_url = ws_url + f"?{query_params}"
         headers = self._raw_client._client_wrapper.get_headers()
         if api_subscription_key is not None:

sarvamai/speech_to_text_streaming/raw_client.py CHANGED Viewed

@@ -14,9 +14,8 @@ from .types.speech_to_text_streaming_flush_signal import SpeechToTextStreamingFl
 from .types.speech_to_text_streaming_high_vad_sensitivity import SpeechToTextStreamingHighVadSensitivity
 from .types.speech_to_text_streaming_input_audio_codec import SpeechToTextStreamingInputAudioCodec
 from .types.speech_to_text_streaming_language_code import SpeechToTextStreamingLanguageCode
-from .types.speech_to_text_streaming_stream_ongoing_speech_results import (
-    SpeechToTextStreamingStreamOngoingSpeechResults,
-)
+from .types.speech_to_text_streaming_mode import SpeechToTextStreamingMode
+from .types.speech_to_text_streaming_model import SpeechToTextStreamingModel
 from .types.speech_to_text_streaming_vad_signals import SpeechToTextStreamingVadSignals
 try:
@@ -34,14 +33,13 @@ class RawSpeechToTextStreamingClient:
         self,
         *,
         language_code: SpeechToTextStreamingLanguageCode,
-        model: typing.Optional[typing.Literal["saarika:v2.5"]] = None,
-        input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
+        model: typing.Optional[SpeechToTextStreamingModel] = None,
+        mode: typing.Optional[SpeechToTextStreamingMode] = None,
         sample_rate: typing.Optional[str] = None,
         high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
         vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
         flush_signal: typing.Optional[SpeechToTextStreamingFlushSignal] = None,
-        stream_ongoing_speech_results: typing.Optional[SpeechToTextStreamingStreamOngoingSpeechResults] = None,
-        streaming_ongoing_requests_frame_size: typing.Optional[str] = None,
+        input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
         api_subscription_key: typing.Optional[str] = None,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> typing.Iterator[SpeechToTextStreamingSocketClient]:
@@ -55,13 +53,47 @@ class RawSpeechToTextStreamingClient:
         Parameters
         ----------
         language_code : SpeechToTextStreamingLanguageCode
-            Language code for speech recognition
+            Specifies the language of the input audio in BCP-47 format.
-        model : typing.Optional[typing.Literal["saarika:v2.5"]]
-            Speech to text model to use
+            **Available Options:**
+            - `hi-IN`: Hindi
+            - `bn-IN`: Bengali
+            - `gu-IN`: Gujarati
+            - `kn-IN`: Kannada
+            - `ml-IN`: Malayalam
+            - `mr-IN`: Marathi
+            - `od-IN`: Odia
+            - `pa-IN`: Punjabi
+            - `ta-IN`: Tamil
+            - `te-IN`: Telugu
+            - `en-IN`: English
-        input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
-            Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files supports sample rate 16000 and 8000.
+        model : typing.Optional[SpeechToTextStreamingModel]
+            Specifies the model to use for speech-to-text conversion.
+            - **saarika:v2.5** (default): Transcribes audio in the spoken language.
+            - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
+        mode : typing.Optional[SpeechToTextStreamingMode]
+            Mode of operation. **Only applicable when using saaras:v3 model.**
+            Example audio: 'मेरा फोन नंबर है 9840950950'
+            - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
+              - Output: `मेरा फोन नंबर है 9840950950`
+            - **translate**: Translates speech from any supported Indic language to English.
+              - Output: `My phone number is 9840950950`
+            - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
+              - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
+            - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
+              - Output: `mera phone number hai 9840950950`
+            - **codemix**: Code-mixed text with English words in English and Indic words in native script.
+              - Output: `मेरा phone number है 9840950950`
         sample_rate : typing.Optional[str]
             Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
@@ -75,11 +107,9 @@ class RawSpeechToTextStreamingClient:
         flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
             Signal to flush the audio buffer and finalize transcription
-        stream_ongoing_speech_results : typing.Optional[SpeechToTextStreamingStreamOngoingSpeechResults]
-            Enable streaming of ongoing speech results during active speech
-        streaming_ongoing_requests_frame_size : typing.Optional[str]
-            Frame size for streaming ongoing speech results (1-100)
+        input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
+            Audio codec/format of the input stream. Use this when sending raw PCM audio.
+            Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
         api_subscription_key : typing.Optional[str]
             API subscription key for authentication
@@ -97,8 +127,8 @@ class RawSpeechToTextStreamingClient:
             query_params = query_params.add("language-code", language_code)
         if model is not None:
             query_params = query_params.add("model", model)
-        if input_audio_codec is not None:
-            query_params = query_params.add("input_audio_codec", input_audio_codec)
+        if mode is not None:
+            query_params = query_params.add("mode", mode)
         if sample_rate is not None:
             query_params = query_params.add("sample_rate", sample_rate)
         if high_vad_sensitivity is not None:
@@ -107,12 +137,8 @@ class RawSpeechToTextStreamingClient:
             query_params = query_params.add("vad_signals", vad_signals)
         if flush_signal is not None:
             query_params = query_params.add("flush_signal", flush_signal)
-        if stream_ongoing_speech_results is not None:
-            query_params = query_params.add("stream_ongoing_speech_results", stream_ongoing_speech_results)
-        if streaming_ongoing_requests_frame_size is not None:
-            query_params = query_params.add(
-                "streaming_ongoing_requests_frame_size", streaming_ongoing_requests_frame_size
-            )
+        if input_audio_codec is not None:
+            query_params = query_params.add("input_audio_codec", input_audio_codec)
         ws_url = ws_url + f"?{query_params}"
         headers = self._client_wrapper.get_headers()
         if api_subscription_key is not None:
@@ -146,14 +172,13 @@ class AsyncRawSpeechToTextStreamingClient:
         self,
         *,
         language_code: SpeechToTextStreamingLanguageCode,
-        model: typing.Optional[typing.Literal["saarika:v2.5"]] = None,
-        input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
+        model: typing.Optional[SpeechToTextStreamingModel] = None,
+        mode: typing.Optional[SpeechToTextStreamingMode] = None,
         sample_rate: typing.Optional[str] = None,
         high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
         vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
         flush_signal: typing.Optional[SpeechToTextStreamingFlushSignal] = None,
-        stream_ongoing_speech_results: typing.Optional[SpeechToTextStreamingStreamOngoingSpeechResults] = None,
-        streaming_ongoing_requests_frame_size: typing.Optional[str] = None,
+        input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
         api_subscription_key: typing.Optional[str] = None,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> typing.AsyncIterator[AsyncSpeechToTextStreamingSocketClient]:
@@ -167,13 +192,47 @@ class AsyncRawSpeechToTextStreamingClient:
         Parameters
         ----------
         language_code : SpeechToTextStreamingLanguageCode
-            Language code for speech recognition
+            Specifies the language of the input audio in BCP-47 format.
-        model : typing.Optional[typing.Literal["saarika:v2.5"]]
-            Speech to text model to use
+            **Available Options:**
+            - `hi-IN`: Hindi
+            - `bn-IN`: Bengali
+            - `gu-IN`: Gujarati
+            - `kn-IN`: Kannada
+            - `ml-IN`: Malayalam
+            - `mr-IN`: Marathi
+            - `od-IN`: Odia
+            - `pa-IN`: Punjabi
+            - `ta-IN`: Tamil
+            - `te-IN`: Telugu
+            - `en-IN`: English
-        input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
-            Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files supports sample rate 16000 and 8000.
+        model : typing.Optional[SpeechToTextStreamingModel]
+            Specifies the model to use for speech-to-text conversion.
+            - **saarika:v2.5** (default): Transcribes audio in the spoken language.
+            - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
+        mode : typing.Optional[SpeechToTextStreamingMode]
+            Mode of operation. **Only applicable when using saaras:v3 model.**
+            Example audio: 'मेरा फोन नंबर है 9840950950'
+            - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
+              - Output: `मेरा फोन नंबर है 9840950950`
+            - **translate**: Translates speech from any supported Indic language to English.
+              - Output: `My phone number is 9840950950`
+            - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
+              - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
+            - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
+              - Output: `mera phone number hai 9840950950`
+            - **codemix**: Code-mixed text with English words in English and Indic words in native script.
+              - Output: `मेरा phone number है 9840950950`
         sample_rate : typing.Optional[str]
             Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
@@ -187,11 +246,9 @@ class AsyncRawSpeechToTextStreamingClient:
         flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
             Signal to flush the audio buffer and finalize transcription
-        stream_ongoing_speech_results : typing.Optional[SpeechToTextStreamingStreamOngoingSpeechResults]
-            Enable streaming of ongoing speech results during active speech
-        streaming_ongoing_requests_frame_size : typing.Optional[str]
-            Frame size for streaming ongoing speech results (1-100)
+        input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
+            Audio codec/format of the input stream. Use this when sending raw PCM audio.
+            Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
         api_subscription_key : typing.Optional[str]
             API subscription key for authentication
@@ -209,8 +266,8 @@ class AsyncRawSpeechToTextStreamingClient:
             query_params = query_params.add("language-code", language_code)
         if model is not None:
             query_params = query_params.add("model", model)
-        if input_audio_codec is not None:
-            query_params = query_params.add("input_audio_codec", input_audio_codec)
+        if mode is not None:
+            query_params = query_params.add("mode", mode)
         if sample_rate is not None:
             query_params = query_params.add("sample_rate", sample_rate)
         if high_vad_sensitivity is not None:
@@ -219,12 +276,8 @@ class AsyncRawSpeechToTextStreamingClient:
             query_params = query_params.add("vad_signals", vad_signals)
         if flush_signal is not None:
             query_params = query_params.add("flush_signal", flush_signal)
-        if stream_ongoing_speech_results is not None:
-            query_params = query_params.add("stream_ongoing_speech_results", stream_ongoing_speech_results)
-        if streaming_ongoing_requests_frame_size is not None:
-            query_params = query_params.add(
-                "streaming_ongoing_requests_frame_size", streaming_ongoing_requests_frame_size
-            )
+        if input_audio_codec is not None:
+            query_params = query_params.add("input_audio_codec", input_audio_codec)
         ws_url = ws_url + f"?{query_params}"
         headers = self._client_wrapper.get_headers()
         if api_subscription_key is not None:

sarvamai/speech_to_text_streaming/types/__init__.py CHANGED Viewed

@@ -6,7 +6,8 @@ from .speech_to_text_streaming_flush_signal import SpeechToTextStreamingFlushSig
 from .speech_to_text_streaming_high_vad_sensitivity import SpeechToTextStreamingHighVadSensitivity
 from .speech_to_text_streaming_input_audio_codec import SpeechToTextStreamingInputAudioCodec
 from .speech_to_text_streaming_language_code import SpeechToTextStreamingLanguageCode
-from .speech_to_text_streaming_stream_ongoing_speech_results import SpeechToTextStreamingStreamOngoingSpeechResults
+from .speech_to_text_streaming_mode import SpeechToTextStreamingMode
+from .speech_to_text_streaming_model import SpeechToTextStreamingModel
 from .speech_to_text_streaming_vad_signals import SpeechToTextStreamingVadSignals
 __all__ = [
@@ -14,6 +15,7 @@ __all__ = [
     "SpeechToTextStreamingHighVadSensitivity",
     "SpeechToTextStreamingInputAudioCodec",
     "SpeechToTextStreamingLanguageCode",
-    "SpeechToTextStreamingStreamOngoingSpeechResults",
+    "SpeechToTextStreamingMode",
+    "SpeechToTextStreamingModel",
     "SpeechToTextStreamingVadSignals",
 ]

sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_input_audio_codec.py CHANGED Viewed

@@ -3,31 +3,5 @@
 import typing
 SpeechToTextStreamingInputAudioCodec = typing.Union[
-    typing.Literal[
-        "wav",
-        "x-wav",
-        "wave",
-        "mp3",
-        "mpeg",
-        "mpeg3",
-        "x-mp3",
-        "x-mpeg-3",
-        "aac",
-        "x-aac",
-        "aiff",
-        "x-aiff",
-        "ogg",
-        "opus",
-        "flac",
-        "x-flac",
-        "mp4",
-        "x-m4a",
-        "amr",
-        "x-ms-wma",
-        "webm",
-        "pcm_s16le",
-        "pcm_l16",
-        "pcm_raw",
-    ],
-    typing.Any,
+    typing.Literal["wav", "pcm_s16le", "pcm_l16", "pcm_raw"], typing.Any
 ]

sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_mode.py ADDED Viewed

@@ -0,0 +1,7 @@
+# This file was auto-generated by Fern from our API Definition.
+import typing
+SpeechToTextStreamingMode = typing.Union[
+    typing.Literal["transcribe", "translate", "verbatim", "translit", "codemix"], typing.Any
+]

sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_model.py ADDED Viewed

@@ -0,0 +1,5 @@
+# This file was auto-generated by Fern from our API Definition.
+import typing
+SpeechToTextStreamingModel = typing.Union[typing.Literal["saarika:v2.5", "saaras:v3"], typing.Any]

sarvamai 0.1.22a3__py3-none-any.whl → 0.1.22a7__py3-none-any.whl

sarvamai 0.1.22a3__py3-none-any.whl → 0.1.22a7__py3-none-any.whl