sarvamai-0.1.23a4-py3-none-any.whl → sarvamai-0.1.23a6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sarvamai/__init__.py +4 -0
- sarvamai/core/client_wrapper.py +2 -2
- sarvamai/requests/speech_to_text_job_parameters.py +37 -5
- sarvamai/requests/speech_to_text_response.py +6 -14
- sarvamai/requests/speech_to_text_transcription_data.py +14 -0
- sarvamai/requests/speech_to_text_translate_job_parameters.py +4 -1
- sarvamai/requests/speech_to_text_translate_response.py +6 -9
- sarvamai/requests/speech_to_text_translate_transcription_data.py +13 -0
- sarvamai/speech_to_text/client.py +84 -26
- sarvamai/speech_to_text/raw_client.py +84 -26
- sarvamai/speech_to_text_streaming/__init__.py +2 -0
- sarvamai/speech_to_text_streaming/client.py +117 -18
- sarvamai/speech_to_text_streaming/raw_client.py +117 -18
- sarvamai/speech_to_text_streaming/types/__init__.py +2 -0
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_input_audio_codec.py +7 -0
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_language_code.py +25 -1
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_mode.py +1 -1
- sarvamai/speech_to_text_translate_streaming/__init__.py +2 -0
- sarvamai/speech_to_text_translate_streaming/client.py +23 -2
- sarvamai/speech_to_text_translate_streaming/raw_client.py +23 -2
- sarvamai/speech_to_text_translate_streaming/types/__init__.py +2 -0
- sarvamai/speech_to_text_translate_streaming/types/speech_to_text_translate_streaming_input_audio_codec.py +7 -0
- sarvamai/types/mode.py +1 -3
- sarvamai/types/speech_to_text_job_parameters.py +37 -5
- sarvamai/types/speech_to_text_language.py +24 -1
- sarvamai/types/speech_to_text_model.py +1 -3
- sarvamai/types/speech_to_text_response.py +6 -14
- sarvamai/types/speech_to_text_transcription_data.py +14 -0
- sarvamai/types/speech_to_text_translate_job_parameters.py +4 -1
- sarvamai/types/speech_to_text_translate_language.py +25 -1
- sarvamai/types/speech_to_text_translate_model.py +1 -1
- sarvamai/types/speech_to_text_translate_response.py +6 -9
- sarvamai/types/speech_to_text_translate_transcription_data.py +13 -0
- {sarvamai-0.1.23a4.dist-info → sarvamai-0.1.23a6.dist-info}/METADATA +1 -1
- {sarvamai-0.1.23a4.dist-info → sarvamai-0.1.23a6.dist-info}/RECORD +36 -34
- {sarvamai-0.1.23a4.dist-info → sarvamai-0.1.23a6.dist-info}/WHEEL +0 -0
|
@@ -13,6 +13,7 @@ from .raw_client import AsyncRawSpeechToTextStreamingClient, RawSpeechToTextStre
|
|
|
13
13
|
from .socket_client import AsyncSpeechToTextStreamingSocketClient, SpeechToTextStreamingSocketClient
|
|
14
14
|
from .types.speech_to_text_streaming_flush_signal import SpeechToTextStreamingFlushSignal
|
|
15
15
|
from .types.speech_to_text_streaming_high_vad_sensitivity import SpeechToTextStreamingHighVadSensitivity
|
|
16
|
+
from .types.speech_to_text_streaming_input_audio_codec import SpeechToTextStreamingInputAudioCodec
|
|
16
17
|
from .types.speech_to_text_streaming_language_code import SpeechToTextStreamingLanguageCode
|
|
17
18
|
from .types.speech_to_text_streaming_mode import SpeechToTextStreamingMode
|
|
18
19
|
from .types.speech_to_text_streaming_model import SpeechToTextStreamingModel
|
|
@@ -50,6 +51,7 @@ class SpeechToTextStreamingClient:
|
|
|
50
51
|
high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
|
|
51
52
|
vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
|
|
52
53
|
flush_signal: typing.Optional[SpeechToTextStreamingFlushSignal] = None,
|
|
54
|
+
input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
|
|
53
55
|
api_subscription_key: typing.Optional[str] = None,
|
|
54
56
|
request_options: typing.Optional[RequestOptions] = None,
|
|
55
57
|
) -> typing.Iterator[SpeechToTextStreamingSocketClient]:
|
|
@@ -63,19 +65,61 @@ class SpeechToTextStreamingClient:
|
|
|
63
65
|
Parameters
|
|
64
66
|
----------
|
|
65
67
|
language_code : SpeechToTextStreamingLanguageCode
|
|
66
|
-
|
|
68
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
69
|
+
|
|
70
|
+
**Available Options (saarika:v2.5):**
|
|
71
|
+
- `hi-IN`: Hindi
|
|
72
|
+
- `bn-IN`: Bengali
|
|
73
|
+
- `gu-IN`: Gujarati
|
|
74
|
+
- `kn-IN`: Kannada
|
|
75
|
+
- `ml-IN`: Malayalam
|
|
76
|
+
- `mr-IN`: Marathi
|
|
77
|
+
- `od-IN`: Odia
|
|
78
|
+
- `pa-IN`: Punjabi
|
|
79
|
+
- `ta-IN`: Tamil
|
|
80
|
+
- `te-IN`: Telugu
|
|
81
|
+
- `en-IN`: English
|
|
82
|
+
|
|
83
|
+
**Additional Options (saaras:v3 only):**
|
|
84
|
+
- `as-IN`: Assamese
|
|
85
|
+
- `ur-IN`: Urdu
|
|
86
|
+
- `ne-IN`: Nepali
|
|
87
|
+
- `kok-IN`: Konkani
|
|
88
|
+
- `ks-IN`: Kashmiri
|
|
89
|
+
- `sd-IN`: Sindhi
|
|
90
|
+
- `sa-IN`: Sanskrit
|
|
91
|
+
- `sat-IN`: Santali
|
|
92
|
+
- `mni-IN`: Manipuri
|
|
93
|
+
- `brx-IN`: Bodo
|
|
94
|
+
- `mai-IN`: Maithili
|
|
95
|
+
- `doi-IN`: Dogri
|
|
67
96
|
|
|
68
97
|
model : typing.Optional[SpeechToTextStreamingModel]
|
|
69
|
-
|
|
98
|
+
Specifies the model to use for speech-to-text conversion.
|
|
99
|
+
|
|
100
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
101
|
+
|
|
102
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
70
103
|
|
|
71
104
|
mode : typing.Optional[SpeechToTextStreamingMode]
|
|
72
|
-
Mode of operation
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
-
|
|
77
|
-
|
|
78
|
-
|
|
105
|
+
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
106
|
+
|
|
107
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
108
|
+
|
|
109
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
110
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
111
|
+
|
|
112
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
113
|
+
- Output: `My phone number is 9840950950`
|
|
114
|
+
|
|
115
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
116
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
117
|
+
|
|
118
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
119
|
+
- Output: `mera phone number hai 9840950950`
|
|
120
|
+
|
|
121
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
122
|
+
- Output: `मेरा phone number है 9840950950`
|
|
79
123
|
|
|
80
124
|
sample_rate : typing.Optional[str]
|
|
81
125
|
Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
|
|
@@ -89,6 +133,10 @@ class SpeechToTextStreamingClient:
|
|
|
89
133
|
flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
|
|
90
134
|
Signal to flush the audio buffer and finalize transcription
|
|
91
135
|
|
|
136
|
+
input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
|
|
137
|
+
Audio codec/format of the input stream. Use this when sending raw PCM audio.
|
|
138
|
+
Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
|
|
139
|
+
|
|
92
140
|
api_subscription_key : typing.Optional[str]
|
|
93
141
|
API subscription key for authentication
|
|
94
142
|
|
|
@@ -115,6 +163,8 @@ class SpeechToTextStreamingClient:
|
|
|
115
163
|
query_params = query_params.add("vad_signals", vad_signals)
|
|
116
164
|
if flush_signal is not None:
|
|
117
165
|
query_params = query_params.add("flush_signal", flush_signal)
|
|
166
|
+
if input_audio_codec is not None:
|
|
167
|
+
query_params = query_params.add("input_audio_codec", input_audio_codec)
|
|
118
168
|
ws_url = ws_url + f"?{query_params}"
|
|
119
169
|
headers = self._raw_client._client_wrapper.get_headers()
|
|
120
170
|
if api_subscription_key is not None:
|
|
@@ -165,6 +215,7 @@ class AsyncSpeechToTextStreamingClient:
|
|
|
165
215
|
high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
|
|
166
216
|
vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
|
|
167
217
|
flush_signal: typing.Optional[SpeechToTextStreamingFlushSignal] = None,
|
|
218
|
+
input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
|
|
168
219
|
api_subscription_key: typing.Optional[str] = None,
|
|
169
220
|
request_options: typing.Optional[RequestOptions] = None,
|
|
170
221
|
) -> typing.AsyncIterator[AsyncSpeechToTextStreamingSocketClient]:
|
|
@@ -178,19 +229,61 @@ class AsyncSpeechToTextStreamingClient:
|
|
|
178
229
|
Parameters
|
|
179
230
|
----------
|
|
180
231
|
language_code : SpeechToTextStreamingLanguageCode
|
|
181
|
-
|
|
232
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
233
|
+
|
|
234
|
+
**Available Options (saarika:v2.5):**
|
|
235
|
+
- `hi-IN`: Hindi
|
|
236
|
+
- `bn-IN`: Bengali
|
|
237
|
+
- `gu-IN`: Gujarati
|
|
238
|
+
- `kn-IN`: Kannada
|
|
239
|
+
- `ml-IN`: Malayalam
|
|
240
|
+
- `mr-IN`: Marathi
|
|
241
|
+
- `od-IN`: Odia
|
|
242
|
+
- `pa-IN`: Punjabi
|
|
243
|
+
- `ta-IN`: Tamil
|
|
244
|
+
- `te-IN`: Telugu
|
|
245
|
+
- `en-IN`: English
|
|
246
|
+
|
|
247
|
+
**Additional Options (saaras:v3 only):**
|
|
248
|
+
- `as-IN`: Assamese
|
|
249
|
+
- `ur-IN`: Urdu
|
|
250
|
+
- `ne-IN`: Nepali
|
|
251
|
+
- `kok-IN`: Konkani
|
|
252
|
+
- `ks-IN`: Kashmiri
|
|
253
|
+
- `sd-IN`: Sindhi
|
|
254
|
+
- `sa-IN`: Sanskrit
|
|
255
|
+
- `sat-IN`: Santali
|
|
256
|
+
- `mni-IN`: Manipuri
|
|
257
|
+
- `brx-IN`: Bodo
|
|
258
|
+
- `mai-IN`: Maithili
|
|
259
|
+
- `doi-IN`: Dogri
|
|
182
260
|
|
|
183
261
|
model : typing.Optional[SpeechToTextStreamingModel]
|
|
184
|
-
|
|
262
|
+
Specifies the model to use for speech-to-text conversion.
|
|
263
|
+
|
|
264
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
265
|
+
|
|
266
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
185
267
|
|
|
186
268
|
mode : typing.Optional[SpeechToTextStreamingMode]
|
|
187
|
-
Mode of operation
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
-
|
|
192
|
-
|
|
193
|
-
|
|
269
|
+
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
270
|
+
|
|
271
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
272
|
+
|
|
273
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
274
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
275
|
+
|
|
276
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
277
|
+
- Output: `My phone number is 9840950950`
|
|
278
|
+
|
|
279
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
280
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
281
|
+
|
|
282
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
283
|
+
- Output: `mera phone number hai 9840950950`
|
|
284
|
+
|
|
285
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
286
|
+
- Output: `मेरा phone number है 9840950950`
|
|
194
287
|
|
|
195
288
|
sample_rate : typing.Optional[str]
|
|
196
289
|
Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
|
|
@@ -204,6 +297,10 @@ class AsyncSpeechToTextStreamingClient:
|
|
|
204
297
|
flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
|
|
205
298
|
Signal to flush the audio buffer and finalize transcription
|
|
206
299
|
|
|
300
|
+
input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
|
|
301
|
+
Audio codec/format of the input stream. Use this when sending raw PCM audio.
|
|
302
|
+
Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
|
|
303
|
+
|
|
207
304
|
api_subscription_key : typing.Optional[str]
|
|
208
305
|
API subscription key for authentication
|
|
209
306
|
|
|
@@ -230,6 +327,8 @@ class AsyncSpeechToTextStreamingClient:
|
|
|
230
327
|
query_params = query_params.add("vad_signals", vad_signals)
|
|
231
328
|
if flush_signal is not None:
|
|
232
329
|
query_params = query_params.add("flush_signal", flush_signal)
|
|
330
|
+
if input_audio_codec is not None:
|
|
331
|
+
query_params = query_params.add("input_audio_codec", input_audio_codec)
|
|
233
332
|
ws_url = ws_url + f"?{query_params}"
|
|
234
333
|
headers = self._raw_client._client_wrapper.get_headers()
|
|
235
334
|
if api_subscription_key is not None:
|
|
@@ -12,6 +12,7 @@ from ..core.request_options import RequestOptions
|
|
|
12
12
|
from .socket_client import AsyncSpeechToTextStreamingSocketClient, SpeechToTextStreamingSocketClient
|
|
13
13
|
from .types.speech_to_text_streaming_flush_signal import SpeechToTextStreamingFlushSignal
|
|
14
14
|
from .types.speech_to_text_streaming_high_vad_sensitivity import SpeechToTextStreamingHighVadSensitivity
|
|
15
|
+
from .types.speech_to_text_streaming_input_audio_codec import SpeechToTextStreamingInputAudioCodec
|
|
15
16
|
from .types.speech_to_text_streaming_language_code import SpeechToTextStreamingLanguageCode
|
|
16
17
|
from .types.speech_to_text_streaming_mode import SpeechToTextStreamingMode
|
|
17
18
|
from .types.speech_to_text_streaming_model import SpeechToTextStreamingModel
|
|
@@ -38,6 +39,7 @@ class RawSpeechToTextStreamingClient:
|
|
|
38
39
|
high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
|
|
39
40
|
vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
|
|
40
41
|
flush_signal: typing.Optional[SpeechToTextStreamingFlushSignal] = None,
|
|
42
|
+
input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
|
|
41
43
|
api_subscription_key: typing.Optional[str] = None,
|
|
42
44
|
request_options: typing.Optional[RequestOptions] = None,
|
|
43
45
|
) -> typing.Iterator[SpeechToTextStreamingSocketClient]:
|
|
@@ -51,19 +53,61 @@ class RawSpeechToTextStreamingClient:
|
|
|
51
53
|
Parameters
|
|
52
54
|
----------
|
|
53
55
|
language_code : SpeechToTextStreamingLanguageCode
|
|
54
|
-
|
|
56
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
57
|
+
|
|
58
|
+
**Available Options (saarika:v2.5):**
|
|
59
|
+
- `hi-IN`: Hindi
|
|
60
|
+
- `bn-IN`: Bengali
|
|
61
|
+
- `gu-IN`: Gujarati
|
|
62
|
+
- `kn-IN`: Kannada
|
|
63
|
+
- `ml-IN`: Malayalam
|
|
64
|
+
- `mr-IN`: Marathi
|
|
65
|
+
- `od-IN`: Odia
|
|
66
|
+
- `pa-IN`: Punjabi
|
|
67
|
+
- `ta-IN`: Tamil
|
|
68
|
+
- `te-IN`: Telugu
|
|
69
|
+
- `en-IN`: English
|
|
70
|
+
|
|
71
|
+
**Additional Options (saaras:v3 only):**
|
|
72
|
+
- `as-IN`: Assamese
|
|
73
|
+
- `ur-IN`: Urdu
|
|
74
|
+
- `ne-IN`: Nepali
|
|
75
|
+
- `kok-IN`: Konkani
|
|
76
|
+
- `ks-IN`: Kashmiri
|
|
77
|
+
- `sd-IN`: Sindhi
|
|
78
|
+
- `sa-IN`: Sanskrit
|
|
79
|
+
- `sat-IN`: Santali
|
|
80
|
+
- `mni-IN`: Manipuri
|
|
81
|
+
- `brx-IN`: Bodo
|
|
82
|
+
- `mai-IN`: Maithili
|
|
83
|
+
- `doi-IN`: Dogri
|
|
55
84
|
|
|
56
85
|
model : typing.Optional[SpeechToTextStreamingModel]
|
|
57
|
-
|
|
86
|
+
Specifies the model to use for speech-to-text conversion.
|
|
87
|
+
|
|
88
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
89
|
+
|
|
90
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
58
91
|
|
|
59
92
|
mode : typing.Optional[SpeechToTextStreamingMode]
|
|
60
|
-
Mode of operation
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
-
|
|
65
|
-
|
|
66
|
-
|
|
93
|
+
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
94
|
+
|
|
95
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
96
|
+
|
|
97
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
98
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
99
|
+
|
|
100
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
101
|
+
- Output: `My phone number is 9840950950`
|
|
102
|
+
|
|
103
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
104
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
105
|
+
|
|
106
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
107
|
+
- Output: `mera phone number hai 9840950950`
|
|
108
|
+
|
|
109
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
110
|
+
- Output: `मेरा phone number है 9840950950`
|
|
67
111
|
|
|
68
112
|
sample_rate : typing.Optional[str]
|
|
69
113
|
Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
|
|
@@ -77,6 +121,10 @@ class RawSpeechToTextStreamingClient:
|
|
|
77
121
|
flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
|
|
78
122
|
Signal to flush the audio buffer and finalize transcription
|
|
79
123
|
|
|
124
|
+
input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
|
|
125
|
+
Audio codec/format of the input stream. Use this when sending raw PCM audio.
|
|
126
|
+
Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
|
|
127
|
+
|
|
80
128
|
api_subscription_key : typing.Optional[str]
|
|
81
129
|
API subscription key for authentication
|
|
82
130
|
|
|
@@ -103,6 +151,8 @@ class RawSpeechToTextStreamingClient:
|
|
|
103
151
|
query_params = query_params.add("vad_signals", vad_signals)
|
|
104
152
|
if flush_signal is not None:
|
|
105
153
|
query_params = query_params.add("flush_signal", flush_signal)
|
|
154
|
+
if input_audio_codec is not None:
|
|
155
|
+
query_params = query_params.add("input_audio_codec", input_audio_codec)
|
|
106
156
|
ws_url = ws_url + f"?{query_params}"
|
|
107
157
|
headers = self._client_wrapper.get_headers()
|
|
108
158
|
if api_subscription_key is not None:
|
|
@@ -142,6 +192,7 @@ class AsyncRawSpeechToTextStreamingClient:
|
|
|
142
192
|
high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
|
|
143
193
|
vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
|
|
144
194
|
flush_signal: typing.Optional[SpeechToTextStreamingFlushSignal] = None,
|
|
195
|
+
input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
|
|
145
196
|
api_subscription_key: typing.Optional[str] = None,
|
|
146
197
|
request_options: typing.Optional[RequestOptions] = None,
|
|
147
198
|
) -> typing.AsyncIterator[AsyncSpeechToTextStreamingSocketClient]:
|
|
@@ -155,19 +206,61 @@ class AsyncRawSpeechToTextStreamingClient:
|
|
|
155
206
|
Parameters
|
|
156
207
|
----------
|
|
157
208
|
language_code : SpeechToTextStreamingLanguageCode
|
|
158
|
-
|
|
209
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
210
|
+
|
|
211
|
+
**Available Options (saarika:v2.5):**
|
|
212
|
+
- `hi-IN`: Hindi
|
|
213
|
+
- `bn-IN`: Bengali
|
|
214
|
+
- `gu-IN`: Gujarati
|
|
215
|
+
- `kn-IN`: Kannada
|
|
216
|
+
- `ml-IN`: Malayalam
|
|
217
|
+
- `mr-IN`: Marathi
|
|
218
|
+
- `od-IN`: Odia
|
|
219
|
+
- `pa-IN`: Punjabi
|
|
220
|
+
- `ta-IN`: Tamil
|
|
221
|
+
- `te-IN`: Telugu
|
|
222
|
+
- `en-IN`: English
|
|
223
|
+
|
|
224
|
+
**Additional Options (saaras:v3 only):**
|
|
225
|
+
- `as-IN`: Assamese
|
|
226
|
+
- `ur-IN`: Urdu
|
|
227
|
+
- `ne-IN`: Nepali
|
|
228
|
+
- `kok-IN`: Konkani
|
|
229
|
+
- `ks-IN`: Kashmiri
|
|
230
|
+
- `sd-IN`: Sindhi
|
|
231
|
+
- `sa-IN`: Sanskrit
|
|
232
|
+
- `sat-IN`: Santali
|
|
233
|
+
- `mni-IN`: Manipuri
|
|
234
|
+
- `brx-IN`: Bodo
|
|
235
|
+
- `mai-IN`: Maithili
|
|
236
|
+
- `doi-IN`: Dogri
|
|
159
237
|
|
|
160
238
|
model : typing.Optional[SpeechToTextStreamingModel]
|
|
161
|
-
|
|
239
|
+
Specifies the model to use for speech-to-text conversion.
|
|
240
|
+
|
|
241
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
242
|
+
|
|
243
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
162
244
|
|
|
163
245
|
mode : typing.Optional[SpeechToTextStreamingMode]
|
|
164
|
-
Mode of operation
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
-
|
|
169
|
-
|
|
170
|
-
|
|
246
|
+
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
247
|
+
|
|
248
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
249
|
+
|
|
250
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
251
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
252
|
+
|
|
253
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
254
|
+
- Output: `My phone number is 9840950950`
|
|
255
|
+
|
|
256
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
257
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
258
|
+
|
|
259
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
260
|
+
- Output: `mera phone number hai 9840950950`
|
|
261
|
+
|
|
262
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
263
|
+
- Output: `मेरा phone number है 9840950950`
|
|
171
264
|
|
|
172
265
|
sample_rate : typing.Optional[str]
|
|
173
266
|
Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
|
|
@@ -181,6 +274,10 @@ class AsyncRawSpeechToTextStreamingClient:
|
|
|
181
274
|
flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
|
|
182
275
|
Signal to flush the audio buffer and finalize transcription
|
|
183
276
|
|
|
277
|
+
input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
|
|
278
|
+
Audio codec/format of the input stream. Use this when sending raw PCM audio.
|
|
279
|
+
Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
|
|
280
|
+
|
|
184
281
|
api_subscription_key : typing.Optional[str]
|
|
185
282
|
API subscription key for authentication
|
|
186
283
|
|
|
@@ -207,6 +304,8 @@ class AsyncRawSpeechToTextStreamingClient:
|
|
|
207
304
|
query_params = query_params.add("vad_signals", vad_signals)
|
|
208
305
|
if flush_signal is not None:
|
|
209
306
|
query_params = query_params.add("flush_signal", flush_signal)
|
|
307
|
+
if input_audio_codec is not None:
|
|
308
|
+
query_params = query_params.add("input_audio_codec", input_audio_codec)
|
|
210
309
|
ws_url = ws_url + f"?{query_params}"
|
|
211
310
|
headers = self._client_wrapper.get_headers()
|
|
212
311
|
if api_subscription_key is not None:
|
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
from .speech_to_text_streaming_flush_signal import SpeechToTextStreamingFlushSignal
|
|
6
6
|
from .speech_to_text_streaming_high_vad_sensitivity import SpeechToTextStreamingHighVadSensitivity
|
|
7
|
+
from .speech_to_text_streaming_input_audio_codec import SpeechToTextStreamingInputAudioCodec
|
|
7
8
|
from .speech_to_text_streaming_language_code import SpeechToTextStreamingLanguageCode
|
|
8
9
|
from .speech_to_text_streaming_mode import SpeechToTextStreamingMode
|
|
9
10
|
from .speech_to_text_streaming_model import SpeechToTextStreamingModel
|
|
@@ -12,6 +13,7 @@ from .speech_to_text_streaming_vad_signals import SpeechToTextStreamingVadSignal
|
|
|
12
13
|
__all__ = [
|
|
13
14
|
"SpeechToTextStreamingFlushSignal",
|
|
14
15
|
"SpeechToTextStreamingHighVadSensitivity",
|
|
16
|
+
"SpeechToTextStreamingInputAudioCodec",
|
|
15
17
|
"SpeechToTextStreamingLanguageCode",
|
|
16
18
|
"SpeechToTextStreamingMode",
|
|
17
19
|
"SpeechToTextStreamingModel",
|
|
@@ -3,6 +3,30 @@
|
|
|
3
3
|
import typing
|
|
4
4
|
|
|
5
5
|
SpeechToTextStreamingLanguageCode = typing.Union[
|
|
6
|
-
typing.Literal[
|
|
6
|
+
typing.Literal[
|
|
7
|
+
"en-IN",
|
|
8
|
+
"hi-IN",
|
|
9
|
+
"bn-IN",
|
|
10
|
+
"gu-IN",
|
|
11
|
+
"kn-IN",
|
|
12
|
+
"ml-IN",
|
|
13
|
+
"mr-IN",
|
|
14
|
+
"od-IN",
|
|
15
|
+
"pa-IN",
|
|
16
|
+
"ta-IN",
|
|
17
|
+
"te-IN",
|
|
18
|
+
"as-IN",
|
|
19
|
+
"ur-IN",
|
|
20
|
+
"ne-IN",
|
|
21
|
+
"kok-IN",
|
|
22
|
+
"ks-IN",
|
|
23
|
+
"sd-IN",
|
|
24
|
+
"sa-IN",
|
|
25
|
+
"sat-IN",
|
|
26
|
+
"mni-IN",
|
|
27
|
+
"brx-IN",
|
|
28
|
+
"mai-IN",
|
|
29
|
+
"doi-IN",
|
|
30
|
+
],
|
|
7
31
|
typing.Any,
|
|
8
32
|
]
|
|
@@ -5,11 +5,13 @@
|
|
|
5
5
|
from .types import (
|
|
6
6
|
SpeechToTextTranslateStreamingFlushSignal,
|
|
7
7
|
SpeechToTextTranslateStreamingHighVadSensitivity,
|
|
8
|
+
SpeechToTextTranslateStreamingInputAudioCodec,
|
|
8
9
|
SpeechToTextTranslateStreamingVadSignals,
|
|
9
10
|
)
|
|
10
11
|
|
|
11
12
|
__all__ = [
|
|
12
13
|
"SpeechToTextTranslateStreamingFlushSignal",
|
|
13
14
|
"SpeechToTextTranslateStreamingHighVadSensitivity",
|
|
15
|
+
"SpeechToTextTranslateStreamingInputAudioCodec",
|
|
14
16
|
"SpeechToTextTranslateStreamingVadSignals",
|
|
15
17
|
]
|
|
@@ -15,6 +15,7 @@ from .types.speech_to_text_translate_streaming_flush_signal import SpeechToTextT
|
|
|
15
15
|
from .types.speech_to_text_translate_streaming_high_vad_sensitivity import (
|
|
16
16
|
SpeechToTextTranslateStreamingHighVadSensitivity,
|
|
17
17
|
)
|
|
18
|
+
from .types.speech_to_text_translate_streaming_input_audio_codec import SpeechToTextTranslateStreamingInputAudioCodec
|
|
18
19
|
from .types.speech_to_text_translate_streaming_vad_signals import SpeechToTextTranslateStreamingVadSignals
|
|
19
20
|
|
|
20
21
|
try:
|
|
@@ -47,6 +48,7 @@ class SpeechToTextTranslateStreamingClient:
|
|
|
47
48
|
high_vad_sensitivity: typing.Optional[SpeechToTextTranslateStreamingHighVadSensitivity] = None,
|
|
48
49
|
vad_signals: typing.Optional[SpeechToTextTranslateStreamingVadSignals] = None,
|
|
49
50
|
flush_signal: typing.Optional[SpeechToTextTranslateStreamingFlushSignal] = None,
|
|
51
|
+
input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
|
|
50
52
|
api_subscription_key: typing.Optional[str] = None,
|
|
51
53
|
request_options: typing.Optional[RequestOptions] = None,
|
|
52
54
|
) -> typing.Iterator[SpeechToTextTranslateStreamingSocketClient]:
|
|
@@ -60,7 +62,10 @@ class SpeechToTextTranslateStreamingClient:
|
|
|
60
62
|
Parameters
|
|
61
63
|
----------
|
|
62
64
|
model : typing.Optional[typing.Literal["saaras:v2.5"]]
|
|
63
|
-
|
|
65
|
+
Model to be used for speech to text translation.
|
|
66
|
+
|
|
67
|
+
- **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
|
|
68
|
+
- Example: Hindi audio → English text output
|
|
64
69
|
|
|
65
70
|
sample_rate : typing.Optional[str]
|
|
66
71
|
Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
|
|
@@ -74,6 +79,10 @@ class SpeechToTextTranslateStreamingClient:
|
|
|
74
79
|
flush_signal : typing.Optional[SpeechToTextTranslateStreamingFlushSignal]
|
|
75
80
|
Signal to flush the audio buffer and finalize transcription and translation
|
|
76
81
|
|
|
82
|
+
input_audio_codec : typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec]
|
|
83
|
+
Audio codec/format of the input stream. Use this when sending raw PCM audio.
|
|
84
|
+
Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
|
|
85
|
+
|
|
77
86
|
api_subscription_key : typing.Optional[str]
|
|
78
87
|
API subscription key for authentication
|
|
79
88
|
|
|
@@ -96,6 +105,8 @@ class SpeechToTextTranslateStreamingClient:
|
|
|
96
105
|
query_params = query_params.add("vad_signals", vad_signals)
|
|
97
106
|
if flush_signal is not None:
|
|
98
107
|
query_params = query_params.add("flush_signal", flush_signal)
|
|
108
|
+
if input_audio_codec is not None:
|
|
109
|
+
query_params = query_params.add("input_audio_codec", input_audio_codec)
|
|
99
110
|
ws_url = ws_url + f"?{query_params}"
|
|
100
111
|
headers = self._raw_client._client_wrapper.get_headers()
|
|
101
112
|
if api_subscription_key is not None:
|
|
@@ -144,6 +155,7 @@ class AsyncSpeechToTextTranslateStreamingClient:
|
|
|
144
155
|
high_vad_sensitivity: typing.Optional[SpeechToTextTranslateStreamingHighVadSensitivity] = None,
|
|
145
156
|
vad_signals: typing.Optional[SpeechToTextTranslateStreamingVadSignals] = None,
|
|
146
157
|
flush_signal: typing.Optional[SpeechToTextTranslateStreamingFlushSignal] = None,
|
|
158
|
+
input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
|
|
147
159
|
api_subscription_key: typing.Optional[str] = None,
|
|
148
160
|
request_options: typing.Optional[RequestOptions] = None,
|
|
149
161
|
) -> typing.AsyncIterator[AsyncSpeechToTextTranslateStreamingSocketClient]:
|
|
@@ -157,7 +169,10 @@ class AsyncSpeechToTextTranslateStreamingClient:
|
|
|
157
169
|
Parameters
|
|
158
170
|
----------
|
|
159
171
|
model : typing.Optional[typing.Literal["saaras:v2.5"]]
|
|
160
|
-
|
|
172
|
+
Model to be used for speech to text translation.
|
|
173
|
+
|
|
174
|
+
- **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
|
|
175
|
+
- Example: Hindi audio → English text output
|
|
161
176
|
|
|
162
177
|
sample_rate : typing.Optional[str]
|
|
163
178
|
Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
|
|
@@ -171,6 +186,10 @@ class AsyncSpeechToTextTranslateStreamingClient:
|
|
|
171
186
|
flush_signal : typing.Optional[SpeechToTextTranslateStreamingFlushSignal]
|
|
172
187
|
Signal to flush the audio buffer and finalize transcription and translation
|
|
173
188
|
|
|
189
|
+
input_audio_codec : typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec]
|
|
190
|
+
Audio codec/format of the input stream. Use this when sending raw PCM audio.
|
|
191
|
+
Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
|
|
192
|
+
|
|
174
193
|
api_subscription_key : typing.Optional[str]
|
|
175
194
|
API subscription key for authentication
|
|
176
195
|
|
|
@@ -193,6 +212,8 @@ class AsyncSpeechToTextTranslateStreamingClient:
|
|
|
193
212
|
query_params = query_params.add("vad_signals", vad_signals)
|
|
194
213
|
if flush_signal is not None:
|
|
195
214
|
query_params = query_params.add("flush_signal", flush_signal)
|
|
215
|
+
if input_audio_codec is not None:
|
|
216
|
+
query_params = query_params.add("input_audio_codec", input_audio_codec)
|
|
196
217
|
ws_url = ws_url + f"?{query_params}"
|
|
197
218
|
headers = self._raw_client._client_wrapper.get_headers()
|
|
198
219
|
if api_subscription_key is not None:
|