PyPI - sarvamai - Versions diffs - 0.1.23a3__py3-none-any.whl → 0.1.23a5__py3-none-any.whl - Mend

sarvamai-0.1.23a3-py3-none-any.whl → sarvamai-0.1.23a5-py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.

Files changed (71)

sarvamai/__init__.py +203 -405
sarvamai/chat/raw_client.py +20 -20
sarvamai/client.py +34 -186
sarvamai/core/__init__.py +21 -76
sarvamai/core/client_wrapper.py +3 -19
sarvamai/core/force_multipart.py +2 -4
sarvamai/core/http_client.py +97 -217
sarvamai/core/http_response.py +1 -1
sarvamai/core/jsonable_encoder.py +0 -8
sarvamai/core/pydantic_utilities.py +4 -110
sarvamai/errors/__init__.py +6 -40
sarvamai/errors/bad_request_error.py +1 -1
sarvamai/errors/forbidden_error.py +1 -1
sarvamai/errors/internal_server_error.py +1 -1
sarvamai/errors/service_unavailable_error.py +1 -1
sarvamai/errors/too_many_requests_error.py +1 -1
sarvamai/errors/unprocessable_entity_error.py +1 -1
sarvamai/requests/__init__.py +62 -150
sarvamai/requests/configure_connection.py +4 -0
sarvamai/requests/configure_connection_data.py +40 -11
sarvamai/requests/error_response_data.py +1 -1
sarvamai/requests/file_signed_url_details.py +1 -1
sarvamai/requests/speech_to_text_job_parameters.py +43 -2
sarvamai/requests/speech_to_text_transcription_data.py +2 -2
sarvamai/requests/speech_to_text_translate_job_parameters.py +4 -1
sarvamai/speech_to_text/client.py +95 -10
sarvamai/speech_to_text/raw_client.py +147 -64
sarvamai/speech_to_text_job/client.py +60 -15
sarvamai/speech_to_text_job/raw_client.py +120 -120
sarvamai/speech_to_text_streaming/__init__.py +10 -38
sarvamai/speech_to_text_streaming/client.py +90 -8
sarvamai/speech_to_text_streaming/raw_client.py +90 -8
sarvamai/speech_to_text_streaming/types/__init__.py +8 -36
sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_mode.py +7 -0
sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_model.py +5 -0
sarvamai/speech_to_text_translate_job/raw_client.py +120 -120
sarvamai/speech_to_text_translate_streaming/__init__.py +5 -36
sarvamai/speech_to_text_translate_streaming/client.py +8 -2
sarvamai/speech_to_text_translate_streaming/raw_client.py +8 -2
sarvamai/speech_to_text_translate_streaming/types/__init__.py +3 -36
sarvamai/text/raw_client.py +60 -60
sarvamai/text_to_speech/client.py +100 -16
sarvamai/text_to_speech/raw_client.py +120 -36
sarvamai/text_to_speech_streaming/__init__.py +2 -29
sarvamai/text_to_speech_streaming/client.py +19 -6
sarvamai/text_to_speech_streaming/raw_client.py +19 -6
sarvamai/text_to_speech_streaming/types/__init__.py +3 -31
sarvamai/text_to_speech_streaming/types/text_to_speech_streaming_model.py +5 -0
sarvamai/types/__init__.py +102 -222
sarvamai/types/chat_completion_request_message.py +2 -6
sarvamai/types/configure_connection.py +4 -0
sarvamai/types/configure_connection_data.py +40 -11
sarvamai/types/configure_connection_data_model.py +5 -0
sarvamai/types/configure_connection_data_speaker.py +35 -1
sarvamai/types/error_response_data.py +1 -1
sarvamai/types/file_signed_url_details.py +1 -1
sarvamai/types/mode.py +5 -0
sarvamai/types/speech_to_text_job_parameters.py +43 -2
sarvamai/types/speech_to_text_model.py +1 -1
sarvamai/types/speech_to_text_transcription_data.py +2 -2
sarvamai/types/speech_to_text_translate_job_parameters.py +4 -1
sarvamai/types/text_to_speech_model.py +1 -1
sarvamai/types/text_to_speech_speaker.py +35 -1
{sarvamai-0.1.23a3.dist-info → sarvamai-0.1.23a5.dist-info}/METADATA +1 -2
{sarvamai-0.1.23a3.dist-info → sarvamai-0.1.23a5.dist-info}/RECORD +66 -66
sarvamai/core/http_sse/__init__.py +0 -42
sarvamai/core/http_sse/_api.py +0 -112
sarvamai/core/http_sse/_decoders.py +0 -61
sarvamai/core/http_sse/_exceptions.py +0 -7
sarvamai/core/http_sse/_models.py +0 -17
{sarvamai-0.1.23a3.dist-info → sarvamai-0.1.23a5.dist-info}/WHEEL +0 -0

sarvamai/text_to_speech/client.py CHANGED Viewed

@@ -44,11 +44,22 @@ class TextToSpeechClient:
         enable_preprocessing: typing.Optional[bool] = OMIT,
         model: typing.Optional[TextToSpeechModel] = OMIT,
         output_audio_codec: typing.Optional[TextToSpeechOutputAudioCodec] = OMIT,
+        temperature: typing.Optional[float] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> TextToSpeechResponse:
         """
-        This is the model to convert text into spoken audio.
-        The output is a wave file encoded as a base64 string.
+        Convert text into spoken audio. The output is a wave file encoded as a base64 string.
+        **Available Models:**
+        - **bulbul:v2** (default): Supports pitch, loudness, and pace controls
+        - **bulbul:v3-beta**: Newer model with temperature control and improved quality
+        **Important Notes for bulbul:v3-beta:**
+        - Pitch and loudness parameters are NOT supported
+        - Pace must be between 0.5 and 2.0
+        - Preprocessing is automatically enabled
+        - Default sample rate is 24000 Hz
+        - Temperature parameter available (0.01-1.0, default 0.6)
         Parameters
         ----------
@@ -56,9 +67,12 @@ class TextToSpeechClient:
             The text(s) to be converted into speech.
             **Features:**
-            - Each text should be no longer than 1500 characters
             - Supports code-mixed text (English and Indic languages)
+            **Model-specific limits:**
+            - **bulbul:v2:** Max 1500 characters
+            - **bulbul:v3-beta:** Max 2500 characters
             **Important Note:**
             - For numbers larger than 4 digits, use commas (e.g., '10,000' instead of '10000')
             - This ensures proper pronunciation as a whole number
@@ -69,36 +83,63 @@ class TextToSpeechClient:
         speaker : typing.Optional[TextToSpeechSpeaker]
             The speaker voice to be used for the output audio.
-            **Default:** Anushka
+            **Default:** Anushka (for bulbul:v2), Aditya (for bulbul:v3-beta)
             **Model Compatibility (Speakers compatible with respective model):**
             - **bulbul:v2:**
               - Female: Anushka, Manisha, Vidya, Arya
               - Male: Abhilash, Karun, Hitesh
+            - **bulbul:v3-beta:**
+              - Aditya, Ritu, Priya, Neha, Rahul, Pooja, Rohan, Simran, Kavya, Amit, Dev, Ishita, Shreya, Ratan, Varun, Manan, Sumit, Roopa, Kabir, Aayan, Shubh, Ashutosh, Advait, Amelia, Sophia
             **Note:** Speaker selection must match the chosen model version.
         pitch : typing.Optional[float]
             Controls the pitch of the audio. Lower values result in a deeper voice, while higher values make it sharper. The suitable range is between -0.75 and 0.75. Default is 0.0.
+            **Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
         pace : typing.Optional[float]
-            Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster. The suitable range is between 0.5 and 2.0. Default is 1.0.
+            Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster. Default is 1.0.
+            **Model-specific ranges:**
+            - **bulbul:v2:** 0.3 to 3.0
+            - **bulbul:v3-beta:** 0.5 to 2.0
         loudness : typing.Optional[float]
             Controls the loudness of the audio. Lower values result in quieter audio, while higher values make it louder. The suitable range is between 0.3 and 3.0. Default is 1.0.
+            **Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
         speech_sample_rate : typing.Optional[SpeechSampleRate]
-            Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz. If not provided, the default is 22050 Hz.
+            Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz.
+            **Model-specific defaults:**
+            - **bulbul:v2:** Default is 22050 Hz
+            - **bulbul:v3-beta:** Default is 24000 Hz
         enable_preprocessing : typing.Optional[bool]
-             Controls whether normalization of English words and numeric entities (e.g., numbers, dates) is performed. Set to true for better handling of mixed-language text. Default is false.
+            Controls whether normalization of English words and numeric entities (e.g., numbers, dates) is performed. Set to true for better handling of mixed-language text.
+            **Model-specific behavior:**
+            - **bulbul:v2:** Default is false
+            - **bulbul:v3-beta:** Automatically enabled (true) and cannot be disabled
         model : typing.Optional[TextToSpeechModel]
-            Specifies the model to use for text-to-speech conversion. Default is bulbul:v2.
+            Specifies the model to use for text-to-speech conversion.
+            **Available models:**
+            - **bulbul:v2:** Default model with pitch, loudness controls
+            - **bulbul:v3-beta:** Newer model with temperature control, improved quality
         output_audio_codec : typing.Optional[TextToSpeechOutputAudioCodec]
             Specifies the audio codec for the output audio file. Different codecs offer various compression and quality characteristics.
+        temperature : typing.Optional[float]
+            Controls the randomness of the output. Lower values make the output more focused and deterministic, while higher values make it more random. The suitable range is between 0.01 and 1.0. Default is 0.6.
+            **Note:** This parameter is only supported for bulbul:v3-beta. It has no effect on bulbul:v2.
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
@@ -130,6 +171,7 @@ class TextToSpeechClient:
             enable_preprocessing=enable_preprocessing,
             model=model,
             output_audio_codec=output_audio_codec,
+            temperature=temperature,
             request_options=request_options,
         )
         return _response.data
@@ -163,11 +205,22 @@ class AsyncTextToSpeechClient:
         enable_preprocessing: typing.Optional[bool] = OMIT,
         model: typing.Optional[TextToSpeechModel] = OMIT,
         output_audio_codec: typing.Optional[TextToSpeechOutputAudioCodec] = OMIT,
+        temperature: typing.Optional[float] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> TextToSpeechResponse:
         """
-        This is the model to convert text into spoken audio.
-        The output is a wave file encoded as a base64 string.
+        Convert text into spoken audio. The output is a wave file encoded as a base64 string.
+        **Available Models:**
+        - **bulbul:v2** (default): Supports pitch, loudness, and pace controls
+        - **bulbul:v3-beta**: Newer model with temperature control and improved quality
+        **Important Notes for bulbul:v3-beta:**
+        - Pitch and loudness parameters are NOT supported
+        - Pace must be between 0.5 and 2.0
+        - Preprocessing is automatically enabled
+        - Default sample rate is 24000 Hz
+        - Temperature parameter available (0.01-1.0, default 0.6)
         Parameters
         ----------
@@ -175,9 +228,12 @@ class AsyncTextToSpeechClient:
             The text(s) to be converted into speech.
             **Features:**
-            - Each text should be no longer than 1500 characters
             - Supports code-mixed text (English and Indic languages)
+            **Model-specific limits:**
+            - **bulbul:v2:** Max 1500 characters
+            - **bulbul:v3-beta:** Max 2500 characters
             **Important Note:**
             - For numbers larger than 4 digits, use commas (e.g., '10,000' instead of '10000')
             - This ensures proper pronunciation as a whole number
@@ -188,36 +244,63 @@ class AsyncTextToSpeechClient:
         speaker : typing.Optional[TextToSpeechSpeaker]
             The speaker voice to be used for the output audio.
-            **Default:** Anushka
+            **Default:** Anushka (for bulbul:v2), Aditya (for bulbul:v3-beta)
             **Model Compatibility (Speakers compatible with respective model):**
             - **bulbul:v2:**
               - Female: Anushka, Manisha, Vidya, Arya
               - Male: Abhilash, Karun, Hitesh
+            - **bulbul:v3-beta:**
+              - Aditya, Ritu, Priya, Neha, Rahul, Pooja, Rohan, Simran, Kavya, Amit, Dev, Ishita, Shreya, Ratan, Varun, Manan, Sumit, Roopa, Kabir, Aayan, Shubh, Ashutosh, Advait, Amelia, Sophia
             **Note:** Speaker selection must match the chosen model version.
         pitch : typing.Optional[float]
             Controls the pitch of the audio. Lower values result in a deeper voice, while higher values make it sharper. The suitable range is between -0.75 and 0.75. Default is 0.0.
+            **Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
         pace : typing.Optional[float]
-            Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster. The suitable range is between 0.5 and 2.0. Default is 1.0.
+            Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster. Default is 1.0.
+            **Model-specific ranges:**
+            - **bulbul:v2:** 0.3 to 3.0
+            - **bulbul:v3-beta:** 0.5 to 2.0
         loudness : typing.Optional[float]
             Controls the loudness of the audio. Lower values result in quieter audio, while higher values make it louder. The suitable range is between 0.3 and 3.0. Default is 1.0.
+            **Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
         speech_sample_rate : typing.Optional[SpeechSampleRate]
-            Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz. If not provided, the default is 22050 Hz.
+            Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz.
+            **Model-specific defaults:**
+            - **bulbul:v2:** Default is 22050 Hz
+            - **bulbul:v3-beta:** Default is 24000 Hz
         enable_preprocessing : typing.Optional[bool]
-             Controls whether normalization of English words and numeric entities (e.g., numbers, dates) is performed. Set to true for better handling of mixed-language text. Default is false.
+            Controls whether normalization of English words and numeric entities (e.g., numbers, dates) is performed. Set to true for better handling of mixed-language text.
+            **Model-specific behavior:**
+            - **bulbul:v2:** Default is false
+            - **bulbul:v3-beta:** Automatically enabled (true) and cannot be disabled
         model : typing.Optional[TextToSpeechModel]
-            Specifies the model to use for text-to-speech conversion. Default is bulbul:v2.
+            Specifies the model to use for text-to-speech conversion.
+            **Available models:**
+            - **bulbul:v2:** Default model with pitch, loudness controls
+            - **bulbul:v3-beta:** Newer model with temperature control, improved quality
         output_audio_codec : typing.Optional[TextToSpeechOutputAudioCodec]
             Specifies the audio codec for the output audio file. Different codecs offer various compression and quality characteristics.
+        temperature : typing.Optional[float]
+            Controls the randomness of the output. Lower values make the output more focused and deterministic, while higher values make it more random. The suitable range is between 0.01 and 1.0. Default is 0.6.
+            **Note:** This parameter is only supported for bulbul:v3-beta. It has no effect on bulbul:v2.
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
@@ -257,6 +340,7 @@ class AsyncTextToSpeechClient:
             enable_preprocessing=enable_preprocessing,
             model=model,
             output_audio_codec=output_audio_codec,
+            temperature=temperature,
             request_options=request_options,
         )
         return _response.data

sarvamai/text_to_speech/raw_client.py CHANGED Viewed

@@ -41,11 +41,22 @@ class RawTextToSpeechClient:
         enable_preprocessing: typing.Optional[bool] = OMIT,
         model: typing.Optional[TextToSpeechModel] = OMIT,
         output_audio_codec: typing.Optional[TextToSpeechOutputAudioCodec] = OMIT,
+        temperature: typing.Optional[float] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> HttpResponse[TextToSpeechResponse]:
         """
-        This is the model to convert text into spoken audio.
-        The output is a wave file encoded as a base64 string.
+        Convert text into spoken audio. The output is a wave file encoded as a base64 string.
+        **Available Models:**
+        - **bulbul:v2** (default): Supports pitch, loudness, and pace controls
+        - **bulbul:v3-beta**: Newer model with temperature control and improved quality
+        **Important Notes for bulbul:v3-beta:**
+        - Pitch and loudness parameters are NOT supported
+        - Pace must be between 0.5 and 2.0
+        - Preprocessing is automatically enabled
+        - Default sample rate is 24000 Hz
+        - Temperature parameter available (0.01-1.0, default 0.6)
         Parameters
         ----------
@@ -53,9 +64,12 @@ class RawTextToSpeechClient:
             The text(s) to be converted into speech.
             **Features:**
-            - Each text should be no longer than 1500 characters
             - Supports code-mixed text (English and Indic languages)
+            **Model-specific limits:**
+            - **bulbul:v2:** Max 1500 characters
+            - **bulbul:v3-beta:** Max 2500 characters
             **Important Note:**
             - For numbers larger than 4 digits, use commas (e.g., '10,000' instead of '10000')
             - This ensures proper pronunciation as a whole number
@@ -66,36 +80,63 @@ class RawTextToSpeechClient:
         speaker : typing.Optional[TextToSpeechSpeaker]
             The speaker voice to be used for the output audio.
-            **Default:** Anushka
+            **Default:** Anushka (for bulbul:v2), Aditya (for bulbul:v3-beta)
             **Model Compatibility (Speakers compatible with respective model):**
             - **bulbul:v2:**
               - Female: Anushka, Manisha, Vidya, Arya
               - Male: Abhilash, Karun, Hitesh
+            - **bulbul:v3-beta:**
+              - Aditya, Ritu, Priya, Neha, Rahul, Pooja, Rohan, Simran, Kavya, Amit, Dev, Ishita, Shreya, Ratan, Varun, Manan, Sumit, Roopa, Kabir, Aayan, Shubh, Ashutosh, Advait, Amelia, Sophia
             **Note:** Speaker selection must match the chosen model version.
         pitch : typing.Optional[float]
             Controls the pitch of the audio. Lower values result in a deeper voice, while higher values make it sharper. The suitable range is between -0.75 and 0.75. Default is 0.0.
+            **Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
         pace : typing.Optional[float]
-            Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster. The suitable range is between 0.5 and 2.0. Default is 1.0.
+            Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster. Default is 1.0.
+            **Model-specific ranges:**
+            - **bulbul:v2:** 0.3 to 3.0
+            - **bulbul:v3-beta:** 0.5 to 2.0
         loudness : typing.Optional[float]
             Controls the loudness of the audio. Lower values result in quieter audio, while higher values make it louder. The suitable range is between 0.3 and 3.0. Default is 1.0.
+            **Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
         speech_sample_rate : typing.Optional[SpeechSampleRate]
-            Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz. If not provided, the default is 22050 Hz.
+            Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz.
+            **Model-specific defaults:**
+            - **bulbul:v2:** Default is 22050 Hz
+            - **bulbul:v3-beta:** Default is 24000 Hz
         enable_preprocessing : typing.Optional[bool]
-             Controls whether normalization of English words and numeric entities (e.g., numbers, dates) is performed. Set to true for better handling of mixed-language text. Default is false.
+            Controls whether normalization of English words and numeric entities (e.g., numbers, dates) is performed. Set to true for better handling of mixed-language text.
+            **Model-specific behavior:**
+            - **bulbul:v2:** Default is false
+            - **bulbul:v3-beta:** Automatically enabled (true) and cannot be disabled
         model : typing.Optional[TextToSpeechModel]
-            Specifies the model to use for text-to-speech conversion. Default is bulbul:v2.
+            Specifies the model to use for text-to-speech conversion.
+            **Available models:**
+            - **bulbul:v2:** Default model with pitch, loudness controls
+            - **bulbul:v3-beta:** Newer model with temperature control, improved quality
         output_audio_codec : typing.Optional[TextToSpeechOutputAudioCodec]
             Specifies the audio codec for the output audio file. Different codecs offer various compression and quality characteristics.
+        temperature : typing.Optional[float]
+            Controls the randomness of the output. Lower values make the output more focused and deterministic, while higher values make it more random. The suitable range is between 0.01 and 1.0. Default is 0.6.
+            **Note:** This parameter is only supported for bulbul:v3-beta. It has no effect on bulbul:v2.
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
@@ -119,6 +160,7 @@ class RawTextToSpeechClient:
                 "enable_preprocessing": enable_preprocessing,
                 "model": model,
                 "output_audio_codec": output_audio_codec,
+                "temperature": temperature,
             },
             headers={
                 "content-type": "application/json",
@@ -140,9 +182,9 @@ class RawTextToSpeechClient:
                 raise BadRequestError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),
@@ -151,9 +193,9 @@ class RawTextToSpeechClient:
                 raise ForbiddenError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),
@@ -162,9 +204,9 @@ class RawTextToSpeechClient:
                 raise UnprocessableEntityError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),
@@ -173,9 +215,9 @@ class RawTextToSpeechClient:
                 raise TooManyRequestsError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),
@@ -184,9 +226,9 @@ class RawTextToSpeechClient:
                 raise InternalServerError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),
@@ -214,11 +256,22 @@ class AsyncRawTextToSpeechClient:
         enable_preprocessing: typing.Optional[bool] = OMIT,
         model: typing.Optional[TextToSpeechModel] = OMIT,
         output_audio_codec: typing.Optional[TextToSpeechOutputAudioCodec] = OMIT,
+        temperature: typing.Optional[float] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> AsyncHttpResponse[TextToSpeechResponse]:
         """
-        This is the model to convert text into spoken audio.
-        The output is a wave file encoded as a base64 string.
+        Convert text into spoken audio. The output is a wave file encoded as a base64 string.
+        **Available Models:**
+        - **bulbul:v2** (default): Supports pitch, loudness, and pace controls
+        - **bulbul:v3-beta**: Newer model with temperature control and improved quality
+        **Important Notes for bulbul:v3-beta:**
+        - Pitch and loudness parameters are NOT supported
+        - Pace must be between 0.5 and 2.0
+        - Preprocessing is automatically enabled
+        - Default sample rate is 24000 Hz
+        - Temperature parameter available (0.01-1.0, default 0.6)
         Parameters
         ----------
@@ -226,9 +279,12 @@ class AsyncRawTextToSpeechClient:
             The text(s) to be converted into speech.
             **Features:**
-            - Each text should be no longer than 1500 characters
             - Supports code-mixed text (English and Indic languages)
+            **Model-specific limits:**
+            - **bulbul:v2:** Max 1500 characters
+            - **bulbul:v3-beta:** Max 2500 characters
             **Important Note:**
             - For numbers larger than 4 digits, use commas (e.g., '10,000' instead of '10000')
             - This ensures proper pronunciation as a whole number
@@ -239,36 +295,63 @@ class AsyncRawTextToSpeechClient:
         speaker : typing.Optional[TextToSpeechSpeaker]
             The speaker voice to be used for the output audio.
-            **Default:** Anushka
+            **Default:** Anushka (for bulbul:v2), Aditya (for bulbul:v3-beta)
             **Model Compatibility (Speakers compatible with respective model):**
             - **bulbul:v2:**
               - Female: Anushka, Manisha, Vidya, Arya
               - Male: Abhilash, Karun, Hitesh
+            - **bulbul:v3-beta:**
+              - Aditya, Ritu, Priya, Neha, Rahul, Pooja, Rohan, Simran, Kavya, Amit, Dev, Ishita, Shreya, Ratan, Varun, Manan, Sumit, Roopa, Kabir, Aayan, Shubh, Ashutosh, Advait, Amelia, Sophia
             **Note:** Speaker selection must match the chosen model version.
         pitch : typing.Optional[float]
             Controls the pitch of the audio. Lower values result in a deeper voice, while higher values make it sharper. The suitable range is between -0.75 and 0.75. Default is 0.0.
+            **Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
         pace : typing.Optional[float]
-            Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster. The suitable range is between 0.5 and 2.0. Default is 1.0.
+            Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster. Default is 1.0.
+            **Model-specific ranges:**
+            - **bulbul:v2:** 0.3 to 3.0
+            - **bulbul:v3-beta:** 0.5 to 2.0
         loudness : typing.Optional[float]
             Controls the loudness of the audio. Lower values result in quieter audio, while higher values make it louder. The suitable range is between 0.3 and 3.0. Default is 1.0.
+            **Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
         speech_sample_rate : typing.Optional[SpeechSampleRate]
-            Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz. If not provided, the default is 22050 Hz.
+            Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz.
+            **Model-specific defaults:**
+            - **bulbul:v2:** Default is 22050 Hz
+            - **bulbul:v3-beta:** Default is 24000 Hz
         enable_preprocessing : typing.Optional[bool]
-             Controls whether normalization of English words and numeric entities (e.g., numbers, dates) is performed. Set to true for better handling of mixed-language text. Default is false.
+            Controls whether normalization of English words and numeric entities (e.g., numbers, dates) is performed. Set to true for better handling of mixed-language text.
+            **Model-specific behavior:**
+            - **bulbul:v2:** Default is false
+            - **bulbul:v3-beta:** Automatically enabled (true) and cannot be disabled
         model : typing.Optional[TextToSpeechModel]
-            Specifies the model to use for text-to-speech conversion. Default is bulbul:v2.
+            Specifies the model to use for text-to-speech conversion.
+            **Available models:**
+            - **bulbul:v2:** Default model with pitch, loudness controls
+            - **bulbul:v3-beta:** Newer model with temperature control, improved quality
         output_audio_codec : typing.Optional[TextToSpeechOutputAudioCodec]
             Specifies the audio codec for the output audio file. Different codecs offer various compression and quality characteristics.
+        temperature : typing.Optional[float]
+            Controls the randomness of the output. Lower values make the output more focused and deterministic, while higher values make it more random. The suitable range is between 0.01 and 1.0. Default is 0.6.
+            **Note:** This parameter is only supported for bulbul:v3-beta. It has no effect on bulbul:v2.
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
@@ -292,6 +375,7 @@ class AsyncRawTextToSpeechClient:
                 "enable_preprocessing": enable_preprocessing,
                 "model": model,
                 "output_audio_codec": output_audio_codec,
+                "temperature": temperature,
             },
             headers={
                 "content-type": "application/json",
@@ -313,9 +397,9 @@ class AsyncRawTextToSpeechClient:
                 raise BadRequestError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),
@@ -324,9 +408,9 @@ class AsyncRawTextToSpeechClient:
                 raise ForbiddenError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),
@@ -335,9 +419,9 @@ class AsyncRawTextToSpeechClient:
                 raise UnprocessableEntityError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),
@@ -346,9 +430,9 @@ class AsyncRawTextToSpeechClient:
                 raise TooManyRequestsError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),
@@ -357,9 +441,9 @@ class AsyncRawTextToSpeechClient:
                 raise InternalServerError(
                     headers=dict(_response.headers),
                     body=typing.cast(
-                        typing.Any,
+                        typing.Optional[typing.Any],
                         parse_obj_as(
-                            type_=typing.Any,  # type: ignore
+                            type_=typing.Optional[typing.Any],  # type: ignore
                             object_=_response.json(),
                         ),
                     ),

sarvamai/text_to_speech_streaming/__init__.py CHANGED Viewed

@@ -2,33 +2,6 @@
 # isort: skip_file
-import typing
-from importlib import import_module
+from .types import TextToSpeechStreamingModel, TextToSpeechStreamingSendCompletionEvent
-if typing.TYPE_CHECKING:
-    from .types import TextToSpeechStreamingSendCompletionEvent
-_dynamic_imports: typing.Dict[str, str] = {"TextToSpeechStreamingSendCompletionEvent": ".types"}
-def __getattr__(attr_name: str) -> typing.Any:
-    module_name = _dynamic_imports.get(attr_name)
-    if module_name is None:
-        raise AttributeError(f"No {attr_name} found in _dynamic_imports for module name -> {__name__}")
-    try:
-        module = import_module(module_name, __package__)
-        if module_name == f".{attr_name}":
-            return module
-        else:
-            return getattr(module, attr_name)
-    except ImportError as e:
-        raise ImportError(f"Failed to import {attr_name} from {module_name}: {e}") from e
-    except AttributeError as e:
-        raise AttributeError(f"Failed to get {attr_name} from {module_name}: {e}") from e
-def __dir__():
-    lazy_attrs = list(_dynamic_imports.keys())
-    return sorted(lazy_attrs)
-__all__ = ["TextToSpeechStreamingSendCompletionEvent"]
+__all__ = ["TextToSpeechStreamingModel", "TextToSpeechStreamingSendCompletionEvent"]

sarvamai 0.1.23a3__py3-none-any.whl → 0.1.23a5__py3-none-any.whl

sarvamai 0.1.23a3__py3-none-any.whl → 0.1.23a5__py3-none-any.whl