sarvamai 0.1.23a3__py3-none-any.whl → 0.1.23a5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. sarvamai/__init__.py +203 -405
  2. sarvamai/chat/raw_client.py +20 -20
  3. sarvamai/client.py +34 -186
  4. sarvamai/core/__init__.py +21 -76
  5. sarvamai/core/client_wrapper.py +3 -19
  6. sarvamai/core/force_multipart.py +2 -4
  7. sarvamai/core/http_client.py +97 -217
  8. sarvamai/core/http_response.py +1 -1
  9. sarvamai/core/jsonable_encoder.py +0 -8
  10. sarvamai/core/pydantic_utilities.py +4 -110
  11. sarvamai/errors/__init__.py +6 -40
  12. sarvamai/errors/bad_request_error.py +1 -1
  13. sarvamai/errors/forbidden_error.py +1 -1
  14. sarvamai/errors/internal_server_error.py +1 -1
  15. sarvamai/errors/service_unavailable_error.py +1 -1
  16. sarvamai/errors/too_many_requests_error.py +1 -1
  17. sarvamai/errors/unprocessable_entity_error.py +1 -1
  18. sarvamai/requests/__init__.py +62 -150
  19. sarvamai/requests/configure_connection.py +4 -0
  20. sarvamai/requests/configure_connection_data.py +40 -11
  21. sarvamai/requests/error_response_data.py +1 -1
  22. sarvamai/requests/file_signed_url_details.py +1 -1
  23. sarvamai/requests/speech_to_text_job_parameters.py +43 -2
  24. sarvamai/requests/speech_to_text_transcription_data.py +2 -2
  25. sarvamai/requests/speech_to_text_translate_job_parameters.py +4 -1
  26. sarvamai/speech_to_text/client.py +95 -10
  27. sarvamai/speech_to_text/raw_client.py +147 -64
  28. sarvamai/speech_to_text_job/client.py +60 -15
  29. sarvamai/speech_to_text_job/raw_client.py +120 -120
  30. sarvamai/speech_to_text_streaming/__init__.py +10 -38
  31. sarvamai/speech_to_text_streaming/client.py +90 -8
  32. sarvamai/speech_to_text_streaming/raw_client.py +90 -8
  33. sarvamai/speech_to_text_streaming/types/__init__.py +8 -36
  34. sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_mode.py +7 -0
  35. sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_model.py +5 -0
  36. sarvamai/speech_to_text_translate_job/raw_client.py +120 -120
  37. sarvamai/speech_to_text_translate_streaming/__init__.py +5 -36
  38. sarvamai/speech_to_text_translate_streaming/client.py +8 -2
  39. sarvamai/speech_to_text_translate_streaming/raw_client.py +8 -2
  40. sarvamai/speech_to_text_translate_streaming/types/__init__.py +3 -36
  41. sarvamai/text/raw_client.py +60 -60
  42. sarvamai/text_to_speech/client.py +100 -16
  43. sarvamai/text_to_speech/raw_client.py +120 -36
  44. sarvamai/text_to_speech_streaming/__init__.py +2 -29
  45. sarvamai/text_to_speech_streaming/client.py +19 -6
  46. sarvamai/text_to_speech_streaming/raw_client.py +19 -6
  47. sarvamai/text_to_speech_streaming/types/__init__.py +3 -31
  48. sarvamai/text_to_speech_streaming/types/text_to_speech_streaming_model.py +5 -0
  49. sarvamai/types/__init__.py +102 -222
  50. sarvamai/types/chat_completion_request_message.py +2 -6
  51. sarvamai/types/configure_connection.py +4 -0
  52. sarvamai/types/configure_connection_data.py +40 -11
  53. sarvamai/types/configure_connection_data_model.py +5 -0
  54. sarvamai/types/configure_connection_data_speaker.py +35 -1
  55. sarvamai/types/error_response_data.py +1 -1
  56. sarvamai/types/file_signed_url_details.py +1 -1
  57. sarvamai/types/mode.py +5 -0
  58. sarvamai/types/speech_to_text_job_parameters.py +43 -2
  59. sarvamai/types/speech_to_text_model.py +1 -1
  60. sarvamai/types/speech_to_text_transcription_data.py +2 -2
  61. sarvamai/types/speech_to_text_translate_job_parameters.py +4 -1
  62. sarvamai/types/text_to_speech_model.py +1 -1
  63. sarvamai/types/text_to_speech_speaker.py +35 -1
  64. {sarvamai-0.1.23a3.dist-info → sarvamai-0.1.23a5.dist-info}/METADATA +1 -2
  65. {sarvamai-0.1.23a3.dist-info → sarvamai-0.1.23a5.dist-info}/RECORD +66 -66
  66. sarvamai/core/http_sse/__init__.py +0 -42
  67. sarvamai/core/http_sse/_api.py +0 -112
  68. sarvamai/core/http_sse/_decoders.py +0 -61
  69. sarvamai/core/http_sse/_exceptions.py +0 -7
  70. sarvamai/core/http_sse/_models.py +0 -17
  71. {sarvamai-0.1.23a3.dist-info → sarvamai-0.1.23a5.dist-info}/WHEEL +0 -0
@@ -44,11 +44,22 @@ class TextToSpeechClient:
44
44
  enable_preprocessing: typing.Optional[bool] = OMIT,
45
45
  model: typing.Optional[TextToSpeechModel] = OMIT,
46
46
  output_audio_codec: typing.Optional[TextToSpeechOutputAudioCodec] = OMIT,
47
+ temperature: typing.Optional[float] = OMIT,
47
48
  request_options: typing.Optional[RequestOptions] = None,
48
49
  ) -> TextToSpeechResponse:
49
50
  """
50
- This is the model to convert text into spoken audio.
51
- The output is a wave file encoded as a base64 string.
51
+ Convert text into spoken audio. The output is a wave file encoded as a base64 string.
52
+
53
+ **Available Models:**
54
+ - **bulbul:v2** (default): Supports pitch, loudness, and pace controls
55
+ - **bulbul:v3-beta**: Newer model with temperature control and improved quality
56
+
57
+ **Important Notes for bulbul:v3-beta:**
58
+ - Pitch and loudness parameters are NOT supported
59
+ - Pace must be between 0.5 and 2.0
60
+ - Preprocessing is automatically enabled
61
+ - Default sample rate is 24000 Hz
62
+ - Temperature parameter available (0.01-1.0, default 0.6)
52
63
 
53
64
  Parameters
54
65
  ----------
@@ -56,9 +67,12 @@ class TextToSpeechClient:
56
67
  The text(s) to be converted into speech.
57
68
 
58
69
  **Features:**
59
- - Each text should be no longer than 1500 characters
60
70
  - Supports code-mixed text (English and Indic languages)
61
71
 
72
+ **Model-specific limits:**
73
+ - **bulbul:v2:** Max 1500 characters
74
+ - **bulbul:v3-beta:** Max 2500 characters
75
+
62
76
  **Important Note:**
63
77
  - For numbers larger than 4 digits, use commas (e.g., '10,000' instead of '10000')
64
78
  - This ensures proper pronunciation as a whole number
@@ -69,36 +83,63 @@ class TextToSpeechClient:
69
83
  speaker : typing.Optional[TextToSpeechSpeaker]
70
84
  The speaker voice to be used for the output audio.
71
85
 
72
- **Default:** Anushka
86
+ **Default:** Anushka (for bulbul:v2), Aditya (for bulbul:v3-beta)
73
87
 
74
88
  **Model Compatibility (Speakers compatible with respective model):**
75
89
  - **bulbul:v2:**
76
90
  - Female: Anushka, Manisha, Vidya, Arya
77
91
  - Male: Abhilash, Karun, Hitesh
92
+ - **bulbul:v3-beta:**
93
+ - Aditya, Ritu, Priya, Neha, Rahul, Pooja, Rohan, Simran, Kavya, Amit, Dev, Ishita, Shreya, Ratan, Varun, Manan, Sumit, Roopa, Kabir, Aayan, Shubh, Ashutosh, Advait, Amelia, Sophia
78
94
 
79
95
  **Note:** Speaker selection must match the chosen model version.
80
96
 
81
97
  pitch : typing.Optional[float]
82
98
  Controls the pitch of the audio. Lower values result in a deeper voice, while higher values make it sharper. The suitable range is between -0.75 and 0.75. Default is 0.0.
83
99
 
100
+ **Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
101
+
84
102
  pace : typing.Optional[float]
85
- Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster. The suitable range is between 0.5 and 2.0. Default is 1.0.
103
+ Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster. Default is 1.0.
104
+
105
+ **Model-specific ranges:**
106
+ - **bulbul:v2:** 0.3 to 3.0
107
+ - **bulbul:v3-beta:** 0.5 to 2.0
86
108
 
87
109
  loudness : typing.Optional[float]
88
110
  Controls the loudness of the audio. Lower values result in quieter audio, while higher values make it louder. The suitable range is between 0.3 and 3.0. Default is 1.0.
89
111
 
112
+ **Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
113
+
90
114
  speech_sample_rate : typing.Optional[SpeechSampleRate]
91
- Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz. If not provided, the default is 22050 Hz.
115
+ Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz.
116
+
117
+ **Model-specific defaults:**
118
+ - **bulbul:v2:** Default is 22050 Hz
119
+ - **bulbul:v3-beta:** Default is 24000 Hz
92
120
 
93
121
  enable_preprocessing : typing.Optional[bool]
94
- Controls whether normalization of English words and numeric entities (e.g., numbers, dates) is performed. Set to true for better handling of mixed-language text. Default is false.
122
+ Controls whether normalization of English words and numeric entities (e.g., numbers, dates) is performed. Set to true for better handling of mixed-language text.
123
+
124
+ **Model-specific behavior:**
125
+ - **bulbul:v2:** Default is false
126
+ - **bulbul:v3-beta:** Automatically enabled (true) and cannot be disabled
95
127
 
96
128
  model : typing.Optional[TextToSpeechModel]
97
- Specifies the model to use for text-to-speech conversion. Default is bulbul:v2.
129
+ Specifies the model to use for text-to-speech conversion.
130
+
131
+ **Available models:**
132
+ - **bulbul:v2:** Default model with pitch, loudness controls
133
+ - **bulbul:v3-beta:** Newer model with temperature control, improved quality
98
134
 
99
135
  output_audio_codec : typing.Optional[TextToSpeechOutputAudioCodec]
100
136
  Specifies the audio codec for the output audio file. Different codecs offer various compression and quality characteristics.
101
137
 
138
+ temperature : typing.Optional[float]
139
+ Controls the randomness of the output. Lower values make the output more focused and deterministic, while higher values make it more random. The suitable range is between 0.01 and 1.0. Default is 0.6.
140
+
141
+ **Note:** This parameter is only supported for bulbul:v3-beta. It has no effect on bulbul:v2.
142
+
102
143
  request_options : typing.Optional[RequestOptions]
103
144
  Request-specific configuration.
104
145
 
@@ -130,6 +171,7 @@ class TextToSpeechClient:
130
171
  enable_preprocessing=enable_preprocessing,
131
172
  model=model,
132
173
  output_audio_codec=output_audio_codec,
174
+ temperature=temperature,
133
175
  request_options=request_options,
134
176
  )
135
177
  return _response.data
@@ -163,11 +205,22 @@ class AsyncTextToSpeechClient:
163
205
  enable_preprocessing: typing.Optional[bool] = OMIT,
164
206
  model: typing.Optional[TextToSpeechModel] = OMIT,
165
207
  output_audio_codec: typing.Optional[TextToSpeechOutputAudioCodec] = OMIT,
208
+ temperature: typing.Optional[float] = OMIT,
166
209
  request_options: typing.Optional[RequestOptions] = None,
167
210
  ) -> TextToSpeechResponse:
168
211
  """
169
- This is the model to convert text into spoken audio.
170
- The output is a wave file encoded as a base64 string.
212
+ Convert text into spoken audio. The output is a wave file encoded as a base64 string.
213
+
214
+ **Available Models:**
215
+ - **bulbul:v2** (default): Supports pitch, loudness, and pace controls
216
+ - **bulbul:v3-beta**: Newer model with temperature control and improved quality
217
+
218
+ **Important Notes for bulbul:v3-beta:**
219
+ - Pitch and loudness parameters are NOT supported
220
+ - Pace must be between 0.5 and 2.0
221
+ - Preprocessing is automatically enabled
222
+ - Default sample rate is 24000 Hz
223
+ - Temperature parameter available (0.01-1.0, default 0.6)
171
224
 
172
225
  Parameters
173
226
  ----------
@@ -175,9 +228,12 @@ class AsyncTextToSpeechClient:
175
228
  The text(s) to be converted into speech.
176
229
 
177
230
  **Features:**
178
- - Each text should be no longer than 1500 characters
179
231
  - Supports code-mixed text (English and Indic languages)
180
232
 
233
+ **Model-specific limits:**
234
+ - **bulbul:v2:** Max 1500 characters
235
+ - **bulbul:v3-beta:** Max 2500 characters
236
+
181
237
  **Important Note:**
182
238
  - For numbers larger than 4 digits, use commas (e.g., '10,000' instead of '10000')
183
239
  - This ensures proper pronunciation as a whole number
@@ -188,36 +244,63 @@ class AsyncTextToSpeechClient:
188
244
  speaker : typing.Optional[TextToSpeechSpeaker]
189
245
  The speaker voice to be used for the output audio.
190
246
 
191
- **Default:** Anushka
247
+ **Default:** Anushka (for bulbul:v2), Aditya (for bulbul:v3-beta)
192
248
 
193
249
  **Model Compatibility (Speakers compatible with respective model):**
194
250
  - **bulbul:v2:**
195
251
  - Female: Anushka, Manisha, Vidya, Arya
196
252
  - Male: Abhilash, Karun, Hitesh
253
+ - **bulbul:v3-beta:**
254
+ - Aditya, Ritu, Priya, Neha, Rahul, Pooja, Rohan, Simran, Kavya, Amit, Dev, Ishita, Shreya, Ratan, Varun, Manan, Sumit, Roopa, Kabir, Aayan, Shubh, Ashutosh, Advait, Amelia, Sophia
197
255
 
198
256
  **Note:** Speaker selection must match the chosen model version.
199
257
 
200
258
  pitch : typing.Optional[float]
201
259
  Controls the pitch of the audio. Lower values result in a deeper voice, while higher values make it sharper. The suitable range is between -0.75 and 0.75. Default is 0.0.
202
260
 
261
+ **Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
262
+
203
263
  pace : typing.Optional[float]
204
- Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster. The suitable range is between 0.5 and 2.0. Default is 1.0.
264
+ Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster. Default is 1.0.
265
+
266
+ **Model-specific ranges:**
267
+ - **bulbul:v2:** 0.3 to 3.0
268
+ - **bulbul:v3-beta:** 0.5 to 2.0
205
269
 
206
270
  loudness : typing.Optional[float]
207
271
  Controls the loudness of the audio. Lower values result in quieter audio, while higher values make it louder. The suitable range is between 0.3 and 3.0. Default is 1.0.
208
272
 
273
+ **Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
274
+
209
275
  speech_sample_rate : typing.Optional[SpeechSampleRate]
210
- Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz. If not provided, the default is 22050 Hz.
276
+ Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz.
277
+
278
+ **Model-specific defaults:**
279
+ - **bulbul:v2:** Default is 22050 Hz
280
+ - **bulbul:v3-beta:** Default is 24000 Hz
211
281
 
212
282
  enable_preprocessing : typing.Optional[bool]
213
- Controls whether normalization of English words and numeric entities (e.g., numbers, dates) is performed. Set to true for better handling of mixed-language text. Default is false.
283
+ Controls whether normalization of English words and numeric entities (e.g., numbers, dates) is performed. Set to true for better handling of mixed-language text.
284
+
285
+ **Model-specific behavior:**
286
+ - **bulbul:v2:** Default is false
287
+ - **bulbul:v3-beta:** Automatically enabled (true) and cannot be disabled
214
288
 
215
289
  model : typing.Optional[TextToSpeechModel]
216
- Specifies the model to use for text-to-speech conversion. Default is bulbul:v2.
290
+ Specifies the model to use for text-to-speech conversion.
291
+
292
+ **Available models:**
293
+ - **bulbul:v2:** Default model with pitch, loudness controls
294
+ - **bulbul:v3-beta:** Newer model with temperature control, improved quality
217
295
 
218
296
  output_audio_codec : typing.Optional[TextToSpeechOutputAudioCodec]
219
297
  Specifies the audio codec for the output audio file. Different codecs offer various compression and quality characteristics.
220
298
 
299
+ temperature : typing.Optional[float]
300
+ Controls the randomness of the output. Lower values make the output more focused and deterministic, while higher values make it more random. The suitable range is between 0.01 and 1.0. Default is 0.6.
301
+
302
+ **Note:** This parameter is only supported for bulbul:v3-beta. It has no effect on bulbul:v2.
303
+
221
304
  request_options : typing.Optional[RequestOptions]
222
305
  Request-specific configuration.
223
306
 
@@ -257,6 +340,7 @@ class AsyncTextToSpeechClient:
257
340
  enable_preprocessing=enable_preprocessing,
258
341
  model=model,
259
342
  output_audio_codec=output_audio_codec,
343
+ temperature=temperature,
260
344
  request_options=request_options,
261
345
  )
262
346
  return _response.data
@@ -41,11 +41,22 @@ class RawTextToSpeechClient:
41
41
  enable_preprocessing: typing.Optional[bool] = OMIT,
42
42
  model: typing.Optional[TextToSpeechModel] = OMIT,
43
43
  output_audio_codec: typing.Optional[TextToSpeechOutputAudioCodec] = OMIT,
44
+ temperature: typing.Optional[float] = OMIT,
44
45
  request_options: typing.Optional[RequestOptions] = None,
45
46
  ) -> HttpResponse[TextToSpeechResponse]:
46
47
  """
47
- This is the model to convert text into spoken audio.
48
- The output is a wave file encoded as a base64 string.
48
+ Convert text into spoken audio. The output is a wave file encoded as a base64 string.
49
+
50
+ **Available Models:**
51
+ - **bulbul:v2** (default): Supports pitch, loudness, and pace controls
52
+ - **bulbul:v3-beta**: Newer model with temperature control and improved quality
53
+
54
+ **Important Notes for bulbul:v3-beta:**
55
+ - Pitch and loudness parameters are NOT supported
56
+ - Pace must be between 0.5 and 2.0
57
+ - Preprocessing is automatically enabled
58
+ - Default sample rate is 24000 Hz
59
+ - Temperature parameter available (0.01-1.0, default 0.6)
49
60
 
50
61
  Parameters
51
62
  ----------
@@ -53,9 +64,12 @@ class RawTextToSpeechClient:
53
64
  The text(s) to be converted into speech.
54
65
 
55
66
  **Features:**
56
- - Each text should be no longer than 1500 characters
57
67
  - Supports code-mixed text (English and Indic languages)
58
68
 
69
+ **Model-specific limits:**
70
+ - **bulbul:v2:** Max 1500 characters
71
+ - **bulbul:v3-beta:** Max 2500 characters
72
+
59
73
  **Important Note:**
60
74
  - For numbers larger than 4 digits, use commas (e.g., '10,000' instead of '10000')
61
75
  - This ensures proper pronunciation as a whole number
@@ -66,36 +80,63 @@ class RawTextToSpeechClient:
66
80
  speaker : typing.Optional[TextToSpeechSpeaker]
67
81
  The speaker voice to be used for the output audio.
68
82
 
69
- **Default:** Anushka
83
+ **Default:** Anushka (for bulbul:v2), Aditya (for bulbul:v3-beta)
70
84
 
71
85
  **Model Compatibility (Speakers compatible with respective model):**
72
86
  - **bulbul:v2:**
73
87
  - Female: Anushka, Manisha, Vidya, Arya
74
88
  - Male: Abhilash, Karun, Hitesh
89
+ - **bulbul:v3-beta:**
90
+ - Aditya, Ritu, Priya, Neha, Rahul, Pooja, Rohan, Simran, Kavya, Amit, Dev, Ishita, Shreya, Ratan, Varun, Manan, Sumit, Roopa, Kabir, Aayan, Shubh, Ashutosh, Advait, Amelia, Sophia
75
91
 
76
92
  **Note:** Speaker selection must match the chosen model version.
77
93
 
78
94
  pitch : typing.Optional[float]
79
95
  Controls the pitch of the audio. Lower values result in a deeper voice, while higher values make it sharper. The suitable range is between -0.75 and 0.75. Default is 0.0.
80
96
 
97
+ **Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
98
+
81
99
  pace : typing.Optional[float]
82
- Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster. The suitable range is between 0.5 and 2.0. Default is 1.0.
100
+ Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster. Default is 1.0.
101
+
102
+ **Model-specific ranges:**
103
+ - **bulbul:v2:** 0.3 to 3.0
104
+ - **bulbul:v3-beta:** 0.5 to 2.0
83
105
 
84
106
  loudness : typing.Optional[float]
85
107
  Controls the loudness of the audio. Lower values result in quieter audio, while higher values make it louder. The suitable range is between 0.3 and 3.0. Default is 1.0.
86
108
 
109
+ **Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
110
+
87
111
  speech_sample_rate : typing.Optional[SpeechSampleRate]
88
- Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz. If not provided, the default is 22050 Hz.
112
+ Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz.
113
+
114
+ **Model-specific defaults:**
115
+ - **bulbul:v2:** Default is 22050 Hz
116
+ - **bulbul:v3-beta:** Default is 24000 Hz
89
117
 
90
118
  enable_preprocessing : typing.Optional[bool]
91
- Controls whether normalization of English words and numeric entities (e.g., numbers, dates) is performed. Set to true for better handling of mixed-language text. Default is false.
119
+ Controls whether normalization of English words and numeric entities (e.g., numbers, dates) is performed. Set to true for better handling of mixed-language text.
120
+
121
+ **Model-specific behavior:**
122
+ - **bulbul:v2:** Default is false
123
+ - **bulbul:v3-beta:** Automatically enabled (true) and cannot be disabled
92
124
 
93
125
  model : typing.Optional[TextToSpeechModel]
94
- Specifies the model to use for text-to-speech conversion. Default is bulbul:v2.
126
+ Specifies the model to use for text-to-speech conversion.
127
+
128
+ **Available models:**
129
+ - **bulbul:v2:** Default model with pitch, loudness controls
130
+ - **bulbul:v3-beta:** Newer model with temperature control, improved quality
95
131
 
96
132
  output_audio_codec : typing.Optional[TextToSpeechOutputAudioCodec]
97
133
  Specifies the audio codec for the output audio file. Different codecs offer various compression and quality characteristics.
98
134
 
135
+ temperature : typing.Optional[float]
136
+ Controls the randomness of the output. Lower values make the output more focused and deterministic, while higher values make it more random. The suitable range is between 0.01 and 1.0. Default is 0.6.
137
+
138
+ **Note:** This parameter is only supported for bulbul:v3-beta. It has no effect on bulbul:v2.
139
+
99
140
  request_options : typing.Optional[RequestOptions]
100
141
  Request-specific configuration.
101
142
 
@@ -119,6 +160,7 @@ class RawTextToSpeechClient:
119
160
  "enable_preprocessing": enable_preprocessing,
120
161
  "model": model,
121
162
  "output_audio_codec": output_audio_codec,
163
+ "temperature": temperature,
122
164
  },
123
165
  headers={
124
166
  "content-type": "application/json",
@@ -140,9 +182,9 @@ class RawTextToSpeechClient:
140
182
  raise BadRequestError(
141
183
  headers=dict(_response.headers),
142
184
  body=typing.cast(
143
- typing.Any,
185
+ typing.Optional[typing.Any],
144
186
  parse_obj_as(
145
- type_=typing.Any, # type: ignore
187
+ type_=typing.Optional[typing.Any], # type: ignore
146
188
  object_=_response.json(),
147
189
  ),
148
190
  ),
@@ -151,9 +193,9 @@ class RawTextToSpeechClient:
151
193
  raise ForbiddenError(
152
194
  headers=dict(_response.headers),
153
195
  body=typing.cast(
154
- typing.Any,
196
+ typing.Optional[typing.Any],
155
197
  parse_obj_as(
156
- type_=typing.Any, # type: ignore
198
+ type_=typing.Optional[typing.Any], # type: ignore
157
199
  object_=_response.json(),
158
200
  ),
159
201
  ),
@@ -162,9 +204,9 @@ class RawTextToSpeechClient:
162
204
  raise UnprocessableEntityError(
163
205
  headers=dict(_response.headers),
164
206
  body=typing.cast(
165
- typing.Any,
207
+ typing.Optional[typing.Any],
166
208
  parse_obj_as(
167
- type_=typing.Any, # type: ignore
209
+ type_=typing.Optional[typing.Any], # type: ignore
168
210
  object_=_response.json(),
169
211
  ),
170
212
  ),
@@ -173,9 +215,9 @@ class RawTextToSpeechClient:
173
215
  raise TooManyRequestsError(
174
216
  headers=dict(_response.headers),
175
217
  body=typing.cast(
176
- typing.Any,
218
+ typing.Optional[typing.Any],
177
219
  parse_obj_as(
178
- type_=typing.Any, # type: ignore
220
+ type_=typing.Optional[typing.Any], # type: ignore
179
221
  object_=_response.json(),
180
222
  ),
181
223
  ),
@@ -184,9 +226,9 @@ class RawTextToSpeechClient:
184
226
  raise InternalServerError(
185
227
  headers=dict(_response.headers),
186
228
  body=typing.cast(
187
- typing.Any,
229
+ typing.Optional[typing.Any],
188
230
  parse_obj_as(
189
- type_=typing.Any, # type: ignore
231
+ type_=typing.Optional[typing.Any], # type: ignore
190
232
  object_=_response.json(),
191
233
  ),
192
234
  ),
@@ -214,11 +256,22 @@ class AsyncRawTextToSpeechClient:
214
256
  enable_preprocessing: typing.Optional[bool] = OMIT,
215
257
  model: typing.Optional[TextToSpeechModel] = OMIT,
216
258
  output_audio_codec: typing.Optional[TextToSpeechOutputAudioCodec] = OMIT,
259
+ temperature: typing.Optional[float] = OMIT,
217
260
  request_options: typing.Optional[RequestOptions] = None,
218
261
  ) -> AsyncHttpResponse[TextToSpeechResponse]:
219
262
  """
220
- This is the model to convert text into spoken audio.
221
- The output is a wave file encoded as a base64 string.
263
+ Convert text into spoken audio. The output is a wave file encoded as a base64 string.
264
+
265
+ **Available Models:**
266
+ - **bulbul:v2** (default): Supports pitch, loudness, and pace controls
267
+ - **bulbul:v3-beta**: Newer model with temperature control and improved quality
268
+
269
+ **Important Notes for bulbul:v3-beta:**
270
+ - Pitch and loudness parameters are NOT supported
271
+ - Pace must be between 0.5 and 2.0
272
+ - Preprocessing is automatically enabled
273
+ - Default sample rate is 24000 Hz
274
+ - Temperature parameter available (0.01-1.0, default 0.6)
222
275
 
223
276
  Parameters
224
277
  ----------
@@ -226,9 +279,12 @@ class AsyncRawTextToSpeechClient:
226
279
  The text(s) to be converted into speech.
227
280
 
228
281
  **Features:**
229
- - Each text should be no longer than 1500 characters
230
282
  - Supports code-mixed text (English and Indic languages)
231
283
 
284
+ **Model-specific limits:**
285
+ - **bulbul:v2:** Max 1500 characters
286
+ - **bulbul:v3-beta:** Max 2500 characters
287
+
232
288
  **Important Note:**
233
289
  - For numbers larger than 4 digits, use commas (e.g., '10,000' instead of '10000')
234
290
  - This ensures proper pronunciation as a whole number
@@ -239,36 +295,63 @@ class AsyncRawTextToSpeechClient:
239
295
  speaker : typing.Optional[TextToSpeechSpeaker]
240
296
  The speaker voice to be used for the output audio.
241
297
 
242
- **Default:** Anushka
298
+ **Default:** Anushka (for bulbul:v2), Aditya (for bulbul:v3-beta)
243
299
 
244
300
  **Model Compatibility (Speakers compatible with respective model):**
245
301
  - **bulbul:v2:**
246
302
  - Female: Anushka, Manisha, Vidya, Arya
247
303
  - Male: Abhilash, Karun, Hitesh
304
+ - **bulbul:v3-beta:**
305
+ - Aditya, Ritu, Priya, Neha, Rahul, Pooja, Rohan, Simran, Kavya, Amit, Dev, Ishita, Shreya, Ratan, Varun, Manan, Sumit, Roopa, Kabir, Aayan, Shubh, Ashutosh, Advait, Amelia, Sophia
248
306
 
249
307
  **Note:** Speaker selection must match the chosen model version.
250
308
 
251
309
  pitch : typing.Optional[float]
252
310
  Controls the pitch of the audio. Lower values result in a deeper voice, while higher values make it sharper. The suitable range is between -0.75 and 0.75. Default is 0.0.
253
311
 
312
+ **Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
313
+
254
314
  pace : typing.Optional[float]
255
- Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster. The suitable range is between 0.5 and 2.0. Default is 1.0.
315
+ Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster. Default is 1.0.
316
+
317
+ **Model-specific ranges:**
318
+ - **bulbul:v2:** 0.3 to 3.0
319
+ - **bulbul:v3-beta:** 0.5 to 2.0
256
320
 
257
321
  loudness : typing.Optional[float]
258
322
  Controls the loudness of the audio. Lower values result in quieter audio, while higher values make it louder. The suitable range is between 0.3 and 3.0. Default is 1.0.
259
323
 
324
+ **Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
325
+
260
326
  speech_sample_rate : typing.Optional[SpeechSampleRate]
261
- Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz. If not provided, the default is 22050 Hz.
327
+ Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz.
328
+
329
+ **Model-specific defaults:**
330
+ - **bulbul:v2:** Default is 22050 Hz
331
+ - **bulbul:v3-beta:** Default is 24000 Hz
262
332
 
263
333
  enable_preprocessing : typing.Optional[bool]
264
- Controls whether normalization of English words and numeric entities (e.g., numbers, dates) is performed. Set to true for better handling of mixed-language text. Default is false.
334
+ Controls whether normalization of English words and numeric entities (e.g., numbers, dates) is performed. Set to true for better handling of mixed-language text.
335
+
336
+ **Model-specific behavior:**
337
+ - **bulbul:v2:** Default is false
338
+ - **bulbul:v3-beta:** Automatically enabled (true) and cannot be disabled
265
339
 
266
340
  model : typing.Optional[TextToSpeechModel]
267
- Specifies the model to use for text-to-speech conversion. Default is bulbul:v2.
341
+ Specifies the model to use for text-to-speech conversion.
342
+
343
+ **Available models:**
344
+ - **bulbul:v2:** Default model with pitch, loudness controls
345
+ - **bulbul:v3-beta:** Newer model with temperature control, improved quality
268
346
 
269
347
  output_audio_codec : typing.Optional[TextToSpeechOutputAudioCodec]
270
348
  Specifies the audio codec for the output audio file. Different codecs offer various compression and quality characteristics.
271
349
 
350
+ temperature : typing.Optional[float]
351
+ Controls the randomness of the output. Lower values make the output more focused and deterministic, while higher values make it more random. The suitable range is between 0.01 and 1.0. Default is 0.6.
352
+
353
+ **Note:** This parameter is only supported for bulbul:v3-beta. It has no effect on bulbul:v2.
354
+
272
355
  request_options : typing.Optional[RequestOptions]
273
356
  Request-specific configuration.
274
357
 
@@ -292,6 +375,7 @@ class AsyncRawTextToSpeechClient:
292
375
  "enable_preprocessing": enable_preprocessing,
293
376
  "model": model,
294
377
  "output_audio_codec": output_audio_codec,
378
+ "temperature": temperature,
295
379
  },
296
380
  headers={
297
381
  "content-type": "application/json",
@@ -313,9 +397,9 @@ class AsyncRawTextToSpeechClient:
313
397
  raise BadRequestError(
314
398
  headers=dict(_response.headers),
315
399
  body=typing.cast(
316
- typing.Any,
400
+ typing.Optional[typing.Any],
317
401
  parse_obj_as(
318
- type_=typing.Any, # type: ignore
402
+ type_=typing.Optional[typing.Any], # type: ignore
319
403
  object_=_response.json(),
320
404
  ),
321
405
  ),
@@ -324,9 +408,9 @@ class AsyncRawTextToSpeechClient:
324
408
  raise ForbiddenError(
325
409
  headers=dict(_response.headers),
326
410
  body=typing.cast(
327
- typing.Any,
411
+ typing.Optional[typing.Any],
328
412
  parse_obj_as(
329
- type_=typing.Any, # type: ignore
413
+ type_=typing.Optional[typing.Any], # type: ignore
330
414
  object_=_response.json(),
331
415
  ),
332
416
  ),
@@ -335,9 +419,9 @@ class AsyncRawTextToSpeechClient:
335
419
  raise UnprocessableEntityError(
336
420
  headers=dict(_response.headers),
337
421
  body=typing.cast(
338
- typing.Any,
422
+ typing.Optional[typing.Any],
339
423
  parse_obj_as(
340
- type_=typing.Any, # type: ignore
424
+ type_=typing.Optional[typing.Any], # type: ignore
341
425
  object_=_response.json(),
342
426
  ),
343
427
  ),
@@ -346,9 +430,9 @@ class AsyncRawTextToSpeechClient:
346
430
  raise TooManyRequestsError(
347
431
  headers=dict(_response.headers),
348
432
  body=typing.cast(
349
- typing.Any,
433
+ typing.Optional[typing.Any],
350
434
  parse_obj_as(
351
- type_=typing.Any, # type: ignore
435
+ type_=typing.Optional[typing.Any], # type: ignore
352
436
  object_=_response.json(),
353
437
  ),
354
438
  ),
@@ -357,9 +441,9 @@ class AsyncRawTextToSpeechClient:
357
441
  raise InternalServerError(
358
442
  headers=dict(_response.headers),
359
443
  body=typing.cast(
360
- typing.Any,
444
+ typing.Optional[typing.Any],
361
445
  parse_obj_as(
362
- type_=typing.Any, # type: ignore
446
+ type_=typing.Optional[typing.Any], # type: ignore
363
447
  object_=_response.json(),
364
448
  ),
365
449
  ),
@@ -2,33 +2,6 @@
2
2
 
3
3
  # isort: skip_file
4
4
 
5
- import typing
6
- from importlib import import_module
5
+ from .types import TextToSpeechStreamingModel, TextToSpeechStreamingSendCompletionEvent
7
6
 
8
- if typing.TYPE_CHECKING:
9
- from .types import TextToSpeechStreamingSendCompletionEvent
10
- _dynamic_imports: typing.Dict[str, str] = {"TextToSpeechStreamingSendCompletionEvent": ".types"}
11
-
12
-
13
- def __getattr__(attr_name: str) -> typing.Any:
14
- module_name = _dynamic_imports.get(attr_name)
15
- if module_name is None:
16
- raise AttributeError(f"No {attr_name} found in _dynamic_imports for module name -> {__name__}")
17
- try:
18
- module = import_module(module_name, __package__)
19
- if module_name == f".{attr_name}":
20
- return module
21
- else:
22
- return getattr(module, attr_name)
23
- except ImportError as e:
24
- raise ImportError(f"Failed to import {attr_name} from {module_name}: {e}") from e
25
- except AttributeError as e:
26
- raise AttributeError(f"Failed to get {attr_name} from {module_name}: {e}") from e
27
-
28
-
29
- def __dir__():
30
- lazy_attrs = list(_dynamic_imports.keys())
31
- return sorted(lazy_attrs)
32
-
33
-
34
- __all__ = ["TextToSpeechStreamingSendCompletionEvent"]
7
+ __all__ = ["TextToSpeechStreamingModel", "TextToSpeechStreamingSendCompletionEvent"]