sarvamai 0.1.23a3__py3-none-any.whl → 0.1.23a5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sarvamai/__init__.py +203 -405
- sarvamai/chat/raw_client.py +20 -20
- sarvamai/client.py +34 -186
- sarvamai/core/__init__.py +21 -76
- sarvamai/core/client_wrapper.py +3 -19
- sarvamai/core/force_multipart.py +2 -4
- sarvamai/core/http_client.py +97 -217
- sarvamai/core/http_response.py +1 -1
- sarvamai/core/jsonable_encoder.py +0 -8
- sarvamai/core/pydantic_utilities.py +4 -110
- sarvamai/errors/__init__.py +6 -40
- sarvamai/errors/bad_request_error.py +1 -1
- sarvamai/errors/forbidden_error.py +1 -1
- sarvamai/errors/internal_server_error.py +1 -1
- sarvamai/errors/service_unavailable_error.py +1 -1
- sarvamai/errors/too_many_requests_error.py +1 -1
- sarvamai/errors/unprocessable_entity_error.py +1 -1
- sarvamai/requests/__init__.py +62 -150
- sarvamai/requests/configure_connection.py +4 -0
- sarvamai/requests/configure_connection_data.py +40 -11
- sarvamai/requests/error_response_data.py +1 -1
- sarvamai/requests/file_signed_url_details.py +1 -1
- sarvamai/requests/speech_to_text_job_parameters.py +43 -2
- sarvamai/requests/speech_to_text_transcription_data.py +2 -2
- sarvamai/requests/speech_to_text_translate_job_parameters.py +4 -1
- sarvamai/speech_to_text/client.py +95 -10
- sarvamai/speech_to_text/raw_client.py +147 -64
- sarvamai/speech_to_text_job/client.py +60 -15
- sarvamai/speech_to_text_job/raw_client.py +120 -120
- sarvamai/speech_to_text_streaming/__init__.py +10 -38
- sarvamai/speech_to_text_streaming/client.py +90 -8
- sarvamai/speech_to_text_streaming/raw_client.py +90 -8
- sarvamai/speech_to_text_streaming/types/__init__.py +8 -36
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_mode.py +7 -0
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_model.py +5 -0
- sarvamai/speech_to_text_translate_job/raw_client.py +120 -120
- sarvamai/speech_to_text_translate_streaming/__init__.py +5 -36
- sarvamai/speech_to_text_translate_streaming/client.py +8 -2
- sarvamai/speech_to_text_translate_streaming/raw_client.py +8 -2
- sarvamai/speech_to_text_translate_streaming/types/__init__.py +3 -36
- sarvamai/text/raw_client.py +60 -60
- sarvamai/text_to_speech/client.py +100 -16
- sarvamai/text_to_speech/raw_client.py +120 -36
- sarvamai/text_to_speech_streaming/__init__.py +2 -29
- sarvamai/text_to_speech_streaming/client.py +19 -6
- sarvamai/text_to_speech_streaming/raw_client.py +19 -6
- sarvamai/text_to_speech_streaming/types/__init__.py +3 -31
- sarvamai/text_to_speech_streaming/types/text_to_speech_streaming_model.py +5 -0
- sarvamai/types/__init__.py +102 -222
- sarvamai/types/chat_completion_request_message.py +2 -6
- sarvamai/types/configure_connection.py +4 -0
- sarvamai/types/configure_connection_data.py +40 -11
- sarvamai/types/configure_connection_data_model.py +5 -0
- sarvamai/types/configure_connection_data_speaker.py +35 -1
- sarvamai/types/error_response_data.py +1 -1
- sarvamai/types/file_signed_url_details.py +1 -1
- sarvamai/types/mode.py +5 -0
- sarvamai/types/speech_to_text_job_parameters.py +43 -2
- sarvamai/types/speech_to_text_model.py +1 -1
- sarvamai/types/speech_to_text_transcription_data.py +2 -2
- sarvamai/types/speech_to_text_translate_job_parameters.py +4 -1
- sarvamai/types/text_to_speech_model.py +1 -1
- sarvamai/types/text_to_speech_speaker.py +35 -1
- {sarvamai-0.1.23a3.dist-info → sarvamai-0.1.23a5.dist-info}/METADATA +1 -2
- {sarvamai-0.1.23a3.dist-info → sarvamai-0.1.23a5.dist-info}/RECORD +66 -66
- sarvamai/core/http_sse/__init__.py +0 -42
- sarvamai/core/http_sse/_api.py +0 -112
- sarvamai/core/http_sse/_decoders.py +0 -61
- sarvamai/core/http_sse/_exceptions.py +0 -7
- sarvamai/core/http_sse/_models.py +0 -17
- {sarvamai-0.1.23a3.dist-info → sarvamai-0.1.23a5.dist-info}/WHEEL +0 -0
|
@@ -44,11 +44,22 @@ class TextToSpeechClient:
|
|
|
44
44
|
enable_preprocessing: typing.Optional[bool] = OMIT,
|
|
45
45
|
model: typing.Optional[TextToSpeechModel] = OMIT,
|
|
46
46
|
output_audio_codec: typing.Optional[TextToSpeechOutputAudioCodec] = OMIT,
|
|
47
|
+
temperature: typing.Optional[float] = OMIT,
|
|
47
48
|
request_options: typing.Optional[RequestOptions] = None,
|
|
48
49
|
) -> TextToSpeechResponse:
|
|
49
50
|
"""
|
|
50
|
-
|
|
51
|
-
|
|
51
|
+
Convert text into spoken audio. The output is a wave file encoded as a base64 string.
|
|
52
|
+
|
|
53
|
+
**Available Models:**
|
|
54
|
+
- **bulbul:v2** (default): Supports pitch, loudness, and pace controls
|
|
55
|
+
- **bulbul:v3-beta**: Newer model with temperature control and improved quality
|
|
56
|
+
|
|
57
|
+
**Important Notes for bulbul:v3-beta:**
|
|
58
|
+
- Pitch and loudness parameters are NOT supported
|
|
59
|
+
- Pace must be between 0.5 and 2.0
|
|
60
|
+
- Preprocessing is automatically enabled
|
|
61
|
+
- Default sample rate is 24000 Hz
|
|
62
|
+
- Temperature parameter available (0.01-1.0, default 0.6)
|
|
52
63
|
|
|
53
64
|
Parameters
|
|
54
65
|
----------
|
|
@@ -56,9 +67,12 @@ class TextToSpeechClient:
|
|
|
56
67
|
The text(s) to be converted into speech.
|
|
57
68
|
|
|
58
69
|
**Features:**
|
|
59
|
-
- Each text should be no longer than 1500 characters
|
|
60
70
|
- Supports code-mixed text (English and Indic languages)
|
|
61
71
|
|
|
72
|
+
**Model-specific limits:**
|
|
73
|
+
- **bulbul:v2:** Max 1500 characters
|
|
74
|
+
- **bulbul:v3-beta:** Max 2500 characters
|
|
75
|
+
|
|
62
76
|
**Important Note:**
|
|
63
77
|
- For numbers larger than 4 digits, use commas (e.g., '10,000' instead of '10000')
|
|
64
78
|
- This ensures proper pronunciation as a whole number
|
|
@@ -69,36 +83,63 @@ class TextToSpeechClient:
|
|
|
69
83
|
speaker : typing.Optional[TextToSpeechSpeaker]
|
|
70
84
|
The speaker voice to be used for the output audio.
|
|
71
85
|
|
|
72
|
-
**Default:** Anushka
|
|
86
|
+
**Default:** Anushka (for bulbul:v2), Aditya (for bulbul:v3-beta)
|
|
73
87
|
|
|
74
88
|
**Model Compatibility (Speakers compatible with respective model):**
|
|
75
89
|
- **bulbul:v2:**
|
|
76
90
|
- Female: Anushka, Manisha, Vidya, Arya
|
|
77
91
|
- Male: Abhilash, Karun, Hitesh
|
|
92
|
+
- **bulbul:v3-beta:**
|
|
93
|
+
- Aditya, Ritu, Priya, Neha, Rahul, Pooja, Rohan, Simran, Kavya, Amit, Dev, Ishita, Shreya, Ratan, Varun, Manan, Sumit, Roopa, Kabir, Aayan, Shubh, Ashutosh, Advait, Amelia, Sophia
|
|
78
94
|
|
|
79
95
|
**Note:** Speaker selection must match the chosen model version.
|
|
80
96
|
|
|
81
97
|
pitch : typing.Optional[float]
|
|
82
98
|
Controls the pitch of the audio. Lower values result in a deeper voice, while higher values make it sharper. The suitable range is between -0.75 and 0.75. Default is 0.0.
|
|
83
99
|
|
|
100
|
+
**Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
|
|
101
|
+
|
|
84
102
|
pace : typing.Optional[float]
|
|
85
|
-
Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster.
|
|
103
|
+
Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster. Default is 1.0.
|
|
104
|
+
|
|
105
|
+
**Model-specific ranges:**
|
|
106
|
+
- **bulbul:v2:** 0.3 to 3.0
|
|
107
|
+
- **bulbul:v3-beta:** 0.5 to 2.0
|
|
86
108
|
|
|
87
109
|
loudness : typing.Optional[float]
|
|
88
110
|
Controls the loudness of the audio. Lower values result in quieter audio, while higher values make it louder. The suitable range is between 0.3 and 3.0. Default is 1.0.
|
|
89
111
|
|
|
112
|
+
**Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
|
|
113
|
+
|
|
90
114
|
speech_sample_rate : typing.Optional[SpeechSampleRate]
|
|
91
|
-
Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz.
|
|
115
|
+
Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz.
|
|
116
|
+
|
|
117
|
+
**Model-specific defaults:**
|
|
118
|
+
- **bulbul:v2:** Default is 22050 Hz
|
|
119
|
+
- **bulbul:v3-beta:** Default is 24000 Hz
|
|
92
120
|
|
|
93
121
|
enable_preprocessing : typing.Optional[bool]
|
|
94
|
-
|
|
122
|
+
Controls whether normalization of English words and numeric entities (e.g., numbers, dates) is performed. Set to true for better handling of mixed-language text.
|
|
123
|
+
|
|
124
|
+
**Model-specific behavior:**
|
|
125
|
+
- **bulbul:v2:** Default is false
|
|
126
|
+
- **bulbul:v3-beta:** Automatically enabled (true) and cannot be disabled
|
|
95
127
|
|
|
96
128
|
model : typing.Optional[TextToSpeechModel]
|
|
97
|
-
Specifies the model to use for text-to-speech conversion.
|
|
129
|
+
Specifies the model to use for text-to-speech conversion.
|
|
130
|
+
|
|
131
|
+
**Available models:**
|
|
132
|
+
- **bulbul:v2:** Default model with pitch, loudness controls
|
|
133
|
+
- **bulbul:v3-beta:** Newer model with temperature control, improved quality
|
|
98
134
|
|
|
99
135
|
output_audio_codec : typing.Optional[TextToSpeechOutputAudioCodec]
|
|
100
136
|
Specifies the audio codec for the output audio file. Different codecs offer various compression and quality characteristics.
|
|
101
137
|
|
|
138
|
+
temperature : typing.Optional[float]
|
|
139
|
+
Controls the randomness of the output. Lower values make the output more focused and deterministic, while higher values make it more random. The suitable range is between 0.01 and 1.0. Default is 0.6.
|
|
140
|
+
|
|
141
|
+
**Note:** This parameter is only supported for bulbul:v3-beta. It has no effect on bulbul:v2.
|
|
142
|
+
|
|
102
143
|
request_options : typing.Optional[RequestOptions]
|
|
103
144
|
Request-specific configuration.
|
|
104
145
|
|
|
@@ -130,6 +171,7 @@ class TextToSpeechClient:
|
|
|
130
171
|
enable_preprocessing=enable_preprocessing,
|
|
131
172
|
model=model,
|
|
132
173
|
output_audio_codec=output_audio_codec,
|
|
174
|
+
temperature=temperature,
|
|
133
175
|
request_options=request_options,
|
|
134
176
|
)
|
|
135
177
|
return _response.data
|
|
@@ -163,11 +205,22 @@ class AsyncTextToSpeechClient:
|
|
|
163
205
|
enable_preprocessing: typing.Optional[bool] = OMIT,
|
|
164
206
|
model: typing.Optional[TextToSpeechModel] = OMIT,
|
|
165
207
|
output_audio_codec: typing.Optional[TextToSpeechOutputAudioCodec] = OMIT,
|
|
208
|
+
temperature: typing.Optional[float] = OMIT,
|
|
166
209
|
request_options: typing.Optional[RequestOptions] = None,
|
|
167
210
|
) -> TextToSpeechResponse:
|
|
168
211
|
"""
|
|
169
|
-
|
|
170
|
-
|
|
212
|
+
Convert text into spoken audio. The output is a wave file encoded as a base64 string.
|
|
213
|
+
|
|
214
|
+
**Available Models:**
|
|
215
|
+
- **bulbul:v2** (default): Supports pitch, loudness, and pace controls
|
|
216
|
+
- **bulbul:v3-beta**: Newer model with temperature control and improved quality
|
|
217
|
+
|
|
218
|
+
**Important Notes for bulbul:v3-beta:**
|
|
219
|
+
- Pitch and loudness parameters are NOT supported
|
|
220
|
+
- Pace must be between 0.5 and 2.0
|
|
221
|
+
- Preprocessing is automatically enabled
|
|
222
|
+
- Default sample rate is 24000 Hz
|
|
223
|
+
- Temperature parameter available (0.01-1.0, default 0.6)
|
|
171
224
|
|
|
172
225
|
Parameters
|
|
173
226
|
----------
|
|
@@ -175,9 +228,12 @@ class AsyncTextToSpeechClient:
|
|
|
175
228
|
The text(s) to be converted into speech.
|
|
176
229
|
|
|
177
230
|
**Features:**
|
|
178
|
-
- Each text should be no longer than 1500 characters
|
|
179
231
|
- Supports code-mixed text (English and Indic languages)
|
|
180
232
|
|
|
233
|
+
**Model-specific limits:**
|
|
234
|
+
- **bulbul:v2:** Max 1500 characters
|
|
235
|
+
- **bulbul:v3-beta:** Max 2500 characters
|
|
236
|
+
|
|
181
237
|
**Important Note:**
|
|
182
238
|
- For numbers larger than 4 digits, use commas (e.g., '10,000' instead of '10000')
|
|
183
239
|
- This ensures proper pronunciation as a whole number
|
|
@@ -188,36 +244,63 @@ class AsyncTextToSpeechClient:
|
|
|
188
244
|
speaker : typing.Optional[TextToSpeechSpeaker]
|
|
189
245
|
The speaker voice to be used for the output audio.
|
|
190
246
|
|
|
191
|
-
**Default:** Anushka
|
|
247
|
+
**Default:** Anushka (for bulbul:v2), Aditya (for bulbul:v3-beta)
|
|
192
248
|
|
|
193
249
|
**Model Compatibility (Speakers compatible with respective model):**
|
|
194
250
|
- **bulbul:v2:**
|
|
195
251
|
- Female: Anushka, Manisha, Vidya, Arya
|
|
196
252
|
- Male: Abhilash, Karun, Hitesh
|
|
253
|
+
- **bulbul:v3-beta:**
|
|
254
|
+
- Aditya, Ritu, Priya, Neha, Rahul, Pooja, Rohan, Simran, Kavya, Amit, Dev, Ishita, Shreya, Ratan, Varun, Manan, Sumit, Roopa, Kabir, Aayan, Shubh, Ashutosh, Advait, Amelia, Sophia
|
|
197
255
|
|
|
198
256
|
**Note:** Speaker selection must match the chosen model version.
|
|
199
257
|
|
|
200
258
|
pitch : typing.Optional[float]
|
|
201
259
|
Controls the pitch of the audio. Lower values result in a deeper voice, while higher values make it sharper. The suitable range is between -0.75 and 0.75. Default is 0.0.
|
|
202
260
|
|
|
261
|
+
**Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
|
|
262
|
+
|
|
203
263
|
pace : typing.Optional[float]
|
|
204
|
-
Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster.
|
|
264
|
+
Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster. Default is 1.0.
|
|
265
|
+
|
|
266
|
+
**Model-specific ranges:**
|
|
267
|
+
- **bulbul:v2:** 0.3 to 3.0
|
|
268
|
+
- **bulbul:v3-beta:** 0.5 to 2.0
|
|
205
269
|
|
|
206
270
|
loudness : typing.Optional[float]
|
|
207
271
|
Controls the loudness of the audio. Lower values result in quieter audio, while higher values make it louder. The suitable range is between 0.3 and 3.0. Default is 1.0.
|
|
208
272
|
|
|
273
|
+
**Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
|
|
274
|
+
|
|
209
275
|
speech_sample_rate : typing.Optional[SpeechSampleRate]
|
|
210
|
-
Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz.
|
|
276
|
+
Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz.
|
|
277
|
+
|
|
278
|
+
**Model-specific defaults:**
|
|
279
|
+
- **bulbul:v2:** Default is 22050 Hz
|
|
280
|
+
- **bulbul:v3-beta:** Default is 24000 Hz
|
|
211
281
|
|
|
212
282
|
enable_preprocessing : typing.Optional[bool]
|
|
213
|
-
|
|
283
|
+
Controls whether normalization of English words and numeric entities (e.g., numbers, dates) is performed. Set to true for better handling of mixed-language text.
|
|
284
|
+
|
|
285
|
+
**Model-specific behavior:**
|
|
286
|
+
- **bulbul:v2:** Default is false
|
|
287
|
+
- **bulbul:v3-beta:** Automatically enabled (true) and cannot be disabled
|
|
214
288
|
|
|
215
289
|
model : typing.Optional[TextToSpeechModel]
|
|
216
|
-
Specifies the model to use for text-to-speech conversion.
|
|
290
|
+
Specifies the model to use for text-to-speech conversion.
|
|
291
|
+
|
|
292
|
+
**Available models:**
|
|
293
|
+
- **bulbul:v2:** Default model with pitch, loudness controls
|
|
294
|
+
- **bulbul:v3-beta:** Newer model with temperature control, improved quality
|
|
217
295
|
|
|
218
296
|
output_audio_codec : typing.Optional[TextToSpeechOutputAudioCodec]
|
|
219
297
|
Specifies the audio codec for the output audio file. Different codecs offer various compression and quality characteristics.
|
|
220
298
|
|
|
299
|
+
temperature : typing.Optional[float]
|
|
300
|
+
Controls the randomness of the output. Lower values make the output more focused and deterministic, while higher values make it more random. The suitable range is between 0.01 and 1.0. Default is 0.6.
|
|
301
|
+
|
|
302
|
+
**Note:** This parameter is only supported for bulbul:v3-beta. It has no effect on bulbul:v2.
|
|
303
|
+
|
|
221
304
|
request_options : typing.Optional[RequestOptions]
|
|
222
305
|
Request-specific configuration.
|
|
223
306
|
|
|
@@ -257,6 +340,7 @@ class AsyncTextToSpeechClient:
|
|
|
257
340
|
enable_preprocessing=enable_preprocessing,
|
|
258
341
|
model=model,
|
|
259
342
|
output_audio_codec=output_audio_codec,
|
|
343
|
+
temperature=temperature,
|
|
260
344
|
request_options=request_options,
|
|
261
345
|
)
|
|
262
346
|
return _response.data
|
|
@@ -41,11 +41,22 @@ class RawTextToSpeechClient:
|
|
|
41
41
|
enable_preprocessing: typing.Optional[bool] = OMIT,
|
|
42
42
|
model: typing.Optional[TextToSpeechModel] = OMIT,
|
|
43
43
|
output_audio_codec: typing.Optional[TextToSpeechOutputAudioCodec] = OMIT,
|
|
44
|
+
temperature: typing.Optional[float] = OMIT,
|
|
44
45
|
request_options: typing.Optional[RequestOptions] = None,
|
|
45
46
|
) -> HttpResponse[TextToSpeechResponse]:
|
|
46
47
|
"""
|
|
47
|
-
|
|
48
|
-
|
|
48
|
+
Convert text into spoken audio. The output is a wave file encoded as a base64 string.
|
|
49
|
+
|
|
50
|
+
**Available Models:**
|
|
51
|
+
- **bulbul:v2** (default): Supports pitch, loudness, and pace controls
|
|
52
|
+
- **bulbul:v3-beta**: Newer model with temperature control and improved quality
|
|
53
|
+
|
|
54
|
+
**Important Notes for bulbul:v3-beta:**
|
|
55
|
+
- Pitch and loudness parameters are NOT supported
|
|
56
|
+
- Pace must be between 0.5 and 2.0
|
|
57
|
+
- Preprocessing is automatically enabled
|
|
58
|
+
- Default sample rate is 24000 Hz
|
|
59
|
+
- Temperature parameter available (0.01-1.0, default 0.6)
|
|
49
60
|
|
|
50
61
|
Parameters
|
|
51
62
|
----------
|
|
@@ -53,9 +64,12 @@ class RawTextToSpeechClient:
|
|
|
53
64
|
The text(s) to be converted into speech.
|
|
54
65
|
|
|
55
66
|
**Features:**
|
|
56
|
-
- Each text should be no longer than 1500 characters
|
|
57
67
|
- Supports code-mixed text (English and Indic languages)
|
|
58
68
|
|
|
69
|
+
**Model-specific limits:**
|
|
70
|
+
- **bulbul:v2:** Max 1500 characters
|
|
71
|
+
- **bulbul:v3-beta:** Max 2500 characters
|
|
72
|
+
|
|
59
73
|
**Important Note:**
|
|
60
74
|
- For numbers larger than 4 digits, use commas (e.g., '10,000' instead of '10000')
|
|
61
75
|
- This ensures proper pronunciation as a whole number
|
|
@@ -66,36 +80,63 @@ class RawTextToSpeechClient:
|
|
|
66
80
|
speaker : typing.Optional[TextToSpeechSpeaker]
|
|
67
81
|
The speaker voice to be used for the output audio.
|
|
68
82
|
|
|
69
|
-
**Default:** Anushka
|
|
83
|
+
**Default:** Anushka (for bulbul:v2), Aditya (for bulbul:v3-beta)
|
|
70
84
|
|
|
71
85
|
**Model Compatibility (Speakers compatible with respective model):**
|
|
72
86
|
- **bulbul:v2:**
|
|
73
87
|
- Female: Anushka, Manisha, Vidya, Arya
|
|
74
88
|
- Male: Abhilash, Karun, Hitesh
|
|
89
|
+
- **bulbul:v3-beta:**
|
|
90
|
+
- Aditya, Ritu, Priya, Neha, Rahul, Pooja, Rohan, Simran, Kavya, Amit, Dev, Ishita, Shreya, Ratan, Varun, Manan, Sumit, Roopa, Kabir, Aayan, Shubh, Ashutosh, Advait, Amelia, Sophia
|
|
75
91
|
|
|
76
92
|
**Note:** Speaker selection must match the chosen model version.
|
|
77
93
|
|
|
78
94
|
pitch : typing.Optional[float]
|
|
79
95
|
Controls the pitch of the audio. Lower values result in a deeper voice, while higher values make it sharper. The suitable range is between -0.75 and 0.75. Default is 0.0.
|
|
80
96
|
|
|
97
|
+
**Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
|
|
98
|
+
|
|
81
99
|
pace : typing.Optional[float]
|
|
82
|
-
Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster.
|
|
100
|
+
Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster. Default is 1.0.
|
|
101
|
+
|
|
102
|
+
**Model-specific ranges:**
|
|
103
|
+
- **bulbul:v2:** 0.3 to 3.0
|
|
104
|
+
- **bulbul:v3-beta:** 0.5 to 2.0
|
|
83
105
|
|
|
84
106
|
loudness : typing.Optional[float]
|
|
85
107
|
Controls the loudness of the audio. Lower values result in quieter audio, while higher values make it louder. The suitable range is between 0.3 and 3.0. Default is 1.0.
|
|
86
108
|
|
|
109
|
+
**Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
|
|
110
|
+
|
|
87
111
|
speech_sample_rate : typing.Optional[SpeechSampleRate]
|
|
88
|
-
Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz.
|
|
112
|
+
Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz.
|
|
113
|
+
|
|
114
|
+
**Model-specific defaults:**
|
|
115
|
+
- **bulbul:v2:** Default is 22050 Hz
|
|
116
|
+
- **bulbul:v3-beta:** Default is 24000 Hz
|
|
89
117
|
|
|
90
118
|
enable_preprocessing : typing.Optional[bool]
|
|
91
|
-
|
|
119
|
+
Controls whether normalization of English words and numeric entities (e.g., numbers, dates) is performed. Set to true for better handling of mixed-language text.
|
|
120
|
+
|
|
121
|
+
**Model-specific behavior:**
|
|
122
|
+
- **bulbul:v2:** Default is false
|
|
123
|
+
- **bulbul:v3-beta:** Automatically enabled (true) and cannot be disabled
|
|
92
124
|
|
|
93
125
|
model : typing.Optional[TextToSpeechModel]
|
|
94
|
-
Specifies the model to use for text-to-speech conversion.
|
|
126
|
+
Specifies the model to use for text-to-speech conversion.
|
|
127
|
+
|
|
128
|
+
**Available models:**
|
|
129
|
+
- **bulbul:v2:** Default model with pitch, loudness controls
|
|
130
|
+
- **bulbul:v3-beta:** Newer model with temperature control, improved quality
|
|
95
131
|
|
|
96
132
|
output_audio_codec : typing.Optional[TextToSpeechOutputAudioCodec]
|
|
97
133
|
Specifies the audio codec for the output audio file. Different codecs offer various compression and quality characteristics.
|
|
98
134
|
|
|
135
|
+
temperature : typing.Optional[float]
|
|
136
|
+
Controls the randomness of the output. Lower values make the output more focused and deterministic, while higher values make it more random. The suitable range is between 0.01 and 1.0. Default is 0.6.
|
|
137
|
+
|
|
138
|
+
**Note:** This parameter is only supported for bulbul:v3-beta. It has no effect on bulbul:v2.
|
|
139
|
+
|
|
99
140
|
request_options : typing.Optional[RequestOptions]
|
|
100
141
|
Request-specific configuration.
|
|
101
142
|
|
|
@@ -119,6 +160,7 @@ class RawTextToSpeechClient:
|
|
|
119
160
|
"enable_preprocessing": enable_preprocessing,
|
|
120
161
|
"model": model,
|
|
121
162
|
"output_audio_codec": output_audio_codec,
|
|
163
|
+
"temperature": temperature,
|
|
122
164
|
},
|
|
123
165
|
headers={
|
|
124
166
|
"content-type": "application/json",
|
|
@@ -140,9 +182,9 @@ class RawTextToSpeechClient:
|
|
|
140
182
|
raise BadRequestError(
|
|
141
183
|
headers=dict(_response.headers),
|
|
142
184
|
body=typing.cast(
|
|
143
|
-
typing.Any,
|
|
185
|
+
typing.Optional[typing.Any],
|
|
144
186
|
parse_obj_as(
|
|
145
|
-
type_=typing.Any, # type: ignore
|
|
187
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
146
188
|
object_=_response.json(),
|
|
147
189
|
),
|
|
148
190
|
),
|
|
@@ -151,9 +193,9 @@ class RawTextToSpeechClient:
|
|
|
151
193
|
raise ForbiddenError(
|
|
152
194
|
headers=dict(_response.headers),
|
|
153
195
|
body=typing.cast(
|
|
154
|
-
typing.Any,
|
|
196
|
+
typing.Optional[typing.Any],
|
|
155
197
|
parse_obj_as(
|
|
156
|
-
type_=typing.Any, # type: ignore
|
|
198
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
157
199
|
object_=_response.json(),
|
|
158
200
|
),
|
|
159
201
|
),
|
|
@@ -162,9 +204,9 @@ class RawTextToSpeechClient:
|
|
|
162
204
|
raise UnprocessableEntityError(
|
|
163
205
|
headers=dict(_response.headers),
|
|
164
206
|
body=typing.cast(
|
|
165
|
-
typing.Any,
|
|
207
|
+
typing.Optional[typing.Any],
|
|
166
208
|
parse_obj_as(
|
|
167
|
-
type_=typing.Any, # type: ignore
|
|
209
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
168
210
|
object_=_response.json(),
|
|
169
211
|
),
|
|
170
212
|
),
|
|
@@ -173,9 +215,9 @@ class RawTextToSpeechClient:
|
|
|
173
215
|
raise TooManyRequestsError(
|
|
174
216
|
headers=dict(_response.headers),
|
|
175
217
|
body=typing.cast(
|
|
176
|
-
typing.Any,
|
|
218
|
+
typing.Optional[typing.Any],
|
|
177
219
|
parse_obj_as(
|
|
178
|
-
type_=typing.Any, # type: ignore
|
|
220
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
179
221
|
object_=_response.json(),
|
|
180
222
|
),
|
|
181
223
|
),
|
|
@@ -184,9 +226,9 @@ class RawTextToSpeechClient:
|
|
|
184
226
|
raise InternalServerError(
|
|
185
227
|
headers=dict(_response.headers),
|
|
186
228
|
body=typing.cast(
|
|
187
|
-
typing.Any,
|
|
229
|
+
typing.Optional[typing.Any],
|
|
188
230
|
parse_obj_as(
|
|
189
|
-
type_=typing.Any, # type: ignore
|
|
231
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
190
232
|
object_=_response.json(),
|
|
191
233
|
),
|
|
192
234
|
),
|
|
@@ -214,11 +256,22 @@ class AsyncRawTextToSpeechClient:
|
|
|
214
256
|
enable_preprocessing: typing.Optional[bool] = OMIT,
|
|
215
257
|
model: typing.Optional[TextToSpeechModel] = OMIT,
|
|
216
258
|
output_audio_codec: typing.Optional[TextToSpeechOutputAudioCodec] = OMIT,
|
|
259
|
+
temperature: typing.Optional[float] = OMIT,
|
|
217
260
|
request_options: typing.Optional[RequestOptions] = None,
|
|
218
261
|
) -> AsyncHttpResponse[TextToSpeechResponse]:
|
|
219
262
|
"""
|
|
220
|
-
|
|
221
|
-
|
|
263
|
+
Convert text into spoken audio. The output is a wave file encoded as a base64 string.
|
|
264
|
+
|
|
265
|
+
**Available Models:**
|
|
266
|
+
- **bulbul:v2** (default): Supports pitch, loudness, and pace controls
|
|
267
|
+
- **bulbul:v3-beta**: Newer model with temperature control and improved quality
|
|
268
|
+
|
|
269
|
+
**Important Notes for bulbul:v3-beta:**
|
|
270
|
+
- Pitch and loudness parameters are NOT supported
|
|
271
|
+
- Pace must be between 0.5 and 2.0
|
|
272
|
+
- Preprocessing is automatically enabled
|
|
273
|
+
- Default sample rate is 24000 Hz
|
|
274
|
+
- Temperature parameter available (0.01-1.0, default 0.6)
|
|
222
275
|
|
|
223
276
|
Parameters
|
|
224
277
|
----------
|
|
@@ -226,9 +279,12 @@ class AsyncRawTextToSpeechClient:
|
|
|
226
279
|
The text(s) to be converted into speech.
|
|
227
280
|
|
|
228
281
|
**Features:**
|
|
229
|
-
- Each text should be no longer than 1500 characters
|
|
230
282
|
- Supports code-mixed text (English and Indic languages)
|
|
231
283
|
|
|
284
|
+
**Model-specific limits:**
|
|
285
|
+
- **bulbul:v2:** Max 1500 characters
|
|
286
|
+
- **bulbul:v3-beta:** Max 2500 characters
|
|
287
|
+
|
|
232
288
|
**Important Note:**
|
|
233
289
|
- For numbers larger than 4 digits, use commas (e.g., '10,000' instead of '10000')
|
|
234
290
|
- This ensures proper pronunciation as a whole number
|
|
@@ -239,36 +295,63 @@ class AsyncRawTextToSpeechClient:
|
|
|
239
295
|
speaker : typing.Optional[TextToSpeechSpeaker]
|
|
240
296
|
The speaker voice to be used for the output audio.
|
|
241
297
|
|
|
242
|
-
**Default:** Anushka
|
|
298
|
+
**Default:** Anushka (for bulbul:v2), Aditya (for bulbul:v3-beta)
|
|
243
299
|
|
|
244
300
|
**Model Compatibility (Speakers compatible with respective model):**
|
|
245
301
|
- **bulbul:v2:**
|
|
246
302
|
- Female: Anushka, Manisha, Vidya, Arya
|
|
247
303
|
- Male: Abhilash, Karun, Hitesh
|
|
304
|
+
- **bulbul:v3-beta:**
|
|
305
|
+
- Aditya, Ritu, Priya, Neha, Rahul, Pooja, Rohan, Simran, Kavya, Amit, Dev, Ishita, Shreya, Ratan, Varun, Manan, Sumit, Roopa, Kabir, Aayan, Shubh, Ashutosh, Advait, Amelia, Sophia
|
|
248
306
|
|
|
249
307
|
**Note:** Speaker selection must match the chosen model version.
|
|
250
308
|
|
|
251
309
|
pitch : typing.Optional[float]
|
|
252
310
|
Controls the pitch of the audio. Lower values result in a deeper voice, while higher values make it sharper. The suitable range is between -0.75 and 0.75. Default is 0.0.
|
|
253
311
|
|
|
312
|
+
**Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
|
|
313
|
+
|
|
254
314
|
pace : typing.Optional[float]
|
|
255
|
-
Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster.
|
|
315
|
+
Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster. Default is 1.0.
|
|
316
|
+
|
|
317
|
+
**Model-specific ranges:**
|
|
318
|
+
- **bulbul:v2:** 0.3 to 3.0
|
|
319
|
+
- **bulbul:v3-beta:** 0.5 to 2.0
|
|
256
320
|
|
|
257
321
|
loudness : typing.Optional[float]
|
|
258
322
|
Controls the loudness of the audio. Lower values result in quieter audio, while higher values make it louder. The suitable range is between 0.3 and 3.0. Default is 1.0.
|
|
259
323
|
|
|
324
|
+
**Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
|
|
325
|
+
|
|
260
326
|
speech_sample_rate : typing.Optional[SpeechSampleRate]
|
|
261
|
-
Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz.
|
|
327
|
+
Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz.
|
|
328
|
+
|
|
329
|
+
**Model-specific defaults:**
|
|
330
|
+
- **bulbul:v2:** Default is 22050 Hz
|
|
331
|
+
- **bulbul:v3-beta:** Default is 24000 Hz
|
|
262
332
|
|
|
263
333
|
enable_preprocessing : typing.Optional[bool]
|
|
264
|
-
|
|
334
|
+
Controls whether normalization of English words and numeric entities (e.g., numbers, dates) is performed. Set to true for better handling of mixed-language text.
|
|
335
|
+
|
|
336
|
+
**Model-specific behavior:**
|
|
337
|
+
- **bulbul:v2:** Default is false
|
|
338
|
+
- **bulbul:v3-beta:** Automatically enabled (true) and cannot be disabled
|
|
265
339
|
|
|
266
340
|
model : typing.Optional[TextToSpeechModel]
|
|
267
|
-
Specifies the model to use for text-to-speech conversion.
|
|
341
|
+
Specifies the model to use for text-to-speech conversion.
|
|
342
|
+
|
|
343
|
+
**Available models:**
|
|
344
|
+
- **bulbul:v2:** Default model with pitch, loudness controls
|
|
345
|
+
- **bulbul:v3-beta:** Newer model with temperature control, improved quality
|
|
268
346
|
|
|
269
347
|
output_audio_codec : typing.Optional[TextToSpeechOutputAudioCodec]
|
|
270
348
|
Specifies the audio codec for the output audio file. Different codecs offer various compression and quality characteristics.
|
|
271
349
|
|
|
350
|
+
temperature : typing.Optional[float]
|
|
351
|
+
Controls the randomness of the output. Lower values make the output more focused and deterministic, while higher values make it more random. The suitable range is between 0.01 and 1.0. Default is 0.6.
|
|
352
|
+
|
|
353
|
+
**Note:** This parameter is only supported for bulbul:v3-beta. It has no effect on bulbul:v2.
|
|
354
|
+
|
|
272
355
|
request_options : typing.Optional[RequestOptions]
|
|
273
356
|
Request-specific configuration.
|
|
274
357
|
|
|
@@ -292,6 +375,7 @@ class AsyncRawTextToSpeechClient:
|
|
|
292
375
|
"enable_preprocessing": enable_preprocessing,
|
|
293
376
|
"model": model,
|
|
294
377
|
"output_audio_codec": output_audio_codec,
|
|
378
|
+
"temperature": temperature,
|
|
295
379
|
},
|
|
296
380
|
headers={
|
|
297
381
|
"content-type": "application/json",
|
|
@@ -313,9 +397,9 @@ class AsyncRawTextToSpeechClient:
|
|
|
313
397
|
raise BadRequestError(
|
|
314
398
|
headers=dict(_response.headers),
|
|
315
399
|
body=typing.cast(
|
|
316
|
-
typing.Any,
|
|
400
|
+
typing.Optional[typing.Any],
|
|
317
401
|
parse_obj_as(
|
|
318
|
-
type_=typing.Any, # type: ignore
|
|
402
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
319
403
|
object_=_response.json(),
|
|
320
404
|
),
|
|
321
405
|
),
|
|
@@ -324,9 +408,9 @@ class AsyncRawTextToSpeechClient:
|
|
|
324
408
|
raise ForbiddenError(
|
|
325
409
|
headers=dict(_response.headers),
|
|
326
410
|
body=typing.cast(
|
|
327
|
-
typing.Any,
|
|
411
|
+
typing.Optional[typing.Any],
|
|
328
412
|
parse_obj_as(
|
|
329
|
-
type_=typing.Any, # type: ignore
|
|
413
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
330
414
|
object_=_response.json(),
|
|
331
415
|
),
|
|
332
416
|
),
|
|
@@ -335,9 +419,9 @@ class AsyncRawTextToSpeechClient:
|
|
|
335
419
|
raise UnprocessableEntityError(
|
|
336
420
|
headers=dict(_response.headers),
|
|
337
421
|
body=typing.cast(
|
|
338
|
-
typing.Any,
|
|
422
|
+
typing.Optional[typing.Any],
|
|
339
423
|
parse_obj_as(
|
|
340
|
-
type_=typing.Any, # type: ignore
|
|
424
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
341
425
|
object_=_response.json(),
|
|
342
426
|
),
|
|
343
427
|
),
|
|
@@ -346,9 +430,9 @@ class AsyncRawTextToSpeechClient:
|
|
|
346
430
|
raise TooManyRequestsError(
|
|
347
431
|
headers=dict(_response.headers),
|
|
348
432
|
body=typing.cast(
|
|
349
|
-
typing.Any,
|
|
433
|
+
typing.Optional[typing.Any],
|
|
350
434
|
parse_obj_as(
|
|
351
|
-
type_=typing.Any, # type: ignore
|
|
435
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
352
436
|
object_=_response.json(),
|
|
353
437
|
),
|
|
354
438
|
),
|
|
@@ -357,9 +441,9 @@ class AsyncRawTextToSpeechClient:
|
|
|
357
441
|
raise InternalServerError(
|
|
358
442
|
headers=dict(_response.headers),
|
|
359
443
|
body=typing.cast(
|
|
360
|
-
typing.Any,
|
|
444
|
+
typing.Optional[typing.Any],
|
|
361
445
|
parse_obj_as(
|
|
362
|
-
type_=typing.Any, # type: ignore
|
|
446
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
363
447
|
object_=_response.json(),
|
|
364
448
|
),
|
|
365
449
|
),
|
|
@@ -2,33 +2,6 @@
|
|
|
2
2
|
|
|
3
3
|
# isort: skip_file
|
|
4
4
|
|
|
5
|
-
import typing
|
|
6
|
-
from importlib import import_module
|
|
5
|
+
from .types import TextToSpeechStreamingModel, TextToSpeechStreamingSendCompletionEvent
|
|
7
6
|
|
|
8
|
-
|
|
9
|
-
from .types import TextToSpeechStreamingSendCompletionEvent
|
|
10
|
-
_dynamic_imports: typing.Dict[str, str] = {"TextToSpeechStreamingSendCompletionEvent": ".types"}
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
def __getattr__(attr_name: str) -> typing.Any:
|
|
14
|
-
module_name = _dynamic_imports.get(attr_name)
|
|
15
|
-
if module_name is None:
|
|
16
|
-
raise AttributeError(f"No {attr_name} found in _dynamic_imports for module name -> {__name__}")
|
|
17
|
-
try:
|
|
18
|
-
module = import_module(module_name, __package__)
|
|
19
|
-
if module_name == f".{attr_name}":
|
|
20
|
-
return module
|
|
21
|
-
else:
|
|
22
|
-
return getattr(module, attr_name)
|
|
23
|
-
except ImportError as e:
|
|
24
|
-
raise ImportError(f"Failed to import {attr_name} from {module_name}: {e}") from e
|
|
25
|
-
except AttributeError as e:
|
|
26
|
-
raise AttributeError(f"Failed to get {attr_name} from {module_name}: {e}") from e
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
def __dir__():
|
|
30
|
-
lazy_attrs = list(_dynamic_imports.keys())
|
|
31
|
-
return sorted(lazy_attrs)
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
__all__ = ["TextToSpeechStreamingSendCompletionEvent"]
|
|
7
|
+
__all__ = ["TextToSpeechStreamingModel", "TextToSpeechStreamingSendCompletionEvent"]
|