sarvamai 0.1.22a4__py3-none-any.whl → 0.1.22a7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sarvamai/__init__.py +62 -3
- sarvamai/client.py +3 -0
- sarvamai/core/client_wrapper.py +2 -2
- sarvamai/doc_digitization_job/__init__.py +4 -0
- sarvamai/doc_digitization_job/client.py +776 -0
- sarvamai/doc_digitization_job/job.py +496 -0
- sarvamai/doc_digitization_job/raw_client.py +1176 -0
- sarvamai/requests/__init__.py +20 -0
- sarvamai/requests/audio_data.py +0 -6
- sarvamai/requests/configure_connection.py +4 -0
- sarvamai/requests/configure_connection_data.py +40 -11
- sarvamai/requests/doc_digitization_create_job_response.py +25 -0
- sarvamai/requests/doc_digitization_download_files_response.py +37 -0
- sarvamai/requests/doc_digitization_error_details.py +21 -0
- sarvamai/requests/doc_digitization_error_message.py +11 -0
- sarvamai/requests/doc_digitization_job_detail.py +64 -0
- sarvamai/requests/doc_digitization_job_parameters.py +21 -0
- sarvamai/requests/doc_digitization_job_status_response.py +65 -0
- sarvamai/requests/doc_digitization_page_error.py +24 -0
- sarvamai/requests/doc_digitization_upload_files_response.py +34 -0
- sarvamai/requests/doc_digitization_webhook_callback.py +19 -0
- sarvamai/requests/speech_to_text_job_parameters.py +43 -2
- sarvamai/requests/speech_to_text_translate_job_parameters.py +4 -1
- sarvamai/speech_to_text/client.py +95 -10
- sarvamai/speech_to_text/raw_client.py +95 -10
- sarvamai/speech_to_text_job/client.py +60 -15
- sarvamai/speech_to_text_streaming/__init__.py +4 -0
- sarvamai/speech_to_text_streaming/client.py +102 -18
- sarvamai/speech_to_text_streaming/raw_client.py +102 -18
- sarvamai/speech_to_text_streaming/types/__init__.py +4 -0
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_input_audio_codec.py +1 -27
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_mode.py +7 -0
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_model.py +5 -0
- sarvamai/speech_to_text_translate_streaming/client.py +20 -12
- sarvamai/speech_to_text_translate_streaming/raw_client.py +20 -12
- sarvamai/speech_to_text_translate_streaming/types/speech_to_text_translate_streaming_input_audio_codec.py +1 -27
- sarvamai/text/client.py +0 -12
- sarvamai/text/raw_client.py +0 -12
- sarvamai/text_to_speech/client.py +116 -14
- sarvamai/text_to_speech/raw_client.py +116 -14
- sarvamai/text_to_speech_streaming/__init__.py +2 -2
- sarvamai/text_to_speech_streaming/client.py +19 -6
- sarvamai/text_to_speech_streaming/raw_client.py +19 -6
- sarvamai/text_to_speech_streaming/types/__init__.py +2 -1
- sarvamai/text_to_speech_streaming/types/text_to_speech_streaming_model.py +5 -0
- sarvamai/types/__init__.py +34 -2
- sarvamai/types/audio_data.py +0 -6
- sarvamai/types/configure_connection.py +4 -0
- sarvamai/types/configure_connection_data.py +40 -11
- sarvamai/types/configure_connection_data_model.py +5 -0
- sarvamai/types/configure_connection_data_speaker.py +35 -1
- sarvamai/types/doc_digitization_create_job_response.py +37 -0
- sarvamai/types/doc_digitization_download_files_response.py +47 -0
- sarvamai/types/doc_digitization_error_code.py +15 -0
- sarvamai/types/doc_digitization_error_details.py +33 -0
- sarvamai/types/doc_digitization_error_message.py +23 -0
- sarvamai/types/doc_digitization_job_detail.py +74 -0
- sarvamai/types/doc_digitization_job_detail_state.py +7 -0
- sarvamai/types/doc_digitization_job_parameters.py +33 -0
- sarvamai/types/doc_digitization_job_state.py +7 -0
- sarvamai/types/doc_digitization_job_status_response.py +75 -0
- sarvamai/types/doc_digitization_output_format.py +5 -0
- sarvamai/types/doc_digitization_page_error.py +36 -0
- sarvamai/types/doc_digitization_supported_language.py +32 -0
- sarvamai/types/doc_digitization_upload_files_response.py +44 -0
- sarvamai/types/doc_digitization_webhook_callback.py +31 -0
- sarvamai/types/mode.py +5 -0
- sarvamai/types/speech_to_text_job_parameters.py +43 -2
- sarvamai/types/speech_to_text_model.py +1 -1
- sarvamai/types/speech_to_text_translate_job_parameters.py +4 -1
- sarvamai/types/text_to_speech_model.py +1 -1
- sarvamai/types/text_to_speech_speaker.py +35 -1
- {sarvamai-0.1.22a4.dist-info → sarvamai-0.1.22a7.dist-info}/METADATA +1 -1
- {sarvamai-0.1.22a4.dist-info → sarvamai-0.1.22a7.dist-info}/RECORD +75 -42
- sarvamai/types/audio_data_input_audio_codec.py +0 -33
- {sarvamai-0.1.22a4.dist-info → sarvamai-0.1.22a7.dist-info}/WHEEL +0 -0
|
@@ -41,15 +41,38 @@ class RawTextToSpeechClient:
|
|
|
41
41
|
enable_preprocessing: typing.Optional[bool] = OMIT,
|
|
42
42
|
model: typing.Optional[TextToSpeechModel] = OMIT,
|
|
43
43
|
output_audio_codec: typing.Optional[TextToSpeechOutputAudioCodec] = OMIT,
|
|
44
|
+
temperature: typing.Optional[float] = OMIT,
|
|
44
45
|
request_options: typing.Optional[RequestOptions] = None,
|
|
45
46
|
) -> HttpResponse[TextToSpeechResponse]:
|
|
46
47
|
"""
|
|
47
|
-
|
|
48
|
-
|
|
48
|
+
Convert text into spoken audio. The output is a wave file encoded as a base64 string.
|
|
49
|
+
|
|
50
|
+
**Available Models:**
|
|
51
|
+
- **bulbul:v2** (default): Supports pitch, loudness, and pace controls
|
|
52
|
+
- **bulbul:v3-beta**: Newer model with temperature control and improved quality
|
|
53
|
+
|
|
54
|
+
**Important Notes for bulbul:v3-beta:**
|
|
55
|
+
- Pitch and loudness parameters are NOT supported
|
|
56
|
+
- Pace must be between 0.5 and 2.0
|
|
57
|
+
- Preprocessing is automatically enabled
|
|
58
|
+
- Default sample rate is 24000 Hz
|
|
59
|
+
- Temperature parameter available (0.01-1.0, default 0.6)
|
|
49
60
|
|
|
50
61
|
Parameters
|
|
51
62
|
----------
|
|
52
63
|
text : str
|
|
64
|
+
The text(s) to be converted into speech.
|
|
65
|
+
|
|
66
|
+
**Features:**
|
|
67
|
+
- Supports code-mixed text (English and Indic languages)
|
|
68
|
+
|
|
69
|
+
**Model-specific limits:**
|
|
70
|
+
- **bulbul:v2:** Max 1500 characters
|
|
71
|
+
- **bulbul:v3-beta:** Max 2500 characters
|
|
72
|
+
|
|
73
|
+
**Important Note:**
|
|
74
|
+
- For numbers larger than 4 digits, use commas (e.g., '10,000' instead of '10000')
|
|
75
|
+
- This ensures proper pronunciation as a whole number
|
|
53
76
|
|
|
54
77
|
target_language_code : TextToSpeechLanguage
|
|
55
78
|
The language of the text in BCP-47 format
|
|
@@ -57,36 +80,63 @@ class RawTextToSpeechClient:
|
|
|
57
80
|
speaker : typing.Optional[TextToSpeechSpeaker]
|
|
58
81
|
The speaker voice to be used for the output audio.
|
|
59
82
|
|
|
60
|
-
**Default:** Anushka
|
|
83
|
+
**Default:** Anushka (for bulbul:v2), Aditya (for bulbul:v3-beta)
|
|
61
84
|
|
|
62
85
|
**Model Compatibility (Speakers compatible with respective model):**
|
|
63
86
|
- **bulbul:v2:**
|
|
64
87
|
- Female: Anushka, Manisha, Vidya, Arya
|
|
65
88
|
- Male: Abhilash, Karun, Hitesh
|
|
89
|
+
- **bulbul:v3-beta:**
|
|
90
|
+
- Aditya, Ritu, Priya, Neha, Rahul, Pooja, Rohan, Simran, Kavya, Amit, Dev, Ishita, Shreya, Ratan, Varun, Manan, Sumit, Roopa, Kabir, Aayan, Shubh, Ashutosh, Advait, Amelia, Sophia
|
|
66
91
|
|
|
67
92
|
**Note:** Speaker selection must match the chosen model version.
|
|
68
93
|
|
|
69
94
|
pitch : typing.Optional[float]
|
|
70
95
|
Controls the pitch of the audio. Lower values result in a deeper voice, while higher values make it sharper. The suitable range is between -0.75 and 0.75. Default is 0.0.
|
|
71
96
|
|
|
97
|
+
**Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
|
|
98
|
+
|
|
72
99
|
pace : typing.Optional[float]
|
|
73
|
-
Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster.
|
|
100
|
+
Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster. Default is 1.0.
|
|
101
|
+
|
|
102
|
+
**Model-specific ranges:**
|
|
103
|
+
- **bulbul:v2:** 0.3 to 3.0
|
|
104
|
+
- **bulbul:v3-beta:** 0.5 to 2.0
|
|
74
105
|
|
|
75
106
|
loudness : typing.Optional[float]
|
|
76
107
|
Controls the loudness of the audio. Lower values result in quieter audio, while higher values make it louder. The suitable range is between 0.3 and 3.0. Default is 1.0.
|
|
77
108
|
|
|
109
|
+
**Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
|
|
110
|
+
|
|
78
111
|
speech_sample_rate : typing.Optional[SpeechSampleRate]
|
|
79
|
-
Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz.
|
|
112
|
+
Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz.
|
|
113
|
+
|
|
114
|
+
**Model-specific defaults:**
|
|
115
|
+
- **bulbul:v2:** Default is 22050 Hz
|
|
116
|
+
- **bulbul:v3-beta:** Default is 24000 Hz
|
|
80
117
|
|
|
81
118
|
enable_preprocessing : typing.Optional[bool]
|
|
82
|
-
|
|
119
|
+
Controls whether normalization of English words and numeric entities (e.g., numbers, dates) is performed. Set to true for better handling of mixed-language text.
|
|
120
|
+
|
|
121
|
+
**Model-specific behavior:**
|
|
122
|
+
- **bulbul:v2:** Default is false
|
|
123
|
+
- **bulbul:v3-beta:** Automatically enabled (true) and cannot be disabled
|
|
83
124
|
|
|
84
125
|
model : typing.Optional[TextToSpeechModel]
|
|
85
|
-
Specifies the model to use for text-to-speech conversion.
|
|
126
|
+
Specifies the model to use for text-to-speech conversion.
|
|
127
|
+
|
|
128
|
+
**Available models:**
|
|
129
|
+
- **bulbul:v2:** Default model with pitch, loudness controls
|
|
130
|
+
- **bulbul:v3-beta:** Newer model with temperature control, improved quality
|
|
86
131
|
|
|
87
132
|
output_audio_codec : typing.Optional[TextToSpeechOutputAudioCodec]
|
|
88
133
|
Specifies the audio codec for the output audio file. Different codecs offer various compression and quality characteristics.
|
|
89
134
|
|
|
135
|
+
temperature : typing.Optional[float]
|
|
136
|
+
Controls the randomness of the output. Lower values make the output more focused and deterministic, while higher values make it more random. The suitable range is between 0.01 and 1.0. Default is 0.6.
|
|
137
|
+
|
|
138
|
+
**Note:** This parameter is only supported for bulbul:v3-beta. It has no effect on bulbul:v2.
|
|
139
|
+
|
|
90
140
|
request_options : typing.Optional[RequestOptions]
|
|
91
141
|
Request-specific configuration.
|
|
92
142
|
|
|
@@ -110,6 +160,7 @@ class RawTextToSpeechClient:
|
|
|
110
160
|
"enable_preprocessing": enable_preprocessing,
|
|
111
161
|
"model": model,
|
|
112
162
|
"output_audio_codec": output_audio_codec,
|
|
163
|
+
"temperature": temperature,
|
|
113
164
|
},
|
|
114
165
|
headers={
|
|
115
166
|
"content-type": "application/json",
|
|
@@ -205,15 +256,38 @@ class AsyncRawTextToSpeechClient:
|
|
|
205
256
|
enable_preprocessing: typing.Optional[bool] = OMIT,
|
|
206
257
|
model: typing.Optional[TextToSpeechModel] = OMIT,
|
|
207
258
|
output_audio_codec: typing.Optional[TextToSpeechOutputAudioCodec] = OMIT,
|
|
259
|
+
temperature: typing.Optional[float] = OMIT,
|
|
208
260
|
request_options: typing.Optional[RequestOptions] = None,
|
|
209
261
|
) -> AsyncHttpResponse[TextToSpeechResponse]:
|
|
210
262
|
"""
|
|
211
|
-
|
|
212
|
-
|
|
263
|
+
Convert text into spoken audio. The output is a wave file encoded as a base64 string.
|
|
264
|
+
|
|
265
|
+
**Available Models:**
|
|
266
|
+
- **bulbul:v2** (default): Supports pitch, loudness, and pace controls
|
|
267
|
+
- **bulbul:v3-beta**: Newer model with temperature control and improved quality
|
|
268
|
+
|
|
269
|
+
**Important Notes for bulbul:v3-beta:**
|
|
270
|
+
- Pitch and loudness parameters are NOT supported
|
|
271
|
+
- Pace must be between 0.5 and 2.0
|
|
272
|
+
- Preprocessing is automatically enabled
|
|
273
|
+
- Default sample rate is 24000 Hz
|
|
274
|
+
- Temperature parameter available (0.01-1.0, default 0.6)
|
|
213
275
|
|
|
214
276
|
Parameters
|
|
215
277
|
----------
|
|
216
278
|
text : str
|
|
279
|
+
The text(s) to be converted into speech.
|
|
280
|
+
|
|
281
|
+
**Features:**
|
|
282
|
+
- Supports code-mixed text (English and Indic languages)
|
|
283
|
+
|
|
284
|
+
**Model-specific limits:**
|
|
285
|
+
- **bulbul:v2:** Max 1500 characters
|
|
286
|
+
- **bulbul:v3-beta:** Max 2500 characters
|
|
287
|
+
|
|
288
|
+
**Important Note:**
|
|
289
|
+
- For numbers larger than 4 digits, use commas (e.g., '10,000' instead of '10000')
|
|
290
|
+
- This ensures proper pronunciation as a whole number
|
|
217
291
|
|
|
218
292
|
target_language_code : TextToSpeechLanguage
|
|
219
293
|
The language of the text in BCP-47 format
|
|
@@ -221,36 +295,63 @@ class AsyncRawTextToSpeechClient:
|
|
|
221
295
|
speaker : typing.Optional[TextToSpeechSpeaker]
|
|
222
296
|
The speaker voice to be used for the output audio.
|
|
223
297
|
|
|
224
|
-
**Default:** Anushka
|
|
298
|
+
**Default:** Anushka (for bulbul:v2), Aditya (for bulbul:v3-beta)
|
|
225
299
|
|
|
226
300
|
**Model Compatibility (Speakers compatible with respective model):**
|
|
227
301
|
- **bulbul:v2:**
|
|
228
302
|
- Female: Anushka, Manisha, Vidya, Arya
|
|
229
303
|
- Male: Abhilash, Karun, Hitesh
|
|
304
|
+
- **bulbul:v3-beta:**
|
|
305
|
+
- Aditya, Ritu, Priya, Neha, Rahul, Pooja, Rohan, Simran, Kavya, Amit, Dev, Ishita, Shreya, Ratan, Varun, Manan, Sumit, Roopa, Kabir, Aayan, Shubh, Ashutosh, Advait, Amelia, Sophia
|
|
230
306
|
|
|
231
307
|
**Note:** Speaker selection must match the chosen model version.
|
|
232
308
|
|
|
233
309
|
pitch : typing.Optional[float]
|
|
234
310
|
Controls the pitch of the audio. Lower values result in a deeper voice, while higher values make it sharper. The suitable range is between -0.75 and 0.75. Default is 0.0.
|
|
235
311
|
|
|
312
|
+
**Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
|
|
313
|
+
|
|
236
314
|
pace : typing.Optional[float]
|
|
237
|
-
Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster.
|
|
315
|
+
Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster. Default is 1.0.
|
|
316
|
+
|
|
317
|
+
**Model-specific ranges:**
|
|
318
|
+
- **bulbul:v2:** 0.3 to 3.0
|
|
319
|
+
- **bulbul:v3-beta:** 0.5 to 2.0
|
|
238
320
|
|
|
239
321
|
loudness : typing.Optional[float]
|
|
240
322
|
Controls the loudness of the audio. Lower values result in quieter audio, while higher values make it louder. The suitable range is between 0.3 and 3.0. Default is 1.0.
|
|
241
323
|
|
|
324
|
+
**Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
|
|
325
|
+
|
|
242
326
|
speech_sample_rate : typing.Optional[SpeechSampleRate]
|
|
243
|
-
Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz.
|
|
327
|
+
Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz.
|
|
328
|
+
|
|
329
|
+
**Model-specific defaults:**
|
|
330
|
+
- **bulbul:v2:** Default is 22050 Hz
|
|
331
|
+
- **bulbul:v3-beta:** Default is 24000 Hz
|
|
244
332
|
|
|
245
333
|
enable_preprocessing : typing.Optional[bool]
|
|
246
|
-
|
|
334
|
+
Controls whether normalization of English words and numeric entities (e.g., numbers, dates) is performed. Set to true for better handling of mixed-language text.
|
|
335
|
+
|
|
336
|
+
**Model-specific behavior:**
|
|
337
|
+
- **bulbul:v2:** Default is false
|
|
338
|
+
- **bulbul:v3-beta:** Automatically enabled (true) and cannot be disabled
|
|
247
339
|
|
|
248
340
|
model : typing.Optional[TextToSpeechModel]
|
|
249
|
-
Specifies the model to use for text-to-speech conversion.
|
|
341
|
+
Specifies the model to use for text-to-speech conversion.
|
|
342
|
+
|
|
343
|
+
**Available models:**
|
|
344
|
+
- **bulbul:v2:** Default model with pitch, loudness controls
|
|
345
|
+
- **bulbul:v3-beta:** Newer model with temperature control, improved quality
|
|
250
346
|
|
|
251
347
|
output_audio_codec : typing.Optional[TextToSpeechOutputAudioCodec]
|
|
252
348
|
Specifies the audio codec for the output audio file. Different codecs offer various compression and quality characteristics.
|
|
253
349
|
|
|
350
|
+
temperature : typing.Optional[float]
|
|
351
|
+
Controls the randomness of the output. Lower values make the output more focused and deterministic, while higher values make it more random. The suitable range is between 0.01 and 1.0. Default is 0.6.
|
|
352
|
+
|
|
353
|
+
**Note:** This parameter is only supported for bulbul:v3-beta. It has no effect on bulbul:v2.
|
|
354
|
+
|
|
254
355
|
request_options : typing.Optional[RequestOptions]
|
|
255
356
|
Request-specific configuration.
|
|
256
357
|
|
|
@@ -274,6 +375,7 @@ class AsyncRawTextToSpeechClient:
|
|
|
274
375
|
"enable_preprocessing": enable_preprocessing,
|
|
275
376
|
"model": model,
|
|
276
377
|
"output_audio_codec": output_audio_codec,
|
|
378
|
+
"temperature": temperature,
|
|
277
379
|
},
|
|
278
380
|
headers={
|
|
279
381
|
"content-type": "application/json",
|
|
@@ -2,6 +2,6 @@
|
|
|
2
2
|
|
|
3
3
|
# isort: skip_file
|
|
4
4
|
|
|
5
|
-
from .types import TextToSpeechStreamingSendCompletionEvent
|
|
5
|
+
from .types import TextToSpeechStreamingModel, TextToSpeechStreamingSendCompletionEvent
|
|
6
6
|
|
|
7
|
-
__all__ = ["TextToSpeechStreamingSendCompletionEvent"]
|
|
7
|
+
__all__ = ["TextToSpeechStreamingModel", "TextToSpeechStreamingSendCompletionEvent"]
|
|
@@ -11,6 +11,7 @@ from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
|
|
|
11
11
|
from ..core.request_options import RequestOptions
|
|
12
12
|
from .raw_client import AsyncRawTextToSpeechStreamingClient, RawTextToSpeechStreamingClient
|
|
13
13
|
from .socket_client import AsyncTextToSpeechStreamingSocketClient, TextToSpeechStreamingSocketClient
|
|
14
|
+
from .types.text_to_speech_streaming_model import TextToSpeechStreamingModel
|
|
14
15
|
from .types.text_to_speech_streaming_send_completion_event import TextToSpeechStreamingSendCompletionEvent
|
|
15
16
|
|
|
16
17
|
try:
|
|
@@ -38,7 +39,7 @@ class TextToSpeechStreamingClient:
|
|
|
38
39
|
def connect(
|
|
39
40
|
self,
|
|
40
41
|
*,
|
|
41
|
-
model: typing.Optional[
|
|
42
|
+
model: typing.Optional[TextToSpeechStreamingModel] = None,
|
|
42
43
|
send_completion_event: typing.Optional[TextToSpeechStreamingSendCompletionEvent] = None,
|
|
43
44
|
api_subscription_key: typing.Optional[str] = None,
|
|
44
45
|
request_options: typing.Optional[RequestOptions] = None,
|
|
@@ -50,10 +51,16 @@ class TextToSpeechStreamingClient:
|
|
|
50
51
|
The Try It playground may not provide the best experience for streaming audio.
|
|
51
52
|
For optimal streaming performance, please use the SDK or implement your own WebSocket client.
|
|
52
53
|
|
|
54
|
+
**Model-Specific Notes:**
|
|
55
|
+
- **bulbul:v2:** Supports pitch, loudness, pace (0.3-3.0). Default sample rate: 22050 Hz.
|
|
56
|
+
- **bulbul:v3-beta:** Does NOT support pitch/loudness. Pace range: 0.5-2.0. Supports temperature parameter. Default sample rate: 24000 Hz. Preprocessing is always enabled.
|
|
57
|
+
|
|
53
58
|
Parameters
|
|
54
59
|
----------
|
|
55
|
-
model : typing.Optional[
|
|
56
|
-
Text to speech model to use
|
|
60
|
+
model : typing.Optional[TextToSpeechStreamingModel]
|
|
61
|
+
Text to speech model to use.
|
|
62
|
+
- **bulbul:v2** (default): Standard TTS model with pitch/loudness support
|
|
63
|
+
- **bulbul:v3-beta**: Advanced model with temperature control (no pitch/loudness)
|
|
57
64
|
|
|
58
65
|
send_completion_event : typing.Optional[TextToSpeechStreamingSendCompletionEvent]
|
|
59
66
|
Enable completion event notifications when TTS generation finishes. When set to true, an event message will be sent when the final audio chunk has been generated.
|
|
@@ -117,7 +124,7 @@ class AsyncTextToSpeechStreamingClient:
|
|
|
117
124
|
async def connect(
|
|
118
125
|
self,
|
|
119
126
|
*,
|
|
120
|
-
model: typing.Optional[
|
|
127
|
+
model: typing.Optional[TextToSpeechStreamingModel] = None,
|
|
121
128
|
send_completion_event: typing.Optional[TextToSpeechStreamingSendCompletionEvent] = None,
|
|
122
129
|
api_subscription_key: typing.Optional[str] = None,
|
|
123
130
|
request_options: typing.Optional[RequestOptions] = None,
|
|
@@ -129,10 +136,16 @@ class AsyncTextToSpeechStreamingClient:
|
|
|
129
136
|
The Try It playground may not provide the best experience for streaming audio.
|
|
130
137
|
For optimal streaming performance, please use the SDK or implement your own WebSocket client.
|
|
131
138
|
|
|
139
|
+
**Model-Specific Notes:**
|
|
140
|
+
- **bulbul:v2:** Supports pitch, loudness, pace (0.3-3.0). Default sample rate: 22050 Hz.
|
|
141
|
+
- **bulbul:v3-beta:** Does NOT support pitch/loudness. Pace range: 0.5-2.0. Supports temperature parameter. Default sample rate: 24000 Hz. Preprocessing is always enabled.
|
|
142
|
+
|
|
132
143
|
Parameters
|
|
133
144
|
----------
|
|
134
|
-
model : typing.Optional[
|
|
135
|
-
Text to speech model to use
|
|
145
|
+
model : typing.Optional[TextToSpeechStreamingModel]
|
|
146
|
+
Text to speech model to use.
|
|
147
|
+
- **bulbul:v2** (default): Standard TTS model with pitch/loudness support
|
|
148
|
+
- **bulbul:v3-beta**: Advanced model with temperature control (no pitch/loudness)
|
|
136
149
|
|
|
137
150
|
send_completion_event : typing.Optional[TextToSpeechStreamingSendCompletionEvent]
|
|
138
151
|
Enable completion event notifications when TTS generation finishes. When set to true, an event message will be sent when the final audio chunk has been generated.
|
|
@@ -10,6 +10,7 @@ from ..core.api_error import ApiError
|
|
|
10
10
|
from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
|
|
11
11
|
from ..core.request_options import RequestOptions
|
|
12
12
|
from .socket_client import AsyncTextToSpeechStreamingSocketClient, TextToSpeechStreamingSocketClient
|
|
13
|
+
from .types.text_to_speech_streaming_model import TextToSpeechStreamingModel
|
|
13
14
|
from .types.text_to_speech_streaming_send_completion_event import TextToSpeechStreamingSendCompletionEvent
|
|
14
15
|
|
|
15
16
|
try:
|
|
@@ -26,7 +27,7 @@ class RawTextToSpeechStreamingClient:
|
|
|
26
27
|
def connect(
|
|
27
28
|
self,
|
|
28
29
|
*,
|
|
29
|
-
model: typing.Optional[
|
|
30
|
+
model: typing.Optional[TextToSpeechStreamingModel] = None,
|
|
30
31
|
send_completion_event: typing.Optional[TextToSpeechStreamingSendCompletionEvent] = None,
|
|
31
32
|
api_subscription_key: typing.Optional[str] = None,
|
|
32
33
|
request_options: typing.Optional[RequestOptions] = None,
|
|
@@ -38,10 +39,16 @@ class RawTextToSpeechStreamingClient:
|
|
|
38
39
|
The Try It playground may not provide the best experience for streaming audio.
|
|
39
40
|
For optimal streaming performance, please use the SDK or implement your own WebSocket client.
|
|
40
41
|
|
|
42
|
+
**Model-Specific Notes:**
|
|
43
|
+
- **bulbul:v2:** Supports pitch, loudness, pace (0.3-3.0). Default sample rate: 22050 Hz.
|
|
44
|
+
- **bulbul:v3-beta:** Does NOT support pitch/loudness. Pace range: 0.5-2.0. Supports temperature parameter. Default sample rate: 24000 Hz. Preprocessing is always enabled.
|
|
45
|
+
|
|
41
46
|
Parameters
|
|
42
47
|
----------
|
|
43
|
-
model : typing.Optional[
|
|
44
|
-
Text to speech model to use
|
|
48
|
+
model : typing.Optional[TextToSpeechStreamingModel]
|
|
49
|
+
Text to speech model to use.
|
|
50
|
+
- **bulbul:v2** (default): Standard TTS model with pitch/loudness support
|
|
51
|
+
- **bulbul:v3-beta**: Advanced model with temperature control (no pitch/loudness)
|
|
45
52
|
|
|
46
53
|
send_completion_event : typing.Optional[TextToSpeechStreamingSendCompletionEvent]
|
|
47
54
|
Enable completion event notifications when TTS generation finishes. When set to true, an event message will be sent when the final audio chunk has been generated.
|
|
@@ -94,7 +101,7 @@ class AsyncRawTextToSpeechStreamingClient:
|
|
|
94
101
|
async def connect(
|
|
95
102
|
self,
|
|
96
103
|
*,
|
|
97
|
-
model: typing.Optional[
|
|
104
|
+
model: typing.Optional[TextToSpeechStreamingModel] = None,
|
|
98
105
|
send_completion_event: typing.Optional[TextToSpeechStreamingSendCompletionEvent] = None,
|
|
99
106
|
api_subscription_key: typing.Optional[str] = None,
|
|
100
107
|
request_options: typing.Optional[RequestOptions] = None,
|
|
@@ -106,10 +113,16 @@ class AsyncRawTextToSpeechStreamingClient:
|
|
|
106
113
|
The Try It playground may not provide the best experience for streaming audio.
|
|
107
114
|
For optimal streaming performance, please use the SDK or implement your own WebSocket client.
|
|
108
115
|
|
|
116
|
+
**Model-Specific Notes:**
|
|
117
|
+
- **bulbul:v2:** Supports pitch, loudness, pace (0.3-3.0). Default sample rate: 22050 Hz.
|
|
118
|
+
- **bulbul:v3-beta:** Does NOT support pitch/loudness. Pace range: 0.5-2.0. Supports temperature parameter. Default sample rate: 24000 Hz. Preprocessing is always enabled.
|
|
119
|
+
|
|
109
120
|
Parameters
|
|
110
121
|
----------
|
|
111
|
-
model : typing.Optional[
|
|
112
|
-
Text to speech model to use
|
|
122
|
+
model : typing.Optional[TextToSpeechStreamingModel]
|
|
123
|
+
Text to speech model to use.
|
|
124
|
+
- **bulbul:v2** (default): Standard TTS model with pitch/loudness support
|
|
125
|
+
- **bulbul:v3-beta**: Advanced model with temperature control (no pitch/loudness)
|
|
113
126
|
|
|
114
127
|
send_completion_event : typing.Optional[TextToSpeechStreamingSendCompletionEvent]
|
|
115
128
|
Enable completion event notifications when TTS generation finishes. When set to true, an event message will be sent when the final audio chunk has been generated.
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
# isort: skip_file
|
|
4
4
|
|
|
5
|
+
from .text_to_speech_streaming_model import TextToSpeechStreamingModel
|
|
5
6
|
from .text_to_speech_streaming_send_completion_event import TextToSpeechStreamingSendCompletionEvent
|
|
6
7
|
|
|
7
|
-
__all__ = ["TextToSpeechStreamingSendCompletionEvent"]
|
|
8
|
+
__all__ = ["TextToSpeechStreamingModel", "TextToSpeechStreamingSendCompletionEvent"]
|
sarvamai/types/__init__.py
CHANGED
|
@@ -3,7 +3,6 @@
|
|
|
3
3
|
# isort: skip_file
|
|
4
4
|
|
|
5
5
|
from .audio_data import AudioData
|
|
6
|
-
from .audio_data_input_audio_codec import AudioDataInputAudioCodec
|
|
7
6
|
from .audio_message import AudioMessage
|
|
8
7
|
from .audio_output import AudioOutput
|
|
9
8
|
from .audio_output_data import AudioOutputData
|
|
@@ -26,6 +25,7 @@ from .completion_usage import CompletionUsage
|
|
|
26
25
|
from .config_message import ConfigMessage
|
|
27
26
|
from .configure_connection import ConfigureConnection
|
|
28
27
|
from .configure_connection_data import ConfigureConnectionData
|
|
28
|
+
from .configure_connection_data_model import ConfigureConnectionDataModel
|
|
29
29
|
from .configure_connection_data_output_audio_bitrate import ConfigureConnectionDataOutputAudioBitrate
|
|
30
30
|
from .configure_connection_data_output_audio_codec import ConfigureConnectionDataOutputAudioCodec
|
|
31
31
|
from .configure_connection_data_speaker import ConfigureConnectionDataSpeaker
|
|
@@ -34,6 +34,21 @@ from .connection_sample_rate import ConnectionSampleRate
|
|
|
34
34
|
from .create_chat_completion_response import CreateChatCompletionResponse
|
|
35
35
|
from .diarized_entry import DiarizedEntry
|
|
36
36
|
from .diarized_transcript import DiarizedTranscript
|
|
37
|
+
from .doc_digitization_create_job_response import DocDigitizationCreateJobResponse
|
|
38
|
+
from .doc_digitization_download_files_response import DocDigitizationDownloadFilesResponse
|
|
39
|
+
from .doc_digitization_error_code import DocDigitizationErrorCode
|
|
40
|
+
from .doc_digitization_error_details import DocDigitizationErrorDetails
|
|
41
|
+
from .doc_digitization_error_message import DocDigitizationErrorMessage
|
|
42
|
+
from .doc_digitization_job_detail import DocDigitizationJobDetail
|
|
43
|
+
from .doc_digitization_job_detail_state import DocDigitizationJobDetailState
|
|
44
|
+
from .doc_digitization_job_parameters import DocDigitizationJobParameters
|
|
45
|
+
from .doc_digitization_job_state import DocDigitizationJobState
|
|
46
|
+
from .doc_digitization_job_status_response import DocDigitizationJobStatusResponse
|
|
47
|
+
from .doc_digitization_output_format import DocDigitizationOutputFormat
|
|
48
|
+
from .doc_digitization_page_error import DocDigitizationPageError
|
|
49
|
+
from .doc_digitization_supported_language import DocDigitizationSupportedLanguage
|
|
50
|
+
from .doc_digitization_upload_files_response import DocDigitizationUploadFilesResponse
|
|
51
|
+
from .doc_digitization_webhook_callback import DocDigitizationWebhookCallback
|
|
37
52
|
from .error_code import ErrorCode
|
|
38
53
|
from .error_data import ErrorData
|
|
39
54
|
from .error_details import ErrorDetails
|
|
@@ -54,6 +69,7 @@ from .input_audio_codec import InputAudioCodec
|
|
|
54
69
|
from .job_state import JobState
|
|
55
70
|
from .job_status_v_1_response import JobStatusV1Response
|
|
56
71
|
from .language_identification_response import LanguageIdentificationResponse
|
|
72
|
+
from .mode import Mode
|
|
57
73
|
from .numerals_format import NumeralsFormat
|
|
58
74
|
from .ping_signal import PingSignal
|
|
59
75
|
from .reasoning_effort import ReasoningEffort
|
|
@@ -104,7 +120,6 @@ from .transliteration_response import TransliterationResponse
|
|
|
104
120
|
|
|
105
121
|
__all__ = [
|
|
106
122
|
"AudioData",
|
|
107
|
-
"AudioDataInputAudioCodec",
|
|
108
123
|
"AudioMessage",
|
|
109
124
|
"AudioOutput",
|
|
110
125
|
"AudioOutputData",
|
|
@@ -125,6 +140,7 @@ __all__ = [
|
|
|
125
140
|
"ConfigMessage",
|
|
126
141
|
"ConfigureConnection",
|
|
127
142
|
"ConfigureConnectionData",
|
|
143
|
+
"ConfigureConnectionDataModel",
|
|
128
144
|
"ConfigureConnectionDataOutputAudioBitrate",
|
|
129
145
|
"ConfigureConnectionDataOutputAudioCodec",
|
|
130
146
|
"ConfigureConnectionDataSpeaker",
|
|
@@ -133,6 +149,21 @@ __all__ = [
|
|
|
133
149
|
"CreateChatCompletionResponse",
|
|
134
150
|
"DiarizedEntry",
|
|
135
151
|
"DiarizedTranscript",
|
|
152
|
+
"DocDigitizationCreateJobResponse",
|
|
153
|
+
"DocDigitizationDownloadFilesResponse",
|
|
154
|
+
"DocDigitizationErrorCode",
|
|
155
|
+
"DocDigitizationErrorDetails",
|
|
156
|
+
"DocDigitizationErrorMessage",
|
|
157
|
+
"DocDigitizationJobDetail",
|
|
158
|
+
"DocDigitizationJobDetailState",
|
|
159
|
+
"DocDigitizationJobParameters",
|
|
160
|
+
"DocDigitizationJobState",
|
|
161
|
+
"DocDigitizationJobStatusResponse",
|
|
162
|
+
"DocDigitizationOutputFormat",
|
|
163
|
+
"DocDigitizationPageError",
|
|
164
|
+
"DocDigitizationSupportedLanguage",
|
|
165
|
+
"DocDigitizationUploadFilesResponse",
|
|
166
|
+
"DocDigitizationWebhookCallback",
|
|
136
167
|
"ErrorCode",
|
|
137
168
|
"ErrorData",
|
|
138
169
|
"ErrorDetails",
|
|
@@ -153,6 +184,7 @@ __all__ = [
|
|
|
153
184
|
"JobState",
|
|
154
185
|
"JobStatusV1Response",
|
|
155
186
|
"LanguageIdentificationResponse",
|
|
187
|
+
"Mode",
|
|
156
188
|
"NumeralsFormat",
|
|
157
189
|
"PingSignal",
|
|
158
190
|
"ReasoningEffort",
|
sarvamai/types/audio_data.py
CHANGED
|
@@ -4,7 +4,6 @@ import typing
|
|
|
4
4
|
|
|
5
5
|
import pydantic
|
|
6
6
|
from ..core.pydantic_utilities import IS_PYDANTIC_V2, UniversalBaseModel
|
|
7
|
-
from .audio_data_input_audio_codec import AudioDataInputAudioCodec
|
|
8
7
|
|
|
9
8
|
|
|
10
9
|
class AudioData(UniversalBaseModel):
|
|
@@ -29,11 +28,6 @@ class AudioData(UniversalBaseModel):
|
|
|
29
28
|
Audio encoding format
|
|
30
29
|
"""
|
|
31
30
|
|
|
32
|
-
input_audio_codec: typing.Optional[AudioDataInputAudioCodec] = pydantic.Field(default=None)
|
|
33
|
-
"""
|
|
34
|
-
Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files supports sample rate 16000 and 8000.
|
|
35
|
-
"""
|
|
36
|
-
|
|
37
31
|
if IS_PYDANTIC_V2:
|
|
38
32
|
model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
|
|
39
33
|
else:
|
|
@@ -13,6 +13,10 @@ class ConfigureConnection(UniversalBaseModel):
|
|
|
13
13
|
This initializes TTS parameters and can be updated at any time during the WebSocket lifecycle
|
|
14
14
|
by sending a new config message. When a config update is sent, any text currently in the buffer
|
|
15
15
|
will be automatically flushed and processed before applying the new configuration.
|
|
16
|
+
|
|
17
|
+
**Model-Specific Notes:**
|
|
18
|
+
- **bulbul:v2:** Supports pitch, loudness, pace (0.3-3.0). Default sample rate: 22050 Hz.
|
|
19
|
+
- **bulbul:v3-beta:** Does NOT support pitch/loudness. Pace range: 0.5-2.0. Supports temperature. Default sample rate: 24000 Hz.
|
|
16
20
|
"""
|
|
17
21
|
|
|
18
22
|
type: typing.Literal["config"] = "config"
|
|
@@ -4,6 +4,7 @@ import typing
|
|
|
4
4
|
|
|
5
5
|
import pydantic
|
|
6
6
|
from ..core.pydantic_utilities import IS_PYDANTIC_V2, UniversalBaseModel
|
|
7
|
+
from .configure_connection_data_model import ConfigureConnectionDataModel
|
|
7
8
|
from .configure_connection_data_output_audio_bitrate import ConfigureConnectionDataOutputAudioBitrate
|
|
8
9
|
from .configure_connection_data_output_audio_codec import ConfigureConnectionDataOutputAudioCodec
|
|
9
10
|
from .configure_connection_data_speaker import ConfigureConnectionDataSpeaker
|
|
@@ -11,21 +12,25 @@ from .configure_connection_data_target_language_code import ConfigureConnectionD
|
|
|
11
12
|
|
|
12
13
|
|
|
13
14
|
class ConfigureConnectionData(UniversalBaseModel):
|
|
15
|
+
model: typing.Optional[ConfigureConnectionDataModel] = pydantic.Field(default=None)
|
|
16
|
+
"""
|
|
17
|
+
Specifies the model to use for text-to-speech conversion.
|
|
18
|
+
- **bulbul:v2** (default): Standard TTS model with pitch/loudness support
|
|
19
|
+
- **bulbul:v3-beta**: Advanced model with temperature control (no pitch/loudness)
|
|
20
|
+
"""
|
|
21
|
+
|
|
14
22
|
target_language_code: ConfigureConnectionDataTargetLanguageCode = pydantic.Field()
|
|
15
23
|
"""
|
|
16
|
-
The language of the text
|
|
24
|
+
The language of the text in BCP-47 format
|
|
17
25
|
"""
|
|
18
26
|
|
|
19
27
|
speaker: ConfigureConnectionDataSpeaker = pydantic.Field()
|
|
20
28
|
"""
|
|
21
29
|
The speaker voice to be used for the output audio.
|
|
22
30
|
|
|
23
|
-
**
|
|
24
|
-
|
|
25
|
-
**
|
|
26
|
-
- **bulbul:v2:**
|
|
27
|
-
- Female: Anushka, Manisha, Vidya, Arya
|
|
28
|
-
- Male: Abhilash, Karun, Hitesh
|
|
31
|
+
**Model Compatibility:**
|
|
32
|
+
- **bulbul:v2:** anushka (default), abhilash, manisha, vidya, arya, karun, hitesh
|
|
33
|
+
- **bulbul:v3-beta:** aditya (default), ritu, priya, neha, rahul, pooja, rohan, simran, kavya, amit, dev, ishita, shreya, ratan, varun, manan, sumit, roopa, kabir, aayan, shubh, ashutosh, advait, amelia, sophia
|
|
29
34
|
|
|
30
35
|
**Note:** Speaker selection must match the chosen model version.
|
|
31
36
|
"""
|
|
@@ -35,13 +40,18 @@ class ConfigureConnectionData(UniversalBaseModel):
|
|
|
35
40
|
Controls the pitch of the audio. Lower values result in a deeper voice,
|
|
36
41
|
while higher values make it sharper. The suitable range is between -0.75
|
|
37
42
|
and 0.75. Default is 0.0.
|
|
43
|
+
|
|
44
|
+
**Note:** NOT supported for bulbul:v3-beta. Will be ignored if provided.
|
|
38
45
|
"""
|
|
39
46
|
|
|
40
47
|
pace: typing.Optional[float] = pydantic.Field(default=None)
|
|
41
48
|
"""
|
|
42
49
|
Controls the speed of the audio. Lower values result in slower speech,
|
|
43
|
-
while higher values make it faster.
|
|
44
|
-
|
|
50
|
+
while higher values make it faster. Default is 1.0.
|
|
51
|
+
|
|
52
|
+
**Model-specific ranges:**
|
|
53
|
+
- **bulbul:v2:** 0.3 to 3.0
|
|
54
|
+
- **bulbul:v3-beta:** 0.5 to 2.0
|
|
45
55
|
"""
|
|
46
56
|
|
|
47
57
|
loudness: typing.Optional[float] = pydantic.Field(default=None)
|
|
@@ -49,19 +59,38 @@ class ConfigureConnectionData(UniversalBaseModel):
|
|
|
49
59
|
Controls the loudness of the audio. Lower values result in quieter audio,
|
|
50
60
|
while higher values make it louder. The suitable range is between 0.3
|
|
51
61
|
and 3.0. Default is 1.0.
|
|
62
|
+
|
|
63
|
+
**Note:** NOT supported for bulbul:v3-beta. Will be ignored if provided.
|
|
64
|
+
"""
|
|
65
|
+
|
|
66
|
+
temperature: typing.Optional[float] = pydantic.Field(default=None)
|
|
67
|
+
"""
|
|
68
|
+
Controls the randomness of the output. Lower values make the output more
|
|
69
|
+
focused and deterministic, while higher values make it more random.
|
|
70
|
+
The suitable range is between 0.01 and 1.0. Default is 0.6.
|
|
71
|
+
|
|
72
|
+
**Note:** Only supported for bulbul:v3-beta. Will be ignored for bulbul:v2.
|
|
52
73
|
"""
|
|
53
74
|
|
|
54
75
|
speech_sample_rate: typing.Optional[int] = pydantic.Field(default=None)
|
|
55
76
|
"""
|
|
56
77
|
Specifies the sample rate of the output audio. Supported values are
|
|
57
|
-
8000, 16000, 22050, 24000 Hz.
|
|
78
|
+
8000, 16000, 22050, 24000 Hz.
|
|
79
|
+
|
|
80
|
+
**Model-specific defaults:**
|
|
81
|
+
- **bulbul:v2:** 22050 Hz
|
|
82
|
+
- **bulbul:v3-beta:** 24000 Hz
|
|
58
83
|
"""
|
|
59
84
|
|
|
60
85
|
enable_preprocessing: typing.Optional[bool] = pydantic.Field(default=None)
|
|
61
86
|
"""
|
|
62
87
|
Controls whether normalization of English words and numeric entities
|
|
63
88
|
(e.g., numbers, dates) is performed. Set to true for better handling
|
|
64
|
-
of mixed-language text.
|
|
89
|
+
of mixed-language text.
|
|
90
|
+
|
|
91
|
+
**Model-specific defaults:**
|
|
92
|
+
- **bulbul:v2:** false (optional)
|
|
93
|
+
- **bulbul:v3-beta:** Always enabled (cannot be disabled)
|
|
65
94
|
"""
|
|
66
95
|
|
|
67
96
|
output_audio_codec: typing.Optional[ConfigureConnectionDataOutputAudioCodec] = pydantic.Field(default=None)
|
|
@@ -3,5 +3,39 @@
|
|
|
3
3
|
import typing
|
|
4
4
|
|
|
5
5
|
ConfigureConnectionDataSpeaker = typing.Union[
|
|
6
|
-
typing.Literal[
|
|
6
|
+
typing.Literal[
|
|
7
|
+
"anushka",
|
|
8
|
+
"abhilash",
|
|
9
|
+
"manisha",
|
|
10
|
+
"vidya",
|
|
11
|
+
"arya",
|
|
12
|
+
"karun",
|
|
13
|
+
"hitesh",
|
|
14
|
+
"aditya",
|
|
15
|
+
"ritu",
|
|
16
|
+
"priya",
|
|
17
|
+
"neha",
|
|
18
|
+
"rahul",
|
|
19
|
+
"pooja",
|
|
20
|
+
"rohan",
|
|
21
|
+
"simran",
|
|
22
|
+
"kavya",
|
|
23
|
+
"amit",
|
|
24
|
+
"dev",
|
|
25
|
+
"ishita",
|
|
26
|
+
"shreya",
|
|
27
|
+
"ratan",
|
|
28
|
+
"varun",
|
|
29
|
+
"manan",
|
|
30
|
+
"sumit",
|
|
31
|
+
"roopa",
|
|
32
|
+
"kabir",
|
|
33
|
+
"aayan",
|
|
34
|
+
"shubh",
|
|
35
|
+
"ashutosh",
|
|
36
|
+
"advait",
|
|
37
|
+
"amelia",
|
|
38
|
+
"sophia",
|
|
39
|
+
],
|
|
40
|
+
typing.Any,
|
|
7
41
|
]
|