sarvamai 0.1.22a4__py3-none-any.whl → 0.1.22a7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sarvamai/__init__.py +62 -3
- sarvamai/client.py +3 -0
- sarvamai/core/client_wrapper.py +2 -2
- sarvamai/doc_digitization_job/__init__.py +4 -0
- sarvamai/doc_digitization_job/client.py +776 -0
- sarvamai/doc_digitization_job/job.py +496 -0
- sarvamai/doc_digitization_job/raw_client.py +1176 -0
- sarvamai/requests/__init__.py +20 -0
- sarvamai/requests/audio_data.py +0 -6
- sarvamai/requests/configure_connection.py +4 -0
- sarvamai/requests/configure_connection_data.py +40 -11
- sarvamai/requests/doc_digitization_create_job_response.py +25 -0
- sarvamai/requests/doc_digitization_download_files_response.py +37 -0
- sarvamai/requests/doc_digitization_error_details.py +21 -0
- sarvamai/requests/doc_digitization_error_message.py +11 -0
- sarvamai/requests/doc_digitization_job_detail.py +64 -0
- sarvamai/requests/doc_digitization_job_parameters.py +21 -0
- sarvamai/requests/doc_digitization_job_status_response.py +65 -0
- sarvamai/requests/doc_digitization_page_error.py +24 -0
- sarvamai/requests/doc_digitization_upload_files_response.py +34 -0
- sarvamai/requests/doc_digitization_webhook_callback.py +19 -0
- sarvamai/requests/speech_to_text_job_parameters.py +43 -2
- sarvamai/requests/speech_to_text_translate_job_parameters.py +4 -1
- sarvamai/speech_to_text/client.py +95 -10
- sarvamai/speech_to_text/raw_client.py +95 -10
- sarvamai/speech_to_text_job/client.py +60 -15
- sarvamai/speech_to_text_streaming/__init__.py +4 -0
- sarvamai/speech_to_text_streaming/client.py +102 -18
- sarvamai/speech_to_text_streaming/raw_client.py +102 -18
- sarvamai/speech_to_text_streaming/types/__init__.py +4 -0
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_input_audio_codec.py +1 -27
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_mode.py +7 -0
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_model.py +5 -0
- sarvamai/speech_to_text_translate_streaming/client.py +20 -12
- sarvamai/speech_to_text_translate_streaming/raw_client.py +20 -12
- sarvamai/speech_to_text_translate_streaming/types/speech_to_text_translate_streaming_input_audio_codec.py +1 -27
- sarvamai/text/client.py +0 -12
- sarvamai/text/raw_client.py +0 -12
- sarvamai/text_to_speech/client.py +116 -14
- sarvamai/text_to_speech/raw_client.py +116 -14
- sarvamai/text_to_speech_streaming/__init__.py +2 -2
- sarvamai/text_to_speech_streaming/client.py +19 -6
- sarvamai/text_to_speech_streaming/raw_client.py +19 -6
- sarvamai/text_to_speech_streaming/types/__init__.py +2 -1
- sarvamai/text_to_speech_streaming/types/text_to_speech_streaming_model.py +5 -0
- sarvamai/types/__init__.py +34 -2
- sarvamai/types/audio_data.py +0 -6
- sarvamai/types/configure_connection.py +4 -0
- sarvamai/types/configure_connection_data.py +40 -11
- sarvamai/types/configure_connection_data_model.py +5 -0
- sarvamai/types/configure_connection_data_speaker.py +35 -1
- sarvamai/types/doc_digitization_create_job_response.py +37 -0
- sarvamai/types/doc_digitization_download_files_response.py +47 -0
- sarvamai/types/doc_digitization_error_code.py +15 -0
- sarvamai/types/doc_digitization_error_details.py +33 -0
- sarvamai/types/doc_digitization_error_message.py +23 -0
- sarvamai/types/doc_digitization_job_detail.py +74 -0
- sarvamai/types/doc_digitization_job_detail_state.py +7 -0
- sarvamai/types/doc_digitization_job_parameters.py +33 -0
- sarvamai/types/doc_digitization_job_state.py +7 -0
- sarvamai/types/doc_digitization_job_status_response.py +75 -0
- sarvamai/types/doc_digitization_output_format.py +5 -0
- sarvamai/types/doc_digitization_page_error.py +36 -0
- sarvamai/types/doc_digitization_supported_language.py +32 -0
- sarvamai/types/doc_digitization_upload_files_response.py +44 -0
- sarvamai/types/doc_digitization_webhook_callback.py +31 -0
- sarvamai/types/mode.py +5 -0
- sarvamai/types/speech_to_text_job_parameters.py +43 -2
- sarvamai/types/speech_to_text_model.py +1 -1
- sarvamai/types/speech_to_text_translate_job_parameters.py +4 -1
- sarvamai/types/text_to_speech_model.py +1 -1
- sarvamai/types/text_to_speech_speaker.py +35 -1
- {sarvamai-0.1.22a4.dist-info → sarvamai-0.1.22a7.dist-info}/METADATA +1 -1
- {sarvamai-0.1.22a4.dist-info → sarvamai-0.1.22a7.dist-info}/RECORD +75 -42
- sarvamai/types/audio_data_input_audio_codec.py +0 -33
- {sarvamai-0.1.22a4.dist-info → sarvamai-0.1.22a7.dist-info}/WHEEL +0 -0
|
@@ -32,11 +32,11 @@ class RawSpeechToTextTranslateStreamingClient:
|
|
|
32
32
|
self,
|
|
33
33
|
*,
|
|
34
34
|
model: typing.Optional[typing.Literal["saaras:v2.5"]] = None,
|
|
35
|
-
input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
|
|
36
35
|
sample_rate: typing.Optional[str] = None,
|
|
37
36
|
high_vad_sensitivity: typing.Optional[SpeechToTextTranslateStreamingHighVadSensitivity] = None,
|
|
38
37
|
vad_signals: typing.Optional[SpeechToTextTranslateStreamingVadSignals] = None,
|
|
39
38
|
flush_signal: typing.Optional[SpeechToTextTranslateStreamingFlushSignal] = None,
|
|
39
|
+
input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
|
|
40
40
|
api_subscription_key: typing.Optional[str] = None,
|
|
41
41
|
request_options: typing.Optional[RequestOptions] = None,
|
|
42
42
|
) -> typing.Iterator[SpeechToTextTranslateStreamingSocketClient]:
|
|
@@ -50,10 +50,10 @@ class RawSpeechToTextTranslateStreamingClient:
|
|
|
50
50
|
Parameters
|
|
51
51
|
----------
|
|
52
52
|
model : typing.Optional[typing.Literal["saaras:v2.5"]]
|
|
53
|
-
|
|
53
|
+
Model to be used for speech to text translation.
|
|
54
54
|
|
|
55
|
-
|
|
56
|
-
|
|
55
|
+
- **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
|
|
56
|
+
- Example: Hindi audio → English text output
|
|
57
57
|
|
|
58
58
|
sample_rate : typing.Optional[str]
|
|
59
59
|
Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
|
|
@@ -67,6 +67,10 @@ class RawSpeechToTextTranslateStreamingClient:
|
|
|
67
67
|
flush_signal : typing.Optional[SpeechToTextTranslateStreamingFlushSignal]
|
|
68
68
|
Signal to flush the audio buffer and finalize transcription and translation
|
|
69
69
|
|
|
70
|
+
input_audio_codec : typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec]
|
|
71
|
+
Audio codec/format of the input stream. Use this when sending raw PCM audio.
|
|
72
|
+
Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
|
|
73
|
+
|
|
70
74
|
api_subscription_key : typing.Optional[str]
|
|
71
75
|
API subscription key for authentication
|
|
72
76
|
|
|
@@ -81,8 +85,6 @@ class RawSpeechToTextTranslateStreamingClient:
|
|
|
81
85
|
query_params = httpx.QueryParams()
|
|
82
86
|
if model is not None:
|
|
83
87
|
query_params = query_params.add("model", model)
|
|
84
|
-
if input_audio_codec is not None:
|
|
85
|
-
query_params = query_params.add("input_audio_codec", input_audio_codec)
|
|
86
88
|
if sample_rate is not None:
|
|
87
89
|
query_params = query_params.add("sample_rate", sample_rate)
|
|
88
90
|
if high_vad_sensitivity is not None:
|
|
@@ -91,6 +93,8 @@ class RawSpeechToTextTranslateStreamingClient:
|
|
|
91
93
|
query_params = query_params.add("vad_signals", vad_signals)
|
|
92
94
|
if flush_signal is not None:
|
|
93
95
|
query_params = query_params.add("flush_signal", flush_signal)
|
|
96
|
+
if input_audio_codec is not None:
|
|
97
|
+
query_params = query_params.add("input_audio_codec", input_audio_codec)
|
|
94
98
|
ws_url = ws_url + f"?{query_params}"
|
|
95
99
|
headers = self._client_wrapper.get_headers()
|
|
96
100
|
if api_subscription_key is not None:
|
|
@@ -124,11 +128,11 @@ class AsyncRawSpeechToTextTranslateStreamingClient:
|
|
|
124
128
|
self,
|
|
125
129
|
*,
|
|
126
130
|
model: typing.Optional[typing.Literal["saaras:v2.5"]] = None,
|
|
127
|
-
input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
|
|
128
131
|
sample_rate: typing.Optional[str] = None,
|
|
129
132
|
high_vad_sensitivity: typing.Optional[SpeechToTextTranslateStreamingHighVadSensitivity] = None,
|
|
130
133
|
vad_signals: typing.Optional[SpeechToTextTranslateStreamingVadSignals] = None,
|
|
131
134
|
flush_signal: typing.Optional[SpeechToTextTranslateStreamingFlushSignal] = None,
|
|
135
|
+
input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
|
|
132
136
|
api_subscription_key: typing.Optional[str] = None,
|
|
133
137
|
request_options: typing.Optional[RequestOptions] = None,
|
|
134
138
|
) -> typing.AsyncIterator[AsyncSpeechToTextTranslateStreamingSocketClient]:
|
|
@@ -142,10 +146,10 @@ class AsyncRawSpeechToTextTranslateStreamingClient:
|
|
|
142
146
|
Parameters
|
|
143
147
|
----------
|
|
144
148
|
model : typing.Optional[typing.Literal["saaras:v2.5"]]
|
|
145
|
-
|
|
149
|
+
Model to be used for speech to text translation.
|
|
146
150
|
|
|
147
|
-
|
|
148
|
-
|
|
151
|
+
- **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
|
|
152
|
+
- Example: Hindi audio → English text output
|
|
149
153
|
|
|
150
154
|
sample_rate : typing.Optional[str]
|
|
151
155
|
Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
|
|
@@ -159,6 +163,10 @@ class AsyncRawSpeechToTextTranslateStreamingClient:
|
|
|
159
163
|
flush_signal : typing.Optional[SpeechToTextTranslateStreamingFlushSignal]
|
|
160
164
|
Signal to flush the audio buffer and finalize transcription and translation
|
|
161
165
|
|
|
166
|
+
input_audio_codec : typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec]
|
|
167
|
+
Audio codec/format of the input stream. Use this when sending raw PCM audio.
|
|
168
|
+
Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
|
|
169
|
+
|
|
162
170
|
api_subscription_key : typing.Optional[str]
|
|
163
171
|
API subscription key for authentication
|
|
164
172
|
|
|
@@ -173,8 +181,6 @@ class AsyncRawSpeechToTextTranslateStreamingClient:
|
|
|
173
181
|
query_params = httpx.QueryParams()
|
|
174
182
|
if model is not None:
|
|
175
183
|
query_params = query_params.add("model", model)
|
|
176
|
-
if input_audio_codec is not None:
|
|
177
|
-
query_params = query_params.add("input_audio_codec", input_audio_codec)
|
|
178
184
|
if sample_rate is not None:
|
|
179
185
|
query_params = query_params.add("sample_rate", sample_rate)
|
|
180
186
|
if high_vad_sensitivity is not None:
|
|
@@ -183,6 +189,8 @@ class AsyncRawSpeechToTextTranslateStreamingClient:
|
|
|
183
189
|
query_params = query_params.add("vad_signals", vad_signals)
|
|
184
190
|
if flush_signal is not None:
|
|
185
191
|
query_params = query_params.add("flush_signal", flush_signal)
|
|
192
|
+
if input_audio_codec is not None:
|
|
193
|
+
query_params = query_params.add("input_audio_codec", input_audio_codec)
|
|
186
194
|
ws_url = ws_url + f"?{query_params}"
|
|
187
195
|
headers = self._client_wrapper.get_headers()
|
|
188
196
|
if api_subscription_key is not None:
|
|
@@ -3,31 +3,5 @@
|
|
|
3
3
|
import typing
|
|
4
4
|
|
|
5
5
|
SpeechToTextTranslateStreamingInputAudioCodec = typing.Union[
|
|
6
|
-
typing.Literal[
|
|
7
|
-
"wav",
|
|
8
|
-
"x-wav",
|
|
9
|
-
"wave",
|
|
10
|
-
"mp3",
|
|
11
|
-
"mpeg",
|
|
12
|
-
"mpeg3",
|
|
13
|
-
"x-mp3",
|
|
14
|
-
"x-mpeg-3",
|
|
15
|
-
"aac",
|
|
16
|
-
"x-aac",
|
|
17
|
-
"aiff",
|
|
18
|
-
"x-aiff",
|
|
19
|
-
"ogg",
|
|
20
|
-
"opus",
|
|
21
|
-
"flac",
|
|
22
|
-
"x-flac",
|
|
23
|
-
"mp4",
|
|
24
|
-
"x-m4a",
|
|
25
|
-
"amr",
|
|
26
|
-
"x-ms-wma",
|
|
27
|
-
"webm",
|
|
28
|
-
"pcm_s16le",
|
|
29
|
-
"pcm_l16",
|
|
30
|
-
"pcm_raw",
|
|
31
|
-
],
|
|
32
|
-
typing.Any,
|
|
6
|
+
typing.Literal["wav", "pcm_s16le", "pcm_l16", "pcm_raw"], typing.Any
|
|
33
7
|
]
|
sarvamai/text/client.py
CHANGED
|
@@ -47,7 +47,6 @@ class TextClient:
|
|
|
47
47
|
speaker_gender: typing.Optional[TranslateSpeakerGender] = OMIT,
|
|
48
48
|
mode: typing.Optional[TranslateMode] = OMIT,
|
|
49
49
|
model: typing.Optional[TranslateModel] = OMIT,
|
|
50
|
-
enable_preprocessing: typing.Optional[bool] = OMIT,
|
|
51
50
|
output_script: typing.Optional[TransliterateMode] = OMIT,
|
|
52
51
|
numerals_format: typing.Optional[NumeralsFormat] = OMIT,
|
|
53
52
|
request_options: typing.Optional[RequestOptions] = None,
|
|
@@ -125,10 +124,6 @@ class TextClient:
|
|
|
125
124
|
- mayura:v1: Supports 12 languages with all modes, output scripts, and automatic language detection.
|
|
126
125
|
- sarvam-translate:v1: Supports all 22 scheduled languages of India, formal mode only.
|
|
127
126
|
|
|
128
|
-
enable_preprocessing : typing.Optional[bool]
|
|
129
|
-
This will enable custom preprocessing of the input text which can result in better translations.
|
|
130
|
-
Recommendation- You can switch on whenever there is some complex text with difficult vocabulary and sentences, for which you want simple translations that people can understand.
|
|
131
|
-
|
|
132
127
|
output_script : typing.Optional[TransliterateMode]
|
|
133
128
|
**output_script**: This is an optional parameter which controls the transliteration style applied to the output text.
|
|
134
129
|
|
|
@@ -186,7 +181,6 @@ class TextClient:
|
|
|
186
181
|
speaker_gender=speaker_gender,
|
|
187
182
|
mode=mode,
|
|
188
183
|
model=model,
|
|
189
|
-
enable_preprocessing=enable_preprocessing,
|
|
190
184
|
output_script=output_script,
|
|
191
185
|
numerals_format=numerals_format,
|
|
192
186
|
request_options=request_options,
|
|
@@ -371,7 +365,6 @@ class AsyncTextClient:
|
|
|
371
365
|
speaker_gender: typing.Optional[TranslateSpeakerGender] = OMIT,
|
|
372
366
|
mode: typing.Optional[TranslateMode] = OMIT,
|
|
373
367
|
model: typing.Optional[TranslateModel] = OMIT,
|
|
374
|
-
enable_preprocessing: typing.Optional[bool] = OMIT,
|
|
375
368
|
output_script: typing.Optional[TransliterateMode] = OMIT,
|
|
376
369
|
numerals_format: typing.Optional[NumeralsFormat] = OMIT,
|
|
377
370
|
request_options: typing.Optional[RequestOptions] = None,
|
|
@@ -449,10 +442,6 @@ class AsyncTextClient:
|
|
|
449
442
|
- mayura:v1: Supports 12 languages with all modes, output scripts, and automatic language detection.
|
|
450
443
|
- sarvam-translate:v1: Supports all 22 scheduled languages of India, formal mode only.
|
|
451
444
|
|
|
452
|
-
enable_preprocessing : typing.Optional[bool]
|
|
453
|
-
This will enable custom preprocessing of the input text which can result in better translations.
|
|
454
|
-
Recommendation- You can switch on whenever there is some complex text with difficult vocabulary and sentences, for which you want simple translations that people can understand.
|
|
455
|
-
|
|
456
445
|
output_script : typing.Optional[TransliterateMode]
|
|
457
446
|
**output_script**: This is an optional parameter which controls the transliteration style applied to the output text.
|
|
458
447
|
|
|
@@ -518,7 +507,6 @@ class AsyncTextClient:
|
|
|
518
507
|
speaker_gender=speaker_gender,
|
|
519
508
|
mode=mode,
|
|
520
509
|
model=model,
|
|
521
|
-
enable_preprocessing=enable_preprocessing,
|
|
522
510
|
output_script=output_script,
|
|
523
511
|
numerals_format=numerals_format,
|
|
524
512
|
request_options=request_options,
|
sarvamai/text/raw_client.py
CHANGED
|
@@ -44,7 +44,6 @@ class RawTextClient:
|
|
|
44
44
|
speaker_gender: typing.Optional[TranslateSpeakerGender] = OMIT,
|
|
45
45
|
mode: typing.Optional[TranslateMode] = OMIT,
|
|
46
46
|
model: typing.Optional[TranslateModel] = OMIT,
|
|
47
|
-
enable_preprocessing: typing.Optional[bool] = OMIT,
|
|
48
47
|
output_script: typing.Optional[TransliterateMode] = OMIT,
|
|
49
48
|
numerals_format: typing.Optional[NumeralsFormat] = OMIT,
|
|
50
49
|
request_options: typing.Optional[RequestOptions] = None,
|
|
@@ -122,10 +121,6 @@ class RawTextClient:
|
|
|
122
121
|
- mayura:v1: Supports 12 languages with all modes, output scripts, and automatic language detection.
|
|
123
122
|
- sarvam-translate:v1: Supports all 22 scheduled languages of India, formal mode only.
|
|
124
123
|
|
|
125
|
-
enable_preprocessing : typing.Optional[bool]
|
|
126
|
-
This will enable custom preprocessing of the input text which can result in better translations.
|
|
127
|
-
Recommendation- You can switch on whenever there is some complex text with difficult vocabulary and sentences, for which you want simple translations that people can understand.
|
|
128
|
-
|
|
129
124
|
output_script : typing.Optional[TransliterateMode]
|
|
130
125
|
**output_script**: This is an optional parameter which controls the transliteration style applied to the output text.
|
|
131
126
|
|
|
@@ -174,7 +169,6 @@ class RawTextClient:
|
|
|
174
169
|
"speaker_gender": speaker_gender,
|
|
175
170
|
"mode": mode,
|
|
176
171
|
"model": model,
|
|
177
|
-
"enable_preprocessing": enable_preprocessing,
|
|
178
172
|
"output_script": output_script,
|
|
179
173
|
"numerals_format": numerals_format,
|
|
180
174
|
},
|
|
@@ -554,7 +548,6 @@ class AsyncRawTextClient:
|
|
|
554
548
|
speaker_gender: typing.Optional[TranslateSpeakerGender] = OMIT,
|
|
555
549
|
mode: typing.Optional[TranslateMode] = OMIT,
|
|
556
550
|
model: typing.Optional[TranslateModel] = OMIT,
|
|
557
|
-
enable_preprocessing: typing.Optional[bool] = OMIT,
|
|
558
551
|
output_script: typing.Optional[TransliterateMode] = OMIT,
|
|
559
552
|
numerals_format: typing.Optional[NumeralsFormat] = OMIT,
|
|
560
553
|
request_options: typing.Optional[RequestOptions] = None,
|
|
@@ -632,10 +625,6 @@ class AsyncRawTextClient:
|
|
|
632
625
|
- mayura:v1: Supports 12 languages with all modes, output scripts, and automatic language detection.
|
|
633
626
|
- sarvam-translate:v1: Supports all 22 scheduled languages of India, formal mode only.
|
|
634
627
|
|
|
635
|
-
enable_preprocessing : typing.Optional[bool]
|
|
636
|
-
This will enable custom preprocessing of the input text which can result in better translations.
|
|
637
|
-
Recommendation- You can switch on whenever there is some complex text with difficult vocabulary and sentences, for which you want simple translations that people can understand.
|
|
638
|
-
|
|
639
628
|
output_script : typing.Optional[TransliterateMode]
|
|
640
629
|
**output_script**: This is an optional parameter which controls the transliteration style applied to the output text.
|
|
641
630
|
|
|
@@ -684,7 +673,6 @@ class AsyncRawTextClient:
|
|
|
684
673
|
"speaker_gender": speaker_gender,
|
|
685
674
|
"mode": mode,
|
|
686
675
|
"model": model,
|
|
687
|
-
"enable_preprocessing": enable_preprocessing,
|
|
688
676
|
"output_script": output_script,
|
|
689
677
|
"numerals_format": numerals_format,
|
|
690
678
|
},
|
|
@@ -44,15 +44,38 @@ class TextToSpeechClient:
|
|
|
44
44
|
enable_preprocessing: typing.Optional[bool] = OMIT,
|
|
45
45
|
model: typing.Optional[TextToSpeechModel] = OMIT,
|
|
46
46
|
output_audio_codec: typing.Optional[TextToSpeechOutputAudioCodec] = OMIT,
|
|
47
|
+
temperature: typing.Optional[float] = OMIT,
|
|
47
48
|
request_options: typing.Optional[RequestOptions] = None,
|
|
48
49
|
) -> TextToSpeechResponse:
|
|
49
50
|
"""
|
|
50
|
-
|
|
51
|
-
|
|
51
|
+
Convert text into spoken audio. The output is a wave file encoded as a base64 string.
|
|
52
|
+
|
|
53
|
+
**Available Models:**
|
|
54
|
+
- **bulbul:v2** (default): Supports pitch, loudness, and pace controls
|
|
55
|
+
- **bulbul:v3-beta**: Newer model with temperature control and improved quality
|
|
56
|
+
|
|
57
|
+
**Important Notes for bulbul:v3-beta:**
|
|
58
|
+
- Pitch and loudness parameters are NOT supported
|
|
59
|
+
- Pace must be between 0.5 and 2.0
|
|
60
|
+
- Preprocessing is automatically enabled
|
|
61
|
+
- Default sample rate is 24000 Hz
|
|
62
|
+
- Temperature parameter available (0.01-1.0, default 0.6)
|
|
52
63
|
|
|
53
64
|
Parameters
|
|
54
65
|
----------
|
|
55
66
|
text : str
|
|
67
|
+
The text(s) to be converted into speech.
|
|
68
|
+
|
|
69
|
+
**Features:**
|
|
70
|
+
- Supports code-mixed text (English and Indic languages)
|
|
71
|
+
|
|
72
|
+
**Model-specific limits:**
|
|
73
|
+
- **bulbul:v2:** Max 1500 characters
|
|
74
|
+
- **bulbul:v3-beta:** Max 2500 characters
|
|
75
|
+
|
|
76
|
+
**Important Note:**
|
|
77
|
+
- For numbers larger than 4 digits, use commas (e.g., '10,000' instead of '10000')
|
|
78
|
+
- This ensures proper pronunciation as a whole number
|
|
56
79
|
|
|
57
80
|
target_language_code : TextToSpeechLanguage
|
|
58
81
|
The language of the text, in BCP-47 format
|
|
@@ -60,36 +83,63 @@ class TextToSpeechClient:
|
|
|
60
83
|
speaker : typing.Optional[TextToSpeechSpeaker]
|
|
61
84
|
The speaker voice to be used for the output audio.
|
|
62
85
|
|
|
63
|
-
**Default:** Anushka
|
|
86
|
+
**Default:** Anushka (for bulbul:v2), Aditya (for bulbul:v3-beta)
|
|
64
87
|
|
|
65
88
|
**Model Compatibility (Speakers compatible with respective model):**
|
|
66
89
|
- **bulbul:v2:**
|
|
67
90
|
- Female: Anushka, Manisha, Vidya, Arya
|
|
68
91
|
- Male: Abhilash, Karun, Hitesh
|
|
92
|
+
- **bulbul:v3-beta:**
|
|
93
|
+
- Aditya, Ritu, Priya, Neha, Rahul, Pooja, Rohan, Simran, Kavya, Amit, Dev, Ishita, Shreya, Ratan, Varun, Manan, Sumit, Roopa, Kabir, Aayan, Shubh, Ashutosh, Advait, Amelia, Sophia
|
|
69
94
|
|
|
70
95
|
**Note:** Speaker selection must match the chosen model version.
|
|
71
96
|
|
|
72
97
|
pitch : typing.Optional[float]
|
|
73
98
|
Controls the pitch of the audio. Lower values result in a deeper voice, while higher values make it sharper. The suitable range is between -0.75 and 0.75. Default is 0.0.
|
|
74
99
|
|
|
100
|
+
**Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
|
|
101
|
+
|
|
75
102
|
pace : typing.Optional[float]
|
|
76
|
-
Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster.
|
|
103
|
+
Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster. Default is 1.0.
|
|
104
|
+
|
|
105
|
+
**Model-specific ranges:**
|
|
106
|
+
- **bulbul:v2:** 0.3 to 3.0
|
|
107
|
+
- **bulbul:v3-beta:** 0.5 to 2.0
|
|
77
108
|
|
|
78
109
|
loudness : typing.Optional[float]
|
|
79
110
|
Controls the loudness of the audio. Lower values result in quieter audio, while higher values make it louder. The suitable range is between 0.3 and 3.0. Default is 1.0.
|
|
80
111
|
|
|
112
|
+
**Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
|
|
113
|
+
|
|
81
114
|
speech_sample_rate : typing.Optional[SpeechSampleRate]
|
|
82
|
-
Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz.
|
|
115
|
+
Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz.
|
|
116
|
+
|
|
117
|
+
**Model-specific defaults:**
|
|
118
|
+
- **bulbul:v2:** Default is 22050 Hz
|
|
119
|
+
- **bulbul:v3-beta:** Default is 24000 Hz
|
|
83
120
|
|
|
84
121
|
enable_preprocessing : typing.Optional[bool]
|
|
85
|
-
|
|
122
|
+
Controls whether normalization of English words and numeric entities (e.g., numbers, dates) is performed. Set to true for better handling of mixed-language text.
|
|
123
|
+
|
|
124
|
+
**Model-specific behavior:**
|
|
125
|
+
- **bulbul:v2:** Default is false
|
|
126
|
+
- **bulbul:v3-beta:** Automatically enabled (true) and cannot be disabled
|
|
86
127
|
|
|
87
128
|
model : typing.Optional[TextToSpeechModel]
|
|
88
|
-
Specifies the model to use for text-to-speech conversion.
|
|
129
|
+
Specifies the model to use for text-to-speech conversion.
|
|
130
|
+
|
|
131
|
+
**Available models:**
|
|
132
|
+
- **bulbul:v2:** Default model with pitch, loudness controls
|
|
133
|
+
- **bulbul:v3-beta:** Newer model with temperature control, improved quality
|
|
89
134
|
|
|
90
135
|
output_audio_codec : typing.Optional[TextToSpeechOutputAudioCodec]
|
|
91
136
|
Specifies the audio codec for the output audio file. Different codecs offer various compression and quality characteristics.
|
|
92
137
|
|
|
138
|
+
temperature : typing.Optional[float]
|
|
139
|
+
Controls the randomness of the output. Lower values make the output more focused and deterministic, while higher values make it more random. The suitable range is between 0.01 and 1.0. Default is 0.6.
|
|
140
|
+
|
|
141
|
+
**Note:** This parameter is only supported for bulbul:v3-beta. It has no effect on bulbul:v2.
|
|
142
|
+
|
|
93
143
|
request_options : typing.Optional[RequestOptions]
|
|
94
144
|
Request-specific configuration.
|
|
95
145
|
|
|
@@ -121,6 +171,7 @@ class TextToSpeechClient:
|
|
|
121
171
|
enable_preprocessing=enable_preprocessing,
|
|
122
172
|
model=model,
|
|
123
173
|
output_audio_codec=output_audio_codec,
|
|
174
|
+
temperature=temperature,
|
|
124
175
|
request_options=request_options,
|
|
125
176
|
)
|
|
126
177
|
return _response.data
|
|
@@ -154,15 +205,38 @@ class AsyncTextToSpeechClient:
|
|
|
154
205
|
enable_preprocessing: typing.Optional[bool] = OMIT,
|
|
155
206
|
model: typing.Optional[TextToSpeechModel] = OMIT,
|
|
156
207
|
output_audio_codec: typing.Optional[TextToSpeechOutputAudioCodec] = OMIT,
|
|
208
|
+
temperature: typing.Optional[float] = OMIT,
|
|
157
209
|
request_options: typing.Optional[RequestOptions] = None,
|
|
158
210
|
) -> TextToSpeechResponse:
|
|
159
211
|
"""
|
|
160
|
-
|
|
161
|
-
|
|
212
|
+
Convert text into spoken audio. The output is a wave file encoded as a base64 string.
|
|
213
|
+
|
|
214
|
+
**Available Models:**
|
|
215
|
+
- **bulbul:v2** (default): Supports pitch, loudness, and pace controls
|
|
216
|
+
- **bulbul:v3-beta**: Newer model with temperature control and improved quality
|
|
217
|
+
|
|
218
|
+
**Important Notes for bulbul:v3-beta:**
|
|
219
|
+
- Pitch and loudness parameters are NOT supported
|
|
220
|
+
- Pace must be between 0.5 and 2.0
|
|
221
|
+
- Preprocessing is automatically enabled
|
|
222
|
+
- Default sample rate is 24000 Hz
|
|
223
|
+
- Temperature parameter available (0.01-1.0, default 0.6)
|
|
162
224
|
|
|
163
225
|
Parameters
|
|
164
226
|
----------
|
|
165
227
|
text : str
|
|
228
|
+
The text(s) to be converted into speech.
|
|
229
|
+
|
|
230
|
+
**Features:**
|
|
231
|
+
- Supports code-mixed text (English and Indic languages)
|
|
232
|
+
|
|
233
|
+
**Model-specific limits:**
|
|
234
|
+
- **bulbul:v2:** Max 1500 characters
|
|
235
|
+
- **bulbul:v3-beta:** Max 2500 characters
|
|
236
|
+
|
|
237
|
+
**Important Note:**
|
|
238
|
+
- For numbers larger than 4 digits, use commas (e.g., '10,000' instead of '10000')
|
|
239
|
+
- This ensures proper pronunciation as a whole number
|
|
166
240
|
|
|
167
241
|
target_language_code : TextToSpeechLanguage
|
|
168
242
|
The language of the text, in BCP-47 format
|
|
@@ -170,36 +244,63 @@ class AsyncTextToSpeechClient:
|
|
|
170
244
|
speaker : typing.Optional[TextToSpeechSpeaker]
|
|
171
245
|
The speaker voice to be used for the output audio.
|
|
172
246
|
|
|
173
|
-
**Default:** Anushka
|
|
247
|
+
**Default:** Anushka (for bulbul:v2), Aditya (for bulbul:v3-beta)
|
|
174
248
|
|
|
175
249
|
**Model Compatibility (Speakers compatible with respective model):**
|
|
176
250
|
- **bulbul:v2:**
|
|
177
251
|
- Female: Anushka, Manisha, Vidya, Arya
|
|
178
252
|
- Male: Abhilash, Karun, Hitesh
|
|
253
|
+
- **bulbul:v3-beta:**
|
|
254
|
+
- Aditya, Ritu, Priya, Neha, Rahul, Pooja, Rohan, Simran, Kavya, Amit, Dev, Ishita, Shreya, Ratan, Varun, Manan, Sumit, Roopa, Kabir, Aayan, Shubh, Ashutosh, Advait, Amelia, Sophia
|
|
179
255
|
|
|
180
256
|
**Note:** Speaker selection must match the chosen model version.
|
|
181
257
|
|
|
182
258
|
pitch : typing.Optional[float]
|
|
183
259
|
Controls the pitch of the audio. Lower values result in a deeper voice, while higher values make it sharper. The suitable range is between -0.75 and 0.75. Default is 0.0.
|
|
184
260
|
|
|
261
|
+
**Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
|
|
262
|
+
|
|
185
263
|
pace : typing.Optional[float]
|
|
186
|
-
Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster.
|
|
264
|
+
Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster. Default is 1.0.
|
|
265
|
+
|
|
266
|
+
**Model-specific ranges:**
|
|
267
|
+
- **bulbul:v2:** 0.3 to 3.0
|
|
268
|
+
- **bulbul:v3-beta:** 0.5 to 2.0
|
|
187
269
|
|
|
188
270
|
loudness : typing.Optional[float]
|
|
189
271
|
Controls the loudness of the audio. Lower values result in quieter audio, while higher values make it louder. The suitable range is between 0.3 and 3.0. Default is 1.0.
|
|
190
272
|
|
|
273
|
+
**Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
|
|
274
|
+
|
|
191
275
|
speech_sample_rate : typing.Optional[SpeechSampleRate]
|
|
192
|
-
Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz.
|
|
276
|
+
Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz.
|
|
277
|
+
|
|
278
|
+
**Model-specific defaults:**
|
|
279
|
+
- **bulbul:v2:** Default is 22050 Hz
|
|
280
|
+
- **bulbul:v3-beta:** Default is 24000 Hz
|
|
193
281
|
|
|
194
282
|
enable_preprocessing : typing.Optional[bool]
|
|
195
|
-
|
|
283
|
+
Controls whether normalization of English words and numeric entities (e.g., numbers, dates) is performed. Set to true for better handling of mixed-language text.
|
|
284
|
+
|
|
285
|
+
**Model-specific behavior:**
|
|
286
|
+
- **bulbul:v2:** Default is false
|
|
287
|
+
- **bulbul:v3-beta:** Automatically enabled (true) and cannot be disabled
|
|
196
288
|
|
|
197
289
|
model : typing.Optional[TextToSpeechModel]
|
|
198
|
-
Specifies the model to use for text-to-speech conversion.
|
|
290
|
+
Specifies the model to use for text-to-speech conversion.
|
|
291
|
+
|
|
292
|
+
**Available models:**
|
|
293
|
+
- **bulbul:v2:** Default model with pitch, loudness controls
|
|
294
|
+
- **bulbul:v3-beta:** Newer model with temperature control, improved quality
|
|
199
295
|
|
|
200
296
|
output_audio_codec : typing.Optional[TextToSpeechOutputAudioCodec]
|
|
201
297
|
Specifies the audio codec for the output audio file. Different codecs offer various compression and quality characteristics.
|
|
202
298
|
|
|
299
|
+
temperature : typing.Optional[float]
|
|
300
|
+
Controls the randomness of the output. Lower values make the output more focused and deterministic, while higher values make it more random. The suitable range is between 0.01 and 1.0. Default is 0.6.
|
|
301
|
+
|
|
302
|
+
**Note:** This parameter is only supported for bulbul:v3-beta. It has no effect on bulbul:v2.
|
|
303
|
+
|
|
203
304
|
request_options : typing.Optional[RequestOptions]
|
|
204
305
|
Request-specific configuration.
|
|
205
306
|
|
|
@@ -239,6 +340,7 @@ class AsyncTextToSpeechClient:
|
|
|
239
340
|
enable_preprocessing=enable_preprocessing,
|
|
240
341
|
model=model,
|
|
241
342
|
output_audio_codec=output_audio_codec,
|
|
343
|
+
temperature=temperature,
|
|
242
344
|
request_options=request_options,
|
|
243
345
|
)
|
|
244
346
|
return _response.data
|