sarvamai 0.1.22a4__py3-none-any.whl → 0.1.22a7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. sarvamai/__init__.py +62 -3
  2. sarvamai/client.py +3 -0
  3. sarvamai/core/client_wrapper.py +2 -2
  4. sarvamai/doc_digitization_job/__init__.py +4 -0
  5. sarvamai/doc_digitization_job/client.py +776 -0
  6. sarvamai/doc_digitization_job/job.py +496 -0
  7. sarvamai/doc_digitization_job/raw_client.py +1176 -0
  8. sarvamai/requests/__init__.py +20 -0
  9. sarvamai/requests/audio_data.py +0 -6
  10. sarvamai/requests/configure_connection.py +4 -0
  11. sarvamai/requests/configure_connection_data.py +40 -11
  12. sarvamai/requests/doc_digitization_create_job_response.py +25 -0
  13. sarvamai/requests/doc_digitization_download_files_response.py +37 -0
  14. sarvamai/requests/doc_digitization_error_details.py +21 -0
  15. sarvamai/requests/doc_digitization_error_message.py +11 -0
  16. sarvamai/requests/doc_digitization_job_detail.py +64 -0
  17. sarvamai/requests/doc_digitization_job_parameters.py +21 -0
  18. sarvamai/requests/doc_digitization_job_status_response.py +65 -0
  19. sarvamai/requests/doc_digitization_page_error.py +24 -0
  20. sarvamai/requests/doc_digitization_upload_files_response.py +34 -0
  21. sarvamai/requests/doc_digitization_webhook_callback.py +19 -0
  22. sarvamai/requests/speech_to_text_job_parameters.py +43 -2
  23. sarvamai/requests/speech_to_text_translate_job_parameters.py +4 -1
  24. sarvamai/speech_to_text/client.py +95 -10
  25. sarvamai/speech_to_text/raw_client.py +95 -10
  26. sarvamai/speech_to_text_job/client.py +60 -15
  27. sarvamai/speech_to_text_streaming/__init__.py +4 -0
  28. sarvamai/speech_to_text_streaming/client.py +102 -18
  29. sarvamai/speech_to_text_streaming/raw_client.py +102 -18
  30. sarvamai/speech_to_text_streaming/types/__init__.py +4 -0
  31. sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_input_audio_codec.py +1 -27
  32. sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_mode.py +7 -0
  33. sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_model.py +5 -0
  34. sarvamai/speech_to_text_translate_streaming/client.py +20 -12
  35. sarvamai/speech_to_text_translate_streaming/raw_client.py +20 -12
  36. sarvamai/speech_to_text_translate_streaming/types/speech_to_text_translate_streaming_input_audio_codec.py +1 -27
  37. sarvamai/text/client.py +0 -12
  38. sarvamai/text/raw_client.py +0 -12
  39. sarvamai/text_to_speech/client.py +116 -14
  40. sarvamai/text_to_speech/raw_client.py +116 -14
  41. sarvamai/text_to_speech_streaming/__init__.py +2 -2
  42. sarvamai/text_to_speech_streaming/client.py +19 -6
  43. sarvamai/text_to_speech_streaming/raw_client.py +19 -6
  44. sarvamai/text_to_speech_streaming/types/__init__.py +2 -1
  45. sarvamai/text_to_speech_streaming/types/text_to_speech_streaming_model.py +5 -0
  46. sarvamai/types/__init__.py +34 -2
  47. sarvamai/types/audio_data.py +0 -6
  48. sarvamai/types/configure_connection.py +4 -0
  49. sarvamai/types/configure_connection_data.py +40 -11
  50. sarvamai/types/configure_connection_data_model.py +5 -0
  51. sarvamai/types/configure_connection_data_speaker.py +35 -1
  52. sarvamai/types/doc_digitization_create_job_response.py +37 -0
  53. sarvamai/types/doc_digitization_download_files_response.py +47 -0
  54. sarvamai/types/doc_digitization_error_code.py +15 -0
  55. sarvamai/types/doc_digitization_error_details.py +33 -0
  56. sarvamai/types/doc_digitization_error_message.py +23 -0
  57. sarvamai/types/doc_digitization_job_detail.py +74 -0
  58. sarvamai/types/doc_digitization_job_detail_state.py +7 -0
  59. sarvamai/types/doc_digitization_job_parameters.py +33 -0
  60. sarvamai/types/doc_digitization_job_state.py +7 -0
  61. sarvamai/types/doc_digitization_job_status_response.py +75 -0
  62. sarvamai/types/doc_digitization_output_format.py +5 -0
  63. sarvamai/types/doc_digitization_page_error.py +36 -0
  64. sarvamai/types/doc_digitization_supported_language.py +32 -0
  65. sarvamai/types/doc_digitization_upload_files_response.py +44 -0
  66. sarvamai/types/doc_digitization_webhook_callback.py +31 -0
  67. sarvamai/types/mode.py +5 -0
  68. sarvamai/types/speech_to_text_job_parameters.py +43 -2
  69. sarvamai/types/speech_to_text_model.py +1 -1
  70. sarvamai/types/speech_to_text_translate_job_parameters.py +4 -1
  71. sarvamai/types/text_to_speech_model.py +1 -1
  72. sarvamai/types/text_to_speech_speaker.py +35 -1
  73. {sarvamai-0.1.22a4.dist-info → sarvamai-0.1.22a7.dist-info}/METADATA +1 -1
  74. {sarvamai-0.1.22a4.dist-info → sarvamai-0.1.22a7.dist-info}/RECORD +75 -42
  75. sarvamai/types/audio_data_input_audio_codec.py +0 -33
  76. {sarvamai-0.1.22a4.dist-info → sarvamai-0.1.22a7.dist-info}/WHEEL +0 -0
sarvamai/speech_to_text_translate_streaming/raw_client.py CHANGED
@@ -32,11 +32,11 @@ class RawSpeechToTextTranslateStreamingClient:
  self,
  *,
  model: typing.Optional[typing.Literal["saaras:v2.5"]] = None,
- input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
  sample_rate: typing.Optional[str] = None,
  high_vad_sensitivity: typing.Optional[SpeechToTextTranslateStreamingHighVadSensitivity] = None,
  vad_signals: typing.Optional[SpeechToTextTranslateStreamingVadSignals] = None,
  flush_signal: typing.Optional[SpeechToTextTranslateStreamingFlushSignal] = None,
+ input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
  api_subscription_key: typing.Optional[str] = None,
  request_options: typing.Optional[RequestOptions] = None,
  ) -> typing.Iterator[SpeechToTextTranslateStreamingSocketClient]:
@@ -50,10 +50,10 @@ class RawSpeechToTextTranslateStreamingClient:
  Parameters
  ----------
  model : typing.Optional[typing.Literal["saaras:v2.5"]]
- Speech to text model to use (defaults to "saaras:v2.5" if not specified)
+ Model to be used for speech to text translation.

- input_audio_codec : typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec]
- Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files supports sample rate 16000 and 8000.
+ - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
+ - Example: Hindi audio → English text output

  sample_rate : typing.Optional[str]
  Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
@@ -67,6 +67,10 @@ class RawSpeechToTextTranslateStreamingClient:
  flush_signal : typing.Optional[SpeechToTextTranslateStreamingFlushSignal]
  Signal to flush the audio buffer and finalize transcription and translation

+ input_audio_codec : typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec]
+ Audio codec/format of the input stream. Use this when sending raw PCM audio.
+ Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
+
  api_subscription_key : typing.Optional[str]
  API subscription key for authentication

@@ -81,8 +85,6 @@ class RawSpeechToTextTranslateStreamingClient:
  query_params = httpx.QueryParams()
  if model is not None:
  query_params = query_params.add("model", model)
- if input_audio_codec is not None:
- query_params = query_params.add("input_audio_codec", input_audio_codec)
  if sample_rate is not None:
  query_params = query_params.add("sample_rate", sample_rate)
  if high_vad_sensitivity is not None:
@@ -91,6 +93,8 @@ class RawSpeechToTextTranslateStreamingClient:
  query_params = query_params.add("vad_signals", vad_signals)
  if flush_signal is not None:
  query_params = query_params.add("flush_signal", flush_signal)
+ if input_audio_codec is not None:
+ query_params = query_params.add("input_audio_codec", input_audio_codec)
  ws_url = ws_url + f"?{query_params}"
  headers = self._client_wrapper.get_headers()
  if api_subscription_key is not None:
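The hunks above only move where input_audio_codec is appended to the WebSocket query string; the construction pattern itself is unchanged. A minimal sketch of that pattern, using a placeholder URL rather than the real endpoint:

```python
import httpx

# Mirror of the query-string construction shown above: each parameter is added
# only when set, and input_audio_codec is now appended after flush_signal.
query_params = httpx.QueryParams()
query_params = query_params.add("model", "saaras:v2.5")
query_params = query_params.add("sample_rate", "16000")
query_params = query_params.add("input_audio_codec", "pcm_s16le")

ws_url = "wss://example.invalid/translate" + f"?{query_params}"
print(ws_url)
# wss://example.invalid/translate?model=saaras%3Av2.5&sample_rate=16000&input_audio_codec=pcm_s16le
```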
@@ -124,11 +128,11 @@ class AsyncRawSpeechToTextTranslateStreamingClient:
  self,
  *,
  model: typing.Optional[typing.Literal["saaras:v2.5"]] = None,
- input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
  sample_rate: typing.Optional[str] = None,
  high_vad_sensitivity: typing.Optional[SpeechToTextTranslateStreamingHighVadSensitivity] = None,
  vad_signals: typing.Optional[SpeechToTextTranslateStreamingVadSignals] = None,
  flush_signal: typing.Optional[SpeechToTextTranslateStreamingFlushSignal] = None,
+ input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
  api_subscription_key: typing.Optional[str] = None,
  request_options: typing.Optional[RequestOptions] = None,
  ) -> typing.AsyncIterator[AsyncSpeechToTextTranslateStreamingSocketClient]:
@@ -142,10 +146,10 @@ class AsyncRawSpeechToTextTranslateStreamingClient:
  Parameters
  ----------
  model : typing.Optional[typing.Literal["saaras:v2.5"]]
- Speech to text model to use (defaults to "saaras:v2.5" if not specified)
+ Model to be used for speech to text translation.

- input_audio_codec : typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec]
- Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files supports sample rate 16000 and 8000.
+ - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
+ - Example: Hindi audio → English text output

  sample_rate : typing.Optional[str]
  Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
@@ -159,6 +163,10 @@ class AsyncRawSpeechToTextTranslateStreamingClient:
  flush_signal : typing.Optional[SpeechToTextTranslateStreamingFlushSignal]
  Signal to flush the audio buffer and finalize transcription and translation

+ input_audio_codec : typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec]
+ Audio codec/format of the input stream. Use this when sending raw PCM audio.
+ Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
+
  api_subscription_key : typing.Optional[str]
  API subscription key for authentication

@@ -173,8 +181,6 @@ class AsyncRawSpeechToTextTranslateStreamingClient:
  query_params = httpx.QueryParams()
  if model is not None:
  query_params = query_params.add("model", model)
- if input_audio_codec is not None:
- query_params = query_params.add("input_audio_codec", input_audio_codec)
  if sample_rate is not None:
  query_params = query_params.add("sample_rate", sample_rate)
  if high_vad_sensitivity is not None:
@@ -183,6 +189,8 @@ class AsyncRawSpeechToTextTranslateStreamingClient:
  query_params = query_params.add("vad_signals", vad_signals)
  if flush_signal is not None:
  query_params = query_params.add("flush_signal", flush_signal)
+ if input_audio_codec is not None:
+ query_params = query_params.add("input_audio_codec", input_audio_codec)
  ws_url = ws_url + f"?{query_params}"
  headers = self._client_wrapper.get_headers()
  if api_subscription_key is not None:
sarvamai/speech_to_text_translate_streaming/types/speech_to_text_translate_streaming_input_audio_codec.py CHANGED
@@ -3,31 +3,5 @@
  import typing

  SpeechToTextTranslateStreamingInputAudioCodec = typing.Union[
- typing.Literal[
- "wav",
- "x-wav",
- "wave",
- "mp3",
- "mpeg",
- "mpeg3",
- "x-mp3",
- "x-mpeg-3",
- "aac",
- "x-aac",
- "aiff",
- "x-aiff",
- "ogg",
- "opus",
- "flac",
- "x-flac",
- "mp4",
- "x-m4a",
- "amr",
- "x-ms-wma",
- "webm",
- "pcm_s16le",
- "pcm_l16",
- "pcm_raw",
- ],
- typing.Any,
+ typing.Literal["wav", "pcm_s16le", "pcm_l16", "pcm_raw"], typing.Any
  ]
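The narrowed codec union mainly matters when streaming raw PCM; the compressed container formats are no longer listed. A hedged sketch of passing the parameter at connection time, assuming the public speech_to_text_translate_streaming client exposes a connect(...) context manager mirroring the raw client signature above (the method name and the SARVAM_API_KEY environment variable are assumptions, not shown in this diff):

```python
import os
from sarvamai import SarvamAI

client = SarvamAI(api_subscription_key=os.environ["SARVAM_API_KEY"])

# Assumed connect() context manager mirroring the raw client's parameters above.
# input_audio_codec is only needed for raw PCM input; per the new docstring the
# accepted values are wav, pcm_s16le, pcm_l16 and pcm_raw.
with client.speech_to_text_translate_streaming.connect(
    model="saaras:v2.5",
    input_audio_codec="pcm_s16le",
    sample_rate="16000",
) as ws:
    ...  # send PCM chunks, read translated transcripts from the socket client
```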
sarvamai/text/client.py CHANGED
@@ -47,7 +47,6 @@ class TextClient:
  speaker_gender: typing.Optional[TranslateSpeakerGender] = OMIT,
  mode: typing.Optional[TranslateMode] = OMIT,
  model: typing.Optional[TranslateModel] = OMIT,
- enable_preprocessing: typing.Optional[bool] = OMIT,
  output_script: typing.Optional[TransliterateMode] = OMIT,
  numerals_format: typing.Optional[NumeralsFormat] = OMIT,
  request_options: typing.Optional[RequestOptions] = None,
@@ -125,10 +124,6 @@ class TextClient:
  - mayura:v1: Supports 12 languages with all modes, output scripts, and automatic language detection.
  - sarvam-translate:v1: Supports all 22 scheduled languages of India, formal mode only.

- enable_preprocessing : typing.Optional[bool]
- This will enable custom preprocessing of the input text which can result in better translations.
- Recommendation- You can switch on whenever there is some complex text with difficult vocabulary and sentences, for which you want simple translations that people can understand.
-
  output_script : typing.Optional[TransliterateMode]
  **output_script**: This is an optional parameter which controls the transliteration style applied to the output text.

@@ -186,7 +181,6 @@ class TextClient:
  speaker_gender=speaker_gender,
  mode=mode,
  model=model,
- enable_preprocessing=enable_preprocessing,
  output_script=output_script,
  numerals_format=numerals_format,
  request_options=request_options,
@@ -371,7 +365,6 @@ class AsyncTextClient:
  speaker_gender: typing.Optional[TranslateSpeakerGender] = OMIT,
  mode: typing.Optional[TranslateMode] = OMIT,
  model: typing.Optional[TranslateModel] = OMIT,
- enable_preprocessing: typing.Optional[bool] = OMIT,
  output_script: typing.Optional[TransliterateMode] = OMIT,
  numerals_format: typing.Optional[NumeralsFormat] = OMIT,
  request_options: typing.Optional[RequestOptions] = None,
@@ -449,10 +442,6 @@ class AsyncTextClient:
  - mayura:v1: Supports 12 languages with all modes, output scripts, and automatic language detection.
  - sarvam-translate:v1: Supports all 22 scheduled languages of India, formal mode only.

- enable_preprocessing : typing.Optional[bool]
- This will enable custom preprocessing of the input text which can result in better translations.
- Recommendation- You can switch on whenever there is some complex text with difficult vocabulary and sentences, for which you want simple translations that people can understand.
-
  output_script : typing.Optional[TransliterateMode]
  **output_script**: This is an optional parameter which controls the transliteration style applied to the output text.

@@ -518,7 +507,6 @@ class AsyncTextClient:
  speaker_gender=speaker_gender,
  mode=mode,
  model=model,
- enable_preprocessing=enable_preprocessing,
  output_script=output_script,
  numerals_format=numerals_format,
  request_options=request_options,
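With enable_preprocessing removed from TextClient and AsyncTextClient, callers that still pass it will now get a TypeError; the remaining keywords are unchanged. A minimal sketch of a 0.1.22a7-compatible call, assuming the translate method and its input/source/target arguments documented for the public SDK (they are not part of this diff):

```python
from sarvamai import SarvamAI

client = SarvamAI(api_subscription_key="YOUR_SARVAM_KEY")  # placeholder key

# enable_preprocessing has been removed in 0.1.22a7; do not pass it anymore.
response = client.text.translate(
    input="मौसम आज बहुत अच्छा है।",
    source_language_code="hi-IN",
    target_language_code="en-IN",
    model="mayura:v1",                # or "sarvam-translate:v1" (formal mode only)
    numerals_format="international",  # assumed enum value from the SDK docs
)
print(response)
```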
sarvamai/text/raw_client.py CHANGED
@@ -44,7 +44,6 @@ class RawTextClient:
  speaker_gender: typing.Optional[TranslateSpeakerGender] = OMIT,
  mode: typing.Optional[TranslateMode] = OMIT,
  model: typing.Optional[TranslateModel] = OMIT,
- enable_preprocessing: typing.Optional[bool] = OMIT,
  output_script: typing.Optional[TransliterateMode] = OMIT,
  numerals_format: typing.Optional[NumeralsFormat] = OMIT,
  request_options: typing.Optional[RequestOptions] = None,
@@ -122,10 +121,6 @@ class RawTextClient:
  - mayura:v1: Supports 12 languages with all modes, output scripts, and automatic language detection.
  - sarvam-translate:v1: Supports all 22 scheduled languages of India, formal mode only.

- enable_preprocessing : typing.Optional[bool]
- This will enable custom preprocessing of the input text which can result in better translations.
- Recommendation- You can switch on whenever there is some complex text with difficult vocabulary and sentences, for which you want simple translations that people can understand.
-
  output_script : typing.Optional[TransliterateMode]
  **output_script**: This is an optional parameter which controls the transliteration style applied to the output text.

@@ -174,7 +169,6 @@ class RawTextClient:
  "speaker_gender": speaker_gender,
  "mode": mode,
  "model": model,
- "enable_preprocessing": enable_preprocessing,
  "output_script": output_script,
  "numerals_format": numerals_format,
  },
@@ -554,7 +548,6 @@ class AsyncRawTextClient:
  speaker_gender: typing.Optional[TranslateSpeakerGender] = OMIT,
  mode: typing.Optional[TranslateMode] = OMIT,
  model: typing.Optional[TranslateModel] = OMIT,
- enable_preprocessing: typing.Optional[bool] = OMIT,
  output_script: typing.Optional[TransliterateMode] = OMIT,
  numerals_format: typing.Optional[NumeralsFormat] = OMIT,
  request_options: typing.Optional[RequestOptions] = None,
@@ -632,10 +625,6 @@ class AsyncRawTextClient:
  - mayura:v1: Supports 12 languages with all modes, output scripts, and automatic language detection.
  - sarvam-translate:v1: Supports all 22 scheduled languages of India, formal mode only.

- enable_preprocessing : typing.Optional[bool]
- This will enable custom preprocessing of the input text which can result in better translations.
- Recommendation- You can switch on whenever there is some complex text with difficult vocabulary and sentences, for which you want simple translations that people can understand.
-
  output_script : typing.Optional[TransliterateMode]
  **output_script**: This is an optional parameter which controls the transliteration style applied to the output text.

@@ -684,7 +673,6 @@ class AsyncRawTextClient:
  "speaker_gender": speaker_gender,
  "mode": mode,
  "model": model,
- "enable_preprocessing": enable_preprocessing,
  "output_script": output_script,
  "numerals_format": numerals_format,
  },
sarvamai/text_to_speech/client.py CHANGED
@@ -44,15 +44,38 @@ class TextToSpeechClient:
  enable_preprocessing: typing.Optional[bool] = OMIT,
  model: typing.Optional[TextToSpeechModel] = OMIT,
  output_audio_codec: typing.Optional[TextToSpeechOutputAudioCodec] = OMIT,
+ temperature: typing.Optional[float] = OMIT,
  request_options: typing.Optional[RequestOptions] = None,
  ) -> TextToSpeechResponse:
  """
- This is the model to convert text into spoken audio.
- The output is a wave file encoded as a base64 string.
+ Convert text into spoken audio. The output is a wave file encoded as a base64 string.
+
+ **Available Models:**
+ - **bulbul:v2** (default): Supports pitch, loudness, and pace controls
+ - **bulbul:v3-beta**: Newer model with temperature control and improved quality
+
+ **Important Notes for bulbul:v3-beta:**
+ - Pitch and loudness parameters are NOT supported
+ - Pace must be between 0.5 and 2.0
+ - Preprocessing is automatically enabled
+ - Default sample rate is 24000 Hz
+ - Temperature parameter available (0.01-1.0, default 0.6)

  Parameters
  ----------
  text : str
+ The text(s) to be converted into speech.
+
+ **Features:**
+ - Supports code-mixed text (English and Indic languages)
+
+ **Model-specific limits:**
+ - **bulbul:v2:** Max 1500 characters
+ - **bulbul:v3-beta:** Max 2500 characters
+
+ **Important Note:**
+ - For numbers larger than 4 digits, use commas (e.g., '10,000' instead of '10000')
+ - This ensures proper pronunciation as a whole number

  target_language_code : TextToSpeechLanguage
  The language of the text is BCP-47 format
@@ -60,36 +83,63 @@ class TextToSpeechClient:
  speaker : typing.Optional[TextToSpeechSpeaker]
  The speaker voice to be used for the output audio.

- **Default:** Anushka
+ **Default:** Anushka (for bulbul:v2), Aditya (for bulbul:v3-beta)

  **Model Compatibility (Speakers compatible with respective model):**
  - **bulbul:v2:**
  - Female: Anushka, Manisha, Vidya, Arya
  - Male: Abhilash, Karun, Hitesh
+ - **bulbul:v3-beta:**
+ - Aditya, Ritu, Priya, Neha, Rahul, Pooja, Rohan, Simran, Kavya, Amit, Dev, Ishita, Shreya, Ratan, Varun, Manan, Sumit, Roopa, Kabir, Aayan, Shubh, Ashutosh, Advait, Amelia, Sophia

  **Note:** Speaker selection must match the chosen model version.

  pitch : typing.Optional[float]
  Controls the pitch of the audio. Lower values result in a deeper voice, while higher values make it sharper. The suitable range is between -0.75 and 0.75. Default is 0.0.

+ **Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
+
  pace : typing.Optional[float]
- Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster. The suitable range is between 0.5 and 2.0. Default is 1.0.
+ Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster. Default is 1.0.
+
+ **Model-specific ranges:**
+ - **bulbul:v2:** 0.3 to 3.0
+ - **bulbul:v3-beta:** 0.5 to 2.0

  loudness : typing.Optional[float]
  Controls the loudness of the audio. Lower values result in quieter audio, while higher values make it louder. The suitable range is between 0.3 and 3.0. Default is 1.0.

+ **Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
+
  speech_sample_rate : typing.Optional[SpeechSampleRate]
- Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz. If not provided, the default is 22050 Hz.
+ Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz.
+
+ **Model-specific defaults:**
+ - **bulbul:v2:** Default is 22050 Hz
+ - **bulbul:v3-beta:** Default is 24000 Hz

  enable_preprocessing : typing.Optional[bool]
- Controls whether normalization of English words and numeric entities (e.g., numbers, dates) is performed. Set to true for better handling of mixed-language text. Default is false.
+ Controls whether normalization of English words and numeric entities (e.g., numbers, dates) is performed. Set to true for better handling of mixed-language text.
+
+ **Model-specific behavior:**
+ - **bulbul:v2:** Default is false
+ - **bulbul:v3-beta:** Automatically enabled (true) and cannot be disabled

  model : typing.Optional[TextToSpeechModel]
- Specifies the model to use for text-to-speech conversion. Default is bulbul:v2.
+ Specifies the model to use for text-to-speech conversion.
+
+ **Available models:**
+ - **bulbul:v2:** Default model with pitch, loudness controls
+ - **bulbul:v3-beta:** Newer model with temperature control, improved quality

  output_audio_codec : typing.Optional[TextToSpeechOutputAudioCodec]
  Specifies the audio codec for the output audio file. Different codecs offer various compression and quality characteristics.

+ temperature : typing.Optional[float]
+ Controls the randomness of the output. Lower values make the output more focused and deterministic, while higher values make it more random. The suitable range is between 0.01 and 1.0. Default is 0.6.
+
+ **Note:** This parameter is only supported for bulbul:v3-beta. It has no effect on bulbul:v2.
+
  request_options : typing.Optional[RequestOptions]
  Request-specific configuration.

@@ -121,6 +171,7 @@ class TextToSpeechClient:
  enable_preprocessing=enable_preprocessing,
  model=model,
  output_audio_codec=output_audio_codec,
+ temperature=temperature,
  request_options=request_options,
  )
  return _response.data
@@ -154,15 +205,38 @@ class AsyncTextToSpeechClient:
  enable_preprocessing: typing.Optional[bool] = OMIT,
  model: typing.Optional[TextToSpeechModel] = OMIT,
  output_audio_codec: typing.Optional[TextToSpeechOutputAudioCodec] = OMIT,
+ temperature: typing.Optional[float] = OMIT,
  request_options: typing.Optional[RequestOptions] = None,
  ) -> TextToSpeechResponse:
  """
- This is the model to convert text into spoken audio.
- The output is a wave file encoded as a base64 string.
+ Convert text into spoken audio. The output is a wave file encoded as a base64 string.
+
+ **Available Models:**
+ - **bulbul:v2** (default): Supports pitch, loudness, and pace controls
+ - **bulbul:v3-beta**: Newer model with temperature control and improved quality
+
+ **Important Notes for bulbul:v3-beta:**
+ - Pitch and loudness parameters are NOT supported
+ - Pace must be between 0.5 and 2.0
+ - Preprocessing is automatically enabled
+ - Default sample rate is 24000 Hz
+ - Temperature parameter available (0.01-1.0, default 0.6)

  Parameters
  ----------
  text : str
+ The text(s) to be converted into speech.
+
+ **Features:**
+ - Supports code-mixed text (English and Indic languages)
+
+ **Model-specific limits:**
+ - **bulbul:v2:** Max 1500 characters
+ - **bulbul:v3-beta:** Max 2500 characters
+
+ **Important Note:**
+ - For numbers larger than 4 digits, use commas (e.g., '10,000' instead of '10000')
+ - This ensures proper pronunciation as a whole number

  target_language_code : TextToSpeechLanguage
  The language of the text is BCP-47 format
@@ -170,36 +244,63 @@ class AsyncTextToSpeechClient:
  speaker : typing.Optional[TextToSpeechSpeaker]
  The speaker voice to be used for the output audio.

- **Default:** Anushka
+ **Default:** Anushka (for bulbul:v2), Aditya (for bulbul:v3-beta)

  **Model Compatibility (Speakers compatible with respective model):**
  - **bulbul:v2:**
  - Female: Anushka, Manisha, Vidya, Arya
  - Male: Abhilash, Karun, Hitesh
+ - **bulbul:v3-beta:**
+ - Aditya, Ritu, Priya, Neha, Rahul, Pooja, Rohan, Simran, Kavya, Amit, Dev, Ishita, Shreya, Ratan, Varun, Manan, Sumit, Roopa, Kabir, Aayan, Shubh, Ashutosh, Advait, Amelia, Sophia

  **Note:** Speaker selection must match the chosen model version.

  pitch : typing.Optional[float]
  Controls the pitch of the audio. Lower values result in a deeper voice, while higher values make it sharper. The suitable range is between -0.75 and 0.75. Default is 0.0.

+ **Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
+
  pace : typing.Optional[float]
- Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster. The suitable range is between 0.5 and 2.0. Default is 1.0.
+ Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster. Default is 1.0.
+
+ **Model-specific ranges:**
+ - **bulbul:v2:** 0.3 to 3.0
+ - **bulbul:v3-beta:** 0.5 to 2.0

  loudness : typing.Optional[float]
  Controls the loudness of the audio. Lower values result in quieter audio, while higher values make it louder. The suitable range is between 0.3 and 3.0. Default is 1.0.

+ **Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
+
  speech_sample_rate : typing.Optional[SpeechSampleRate]
- Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz. If not provided, the default is 22050 Hz.
+ Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz.
+
+ **Model-specific defaults:**
+ - **bulbul:v2:** Default is 22050 Hz
+ - **bulbul:v3-beta:** Default is 24000 Hz

  enable_preprocessing : typing.Optional[bool]
- Controls whether normalization of English words and numeric entities (e.g., numbers, dates) is performed. Set to true for better handling of mixed-language text. Default is false.
+ Controls whether normalization of English words and numeric entities (e.g., numbers, dates) is performed. Set to true for better handling of mixed-language text.
+
+ **Model-specific behavior:**
+ - **bulbul:v2:** Default is false
+ - **bulbul:v3-beta:** Automatically enabled (true) and cannot be disabled

  model : typing.Optional[TextToSpeechModel]
- Specifies the model to use for text-to-speech conversion. Default is bulbul:v2.
+ Specifies the model to use for text-to-speech conversion.
+
+ **Available models:**
+ - **bulbul:v2:** Default model with pitch, loudness controls
+ - **bulbul:v3-beta:** Newer model with temperature control, improved quality

  output_audio_codec : typing.Optional[TextToSpeechOutputAudioCodec]
  Specifies the audio codec for the output audio file. Different codecs offer various compression and quality characteristics.

+ temperature : typing.Optional[float]
+ Controls the randomness of the output. Lower values make the output more focused and deterministic, while higher values make it more random. The suitable range is between 0.01 and 1.0. Default is 0.6.
+
+ **Note:** This parameter is only supported for bulbul:v3-beta. It has no effect on bulbul:v2.
+
  request_options : typing.Optional[RequestOptions]
  Request-specific configuration.

@@ -239,6 +340,7 @@ class AsyncTextToSpeechClient:
  enable_preprocessing=enable_preprocessing,
  model=model,
  output_audio_codec=output_audio_codec,
+ temperature=temperature,
  request_options=request_options,
  )
  return _response.data
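Taken together, these hunks wire the new temperature field through the sync and async clients to the request body. A hedged usage sketch for bulbul:v3-beta, assuming the public client's convert(...) method and an audios list of base64 WAV strings on TextToSpeechResponse (neither detail is shown in this diff):

```python
import base64
from sarvamai import SarvamAI

client = SarvamAI(api_subscription_key="YOUR_SARVAM_KEY")  # placeholder key

# bulbul:v3-beta: no pitch/loudness, pace must stay within 0.5-2.0,
# temperature accepts 0.01-1.0 (default 0.6), default sample rate is 24000 Hz.
response = client.text_to_speech.convert(
    text="नमस्ते, आपका दिन शुभ हो।",
    target_language_code="hi-IN",
    model="bulbul:v3-beta",
    pace=1.1,
    temperature=0.4,
)

# Assumed response shape: a list of base64-encoded WAV payloads.
with open("output.wav", "wb") as f:
    f.write(base64.b64decode(response.audios[0]))
```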