sarvamai 0.1.22a4__py3-none-any.whl → 0.1.22a7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. sarvamai/__init__.py +62 -3
  2. sarvamai/client.py +3 -0
  3. sarvamai/core/client_wrapper.py +2 -2
  4. sarvamai/doc_digitization_job/__init__.py +4 -0
  5. sarvamai/doc_digitization_job/client.py +776 -0
  6. sarvamai/doc_digitization_job/job.py +496 -0
  7. sarvamai/doc_digitization_job/raw_client.py +1176 -0
  8. sarvamai/requests/__init__.py +20 -0
  9. sarvamai/requests/audio_data.py +0 -6
  10. sarvamai/requests/configure_connection.py +4 -0
  11. sarvamai/requests/configure_connection_data.py +40 -11
  12. sarvamai/requests/doc_digitization_create_job_response.py +25 -0
  13. sarvamai/requests/doc_digitization_download_files_response.py +37 -0
  14. sarvamai/requests/doc_digitization_error_details.py +21 -0
  15. sarvamai/requests/doc_digitization_error_message.py +11 -0
  16. sarvamai/requests/doc_digitization_job_detail.py +64 -0
  17. sarvamai/requests/doc_digitization_job_parameters.py +21 -0
  18. sarvamai/requests/doc_digitization_job_status_response.py +65 -0
  19. sarvamai/requests/doc_digitization_page_error.py +24 -0
  20. sarvamai/requests/doc_digitization_upload_files_response.py +34 -0
  21. sarvamai/requests/doc_digitization_webhook_callback.py +19 -0
  22. sarvamai/requests/speech_to_text_job_parameters.py +43 -2
  23. sarvamai/requests/speech_to_text_translate_job_parameters.py +4 -1
  24. sarvamai/speech_to_text/client.py +95 -10
  25. sarvamai/speech_to_text/raw_client.py +95 -10
  26. sarvamai/speech_to_text_job/client.py +60 -15
  27. sarvamai/speech_to_text_streaming/__init__.py +4 -0
  28. sarvamai/speech_to_text_streaming/client.py +102 -18
  29. sarvamai/speech_to_text_streaming/raw_client.py +102 -18
  30. sarvamai/speech_to_text_streaming/types/__init__.py +4 -0
  31. sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_input_audio_codec.py +1 -27
  32. sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_mode.py +7 -0
  33. sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_model.py +5 -0
  34. sarvamai/speech_to_text_translate_streaming/client.py +20 -12
  35. sarvamai/speech_to_text_translate_streaming/raw_client.py +20 -12
  36. sarvamai/speech_to_text_translate_streaming/types/speech_to_text_translate_streaming_input_audio_codec.py +1 -27
  37. sarvamai/text/client.py +0 -12
  38. sarvamai/text/raw_client.py +0 -12
  39. sarvamai/text_to_speech/client.py +116 -14
  40. sarvamai/text_to_speech/raw_client.py +116 -14
  41. sarvamai/text_to_speech_streaming/__init__.py +2 -2
  42. sarvamai/text_to_speech_streaming/client.py +19 -6
  43. sarvamai/text_to_speech_streaming/raw_client.py +19 -6
  44. sarvamai/text_to_speech_streaming/types/__init__.py +2 -1
  45. sarvamai/text_to_speech_streaming/types/text_to_speech_streaming_model.py +5 -0
  46. sarvamai/types/__init__.py +34 -2
  47. sarvamai/types/audio_data.py +0 -6
  48. sarvamai/types/configure_connection.py +4 -0
  49. sarvamai/types/configure_connection_data.py +40 -11
  50. sarvamai/types/configure_connection_data_model.py +5 -0
  51. sarvamai/types/configure_connection_data_speaker.py +35 -1
  52. sarvamai/types/doc_digitization_create_job_response.py +37 -0
  53. sarvamai/types/doc_digitization_download_files_response.py +47 -0
  54. sarvamai/types/doc_digitization_error_code.py +15 -0
  55. sarvamai/types/doc_digitization_error_details.py +33 -0
  56. sarvamai/types/doc_digitization_error_message.py +23 -0
  57. sarvamai/types/doc_digitization_job_detail.py +74 -0
  58. sarvamai/types/doc_digitization_job_detail_state.py +7 -0
  59. sarvamai/types/doc_digitization_job_parameters.py +33 -0
  60. sarvamai/types/doc_digitization_job_state.py +7 -0
  61. sarvamai/types/doc_digitization_job_status_response.py +75 -0
  62. sarvamai/types/doc_digitization_output_format.py +5 -0
  63. sarvamai/types/doc_digitization_page_error.py +36 -0
  64. sarvamai/types/doc_digitization_supported_language.py +32 -0
  65. sarvamai/types/doc_digitization_upload_files_response.py +44 -0
  66. sarvamai/types/doc_digitization_webhook_callback.py +31 -0
  67. sarvamai/types/mode.py +5 -0
  68. sarvamai/types/speech_to_text_job_parameters.py +43 -2
  69. sarvamai/types/speech_to_text_model.py +1 -1
  70. sarvamai/types/speech_to_text_translate_job_parameters.py +4 -1
  71. sarvamai/types/text_to_speech_model.py +1 -1
  72. sarvamai/types/text_to_speech_speaker.py +35 -1
  73. {sarvamai-0.1.22a4.dist-info → sarvamai-0.1.22a7.dist-info}/METADATA +1 -1
  74. {sarvamai-0.1.22a4.dist-info → sarvamai-0.1.22a7.dist-info}/RECORD +75 -42
  75. sarvamai/types/audio_data_input_audio_codec.py +0 -33
  76. {sarvamai-0.1.22a4.dist-info → sarvamai-0.1.22a7.dist-info}/WHEEL +0 -0
@@ -41,15 +41,38 @@ class RawTextToSpeechClient:
41
41
  enable_preprocessing: typing.Optional[bool] = OMIT,
42
42
  model: typing.Optional[TextToSpeechModel] = OMIT,
43
43
  output_audio_codec: typing.Optional[TextToSpeechOutputAudioCodec] = OMIT,
44
+ temperature: typing.Optional[float] = OMIT,
44
45
  request_options: typing.Optional[RequestOptions] = None,
45
46
  ) -> HttpResponse[TextToSpeechResponse]:
46
47
  """
47
- This is the model to convert text into spoken audio.
48
- The output is a wave file encoded as a base64 string.
48
+ Convert text into spoken audio. The output is a wave file encoded as a base64 string.
49
+
50
+ **Available Models:**
51
+ - **bulbul:v2** (default): Supports pitch, loudness, and pace controls
52
+ - **bulbul:v3-beta**: Newer model with temperature control and improved quality
53
+
54
+ **Important Notes for bulbul:v3-beta:**
55
+ - Pitch and loudness parameters are NOT supported
56
+ - Pace must be between 0.5 and 2.0
57
+ - Preprocessing is automatically enabled
58
+ - Default sample rate is 24000 Hz
59
+ - Temperature parameter available (0.01-1.0, default 0.6)
49
60
 
50
61
  Parameters
51
62
  ----------
52
63
  text : str
64
+ The text(s) to be converted into speech.
65
+
66
+ **Features:**
67
+ - Supports code-mixed text (English and Indic languages)
68
+
69
+ **Model-specific limits:**
70
+ - **bulbul:v2:** Max 1500 characters
71
+ - **bulbul:v3-beta:** Max 2500 characters
72
+
73
+ **Important Note:**
74
+ - For numbers larger than 4 digits, use commas (e.g., '10,000' instead of '10000')
75
+ - This ensures proper pronunciation as a whole number
53
76
 
54
77
  target_language_code : TextToSpeechLanguage
55
78
  The language of the text is BCP-47 format
@@ -57,36 +80,63 @@ class RawTextToSpeechClient:
57
80
  speaker : typing.Optional[TextToSpeechSpeaker]
58
81
  The speaker voice to be used for the output audio.
59
82
 
60
- **Default:** Anushka
83
+ **Default:** Anushka (for bulbul:v2), Aditya (for bulbul:v3-beta)
61
84
 
62
85
  **Model Compatibility (Speakers compatible with respective model):**
63
86
  - **bulbul:v2:**
64
87
  - Female: Anushka, Manisha, Vidya, Arya
65
88
  - Male: Abhilash, Karun, Hitesh
89
+ - **bulbul:v3-beta:**
90
+ - Aditya, Ritu, Priya, Neha, Rahul, Pooja, Rohan, Simran, Kavya, Amit, Dev, Ishita, Shreya, Ratan, Varun, Manan, Sumit, Roopa, Kabir, Aayan, Shubh, Ashutosh, Advait, Amelia, Sophia
66
91
 
67
92
  **Note:** Speaker selection must match the chosen model version.
68
93
 
69
94
  pitch : typing.Optional[float]
70
95
  Controls the pitch of the audio. Lower values result in a deeper voice, while higher values make it sharper. The suitable range is between -0.75 and 0.75. Default is 0.0.
71
96
 
97
+ **Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
98
+
72
99
  pace : typing.Optional[float]
73
- Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster. The suitable range is between 0.5 and 2.0. Default is 1.0.
100
+ Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster. Default is 1.0.
101
+
102
+ **Model-specific ranges:**
103
+ - **bulbul:v2:** 0.3 to 3.0
104
+ - **bulbul:v3-beta:** 0.5 to 2.0
74
105
 
75
106
  loudness : typing.Optional[float]
76
107
  Controls the loudness of the audio. Lower values result in quieter audio, while higher values make it louder. The suitable range is between 0.3 and 3.0. Default is 1.0.
77
108
 
109
+ **Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
110
+
78
111
  speech_sample_rate : typing.Optional[SpeechSampleRate]
79
- Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz. If not provided, the default is 22050 Hz.
112
+ Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz.
113
+
114
+ **Model-specific defaults:**
115
+ - **bulbul:v2:** Default is 22050 Hz
116
+ - **bulbul:v3-beta:** Default is 24000 Hz
80
117
 
81
118
  enable_preprocessing : typing.Optional[bool]
82
- Controls whether normalization of English words and numeric entities (e.g., numbers, dates) is performed. Set to true for better handling of mixed-language text. Default is false.
119
+ Controls whether normalization of English words and numeric entities (e.g., numbers, dates) is performed. Set to true for better handling of mixed-language text.
120
+
121
+ **Model-specific behavior:**
122
+ - **bulbul:v2:** Default is false
123
+ - **bulbul:v3-beta:** Automatically enabled (true) and cannot be disabled
83
124
 
84
125
  model : typing.Optional[TextToSpeechModel]
85
- Specifies the model to use for text-to-speech conversion. Default is bulbul:v2.
126
+ Specifies the model to use for text-to-speech conversion.
127
+
128
+ **Available models:**
129
+ - **bulbul:v2:** Default model with pitch, loudness controls
130
+ - **bulbul:v3-beta:** Newer model with temperature control, improved quality
86
131
 
87
132
  output_audio_codec : typing.Optional[TextToSpeechOutputAudioCodec]
88
133
  Specifies the audio codec for the output audio file. Different codecs offer various compression and quality characteristics.
89
134
 
135
+ temperature : typing.Optional[float]
136
+ Controls the randomness of the output. Lower values make the output more focused and deterministic, while higher values make it more random. The suitable range is between 0.01 and 1.0. Default is 0.6.
137
+
138
+ **Note:** This parameter is only supported for bulbul:v3-beta. It has no effect on bulbul:v2.
139
+
90
140
  request_options : typing.Optional[RequestOptions]
91
141
  Request-specific configuration.
92
142
 
@@ -110,6 +160,7 @@ class RawTextToSpeechClient:
110
160
  "enable_preprocessing": enable_preprocessing,
111
161
  "model": model,
112
162
  "output_audio_codec": output_audio_codec,
163
+ "temperature": temperature,
113
164
  },
114
165
  headers={
115
166
  "content-type": "application/json",
@@ -205,15 +256,38 @@ class AsyncRawTextToSpeechClient:
205
256
  enable_preprocessing: typing.Optional[bool] = OMIT,
206
257
  model: typing.Optional[TextToSpeechModel] = OMIT,
207
258
  output_audio_codec: typing.Optional[TextToSpeechOutputAudioCodec] = OMIT,
259
+ temperature: typing.Optional[float] = OMIT,
208
260
  request_options: typing.Optional[RequestOptions] = None,
209
261
  ) -> AsyncHttpResponse[TextToSpeechResponse]:
210
262
  """
211
- This is the model to convert text into spoken audio.
212
- The output is a wave file encoded as a base64 string.
263
+ Convert text into spoken audio. The output is a wave file encoded as a base64 string.
264
+
265
+ **Available Models:**
266
+ - **bulbul:v2** (default): Supports pitch, loudness, and pace controls
267
+ - **bulbul:v3-beta**: Newer model with temperature control and improved quality
268
+
269
+ **Important Notes for bulbul:v3-beta:**
270
+ - Pitch and loudness parameters are NOT supported
271
+ - Pace must be between 0.5 and 2.0
272
+ - Preprocessing is automatically enabled
273
+ - Default sample rate is 24000 Hz
274
+ - Temperature parameter available (0.01-1.0, default 0.6)
213
275
 
214
276
  Parameters
215
277
  ----------
216
278
  text : str
279
+ The text(s) to be converted into speech.
280
+
281
+ **Features:**
282
+ - Supports code-mixed text (English and Indic languages)
283
+
284
+ **Model-specific limits:**
285
+ - **bulbul:v2:** Max 1500 characters
286
+ - **bulbul:v3-beta:** Max 2500 characters
287
+
288
+ **Important Note:**
289
+ - For numbers larger than 4 digits, use commas (e.g., '10,000' instead of '10000')
290
+ - This ensures proper pronunciation as a whole number
217
291
 
218
292
  target_language_code : TextToSpeechLanguage
219
293
  The language of the text is BCP-47 format
@@ -221,36 +295,63 @@ class AsyncRawTextToSpeechClient:
221
295
  speaker : typing.Optional[TextToSpeechSpeaker]
222
296
  The speaker voice to be used for the output audio.
223
297
 
224
- **Default:** Anushka
298
+ **Default:** Anushka (for bulbul:v2), Aditya (for bulbul:v3-beta)
225
299
 
226
300
  **Model Compatibility (Speakers compatible with respective model):**
227
301
  - **bulbul:v2:**
228
302
  - Female: Anushka, Manisha, Vidya, Arya
229
303
  - Male: Abhilash, Karun, Hitesh
304
+ - **bulbul:v3-beta:**
305
+ - Aditya, Ritu, Priya, Neha, Rahul, Pooja, Rohan, Simran, Kavya, Amit, Dev, Ishita, Shreya, Ratan, Varun, Manan, Sumit, Roopa, Kabir, Aayan, Shubh, Ashutosh, Advait, Amelia, Sophia
230
306
 
231
307
  **Note:** Speaker selection must match the chosen model version.
232
308
 
233
309
  pitch : typing.Optional[float]
234
310
  Controls the pitch of the audio. Lower values result in a deeper voice, while higher values make it sharper. The suitable range is between -0.75 and 0.75. Default is 0.0.
235
311
 
312
+ **Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
313
+
236
314
  pace : typing.Optional[float]
237
- Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster. The suitable range is between 0.5 and 2.0. Default is 1.0.
315
+ Controls the speed of the audio. Lower values result in slower speech, while higher values make it faster. Default is 1.0.
316
+
317
+ **Model-specific ranges:**
318
+ - **bulbul:v2:** 0.3 to 3.0
319
+ - **bulbul:v3-beta:** 0.5 to 2.0
238
320
 
239
321
  loudness : typing.Optional[float]
240
322
  Controls the loudness of the audio. Lower values result in quieter audio, while higher values make it louder. The suitable range is between 0.3 and 3.0. Default is 1.0.
241
323
 
324
+ **Note:** This parameter is only supported for bulbul:v2. It is NOT supported for bulbul:v3-beta and will cause a validation error if provided.
325
+
242
326
  speech_sample_rate : typing.Optional[SpeechSampleRate]
243
- Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz. If not provided, the default is 22050 Hz.
327
+ Specifies the sample rate of the output audio. Supported values are 8000, 16000, 22050, 24000 Hz.
328
+
329
+ **Model-specific defaults:**
330
+ - **bulbul:v2:** Default is 22050 Hz
331
+ - **bulbul:v3-beta:** Default is 24000 Hz
244
332
 
245
333
  enable_preprocessing : typing.Optional[bool]
246
- Controls whether normalization of English words and numeric entities (e.g., numbers, dates) is performed. Set to true for better handling of mixed-language text. Default is false.
334
+ Controls whether normalization of English words and numeric entities (e.g., numbers, dates) is performed. Set to true for better handling of mixed-language text.
335
+
336
+ **Model-specific behavior:**
337
+ - **bulbul:v2:** Default is false
338
+ - **bulbul:v3-beta:** Automatically enabled (true) and cannot be disabled
247
339
 
248
340
  model : typing.Optional[TextToSpeechModel]
249
- Specifies the model to use for text-to-speech conversion. Default is bulbul:v2.
341
+ Specifies the model to use for text-to-speech conversion.
342
+
343
+ **Available models:**
344
+ - **bulbul:v2:** Default model with pitch, loudness controls
345
+ - **bulbul:v3-beta:** Newer model with temperature control, improved quality
250
346
 
251
347
  output_audio_codec : typing.Optional[TextToSpeechOutputAudioCodec]
252
348
  Specifies the audio codec for the output audio file. Different codecs offer various compression and quality characteristics.
253
349
 
350
+ temperature : typing.Optional[float]
351
+ Controls the randomness of the output. Lower values make the output more focused and deterministic, while higher values make it more random. The suitable range is between 0.01 and 1.0. Default is 0.6.
352
+
353
+ **Note:** This parameter is only supported for bulbul:v3-beta. It has no effect on bulbul:v2.
354
+
254
355
  request_options : typing.Optional[RequestOptions]
255
356
  Request-specific configuration.
256
357
 
@@ -274,6 +375,7 @@ class AsyncRawTextToSpeechClient:
274
375
  "enable_preprocessing": enable_preprocessing,
275
376
  "model": model,
276
377
  "output_audio_codec": output_audio_codec,
378
+ "temperature": temperature,
277
379
  },
278
380
  headers={
279
381
  "content-type": "application/json",
@@ -2,6 +2,6 @@
2
2
 
3
3
  # isort: skip_file
4
4
 
5
- from .types import TextToSpeechStreamingSendCompletionEvent
5
+ from .types import TextToSpeechStreamingModel, TextToSpeechStreamingSendCompletionEvent
6
6
 
7
- __all__ = ["TextToSpeechStreamingSendCompletionEvent"]
7
+ __all__ = ["TextToSpeechStreamingModel", "TextToSpeechStreamingSendCompletionEvent"]
@@ -11,6 +11,7 @@ from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
11
11
  from ..core.request_options import RequestOptions
12
12
  from .raw_client import AsyncRawTextToSpeechStreamingClient, RawTextToSpeechStreamingClient
13
13
  from .socket_client import AsyncTextToSpeechStreamingSocketClient, TextToSpeechStreamingSocketClient
14
+ from .types.text_to_speech_streaming_model import TextToSpeechStreamingModel
14
15
  from .types.text_to_speech_streaming_send_completion_event import TextToSpeechStreamingSendCompletionEvent
15
16
 
16
17
  try:
@@ -38,7 +39,7 @@ class TextToSpeechStreamingClient:
38
39
  def connect(
39
40
  self,
40
41
  *,
41
- model: typing.Optional[typing.Literal["bulbul:v2"]] = None,
42
+ model: typing.Optional[TextToSpeechStreamingModel] = None,
42
43
  send_completion_event: typing.Optional[TextToSpeechStreamingSendCompletionEvent] = None,
43
44
  api_subscription_key: typing.Optional[str] = None,
44
45
  request_options: typing.Optional[RequestOptions] = None,
@@ -50,10 +51,16 @@ class TextToSpeechStreamingClient:
50
51
  The Try It playground may not provide the best experience for streaming audio.
51
52
  For optimal streaming performance, please use the SDK or implement your own WebSocket client.
52
53
 
54
+ **Model-Specific Notes:**
55
+ - **bulbul:v2:** Supports pitch, loudness, pace (0.3-3.0). Default sample rate: 22050 Hz.
56
+ - **bulbul:v3-beta:** Does NOT support pitch/loudness. Pace range: 0.5-2.0. Supports temperature parameter. Default sample rate: 24000 Hz. Preprocessing is always enabled.
57
+
53
58
  Parameters
54
59
  ----------
55
- model : typing.Optional[typing.Literal["bulbul:v2"]]
56
- Text to speech model to use
60
+ model : typing.Optional[TextToSpeechStreamingModel]
61
+ Text to speech model to use.
62
+ - **bulbul:v2** (default): Standard TTS model with pitch/loudness support
63
+ - **bulbul:v3-beta**: Advanced model with temperature control (no pitch/loudness)
57
64
 
58
65
  send_completion_event : typing.Optional[TextToSpeechStreamingSendCompletionEvent]
59
66
  Enable completion event notifications when TTS generation finishes. When set to true, an event message will be sent when the final audio chunk has been generated.
@@ -117,7 +124,7 @@ class AsyncTextToSpeechStreamingClient:
117
124
  async def connect(
118
125
  self,
119
126
  *,
120
- model: typing.Optional[typing.Literal["bulbul:v2"]] = None,
127
+ model: typing.Optional[TextToSpeechStreamingModel] = None,
121
128
  send_completion_event: typing.Optional[TextToSpeechStreamingSendCompletionEvent] = None,
122
129
  api_subscription_key: typing.Optional[str] = None,
123
130
  request_options: typing.Optional[RequestOptions] = None,
@@ -129,10 +136,16 @@ class AsyncTextToSpeechStreamingClient:
129
136
  The Try It playground may not provide the best experience for streaming audio.
130
137
  For optimal streaming performance, please use the SDK or implement your own WebSocket client.
131
138
 
139
+ **Model-Specific Notes:**
140
+ - **bulbul:v2:** Supports pitch, loudness, pace (0.3-3.0). Default sample rate: 22050 Hz.
141
+ - **bulbul:v3-beta:** Does NOT support pitch/loudness. Pace range: 0.5-2.0. Supports temperature parameter. Default sample rate: 24000 Hz. Preprocessing is always enabled.
142
+
132
143
  Parameters
133
144
  ----------
134
- model : typing.Optional[typing.Literal["bulbul:v2"]]
135
- Text to speech model to use
145
+ model : typing.Optional[TextToSpeechStreamingModel]
146
+ Text to speech model to use.
147
+ - **bulbul:v2** (default): Standard TTS model with pitch/loudness support
148
+ - **bulbul:v3-beta**: Advanced model with temperature control (no pitch/loudness)
136
149
 
137
150
  send_completion_event : typing.Optional[TextToSpeechStreamingSendCompletionEvent]
138
151
  Enable completion event notifications when TTS generation finishes. When set to true, an event message will be sent when the final audio chunk has been generated.
@@ -10,6 +10,7 @@ from ..core.api_error import ApiError
10
10
  from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
11
11
  from ..core.request_options import RequestOptions
12
12
  from .socket_client import AsyncTextToSpeechStreamingSocketClient, TextToSpeechStreamingSocketClient
13
+ from .types.text_to_speech_streaming_model import TextToSpeechStreamingModel
13
14
  from .types.text_to_speech_streaming_send_completion_event import TextToSpeechStreamingSendCompletionEvent
14
15
 
15
16
  try:
@@ -26,7 +27,7 @@ class RawTextToSpeechStreamingClient:
26
27
  def connect(
27
28
  self,
28
29
  *,
29
- model: typing.Optional[typing.Literal["bulbul:v2"]] = None,
30
+ model: typing.Optional[TextToSpeechStreamingModel] = None,
30
31
  send_completion_event: typing.Optional[TextToSpeechStreamingSendCompletionEvent] = None,
31
32
  api_subscription_key: typing.Optional[str] = None,
32
33
  request_options: typing.Optional[RequestOptions] = None,
@@ -38,10 +39,16 @@ class RawTextToSpeechStreamingClient:
38
39
  The Try It playground may not provide the best experience for streaming audio.
39
40
  For optimal streaming performance, please use the SDK or implement your own WebSocket client.
40
41
 
42
+ **Model-Specific Notes:**
43
+ - **bulbul:v2:** Supports pitch, loudness, pace (0.3-3.0). Default sample rate: 22050 Hz.
44
+ - **bulbul:v3-beta:** Does NOT support pitch/loudness. Pace range: 0.5-2.0. Supports temperature parameter. Default sample rate: 24000 Hz. Preprocessing is always enabled.
45
+
41
46
  Parameters
42
47
  ----------
43
- model : typing.Optional[typing.Literal["bulbul:v2"]]
44
- Text to speech model to use
48
+ model : typing.Optional[TextToSpeechStreamingModel]
49
+ Text to speech model to use.
50
+ - **bulbul:v2** (default): Standard TTS model with pitch/loudness support
51
+ - **bulbul:v3-beta**: Advanced model with temperature control (no pitch/loudness)
45
52
 
46
53
  send_completion_event : typing.Optional[TextToSpeechStreamingSendCompletionEvent]
47
54
  Enable completion event notifications when TTS generation finishes. When set to true, an event message will be sent when the final audio chunk has been generated.
@@ -94,7 +101,7 @@ class AsyncRawTextToSpeechStreamingClient:
94
101
  async def connect(
95
102
  self,
96
103
  *,
97
- model: typing.Optional[typing.Literal["bulbul:v2"]] = None,
104
+ model: typing.Optional[TextToSpeechStreamingModel] = None,
98
105
  send_completion_event: typing.Optional[TextToSpeechStreamingSendCompletionEvent] = None,
99
106
  api_subscription_key: typing.Optional[str] = None,
100
107
  request_options: typing.Optional[RequestOptions] = None,
@@ -106,10 +113,16 @@ class AsyncRawTextToSpeechStreamingClient:
106
113
  The Try It playground may not provide the best experience for streaming audio.
107
114
  For optimal streaming performance, please use the SDK or implement your own WebSocket client.
108
115
 
116
+ **Model-Specific Notes:**
117
+ - **bulbul:v2:** Supports pitch, loudness, pace (0.3-3.0). Default sample rate: 22050 Hz.
118
+ - **bulbul:v3-beta:** Does NOT support pitch/loudness. Pace range: 0.5-2.0. Supports temperature parameter. Default sample rate: 24000 Hz. Preprocessing is always enabled.
119
+
109
120
  Parameters
110
121
  ----------
111
- model : typing.Optional[typing.Literal["bulbul:v2"]]
112
- Text to speech model to use
122
+ model : typing.Optional[TextToSpeechStreamingModel]
123
+ Text to speech model to use.
124
+ - **bulbul:v2** (default): Standard TTS model with pitch/loudness support
125
+ - **bulbul:v3-beta**: Advanced model with temperature control (no pitch/loudness)
113
126
 
114
127
  send_completion_event : typing.Optional[TextToSpeechStreamingSendCompletionEvent]
115
128
  Enable completion event notifications when TTS generation finishes. When set to true, an event message will be sent when the final audio chunk has been generated.
@@ -2,6 +2,7 @@
2
2
 
3
3
  # isort: skip_file
4
4
 
5
+ from .text_to_speech_streaming_model import TextToSpeechStreamingModel
5
6
  from .text_to_speech_streaming_send_completion_event import TextToSpeechStreamingSendCompletionEvent
6
7
 
7
- __all__ = ["TextToSpeechStreamingSendCompletionEvent"]
8
+ __all__ = ["TextToSpeechStreamingModel", "TextToSpeechStreamingSendCompletionEvent"]
@@ -0,0 +1,5 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ import typing
4
+
5
+ TextToSpeechStreamingModel = typing.Union[typing.Literal["bulbul:v2", "bulbul:v3-beta"], typing.Any]
@@ -3,7 +3,6 @@
3
3
  # isort: skip_file
4
4
 
5
5
  from .audio_data import AudioData
6
- from .audio_data_input_audio_codec import AudioDataInputAudioCodec
7
6
  from .audio_message import AudioMessage
8
7
  from .audio_output import AudioOutput
9
8
  from .audio_output_data import AudioOutputData
@@ -26,6 +25,7 @@ from .completion_usage import CompletionUsage
26
25
  from .config_message import ConfigMessage
27
26
  from .configure_connection import ConfigureConnection
28
27
  from .configure_connection_data import ConfigureConnectionData
28
+ from .configure_connection_data_model import ConfigureConnectionDataModel
29
29
  from .configure_connection_data_output_audio_bitrate import ConfigureConnectionDataOutputAudioBitrate
30
30
  from .configure_connection_data_output_audio_codec import ConfigureConnectionDataOutputAudioCodec
31
31
  from .configure_connection_data_speaker import ConfigureConnectionDataSpeaker
@@ -34,6 +34,21 @@ from .connection_sample_rate import ConnectionSampleRate
34
34
  from .create_chat_completion_response import CreateChatCompletionResponse
35
35
  from .diarized_entry import DiarizedEntry
36
36
  from .diarized_transcript import DiarizedTranscript
37
+ from .doc_digitization_create_job_response import DocDigitizationCreateJobResponse
38
+ from .doc_digitization_download_files_response import DocDigitizationDownloadFilesResponse
39
+ from .doc_digitization_error_code import DocDigitizationErrorCode
40
+ from .doc_digitization_error_details import DocDigitizationErrorDetails
41
+ from .doc_digitization_error_message import DocDigitizationErrorMessage
42
+ from .doc_digitization_job_detail import DocDigitizationJobDetail
43
+ from .doc_digitization_job_detail_state import DocDigitizationJobDetailState
44
+ from .doc_digitization_job_parameters import DocDigitizationJobParameters
45
+ from .doc_digitization_job_state import DocDigitizationJobState
46
+ from .doc_digitization_job_status_response import DocDigitizationJobStatusResponse
47
+ from .doc_digitization_output_format import DocDigitizationOutputFormat
48
+ from .doc_digitization_page_error import DocDigitizationPageError
49
+ from .doc_digitization_supported_language import DocDigitizationSupportedLanguage
50
+ from .doc_digitization_upload_files_response import DocDigitizationUploadFilesResponse
51
+ from .doc_digitization_webhook_callback import DocDigitizationWebhookCallback
37
52
  from .error_code import ErrorCode
38
53
  from .error_data import ErrorData
39
54
  from .error_details import ErrorDetails
@@ -54,6 +69,7 @@ from .input_audio_codec import InputAudioCodec
54
69
  from .job_state import JobState
55
70
  from .job_status_v_1_response import JobStatusV1Response
56
71
  from .language_identification_response import LanguageIdentificationResponse
72
+ from .mode import Mode
57
73
  from .numerals_format import NumeralsFormat
58
74
  from .ping_signal import PingSignal
59
75
  from .reasoning_effort import ReasoningEffort
@@ -104,7 +120,6 @@ from .transliteration_response import TransliterationResponse
104
120
 
105
121
  __all__ = [
106
122
  "AudioData",
107
- "AudioDataInputAudioCodec",
108
123
  "AudioMessage",
109
124
  "AudioOutput",
110
125
  "AudioOutputData",
@@ -125,6 +140,7 @@ __all__ = [
125
140
  "ConfigMessage",
126
141
  "ConfigureConnection",
127
142
  "ConfigureConnectionData",
143
+ "ConfigureConnectionDataModel",
128
144
  "ConfigureConnectionDataOutputAudioBitrate",
129
145
  "ConfigureConnectionDataOutputAudioCodec",
130
146
  "ConfigureConnectionDataSpeaker",
@@ -133,6 +149,21 @@ __all__ = [
133
149
  "CreateChatCompletionResponse",
134
150
  "DiarizedEntry",
135
151
  "DiarizedTranscript",
152
+ "DocDigitizationCreateJobResponse",
153
+ "DocDigitizationDownloadFilesResponse",
154
+ "DocDigitizationErrorCode",
155
+ "DocDigitizationErrorDetails",
156
+ "DocDigitizationErrorMessage",
157
+ "DocDigitizationJobDetail",
158
+ "DocDigitizationJobDetailState",
159
+ "DocDigitizationJobParameters",
160
+ "DocDigitizationJobState",
161
+ "DocDigitizationJobStatusResponse",
162
+ "DocDigitizationOutputFormat",
163
+ "DocDigitizationPageError",
164
+ "DocDigitizationSupportedLanguage",
165
+ "DocDigitizationUploadFilesResponse",
166
+ "DocDigitizationWebhookCallback",
136
167
  "ErrorCode",
137
168
  "ErrorData",
138
169
  "ErrorDetails",
@@ -153,6 +184,7 @@ __all__ = [
153
184
  "JobState",
154
185
  "JobStatusV1Response",
155
186
  "LanguageIdentificationResponse",
187
+ "Mode",
156
188
  "NumeralsFormat",
157
189
  "PingSignal",
158
190
  "ReasoningEffort",
@@ -4,7 +4,6 @@ import typing
4
4
 
5
5
  import pydantic
6
6
  from ..core.pydantic_utilities import IS_PYDANTIC_V2, UniversalBaseModel
7
- from .audio_data_input_audio_codec import AudioDataInputAudioCodec
8
7
 
9
8
 
10
9
  class AudioData(UniversalBaseModel):
@@ -29,11 +28,6 @@ class AudioData(UniversalBaseModel):
29
28
  Audio encoding format
30
29
  """
31
30
 
32
- input_audio_codec: typing.Optional[AudioDataInputAudioCodec] = pydantic.Field(default=None)
33
- """
34
- Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files supports sample rate 16000 and 8000.
35
- """
36
-
37
31
  if IS_PYDANTIC_V2:
38
32
  model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
39
33
  else:
@@ -13,6 +13,10 @@ class ConfigureConnection(UniversalBaseModel):
13
13
  This initializes TTS parameters and can be updated at any time during the WebSocket lifecycle
14
14
  by sending a new config message. When a config update is sent, any text currently in the buffer
15
15
  will be automatically flushed and processed before applying the new configuration.
16
+
17
+ **Model-Specific Notes:**
18
+ - **bulbul:v2:** Supports pitch, loudness, pace (0.3-3.0). Default sample rate: 22050 Hz.
19
+ - **bulbul:v3-beta:** Does NOT support pitch/loudness. Pace range: 0.5-2.0. Supports temperature. Default sample rate: 24000 Hz.
16
20
  """
17
21
 
18
22
  type: typing.Literal["config"] = "config"
@@ -4,6 +4,7 @@ import typing
4
4
 
5
5
  import pydantic
6
6
  from ..core.pydantic_utilities import IS_PYDANTIC_V2, UniversalBaseModel
7
+ from .configure_connection_data_model import ConfigureConnectionDataModel
7
8
  from .configure_connection_data_output_audio_bitrate import ConfigureConnectionDataOutputAudioBitrate
8
9
  from .configure_connection_data_output_audio_codec import ConfigureConnectionDataOutputAudioCodec
9
10
  from .configure_connection_data_speaker import ConfigureConnectionDataSpeaker
@@ -11,21 +12,25 @@ from .configure_connection_data_target_language_code import ConfigureConnectionD
11
12
 
12
13
 
13
14
  class ConfigureConnectionData(UniversalBaseModel):
15
+ model: typing.Optional[ConfigureConnectionDataModel] = pydantic.Field(default=None)
16
+ """
17
+ Specifies the model to use for text-to-speech conversion.
18
+ - **bulbul:v2** (default): Standard TTS model with pitch/loudness support
19
+ - **bulbul:v3-beta**: Advanced model with temperature control (no pitch/loudness)
20
+ """
21
+
14
22
  target_language_code: ConfigureConnectionDataTargetLanguageCode = pydantic.Field()
15
23
  """
16
- The language of the text is BCP-47 format
24
+ The language of the text in BCP-47 format
17
25
  """
18
26
 
19
27
  speaker: ConfigureConnectionDataSpeaker = pydantic.Field()
20
28
  """
21
29
  The speaker voice to be used for the output audio.
22
30
 
23
- **Default:** Anushka
24
-
25
- **Model Compatibility (Speakers compatible with respective model):**
26
- - **bulbul:v2:**
27
- - Female: Anushka, Manisha, Vidya, Arya
28
- - Male: Abhilash, Karun, Hitesh
31
+ **Model Compatibility:**
32
+ - **bulbul:v2:** anushka (default), abhilash, manisha, vidya, arya, karun, hitesh
33
+ - **bulbul:v3-beta:** aditya (default), ritu, priya, neha, rahul, pooja, rohan, simran, kavya, amit, dev, ishita, shreya, ratan, varun, manan, sumit, roopa, kabir, aayan, shubh, ashutosh, advait, amelia, sophia
29
34
 
30
35
  **Note:** Speaker selection must match the chosen model version.
31
36
  """
@@ -35,13 +40,18 @@ class ConfigureConnectionData(UniversalBaseModel):
35
40
  Controls the pitch of the audio. Lower values result in a deeper voice,
36
41
  while higher values make it sharper. The suitable range is between -0.75
37
42
  and 0.75. Default is 0.0.
43
+
44
+ **Note:** NOT supported for bulbul:v3-beta. Will be ignored if provided.
38
45
  """
39
46
 
40
47
  pace: typing.Optional[float] = pydantic.Field(default=None)
41
48
  """
42
49
  Controls the speed of the audio. Lower values result in slower speech,
43
- while higher values make it faster. The suitable range is between 0.5
44
- and 2.0. Default is 1.0.
50
+ while higher values make it faster. Default is 1.0.
51
+
52
+ **Model-specific ranges:**
53
+ - **bulbul:v2:** 0.3 to 3.0
54
+ - **bulbul:v3-beta:** 0.5 to 2.0
45
55
  """
46
56
 
47
57
  loudness: typing.Optional[float] = pydantic.Field(default=None)
@@ -49,19 +59,38 @@ class ConfigureConnectionData(UniversalBaseModel):
49
59
  Controls the loudness of the audio. Lower values result in quieter audio,
50
60
  while higher values make it louder. The suitable range is between 0.3
51
61
  and 3.0. Default is 1.0.
62
+
63
+ **Note:** NOT supported for bulbul:v3-beta. Will be ignored if provided.
64
+ """
65
+
66
+ temperature: typing.Optional[float] = pydantic.Field(default=None)
67
+ """
68
+ Controls the randomness of the output. Lower values make the output more
69
+ focused and deterministic, while higher values make it more random.
70
+ The suitable range is between 0.01 and 1.0. Default is 0.6.
71
+
72
+ **Note:** Only supported for bulbul:v3-beta. Will be ignored for bulbul:v2.
52
73
  """
53
74
 
54
75
  speech_sample_rate: typing.Optional[int] = pydantic.Field(default=None)
55
76
  """
56
77
  Specifies the sample rate of the output audio. Supported values are
57
- 8000, 16000, 22050, 24000 Hz. If not provided, the default is 22050 Hz.
78
+ 8000, 16000, 22050, 24000 Hz.
79
+
80
+ **Model-specific defaults:**
81
+ - **bulbul:v2:** 22050 Hz
82
+ - **bulbul:v3-beta:** 24000 Hz
58
83
  """
59
84
 
60
85
  enable_preprocessing: typing.Optional[bool] = pydantic.Field(default=None)
61
86
  """
62
87
  Controls whether normalization of English words and numeric entities
63
88
  (e.g., numbers, dates) is performed. Set to true for better handling
64
- of mixed-language text. Default is false.
89
+ of mixed-language text.
90
+
91
+ **Model-specific defaults:**
92
+ - **bulbul:v2:** false (optional)
93
+ - **bulbul:v3-beta:** Always enabled (cannot be disabled)
65
94
  """
66
95
 
67
96
  output_audio_codec: typing.Optional[ConfigureConnectionDataOutputAudioCodec] = pydantic.Field(default=None)
@@ -0,0 +1,5 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ import typing
4
+
5
+ ConfigureConnectionDataModel = typing.Union[typing.Literal["bulbul:v2", "bulbul:v3-beta"], typing.Any]
@@ -3,5 +3,39 @@
3
3
  import typing
4
4
 
5
5
  ConfigureConnectionDataSpeaker = typing.Union[
6
- typing.Literal["anushka", "abhilash", "manisha", "vidya", "arya", "karun", "hitesh"], typing.Any
6
+ typing.Literal[
7
+ "anushka",
8
+ "abhilash",
9
+ "manisha",
10
+ "vidya",
11
+ "arya",
12
+ "karun",
13
+ "hitesh",
14
+ "aditya",
15
+ "ritu",
16
+ "priya",
17
+ "neha",
18
+ "rahul",
19
+ "pooja",
20
+ "rohan",
21
+ "simran",
22
+ "kavya",
23
+ "amit",
24
+ "dev",
25
+ "ishita",
26
+ "shreya",
27
+ "ratan",
28
+ "varun",
29
+ "manan",
30
+ "sumit",
31
+ "roopa",
32
+ "kabir",
33
+ "aayan",
34
+ "shubh",
35
+ "ashutosh",
36
+ "advait",
37
+ "amelia",
38
+ "sophia",
39
+ ],
40
+ typing.Any,
7
41
  ]