sarvamai 0.1.23a3__py3-none-any.whl → 0.1.23a5__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. sarvamai/__init__.py +203 -405
  2. sarvamai/chat/raw_client.py +20 -20
  3. sarvamai/client.py +34 -186
  4. sarvamai/core/__init__.py +21 -76
  5. sarvamai/core/client_wrapper.py +3 -19
  6. sarvamai/core/force_multipart.py +2 -4
  7. sarvamai/core/http_client.py +97 -217
  8. sarvamai/core/http_response.py +1 -1
  9. sarvamai/core/jsonable_encoder.py +0 -8
  10. sarvamai/core/pydantic_utilities.py +4 -110
  11. sarvamai/errors/__init__.py +6 -40
  12. sarvamai/errors/bad_request_error.py +1 -1
  13. sarvamai/errors/forbidden_error.py +1 -1
  14. sarvamai/errors/internal_server_error.py +1 -1
  15. sarvamai/errors/service_unavailable_error.py +1 -1
  16. sarvamai/errors/too_many_requests_error.py +1 -1
  17. sarvamai/errors/unprocessable_entity_error.py +1 -1
  18. sarvamai/requests/__init__.py +62 -150
  19. sarvamai/requests/configure_connection.py +4 -0
  20. sarvamai/requests/configure_connection_data.py +40 -11
  21. sarvamai/requests/error_response_data.py +1 -1
  22. sarvamai/requests/file_signed_url_details.py +1 -1
  23. sarvamai/requests/speech_to_text_job_parameters.py +43 -2
  24. sarvamai/requests/speech_to_text_transcription_data.py +2 -2
  25. sarvamai/requests/speech_to_text_translate_job_parameters.py +4 -1
  26. sarvamai/speech_to_text/client.py +95 -10
  27. sarvamai/speech_to_text/raw_client.py +147 -64
  28. sarvamai/speech_to_text_job/client.py +60 -15
  29. sarvamai/speech_to_text_job/raw_client.py +120 -120
  30. sarvamai/speech_to_text_streaming/__init__.py +10 -38
  31. sarvamai/speech_to_text_streaming/client.py +90 -8
  32. sarvamai/speech_to_text_streaming/raw_client.py +90 -8
  33. sarvamai/speech_to_text_streaming/types/__init__.py +8 -36
  34. sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_mode.py +7 -0
  35. sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_model.py +5 -0
  36. sarvamai/speech_to_text_translate_job/raw_client.py +120 -120
  37. sarvamai/speech_to_text_translate_streaming/__init__.py +5 -36
  38. sarvamai/speech_to_text_translate_streaming/client.py +8 -2
  39. sarvamai/speech_to_text_translate_streaming/raw_client.py +8 -2
  40. sarvamai/speech_to_text_translate_streaming/types/__init__.py +3 -36
  41. sarvamai/text/raw_client.py +60 -60
  42. sarvamai/text_to_speech/client.py +100 -16
  43. sarvamai/text_to_speech/raw_client.py +120 -36
  44. sarvamai/text_to_speech_streaming/__init__.py +2 -29
  45. sarvamai/text_to_speech_streaming/client.py +19 -6
  46. sarvamai/text_to_speech_streaming/raw_client.py +19 -6
  47. sarvamai/text_to_speech_streaming/types/__init__.py +3 -31
  48. sarvamai/text_to_speech_streaming/types/text_to_speech_streaming_model.py +5 -0
  49. sarvamai/types/__init__.py +102 -222
  50. sarvamai/types/chat_completion_request_message.py +2 -6
  51. sarvamai/types/configure_connection.py +4 -0
  52. sarvamai/types/configure_connection_data.py +40 -11
  53. sarvamai/types/configure_connection_data_model.py +5 -0
  54. sarvamai/types/configure_connection_data_speaker.py +35 -1
  55. sarvamai/types/error_response_data.py +1 -1
  56. sarvamai/types/file_signed_url_details.py +1 -1
  57. sarvamai/types/mode.py +5 -0
  58. sarvamai/types/speech_to_text_job_parameters.py +43 -2
  59. sarvamai/types/speech_to_text_model.py +1 -1
  60. sarvamai/types/speech_to_text_transcription_data.py +2 -2
  61. sarvamai/types/speech_to_text_translate_job_parameters.py +4 -1
  62. sarvamai/types/text_to_speech_model.py +1 -1
  63. sarvamai/types/text_to_speech_speaker.py +35 -1
  64. {sarvamai-0.1.23a3.dist-info → sarvamai-0.1.23a5.dist-info}/METADATA +1 -2
  65. {sarvamai-0.1.23a3.dist-info → sarvamai-0.1.23a5.dist-info}/RECORD +66 -66
  66. sarvamai/core/http_sse/__init__.py +0 -42
  67. sarvamai/core/http_sse/_api.py +0 -112
  68. sarvamai/core/http_sse/_decoders.py +0 -61
  69. sarvamai/core/http_sse/_exceptions.py +0 -7
  70. sarvamai/core/http_sse/_models.py +0 -17
  71. {sarvamai-0.1.23a3.dist-info → sarvamai-0.1.23a5.dist-info}/WHEEL +0 -0
@@ -6,6 +6,7 @@ from .. import core
6
6
  from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
7
7
  from ..core.request_options import RequestOptions
8
8
  from ..types.input_audio_codec import InputAudioCodec
9
+ from ..types.mode import Mode
9
10
  from ..types.speech_to_text_language import SpeechToTextLanguage
10
11
  from ..types.speech_to_text_model import SpeechToTextModel
11
12
  from ..types.speech_to_text_response import SpeechToTextResponse
@@ -37,6 +38,7 @@ class SpeechToTextClient:
37
38
  *,
38
39
  file: core.File,
39
40
  model: typing.Optional[SpeechToTextModel] = OMIT,
41
+ mode: typing.Optional[Mode] = OMIT,
40
42
  language_code: typing.Optional[SpeechToTextLanguage] = OMIT,
41
43
  input_audio_codec: typing.Optional[InputAudioCodec] = OMIT,
42
44
  request_options: typing.Optional[RequestOptions] = None,
@@ -63,12 +65,49 @@ class SpeechToTextClient:
63
65
 
64
66
  model : typing.Optional[SpeechToTextModel]
65
67
  Specifies the model to use for speech-to-text conversion.
66
- Note:- Default model is `saarika:v2.5`
68
+
69
+ - **saarika:v2.5** (default): Transcribes audio in the spoken language.
70
+
71
+ - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
72
+
73
+ mode : typing.Optional[Mode]
74
+ Mode of operation. **Only applicable when using saaras:v3 model.**
75
+
76
+ Example audio: 'मेरा फोन नंबर है 9840950950'
77
+
78
+ - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
79
+ - Output: `मेरा फोन नंबर है 9840950950`
80
+
81
+ - **translate**: Translates speech from any supported Indic language to English.
82
+ - Output: `My phone number is 9840950950`
83
+
84
+ - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
85
+ - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
86
+
87
+ - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
88
+ - Output: `mera phone number hai 9840950950`
89
+
90
+ - **codemix**: Code-mixed text with English words in English and Indic words in native script.
91
+ - Output: `मेरा phone number है 9840950950`
67
92
 
68
93
  language_code : typing.Optional[SpeechToTextLanguage]
69
- Specifies the language of the input audio.
70
- For the `saarika:v2.5` model, it is optional.
71
- `unknown`: Use this when the language is not known; the API will detect it automatically.
94
+ Specifies the language of the input audio in BCP-47 format.
95
+
96
+ **Note:** This parameter is optional for `saarika:v2.5` model.
97
+
98
+ **Available Options:**
99
+ - `unknown`: Use when the language is not known; the API will auto-detect.
100
+ - `hi-IN`: Hindi
101
+ - `bn-IN`: Bengali
102
+ - `kn-IN`: Kannada
103
+ - `ml-IN`: Malayalam
104
+ - `mr-IN`: Marathi
105
+ - `od-IN`: Odia
106
+ - `pa-IN`: Punjabi
107
+ - `ta-IN`: Tamil
108
+ - `te-IN`: Telugu
109
+ - `en-IN`: English
110
+ - `gu-IN`: Gujarati
72
111
 
73
112
  input_audio_codec : typing.Optional[InputAudioCodec]
74
113
  Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.
@@ -93,6 +132,7 @@ class SpeechToTextClient:
93
132
  _response = self._raw_client.transcribe(
94
133
  file=file,
95
134
  model=model,
135
+ mode=mode,
96
136
  language_code=language_code,
97
137
  input_audio_codec=input_audio_codec,
98
138
  request_options=request_options,
@@ -132,7 +172,10 @@ class SpeechToTextClient:
132
172
  Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.
133
173
 
134
174
  model : typing.Optional[SpeechToTextTranslateModel]
135
- Model to be used for converting speech to text in target language
175
+ Model to be used for speech to text translation.
176
+
177
+ - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
178
+ - Example: Hindi audio → English text output
136
179
 
137
180
  input_audio_codec : typing.Optional[InputAudioCodec]
138
181
  Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.
@@ -180,6 +223,7 @@ class AsyncSpeechToTextClient:
180
223
  *,
181
224
  file: core.File,
182
225
  model: typing.Optional[SpeechToTextModel] = OMIT,
226
+ mode: typing.Optional[Mode] = OMIT,
183
227
  language_code: typing.Optional[SpeechToTextLanguage] = OMIT,
184
228
  input_audio_codec: typing.Optional[InputAudioCodec] = OMIT,
185
229
  request_options: typing.Optional[RequestOptions] = None,
@@ -206,12 +250,49 @@ class AsyncSpeechToTextClient:
206
250
 
207
251
  model : typing.Optional[SpeechToTextModel]
208
252
  Specifies the model to use for speech-to-text conversion.
209
- Note:- Default model is `saarika:v2.5`
253
+
254
+ - **saarika:v2.5** (default): Transcribes audio in the spoken language.
255
+
256
+ - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
257
+
258
+ mode : typing.Optional[Mode]
259
+ Mode of operation. **Only applicable when using saaras:v3 model.**
260
+
261
+ Example audio: 'मेरा फोन नंबर है 9840950950'
262
+
263
+ - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
264
+ - Output: `मेरा फोन नंबर है 9840950950`
265
+
266
+ - **translate**: Translates speech from any supported Indic language to English.
267
+ - Output: `My phone number is 9840950950`
268
+
269
+ - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
270
+ - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
271
+
272
+ - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
273
+ - Output: `mera phone number hai 9840950950`
274
+
275
+ - **codemix**: Code-mixed text with English words in English and Indic words in native script.
276
+ - Output: `मेरा phone number है 9840950950`
210
277
 
211
278
  language_code : typing.Optional[SpeechToTextLanguage]
212
- Specifies the language of the input audio.
213
- For the `saarika:v2.5` model, it is optional.
214
- `unknown`: Use this when the language is not known; the API will detect it automatically.
279
+ Specifies the language of the input audio in BCP-47 format.
280
+
281
+ **Note:** This parameter is optional for `saarika:v2.5` model.
282
+
283
+ **Available Options:**
284
+ - `unknown`: Use when the language is not known; the API will auto-detect.
285
+ - `hi-IN`: Hindi
286
+ - `bn-IN`: Bengali
287
+ - `kn-IN`: Kannada
288
+ - `ml-IN`: Malayalam
289
+ - `mr-IN`: Marathi
290
+ - `od-IN`: Odia
291
+ - `pa-IN`: Punjabi
292
+ - `ta-IN`: Tamil
293
+ - `te-IN`: Telugu
294
+ - `en-IN`: English
295
+ - `gu-IN`: Gujarati
215
296
 
216
297
  input_audio_codec : typing.Optional[InputAudioCodec]
217
298
  Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.
@@ -244,6 +325,7 @@ class AsyncSpeechToTextClient:
244
325
  _response = await self._raw_client.transcribe(
245
326
  file=file,
246
327
  model=model,
328
+ mode=mode,
247
329
  language_code=language_code,
248
330
  input_audio_codec=input_audio_codec,
249
331
  request_options=request_options,
@@ -283,7 +365,10 @@ class AsyncSpeechToTextClient:
283
365
  Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.
284
366
 
285
367
  model : typing.Optional[SpeechToTextTranslateModel]
286
- Model to be used for converting speech to text in target language
368
+ Model to be used for speech to text translation.
369
+
370
+ - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
371
+ - Example: Hindi audio → English text output
287
372
 
288
373
  input_audio_codec : typing.Optional[InputAudioCodec]
289
374
  Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.
@@ -1,6 +1,5 @@
1
1
  # This file was auto-generated by Fern from our API Definition.
2
2
 
3
- import json
4
3
  import typing
5
4
  from json.decoder import JSONDecodeError
6
5
 
@@ -8,7 +7,6 @@ from .. import core
8
7
  from ..core.api_error import ApiError
9
8
  from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
10
9
  from ..core.http_response import AsyncHttpResponse, HttpResponse
11
- from ..core.jsonable_encoder import jsonable_encoder
12
10
  from ..core.pydantic_utilities import parse_obj_as
13
11
  from ..core.request_options import RequestOptions
14
12
  from ..errors.bad_request_error import BadRequestError
@@ -18,6 +16,7 @@ from ..errors.service_unavailable_error import ServiceUnavailableError
18
16
  from ..errors.too_many_requests_error import TooManyRequestsError
19
17
  from ..errors.unprocessable_entity_error import UnprocessableEntityError
20
18
  from ..types.input_audio_codec import InputAudioCodec
19
+ from ..types.mode import Mode
21
20
  from ..types.speech_to_text_language import SpeechToTextLanguage
22
21
  from ..types.speech_to_text_model import SpeechToTextModel
23
22
  from ..types.speech_to_text_response import SpeechToTextResponse
@@ -37,6 +36,7 @@ class RawSpeechToTextClient:
37
36
  *,
38
37
  file: core.File,
39
38
  model: typing.Optional[SpeechToTextModel] = OMIT,
39
+ mode: typing.Optional[Mode] = OMIT,
40
40
  language_code: typing.Optional[SpeechToTextLanguage] = OMIT,
41
41
  input_audio_codec: typing.Optional[InputAudioCodec] = OMIT,
42
42
  request_options: typing.Optional[RequestOptions] = None,
@@ -63,12 +63,49 @@ class RawSpeechToTextClient:
63
63
 
64
64
  model : typing.Optional[SpeechToTextModel]
65
65
  Specifies the model to use for speech-to-text conversion.
66
- Note:- Default model is `saarika:v2.5`
66
+
67
+ - **saarika:v2.5** (default): Transcribes audio in the spoken language.
68
+
69
+ - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
70
+
71
+ mode : typing.Optional[Mode]
72
+ Mode of operation. **Only applicable when using saaras:v3 model.**
73
+
74
+ Example audio: 'मेरा फोन नंबर है 9840950950'
75
+
76
+ - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
77
+ - Output: `मेरा फोन नंबर है 9840950950`
78
+
79
+ - **translate**: Translates speech from any supported Indic language to English.
80
+ - Output: `My phone number is 9840950950`
81
+
82
+ - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
83
+ - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
84
+
85
+ - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
86
+ - Output: `mera phone number hai 9840950950`
87
+
88
+ - **codemix**: Code-mixed text with English words in English and Indic words in native script.
89
+ - Output: `मेरा phone number है 9840950950`
67
90
 
68
91
  language_code : typing.Optional[SpeechToTextLanguage]
69
- Specifies the language of the input audio.
70
- For the `saarika:v2.5` model, it is optional.
71
- `unknown`: Use this when the language is not known; the API will detect it automatically.
92
+ Specifies the language of the input audio in BCP-47 format.
93
+
94
+ **Note:** This parameter is optional for `saarika:v2.5` model.
95
+
96
+ **Available Options:**
97
+ - `unknown`: Use when the language is not known; the API will auto-detect.
98
+ - `hi-IN`: Hindi
99
+ - `bn-IN`: Bengali
100
+ - `kn-IN`: Kannada
101
+ - `ml-IN`: Malayalam
102
+ - `mr-IN`: Marathi
103
+ - `od-IN`: Odia
104
+ - `pa-IN`: Punjabi
105
+ - `ta-IN`: Tamil
106
+ - `te-IN`: Telugu
107
+ - `en-IN`: English
108
+ - `gu-IN`: Gujarati
72
109
 
73
110
  input_audio_codec : typing.Optional[InputAudioCodec]
74
111
  Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.
@@ -86,7 +123,8 @@ class RawSpeechToTextClient:
86
123
  base_url=self._client_wrapper.get_environment().base,
87
124
  method="POST",
88
125
  data={
89
- "model": json.dumps(jsonable_encoder(model)),
126
+ "model": model,
127
+ "mode": mode,
90
128
  "language_code": language_code,
91
129
  "input_audio_codec": input_audio_codec,
92
130
  },
@@ -111,9 +149,9 @@ class RawSpeechToTextClient:
111
149
  raise BadRequestError(
112
150
  headers=dict(_response.headers),
113
151
  body=typing.cast(
114
- typing.Any,
152
+ typing.Optional[typing.Any],
115
153
  parse_obj_as(
116
- type_=typing.Any, # type: ignore
154
+ type_=typing.Optional[typing.Any], # type: ignore
117
155
  object_=_response.json(),
118
156
  ),
119
157
  ),
@@ -122,9 +160,9 @@ class RawSpeechToTextClient:
122
160
  raise ForbiddenError(
123
161
  headers=dict(_response.headers),
124
162
  body=typing.cast(
125
- typing.Any,
163
+ typing.Optional[typing.Any],
126
164
  parse_obj_as(
127
- type_=typing.Any, # type: ignore
165
+ type_=typing.Optional[typing.Any], # type: ignore
128
166
  object_=_response.json(),
129
167
  ),
130
168
  ),
@@ -133,9 +171,9 @@ class RawSpeechToTextClient:
133
171
  raise UnprocessableEntityError(
134
172
  headers=dict(_response.headers),
135
173
  body=typing.cast(
136
- typing.Any,
174
+ typing.Optional[typing.Any],
137
175
  parse_obj_as(
138
- type_=typing.Any, # type: ignore
176
+ type_=typing.Optional[typing.Any], # type: ignore
139
177
  object_=_response.json(),
140
178
  ),
141
179
  ),
@@ -144,9 +182,9 @@ class RawSpeechToTextClient:
144
182
  raise TooManyRequestsError(
145
183
  headers=dict(_response.headers),
146
184
  body=typing.cast(
147
- typing.Any,
185
+ typing.Optional[typing.Any],
148
186
  parse_obj_as(
149
- type_=typing.Any, # type: ignore
187
+ type_=typing.Optional[typing.Any], # type: ignore
150
188
  object_=_response.json(),
151
189
  ),
152
190
  ),
@@ -155,9 +193,9 @@ class RawSpeechToTextClient:
155
193
  raise InternalServerError(
156
194
  headers=dict(_response.headers),
157
195
  body=typing.cast(
158
- typing.Any,
196
+ typing.Optional[typing.Any],
159
197
  parse_obj_as(
160
- type_=typing.Any, # type: ignore
198
+ type_=typing.Optional[typing.Any], # type: ignore
161
199
  object_=_response.json(),
162
200
  ),
163
201
  ),
@@ -166,9 +204,9 @@ class RawSpeechToTextClient:
166
204
  raise ServiceUnavailableError(
167
205
  headers=dict(_response.headers),
168
206
  body=typing.cast(
169
- typing.Any,
207
+ typing.Optional[typing.Any],
170
208
  parse_obj_as(
171
- type_=typing.Any, # type: ignore
209
+ type_=typing.Optional[typing.Any], # type: ignore
172
210
  object_=_response.json(),
173
211
  ),
174
212
  ),
@@ -211,7 +249,10 @@ class RawSpeechToTextClient:
211
249
  Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.
212
250
 
213
251
  model : typing.Optional[SpeechToTextTranslateModel]
214
- Model to be used for converting speech to text in target language
252
+ Model to be used for speech to text translation.
253
+
254
+ - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
255
+ - Example: Hindi audio → English text output
215
256
 
216
257
  input_audio_codec : typing.Optional[InputAudioCodec]
217
258
  Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.
@@ -230,7 +271,7 @@ class RawSpeechToTextClient:
230
271
  method="POST",
231
272
  data={
232
273
  "prompt": prompt,
233
- "model": json.dumps(jsonable_encoder(model)),
274
+ "model": model,
234
275
  "input_audio_codec": input_audio_codec,
235
276
  },
236
277
  files={
@@ -254,9 +295,9 @@ class RawSpeechToTextClient:
254
295
  raise BadRequestError(
255
296
  headers=dict(_response.headers),
256
297
  body=typing.cast(
257
- typing.Any,
298
+ typing.Optional[typing.Any],
258
299
  parse_obj_as(
259
- type_=typing.Any, # type: ignore
300
+ type_=typing.Optional[typing.Any], # type: ignore
260
301
  object_=_response.json(),
261
302
  ),
262
303
  ),
@@ -265,9 +306,9 @@ class RawSpeechToTextClient:
265
306
  raise ForbiddenError(
266
307
  headers=dict(_response.headers),
267
308
  body=typing.cast(
268
- typing.Any,
309
+ typing.Optional[typing.Any],
269
310
  parse_obj_as(
270
- type_=typing.Any, # type: ignore
311
+ type_=typing.Optional[typing.Any], # type: ignore
271
312
  object_=_response.json(),
272
313
  ),
273
314
  ),
@@ -276,9 +317,9 @@ class RawSpeechToTextClient:
276
317
  raise UnprocessableEntityError(
277
318
  headers=dict(_response.headers),
278
319
  body=typing.cast(
279
- typing.Any,
320
+ typing.Optional[typing.Any],
280
321
  parse_obj_as(
281
- type_=typing.Any, # type: ignore
322
+ type_=typing.Optional[typing.Any], # type: ignore
282
323
  object_=_response.json(),
283
324
  ),
284
325
  ),
@@ -287,9 +328,9 @@ class RawSpeechToTextClient:
287
328
  raise TooManyRequestsError(
288
329
  headers=dict(_response.headers),
289
330
  body=typing.cast(
290
- typing.Any,
331
+ typing.Optional[typing.Any],
291
332
  parse_obj_as(
292
- type_=typing.Any, # type: ignore
333
+ type_=typing.Optional[typing.Any], # type: ignore
293
334
  object_=_response.json(),
294
335
  ),
295
336
  ),
@@ -298,9 +339,9 @@ class RawSpeechToTextClient:
298
339
  raise InternalServerError(
299
340
  headers=dict(_response.headers),
300
341
  body=typing.cast(
301
- typing.Any,
342
+ typing.Optional[typing.Any],
302
343
  parse_obj_as(
303
- type_=typing.Any, # type: ignore
344
+ type_=typing.Optional[typing.Any], # type: ignore
304
345
  object_=_response.json(),
305
346
  ),
306
347
  ),
@@ -309,9 +350,9 @@ class RawSpeechToTextClient:
309
350
  raise ServiceUnavailableError(
310
351
  headers=dict(_response.headers),
311
352
  body=typing.cast(
312
- typing.Any,
353
+ typing.Optional[typing.Any],
313
354
  parse_obj_as(
314
- type_=typing.Any, # type: ignore
355
+ type_=typing.Optional[typing.Any], # type: ignore
315
356
  object_=_response.json(),
316
357
  ),
317
358
  ),
@@ -331,6 +372,7 @@ class AsyncRawSpeechToTextClient:
331
372
  *,
332
373
  file: core.File,
333
374
  model: typing.Optional[SpeechToTextModel] = OMIT,
375
+ mode: typing.Optional[Mode] = OMIT,
334
376
  language_code: typing.Optional[SpeechToTextLanguage] = OMIT,
335
377
  input_audio_codec: typing.Optional[InputAudioCodec] = OMIT,
336
378
  request_options: typing.Optional[RequestOptions] = None,
@@ -357,12 +399,49 @@ class AsyncRawSpeechToTextClient:
357
399
 
358
400
  model : typing.Optional[SpeechToTextModel]
359
401
  Specifies the model to use for speech-to-text conversion.
360
- Note:- Default model is `saarika:v2.5`
402
+
403
+ - **saarika:v2.5** (default): Transcribes audio in the spoken language.
404
+
405
+ - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
406
+
407
+ mode : typing.Optional[Mode]
408
+ Mode of operation. **Only applicable when using saaras:v3 model.**
409
+
410
+ Example audio: 'मेरा फोन नंबर है 9840950950'
411
+
412
+ - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
413
+ - Output: `मेरा फोन नंबर है 9840950950`
414
+
415
+ - **translate**: Translates speech from any supported Indic language to English.
416
+ - Output: `My phone number is 9840950950`
417
+
418
+ - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
419
+ - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
420
+
421
+ - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
422
+ - Output: `mera phone number hai 9840950950`
423
+
424
+ - **codemix**: Code-mixed text with English words in English and Indic words in native script.
425
+ - Output: `मेरा phone number है 9840950950`
361
426
 
362
427
  language_code : typing.Optional[SpeechToTextLanguage]
363
- Specifies the language of the input audio.
364
- For the `saarika:v2.5` model, it is optional.
365
- `unknown`: Use this when the language is not known; the API will detect it automatically.
428
+ Specifies the language of the input audio in BCP-47 format.
429
+
430
+ **Note:** This parameter is optional for `saarika:v2.5` model.
431
+
432
+ **Available Options:**
433
+ - `unknown`: Use when the language is not known; the API will auto-detect.
434
+ - `hi-IN`: Hindi
435
+ - `bn-IN`: Bengali
436
+ - `kn-IN`: Kannada
437
+ - `ml-IN`: Malayalam
438
+ - `mr-IN`: Marathi
439
+ - `od-IN`: Odia
440
+ - `pa-IN`: Punjabi
441
+ - `ta-IN`: Tamil
442
+ - `te-IN`: Telugu
443
+ - `en-IN`: English
444
+ - `gu-IN`: Gujarati
366
445
 
367
446
  input_audio_codec : typing.Optional[InputAudioCodec]
368
447
  Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.
@@ -380,7 +459,8 @@ class AsyncRawSpeechToTextClient:
380
459
  base_url=self._client_wrapper.get_environment().base,
381
460
  method="POST",
382
461
  data={
383
- "model": json.dumps(jsonable_encoder(model)),
462
+ "model": model,
463
+ "mode": mode,
384
464
  "language_code": language_code,
385
465
  "input_audio_codec": input_audio_codec,
386
466
  },
@@ -405,9 +485,9 @@ class AsyncRawSpeechToTextClient:
405
485
  raise BadRequestError(
406
486
  headers=dict(_response.headers),
407
487
  body=typing.cast(
408
- typing.Any,
488
+ typing.Optional[typing.Any],
409
489
  parse_obj_as(
410
- type_=typing.Any, # type: ignore
490
+ type_=typing.Optional[typing.Any], # type: ignore
411
491
  object_=_response.json(),
412
492
  ),
413
493
  ),
@@ -416,9 +496,9 @@ class AsyncRawSpeechToTextClient:
416
496
  raise ForbiddenError(
417
497
  headers=dict(_response.headers),
418
498
  body=typing.cast(
419
- typing.Any,
499
+ typing.Optional[typing.Any],
420
500
  parse_obj_as(
421
- type_=typing.Any, # type: ignore
501
+ type_=typing.Optional[typing.Any], # type: ignore
422
502
  object_=_response.json(),
423
503
  ),
424
504
  ),
@@ -427,9 +507,9 @@ class AsyncRawSpeechToTextClient:
427
507
  raise UnprocessableEntityError(
428
508
  headers=dict(_response.headers),
429
509
  body=typing.cast(
430
- typing.Any,
510
+ typing.Optional[typing.Any],
431
511
  parse_obj_as(
432
- type_=typing.Any, # type: ignore
512
+ type_=typing.Optional[typing.Any], # type: ignore
433
513
  object_=_response.json(),
434
514
  ),
435
515
  ),
@@ -438,9 +518,9 @@ class AsyncRawSpeechToTextClient:
438
518
  raise TooManyRequestsError(
439
519
  headers=dict(_response.headers),
440
520
  body=typing.cast(
441
- typing.Any,
521
+ typing.Optional[typing.Any],
442
522
  parse_obj_as(
443
- type_=typing.Any, # type: ignore
523
+ type_=typing.Optional[typing.Any], # type: ignore
444
524
  object_=_response.json(),
445
525
  ),
446
526
  ),
@@ -449,9 +529,9 @@ class AsyncRawSpeechToTextClient:
449
529
  raise InternalServerError(
450
530
  headers=dict(_response.headers),
451
531
  body=typing.cast(
452
- typing.Any,
532
+ typing.Optional[typing.Any],
453
533
  parse_obj_as(
454
- type_=typing.Any, # type: ignore
534
+ type_=typing.Optional[typing.Any], # type: ignore
455
535
  object_=_response.json(),
456
536
  ),
457
537
  ),
@@ -460,9 +540,9 @@ class AsyncRawSpeechToTextClient:
460
540
  raise ServiceUnavailableError(
461
541
  headers=dict(_response.headers),
462
542
  body=typing.cast(
463
- typing.Any,
543
+ typing.Optional[typing.Any],
464
544
  parse_obj_as(
465
- type_=typing.Any, # type: ignore
545
+ type_=typing.Optional[typing.Any], # type: ignore
466
546
  object_=_response.json(),
467
547
  ),
468
548
  ),
@@ -505,7 +585,10 @@ class AsyncRawSpeechToTextClient:
505
585
  Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.
506
586
 
507
587
  model : typing.Optional[SpeechToTextTranslateModel]
508
- Model to be used for converting speech to text in target language
588
+ Model to be used for speech to text translation.
589
+
590
+ - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
591
+ - Example: Hindi audio → English text output
509
592
 
510
593
  input_audio_codec : typing.Optional[InputAudioCodec]
511
594
  Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.
@@ -524,7 +607,7 @@ class AsyncRawSpeechToTextClient:
524
607
  method="POST",
525
608
  data={
526
609
  "prompt": prompt,
527
- "model": json.dumps(jsonable_encoder(model)),
610
+ "model": model,
528
611
  "input_audio_codec": input_audio_codec,
529
612
  },
530
613
  files={
@@ -548,9 +631,9 @@ class AsyncRawSpeechToTextClient:
548
631
  raise BadRequestError(
549
632
  headers=dict(_response.headers),
550
633
  body=typing.cast(
551
- typing.Any,
634
+ typing.Optional[typing.Any],
552
635
  parse_obj_as(
553
- type_=typing.Any, # type: ignore
636
+ type_=typing.Optional[typing.Any], # type: ignore
554
637
  object_=_response.json(),
555
638
  ),
556
639
  ),
@@ -559,9 +642,9 @@ class AsyncRawSpeechToTextClient:
559
642
  raise ForbiddenError(
560
643
  headers=dict(_response.headers),
561
644
  body=typing.cast(
562
- typing.Any,
645
+ typing.Optional[typing.Any],
563
646
  parse_obj_as(
564
- type_=typing.Any, # type: ignore
647
+ type_=typing.Optional[typing.Any], # type: ignore
565
648
  object_=_response.json(),
566
649
  ),
567
650
  ),
@@ -570,9 +653,9 @@ class AsyncRawSpeechToTextClient:
570
653
  raise UnprocessableEntityError(
571
654
  headers=dict(_response.headers),
572
655
  body=typing.cast(
573
- typing.Any,
656
+ typing.Optional[typing.Any],
574
657
  parse_obj_as(
575
- type_=typing.Any, # type: ignore
658
+ type_=typing.Optional[typing.Any], # type: ignore
576
659
  object_=_response.json(),
577
660
  ),
578
661
  ),
@@ -581,9 +664,9 @@ class AsyncRawSpeechToTextClient:
581
664
  raise TooManyRequestsError(
582
665
  headers=dict(_response.headers),
583
666
  body=typing.cast(
584
- typing.Any,
667
+ typing.Optional[typing.Any],
585
668
  parse_obj_as(
586
- type_=typing.Any, # type: ignore
669
+ type_=typing.Optional[typing.Any], # type: ignore
587
670
  object_=_response.json(),
588
671
  ),
589
672
  ),
@@ -592,9 +675,9 @@ class AsyncRawSpeechToTextClient:
592
675
  raise InternalServerError(
593
676
  headers=dict(_response.headers),
594
677
  body=typing.cast(
595
- typing.Any,
678
+ typing.Optional[typing.Any],
596
679
  parse_obj_as(
597
- type_=typing.Any, # type: ignore
680
+ type_=typing.Optional[typing.Any], # type: ignore
598
681
  object_=_response.json(),
599
682
  ),
600
683
  ),
@@ -603,9 +686,9 @@ class AsyncRawSpeechToTextClient:
603
686
  raise ServiceUnavailableError(
604
687
  headers=dict(_response.headers),
605
688
  body=typing.cast(
606
- typing.Any,
689
+ typing.Optional[typing.Any],
607
690
  parse_obj_as(
608
- type_=typing.Any, # type: ignore
691
+ type_=typing.Optional[typing.Any], # type: ignore
609
692
  object_=_response.json(),
610
693
  ),
611
694
  ),