sarvamai 0.1.23a3__py3-none-any.whl → 0.1.23a5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sarvamai/__init__.py +203 -405
- sarvamai/chat/raw_client.py +20 -20
- sarvamai/client.py +34 -186
- sarvamai/core/__init__.py +21 -76
- sarvamai/core/client_wrapper.py +3 -19
- sarvamai/core/force_multipart.py +2 -4
- sarvamai/core/http_client.py +97 -217
- sarvamai/core/http_response.py +1 -1
- sarvamai/core/jsonable_encoder.py +0 -8
- sarvamai/core/pydantic_utilities.py +4 -110
- sarvamai/errors/__init__.py +6 -40
- sarvamai/errors/bad_request_error.py +1 -1
- sarvamai/errors/forbidden_error.py +1 -1
- sarvamai/errors/internal_server_error.py +1 -1
- sarvamai/errors/service_unavailable_error.py +1 -1
- sarvamai/errors/too_many_requests_error.py +1 -1
- sarvamai/errors/unprocessable_entity_error.py +1 -1
- sarvamai/requests/__init__.py +62 -150
- sarvamai/requests/configure_connection.py +4 -0
- sarvamai/requests/configure_connection_data.py +40 -11
- sarvamai/requests/error_response_data.py +1 -1
- sarvamai/requests/file_signed_url_details.py +1 -1
- sarvamai/requests/speech_to_text_job_parameters.py +43 -2
- sarvamai/requests/speech_to_text_transcription_data.py +2 -2
- sarvamai/requests/speech_to_text_translate_job_parameters.py +4 -1
- sarvamai/speech_to_text/client.py +95 -10
- sarvamai/speech_to_text/raw_client.py +147 -64
- sarvamai/speech_to_text_job/client.py +60 -15
- sarvamai/speech_to_text_job/raw_client.py +120 -120
- sarvamai/speech_to_text_streaming/__init__.py +10 -38
- sarvamai/speech_to_text_streaming/client.py +90 -8
- sarvamai/speech_to_text_streaming/raw_client.py +90 -8
- sarvamai/speech_to_text_streaming/types/__init__.py +8 -36
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_mode.py +7 -0
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_model.py +5 -0
- sarvamai/speech_to_text_translate_job/raw_client.py +120 -120
- sarvamai/speech_to_text_translate_streaming/__init__.py +5 -36
- sarvamai/speech_to_text_translate_streaming/client.py +8 -2
- sarvamai/speech_to_text_translate_streaming/raw_client.py +8 -2
- sarvamai/speech_to_text_translate_streaming/types/__init__.py +3 -36
- sarvamai/text/raw_client.py +60 -60
- sarvamai/text_to_speech/client.py +100 -16
- sarvamai/text_to_speech/raw_client.py +120 -36
- sarvamai/text_to_speech_streaming/__init__.py +2 -29
- sarvamai/text_to_speech_streaming/client.py +19 -6
- sarvamai/text_to_speech_streaming/raw_client.py +19 -6
- sarvamai/text_to_speech_streaming/types/__init__.py +3 -31
- sarvamai/text_to_speech_streaming/types/text_to_speech_streaming_model.py +5 -0
- sarvamai/types/__init__.py +102 -222
- sarvamai/types/chat_completion_request_message.py +2 -6
- sarvamai/types/configure_connection.py +4 -0
- sarvamai/types/configure_connection_data.py +40 -11
- sarvamai/types/configure_connection_data_model.py +5 -0
- sarvamai/types/configure_connection_data_speaker.py +35 -1
- sarvamai/types/error_response_data.py +1 -1
- sarvamai/types/file_signed_url_details.py +1 -1
- sarvamai/types/mode.py +5 -0
- sarvamai/types/speech_to_text_job_parameters.py +43 -2
- sarvamai/types/speech_to_text_model.py +1 -1
- sarvamai/types/speech_to_text_transcription_data.py +2 -2
- sarvamai/types/speech_to_text_translate_job_parameters.py +4 -1
- sarvamai/types/text_to_speech_model.py +1 -1
- sarvamai/types/text_to_speech_speaker.py +35 -1
- {sarvamai-0.1.23a3.dist-info → sarvamai-0.1.23a5.dist-info}/METADATA +1 -2
- {sarvamai-0.1.23a3.dist-info → sarvamai-0.1.23a5.dist-info}/RECORD +66 -66
- sarvamai/core/http_sse/__init__.py +0 -42
- sarvamai/core/http_sse/_api.py +0 -112
- sarvamai/core/http_sse/_decoders.py +0 -61
- sarvamai/core/http_sse/_exceptions.py +0 -7
- sarvamai/core/http_sse/_models.py +0 -17
- {sarvamai-0.1.23a3.dist-info → sarvamai-0.1.23a5.dist-info}/WHEEL +0 -0
|
@@ -6,6 +6,7 @@ from .. import core
|
|
|
6
6
|
from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
|
|
7
7
|
from ..core.request_options import RequestOptions
|
|
8
8
|
from ..types.input_audio_codec import InputAudioCodec
|
|
9
|
+
from ..types.mode import Mode
|
|
9
10
|
from ..types.speech_to_text_language import SpeechToTextLanguage
|
|
10
11
|
from ..types.speech_to_text_model import SpeechToTextModel
|
|
11
12
|
from ..types.speech_to_text_response import SpeechToTextResponse
|
|
@@ -37,6 +38,7 @@ class SpeechToTextClient:
|
|
|
37
38
|
*,
|
|
38
39
|
file: core.File,
|
|
39
40
|
model: typing.Optional[SpeechToTextModel] = OMIT,
|
|
41
|
+
mode: typing.Optional[Mode] = OMIT,
|
|
40
42
|
language_code: typing.Optional[SpeechToTextLanguage] = OMIT,
|
|
41
43
|
input_audio_codec: typing.Optional[InputAudioCodec] = OMIT,
|
|
42
44
|
request_options: typing.Optional[RequestOptions] = None,
|
|
@@ -63,12 +65,49 @@ class SpeechToTextClient:
|
|
|
63
65
|
|
|
64
66
|
model : typing.Optional[SpeechToTextModel]
|
|
65
67
|
Specifies the model to use for speech-to-text conversion.
|
|
66
|
-
|
|
68
|
+
|
|
69
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
70
|
+
|
|
71
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
72
|
+
|
|
73
|
+
mode : typing.Optional[Mode]
|
|
74
|
+
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
75
|
+
|
|
76
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
77
|
+
|
|
78
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
79
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
80
|
+
|
|
81
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
82
|
+
- Output: `My phone number is 9840950950`
|
|
83
|
+
|
|
84
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
85
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
86
|
+
|
|
87
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
88
|
+
- Output: `mera phone number hai 9840950950`
|
|
89
|
+
|
|
90
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
91
|
+
- Output: `मेरा phone number है 9840950950`
|
|
67
92
|
|
|
68
93
|
language_code : typing.Optional[SpeechToTextLanguage]
|
|
69
|
-
Specifies the language of the input audio.
|
|
70
|
-
|
|
71
|
-
|
|
94
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
95
|
+
|
|
96
|
+
**Note:** This parameter is optional for `saarika:v2.5` model.
|
|
97
|
+
|
|
98
|
+
**Available Options:**
|
|
99
|
+
- `unknown`: Use when the language is not known; the API will auto-detect.
|
|
100
|
+
- `hi-IN`: Hindi
|
|
101
|
+
- `bn-IN`: Bengali
|
|
102
|
+
- `kn-IN`: Kannada
|
|
103
|
+
- `ml-IN`: Malayalam
|
|
104
|
+
- `mr-IN`: Marathi
|
|
105
|
+
- `od-IN`: Odia
|
|
106
|
+
- `pa-IN`: Punjabi
|
|
107
|
+
- `ta-IN`: Tamil
|
|
108
|
+
- `te-IN`: Telugu
|
|
109
|
+
- `en-IN`: English
|
|
110
|
+
- `gu-IN`: Gujarati
|
|
72
111
|
|
|
73
112
|
input_audio_codec : typing.Optional[InputAudioCodec]
|
|
74
113
|
Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.
|
|
@@ -93,6 +132,7 @@ class SpeechToTextClient:
|
|
|
93
132
|
_response = self._raw_client.transcribe(
|
|
94
133
|
file=file,
|
|
95
134
|
model=model,
|
|
135
|
+
mode=mode,
|
|
96
136
|
language_code=language_code,
|
|
97
137
|
input_audio_codec=input_audio_codec,
|
|
98
138
|
request_options=request_options,
|
|
@@ -132,7 +172,10 @@ class SpeechToTextClient:
|
|
|
132
172
|
Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.
|
|
133
173
|
|
|
134
174
|
model : typing.Optional[SpeechToTextTranslateModel]
|
|
135
|
-
Model to be used for
|
|
175
|
+
Model to be used for speech to text translation.
|
|
176
|
+
|
|
177
|
+
- **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
|
|
178
|
+
- Example: Hindi audio → English text output
|
|
136
179
|
|
|
137
180
|
input_audio_codec : typing.Optional[InputAudioCodec]
|
|
138
181
|
Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.
|
|
@@ -180,6 +223,7 @@ class AsyncSpeechToTextClient:
|
|
|
180
223
|
*,
|
|
181
224
|
file: core.File,
|
|
182
225
|
model: typing.Optional[SpeechToTextModel] = OMIT,
|
|
226
|
+
mode: typing.Optional[Mode] = OMIT,
|
|
183
227
|
language_code: typing.Optional[SpeechToTextLanguage] = OMIT,
|
|
184
228
|
input_audio_codec: typing.Optional[InputAudioCodec] = OMIT,
|
|
185
229
|
request_options: typing.Optional[RequestOptions] = None,
|
|
@@ -206,12 +250,49 @@ class AsyncSpeechToTextClient:
|
|
|
206
250
|
|
|
207
251
|
model : typing.Optional[SpeechToTextModel]
|
|
208
252
|
Specifies the model to use for speech-to-text conversion.
|
|
209
|
-
|
|
253
|
+
|
|
254
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
255
|
+
|
|
256
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
257
|
+
|
|
258
|
+
mode : typing.Optional[Mode]
|
|
259
|
+
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
260
|
+
|
|
261
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
262
|
+
|
|
263
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
264
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
265
|
+
|
|
266
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
267
|
+
- Output: `My phone number is 9840950950`
|
|
268
|
+
|
|
269
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
270
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
271
|
+
|
|
272
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
273
|
+
- Output: `mera phone number hai 9840950950`
|
|
274
|
+
|
|
275
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
276
|
+
- Output: `मेरा phone number है 9840950950`
|
|
210
277
|
|
|
211
278
|
language_code : typing.Optional[SpeechToTextLanguage]
|
|
212
|
-
Specifies the language of the input audio.
|
|
213
|
-
|
|
214
|
-
|
|
279
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
280
|
+
|
|
281
|
+
**Note:** This parameter is optional for `saarika:v2.5` model.
|
|
282
|
+
|
|
283
|
+
**Available Options:**
|
|
284
|
+
- `unknown`: Use when the language is not known; the API will auto-detect.
|
|
285
|
+
- `hi-IN`: Hindi
|
|
286
|
+
- `bn-IN`: Bengali
|
|
287
|
+
- `kn-IN`: Kannada
|
|
288
|
+
- `ml-IN`: Malayalam
|
|
289
|
+
- `mr-IN`: Marathi
|
|
290
|
+
- `od-IN`: Odia
|
|
291
|
+
- `pa-IN`: Punjabi
|
|
292
|
+
- `ta-IN`: Tamil
|
|
293
|
+
- `te-IN`: Telugu
|
|
294
|
+
- `en-IN`: English
|
|
295
|
+
- `gu-IN`: Gujarati
|
|
215
296
|
|
|
216
297
|
input_audio_codec : typing.Optional[InputAudioCodec]
|
|
217
298
|
Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.
|
|
@@ -244,6 +325,7 @@ class AsyncSpeechToTextClient:
|
|
|
244
325
|
_response = await self._raw_client.transcribe(
|
|
245
326
|
file=file,
|
|
246
327
|
model=model,
|
|
328
|
+
mode=mode,
|
|
247
329
|
language_code=language_code,
|
|
248
330
|
input_audio_codec=input_audio_codec,
|
|
249
331
|
request_options=request_options,
|
|
@@ -283,7 +365,10 @@ class AsyncSpeechToTextClient:
|
|
|
283
365
|
Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.
|
|
284
366
|
|
|
285
367
|
model : typing.Optional[SpeechToTextTranslateModel]
|
|
286
|
-
Model to be used for
|
|
368
|
+
Model to be used for speech to text translation.
|
|
369
|
+
|
|
370
|
+
- **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
|
|
371
|
+
- Example: Hindi audio → English text output
|
|
287
372
|
|
|
288
373
|
input_audio_codec : typing.Optional[InputAudioCodec]
|
|
289
374
|
Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
# This file was auto-generated by Fern from our API Definition.
|
|
2
2
|
|
|
3
|
-
import json
|
|
4
3
|
import typing
|
|
5
4
|
from json.decoder import JSONDecodeError
|
|
6
5
|
|
|
@@ -8,7 +7,6 @@ from .. import core
|
|
|
8
7
|
from ..core.api_error import ApiError
|
|
9
8
|
from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
|
|
10
9
|
from ..core.http_response import AsyncHttpResponse, HttpResponse
|
|
11
|
-
from ..core.jsonable_encoder import jsonable_encoder
|
|
12
10
|
from ..core.pydantic_utilities import parse_obj_as
|
|
13
11
|
from ..core.request_options import RequestOptions
|
|
14
12
|
from ..errors.bad_request_error import BadRequestError
|
|
@@ -18,6 +16,7 @@ from ..errors.service_unavailable_error import ServiceUnavailableError
|
|
|
18
16
|
from ..errors.too_many_requests_error import TooManyRequestsError
|
|
19
17
|
from ..errors.unprocessable_entity_error import UnprocessableEntityError
|
|
20
18
|
from ..types.input_audio_codec import InputAudioCodec
|
|
19
|
+
from ..types.mode import Mode
|
|
21
20
|
from ..types.speech_to_text_language import SpeechToTextLanguage
|
|
22
21
|
from ..types.speech_to_text_model import SpeechToTextModel
|
|
23
22
|
from ..types.speech_to_text_response import SpeechToTextResponse
|
|
@@ -37,6 +36,7 @@ class RawSpeechToTextClient:
|
|
|
37
36
|
*,
|
|
38
37
|
file: core.File,
|
|
39
38
|
model: typing.Optional[SpeechToTextModel] = OMIT,
|
|
39
|
+
mode: typing.Optional[Mode] = OMIT,
|
|
40
40
|
language_code: typing.Optional[SpeechToTextLanguage] = OMIT,
|
|
41
41
|
input_audio_codec: typing.Optional[InputAudioCodec] = OMIT,
|
|
42
42
|
request_options: typing.Optional[RequestOptions] = None,
|
|
@@ -63,12 +63,49 @@ class RawSpeechToTextClient:
|
|
|
63
63
|
|
|
64
64
|
model : typing.Optional[SpeechToTextModel]
|
|
65
65
|
Specifies the model to use for speech-to-text conversion.
|
|
66
|
-
|
|
66
|
+
|
|
67
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
68
|
+
|
|
69
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
70
|
+
|
|
71
|
+
mode : typing.Optional[Mode]
|
|
72
|
+
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
73
|
+
|
|
74
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
75
|
+
|
|
76
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
77
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
78
|
+
|
|
79
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
80
|
+
- Output: `My phone number is 9840950950`
|
|
81
|
+
|
|
82
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
83
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
84
|
+
|
|
85
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
86
|
+
- Output: `mera phone number hai 9840950950`
|
|
87
|
+
|
|
88
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
89
|
+
- Output: `मेरा phone number है 9840950950`
|
|
67
90
|
|
|
68
91
|
language_code : typing.Optional[SpeechToTextLanguage]
|
|
69
|
-
Specifies the language of the input audio.
|
|
70
|
-
|
|
71
|
-
|
|
92
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
93
|
+
|
|
94
|
+
**Note:** This parameter is optional for `saarika:v2.5` model.
|
|
95
|
+
|
|
96
|
+
**Available Options:**
|
|
97
|
+
- `unknown`: Use when the language is not known; the API will auto-detect.
|
|
98
|
+
- `hi-IN`: Hindi
|
|
99
|
+
- `bn-IN`: Bengali
|
|
100
|
+
- `kn-IN`: Kannada
|
|
101
|
+
- `ml-IN`: Malayalam
|
|
102
|
+
- `mr-IN`: Marathi
|
|
103
|
+
- `od-IN`: Odia
|
|
104
|
+
- `pa-IN`: Punjabi
|
|
105
|
+
- `ta-IN`: Tamil
|
|
106
|
+
- `te-IN`: Telugu
|
|
107
|
+
- `en-IN`: English
|
|
108
|
+
- `gu-IN`: Gujarati
|
|
72
109
|
|
|
73
110
|
input_audio_codec : typing.Optional[InputAudioCodec]
|
|
74
111
|
Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.
|
|
@@ -86,7 +123,8 @@ class RawSpeechToTextClient:
|
|
|
86
123
|
base_url=self._client_wrapper.get_environment().base,
|
|
87
124
|
method="POST",
|
|
88
125
|
data={
|
|
89
|
-
"model":
|
|
126
|
+
"model": model,
|
|
127
|
+
"mode": mode,
|
|
90
128
|
"language_code": language_code,
|
|
91
129
|
"input_audio_codec": input_audio_codec,
|
|
92
130
|
},
|
|
@@ -111,9 +149,9 @@ class RawSpeechToTextClient:
|
|
|
111
149
|
raise BadRequestError(
|
|
112
150
|
headers=dict(_response.headers),
|
|
113
151
|
body=typing.cast(
|
|
114
|
-
typing.Any,
|
|
152
|
+
typing.Optional[typing.Any],
|
|
115
153
|
parse_obj_as(
|
|
116
|
-
type_=typing.Any, # type: ignore
|
|
154
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
117
155
|
object_=_response.json(),
|
|
118
156
|
),
|
|
119
157
|
),
|
|
@@ -122,9 +160,9 @@ class RawSpeechToTextClient:
|
|
|
122
160
|
raise ForbiddenError(
|
|
123
161
|
headers=dict(_response.headers),
|
|
124
162
|
body=typing.cast(
|
|
125
|
-
typing.Any,
|
|
163
|
+
typing.Optional[typing.Any],
|
|
126
164
|
parse_obj_as(
|
|
127
|
-
type_=typing.Any, # type: ignore
|
|
165
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
128
166
|
object_=_response.json(),
|
|
129
167
|
),
|
|
130
168
|
),
|
|
@@ -133,9 +171,9 @@ class RawSpeechToTextClient:
|
|
|
133
171
|
raise UnprocessableEntityError(
|
|
134
172
|
headers=dict(_response.headers),
|
|
135
173
|
body=typing.cast(
|
|
136
|
-
typing.Any,
|
|
174
|
+
typing.Optional[typing.Any],
|
|
137
175
|
parse_obj_as(
|
|
138
|
-
type_=typing.Any, # type: ignore
|
|
176
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
139
177
|
object_=_response.json(),
|
|
140
178
|
),
|
|
141
179
|
),
|
|
@@ -144,9 +182,9 @@ class RawSpeechToTextClient:
|
|
|
144
182
|
raise TooManyRequestsError(
|
|
145
183
|
headers=dict(_response.headers),
|
|
146
184
|
body=typing.cast(
|
|
147
|
-
typing.Any,
|
|
185
|
+
typing.Optional[typing.Any],
|
|
148
186
|
parse_obj_as(
|
|
149
|
-
type_=typing.Any, # type: ignore
|
|
187
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
150
188
|
object_=_response.json(),
|
|
151
189
|
),
|
|
152
190
|
),
|
|
@@ -155,9 +193,9 @@ class RawSpeechToTextClient:
|
|
|
155
193
|
raise InternalServerError(
|
|
156
194
|
headers=dict(_response.headers),
|
|
157
195
|
body=typing.cast(
|
|
158
|
-
typing.Any,
|
|
196
|
+
typing.Optional[typing.Any],
|
|
159
197
|
parse_obj_as(
|
|
160
|
-
type_=typing.Any, # type: ignore
|
|
198
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
161
199
|
object_=_response.json(),
|
|
162
200
|
),
|
|
163
201
|
),
|
|
@@ -166,9 +204,9 @@ class RawSpeechToTextClient:
|
|
|
166
204
|
raise ServiceUnavailableError(
|
|
167
205
|
headers=dict(_response.headers),
|
|
168
206
|
body=typing.cast(
|
|
169
|
-
typing.Any,
|
|
207
|
+
typing.Optional[typing.Any],
|
|
170
208
|
parse_obj_as(
|
|
171
|
-
type_=typing.Any, # type: ignore
|
|
209
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
172
210
|
object_=_response.json(),
|
|
173
211
|
),
|
|
174
212
|
),
|
|
@@ -211,7 +249,10 @@ class RawSpeechToTextClient:
|
|
|
211
249
|
Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.
|
|
212
250
|
|
|
213
251
|
model : typing.Optional[SpeechToTextTranslateModel]
|
|
214
|
-
Model to be used for
|
|
252
|
+
Model to be used for speech to text translation.
|
|
253
|
+
|
|
254
|
+
- **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
|
|
255
|
+
- Example: Hindi audio → English text output
|
|
215
256
|
|
|
216
257
|
input_audio_codec : typing.Optional[InputAudioCodec]
|
|
217
258
|
Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.
|
|
@@ -230,7 +271,7 @@ class RawSpeechToTextClient:
|
|
|
230
271
|
method="POST",
|
|
231
272
|
data={
|
|
232
273
|
"prompt": prompt,
|
|
233
|
-
"model":
|
|
274
|
+
"model": model,
|
|
234
275
|
"input_audio_codec": input_audio_codec,
|
|
235
276
|
},
|
|
236
277
|
files={
|
|
@@ -254,9 +295,9 @@ class RawSpeechToTextClient:
|
|
|
254
295
|
raise BadRequestError(
|
|
255
296
|
headers=dict(_response.headers),
|
|
256
297
|
body=typing.cast(
|
|
257
|
-
typing.Any,
|
|
298
|
+
typing.Optional[typing.Any],
|
|
258
299
|
parse_obj_as(
|
|
259
|
-
type_=typing.Any, # type: ignore
|
|
300
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
260
301
|
object_=_response.json(),
|
|
261
302
|
),
|
|
262
303
|
),
|
|
@@ -265,9 +306,9 @@ class RawSpeechToTextClient:
|
|
|
265
306
|
raise ForbiddenError(
|
|
266
307
|
headers=dict(_response.headers),
|
|
267
308
|
body=typing.cast(
|
|
268
|
-
typing.Any,
|
|
309
|
+
typing.Optional[typing.Any],
|
|
269
310
|
parse_obj_as(
|
|
270
|
-
type_=typing.Any, # type: ignore
|
|
311
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
271
312
|
object_=_response.json(),
|
|
272
313
|
),
|
|
273
314
|
),
|
|
@@ -276,9 +317,9 @@ class RawSpeechToTextClient:
|
|
|
276
317
|
raise UnprocessableEntityError(
|
|
277
318
|
headers=dict(_response.headers),
|
|
278
319
|
body=typing.cast(
|
|
279
|
-
typing.Any,
|
|
320
|
+
typing.Optional[typing.Any],
|
|
280
321
|
parse_obj_as(
|
|
281
|
-
type_=typing.Any, # type: ignore
|
|
322
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
282
323
|
object_=_response.json(),
|
|
283
324
|
),
|
|
284
325
|
),
|
|
@@ -287,9 +328,9 @@ class RawSpeechToTextClient:
|
|
|
287
328
|
raise TooManyRequestsError(
|
|
288
329
|
headers=dict(_response.headers),
|
|
289
330
|
body=typing.cast(
|
|
290
|
-
typing.Any,
|
|
331
|
+
typing.Optional[typing.Any],
|
|
291
332
|
parse_obj_as(
|
|
292
|
-
type_=typing.Any, # type: ignore
|
|
333
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
293
334
|
object_=_response.json(),
|
|
294
335
|
),
|
|
295
336
|
),
|
|
@@ -298,9 +339,9 @@ class RawSpeechToTextClient:
|
|
|
298
339
|
raise InternalServerError(
|
|
299
340
|
headers=dict(_response.headers),
|
|
300
341
|
body=typing.cast(
|
|
301
|
-
typing.Any,
|
|
342
|
+
typing.Optional[typing.Any],
|
|
302
343
|
parse_obj_as(
|
|
303
|
-
type_=typing.Any, # type: ignore
|
|
344
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
304
345
|
object_=_response.json(),
|
|
305
346
|
),
|
|
306
347
|
),
|
|
@@ -309,9 +350,9 @@ class RawSpeechToTextClient:
|
|
|
309
350
|
raise ServiceUnavailableError(
|
|
310
351
|
headers=dict(_response.headers),
|
|
311
352
|
body=typing.cast(
|
|
312
|
-
typing.Any,
|
|
353
|
+
typing.Optional[typing.Any],
|
|
313
354
|
parse_obj_as(
|
|
314
|
-
type_=typing.Any, # type: ignore
|
|
355
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
315
356
|
object_=_response.json(),
|
|
316
357
|
),
|
|
317
358
|
),
|
|
@@ -331,6 +372,7 @@ class AsyncRawSpeechToTextClient:
|
|
|
331
372
|
*,
|
|
332
373
|
file: core.File,
|
|
333
374
|
model: typing.Optional[SpeechToTextModel] = OMIT,
|
|
375
|
+
mode: typing.Optional[Mode] = OMIT,
|
|
334
376
|
language_code: typing.Optional[SpeechToTextLanguage] = OMIT,
|
|
335
377
|
input_audio_codec: typing.Optional[InputAudioCodec] = OMIT,
|
|
336
378
|
request_options: typing.Optional[RequestOptions] = None,
|
|
@@ -357,12 +399,49 @@ class AsyncRawSpeechToTextClient:
|
|
|
357
399
|
|
|
358
400
|
model : typing.Optional[SpeechToTextModel]
|
|
359
401
|
Specifies the model to use for speech-to-text conversion.
|
|
360
|
-
|
|
402
|
+
|
|
403
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
404
|
+
|
|
405
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
406
|
+
|
|
407
|
+
mode : typing.Optional[Mode]
|
|
408
|
+
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
409
|
+
|
|
410
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
411
|
+
|
|
412
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
413
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
414
|
+
|
|
415
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
416
|
+
- Output: `My phone number is 9840950950`
|
|
417
|
+
|
|
418
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
419
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
420
|
+
|
|
421
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
422
|
+
- Output: `mera phone number hai 9840950950`
|
|
423
|
+
|
|
424
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
425
|
+
- Output: `मेरा phone number है 9840950950`
|
|
361
426
|
|
|
362
427
|
language_code : typing.Optional[SpeechToTextLanguage]
|
|
363
|
-
Specifies the language of the input audio.
|
|
364
|
-
|
|
365
|
-
|
|
428
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
429
|
+
|
|
430
|
+
**Note:** This parameter is optional for `saarika:v2.5` model.
|
|
431
|
+
|
|
432
|
+
**Available Options:**
|
|
433
|
+
- `unknown`: Use when the language is not known; the API will auto-detect.
|
|
434
|
+
- `hi-IN`: Hindi
|
|
435
|
+
- `bn-IN`: Bengali
|
|
436
|
+
- `kn-IN`: Kannada
|
|
437
|
+
- `ml-IN`: Malayalam
|
|
438
|
+
- `mr-IN`: Marathi
|
|
439
|
+
- `od-IN`: Odia
|
|
440
|
+
- `pa-IN`: Punjabi
|
|
441
|
+
- `ta-IN`: Tamil
|
|
442
|
+
- `te-IN`: Telugu
|
|
443
|
+
- `en-IN`: English
|
|
444
|
+
- `gu-IN`: Gujarati
|
|
366
445
|
|
|
367
446
|
input_audio_codec : typing.Optional[InputAudioCodec]
|
|
368
447
|
Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.
|
|
@@ -380,7 +459,8 @@ class AsyncRawSpeechToTextClient:
|
|
|
380
459
|
base_url=self._client_wrapper.get_environment().base,
|
|
381
460
|
method="POST",
|
|
382
461
|
data={
|
|
383
|
-
"model":
|
|
462
|
+
"model": model,
|
|
463
|
+
"mode": mode,
|
|
384
464
|
"language_code": language_code,
|
|
385
465
|
"input_audio_codec": input_audio_codec,
|
|
386
466
|
},
|
|
@@ -405,9 +485,9 @@ class AsyncRawSpeechToTextClient:
|
|
|
405
485
|
raise BadRequestError(
|
|
406
486
|
headers=dict(_response.headers),
|
|
407
487
|
body=typing.cast(
|
|
408
|
-
typing.Any,
|
|
488
|
+
typing.Optional[typing.Any],
|
|
409
489
|
parse_obj_as(
|
|
410
|
-
type_=typing.Any, # type: ignore
|
|
490
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
411
491
|
object_=_response.json(),
|
|
412
492
|
),
|
|
413
493
|
),
|
|
@@ -416,9 +496,9 @@ class AsyncRawSpeechToTextClient:
|
|
|
416
496
|
raise ForbiddenError(
|
|
417
497
|
headers=dict(_response.headers),
|
|
418
498
|
body=typing.cast(
|
|
419
|
-
typing.Any,
|
|
499
|
+
typing.Optional[typing.Any],
|
|
420
500
|
parse_obj_as(
|
|
421
|
-
type_=typing.Any, # type: ignore
|
|
501
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
422
502
|
object_=_response.json(),
|
|
423
503
|
),
|
|
424
504
|
),
|
|
@@ -427,9 +507,9 @@ class AsyncRawSpeechToTextClient:
|
|
|
427
507
|
raise UnprocessableEntityError(
|
|
428
508
|
headers=dict(_response.headers),
|
|
429
509
|
body=typing.cast(
|
|
430
|
-
typing.Any,
|
|
510
|
+
typing.Optional[typing.Any],
|
|
431
511
|
parse_obj_as(
|
|
432
|
-
type_=typing.Any, # type: ignore
|
|
512
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
433
513
|
object_=_response.json(),
|
|
434
514
|
),
|
|
435
515
|
),
|
|
@@ -438,9 +518,9 @@ class AsyncRawSpeechToTextClient:
|
|
|
438
518
|
raise TooManyRequestsError(
|
|
439
519
|
headers=dict(_response.headers),
|
|
440
520
|
body=typing.cast(
|
|
441
|
-
typing.Any,
|
|
521
|
+
typing.Optional[typing.Any],
|
|
442
522
|
parse_obj_as(
|
|
443
|
-
type_=typing.Any, # type: ignore
|
|
523
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
444
524
|
object_=_response.json(),
|
|
445
525
|
),
|
|
446
526
|
),
|
|
@@ -449,9 +529,9 @@ class AsyncRawSpeechToTextClient:
|
|
|
449
529
|
raise InternalServerError(
|
|
450
530
|
headers=dict(_response.headers),
|
|
451
531
|
body=typing.cast(
|
|
452
|
-
typing.Any,
|
|
532
|
+
typing.Optional[typing.Any],
|
|
453
533
|
parse_obj_as(
|
|
454
|
-
type_=typing.Any, # type: ignore
|
|
534
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
455
535
|
object_=_response.json(),
|
|
456
536
|
),
|
|
457
537
|
),
|
|
@@ -460,9 +540,9 @@ class AsyncRawSpeechToTextClient:
|
|
|
460
540
|
raise ServiceUnavailableError(
|
|
461
541
|
headers=dict(_response.headers),
|
|
462
542
|
body=typing.cast(
|
|
463
|
-
typing.Any,
|
|
543
|
+
typing.Optional[typing.Any],
|
|
464
544
|
parse_obj_as(
|
|
465
|
-
type_=typing.Any, # type: ignore
|
|
545
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
466
546
|
object_=_response.json(),
|
|
467
547
|
),
|
|
468
548
|
),
|
|
@@ -505,7 +585,10 @@ class AsyncRawSpeechToTextClient:
|
|
|
505
585
|
Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.
|
|
506
586
|
|
|
507
587
|
model : typing.Optional[SpeechToTextTranslateModel]
|
|
508
|
-
Model to be used for
|
|
588
|
+
Model to be used for speech to text translation.
|
|
589
|
+
|
|
590
|
+
- **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
|
|
591
|
+
- Example: Hindi audio → English text output
|
|
509
592
|
|
|
510
593
|
input_audio_codec : typing.Optional[InputAudioCodec]
|
|
511
594
|
Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.
|
|
@@ -524,7 +607,7 @@ class AsyncRawSpeechToTextClient:
|
|
|
524
607
|
method="POST",
|
|
525
608
|
data={
|
|
526
609
|
"prompt": prompt,
|
|
527
|
-
"model":
|
|
610
|
+
"model": model,
|
|
528
611
|
"input_audio_codec": input_audio_codec,
|
|
529
612
|
},
|
|
530
613
|
files={
|
|
@@ -548,9 +631,9 @@ class AsyncRawSpeechToTextClient:
|
|
|
548
631
|
raise BadRequestError(
|
|
549
632
|
headers=dict(_response.headers),
|
|
550
633
|
body=typing.cast(
|
|
551
|
-
typing.Any,
|
|
634
|
+
typing.Optional[typing.Any],
|
|
552
635
|
parse_obj_as(
|
|
553
|
-
type_=typing.Any, # type: ignore
|
|
636
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
554
637
|
object_=_response.json(),
|
|
555
638
|
),
|
|
556
639
|
),
|
|
@@ -559,9 +642,9 @@ class AsyncRawSpeechToTextClient:
|
|
|
559
642
|
raise ForbiddenError(
|
|
560
643
|
headers=dict(_response.headers),
|
|
561
644
|
body=typing.cast(
|
|
562
|
-
typing.Any,
|
|
645
|
+
typing.Optional[typing.Any],
|
|
563
646
|
parse_obj_as(
|
|
564
|
-
type_=typing.Any, # type: ignore
|
|
647
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
565
648
|
object_=_response.json(),
|
|
566
649
|
),
|
|
567
650
|
),
|
|
@@ -570,9 +653,9 @@ class AsyncRawSpeechToTextClient:
|
|
|
570
653
|
raise UnprocessableEntityError(
|
|
571
654
|
headers=dict(_response.headers),
|
|
572
655
|
body=typing.cast(
|
|
573
|
-
typing.Any,
|
|
656
|
+
typing.Optional[typing.Any],
|
|
574
657
|
parse_obj_as(
|
|
575
|
-
type_=typing.Any, # type: ignore
|
|
658
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
576
659
|
object_=_response.json(),
|
|
577
660
|
),
|
|
578
661
|
),
|
|
@@ -581,9 +664,9 @@ class AsyncRawSpeechToTextClient:
|
|
|
581
664
|
raise TooManyRequestsError(
|
|
582
665
|
headers=dict(_response.headers),
|
|
583
666
|
body=typing.cast(
|
|
584
|
-
typing.Any,
|
|
667
|
+
typing.Optional[typing.Any],
|
|
585
668
|
parse_obj_as(
|
|
586
|
-
type_=typing.Any, # type: ignore
|
|
669
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
587
670
|
object_=_response.json(),
|
|
588
671
|
),
|
|
589
672
|
),
|
|
@@ -592,9 +675,9 @@ class AsyncRawSpeechToTextClient:
|
|
|
592
675
|
raise InternalServerError(
|
|
593
676
|
headers=dict(_response.headers),
|
|
594
677
|
body=typing.cast(
|
|
595
|
-
typing.Any,
|
|
678
|
+
typing.Optional[typing.Any],
|
|
596
679
|
parse_obj_as(
|
|
597
|
-
type_=typing.Any, # type: ignore
|
|
680
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
598
681
|
object_=_response.json(),
|
|
599
682
|
),
|
|
600
683
|
),
|
|
@@ -603,9 +686,9 @@ class AsyncRawSpeechToTextClient:
|
|
|
603
686
|
raise ServiceUnavailableError(
|
|
604
687
|
headers=dict(_response.headers),
|
|
605
688
|
body=typing.cast(
|
|
606
|
-
typing.Any,
|
|
689
|
+
typing.Optional[typing.Any],
|
|
607
690
|
parse_obj_as(
|
|
608
|
-
type_=typing.Any, # type: ignore
|
|
691
|
+
type_=typing.Optional[typing.Any], # type: ignore
|
|
609
692
|
object_=_response.json(),
|
|
610
693
|
),
|
|
611
694
|
),
|