sarvamai 0.1.22a4__py3-none-any.whl → 0.1.22a7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sarvamai/__init__.py +62 -3
- sarvamai/client.py +3 -0
- sarvamai/core/client_wrapper.py +2 -2
- sarvamai/doc_digitization_job/__init__.py +4 -0
- sarvamai/doc_digitization_job/client.py +776 -0
- sarvamai/doc_digitization_job/job.py +496 -0
- sarvamai/doc_digitization_job/raw_client.py +1176 -0
- sarvamai/requests/__init__.py +20 -0
- sarvamai/requests/audio_data.py +0 -6
- sarvamai/requests/configure_connection.py +4 -0
- sarvamai/requests/configure_connection_data.py +40 -11
- sarvamai/requests/doc_digitization_create_job_response.py +25 -0
- sarvamai/requests/doc_digitization_download_files_response.py +37 -0
- sarvamai/requests/doc_digitization_error_details.py +21 -0
- sarvamai/requests/doc_digitization_error_message.py +11 -0
- sarvamai/requests/doc_digitization_job_detail.py +64 -0
- sarvamai/requests/doc_digitization_job_parameters.py +21 -0
- sarvamai/requests/doc_digitization_job_status_response.py +65 -0
- sarvamai/requests/doc_digitization_page_error.py +24 -0
- sarvamai/requests/doc_digitization_upload_files_response.py +34 -0
- sarvamai/requests/doc_digitization_webhook_callback.py +19 -0
- sarvamai/requests/speech_to_text_job_parameters.py +43 -2
- sarvamai/requests/speech_to_text_translate_job_parameters.py +4 -1
- sarvamai/speech_to_text/client.py +95 -10
- sarvamai/speech_to_text/raw_client.py +95 -10
- sarvamai/speech_to_text_job/client.py +60 -15
- sarvamai/speech_to_text_streaming/__init__.py +4 -0
- sarvamai/speech_to_text_streaming/client.py +102 -18
- sarvamai/speech_to_text_streaming/raw_client.py +102 -18
- sarvamai/speech_to_text_streaming/types/__init__.py +4 -0
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_input_audio_codec.py +1 -27
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_mode.py +7 -0
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_model.py +5 -0
- sarvamai/speech_to_text_translate_streaming/client.py +20 -12
- sarvamai/speech_to_text_translate_streaming/raw_client.py +20 -12
- sarvamai/speech_to_text_translate_streaming/types/speech_to_text_translate_streaming_input_audio_codec.py +1 -27
- sarvamai/text/client.py +0 -12
- sarvamai/text/raw_client.py +0 -12
- sarvamai/text_to_speech/client.py +116 -14
- sarvamai/text_to_speech/raw_client.py +116 -14
- sarvamai/text_to_speech_streaming/__init__.py +2 -2
- sarvamai/text_to_speech_streaming/client.py +19 -6
- sarvamai/text_to_speech_streaming/raw_client.py +19 -6
- sarvamai/text_to_speech_streaming/types/__init__.py +2 -1
- sarvamai/text_to_speech_streaming/types/text_to_speech_streaming_model.py +5 -0
- sarvamai/types/__init__.py +34 -2
- sarvamai/types/audio_data.py +0 -6
- sarvamai/types/configure_connection.py +4 -0
- sarvamai/types/configure_connection_data.py +40 -11
- sarvamai/types/configure_connection_data_model.py +5 -0
- sarvamai/types/configure_connection_data_speaker.py +35 -1
- sarvamai/types/doc_digitization_create_job_response.py +37 -0
- sarvamai/types/doc_digitization_download_files_response.py +47 -0
- sarvamai/types/doc_digitization_error_code.py +15 -0
- sarvamai/types/doc_digitization_error_details.py +33 -0
- sarvamai/types/doc_digitization_error_message.py +23 -0
- sarvamai/types/doc_digitization_job_detail.py +74 -0
- sarvamai/types/doc_digitization_job_detail_state.py +7 -0
- sarvamai/types/doc_digitization_job_parameters.py +33 -0
- sarvamai/types/doc_digitization_job_state.py +7 -0
- sarvamai/types/doc_digitization_job_status_response.py +75 -0
- sarvamai/types/doc_digitization_output_format.py +5 -0
- sarvamai/types/doc_digitization_page_error.py +36 -0
- sarvamai/types/doc_digitization_supported_language.py +32 -0
- sarvamai/types/doc_digitization_upload_files_response.py +44 -0
- sarvamai/types/doc_digitization_webhook_callback.py +31 -0
- sarvamai/types/mode.py +5 -0
- sarvamai/types/speech_to_text_job_parameters.py +43 -2
- sarvamai/types/speech_to_text_model.py +1 -1
- sarvamai/types/speech_to_text_translate_job_parameters.py +4 -1
- sarvamai/types/text_to_speech_model.py +1 -1
- sarvamai/types/text_to_speech_speaker.py +35 -1
- {sarvamai-0.1.22a4.dist-info → sarvamai-0.1.22a7.dist-info}/METADATA +1 -1
- {sarvamai-0.1.22a4.dist-info → sarvamai-0.1.22a7.dist-info}/RECORD +75 -42
- sarvamai/types/audio_data_input_audio_codec.py +0 -33
- {sarvamai-0.1.22a4.dist-info → sarvamai-0.1.22a7.dist-info}/WHEEL +0 -0
|
@@ -6,6 +6,7 @@ from .. import core
|
|
|
6
6
|
from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
|
|
7
7
|
from ..core.request_options import RequestOptions
|
|
8
8
|
from ..types.input_audio_codec import InputAudioCodec
|
|
9
|
+
from ..types.mode import Mode
|
|
9
10
|
from ..types.speech_to_text_language import SpeechToTextLanguage
|
|
10
11
|
from ..types.speech_to_text_model import SpeechToTextModel
|
|
11
12
|
from ..types.speech_to_text_response import SpeechToTextResponse
|
|
@@ -37,6 +38,7 @@ class SpeechToTextClient:
|
|
|
37
38
|
*,
|
|
38
39
|
file: core.File,
|
|
39
40
|
model: typing.Optional[SpeechToTextModel] = OMIT,
|
|
41
|
+
mode: typing.Optional[Mode] = OMIT,
|
|
40
42
|
language_code: typing.Optional[SpeechToTextLanguage] = OMIT,
|
|
41
43
|
input_audio_codec: typing.Optional[InputAudioCodec] = OMIT,
|
|
42
44
|
request_options: typing.Optional[RequestOptions] = None,
|
|
@@ -63,12 +65,49 @@ class SpeechToTextClient:
|
|
|
63
65
|
|
|
64
66
|
model : typing.Optional[SpeechToTextModel]
|
|
65
67
|
Specifies the model to use for speech-to-text conversion.
|
|
66
|
-
|
|
68
|
+
|
|
69
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
70
|
+
|
|
71
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
72
|
+
|
|
73
|
+
mode : typing.Optional[Mode]
|
|
74
|
+
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
75
|
+
|
|
76
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
77
|
+
|
|
78
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
79
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
80
|
+
|
|
81
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
82
|
+
- Output: `My phone number is 9840950950`
|
|
83
|
+
|
|
84
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
85
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
86
|
+
|
|
87
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
88
|
+
- Output: `mera phone number hai 9840950950`
|
|
89
|
+
|
|
90
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
91
|
+
- Output: `मेरा phone number है 9840950950`
|
|
67
92
|
|
|
68
93
|
language_code : typing.Optional[SpeechToTextLanguage]
|
|
69
|
-
Specifies the language of the input audio.
|
|
70
|
-
|
|
71
|
-
|
|
94
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
95
|
+
|
|
96
|
+
**Note:** This parameter is optional for `saarika:v2.5` model.
|
|
97
|
+
|
|
98
|
+
**Available Options:**
|
|
99
|
+
- `unknown`: Use when the language is not known; the API will auto-detect.
|
|
100
|
+
- `hi-IN`: Hindi
|
|
101
|
+
- `bn-IN`: Bengali
|
|
102
|
+
- `kn-IN`: Kannada
|
|
103
|
+
- `ml-IN`: Malayalam
|
|
104
|
+
- `mr-IN`: Marathi
|
|
105
|
+
- `od-IN`: Odia
|
|
106
|
+
- `pa-IN`: Punjabi
|
|
107
|
+
- `ta-IN`: Tamil
|
|
108
|
+
- `te-IN`: Telugu
|
|
109
|
+
- `en-IN`: English
|
|
110
|
+
- `gu-IN`: Gujarati
|
|
72
111
|
|
|
73
112
|
input_audio_codec : typing.Optional[InputAudioCodec]
|
|
74
113
|
Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.
|
|
@@ -93,6 +132,7 @@ class SpeechToTextClient:
|
|
|
93
132
|
_response = self._raw_client.transcribe(
|
|
94
133
|
file=file,
|
|
95
134
|
model=model,
|
|
135
|
+
mode=mode,
|
|
96
136
|
language_code=language_code,
|
|
97
137
|
input_audio_codec=input_audio_codec,
|
|
98
138
|
request_options=request_options,
|
|
@@ -132,7 +172,10 @@ class SpeechToTextClient:
|
|
|
132
172
|
Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.
|
|
133
173
|
|
|
134
174
|
model : typing.Optional[SpeechToTextTranslateModel]
|
|
135
|
-
Model to be used for
|
|
175
|
+
Model to be used for speech to text translation.
|
|
176
|
+
|
|
177
|
+
- **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
|
|
178
|
+
- Example: Hindi audio → English text output
|
|
136
179
|
|
|
137
180
|
input_audio_codec : typing.Optional[InputAudioCodec]
|
|
138
181
|
Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.
|
|
@@ -180,6 +223,7 @@ class AsyncSpeechToTextClient:
|
|
|
180
223
|
*,
|
|
181
224
|
file: core.File,
|
|
182
225
|
model: typing.Optional[SpeechToTextModel] = OMIT,
|
|
226
|
+
mode: typing.Optional[Mode] = OMIT,
|
|
183
227
|
language_code: typing.Optional[SpeechToTextLanguage] = OMIT,
|
|
184
228
|
input_audio_codec: typing.Optional[InputAudioCodec] = OMIT,
|
|
185
229
|
request_options: typing.Optional[RequestOptions] = None,
|
|
@@ -206,12 +250,49 @@ class AsyncSpeechToTextClient:
|
|
|
206
250
|
|
|
207
251
|
model : typing.Optional[SpeechToTextModel]
|
|
208
252
|
Specifies the model to use for speech-to-text conversion.
|
|
209
|
-
|
|
253
|
+
|
|
254
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
255
|
+
|
|
256
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
257
|
+
|
|
258
|
+
mode : typing.Optional[Mode]
|
|
259
|
+
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
260
|
+
|
|
261
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
262
|
+
|
|
263
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
264
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
265
|
+
|
|
266
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
267
|
+
- Output: `My phone number is 9840950950`
|
|
268
|
+
|
|
269
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
270
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
271
|
+
|
|
272
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
273
|
+
- Output: `mera phone number hai 9840950950`
|
|
274
|
+
|
|
275
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
276
|
+
- Output: `मेरा phone number है 9840950950`
|
|
210
277
|
|
|
211
278
|
language_code : typing.Optional[SpeechToTextLanguage]
|
|
212
|
-
Specifies the language of the input audio.
|
|
213
|
-
|
|
214
|
-
|
|
279
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
280
|
+
|
|
281
|
+
**Note:** This parameter is optional for `saarika:v2.5` model.
|
|
282
|
+
|
|
283
|
+
**Available Options:**
|
|
284
|
+
- `unknown`: Use when the language is not known; the API will auto-detect.
|
|
285
|
+
- `hi-IN`: Hindi
|
|
286
|
+
- `bn-IN`: Bengali
|
|
287
|
+
- `kn-IN`: Kannada
|
|
288
|
+
- `ml-IN`: Malayalam
|
|
289
|
+
- `mr-IN`: Marathi
|
|
290
|
+
- `od-IN`: Odia
|
|
291
|
+
- `pa-IN`: Punjabi
|
|
292
|
+
- `ta-IN`: Tamil
|
|
293
|
+
- `te-IN`: Telugu
|
|
294
|
+
- `en-IN`: English
|
|
295
|
+
- `gu-IN`: Gujarati
|
|
215
296
|
|
|
216
297
|
input_audio_codec : typing.Optional[InputAudioCodec]
|
|
217
298
|
Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.
|
|
@@ -244,6 +325,7 @@ class AsyncSpeechToTextClient:
|
|
|
244
325
|
_response = await self._raw_client.transcribe(
|
|
245
326
|
file=file,
|
|
246
327
|
model=model,
|
|
328
|
+
mode=mode,
|
|
247
329
|
language_code=language_code,
|
|
248
330
|
input_audio_codec=input_audio_codec,
|
|
249
331
|
request_options=request_options,
|
|
@@ -283,7 +365,10 @@ class AsyncSpeechToTextClient:
|
|
|
283
365
|
Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.
|
|
284
366
|
|
|
285
367
|
model : typing.Optional[SpeechToTextTranslateModel]
|
|
286
|
-
Model to be used for
|
|
368
|
+
Model to be used for speech to text translation.
|
|
369
|
+
|
|
370
|
+
- **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
|
|
371
|
+
- Example: Hindi audio → English text output
|
|
287
372
|
|
|
288
373
|
input_audio_codec : typing.Optional[InputAudioCodec]
|
|
289
374
|
Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.
|
|
@@ -16,6 +16,7 @@ from ..errors.service_unavailable_error import ServiceUnavailableError
|
|
|
16
16
|
from ..errors.too_many_requests_error import TooManyRequestsError
|
|
17
17
|
from ..errors.unprocessable_entity_error import UnprocessableEntityError
|
|
18
18
|
from ..types.input_audio_codec import InputAudioCodec
|
|
19
|
+
from ..types.mode import Mode
|
|
19
20
|
from ..types.speech_to_text_language import SpeechToTextLanguage
|
|
20
21
|
from ..types.speech_to_text_model import SpeechToTextModel
|
|
21
22
|
from ..types.speech_to_text_response import SpeechToTextResponse
|
|
@@ -35,6 +36,7 @@ class RawSpeechToTextClient:
|
|
|
35
36
|
*,
|
|
36
37
|
file: core.File,
|
|
37
38
|
model: typing.Optional[SpeechToTextModel] = OMIT,
|
|
39
|
+
mode: typing.Optional[Mode] = OMIT,
|
|
38
40
|
language_code: typing.Optional[SpeechToTextLanguage] = OMIT,
|
|
39
41
|
input_audio_codec: typing.Optional[InputAudioCodec] = OMIT,
|
|
40
42
|
request_options: typing.Optional[RequestOptions] = None,
|
|
@@ -61,12 +63,49 @@ class RawSpeechToTextClient:
|
|
|
61
63
|
|
|
62
64
|
model : typing.Optional[SpeechToTextModel]
|
|
63
65
|
Specifies the model to use for speech-to-text conversion.
|
|
64
|
-
|
|
66
|
+
|
|
67
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
68
|
+
|
|
69
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
70
|
+
|
|
71
|
+
mode : typing.Optional[Mode]
|
|
72
|
+
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
73
|
+
|
|
74
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
75
|
+
|
|
76
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
77
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
78
|
+
|
|
79
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
80
|
+
- Output: `My phone number is 9840950950`
|
|
81
|
+
|
|
82
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
83
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
84
|
+
|
|
85
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
86
|
+
- Output: `mera phone number hai 9840950950`
|
|
87
|
+
|
|
88
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
89
|
+
- Output: `मेरा phone number है 9840950950`
|
|
65
90
|
|
|
66
91
|
language_code : typing.Optional[SpeechToTextLanguage]
|
|
67
|
-
Specifies the language of the input audio.
|
|
68
|
-
|
|
69
|
-
|
|
92
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
93
|
+
|
|
94
|
+
**Note:** This parameter is optional for `saarika:v2.5` model.
|
|
95
|
+
|
|
96
|
+
**Available Options:**
|
|
97
|
+
- `unknown`: Use when the language is not known; the API will auto-detect.
|
|
98
|
+
- `hi-IN`: Hindi
|
|
99
|
+
- `bn-IN`: Bengali
|
|
100
|
+
- `kn-IN`: Kannada
|
|
101
|
+
- `ml-IN`: Malayalam
|
|
102
|
+
- `mr-IN`: Marathi
|
|
103
|
+
- `od-IN`: Odia
|
|
104
|
+
- `pa-IN`: Punjabi
|
|
105
|
+
- `ta-IN`: Tamil
|
|
106
|
+
- `te-IN`: Telugu
|
|
107
|
+
- `en-IN`: English
|
|
108
|
+
- `gu-IN`: Gujarati
|
|
70
109
|
|
|
71
110
|
input_audio_codec : typing.Optional[InputAudioCodec]
|
|
72
111
|
Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.
|
|
@@ -85,6 +124,7 @@ class RawSpeechToTextClient:
|
|
|
85
124
|
method="POST",
|
|
86
125
|
data={
|
|
87
126
|
"model": model,
|
|
127
|
+
"mode": mode,
|
|
88
128
|
"language_code": language_code,
|
|
89
129
|
"input_audio_codec": input_audio_codec,
|
|
90
130
|
},
|
|
@@ -209,7 +249,10 @@ class RawSpeechToTextClient:
|
|
|
209
249
|
Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.
|
|
210
250
|
|
|
211
251
|
model : typing.Optional[SpeechToTextTranslateModel]
|
|
212
|
-
Model to be used for
|
|
252
|
+
Model to be used for speech to text translation.
|
|
253
|
+
|
|
254
|
+
- **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
|
|
255
|
+
- Example: Hindi audio → English text output
|
|
213
256
|
|
|
214
257
|
input_audio_codec : typing.Optional[InputAudioCodec]
|
|
215
258
|
Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.
|
|
@@ -329,6 +372,7 @@ class AsyncRawSpeechToTextClient:
|
|
|
329
372
|
*,
|
|
330
373
|
file: core.File,
|
|
331
374
|
model: typing.Optional[SpeechToTextModel] = OMIT,
|
|
375
|
+
mode: typing.Optional[Mode] = OMIT,
|
|
332
376
|
language_code: typing.Optional[SpeechToTextLanguage] = OMIT,
|
|
333
377
|
input_audio_codec: typing.Optional[InputAudioCodec] = OMIT,
|
|
334
378
|
request_options: typing.Optional[RequestOptions] = None,
|
|
@@ -355,12 +399,49 @@ class AsyncRawSpeechToTextClient:
|
|
|
355
399
|
|
|
356
400
|
model : typing.Optional[SpeechToTextModel]
|
|
357
401
|
Specifies the model to use for speech-to-text conversion.
|
|
358
|
-
|
|
402
|
+
|
|
403
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
404
|
+
|
|
405
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
406
|
+
|
|
407
|
+
mode : typing.Optional[Mode]
|
|
408
|
+
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
409
|
+
|
|
410
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
411
|
+
|
|
412
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
413
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
414
|
+
|
|
415
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
416
|
+
- Output: `My phone number is 9840950950`
|
|
417
|
+
|
|
418
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
419
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
420
|
+
|
|
421
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
422
|
+
- Output: `mera phone number hai 9840950950`
|
|
423
|
+
|
|
424
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
425
|
+
- Output: `मेरा phone number है 9840950950`
|
|
359
426
|
|
|
360
427
|
language_code : typing.Optional[SpeechToTextLanguage]
|
|
361
|
-
Specifies the language of the input audio.
|
|
362
|
-
|
|
363
|
-
|
|
428
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
429
|
+
|
|
430
|
+
**Note:** This parameter is optional for `saarika:v2.5` model.
|
|
431
|
+
|
|
432
|
+
**Available Options:**
|
|
433
|
+
- `unknown`: Use when the language is not known; the API will auto-detect.
|
|
434
|
+
- `hi-IN`: Hindi
|
|
435
|
+
- `bn-IN`: Bengali
|
|
436
|
+
- `kn-IN`: Kannada
|
|
437
|
+
- `ml-IN`: Malayalam
|
|
438
|
+
- `mr-IN`: Marathi
|
|
439
|
+
- `od-IN`: Odia
|
|
440
|
+
- `pa-IN`: Punjabi
|
|
441
|
+
- `ta-IN`: Tamil
|
|
442
|
+
- `te-IN`: Telugu
|
|
443
|
+
- `en-IN`: English
|
|
444
|
+
- `gu-IN`: Gujarati
|
|
364
445
|
|
|
365
446
|
input_audio_codec : typing.Optional[InputAudioCodec]
|
|
366
447
|
Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.
|
|
@@ -379,6 +460,7 @@ class AsyncRawSpeechToTextClient:
|
|
|
379
460
|
method="POST",
|
|
380
461
|
data={
|
|
381
462
|
"model": model,
|
|
463
|
+
"mode": mode,
|
|
382
464
|
"language_code": language_code,
|
|
383
465
|
"input_audio_codec": input_audio_codec,
|
|
384
466
|
},
|
|
@@ -503,7 +585,10 @@ class AsyncRawSpeechToTextClient:
|
|
|
503
585
|
Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.
|
|
504
586
|
|
|
505
587
|
model : typing.Optional[SpeechToTextTranslateModel]
|
|
506
|
-
Model to be used for
|
|
588
|
+
Model to be used for speech to text translation.
|
|
589
|
+
|
|
590
|
+
- **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
|
|
591
|
+
- Example: Hindi audio → English text output
|
|
507
592
|
|
|
508
593
|
input_audio_codec : typing.Optional[InputAudioCodec]
|
|
509
594
|
Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.
|
|
@@ -12,6 +12,7 @@ from ..types.files_upload_response import FilesUploadResponse
|
|
|
12
12
|
from ..types.job_status_v_1_response import JobStatusV1Response
|
|
13
13
|
from ..types.speech_to_text_model import SpeechToTextModel
|
|
14
14
|
from ..types.speech_to_text_language import SpeechToTextLanguage
|
|
15
|
+
from ..types.mode import Mode
|
|
15
16
|
from .raw_client import AsyncRawSpeechToTextJobClient, RawSpeechToTextJobClient
|
|
16
17
|
from .job import AsyncSpeechToTextJob, SpeechToTextJob
|
|
17
18
|
|
|
@@ -72,7 +73,9 @@ class SpeechToTextJobClient:
|
|
|
72
73
|
)
|
|
73
74
|
"""
|
|
74
75
|
_response = self._raw_client.initialise(
|
|
75
|
-
job_parameters=job_parameters,
|
|
76
|
+
job_parameters=job_parameters,
|
|
77
|
+
callback=callback,
|
|
78
|
+
request_options=request_options,
|
|
76
79
|
)
|
|
77
80
|
return _response.data
|
|
78
81
|
|
|
@@ -145,11 +148,17 @@ class SpeechToTextJobClient:
|
|
|
145
148
|
job_id="job_id",
|
|
146
149
|
)
|
|
147
150
|
"""
|
|
148
|
-
_response = self._raw_client.start(
|
|
151
|
+
_response = self._raw_client.start(
|
|
152
|
+
job_id, ptu_id=ptu_id, request_options=request_options
|
|
153
|
+
)
|
|
149
154
|
return _response.data
|
|
150
155
|
|
|
151
156
|
def get_upload_links(
|
|
152
|
-
self,
|
|
157
|
+
self,
|
|
158
|
+
*,
|
|
159
|
+
job_id: str,
|
|
160
|
+
files: typing.Sequence[str],
|
|
161
|
+
request_options: typing.Optional[RequestOptions] = None,
|
|
153
162
|
) -> FilesUploadResponse:
|
|
154
163
|
"""
|
|
155
164
|
Start a speech to text bulk job V1
|
|
@@ -180,11 +189,17 @@ class SpeechToTextJobClient:
|
|
|
180
189
|
files=["files"],
|
|
181
190
|
)
|
|
182
191
|
"""
|
|
183
|
-
_response = self._raw_client.get_upload_links(
|
|
192
|
+
_response = self._raw_client.get_upload_links(
|
|
193
|
+
job_id=job_id, files=files, request_options=request_options
|
|
194
|
+
)
|
|
184
195
|
return _response.data
|
|
185
196
|
|
|
186
197
|
def get_download_links(
|
|
187
|
-
self,
|
|
198
|
+
self,
|
|
199
|
+
*,
|
|
200
|
+
job_id: str,
|
|
201
|
+
files: typing.Sequence[str],
|
|
202
|
+
request_options: typing.Optional[RequestOptions] = None,
|
|
188
203
|
) -> FilesDownloadResponse:
|
|
189
204
|
"""
|
|
190
205
|
Start a speech to text bulk job V1
|
|
@@ -215,12 +230,15 @@ class SpeechToTextJobClient:
|
|
|
215
230
|
files=["files"],
|
|
216
231
|
)
|
|
217
232
|
"""
|
|
218
|
-
_response = self._raw_client.get_download_links(
|
|
233
|
+
_response = self._raw_client.get_download_links(
|
|
234
|
+
job_id=job_id, files=files, request_options=request_options
|
|
235
|
+
)
|
|
219
236
|
return _response.data
|
|
220
237
|
|
|
221
238
|
def create_job(
|
|
222
239
|
self,
|
|
223
240
|
model: SpeechToTextModel = "saarika:v2.5",
|
|
241
|
+
mode: typing.Optional[Mode] = None,
|
|
224
242
|
with_diarization: bool = False,
|
|
225
243
|
with_timestamps: bool = False,
|
|
226
244
|
language_code: typing.Optional[SpeechToTextLanguage] = None,
|
|
@@ -236,6 +254,10 @@ class SpeechToTextJobClient:
|
|
|
236
254
|
model : SpeechToTextModel, default="saarika:v2.5"
|
|
237
255
|
The model to use for transcription.
|
|
238
256
|
|
|
257
|
+
mode : typing.Optional[Mode], default=None
|
|
258
|
+
Mode of operation. Only applicable for saaras:v3 model.
|
|
259
|
+
Options: transcribe, translate, indic-en, verbatim, translit, codemix
|
|
260
|
+
|
|
239
261
|
with_diarization : typing.Optional[bool], default=False
|
|
240
262
|
Whether to enable speaker diarization (distinguishing who said what).
|
|
241
263
|
|
|
@@ -244,7 +266,7 @@ class SpeechToTextJobClient:
|
|
|
244
266
|
|
|
245
267
|
language_code : typing.Optional[SpeechToTextLanguage], default=None
|
|
246
268
|
The language code of the input audio (e.g., "hi-IN", "bn-IN").
|
|
247
|
-
|
|
269
|
+
|
|
248
270
|
num_speakers : typing.Optional[int], default=None
|
|
249
271
|
The number of distinct speakers in the audio, if known.
|
|
250
272
|
|
|
@@ -263,6 +285,7 @@ class SpeechToTextJobClient:
|
|
|
263
285
|
job_parameters=SpeechToTextJobParametersParams(
|
|
264
286
|
language_code=language_code,
|
|
265
287
|
model=model,
|
|
288
|
+
mode=mode, # type: ignore[typeddict-item]
|
|
266
289
|
num_speakers=num_speakers, # type: ignore[typeddict-item]
|
|
267
290
|
with_diarization=with_diarization,
|
|
268
291
|
with_timestamps=with_timestamps,
|
|
@@ -350,7 +373,9 @@ class AsyncSpeechToTextJobClient:
|
|
|
350
373
|
asyncio.run(main())
|
|
351
374
|
"""
|
|
352
375
|
_response = await self._raw_client.initialise(
|
|
353
|
-
job_parameters=job_parameters,
|
|
376
|
+
job_parameters=job_parameters,
|
|
377
|
+
callback=callback,
|
|
378
|
+
request_options=request_options,
|
|
354
379
|
)
|
|
355
380
|
return _response.data
|
|
356
381
|
|
|
@@ -392,7 +417,9 @@ class AsyncSpeechToTextJobClient:
|
|
|
392
417
|
|
|
393
418
|
asyncio.run(main())
|
|
394
419
|
"""
|
|
395
|
-
_response = await self._raw_client.get_status(
|
|
420
|
+
_response = await self._raw_client.get_status(
|
|
421
|
+
job_id, request_options=request_options
|
|
422
|
+
)
|
|
396
423
|
return _response.data
|
|
397
424
|
|
|
398
425
|
async def start(
|
|
@@ -439,11 +466,17 @@ class AsyncSpeechToTextJobClient:
|
|
|
439
466
|
|
|
440
467
|
asyncio.run(main())
|
|
441
468
|
"""
|
|
442
|
-
_response = await self._raw_client.start(
|
|
469
|
+
_response = await self._raw_client.start(
|
|
470
|
+
job_id, ptu_id=ptu_id, request_options=request_options
|
|
471
|
+
)
|
|
443
472
|
return _response.data
|
|
444
473
|
|
|
445
474
|
async def get_upload_links(
|
|
446
|
-
self,
|
|
475
|
+
self,
|
|
476
|
+
*,
|
|
477
|
+
job_id: str,
|
|
478
|
+
files: typing.Sequence[str],
|
|
479
|
+
request_options: typing.Optional[RequestOptions] = None,
|
|
447
480
|
) -> FilesUploadResponse:
|
|
448
481
|
"""
|
|
449
482
|
Start a speech to text bulk job V1
|
|
@@ -482,11 +515,17 @@ class AsyncSpeechToTextJobClient:
|
|
|
482
515
|
|
|
483
516
|
asyncio.run(main())
|
|
484
517
|
"""
|
|
485
|
-
_response = await self._raw_client.get_upload_links(
|
|
518
|
+
_response = await self._raw_client.get_upload_links(
|
|
519
|
+
job_id=job_id, files=files, request_options=request_options
|
|
520
|
+
)
|
|
486
521
|
return _response.data
|
|
487
522
|
|
|
488
523
|
async def get_download_links(
|
|
489
|
-
self,
|
|
524
|
+
self,
|
|
525
|
+
*,
|
|
526
|
+
job_id: str,
|
|
527
|
+
files: typing.Sequence[str],
|
|
528
|
+
request_options: typing.Optional[RequestOptions] = None,
|
|
490
529
|
) -> FilesDownloadResponse:
|
|
491
530
|
"""
|
|
492
531
|
Start a speech to text bulk job V1
|
|
@@ -533,6 +572,7 @@ class AsyncSpeechToTextJobClient:
|
|
|
533
572
|
async def create_job(
|
|
534
573
|
self,
|
|
535
574
|
model: SpeechToTextModel = "saarika:v2.5",
|
|
575
|
+
mode: typing.Optional[Mode] = None,
|
|
536
576
|
with_diarization: bool = False,
|
|
537
577
|
with_timestamps: bool = False,
|
|
538
578
|
language_code: typing.Optional[SpeechToTextLanguage] = None,
|
|
@@ -548,6 +588,10 @@ class AsyncSpeechToTextJobClient:
|
|
|
548
588
|
model : SpeechToTextModel, default="saarika:v2.5"
|
|
549
589
|
The model to use for transcription.
|
|
550
590
|
|
|
591
|
+
mode : typing.Optional[Mode], default=None
|
|
592
|
+
Mode of operation. Only applicable for saaras:v3 model.
|
|
593
|
+
Options: transcribe, translate, indic-en, verbatim, translit, codemix
|
|
594
|
+
|
|
551
595
|
with_diarization : typing.Optional[bool], default=False
|
|
552
596
|
Whether to enable speaker diarization (distinguishing who said what).
|
|
553
597
|
|
|
@@ -556,8 +600,8 @@ class AsyncSpeechToTextJobClient:
|
|
|
556
600
|
|
|
557
601
|
language_code : typing.Optional[SpeechToTextLanguage], default=None
|
|
558
602
|
The language code of the input audio (e.g., "hi-IN", "bn-IN").
|
|
559
|
-
|
|
560
|
-
num_speakers : typing.Optional[int]
|
|
603
|
+
|
|
604
|
+
num_speakers : typing.Optional[int] = None
|
|
561
605
|
The number of distinct speakers in the audio, if known.
|
|
562
606
|
|
|
563
607
|
callback : typing.Optional[BulkJobCallbackParams], default=OMIT
|
|
@@ -575,6 +619,7 @@ class AsyncSpeechToTextJobClient:
|
|
|
575
619
|
job_parameters=SpeechToTextJobParametersParams(
|
|
576
620
|
language_code=language_code,
|
|
577
621
|
model=model,
|
|
622
|
+
mode=mode, # type: ignore[typeddict-item]
|
|
578
623
|
with_diarization=with_diarization,
|
|
579
624
|
with_timestamps=with_timestamps,
|
|
580
625
|
num_speakers=num_speakers, # type: ignore[typeddict-item]
|
|
@@ -7,6 +7,8 @@ from .types import (
|
|
|
7
7
|
SpeechToTextStreamingHighVadSensitivity,
|
|
8
8
|
SpeechToTextStreamingInputAudioCodec,
|
|
9
9
|
SpeechToTextStreamingLanguageCode,
|
|
10
|
+
SpeechToTextStreamingMode,
|
|
11
|
+
SpeechToTextStreamingModel,
|
|
10
12
|
SpeechToTextStreamingVadSignals,
|
|
11
13
|
)
|
|
12
14
|
|
|
@@ -15,5 +17,7 @@ __all__ = [
|
|
|
15
17
|
"SpeechToTextStreamingHighVadSensitivity",
|
|
16
18
|
"SpeechToTextStreamingInputAudioCodec",
|
|
17
19
|
"SpeechToTextStreamingLanguageCode",
|
|
20
|
+
"SpeechToTextStreamingMode",
|
|
21
|
+
"SpeechToTextStreamingModel",
|
|
18
22
|
"SpeechToTextStreamingVadSignals",
|
|
19
23
|
]
|