sarvamai 0.1.22a4__py3-none-any.whl → 0.1.22a8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. sarvamai/__init__.py +62 -3
  2. sarvamai/client.py +3 -0
  3. sarvamai/core/client_wrapper.py +2 -2
  4. sarvamai/doc_digitization_job/__init__.py +4 -0
  5. sarvamai/doc_digitization_job/client.py +775 -0
  6. sarvamai/doc_digitization_job/job.py +496 -0
  7. sarvamai/doc_digitization_job/raw_client.py +1176 -0
  8. sarvamai/requests/__init__.py +20 -0
  9. sarvamai/requests/audio_data.py +0 -6
  10. sarvamai/requests/configure_connection.py +4 -0
  11. sarvamai/requests/configure_connection_data.py +40 -11
  12. sarvamai/requests/doc_digitization_create_job_response.py +25 -0
  13. sarvamai/requests/doc_digitization_download_files_response.py +37 -0
  14. sarvamai/requests/doc_digitization_error_details.py +21 -0
  15. sarvamai/requests/doc_digitization_error_message.py +11 -0
  16. sarvamai/requests/doc_digitization_job_detail.py +64 -0
  17. sarvamai/requests/doc_digitization_job_parameters.py +21 -0
  18. sarvamai/requests/doc_digitization_job_status_response.py +65 -0
  19. sarvamai/requests/doc_digitization_page_error.py +24 -0
  20. sarvamai/requests/doc_digitization_upload_files_response.py +34 -0
  21. sarvamai/requests/doc_digitization_webhook_callback.py +19 -0
  22. sarvamai/requests/speech_to_text_job_parameters.py +43 -2
  23. sarvamai/requests/speech_to_text_translate_job_parameters.py +4 -1
  24. sarvamai/speech_to_text/client.py +95 -10
  25. sarvamai/speech_to_text/raw_client.py +95 -10
  26. sarvamai/speech_to_text_job/client.py +60 -15
  27. sarvamai/speech_to_text_streaming/__init__.py +4 -0
  28. sarvamai/speech_to_text_streaming/client.py +102 -18
  29. sarvamai/speech_to_text_streaming/raw_client.py +102 -18
  30. sarvamai/speech_to_text_streaming/types/__init__.py +4 -0
  31. sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_input_audio_codec.py +1 -27
  32. sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_mode.py +7 -0
  33. sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_model.py +5 -0
  34. sarvamai/speech_to_text_translate_streaming/client.py +20 -12
  35. sarvamai/speech_to_text_translate_streaming/raw_client.py +20 -12
  36. sarvamai/speech_to_text_translate_streaming/types/speech_to_text_translate_streaming_input_audio_codec.py +1 -27
  37. sarvamai/text/client.py +0 -12
  38. sarvamai/text/raw_client.py +0 -12
  39. sarvamai/text_to_speech/client.py +116 -14
  40. sarvamai/text_to_speech/raw_client.py +116 -14
  41. sarvamai/text_to_speech_streaming/__init__.py +2 -2
  42. sarvamai/text_to_speech_streaming/client.py +19 -6
  43. sarvamai/text_to_speech_streaming/raw_client.py +19 -6
  44. sarvamai/text_to_speech_streaming/types/__init__.py +2 -1
  45. sarvamai/text_to_speech_streaming/types/text_to_speech_streaming_model.py +5 -0
  46. sarvamai/types/__init__.py +34 -2
  47. sarvamai/types/audio_data.py +0 -6
  48. sarvamai/types/configure_connection.py +4 -0
  49. sarvamai/types/configure_connection_data.py +40 -11
  50. sarvamai/types/configure_connection_data_model.py +5 -0
  51. sarvamai/types/configure_connection_data_speaker.py +35 -1
  52. sarvamai/types/doc_digitization_create_job_response.py +37 -0
  53. sarvamai/types/doc_digitization_download_files_response.py +47 -0
  54. sarvamai/types/doc_digitization_error_code.py +15 -0
  55. sarvamai/types/doc_digitization_error_details.py +33 -0
  56. sarvamai/types/doc_digitization_error_message.py +23 -0
  57. sarvamai/types/doc_digitization_job_detail.py +74 -0
  58. sarvamai/types/doc_digitization_job_detail_state.py +7 -0
  59. sarvamai/types/doc_digitization_job_parameters.py +33 -0
  60. sarvamai/types/doc_digitization_job_state.py +7 -0
  61. sarvamai/types/doc_digitization_job_status_response.py +75 -0
  62. sarvamai/types/doc_digitization_output_format.py +5 -0
  63. sarvamai/types/doc_digitization_page_error.py +36 -0
  64. sarvamai/types/doc_digitization_supported_language.py +32 -0
  65. sarvamai/types/doc_digitization_upload_files_response.py +44 -0
  66. sarvamai/types/doc_digitization_webhook_callback.py +31 -0
  67. sarvamai/types/mode.py +5 -0
  68. sarvamai/types/speech_to_text_job_parameters.py +43 -2
  69. sarvamai/types/speech_to_text_model.py +1 -1
  70. sarvamai/types/speech_to_text_translate_job_parameters.py +4 -1
  71. sarvamai/types/text_to_speech_model.py +1 -1
  72. sarvamai/types/text_to_speech_speaker.py +35 -1
  73. {sarvamai-0.1.22a4.dist-info → sarvamai-0.1.22a8.dist-info}/METADATA +1 -1
  74. {sarvamai-0.1.22a4.dist-info → sarvamai-0.1.22a8.dist-info}/RECORD +75 -42
  75. sarvamai/types/audio_data_input_audio_codec.py +0 -33
  76. {sarvamai-0.1.22a4.dist-info → sarvamai-0.1.22a8.dist-info}/WHEEL +0 -0
@@ -6,6 +6,7 @@ from .. import core
6
6
  from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
7
7
  from ..core.request_options import RequestOptions
8
8
  from ..types.input_audio_codec import InputAudioCodec
9
+ from ..types.mode import Mode
9
10
  from ..types.speech_to_text_language import SpeechToTextLanguage
10
11
  from ..types.speech_to_text_model import SpeechToTextModel
11
12
  from ..types.speech_to_text_response import SpeechToTextResponse
@@ -37,6 +38,7 @@ class SpeechToTextClient:
37
38
  *,
38
39
  file: core.File,
39
40
  model: typing.Optional[SpeechToTextModel] = OMIT,
41
+ mode: typing.Optional[Mode] = OMIT,
40
42
  language_code: typing.Optional[SpeechToTextLanguage] = OMIT,
41
43
  input_audio_codec: typing.Optional[InputAudioCodec] = OMIT,
42
44
  request_options: typing.Optional[RequestOptions] = None,
@@ -63,12 +65,49 @@ class SpeechToTextClient:
63
65
 
64
66
  model : typing.Optional[SpeechToTextModel]
65
67
  Specifies the model to use for speech-to-text conversion.
66
- Note:- Default model is `saarika:v2.5`
68
+
69
+ - **saarika:v2.5** (default): Transcribes audio in the spoken language.
70
+
71
+ - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
72
+
73
+ mode : typing.Optional[Mode]
74
+ Mode of operation. **Only applicable when using the saaras:v3 model.**
75
+
76
+ Example audio: 'मेरा फोन नंबर है 9840950950'
77
+
78
+ - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
79
+ - Output: `मेरा फोन नंबर है 9840950950`
80
+
81
+ - **translate**: Translates speech from any supported Indic language to English.
82
+ - Output: `My phone number is 9840950950`
83
+
84
+ - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
85
+ - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
86
+
87
+ - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
88
+ - Output: `mera phone number hai 9840950950`
89
+
90
+ - **codemix**: Code-mixed text with English words in English and Indic words in native script.
91
+ - Output: `मेरा phone number है 9840950950`
67
92
 
68
93
  language_code : typing.Optional[SpeechToTextLanguage]
69
- Specifies the language of the input audio.
70
- For the `saarika:v2.5` model, it is optional.
71
- `unknown`: Use this when the language is not known; the API will detect it automatically.
94
+ Specifies the language of the input audio in BCP-47 format.
95
+
96
+ **Note:** This parameter is optional for the `saarika:v2.5` model.
97
+
98
+ **Available Options:**
99
+ - `unknown`: Use when the language is not known; the API will auto-detect.
100
+ - `hi-IN`: Hindi
101
+ - `bn-IN`: Bengali
102
+ - `kn-IN`: Kannada
103
+ - `ml-IN`: Malayalam
104
+ - `mr-IN`: Marathi
105
+ - `od-IN`: Odia
106
+ - `pa-IN`: Punjabi
107
+ - `ta-IN`: Tamil
108
+ - `te-IN`: Telugu
109
+ - `en-IN`: English
110
+ - `gu-IN`: Gujarati
72
111
 
73
112
  input_audio_codec : typing.Optional[InputAudioCodec]
74
113
  Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.
@@ -93,6 +132,7 @@ class SpeechToTextClient:
93
132
  _response = self._raw_client.transcribe(
94
133
  file=file,
95
134
  model=model,
135
+ mode=mode,
96
136
  language_code=language_code,
97
137
  input_audio_codec=input_audio_codec,
98
138
  request_options=request_options,
@@ -132,7 +172,10 @@ class SpeechToTextClient:
132
172
  Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.
133
173
 
134
174
  model : typing.Optional[SpeechToTextTranslateModel]
135
- Model to be used for converting speech to text in target language
175
+ Model to be used for speech to text translation.
176
+
177
+ - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
178
+ - Example: Hindi audio → English text output
136
179
 
137
180
  input_audio_codec : typing.Optional[InputAudioCodec]
138
181
  Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.
@@ -180,6 +223,7 @@ class AsyncSpeechToTextClient:
180
223
  *,
181
224
  file: core.File,
182
225
  model: typing.Optional[SpeechToTextModel] = OMIT,
226
+ mode: typing.Optional[Mode] = OMIT,
183
227
  language_code: typing.Optional[SpeechToTextLanguage] = OMIT,
184
228
  input_audio_codec: typing.Optional[InputAudioCodec] = OMIT,
185
229
  request_options: typing.Optional[RequestOptions] = None,
@@ -206,12 +250,49 @@ class AsyncSpeechToTextClient:
206
250
 
207
251
  model : typing.Optional[SpeechToTextModel]
208
252
  Specifies the model to use for speech-to-text conversion.
209
- Note:- Default model is `saarika:v2.5`
253
+
254
+ - **saarika:v2.5** (default): Transcribes audio in the spoken language.
255
+
256
+ - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
257
+
258
+ mode : typing.Optional[Mode]
259
+ Mode of operation. **Only applicable when using the saaras:v3 model.**
260
+
261
+ Example audio: 'मेरा फोन नंबर है 9840950950'
262
+
263
+ - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
264
+ - Output: `मेरा फोन नंबर है 9840950950`
265
+
266
+ - **translate**: Translates speech from any supported Indic language to English.
267
+ - Output: `My phone number is 9840950950`
268
+
269
+ - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
270
+ - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
271
+
272
+ - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
273
+ - Output: `mera phone number hai 9840950950`
274
+
275
+ - **codemix**: Code-mixed text with English words in English and Indic words in native script.
276
+ - Output: `मेरा phone number है 9840950950`
210
277
 
211
278
  language_code : typing.Optional[SpeechToTextLanguage]
212
- Specifies the language of the input audio.
213
- For the `saarika:v2.5` model, it is optional.
214
- `unknown`: Use this when the language is not known; the API will detect it automatically.
279
+ Specifies the language of the input audio in BCP-47 format.
280
+
281
+ **Note:** This parameter is optional for the `saarika:v2.5` model.
282
+
283
+ **Available Options:**
284
+ - `unknown`: Use when the language is not known; the API will auto-detect.
285
+ - `hi-IN`: Hindi
286
+ - `bn-IN`: Bengali
287
+ - `kn-IN`: Kannada
288
+ - `ml-IN`: Malayalam
289
+ - `mr-IN`: Marathi
290
+ - `od-IN`: Odia
291
+ - `pa-IN`: Punjabi
292
+ - `ta-IN`: Tamil
293
+ - `te-IN`: Telugu
294
+ - `en-IN`: English
295
+ - `gu-IN`: Gujarati
215
296
 
216
297
  input_audio_codec : typing.Optional[InputAudioCodec]
217
298
  Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.
@@ -244,6 +325,7 @@ class AsyncSpeechToTextClient:
244
325
  _response = await self._raw_client.transcribe(
245
326
  file=file,
246
327
  model=model,
328
+ mode=mode,
247
329
  language_code=language_code,
248
330
  input_audio_codec=input_audio_codec,
249
331
  request_options=request_options,
@@ -283,7 +365,10 @@ class AsyncSpeechToTextClient:
283
365
  Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.
284
366
 
285
367
  model : typing.Optional[SpeechToTextTranslateModel]
286
- Model to be used for converting speech to text in target language
368
+ Model to be used for speech to text translation.
369
+
370
+ - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
371
+ - Example: Hindi audio → English text output
287
372
 
288
373
  input_audio_codec : typing.Optional[InputAudioCodec]
289
374
  Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.
@@ -16,6 +16,7 @@ from ..errors.service_unavailable_error import ServiceUnavailableError
16
16
  from ..errors.too_many_requests_error import TooManyRequestsError
17
17
  from ..errors.unprocessable_entity_error import UnprocessableEntityError
18
18
  from ..types.input_audio_codec import InputAudioCodec
19
+ from ..types.mode import Mode
19
20
  from ..types.speech_to_text_language import SpeechToTextLanguage
20
21
  from ..types.speech_to_text_model import SpeechToTextModel
21
22
  from ..types.speech_to_text_response import SpeechToTextResponse
@@ -35,6 +36,7 @@ class RawSpeechToTextClient:
35
36
  *,
36
37
  file: core.File,
37
38
  model: typing.Optional[SpeechToTextModel] = OMIT,
39
+ mode: typing.Optional[Mode] = OMIT,
38
40
  language_code: typing.Optional[SpeechToTextLanguage] = OMIT,
39
41
  input_audio_codec: typing.Optional[InputAudioCodec] = OMIT,
40
42
  request_options: typing.Optional[RequestOptions] = None,
@@ -61,12 +63,49 @@ class RawSpeechToTextClient:
61
63
 
62
64
  model : typing.Optional[SpeechToTextModel]
63
65
  Specifies the model to use for speech-to-text conversion.
64
- Note:- Default model is `saarika:v2.5`
66
+
67
+ - **saarika:v2.5** (default): Transcribes audio in the spoken language.
68
+
69
+ - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
70
+
71
+ mode : typing.Optional[Mode]
72
+ Mode of operation. **Only applicable when using the saaras:v3 model.**
73
+
74
+ Example audio: 'मेरा फोन नंबर है 9840950950'
75
+
76
+ - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
77
+ - Output: `मेरा फोन नंबर है 9840950950`
78
+
79
+ - **translate**: Translates speech from any supported Indic language to English.
80
+ - Output: `My phone number is 9840950950`
81
+
82
+ - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
83
+ - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
84
+
85
+ - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
86
+ - Output: `mera phone number hai 9840950950`
87
+
88
+ - **codemix**: Code-mixed text with English words in English and Indic words in native script.
89
+ - Output: `मेरा phone number है 9840950950`
65
90
 
66
91
  language_code : typing.Optional[SpeechToTextLanguage]
67
- Specifies the language of the input audio.
68
- For the `saarika:v2.5` model, it is optional.
69
- `unknown`: Use this when the language is not known; the API will detect it automatically.
92
+ Specifies the language of the input audio in BCP-47 format.
93
+
94
+ **Note:** This parameter is optional for the `saarika:v2.5` model.
95
+
96
+ **Available Options:**
97
+ - `unknown`: Use when the language is not known; the API will auto-detect.
98
+ - `hi-IN`: Hindi
99
+ - `bn-IN`: Bengali
100
+ - `kn-IN`: Kannada
101
+ - `ml-IN`: Malayalam
102
+ - `mr-IN`: Marathi
103
+ - `od-IN`: Odia
104
+ - `pa-IN`: Punjabi
105
+ - `ta-IN`: Tamil
106
+ - `te-IN`: Telugu
107
+ - `en-IN`: English
108
+ - `gu-IN`: Gujarati
70
109
 
71
110
  input_audio_codec : typing.Optional[InputAudioCodec]
72
111
  Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.
@@ -85,6 +124,7 @@ class RawSpeechToTextClient:
85
124
  method="POST",
86
125
  data={
87
126
  "model": model,
127
+ "mode": mode,
88
128
  "language_code": language_code,
89
129
  "input_audio_codec": input_audio_codec,
90
130
  },
@@ -209,7 +249,10 @@ class RawSpeechToTextClient:
209
249
  Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.
210
250
 
211
251
  model : typing.Optional[SpeechToTextTranslateModel]
212
- Model to be used for converting speech to text in target language
252
+ Model to be used for speech to text translation.
253
+
254
+ - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
255
+ - Example: Hindi audio → English text output
213
256
 
214
257
  input_audio_codec : typing.Optional[InputAudioCodec]
215
258
  Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.
@@ -329,6 +372,7 @@ class AsyncRawSpeechToTextClient:
329
372
  *,
330
373
  file: core.File,
331
374
  model: typing.Optional[SpeechToTextModel] = OMIT,
375
+ mode: typing.Optional[Mode] = OMIT,
332
376
  language_code: typing.Optional[SpeechToTextLanguage] = OMIT,
333
377
  input_audio_codec: typing.Optional[InputAudioCodec] = OMIT,
334
378
  request_options: typing.Optional[RequestOptions] = None,
@@ -355,12 +399,49 @@ class AsyncRawSpeechToTextClient:
355
399
 
356
400
  model : typing.Optional[SpeechToTextModel]
357
401
  Specifies the model to use for speech-to-text conversion.
358
- Note:- Default model is `saarika:v2.5`
402
+
403
+ - **saarika:v2.5** (default): Transcribes audio in the spoken language.
404
+
405
+ - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
406
+
407
+ mode : typing.Optional[Mode]
408
+ Mode of operation. **Only applicable when using the saaras:v3 model.**
409
+
410
+ Example audio: 'मेरा फोन नंबर है 9840950950'
411
+
412
+ - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
413
+ - Output: `मेरा फोन नंबर है 9840950950`
414
+
415
+ - **translate**: Translates speech from any supported Indic language to English.
416
+ - Output: `My phone number is 9840950950`
417
+
418
+ - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
419
+ - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
420
+
421
+ - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
422
+ - Output: `mera phone number hai 9840950950`
423
+
424
+ - **codemix**: Code-mixed text with English words in English and Indic words in native script.
425
+ - Output: `मेरा phone number है 9840950950`
359
426
 
360
427
  language_code : typing.Optional[SpeechToTextLanguage]
361
- Specifies the language of the input audio.
362
- For the `saarika:v2.5` model, it is optional.
363
- `unknown`: Use this when the language is not known; the API will detect it automatically.
428
+ Specifies the language of the input audio in BCP-47 format.
429
+
430
+ **Note:** This parameter is optional for the `saarika:v2.5` model.
431
+
432
+ **Available Options:**
433
+ - `unknown`: Use when the language is not known; the API will auto-detect.
434
+ - `hi-IN`: Hindi
435
+ - `bn-IN`: Bengali
436
+ - `kn-IN`: Kannada
437
+ - `ml-IN`: Malayalam
438
+ - `mr-IN`: Marathi
439
+ - `od-IN`: Odia
440
+ - `pa-IN`: Punjabi
441
+ - `ta-IN`: Tamil
442
+ - `te-IN`: Telugu
443
+ - `en-IN`: English
444
+ - `gu-IN`: Gujarati
364
445
 
365
446
  input_audio_codec : typing.Optional[InputAudioCodec]
366
447
  Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.
@@ -379,6 +460,7 @@ class AsyncRawSpeechToTextClient:
379
460
  method="POST",
380
461
  data={
381
462
  "model": model,
463
+ "mode": mode,
382
464
  "language_code": language_code,
383
465
  "input_audio_codec": input_audio_codec,
384
466
  },
@@ -503,7 +585,10 @@ class AsyncRawSpeechToTextClient:
503
585
  Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.
504
586
 
505
587
  model : typing.Optional[SpeechToTextTranslateModel]
506
- Model to be used for converting speech to text in target language
588
+ Model to be used for speech to text translation.
589
+
590
+ - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
591
+ - Example: Hindi audio → English text output
507
592
 
508
593
  input_audio_codec : typing.Optional[InputAudioCodec]
509
594
  Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.
@@ -12,6 +12,7 @@ from ..types.files_upload_response import FilesUploadResponse
12
12
  from ..types.job_status_v_1_response import JobStatusV1Response
13
13
  from ..types.speech_to_text_model import SpeechToTextModel
14
14
  from ..types.speech_to_text_language import SpeechToTextLanguage
15
+ from ..types.mode import Mode
15
16
  from .raw_client import AsyncRawSpeechToTextJobClient, RawSpeechToTextJobClient
16
17
  from .job import AsyncSpeechToTextJob, SpeechToTextJob
17
18
 
@@ -72,7 +73,9 @@ class SpeechToTextJobClient:
72
73
  )
73
74
  """
74
75
  _response = self._raw_client.initialise(
75
- job_parameters=job_parameters, callback=callback, request_options=request_options
76
+ job_parameters=job_parameters,
77
+ callback=callback,
78
+ request_options=request_options,
76
79
  )
77
80
  return _response.data
78
81
 
@@ -145,11 +148,17 @@ class SpeechToTextJobClient:
145
148
  job_id="job_id",
146
149
  )
147
150
  """
148
- _response = self._raw_client.start(job_id, ptu_id=ptu_id, request_options=request_options)
151
+ _response = self._raw_client.start(
152
+ job_id, ptu_id=ptu_id, request_options=request_options
153
+ )
149
154
  return _response.data
150
155
 
151
156
  def get_upload_links(
152
- self, *, job_id: str, files: typing.Sequence[str], request_options: typing.Optional[RequestOptions] = None
157
+ self,
158
+ *,
159
+ job_id: str,
160
+ files: typing.Sequence[str],
161
+ request_options: typing.Optional[RequestOptions] = None,
153
162
  ) -> FilesUploadResponse:
154
163
  """
155
164
  Start a speech to text bulk job V1
@@ -180,11 +189,17 @@ class SpeechToTextJobClient:
180
189
  files=["files"],
181
190
  )
182
191
  """
183
- _response = self._raw_client.get_upload_links(job_id=job_id, files=files, request_options=request_options)
192
+ _response = self._raw_client.get_upload_links(
193
+ job_id=job_id, files=files, request_options=request_options
194
+ )
184
195
  return _response.data
185
196
 
186
197
  def get_download_links(
187
- self, *, job_id: str, files: typing.Sequence[str], request_options: typing.Optional[RequestOptions] = None
198
+ self,
199
+ *,
200
+ job_id: str,
201
+ files: typing.Sequence[str],
202
+ request_options: typing.Optional[RequestOptions] = None,
188
203
  ) -> FilesDownloadResponse:
189
204
  """
190
205
  Start a speech to text bulk job V1
@@ -215,12 +230,15 @@ class SpeechToTextJobClient:
215
230
  files=["files"],
216
231
  )
217
232
  """
218
- _response = self._raw_client.get_download_links(job_id=job_id, files=files, request_options=request_options)
233
+ _response = self._raw_client.get_download_links(
234
+ job_id=job_id, files=files, request_options=request_options
235
+ )
219
236
  return _response.data
220
237
 
221
238
  def create_job(
222
239
  self,
223
240
  model: SpeechToTextModel = "saarika:v2.5",
241
+ mode: typing.Optional[Mode] = None,
224
242
  with_diarization: bool = False,
225
243
  with_timestamps: bool = False,
226
244
  language_code: typing.Optional[SpeechToTextLanguage] = None,
@@ -236,6 +254,10 @@ class SpeechToTextJobClient:
236
254
  model : SpeechToTextModel, default="saarika:v2.5"
237
255
  The model to use for transcription.
238
256
 
257
+ mode : typing.Optional[Mode], default=None
258
+ Mode of operation. Only applicable for the saaras:v3 model.
259
+ Options: transcribe, translate, indic-en, verbatim, translit, codemix
260
+
239
261
  with_diarization : bool, default=False
240
262
  Whether to enable speaker diarization (distinguishing who said what).
241
263
 
@@ -244,7 +266,7 @@ class SpeechToTextJobClient:
244
266
 
245
267
  language_code : typing.Optional[SpeechToTextLanguage], default=None
246
268
  The language code of the input audio (e.g., "hi-IN", "bn-IN").
247
-
269
+
248
270
  num_speakers : typing.Optional[int], default=None
249
271
  The number of distinct speakers in the audio, if known.
250
272
 
@@ -263,6 +285,7 @@ class SpeechToTextJobClient:
263
285
  job_parameters=SpeechToTextJobParametersParams(
264
286
  language_code=language_code,
265
287
  model=model,
288
+ mode=mode, # type: ignore[typeddict-item]
266
289
  num_speakers=num_speakers, # type: ignore[typeddict-item]
267
290
  with_diarization=with_diarization,
268
291
  with_timestamps=with_timestamps,
@@ -350,7 +373,9 @@ class AsyncSpeechToTextJobClient:
350
373
  asyncio.run(main())
351
374
  """
352
375
  _response = await self._raw_client.initialise(
353
- job_parameters=job_parameters, callback=callback, request_options=request_options
376
+ job_parameters=job_parameters,
377
+ callback=callback,
378
+ request_options=request_options,
354
379
  )
355
380
  return _response.data
356
381
 
@@ -392,7 +417,9 @@ class AsyncSpeechToTextJobClient:
392
417
 
393
418
  asyncio.run(main())
394
419
  """
395
- _response = await self._raw_client.get_status(job_id, request_options=request_options)
420
+ _response = await self._raw_client.get_status(
421
+ job_id, request_options=request_options
422
+ )
396
423
  return _response.data
397
424
 
398
425
  async def start(
@@ -439,11 +466,17 @@ class AsyncSpeechToTextJobClient:
439
466
 
440
467
  asyncio.run(main())
441
468
  """
442
- _response = await self._raw_client.start(job_id, ptu_id=ptu_id, request_options=request_options)
469
+ _response = await self._raw_client.start(
470
+ job_id, ptu_id=ptu_id, request_options=request_options
471
+ )
443
472
  return _response.data
444
473
 
445
474
  async def get_upload_links(
446
- self, *, job_id: str, files: typing.Sequence[str], request_options: typing.Optional[RequestOptions] = None
475
+ self,
476
+ *,
477
+ job_id: str,
478
+ files: typing.Sequence[str],
479
+ request_options: typing.Optional[RequestOptions] = None,
447
480
  ) -> FilesUploadResponse:
448
481
  """
449
482
  Start a speech to text bulk job V1
@@ -482,11 +515,17 @@ class AsyncSpeechToTextJobClient:
482
515
 
483
516
  asyncio.run(main())
484
517
  """
485
- _response = await self._raw_client.get_upload_links(job_id=job_id, files=files, request_options=request_options)
518
+ _response = await self._raw_client.get_upload_links(
519
+ job_id=job_id, files=files, request_options=request_options
520
+ )
486
521
  return _response.data
487
522
 
488
523
  async def get_download_links(
489
- self, *, job_id: str, files: typing.Sequence[str], request_options: typing.Optional[RequestOptions] = None
524
+ self,
525
+ *,
526
+ job_id: str,
527
+ files: typing.Sequence[str],
528
+ request_options: typing.Optional[RequestOptions] = None,
490
529
  ) -> FilesDownloadResponse:
491
530
  """
492
531
  Start a speech to text bulk job V1
@@ -533,6 +572,7 @@ class AsyncSpeechToTextJobClient:
533
572
  async def create_job(
534
573
  self,
535
574
  model: SpeechToTextModel = "saarika:v2.5",
575
+ mode: typing.Optional[Mode] = None,
536
576
  with_diarization: bool = False,
537
577
  with_timestamps: bool = False,
538
578
  language_code: typing.Optional[SpeechToTextLanguage] = None,
@@ -548,6 +588,10 @@ class AsyncSpeechToTextJobClient:
548
588
  model : SpeechToTextModel, default="saarika:v2.5"
549
589
  The model to use for transcription.
550
590
 
591
+ mode : typing.Optional[Mode], default=None
592
+ Mode of operation. Only applicable for the saaras:v3 model.
593
+ Options: transcribe, translate, indic-en, verbatim, translit, codemix
594
+
551
595
  with_diarization : typing.Optional[bool], default=False
552
596
  Whether to enable speaker diarization (distinguishing who said what).
553
597
 
@@ -556,8 +600,8 @@ class AsyncSpeechToTextJobClient:
556
600
 
557
601
  language_code : typing.Optional[SpeechToTextLanguage], default=None
558
602
  The language code of the input audio (e.g., "hi-IN", "bn-IN").
559
-
560
- num_speakers : typing.Optional[int], default=None
603
+
604
+ num_speakers : typing.Optional[int], default=None
561
605
  The number of distinct speakers in the audio, if known.
562
606
 
563
607
  callback : typing.Optional[BulkJobCallbackParams], default=OMIT
@@ -575,6 +619,7 @@ class AsyncSpeechToTextJobClient:
575
619
  job_parameters=SpeechToTextJobParametersParams(
576
620
  language_code=language_code,
577
621
  model=model,
622
+ mode=mode, # type: ignore[typeddict-item]
578
623
  with_diarization=with_diarization,
579
624
  with_timestamps=with_timestamps,
580
625
  num_speakers=num_speakers, # type: ignore[typeddict-item]
@@ -7,6 +7,8 @@ from .types import (
7
7
  SpeechToTextStreamingHighVadSensitivity,
8
8
  SpeechToTextStreamingInputAudioCodec,
9
9
  SpeechToTextStreamingLanguageCode,
10
+ SpeechToTextStreamingMode,
11
+ SpeechToTextStreamingModel,
10
12
  SpeechToTextStreamingVadSignals,
11
13
  )
12
14
 
@@ -15,5 +17,7 @@ __all__ = [
15
17
  "SpeechToTextStreamingHighVadSensitivity",
16
18
  "SpeechToTextStreamingInputAudioCodec",
17
19
  "SpeechToTextStreamingLanguageCode",
20
+ "SpeechToTextStreamingMode",
21
+ "SpeechToTextStreamingModel",
18
22
  "SpeechToTextStreamingVadSignals",
19
23
  ]