sarvamai 0.1.22a3__py3-none-any.whl → 0.1.22a7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sarvamai/__init__.py +62 -9
- sarvamai/client.py +3 -0
- sarvamai/core/client_wrapper.py +2 -2
- sarvamai/doc_digitization_job/__init__.py +4 -0
- sarvamai/doc_digitization_job/client.py +776 -0
- sarvamai/doc_digitization_job/job.py +496 -0
- sarvamai/doc_digitization_job/raw_client.py +1176 -0
- sarvamai/requests/__init__.py +20 -0
- sarvamai/requests/audio_data.py +0 -6
- sarvamai/requests/configure_connection.py +4 -0
- sarvamai/requests/configure_connection_data.py +40 -11
- sarvamai/requests/doc_digitization_create_job_response.py +25 -0
- sarvamai/requests/doc_digitization_download_files_response.py +37 -0
- sarvamai/requests/doc_digitization_error_details.py +21 -0
- sarvamai/requests/doc_digitization_error_message.py +11 -0
- sarvamai/requests/doc_digitization_job_detail.py +64 -0
- sarvamai/requests/doc_digitization_job_parameters.py +21 -0
- sarvamai/requests/doc_digitization_job_status_response.py +65 -0
- sarvamai/requests/doc_digitization_page_error.py +24 -0
- sarvamai/requests/doc_digitization_upload_files_response.py +34 -0
- sarvamai/requests/doc_digitization_webhook_callback.py +19 -0
- sarvamai/requests/speech_to_text_job_parameters.py +43 -2
- sarvamai/requests/speech_to_text_transcription_data.py +0 -6
- sarvamai/requests/speech_to_text_translate_job_parameters.py +4 -1
- sarvamai/requests/speech_to_text_translate_transcription_data.py +0 -6
- sarvamai/speech_to_text/client.py +95 -10
- sarvamai/speech_to_text/raw_client.py +95 -10
- sarvamai/speech_to_text_job/client.py +60 -15
- sarvamai/speech_to_text_job/job.py +100 -2
- sarvamai/speech_to_text_job/raw_client.py +14 -10
- sarvamai/speech_to_text_streaming/__init__.py +4 -2
- sarvamai/speech_to_text_streaming/client.py +100 -47
- sarvamai/speech_to_text_streaming/raw_client.py +100 -47
- sarvamai/speech_to_text_streaming/types/__init__.py +4 -2
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_input_audio_codec.py +1 -27
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_mode.py +7 -0
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_model.py +5 -0
- sarvamai/speech_to_text_translate_job/job.py +100 -2
- sarvamai/speech_to_text_translate_job/raw_client.py +14 -10
- sarvamai/speech_to_text_translate_streaming/__init__.py +0 -2
- sarvamai/speech_to_text_translate_streaming/client.py +18 -41
- sarvamai/speech_to_text_translate_streaming/raw_client.py +18 -41
- sarvamai/speech_to_text_translate_streaming/types/__init__.py +0 -4
- sarvamai/speech_to_text_translate_streaming/types/speech_to_text_translate_streaming_input_audio_codec.py +1 -27
- sarvamai/text/client.py +0 -12
- sarvamai/text/raw_client.py +0 -12
- sarvamai/text_to_speech/client.py +116 -14
- sarvamai/text_to_speech/raw_client.py +116 -14
- sarvamai/text_to_speech_streaming/__init__.py +2 -2
- sarvamai/text_to_speech_streaming/client.py +19 -6
- sarvamai/text_to_speech_streaming/raw_client.py +19 -6
- sarvamai/text_to_speech_streaming/types/__init__.py +2 -1
- sarvamai/text_to_speech_streaming/types/text_to_speech_streaming_model.py +5 -0
- sarvamai/types/__init__.py +34 -4
- sarvamai/types/audio_data.py +0 -6
- sarvamai/types/completion_event_flag.py +3 -1
- sarvamai/types/configure_connection.py +4 -0
- sarvamai/types/configure_connection_data.py +40 -11
- sarvamai/types/configure_connection_data_model.py +5 -0
- sarvamai/types/configure_connection_data_speaker.py +35 -1
- sarvamai/types/doc_digitization_create_job_response.py +37 -0
- sarvamai/types/doc_digitization_download_files_response.py +47 -0
- sarvamai/types/doc_digitization_error_code.py +15 -0
- sarvamai/types/doc_digitization_error_details.py +33 -0
- sarvamai/types/doc_digitization_error_message.py +23 -0
- sarvamai/types/doc_digitization_job_detail.py +74 -0
- sarvamai/types/doc_digitization_job_detail_state.py +7 -0
- sarvamai/types/doc_digitization_job_parameters.py +33 -0
- sarvamai/types/doc_digitization_job_state.py +7 -0
- sarvamai/types/doc_digitization_job_status_response.py +75 -0
- sarvamai/types/doc_digitization_output_format.py +5 -0
- sarvamai/types/doc_digitization_page_error.py +36 -0
- sarvamai/types/doc_digitization_supported_language.py +32 -0
- sarvamai/types/doc_digitization_upload_files_response.py +44 -0
- sarvamai/types/doc_digitization_webhook_callback.py +31 -0
- sarvamai/types/mode.py +5 -0
- sarvamai/types/speech_to_text_job_parameters.py +43 -2
- sarvamai/types/speech_to_text_model.py +1 -1
- sarvamai/types/speech_to_text_transcription_data.py +0 -6
- sarvamai/types/speech_to_text_translate_job_parameters.py +4 -1
- sarvamai/types/speech_to_text_translate_transcription_data.py +0 -6
- sarvamai/types/text_to_speech_model.py +1 -1
- sarvamai/types/text_to_speech_speaker.py +35 -1
- {sarvamai-0.1.22a3.dist-info → sarvamai-0.1.22a7.dist-info}/METADATA +1 -1
- {sarvamai-0.1.22a3.dist-info → sarvamai-0.1.22a7.dist-info}/RECORD +86 -56
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_stream_ongoing_speech_results.py +0 -5
- sarvamai/speech_to_text_translate_streaming/types/speech_to_text_translate_streaming_stream_ongoing_speech_results.py +0 -5
- sarvamai/types/audio_data_input_audio_codec.py +0 -33
- sarvamai/types/response_speech_state.py +0 -7
- {sarvamai-0.1.22a3.dist-info → sarvamai-0.1.22a7.dist-info}/WHEEL +0 -0
|
@@ -39,7 +39,7 @@ class RawSpeechToTextJobClient:
|
|
|
39
39
|
request_options: typing.Optional[RequestOptions] = None,
|
|
40
40
|
) -> HttpResponse[BulkJobInitResponseV1]:
|
|
41
41
|
"""
|
|
42
|
-
|
|
42
|
+
Create a new speech to text bulk job and receive a job UUID and storage folder details for processing multiple audio files
|
|
43
43
|
|
|
44
44
|
Parameters
|
|
45
45
|
----------
|
|
@@ -160,7 +160,9 @@ class RawSpeechToTextJobClient:
|
|
|
160
160
|
self, job_id: str, *, request_options: typing.Optional[RequestOptions] = None
|
|
161
161
|
) -> HttpResponse[JobStatusV1Response]:
|
|
162
162
|
"""
|
|
163
|
-
|
|
163
|
+
Retrieve the current status and details of a speech to text bulk job, including progress and file-level information.
|
|
164
|
+
|
|
165
|
+
**Rate Limiting Best Practice:** To prevent rate limit errors and ensure optimal server performance, we recommend implementing a minimum 5-millisecond delay between consecutive status polling requests. This helps maintain system stability while still providing timely status updates.
|
|
164
166
|
|
|
165
167
|
Parameters
|
|
166
168
|
----------
|
|
@@ -270,7 +272,7 @@ class RawSpeechToTextJobClient:
|
|
|
270
272
|
request_options: typing.Optional[RequestOptions] = None,
|
|
271
273
|
) -> HttpResponse[JobStatusV1Response]:
|
|
272
274
|
"""
|
|
273
|
-
Start a speech to text bulk job
|
|
275
|
+
Start processing a speech to text bulk job after all audio files have been uploaded
|
|
274
276
|
|
|
275
277
|
Parameters
|
|
276
278
|
----------
|
|
@@ -381,7 +383,7 @@ class RawSpeechToTextJobClient:
|
|
|
381
383
|
self, *, job_id: str, files: typing.Sequence[str], request_options: typing.Optional[RequestOptions] = None
|
|
382
384
|
) -> HttpResponse[FilesUploadResponse]:
|
|
383
385
|
"""
|
|
384
|
-
|
|
386
|
+
Generate presigned upload URLs for audio files that will be processed in a speech to text bulk job
|
|
385
387
|
|
|
386
388
|
Parameters
|
|
387
389
|
----------
|
|
@@ -496,7 +498,7 @@ class RawSpeechToTextJobClient:
|
|
|
496
498
|
self, *, job_id: str, files: typing.Sequence[str], request_options: typing.Optional[RequestOptions] = None
|
|
497
499
|
) -> HttpResponse[FilesDownloadResponse]:
|
|
498
500
|
"""
|
|
499
|
-
|
|
501
|
+
Generate presigned download URLs for the transcription output files of a completed speech to text bulk job
|
|
500
502
|
|
|
501
503
|
Parameters
|
|
502
504
|
----------
|
|
@@ -620,7 +622,7 @@ class AsyncRawSpeechToTextJobClient:
|
|
|
620
622
|
request_options: typing.Optional[RequestOptions] = None,
|
|
621
623
|
) -> AsyncHttpResponse[BulkJobInitResponseV1]:
|
|
622
624
|
"""
|
|
623
|
-
|
|
625
|
+
Create a new speech to text bulk job and receive a job UUID and storage folder details for processing multiple audio files
|
|
624
626
|
|
|
625
627
|
Parameters
|
|
626
628
|
----------
|
|
@@ -741,7 +743,9 @@ class AsyncRawSpeechToTextJobClient:
|
|
|
741
743
|
self, job_id: str, *, request_options: typing.Optional[RequestOptions] = None
|
|
742
744
|
) -> AsyncHttpResponse[JobStatusV1Response]:
|
|
743
745
|
"""
|
|
744
|
-
|
|
746
|
+
Retrieve the current status and details of a speech to text bulk job, including progress and file-level information.
|
|
747
|
+
|
|
748
|
+
**Rate Limiting Best Practice:** To prevent rate limit errors and ensure optimal server performance, we recommend implementing a minimum 5-millisecond delay between consecutive status polling requests. This helps maintain system stability while still providing timely status updates.
|
|
745
749
|
|
|
746
750
|
Parameters
|
|
747
751
|
----------
|
|
@@ -851,7 +855,7 @@ class AsyncRawSpeechToTextJobClient:
|
|
|
851
855
|
request_options: typing.Optional[RequestOptions] = None,
|
|
852
856
|
) -> AsyncHttpResponse[JobStatusV1Response]:
|
|
853
857
|
"""
|
|
854
|
-
Start a speech to text bulk job
|
|
858
|
+
Start processing a speech to text bulk job after all audio files have been uploaded
|
|
855
859
|
|
|
856
860
|
Parameters
|
|
857
861
|
----------
|
|
@@ -962,7 +966,7 @@ class AsyncRawSpeechToTextJobClient:
|
|
|
962
966
|
self, *, job_id: str, files: typing.Sequence[str], request_options: typing.Optional[RequestOptions] = None
|
|
963
967
|
) -> AsyncHttpResponse[FilesUploadResponse]:
|
|
964
968
|
"""
|
|
965
|
-
|
|
969
|
+
Generate presigned upload URLs for audio files that will be processed in a speech to text bulk job
|
|
966
970
|
|
|
967
971
|
Parameters
|
|
968
972
|
----------
|
|
@@ -1077,7 +1081,7 @@ class AsyncRawSpeechToTextJobClient:
|
|
|
1077
1081
|
self, *, job_id: str, files: typing.Sequence[str], request_options: typing.Optional[RequestOptions] = None
|
|
1078
1082
|
) -> AsyncHttpResponse[FilesDownloadResponse]:
|
|
1079
1083
|
"""
|
|
1080
|
-
|
|
1084
|
+
Generate presigned download URLs for the transcription output files of a completed speech to text bulk job
|
|
1081
1085
|
|
|
1082
1086
|
Parameters
|
|
1083
1087
|
----------
|
|
@@ -7,7 +7,8 @@ from .types import (
|
|
|
7
7
|
SpeechToTextStreamingHighVadSensitivity,
|
|
8
8
|
SpeechToTextStreamingInputAudioCodec,
|
|
9
9
|
SpeechToTextStreamingLanguageCode,
|
|
10
|
-
SpeechToTextStreamingStreamOngoingSpeechResults,
|
|
10
|
+
SpeechToTextStreamingMode,
|
|
11
|
+
SpeechToTextStreamingModel,
|
|
11
12
|
SpeechToTextStreamingVadSignals,
|
|
12
13
|
)
|
|
13
14
|
|
|
@@ -16,6 +17,7 @@ __all__ = [
|
|
|
16
17
|
"SpeechToTextStreamingHighVadSensitivity",
|
|
17
18
|
"SpeechToTextStreamingInputAudioCodec",
|
|
18
19
|
"SpeechToTextStreamingLanguageCode",
|
|
19
|
-
"SpeechToTextStreamingStreamOngoingSpeechResults",
|
|
20
|
+
"SpeechToTextStreamingMode",
|
|
21
|
+
"SpeechToTextStreamingModel",
|
|
20
22
|
"SpeechToTextStreamingVadSignals",
|
|
21
23
|
]
|
|
@@ -15,9 +15,8 @@ from .types.speech_to_text_streaming_flush_signal import SpeechToTextStreamingFl
|
|
|
15
15
|
from .types.speech_to_text_streaming_high_vad_sensitivity import SpeechToTextStreamingHighVadSensitivity
|
|
16
16
|
from .types.speech_to_text_streaming_input_audio_codec import SpeechToTextStreamingInputAudioCodec
|
|
17
17
|
from .types.speech_to_text_streaming_language_code import SpeechToTextStreamingLanguageCode
|
|
18
|
-
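from .types.speech_to_text_streaming_stream_ongoing_speech_results import (
|
|
19
|
-
SpeechToTextStreamingStreamOngoingSpeechResults,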
|
|
20
|
-
)
|
|
18
|
+
from .types.speech_to_text_streaming_mode import SpeechToTextStreamingMode
|
|
19
|
+
from .types.speech_to_text_streaming_model import SpeechToTextStreamingModel
|
|
21
20
|
from .types.speech_to_text_streaming_vad_signals import SpeechToTextStreamingVadSignals
|
|
22
21
|
|
|
23
22
|
try:
|
|
@@ -46,14 +45,13 @@ class SpeechToTextStreamingClient:
|
|
|
46
45
|
self,
|
|
47
46
|
*,
|
|
48
47
|
language_code: SpeechToTextStreamingLanguageCode,
|
|
49
|
-
model: typing.Optional[
|
|
50
|
-
|
|
48
|
+
model: typing.Optional[SpeechToTextStreamingModel] = None,
|
|
49
|
+
mode: typing.Optional[SpeechToTextStreamingMode] = None,
|
|
51
50
|
sample_rate: typing.Optional[str] = None,
|
|
52
51
|
high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
|
|
53
52
|
vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
|
|
54
53
|
flush_signal: typing.Optional[SpeechToTextStreamingFlushSignal] = None,
|
|
55
|
-
|
|
56
|
-
streaming_ongoing_requests_frame_size: typing.Optional[str] = None,
|
|
54
|
+
input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
|
|
57
55
|
api_subscription_key: typing.Optional[str] = None,
|
|
58
56
|
request_options: typing.Optional[RequestOptions] = None,
|
|
59
57
|
) -> typing.Iterator[SpeechToTextStreamingSocketClient]:
|
|
@@ -67,13 +65,47 @@ class SpeechToTextStreamingClient:
|
|
|
67
65
|
Parameters
|
|
68
66
|
----------
|
|
69
67
|
language_code : SpeechToTextStreamingLanguageCode
|
|
70
|
-
|
|
68
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
71
69
|
|
|
72
|
-
|
|
73
|
-
|
|
70
|
+
**Available Options:**
|
|
71
|
+
- `hi-IN`: Hindi
|
|
72
|
+
- `bn-IN`: Bengali
|
|
73
|
+
- `gu-IN`: Gujarati
|
|
74
|
+
- `kn-IN`: Kannada
|
|
75
|
+
- `ml-IN`: Malayalam
|
|
76
|
+
- `mr-IN`: Marathi
|
|
77
|
+
- `od-IN`: Odia
|
|
78
|
+
- `pa-IN`: Punjabi
|
|
79
|
+
- `ta-IN`: Tamil
|
|
80
|
+
- `te-IN`: Telugu
|
|
81
|
+
- `en-IN`: English
|
|
74
82
|
|
|
75
|
-
|
|
76
|
-
|
|
83
|
+
model : typing.Optional[SpeechToTextStreamingModel]
|
|
84
|
+
Specifies the model to use for speech-to-text conversion.
|
|
85
|
+
|
|
86
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
87
|
+
|
|
88
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
89
|
+
|
|
90
|
+
mode : typing.Optional[SpeechToTextStreamingMode]
|
|
91
|
+
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
92
|
+
|
|
93
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
94
|
+
|
|
95
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
96
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
97
|
+
|
|
98
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
99
|
+
- Output: `My phone number is 9840950950`
|
|
100
|
+
|
|
101
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
102
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
103
|
+
|
|
104
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
105
|
+
- Output: `mera phone number hai 9840950950`
|
|
106
|
+
|
|
107
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
108
|
+
- Output: `मेरा phone number है 9840950950`
|
|
77
109
|
|
|
78
110
|
sample_rate : typing.Optional[str]
|
|
79
111
|
Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
|
|
@@ -87,11 +119,9 @@ class SpeechToTextStreamingClient:
|
|
|
87
119
|
flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
|
|
88
120
|
Signal to flush the audio buffer and finalize transcription
|
|
89
121
|
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
streaming_ongoing_requests_frame_size : typing.Optional[str]
|
|
94
|
-
Frame size for streaming ongoing speech results (1-100)
|
|
122
|
+
input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
|
|
123
|
+
Audio codec/format of the input stream. Use this when sending raw PCM audio.
|
|
124
|
+
Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
|
|
95
125
|
|
|
96
126
|
api_subscription_key : typing.Optional[str]
|
|
97
127
|
API subscription key for authentication
|
|
@@ -109,8 +139,8 @@ class SpeechToTextStreamingClient:
|
|
|
109
139
|
query_params = query_params.add("language-code", language_code)
|
|
110
140
|
if model is not None:
|
|
111
141
|
query_params = query_params.add("model", model)
|
|
112
|
-
if
|
|
113
|
-
query_params = query_params.add("
|
|
142
|
+
if mode is not None:
|
|
143
|
+
query_params = query_params.add("mode", mode)
|
|
114
144
|
if sample_rate is not None:
|
|
115
145
|
query_params = query_params.add("sample_rate", sample_rate)
|
|
116
146
|
if high_vad_sensitivity is not None:
|
|
@@ -119,12 +149,8 @@ class SpeechToTextStreamingClient:
|
|
|
119
149
|
query_params = query_params.add("vad_signals", vad_signals)
|
|
120
150
|
if flush_signal is not None:
|
|
121
151
|
query_params = query_params.add("flush_signal", flush_signal)
|
|
122
|
-
if
|
|
123
|
-
query_params = query_params.add("
|
|
124
|
-
if streaming_ongoing_requests_frame_size is not None:
|
|
125
|
-
query_params = query_params.add(
|
|
126
|
-
"streaming_ongoing_requests_frame_size", streaming_ongoing_requests_frame_size
|
|
127
|
-
)
|
|
152
|
+
if input_audio_codec is not None:
|
|
153
|
+
query_params = query_params.add("input_audio_codec", input_audio_codec)
|
|
128
154
|
ws_url = ws_url + f"?{query_params}"
|
|
129
155
|
headers = self._raw_client._client_wrapper.get_headers()
|
|
130
156
|
if api_subscription_key is not None:
|
|
@@ -169,14 +195,13 @@ class AsyncSpeechToTextStreamingClient:
|
|
|
169
195
|
self,
|
|
170
196
|
*,
|
|
171
197
|
language_code: SpeechToTextStreamingLanguageCode,
|
|
172
|
-
model: typing.Optional[
|
|
173
|
-
|
|
198
|
+
model: typing.Optional[SpeechToTextStreamingModel] = None,
|
|
199
|
+
mode: typing.Optional[SpeechToTextStreamingMode] = None,
|
|
174
200
|
sample_rate: typing.Optional[str] = None,
|
|
175
201
|
high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
|
|
176
202
|
vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
|
|
177
203
|
flush_signal: typing.Optional[SpeechToTextStreamingFlushSignal] = None,
|
|
178
|
-
|
|
179
|
-
streaming_ongoing_requests_frame_size: typing.Optional[str] = None,
|
|
204
|
+
input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
|
|
180
205
|
api_subscription_key: typing.Optional[str] = None,
|
|
181
206
|
request_options: typing.Optional[RequestOptions] = None,
|
|
182
207
|
) -> typing.AsyncIterator[AsyncSpeechToTextStreamingSocketClient]:
|
|
@@ -190,13 +215,47 @@ class AsyncSpeechToTextStreamingClient:
|
|
|
190
215
|
Parameters
|
|
191
216
|
----------
|
|
192
217
|
language_code : SpeechToTextStreamingLanguageCode
|
|
193
|
-
|
|
218
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
194
219
|
|
|
195
|
-
|
|
196
|
-
|
|
220
|
+
**Available Options:**
|
|
221
|
+
- `hi-IN`: Hindi
|
|
222
|
+
- `bn-IN`: Bengali
|
|
223
|
+
- `gu-IN`: Gujarati
|
|
224
|
+
- `kn-IN`: Kannada
|
|
225
|
+
- `ml-IN`: Malayalam
|
|
226
|
+
- `mr-IN`: Marathi
|
|
227
|
+
- `od-IN`: Odia
|
|
228
|
+
- `pa-IN`: Punjabi
|
|
229
|
+
- `ta-IN`: Tamil
|
|
230
|
+
- `te-IN`: Telugu
|
|
231
|
+
- `en-IN`: English
|
|
197
232
|
|
|
198
|
-
|
|
199
|
-
|
|
233
|
+
model : typing.Optional[SpeechToTextStreamingModel]
|
|
234
|
+
Specifies the model to use for speech-to-text conversion.
|
|
235
|
+
|
|
236
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
237
|
+
|
|
238
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
239
|
+
|
|
240
|
+
mode : typing.Optional[SpeechToTextStreamingMode]
|
|
241
|
+
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
242
|
+
|
|
243
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
244
|
+
|
|
245
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
246
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
247
|
+
|
|
248
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
249
|
+
- Output: `My phone number is 9840950950`
|
|
250
|
+
|
|
251
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
252
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
253
|
+
|
|
254
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
255
|
+
- Output: `mera phone number hai 9840950950`
|
|
256
|
+
|
|
257
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
258
|
+
- Output: `मेरा phone number है 9840950950`
|
|
200
259
|
|
|
201
260
|
sample_rate : typing.Optional[str]
|
|
202
261
|
Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
|
|
@@ -210,11 +269,9 @@ class AsyncSpeechToTextStreamingClient:
|
|
|
210
269
|
flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
|
|
211
270
|
Signal to flush the audio buffer and finalize transcription
|
|
212
271
|
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
streaming_ongoing_requests_frame_size : typing.Optional[str]
|
|
217
|
-
Frame size for streaming ongoing speech results (1-100)
|
|
272
|
+
input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
|
|
273
|
+
Audio codec/format of the input stream. Use this when sending raw PCM audio.
|
|
274
|
+
Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
|
|
218
275
|
|
|
219
276
|
api_subscription_key : typing.Optional[str]
|
|
220
277
|
API subscription key for authentication
|
|
@@ -232,8 +289,8 @@ class AsyncSpeechToTextStreamingClient:
|
|
|
232
289
|
query_params = query_params.add("language-code", language_code)
|
|
233
290
|
if model is not None:
|
|
234
291
|
query_params = query_params.add("model", model)
|
|
235
|
-
if
|
|
236
|
-
query_params = query_params.add("
|
|
292
|
+
if mode is not None:
|
|
293
|
+
query_params = query_params.add("mode", mode)
|
|
237
294
|
if sample_rate is not None:
|
|
238
295
|
query_params = query_params.add("sample_rate", sample_rate)
|
|
239
296
|
if high_vad_sensitivity is not None:
|
|
@@ -242,12 +299,8 @@ class AsyncSpeechToTextStreamingClient:
|
|
|
242
299
|
query_params = query_params.add("vad_signals", vad_signals)
|
|
243
300
|
if flush_signal is not None:
|
|
244
301
|
query_params = query_params.add("flush_signal", flush_signal)
|
|
245
|
-
if
|
|
246
|
-
query_params = query_params.add("
|
|
247
|
-
if streaming_ongoing_requests_frame_size is not None:
|
|
248
|
-
query_params = query_params.add(
|
|
249
|
-
"streaming_ongoing_requests_frame_size", streaming_ongoing_requests_frame_size
|
|
250
|
-
)
|
|
302
|
+
if input_audio_codec is not None:
|
|
303
|
+
query_params = query_params.add("input_audio_codec", input_audio_codec)
|
|
251
304
|
ws_url = ws_url + f"?{query_params}"
|
|
252
305
|
headers = self._raw_client._client_wrapper.get_headers()
|
|
253
306
|
if api_subscription_key is not None:
|
|
@@ -14,9 +14,8 @@ from .types.speech_to_text_streaming_flush_signal import SpeechToTextStreamingFl
|
|
|
14
14
|
from .types.speech_to_text_streaming_high_vad_sensitivity import SpeechToTextStreamingHighVadSensitivity
|
|
15
15
|
from .types.speech_to_text_streaming_input_audio_codec import SpeechToTextStreamingInputAudioCodec
|
|
16
16
|
from .types.speech_to_text_streaming_language_code import SpeechToTextStreamingLanguageCode
|
|
17
|
-
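from .types.speech_to_text_streaming_stream_ongoing_speech_results import (
|
|
18
|
-
SpeechToTextStreamingStreamOngoingSpeechResults,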
|
|
19
|
-
)
|
|
17
|
+
from .types.speech_to_text_streaming_mode import SpeechToTextStreamingMode
|
|
18
|
+
from .types.speech_to_text_streaming_model import SpeechToTextStreamingModel
|
|
20
19
|
from .types.speech_to_text_streaming_vad_signals import SpeechToTextStreamingVadSignals
|
|
21
20
|
|
|
22
21
|
try:
|
|
@@ -34,14 +33,13 @@ class RawSpeechToTextStreamingClient:
|
|
|
34
33
|
self,
|
|
35
34
|
*,
|
|
36
35
|
language_code: SpeechToTextStreamingLanguageCode,
|
|
37
|
-
model: typing.Optional[
|
|
38
|
-
|
|
36
|
+
model: typing.Optional[SpeechToTextStreamingModel] = None,
|
|
37
|
+
mode: typing.Optional[SpeechToTextStreamingMode] = None,
|
|
39
38
|
sample_rate: typing.Optional[str] = None,
|
|
40
39
|
high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
|
|
41
40
|
vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
|
|
42
41
|
flush_signal: typing.Optional[SpeechToTextStreamingFlushSignal] = None,
|
|
43
|
-
|
|
44
|
-
streaming_ongoing_requests_frame_size: typing.Optional[str] = None,
|
|
42
|
+
input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
|
|
45
43
|
api_subscription_key: typing.Optional[str] = None,
|
|
46
44
|
request_options: typing.Optional[RequestOptions] = None,
|
|
47
45
|
) -> typing.Iterator[SpeechToTextStreamingSocketClient]:
|
|
@@ -55,13 +53,47 @@ class RawSpeechToTextStreamingClient:
|
|
|
55
53
|
Parameters
|
|
56
54
|
----------
|
|
57
55
|
language_code : SpeechToTextStreamingLanguageCode
|
|
58
|
-
|
|
56
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
59
57
|
|
|
60
|
-
|
|
61
|
-
|
|
58
|
+
**Available Options:**
|
|
59
|
+
- `hi-IN`: Hindi
|
|
60
|
+
- `bn-IN`: Bengali
|
|
61
|
+
- `gu-IN`: Gujarati
|
|
62
|
+
- `kn-IN`: Kannada
|
|
63
|
+
- `ml-IN`: Malayalam
|
|
64
|
+
- `mr-IN`: Marathi
|
|
65
|
+
- `od-IN`: Odia
|
|
66
|
+
- `pa-IN`: Punjabi
|
|
67
|
+
- `ta-IN`: Tamil
|
|
68
|
+
- `te-IN`: Telugu
|
|
69
|
+
- `en-IN`: English
|
|
62
70
|
|
|
63
|
-
|
|
64
|
-
|
|
71
|
+
model : typing.Optional[SpeechToTextStreamingModel]
|
|
72
|
+
Specifies the model to use for speech-to-text conversion.
|
|
73
|
+
|
|
74
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
75
|
+
|
|
76
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
77
|
+
|
|
78
|
+
mode : typing.Optional[SpeechToTextStreamingMode]
|
|
79
|
+
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
80
|
+
|
|
81
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
82
|
+
|
|
83
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
84
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
85
|
+
|
|
86
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
87
|
+
- Output: `My phone number is 9840950950`
|
|
88
|
+
|
|
89
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
90
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
91
|
+
|
|
92
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
93
|
+
- Output: `mera phone number hai 9840950950`
|
|
94
|
+
|
|
95
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
96
|
+
- Output: `मेरा phone number है 9840950950`
|
|
65
97
|
|
|
66
98
|
sample_rate : typing.Optional[str]
|
|
67
99
|
Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
|
|
@@ -75,11 +107,9 @@ class RawSpeechToTextStreamingClient:
|
|
|
75
107
|
flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
|
|
76
108
|
Signal to flush the audio buffer and finalize transcription
|
|
77
109
|
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
streaming_ongoing_requests_frame_size : typing.Optional[str]
|
|
82
|
-
Frame size for streaming ongoing speech results (1-100)
|
|
110
|
+
input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
|
|
111
|
+
Audio codec/format of the input stream. Use this when sending raw PCM audio.
|
|
112
|
+
Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
|
|
83
113
|
|
|
84
114
|
api_subscription_key : typing.Optional[str]
|
|
85
115
|
API subscription key for authentication
|
|
@@ -97,8 +127,8 @@ class RawSpeechToTextStreamingClient:
|
|
|
97
127
|
query_params = query_params.add("language-code", language_code)
|
|
98
128
|
if model is not None:
|
|
99
129
|
query_params = query_params.add("model", model)
|
|
100
|
-
if
|
|
101
|
-
query_params = query_params.add("
|
|
130
|
+
if mode is not None:
|
|
131
|
+
query_params = query_params.add("mode", mode)
|
|
102
132
|
if sample_rate is not None:
|
|
103
133
|
query_params = query_params.add("sample_rate", sample_rate)
|
|
104
134
|
if high_vad_sensitivity is not None:
|
|
@@ -107,12 +137,8 @@ class RawSpeechToTextStreamingClient:
|
|
|
107
137
|
query_params = query_params.add("vad_signals", vad_signals)
|
|
108
138
|
if flush_signal is not None:
|
|
109
139
|
query_params = query_params.add("flush_signal", flush_signal)
|
|
110
|
-
if
|
|
111
|
-
query_params = query_params.add("
|
|
112
|
-
if streaming_ongoing_requests_frame_size is not None:
|
|
113
|
-
query_params = query_params.add(
|
|
114
|
-
"streaming_ongoing_requests_frame_size", streaming_ongoing_requests_frame_size
|
|
115
|
-
)
|
|
140
|
+
if input_audio_codec is not None:
|
|
141
|
+
query_params = query_params.add("input_audio_codec", input_audio_codec)
|
|
116
142
|
ws_url = ws_url + f"?{query_params}"
|
|
117
143
|
headers = self._client_wrapper.get_headers()
|
|
118
144
|
if api_subscription_key is not None:
|
|
@@ -146,14 +172,13 @@ class AsyncRawSpeechToTextStreamingClient:
|
|
|
146
172
|
self,
|
|
147
173
|
*,
|
|
148
174
|
language_code: SpeechToTextStreamingLanguageCode,
|
|
149
|
-
model: typing.Optional[
|
|
150
|
-
|
|
175
|
+
model: typing.Optional[SpeechToTextStreamingModel] = None,
|
|
176
|
+
mode: typing.Optional[SpeechToTextStreamingMode] = None,
|
|
151
177
|
sample_rate: typing.Optional[str] = None,
|
|
152
178
|
high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
|
|
153
179
|
vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
|
|
154
180
|
flush_signal: typing.Optional[SpeechToTextStreamingFlushSignal] = None,
|
|
155
|
-
|
|
156
|
-
streaming_ongoing_requests_frame_size: typing.Optional[str] = None,
|
|
181
|
+
input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
|
|
157
182
|
api_subscription_key: typing.Optional[str] = None,
|
|
158
183
|
request_options: typing.Optional[RequestOptions] = None,
|
|
159
184
|
) -> typing.AsyncIterator[AsyncSpeechToTextStreamingSocketClient]:
|
|
@@ -167,13 +192,47 @@ class AsyncRawSpeechToTextStreamingClient:
|
|
|
167
192
|
Parameters
|
|
168
193
|
----------
|
|
169
194
|
language_code : SpeechToTextStreamingLanguageCode
|
|
170
|
-
|
|
195
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
171
196
|
|
|
172
|
-
|
|
173
|
-
|
|
197
|
+
**Available Options:**
|
|
198
|
+
- `hi-IN`: Hindi
|
|
199
|
+
- `bn-IN`: Bengali
|
|
200
|
+
- `gu-IN`: Gujarati
|
|
201
|
+
- `kn-IN`: Kannada
|
|
202
|
+
- `ml-IN`: Malayalam
|
|
203
|
+
- `mr-IN`: Marathi
|
|
204
|
+
- `od-IN`: Odia
|
|
205
|
+
- `pa-IN`: Punjabi
|
|
206
|
+
- `ta-IN`: Tamil
|
|
207
|
+
- `te-IN`: Telugu
|
|
208
|
+
- `en-IN`: English
|
|
174
209
|
|
|
175
|
-
|
|
176
|
-
|
|
210
|
+
model : typing.Optional[SpeechToTextStreamingModel]
|
|
211
|
+
Specifies the model to use for speech-to-text conversion.
|
|
212
|
+
|
|
213
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
214
|
+
|
|
215
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
216
|
+
|
|
217
|
+
mode : typing.Optional[SpeechToTextStreamingMode]
|
|
218
|
+
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
219
|
+
|
|
220
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
221
|
+
|
|
222
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
223
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
224
|
+
|
|
225
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
226
|
+
- Output: `My phone number is 9840950950`
|
|
227
|
+
|
|
228
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
229
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
230
|
+
|
|
231
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
232
|
+
- Output: `mera phone number hai 9840950950`
|
|
233
|
+
|
|
234
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
235
|
+
- Output: `मेरा phone number है 9840950950`
|
|
177
236
|
|
|
178
237
|
sample_rate : typing.Optional[str]
|
|
179
238
|
Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
|
|
@@ -187,11 +246,9 @@ class AsyncRawSpeechToTextStreamingClient:
|
|
|
187
246
|
flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
|
|
188
247
|
Signal to flush the audio buffer and finalize transcription
|
|
189
248
|
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
streaming_ongoing_requests_frame_size : typing.Optional[str]
|
|
194
|
-
Frame size for streaming ongoing speech results (1-100)
|
|
249
|
+
input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
|
|
250
|
+
Audio codec/format of the input stream. Use this when sending raw PCM audio.
|
|
251
|
+
Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
|
|
195
252
|
|
|
196
253
|
api_subscription_key : typing.Optional[str]
|
|
197
254
|
API subscription key for authentication
|
|
@@ -209,8 +266,8 @@ class AsyncRawSpeechToTextStreamingClient:
|
|
|
209
266
|
query_params = query_params.add("language-code", language_code)
|
|
210
267
|
if model is not None:
|
|
211
268
|
query_params = query_params.add("model", model)
|
|
212
|
-
if
|
|
213
|
-
query_params = query_params.add("
|
|
269
|
+
if mode is not None:
|
|
270
|
+
query_params = query_params.add("mode", mode)
|
|
214
271
|
if sample_rate is not None:
|
|
215
272
|
query_params = query_params.add("sample_rate", sample_rate)
|
|
216
273
|
if high_vad_sensitivity is not None:
|
|
@@ -219,12 +276,8 @@ class AsyncRawSpeechToTextStreamingClient:
|
|
|
219
276
|
query_params = query_params.add("vad_signals", vad_signals)
|
|
220
277
|
if flush_signal is not None:
|
|
221
278
|
query_params = query_params.add("flush_signal", flush_signal)
|
|
222
|
-
if
|
|
223
|
-
query_params = query_params.add("
|
|
224
|
-
if streaming_ongoing_requests_frame_size is not None:
|
|
225
|
-
query_params = query_params.add(
|
|
226
|
-
"streaming_ongoing_requests_frame_size", streaming_ongoing_requests_frame_size
|
|
227
|
-
)
|
|
279
|
+
if input_audio_codec is not None:
|
|
280
|
+
query_params = query_params.add("input_audio_codec", input_audio_codec)
|
|
228
281
|
ws_url = ws_url + f"?{query_params}"
|
|
229
282
|
headers = self._client_wrapper.get_headers()
|
|
230
283
|
if api_subscription_key is not None:
|
|
@@ -6,7 +6,8 @@ from .speech_to_text_streaming_flush_signal import SpeechToTextStreamingFlushSig
|
|
|
6
6
|
from .speech_to_text_streaming_high_vad_sensitivity import SpeechToTextStreamingHighVadSensitivity
|
|
7
7
|
from .speech_to_text_streaming_input_audio_codec import SpeechToTextStreamingInputAudioCodec
|
|
8
8
|
from .speech_to_text_streaming_language_code import SpeechToTextStreamingLanguageCode
|
|
9
|
-
from .
|
|
9
|
+
from .speech_to_text_streaming_mode import SpeechToTextStreamingMode
|
|
10
|
+
from .speech_to_text_streaming_model import SpeechToTextStreamingModel
|
|
10
11
|
from .speech_to_text_streaming_vad_signals import SpeechToTextStreamingVadSignals
|
|
11
12
|
|
|
12
13
|
__all__ = [
|
|
@@ -14,6 +15,7 @@ __all__ = [
|
|
|
14
15
|
"SpeechToTextStreamingHighVadSensitivity",
|
|
15
16
|
"SpeechToTextStreamingInputAudioCodec",
|
|
16
17
|
"SpeechToTextStreamingLanguageCode",
|
|
17
|
-
"
|
|
18
|
+
"SpeechToTextStreamingMode",
|
|
19
|
+
"SpeechToTextStreamingModel",
|
|
18
20
|
"SpeechToTextStreamingVadSignals",
|
|
19
21
|
]
|
|
@@ -3,31 +3,5 @@
|
|
|
3
3
|
import typing
|
|
4
4
|
|
|
5
5
|
SpeechToTextStreamingInputAudioCodec = typing.Union[
|
|
6
|
-
typing.Literal[
|
|
7
|
-
"wav",
|
|
8
|
-
"x-wav",
|
|
9
|
-
"wave",
|
|
10
|
-
"mp3",
|
|
11
|
-
"mpeg",
|
|
12
|
-
"mpeg3",
|
|
13
|
-
"x-mp3",
|
|
14
|
-
"x-mpeg-3",
|
|
15
|
-
"aac",
|
|
16
|
-
"x-aac",
|
|
17
|
-
"aiff",
|
|
18
|
-
"x-aiff",
|
|
19
|
-
"ogg",
|
|
20
|
-
"opus",
|
|
21
|
-
"flac",
|
|
22
|
-
"x-flac",
|
|
23
|
-
"mp4",
|
|
24
|
-
"x-m4a",
|
|
25
|
-
"amr",
|
|
26
|
-
"x-ms-wma",
|
|
27
|
-
"webm",
|
|
28
|
-
"pcm_s16le",
|
|
29
|
-
"pcm_l16",
|
|
30
|
-
"pcm_raw",
|
|
31
|
-
],
|
|
32
|
-
typing.Any,
|
|
6
|
+
typing.Literal["wav", "pcm_s16le", "pcm_l16", "pcm_raw"], typing.Any
|
|
33
7
|
]
|