sarvamai 0.1.22a4__py3-none-any.whl → 0.1.22a7__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- sarvamai/__init__.py +62 -3
- sarvamai/client.py +3 -0
- sarvamai/core/client_wrapper.py +2 -2
- sarvamai/doc_digitization_job/__init__.py +4 -0
- sarvamai/doc_digitization_job/client.py +776 -0
- sarvamai/doc_digitization_job/job.py +496 -0
- sarvamai/doc_digitization_job/raw_client.py +1176 -0
- sarvamai/requests/__init__.py +20 -0
- sarvamai/requests/audio_data.py +0 -6
- sarvamai/requests/configure_connection.py +4 -0
- sarvamai/requests/configure_connection_data.py +40 -11
- sarvamai/requests/doc_digitization_create_job_response.py +25 -0
- sarvamai/requests/doc_digitization_download_files_response.py +37 -0
- sarvamai/requests/doc_digitization_error_details.py +21 -0
- sarvamai/requests/doc_digitization_error_message.py +11 -0
- sarvamai/requests/doc_digitization_job_detail.py +64 -0
- sarvamai/requests/doc_digitization_job_parameters.py +21 -0
- sarvamai/requests/doc_digitization_job_status_response.py +65 -0
- sarvamai/requests/doc_digitization_page_error.py +24 -0
- sarvamai/requests/doc_digitization_upload_files_response.py +34 -0
- sarvamai/requests/doc_digitization_webhook_callback.py +19 -0
- sarvamai/requests/speech_to_text_job_parameters.py +43 -2
- sarvamai/requests/speech_to_text_translate_job_parameters.py +4 -1
- sarvamai/speech_to_text/client.py +95 -10
- sarvamai/speech_to_text/raw_client.py +95 -10
- sarvamai/speech_to_text_job/client.py +60 -15
- sarvamai/speech_to_text_streaming/__init__.py +4 -0
- sarvamai/speech_to_text_streaming/client.py +102 -18
- sarvamai/speech_to_text_streaming/raw_client.py +102 -18
- sarvamai/speech_to_text_streaming/types/__init__.py +4 -0
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_input_audio_codec.py +1 -27
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_mode.py +7 -0
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_model.py +5 -0
- sarvamai/speech_to_text_translate_streaming/client.py +20 -12
- sarvamai/speech_to_text_translate_streaming/raw_client.py +20 -12
- sarvamai/speech_to_text_translate_streaming/types/speech_to_text_translate_streaming_input_audio_codec.py +1 -27
- sarvamai/text/client.py +0 -12
- sarvamai/text/raw_client.py +0 -12
- sarvamai/text_to_speech/client.py +116 -14
- sarvamai/text_to_speech/raw_client.py +116 -14
- sarvamai/text_to_speech_streaming/__init__.py +2 -2
- sarvamai/text_to_speech_streaming/client.py +19 -6
- sarvamai/text_to_speech_streaming/raw_client.py +19 -6
- sarvamai/text_to_speech_streaming/types/__init__.py +2 -1
- sarvamai/text_to_speech_streaming/types/text_to_speech_streaming_model.py +5 -0
- sarvamai/types/__init__.py +34 -2
- sarvamai/types/audio_data.py +0 -6
- sarvamai/types/configure_connection.py +4 -0
- sarvamai/types/configure_connection_data.py +40 -11
- sarvamai/types/configure_connection_data_model.py +5 -0
- sarvamai/types/configure_connection_data_speaker.py +35 -1
- sarvamai/types/doc_digitization_create_job_response.py +37 -0
- sarvamai/types/doc_digitization_download_files_response.py +47 -0
- sarvamai/types/doc_digitization_error_code.py +15 -0
- sarvamai/types/doc_digitization_error_details.py +33 -0
- sarvamai/types/doc_digitization_error_message.py +23 -0
- sarvamai/types/doc_digitization_job_detail.py +74 -0
- sarvamai/types/doc_digitization_job_detail_state.py +7 -0
- sarvamai/types/doc_digitization_job_parameters.py +33 -0
- sarvamai/types/doc_digitization_job_state.py +7 -0
- sarvamai/types/doc_digitization_job_status_response.py +75 -0
- sarvamai/types/doc_digitization_output_format.py +5 -0
- sarvamai/types/doc_digitization_page_error.py +36 -0
- sarvamai/types/doc_digitization_supported_language.py +32 -0
- sarvamai/types/doc_digitization_upload_files_response.py +44 -0
- sarvamai/types/doc_digitization_webhook_callback.py +31 -0
- sarvamai/types/mode.py +5 -0
- sarvamai/types/speech_to_text_job_parameters.py +43 -2
- sarvamai/types/speech_to_text_model.py +1 -1
- sarvamai/types/speech_to_text_translate_job_parameters.py +4 -1
- sarvamai/types/text_to_speech_model.py +1 -1
- sarvamai/types/text_to_speech_speaker.py +35 -1
- {sarvamai-0.1.22a4.dist-info → sarvamai-0.1.22a7.dist-info}/METADATA +1 -1
- {sarvamai-0.1.22a4.dist-info → sarvamai-0.1.22a7.dist-info}/RECORD +75 -42
- sarvamai/types/audio_data_input_audio_codec.py +0 -33
- {sarvamai-0.1.22a4.dist-info → sarvamai-0.1.22a7.dist-info}/WHEEL +0 -0
|
@@ -15,6 +15,8 @@ from .types.speech_to_text_streaming_flush_signal import SpeechToTextStreamingFl
|
|
|
15
15
|
from .types.speech_to_text_streaming_high_vad_sensitivity import SpeechToTextStreamingHighVadSensitivity
|
|
16
16
|
from .types.speech_to_text_streaming_input_audio_codec import SpeechToTextStreamingInputAudioCodec
|
|
17
17
|
from .types.speech_to_text_streaming_language_code import SpeechToTextStreamingLanguageCode
|
|
18
|
+
from .types.speech_to_text_streaming_mode import SpeechToTextStreamingMode
|
|
19
|
+
from .types.speech_to_text_streaming_model import SpeechToTextStreamingModel
|
|
18
20
|
from .types.speech_to_text_streaming_vad_signals import SpeechToTextStreamingVadSignals
|
|
19
21
|
|
|
20
22
|
try:
|
|
@@ -43,12 +45,13 @@ class SpeechToTextStreamingClient:
|
|
|
43
45
|
self,
|
|
44
46
|
*,
|
|
45
47
|
language_code: SpeechToTextStreamingLanguageCode,
|
|
46
|
-
model: typing.Optional[
|
|
47
|
-
|
|
48
|
+
model: typing.Optional[SpeechToTextStreamingModel] = None,
|
|
49
|
+
mode: typing.Optional[SpeechToTextStreamingMode] = None,
|
|
48
50
|
sample_rate: typing.Optional[str] = None,
|
|
49
51
|
high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
|
|
50
52
|
vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
|
|
51
53
|
flush_signal: typing.Optional[SpeechToTextStreamingFlushSignal] = None,
|
|
54
|
+
input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
|
|
52
55
|
api_subscription_key: typing.Optional[str] = None,
|
|
53
56
|
request_options: typing.Optional[RequestOptions] = None,
|
|
54
57
|
) -> typing.Iterator[SpeechToTextStreamingSocketClient]:
|
|
@@ -62,13 +65,47 @@ class SpeechToTextStreamingClient:
|
|
|
62
65
|
Parameters
|
|
63
66
|
----------
|
|
64
67
|
language_code : SpeechToTextStreamingLanguageCode
|
|
65
|
-
|
|
68
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
66
69
|
|
|
67
|
-
|
|
68
|
-
|
|
70
|
+
**Available Options:**
|
|
71
|
+
- `hi-IN`: Hindi
|
|
72
|
+
- `bn-IN`: Bengali
|
|
73
|
+
- `gu-IN`: Gujarati
|
|
74
|
+
- `kn-IN`: Kannada
|
|
75
|
+
- `ml-IN`: Malayalam
|
|
76
|
+
- `mr-IN`: Marathi
|
|
77
|
+
- `od-IN`: Odia
|
|
78
|
+
- `pa-IN`: Punjabi
|
|
79
|
+
- `ta-IN`: Tamil
|
|
80
|
+
- `te-IN`: Telugu
|
|
81
|
+
- `en-IN`: English
|
|
69
82
|
|
|
70
|
-
|
|
71
|
-
|
|
83
|
+
model : typing.Optional[SpeechToTextStreamingModel]
|
|
84
|
+
Specifies the model to use for speech-to-text conversion.
|
|
85
|
+
|
|
86
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
87
|
+
|
|
88
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
89
|
+
|
|
90
|
+
mode : typing.Optional[SpeechToTextStreamingMode]
|
|
91
|
+
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
92
|
+
|
|
93
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
94
|
+
|
|
95
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
96
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
97
|
+
|
|
98
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
99
|
+
- Output: `My phone number is 9840950950`
|
|
100
|
+
|
|
101
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
102
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
103
|
+
|
|
104
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
105
|
+
- Output: `mera phone number hai 9840950950`
|
|
106
|
+
|
|
107
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
108
|
+
- Output: `मेरा phone number है 9840950950`
|
|
72
109
|
|
|
73
110
|
sample_rate : typing.Optional[str]
|
|
74
111
|
Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
|
|
@@ -82,6 +119,10 @@ class SpeechToTextStreamingClient:
|
|
|
82
119
|
flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
|
|
83
120
|
Signal to flush the audio buffer and finalize transcription
|
|
84
121
|
|
|
122
|
+
input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
|
|
123
|
+
Audio codec/format of the input stream. Use this when sending raw PCM audio.
|
|
124
|
+
Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
|
|
125
|
+
|
|
85
126
|
api_subscription_key : typing.Optional[str]
|
|
86
127
|
API subscription key for authentication
|
|
87
128
|
|
|
@@ -98,8 +139,8 @@ class SpeechToTextStreamingClient:
|
|
|
98
139
|
query_params = query_params.add("language-code", language_code)
|
|
99
140
|
if model is not None:
|
|
100
141
|
query_params = query_params.add("model", model)
|
|
101
|
-
if
|
|
102
|
-
query_params = query_params.add("
|
|
142
|
+
if mode is not None:
|
|
143
|
+
query_params = query_params.add("mode", mode)
|
|
103
144
|
if sample_rate is not None:
|
|
104
145
|
query_params = query_params.add("sample_rate", sample_rate)
|
|
105
146
|
if high_vad_sensitivity is not None:
|
|
@@ -108,6 +149,8 @@ class SpeechToTextStreamingClient:
|
|
|
108
149
|
query_params = query_params.add("vad_signals", vad_signals)
|
|
109
150
|
if flush_signal is not None:
|
|
110
151
|
query_params = query_params.add("flush_signal", flush_signal)
|
|
152
|
+
if input_audio_codec is not None:
|
|
153
|
+
query_params = query_params.add("input_audio_codec", input_audio_codec)
|
|
111
154
|
ws_url = ws_url + f"?{query_params}"
|
|
112
155
|
headers = self._raw_client._client_wrapper.get_headers()
|
|
113
156
|
if api_subscription_key is not None:
|
|
@@ -152,12 +195,13 @@ class AsyncSpeechToTextStreamingClient:
|
|
|
152
195
|
self,
|
|
153
196
|
*,
|
|
154
197
|
language_code: SpeechToTextStreamingLanguageCode,
|
|
155
|
-
model: typing.Optional[
|
|
156
|
-
|
|
198
|
+
model: typing.Optional[SpeechToTextStreamingModel] = None,
|
|
199
|
+
mode: typing.Optional[SpeechToTextStreamingMode] = None,
|
|
157
200
|
sample_rate: typing.Optional[str] = None,
|
|
158
201
|
high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
|
|
159
202
|
vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
|
|
160
203
|
flush_signal: typing.Optional[SpeechToTextStreamingFlushSignal] = None,
|
|
204
|
+
input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
|
|
161
205
|
api_subscription_key: typing.Optional[str] = None,
|
|
162
206
|
request_options: typing.Optional[RequestOptions] = None,
|
|
163
207
|
) -> typing.AsyncIterator[AsyncSpeechToTextStreamingSocketClient]:
|
|
@@ -171,13 +215,47 @@ class AsyncSpeechToTextStreamingClient:
|
|
|
171
215
|
Parameters
|
|
172
216
|
----------
|
|
173
217
|
language_code : SpeechToTextStreamingLanguageCode
|
|
174
|
-
|
|
218
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
175
219
|
|
|
176
|
-
|
|
177
|
-
|
|
220
|
+
**Available Options:**
|
|
221
|
+
- `hi-IN`: Hindi
|
|
222
|
+
- `bn-IN`: Bengali
|
|
223
|
+
- `gu-IN`: Gujarati
|
|
224
|
+
- `kn-IN`: Kannada
|
|
225
|
+
- `ml-IN`: Malayalam
|
|
226
|
+
- `mr-IN`: Marathi
|
|
227
|
+
- `od-IN`: Odia
|
|
228
|
+
- `pa-IN`: Punjabi
|
|
229
|
+
- `ta-IN`: Tamil
|
|
230
|
+
- `te-IN`: Telugu
|
|
231
|
+
- `en-IN`: English
|
|
178
232
|
|
|
179
|
-
|
|
180
|
-
|
|
233
|
+
model : typing.Optional[SpeechToTextStreamingModel]
|
|
234
|
+
Specifies the model to use for speech-to-text conversion.
|
|
235
|
+
|
|
236
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
237
|
+
|
|
238
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
239
|
+
|
|
240
|
+
mode : typing.Optional[SpeechToTextStreamingMode]
|
|
241
|
+
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
242
|
+
|
|
243
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
244
|
+
|
|
245
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
246
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
247
|
+
|
|
248
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
249
|
+
- Output: `My phone number is 9840950950`
|
|
250
|
+
|
|
251
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
252
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
253
|
+
|
|
254
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
255
|
+
- Output: `mera phone number hai 9840950950`
|
|
256
|
+
|
|
257
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
258
|
+
- Output: `मेरा phone number है 9840950950`
|
|
181
259
|
|
|
182
260
|
sample_rate : typing.Optional[str]
|
|
183
261
|
Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
|
|
@@ -191,6 +269,10 @@ class AsyncSpeechToTextStreamingClient:
|
|
|
191
269
|
flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
|
|
192
270
|
Signal to flush the audio buffer and finalize transcription
|
|
193
271
|
|
|
272
|
+
input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
|
|
273
|
+
Audio codec/format of the input stream. Use this when sending raw PCM audio.
|
|
274
|
+
Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
|
|
275
|
+
|
|
194
276
|
api_subscription_key : typing.Optional[str]
|
|
195
277
|
API subscription key for authentication
|
|
196
278
|
|
|
@@ -207,8 +289,8 @@ class AsyncSpeechToTextStreamingClient:
|
|
|
207
289
|
query_params = query_params.add("language-code", language_code)
|
|
208
290
|
if model is not None:
|
|
209
291
|
query_params = query_params.add("model", model)
|
|
210
|
-
if
|
|
211
|
-
query_params = query_params.add("
|
|
292
|
+
if mode is not None:
|
|
293
|
+
query_params = query_params.add("mode", mode)
|
|
212
294
|
if sample_rate is not None:
|
|
213
295
|
query_params = query_params.add("sample_rate", sample_rate)
|
|
214
296
|
if high_vad_sensitivity is not None:
|
|
@@ -217,6 +299,8 @@ class AsyncSpeechToTextStreamingClient:
|
|
|
217
299
|
query_params = query_params.add("vad_signals", vad_signals)
|
|
218
300
|
if flush_signal is not None:
|
|
219
301
|
query_params = query_params.add("flush_signal", flush_signal)
|
|
302
|
+
if input_audio_codec is not None:
|
|
303
|
+
query_params = query_params.add("input_audio_codec", input_audio_codec)
|
|
220
304
|
ws_url = ws_url + f"?{query_params}"
|
|
221
305
|
headers = self._raw_client._client_wrapper.get_headers()
|
|
222
306
|
if api_subscription_key is not None:
|
|
@@ -14,6 +14,8 @@ from .types.speech_to_text_streaming_flush_signal import SpeechToTextStreamingFl
|
|
|
14
14
|
from .types.speech_to_text_streaming_high_vad_sensitivity import SpeechToTextStreamingHighVadSensitivity
|
|
15
15
|
from .types.speech_to_text_streaming_input_audio_codec import SpeechToTextStreamingInputAudioCodec
|
|
16
16
|
from .types.speech_to_text_streaming_language_code import SpeechToTextStreamingLanguageCode
|
|
17
|
+
from .types.speech_to_text_streaming_mode import SpeechToTextStreamingMode
|
|
18
|
+
from .types.speech_to_text_streaming_model import SpeechToTextStreamingModel
|
|
17
19
|
from .types.speech_to_text_streaming_vad_signals import SpeechToTextStreamingVadSignals
|
|
18
20
|
|
|
19
21
|
try:
|
|
@@ -31,12 +33,13 @@ class RawSpeechToTextStreamingClient:
|
|
|
31
33
|
self,
|
|
32
34
|
*,
|
|
33
35
|
language_code: SpeechToTextStreamingLanguageCode,
|
|
34
|
-
model: typing.Optional[
|
|
35
|
-
|
|
36
|
+
model: typing.Optional[SpeechToTextStreamingModel] = None,
|
|
37
|
+
mode: typing.Optional[SpeechToTextStreamingMode] = None,
|
|
36
38
|
sample_rate: typing.Optional[str] = None,
|
|
37
39
|
high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
|
|
38
40
|
vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
|
|
39
41
|
flush_signal: typing.Optional[SpeechToTextStreamingFlushSignal] = None,
|
|
42
|
+
input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
|
|
40
43
|
api_subscription_key: typing.Optional[str] = None,
|
|
41
44
|
request_options: typing.Optional[RequestOptions] = None,
|
|
42
45
|
) -> typing.Iterator[SpeechToTextStreamingSocketClient]:
|
|
@@ -50,13 +53,47 @@ class RawSpeechToTextStreamingClient:
|
|
|
50
53
|
Parameters
|
|
51
54
|
----------
|
|
52
55
|
language_code : SpeechToTextStreamingLanguageCode
|
|
53
|
-
|
|
56
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
54
57
|
|
|
55
|
-
|
|
56
|
-
|
|
58
|
+
**Available Options:**
|
|
59
|
+
- `hi-IN`: Hindi
|
|
60
|
+
- `bn-IN`: Bengali
|
|
61
|
+
- `gu-IN`: Gujarati
|
|
62
|
+
- `kn-IN`: Kannada
|
|
63
|
+
- `ml-IN`: Malayalam
|
|
64
|
+
- `mr-IN`: Marathi
|
|
65
|
+
- `od-IN`: Odia
|
|
66
|
+
- `pa-IN`: Punjabi
|
|
67
|
+
- `ta-IN`: Tamil
|
|
68
|
+
- `te-IN`: Telugu
|
|
69
|
+
- `en-IN`: English
|
|
57
70
|
|
|
58
|
-
|
|
59
|
-
|
|
71
|
+
model : typing.Optional[SpeechToTextStreamingModel]
|
|
72
|
+
Specifies the model to use for speech-to-text conversion.
|
|
73
|
+
|
|
74
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
75
|
+
|
|
76
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
77
|
+
|
|
78
|
+
mode : typing.Optional[SpeechToTextStreamingMode]
|
|
79
|
+
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
80
|
+
|
|
81
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
82
|
+
|
|
83
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
84
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
85
|
+
|
|
86
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
87
|
+
- Output: `My phone number is 9840950950`
|
|
88
|
+
|
|
89
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
90
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
91
|
+
|
|
92
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
93
|
+
- Output: `mera phone number hai 9840950950`
|
|
94
|
+
|
|
95
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
96
|
+
- Output: `मेरा phone number है 9840950950`
|
|
60
97
|
|
|
61
98
|
sample_rate : typing.Optional[str]
|
|
62
99
|
Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
|
|
@@ -70,6 +107,10 @@ class RawSpeechToTextStreamingClient:
|
|
|
70
107
|
flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
|
|
71
108
|
Signal to flush the audio buffer and finalize transcription
|
|
72
109
|
|
|
110
|
+
input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
|
|
111
|
+
Audio codec/format of the input stream. Use this when sending raw PCM audio.
|
|
112
|
+
Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
|
|
113
|
+
|
|
73
114
|
api_subscription_key : typing.Optional[str]
|
|
74
115
|
API subscription key for authentication
|
|
75
116
|
|
|
@@ -86,8 +127,8 @@ class RawSpeechToTextStreamingClient:
|
|
|
86
127
|
query_params = query_params.add("language-code", language_code)
|
|
87
128
|
if model is not None:
|
|
88
129
|
query_params = query_params.add("model", model)
|
|
89
|
-
if
|
|
90
|
-
query_params = query_params.add("
|
|
130
|
+
if mode is not None:
|
|
131
|
+
query_params = query_params.add("mode", mode)
|
|
91
132
|
if sample_rate is not None:
|
|
92
133
|
query_params = query_params.add("sample_rate", sample_rate)
|
|
93
134
|
if high_vad_sensitivity is not None:
|
|
@@ -96,6 +137,8 @@ class RawSpeechToTextStreamingClient:
|
|
|
96
137
|
query_params = query_params.add("vad_signals", vad_signals)
|
|
97
138
|
if flush_signal is not None:
|
|
98
139
|
query_params = query_params.add("flush_signal", flush_signal)
|
|
140
|
+
if input_audio_codec is not None:
|
|
141
|
+
query_params = query_params.add("input_audio_codec", input_audio_codec)
|
|
99
142
|
ws_url = ws_url + f"?{query_params}"
|
|
100
143
|
headers = self._client_wrapper.get_headers()
|
|
101
144
|
if api_subscription_key is not None:
|
|
@@ -129,12 +172,13 @@ class AsyncRawSpeechToTextStreamingClient:
|
|
|
129
172
|
self,
|
|
130
173
|
*,
|
|
131
174
|
language_code: SpeechToTextStreamingLanguageCode,
|
|
132
|
-
model: typing.Optional[
|
|
133
|
-
|
|
175
|
+
model: typing.Optional[SpeechToTextStreamingModel] = None,
|
|
176
|
+
mode: typing.Optional[SpeechToTextStreamingMode] = None,
|
|
134
177
|
sample_rate: typing.Optional[str] = None,
|
|
135
178
|
high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
|
|
136
179
|
vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
|
|
137
180
|
flush_signal: typing.Optional[SpeechToTextStreamingFlushSignal] = None,
|
|
181
|
+
input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
|
|
138
182
|
api_subscription_key: typing.Optional[str] = None,
|
|
139
183
|
request_options: typing.Optional[RequestOptions] = None,
|
|
140
184
|
) -> typing.AsyncIterator[AsyncSpeechToTextStreamingSocketClient]:
|
|
@@ -148,13 +192,47 @@ class AsyncRawSpeechToTextStreamingClient:
|
|
|
148
192
|
Parameters
|
|
149
193
|
----------
|
|
150
194
|
language_code : SpeechToTextStreamingLanguageCode
|
|
151
|
-
|
|
195
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
152
196
|
|
|
153
|
-
|
|
154
|
-
|
|
197
|
+
**Available Options:**
|
|
198
|
+
- `hi-IN`: Hindi
|
|
199
|
+
- `bn-IN`: Bengali
|
|
200
|
+
- `gu-IN`: Gujarati
|
|
201
|
+
- `kn-IN`: Kannada
|
|
202
|
+
- `ml-IN`: Malayalam
|
|
203
|
+
- `mr-IN`: Marathi
|
|
204
|
+
- `od-IN`: Odia
|
|
205
|
+
- `pa-IN`: Punjabi
|
|
206
|
+
- `ta-IN`: Tamil
|
|
207
|
+
- `te-IN`: Telugu
|
|
208
|
+
- `en-IN`: English
|
|
155
209
|
|
|
156
|
-
|
|
157
|
-
|
|
210
|
+
model : typing.Optional[SpeechToTextStreamingModel]
|
|
211
|
+
Specifies the model to use for speech-to-text conversion.
|
|
212
|
+
|
|
213
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
214
|
+
|
|
215
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
216
|
+
|
|
217
|
+
mode : typing.Optional[SpeechToTextStreamingMode]
|
|
218
|
+
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
219
|
+
|
|
220
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
221
|
+
|
|
222
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
223
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
224
|
+
|
|
225
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
226
|
+
- Output: `My phone number is 9840950950`
|
|
227
|
+
|
|
228
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
229
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
230
|
+
|
|
231
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
232
|
+
- Output: `mera phone number hai 9840950950`
|
|
233
|
+
|
|
234
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
235
|
+
- Output: `मेरा phone number है 9840950950`
|
|
158
236
|
|
|
159
237
|
sample_rate : typing.Optional[str]
|
|
160
238
|
Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
|
|
@@ -168,6 +246,10 @@ class AsyncRawSpeechToTextStreamingClient:
|
|
|
168
246
|
flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
|
|
169
247
|
Signal to flush the audio buffer and finalize transcription
|
|
170
248
|
|
|
249
|
+
input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
|
|
250
|
+
Audio codec/format of the input stream. Use this when sending raw PCM audio.
|
|
251
|
+
Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
|
|
252
|
+
|
|
171
253
|
api_subscription_key : typing.Optional[str]
|
|
172
254
|
API subscription key for authentication
|
|
173
255
|
|
|
@@ -184,8 +266,8 @@ class AsyncRawSpeechToTextStreamingClient:
|
|
|
184
266
|
query_params = query_params.add("language-code", language_code)
|
|
185
267
|
if model is not None:
|
|
186
268
|
query_params = query_params.add("model", model)
|
|
187
|
-
if
|
|
188
|
-
query_params = query_params.add("
|
|
269
|
+
if mode is not None:
|
|
270
|
+
query_params = query_params.add("mode", mode)
|
|
189
271
|
if sample_rate is not None:
|
|
190
272
|
query_params = query_params.add("sample_rate", sample_rate)
|
|
191
273
|
if high_vad_sensitivity is not None:
|
|
@@ -194,6 +276,8 @@ class AsyncRawSpeechToTextStreamingClient:
|
|
|
194
276
|
query_params = query_params.add("vad_signals", vad_signals)
|
|
195
277
|
if flush_signal is not None:
|
|
196
278
|
query_params = query_params.add("flush_signal", flush_signal)
|
|
279
|
+
if input_audio_codec is not None:
|
|
280
|
+
query_params = query_params.add("input_audio_codec", input_audio_codec)
|
|
197
281
|
ws_url = ws_url + f"?{query_params}"
|
|
198
282
|
headers = self._client_wrapper.get_headers()
|
|
199
283
|
if api_subscription_key is not None:
|
|
@@ -6,6 +6,8 @@ from .speech_to_text_streaming_flush_signal import SpeechToTextStreamingFlushSig
|
|
|
6
6
|
from .speech_to_text_streaming_high_vad_sensitivity import SpeechToTextStreamingHighVadSensitivity
|
|
7
7
|
from .speech_to_text_streaming_input_audio_codec import SpeechToTextStreamingInputAudioCodec
|
|
8
8
|
from .speech_to_text_streaming_language_code import SpeechToTextStreamingLanguageCode
|
|
9
|
+
from .speech_to_text_streaming_mode import SpeechToTextStreamingMode
|
|
10
|
+
from .speech_to_text_streaming_model import SpeechToTextStreamingModel
|
|
9
11
|
from .speech_to_text_streaming_vad_signals import SpeechToTextStreamingVadSignals
|
|
10
12
|
|
|
11
13
|
__all__ = [
|
|
@@ -13,5 +15,7 @@ __all__ = [
|
|
|
13
15
|
"SpeechToTextStreamingHighVadSensitivity",
|
|
14
16
|
"SpeechToTextStreamingInputAudioCodec",
|
|
15
17
|
"SpeechToTextStreamingLanguageCode",
|
|
18
|
+
"SpeechToTextStreamingMode",
|
|
19
|
+
"SpeechToTextStreamingModel",
|
|
16
20
|
"SpeechToTextStreamingVadSignals",
|
|
17
21
|
]
|
|
@@ -3,31 +3,5 @@
|
|
|
3
3
|
import typing
|
|
4
4
|
|
|
5
5
|
SpeechToTextStreamingInputAudioCodec = typing.Union[
|
|
6
|
-
typing.Literal[
|
|
7
|
-
"wav",
|
|
8
|
-
"x-wav",
|
|
9
|
-
"wave",
|
|
10
|
-
"mp3",
|
|
11
|
-
"mpeg",
|
|
12
|
-
"mpeg3",
|
|
13
|
-
"x-mp3",
|
|
14
|
-
"x-mpeg-3",
|
|
15
|
-
"aac",
|
|
16
|
-
"x-aac",
|
|
17
|
-
"aiff",
|
|
18
|
-
"x-aiff",
|
|
19
|
-
"ogg",
|
|
20
|
-
"opus",
|
|
21
|
-
"flac",
|
|
22
|
-
"x-flac",
|
|
23
|
-
"mp4",
|
|
24
|
-
"x-m4a",
|
|
25
|
-
"amr",
|
|
26
|
-
"x-ms-wma",
|
|
27
|
-
"webm",
|
|
28
|
-
"pcm_s16le",
|
|
29
|
-
"pcm_l16",
|
|
30
|
-
"pcm_raw",
|
|
31
|
-
],
|
|
32
|
-
typing.Any,
|
|
6
|
+
typing.Literal["wav", "pcm_s16le", "pcm_l16", "pcm_raw"], typing.Any
|
|
33
7
|
]
|
|
@@ -44,11 +44,11 @@ class SpeechToTextTranslateStreamingClient:
|
|
|
44
44
|
self,
|
|
45
45
|
*,
|
|
46
46
|
model: typing.Optional[typing.Literal["saaras:v2.5"]] = None,
|
|
47
|
-
input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
|
|
48
47
|
sample_rate: typing.Optional[str] = None,
|
|
49
48
|
high_vad_sensitivity: typing.Optional[SpeechToTextTranslateStreamingHighVadSensitivity] = None,
|
|
50
49
|
vad_signals: typing.Optional[SpeechToTextTranslateStreamingVadSignals] = None,
|
|
51
50
|
flush_signal: typing.Optional[SpeechToTextTranslateStreamingFlushSignal] = None,
|
|
51
|
+
input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
|
|
52
52
|
api_subscription_key: typing.Optional[str] = None,
|
|
53
53
|
request_options: typing.Optional[RequestOptions] = None,
|
|
54
54
|
) -> typing.Iterator[SpeechToTextTranslateStreamingSocketClient]:
|
|
@@ -62,10 +62,10 @@ class SpeechToTextTranslateStreamingClient:
|
|
|
62
62
|
Parameters
|
|
63
63
|
----------
|
|
64
64
|
model : typing.Optional[typing.Literal["saaras:v2.5"]]
|
|
65
|
-
|
|
65
|
+
Model to be used for speech to text translation.
|
|
66
66
|
|
|
67
|
-
|
|
68
|
-
|
|
67
|
+
- **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
|
|
68
|
+
- Example: Hindi audio → English text output
|
|
69
69
|
|
|
70
70
|
sample_rate : typing.Optional[str]
|
|
71
71
|
Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
|
|
@@ -79,6 +79,10 @@ class SpeechToTextTranslateStreamingClient:
|
|
|
79
79
|
flush_signal : typing.Optional[SpeechToTextTranslateStreamingFlushSignal]
|
|
80
80
|
Signal to flush the audio buffer and finalize transcription and translation
|
|
81
81
|
|
|
82
|
+
input_audio_codec : typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec]
|
|
83
|
+
Audio codec/format of the input stream. Use this when sending raw PCM audio.
|
|
84
|
+
Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
|
|
85
|
+
|
|
82
86
|
api_subscription_key : typing.Optional[str]
|
|
83
87
|
API subscription key for authentication
|
|
84
88
|
|
|
@@ -93,8 +97,6 @@ class SpeechToTextTranslateStreamingClient:
|
|
|
93
97
|
query_params = httpx.QueryParams()
|
|
94
98
|
if model is not None:
|
|
95
99
|
query_params = query_params.add("model", model)
|
|
96
|
-
if input_audio_codec is not None:
|
|
97
|
-
query_params = query_params.add("input_audio_codec", input_audio_codec)
|
|
98
100
|
if sample_rate is not None:
|
|
99
101
|
query_params = query_params.add("sample_rate", sample_rate)
|
|
100
102
|
if high_vad_sensitivity is not None:
|
|
@@ -103,6 +105,8 @@ class SpeechToTextTranslateStreamingClient:
|
|
|
103
105
|
query_params = query_params.add("vad_signals", vad_signals)
|
|
104
106
|
if flush_signal is not None:
|
|
105
107
|
query_params = query_params.add("flush_signal", flush_signal)
|
|
108
|
+
if input_audio_codec is not None:
|
|
109
|
+
query_params = query_params.add("input_audio_codec", input_audio_codec)
|
|
106
110
|
ws_url = ws_url + f"?{query_params}"
|
|
107
111
|
headers = self._raw_client._client_wrapper.get_headers()
|
|
108
112
|
if api_subscription_key is not None:
|
|
@@ -147,11 +151,11 @@ class AsyncSpeechToTextTranslateStreamingClient:
|
|
|
147
151
|
self,
|
|
148
152
|
*,
|
|
149
153
|
model: typing.Optional[typing.Literal["saaras:v2.5"]] = None,
|
|
150
|
-
input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
|
|
151
154
|
sample_rate: typing.Optional[str] = None,
|
|
152
155
|
high_vad_sensitivity: typing.Optional[SpeechToTextTranslateStreamingHighVadSensitivity] = None,
|
|
153
156
|
vad_signals: typing.Optional[SpeechToTextTranslateStreamingVadSignals] = None,
|
|
154
157
|
flush_signal: typing.Optional[SpeechToTextTranslateStreamingFlushSignal] = None,
|
|
158
|
+
input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
|
|
155
159
|
api_subscription_key: typing.Optional[str] = None,
|
|
156
160
|
request_options: typing.Optional[RequestOptions] = None,
|
|
157
161
|
) -> typing.AsyncIterator[AsyncSpeechToTextTranslateStreamingSocketClient]:
|
|
@@ -165,10 +169,10 @@ class AsyncSpeechToTextTranslateStreamingClient:
|
|
|
165
169
|
Parameters
|
|
166
170
|
----------
|
|
167
171
|
model : typing.Optional[typing.Literal["saaras:v2.5"]]
|
|
168
|
-
|
|
172
|
+
Model to be used for speech to text translation.
|
|
169
173
|
|
|
170
|
-
|
|
171
|
-
|
|
174
|
+
- **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
|
|
175
|
+
- Example: Hindi audio → English text output
|
|
172
176
|
|
|
173
177
|
sample_rate : typing.Optional[str]
|
|
174
178
|
Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
|
|
@@ -182,6 +186,10 @@ class AsyncSpeechToTextTranslateStreamingClient:
|
|
|
182
186
|
flush_signal : typing.Optional[SpeechToTextTranslateStreamingFlushSignal]
|
|
183
187
|
Signal to flush the audio buffer and finalize transcription and translation
|
|
184
188
|
|
|
189
|
+
input_audio_codec : typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec]
|
|
190
|
+
Audio codec/format of the input stream. Use this when sending raw PCM audio.
|
|
191
|
+
Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
|
|
192
|
+
|
|
185
193
|
api_subscription_key : typing.Optional[str]
|
|
186
194
|
API subscription key for authentication
|
|
187
195
|
|
|
@@ -196,8 +204,6 @@ class AsyncSpeechToTextTranslateStreamingClient:
|
|
|
196
204
|
query_params = httpx.QueryParams()
|
|
197
205
|
if model is not None:
|
|
198
206
|
query_params = query_params.add("model", model)
|
|
199
|
-
if input_audio_codec is not None:
|
|
200
|
-
query_params = query_params.add("input_audio_codec", input_audio_codec)
|
|
201
207
|
if sample_rate is not None:
|
|
202
208
|
query_params = query_params.add("sample_rate", sample_rate)
|
|
203
209
|
if high_vad_sensitivity is not None:
|
|
@@ -206,6 +212,8 @@ class AsyncSpeechToTextTranslateStreamingClient:
|
|
|
206
212
|
query_params = query_params.add("vad_signals", vad_signals)
|
|
207
213
|
if flush_signal is not None:
|
|
208
214
|
query_params = query_params.add("flush_signal", flush_signal)
|
|
215
|
+
if input_audio_codec is not None:
|
|
216
|
+
query_params = query_params.add("input_audio_codec", input_audio_codec)
|
|
209
217
|
ws_url = ws_url + f"?{query_params}"
|
|
210
218
|
headers = self._raw_client._client_wrapper.get_headers()
|
|
211
219
|
if api_subscription_key is not None:
|