sarvamai-0.1.23a4-py3-none-any.whl → sarvamai-0.1.23a6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sarvamai/__init__.py +4 -0
- sarvamai/core/client_wrapper.py +2 -2
- sarvamai/requests/speech_to_text_job_parameters.py +37 -5
- sarvamai/requests/speech_to_text_response.py +6 -14
- sarvamai/requests/speech_to_text_transcription_data.py +14 -0
- sarvamai/requests/speech_to_text_translate_job_parameters.py +4 -1
- sarvamai/requests/speech_to_text_translate_response.py +6 -9
- sarvamai/requests/speech_to_text_translate_transcription_data.py +13 -0
- sarvamai/speech_to_text/client.py +84 -26
- sarvamai/speech_to_text/raw_client.py +84 -26
- sarvamai/speech_to_text_streaming/__init__.py +2 -0
- sarvamai/speech_to_text_streaming/client.py +117 -18
- sarvamai/speech_to_text_streaming/raw_client.py +117 -18
- sarvamai/speech_to_text_streaming/types/__init__.py +2 -0
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_input_audio_codec.py +7 -0
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_language_code.py +25 -1
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_mode.py +1 -1
- sarvamai/speech_to_text_translate_streaming/__init__.py +2 -0
- sarvamai/speech_to_text_translate_streaming/client.py +23 -2
- sarvamai/speech_to_text_translate_streaming/raw_client.py +23 -2
- sarvamai/speech_to_text_translate_streaming/types/__init__.py +2 -0
- sarvamai/speech_to_text_translate_streaming/types/speech_to_text_translate_streaming_input_audio_codec.py +7 -0
- sarvamai/types/mode.py +1 -3
- sarvamai/types/speech_to_text_job_parameters.py +37 -5
- sarvamai/types/speech_to_text_language.py +24 -1
- sarvamai/types/speech_to_text_model.py +1 -3
- sarvamai/types/speech_to_text_response.py +6 -14
- sarvamai/types/speech_to_text_transcription_data.py +14 -0
- sarvamai/types/speech_to_text_translate_job_parameters.py +4 -1
- sarvamai/types/speech_to_text_translate_language.py +25 -1
- sarvamai/types/speech_to_text_translate_model.py +1 -1
- sarvamai/types/speech_to_text_translate_response.py +6 -9
- sarvamai/types/speech_to_text_translate_transcription_data.py +13 -0
- {sarvamai-0.1.23a4.dist-info → sarvamai-0.1.23a6.dist-info}/METADATA +1 -1
- {sarvamai-0.1.23a4.dist-info → sarvamai-0.1.23a6.dist-info}/RECORD +36 -34
- {sarvamai-0.1.23a4.dist-info → sarvamai-0.1.23a6.dist-info}/WHEEL +0 -0
|
@@ -13,6 +13,7 @@ from .raw_client import AsyncRawSpeechToTextStreamingClient, RawSpeechToTextStre
|
|
|
13
13
|
from .socket_client import AsyncSpeechToTextStreamingSocketClient, SpeechToTextStreamingSocketClient
|
|
14
14
|
from .types.speech_to_text_streaming_flush_signal import SpeechToTextStreamingFlushSignal
|
|
15
15
|
from .types.speech_to_text_streaming_high_vad_sensitivity import SpeechToTextStreamingHighVadSensitivity
|
|
16
|
+
from .types.speech_to_text_streaming_input_audio_codec import SpeechToTextStreamingInputAudioCodec
|
|
16
17
|
from .types.speech_to_text_streaming_language_code import SpeechToTextStreamingLanguageCode
|
|
17
18
|
from .types.speech_to_text_streaming_mode import SpeechToTextStreamingMode
|
|
18
19
|
from .types.speech_to_text_streaming_model import SpeechToTextStreamingModel
|
|
@@ -50,6 +51,7 @@ class SpeechToTextStreamingClient:
|
|
|
50
51
|
high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
|
|
51
52
|
vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
|
|
52
53
|
flush_signal: typing.Optional[SpeechToTextStreamingFlushSignal] = None,
|
|
54
|
+
input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
|
|
53
55
|
api_subscription_key: typing.Optional[str] = None,
|
|
54
56
|
request_options: typing.Optional[RequestOptions] = None,
|
|
55
57
|
) -> typing.Iterator[SpeechToTextStreamingSocketClient]:
|
|
@@ -63,19 +65,61 @@ class SpeechToTextStreamingClient:
|
|
|
63
65
|
Parameters
|
|
64
66
|
----------
|
|
65
67
|
language_code : SpeechToTextStreamingLanguageCode
|
|
66
|
-
|
|
68
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
69
|
+
|
|
70
|
+
**Available Options (saarika:v2.5):**
|
|
71
|
+
- `hi-IN`: Hindi
|
|
72
|
+
- `bn-IN`: Bengali
|
|
73
|
+
- `gu-IN`: Gujarati
|
|
74
|
+
- `kn-IN`: Kannada
|
|
75
|
+
- `ml-IN`: Malayalam
|
|
76
|
+
- `mr-IN`: Marathi
|
|
77
|
+
- `od-IN`: Odia
|
|
78
|
+
- `pa-IN`: Punjabi
|
|
79
|
+
- `ta-IN`: Tamil
|
|
80
|
+
- `te-IN`: Telugu
|
|
81
|
+
- `en-IN`: English
|
|
82
|
+
|
|
83
|
+
**Additional Options (saaras:v3 only):**
|
|
84
|
+
- `as-IN`: Assamese
|
|
85
|
+
- `ur-IN`: Urdu
|
|
86
|
+
- `ne-IN`: Nepali
|
|
87
|
+
- `kok-IN`: Konkani
|
|
88
|
+
- `ks-IN`: Kashmiri
|
|
89
|
+
- `sd-IN`: Sindhi
|
|
90
|
+
- `sa-IN`: Sanskrit
|
|
91
|
+
- `sat-IN`: Santali
|
|
92
|
+
- `mni-IN`: Manipuri
|
|
93
|
+
- `brx-IN`: Bodo
|
|
94
|
+
- `mai-IN`: Maithili
|
|
95
|
+
- `doi-IN`: Dogri
|
|
67
96
|
|
|
68
97
|
model : typing.Optional[SpeechToTextStreamingModel]
|
|
69
|
-
|
|
98
|
+
Specifies the model to use for speech-to-text conversion.
|
|
99
|
+
|
|
100
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
101
|
+
|
|
102
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
70
103
|
|
|
71
104
|
mode : typing.Optional[SpeechToTextStreamingMode]
|
|
72
|
-
Mode of operation
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
-
|
|
77
|
-
|
|
78
|
-
|
|
105
|
+
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
106
|
+
|
|
107
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
108
|
+
|
|
109
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
110
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
111
|
+
|
|
112
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
113
|
+
- Output: `My phone number is 9840950950`
|
|
114
|
+
|
|
115
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
116
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
117
|
+
|
|
118
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
119
|
+
- Output: `mera phone number hai 9840950950`
|
|
120
|
+
|
|
121
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
122
|
+
- Output: `मेरा phone number है 9840950950`
|
|
79
123
|
|
|
80
124
|
sample_rate : typing.Optional[str]
|
|
81
125
|
Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
|
|
@@ -89,6 +133,10 @@ class SpeechToTextStreamingClient:
|
|
|
89
133
|
flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
|
|
90
134
|
Signal to flush the audio buffer and finalize transcription
|
|
91
135
|
|
|
136
|
+
input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
|
|
137
|
+
Audio codec/format of the input stream. Use this when sending raw PCM audio.
|
|
138
|
+
Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
|
|
139
|
+
|
|
92
140
|
api_subscription_key : typing.Optional[str]
|
|
93
141
|
API subscription key for authentication
|
|
94
142
|
|
|
@@ -115,6 +163,8 @@ class SpeechToTextStreamingClient:
|
|
|
115
163
|
query_params = query_params.add("vad_signals", vad_signals)
|
|
116
164
|
if flush_signal is not None:
|
|
117
165
|
query_params = query_params.add("flush_signal", flush_signal)
|
|
166
|
+
if input_audio_codec is not None:
|
|
167
|
+
query_params = query_params.add("input_audio_codec", input_audio_codec)
|
|
118
168
|
ws_url = ws_url + f"?{query_params}"
|
|
119
169
|
headers = self._raw_client._client_wrapper.get_headers()
|
|
120
170
|
if api_subscription_key is not None:
|
|
@@ -165,6 +215,7 @@ class AsyncSpeechToTextStreamingClient:
|
|
|
165
215
|
high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
|
|
166
216
|
vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
|
|
167
217
|
flush_signal: typing.Optional[SpeechToTextStreamingFlushSignal] = None,
|
|
218
|
+
input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
|
|
168
219
|
api_subscription_key: typing.Optional[str] = None,
|
|
169
220
|
request_options: typing.Optional[RequestOptions] = None,
|
|
170
221
|
) -> typing.AsyncIterator[AsyncSpeechToTextStreamingSocketClient]:
|
|
@@ -178,19 +229,61 @@ class AsyncSpeechToTextStreamingClient:
|
|
|
178
229
|
Parameters
|
|
179
230
|
----------
|
|
180
231
|
language_code : SpeechToTextStreamingLanguageCode
|
|
181
|
-
|
|
232
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
233
|
+
|
|
234
|
+
**Available Options (saarika:v2.5):**
|
|
235
|
+
- `hi-IN`: Hindi
|
|
236
|
+
- `bn-IN`: Bengali
|
|
237
|
+
- `gu-IN`: Gujarati
|
|
238
|
+
- `kn-IN`: Kannada
|
|
239
|
+
- `ml-IN`: Malayalam
|
|
240
|
+
- `mr-IN`: Marathi
|
|
241
|
+
- `od-IN`: Odia
|
|
242
|
+
- `pa-IN`: Punjabi
|
|
243
|
+
- `ta-IN`: Tamil
|
|
244
|
+
- `te-IN`: Telugu
|
|
245
|
+
- `en-IN`: English
|
|
246
|
+
|
|
247
|
+
**Additional Options (saaras:v3 only):**
|
|
248
|
+
- `as-IN`: Assamese
|
|
249
|
+
- `ur-IN`: Urdu
|
|
250
|
+
- `ne-IN`: Nepali
|
|
251
|
+
- `kok-IN`: Konkani
|
|
252
|
+
- `ks-IN`: Kashmiri
|
|
253
|
+
- `sd-IN`: Sindhi
|
|
254
|
+
- `sa-IN`: Sanskrit
|
|
255
|
+
- `sat-IN`: Santali
|
|
256
|
+
- `mni-IN`: Manipuri
|
|
257
|
+
- `brx-IN`: Bodo
|
|
258
|
+
- `mai-IN`: Maithili
|
|
259
|
+
- `doi-IN`: Dogri
|
|
182
260
|
|
|
183
261
|
model : typing.Optional[SpeechToTextStreamingModel]
|
|
184
|
-
|
|
262
|
+
Specifies the model to use for speech-to-text conversion.
|
|
263
|
+
|
|
264
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
265
|
+
|
|
266
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
185
267
|
|
|
186
268
|
mode : typing.Optional[SpeechToTextStreamingMode]
|
|
187
|
-
Mode of operation
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
-
|
|
192
|
-
|
|
193
|
-
|
|
269
|
+
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
270
|
+
|
|
271
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
272
|
+
|
|
273
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
274
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
275
|
+
|
|
276
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
277
|
+
- Output: `My phone number is 9840950950`
|
|
278
|
+
|
|
279
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
280
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
281
|
+
|
|
282
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
283
|
+
- Output: `mera phone number hai 9840950950`
|
|
284
|
+
|
|
285
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
286
|
+
- Output: `मेरा phone number है 9840950950`
|
|
194
287
|
|
|
195
288
|
sample_rate : typing.Optional[str]
|
|
196
289
|
Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
|
|
@@ -204,6 +297,10 @@ class AsyncSpeechToTextStreamingClient:
|
|
|
204
297
|
flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
|
|
205
298
|
Signal to flush the audio buffer and finalize transcription
|
|
206
299
|
|
|
300
|
+
input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
|
|
301
|
+
Audio codec/format of the input stream. Use this when sending raw PCM audio.
|
|
302
|
+
Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
|
|
303
|
+
|
|
207
304
|
api_subscription_key : typing.Optional[str]
|
|
208
305
|
API subscription key for authentication
|
|
209
306
|
|
|
@@ -230,6 +327,8 @@ class AsyncSpeechToTextStreamingClient:
|
|
|
230
327
|
query_params = query_params.add("vad_signals", vad_signals)
|
|
231
328
|
if flush_signal is not None:
|
|
232
329
|
query_params = query_params.add("flush_signal", flush_signal)
|
|
330
|
+
if input_audio_codec is not None:
|
|
331
|
+
query_params = query_params.add("input_audio_codec", input_audio_codec)
|
|
233
332
|
ws_url = ws_url + f"?{query_params}"
|
|
234
333
|
headers = self._raw_client._client_wrapper.get_headers()
|
|
235
334
|
if api_subscription_key is not None:
|
|
@@ -12,6 +12,7 @@ from ..core.request_options import RequestOptions
|
|
|
12
12
|
from .socket_client import AsyncSpeechToTextStreamingSocketClient, SpeechToTextStreamingSocketClient
|
|
13
13
|
from .types.speech_to_text_streaming_flush_signal import SpeechToTextStreamingFlushSignal
|
|
14
14
|
from .types.speech_to_text_streaming_high_vad_sensitivity import SpeechToTextStreamingHighVadSensitivity
|
|
15
|
+
from .types.speech_to_text_streaming_input_audio_codec import SpeechToTextStreamingInputAudioCodec
|
|
15
16
|
from .types.speech_to_text_streaming_language_code import SpeechToTextStreamingLanguageCode
|
|
16
17
|
from .types.speech_to_text_streaming_mode import SpeechToTextStreamingMode
|
|
17
18
|
from .types.speech_to_text_streaming_model import SpeechToTextStreamingModel
|
|
@@ -38,6 +39,7 @@ class RawSpeechToTextStreamingClient:
|
|
|
38
39
|
high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
|
|
39
40
|
vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
|
|
40
41
|
flush_signal: typing.Optional[SpeechToTextStreamingFlushSignal] = None,
|
|
42
|
+
input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
|
|
41
43
|
api_subscription_key: typing.Optional[str] = None,
|
|
42
44
|
request_options: typing.Optional[RequestOptions] = None,
|
|
43
45
|
) -> typing.Iterator[SpeechToTextStreamingSocketClient]:
|
|
@@ -51,19 +53,61 @@ class RawSpeechToTextStreamingClient:
|
|
|
51
53
|
Parameters
|
|
52
54
|
----------
|
|
53
55
|
language_code : SpeechToTextStreamingLanguageCode
|
|
54
|
-
|
|
56
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
57
|
+
|
|
58
|
+
**Available Options (saarika:v2.5):**
|
|
59
|
+
- `hi-IN`: Hindi
|
|
60
|
+
- `bn-IN`: Bengali
|
|
61
|
+
- `gu-IN`: Gujarati
|
|
62
|
+
- `kn-IN`: Kannada
|
|
63
|
+
- `ml-IN`: Malayalam
|
|
64
|
+
- `mr-IN`: Marathi
|
|
65
|
+
- `od-IN`: Odia
|
|
66
|
+
- `pa-IN`: Punjabi
|
|
67
|
+
- `ta-IN`: Tamil
|
|
68
|
+
- `te-IN`: Telugu
|
|
69
|
+
- `en-IN`: English
|
|
70
|
+
|
|
71
|
+
**Additional Options (saaras:v3 only):**
|
|
72
|
+
- `as-IN`: Assamese
|
|
73
|
+
- `ur-IN`: Urdu
|
|
74
|
+
- `ne-IN`: Nepali
|
|
75
|
+
- `kok-IN`: Konkani
|
|
76
|
+
- `ks-IN`: Kashmiri
|
|
77
|
+
- `sd-IN`: Sindhi
|
|
78
|
+
- `sa-IN`: Sanskrit
|
|
79
|
+
- `sat-IN`: Santali
|
|
80
|
+
- `mni-IN`: Manipuri
|
|
81
|
+
- `brx-IN`: Bodo
|
|
82
|
+
- `mai-IN`: Maithili
|
|
83
|
+
- `doi-IN`: Dogri
|
|
55
84
|
|
|
56
85
|
model : typing.Optional[SpeechToTextStreamingModel]
|
|
57
|
-
|
|
86
|
+
Specifies the model to use for speech-to-text conversion.
|
|
87
|
+
|
|
88
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
89
|
+
|
|
90
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
58
91
|
|
|
59
92
|
mode : typing.Optional[SpeechToTextStreamingMode]
|
|
60
|
-
Mode of operation
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
-
|
|
65
|
-
|
|
66
|
-
|
|
93
|
+
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
94
|
+
|
|
95
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
96
|
+
|
|
97
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
98
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
99
|
+
|
|
100
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
101
|
+
- Output: `My phone number is 9840950950`
|
|
102
|
+
|
|
103
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
104
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
105
|
+
|
|
106
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
107
|
+
- Output: `mera phone number hai 9840950950`
|
|
108
|
+
|
|
109
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
110
|
+
- Output: `मेरा phone number है 9840950950`
|
|
67
111
|
|
|
68
112
|
sample_rate : typing.Optional[str]
|
|
69
113
|
Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
|
|
@@ -77,6 +121,10 @@ class RawSpeechToTextStreamingClient:
|
|
|
77
121
|
flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
|
|
78
122
|
Signal to flush the audio buffer and finalize transcription
|
|
79
123
|
|
|
124
|
+
input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
|
|
125
|
+
Audio codec/format of the input stream. Use this when sending raw PCM audio.
|
|
126
|
+
Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
|
|
127
|
+
|
|
80
128
|
api_subscription_key : typing.Optional[str]
|
|
81
129
|
API subscription key for authentication
|
|
82
130
|
|
|
@@ -103,6 +151,8 @@ class RawSpeechToTextStreamingClient:
|
|
|
103
151
|
query_params = query_params.add("vad_signals", vad_signals)
|
|
104
152
|
if flush_signal is not None:
|
|
105
153
|
query_params = query_params.add("flush_signal", flush_signal)
|
|
154
|
+
if input_audio_codec is not None:
|
|
155
|
+
query_params = query_params.add("input_audio_codec", input_audio_codec)
|
|
106
156
|
ws_url = ws_url + f"?{query_params}"
|
|
107
157
|
headers = self._client_wrapper.get_headers()
|
|
108
158
|
if api_subscription_key is not None:
|
|
@@ -142,6 +192,7 @@ class AsyncRawSpeechToTextStreamingClient:
|
|
|
142
192
|
high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
|
|
143
193
|
vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
|
|
144
194
|
flush_signal: typing.Optional[SpeechToTextStreamingFlushSignal] = None,
|
|
195
|
+
input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
|
|
145
196
|
api_subscription_key: typing.Optional[str] = None,
|
|
146
197
|
request_options: typing.Optional[RequestOptions] = None,
|
|
147
198
|
) -> typing.AsyncIterator[AsyncSpeechToTextStreamingSocketClient]:
|
|
@@ -155,19 +206,61 @@ class AsyncRawSpeechToTextStreamingClient:
|
|
|
155
206
|
Parameters
|
|
156
207
|
----------
|
|
157
208
|
language_code : SpeechToTextStreamingLanguageCode
|
|
158
|
-
|
|
209
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
210
|
+
|
|
211
|
+
**Available Options (saarika:v2.5):**
|
|
212
|
+
- `hi-IN`: Hindi
|
|
213
|
+
- `bn-IN`: Bengali
|
|
214
|
+
- `gu-IN`: Gujarati
|
|
215
|
+
- `kn-IN`: Kannada
|
|
216
|
+
- `ml-IN`: Malayalam
|
|
217
|
+
- `mr-IN`: Marathi
|
|
218
|
+
- `od-IN`: Odia
|
|
219
|
+
- `pa-IN`: Punjabi
|
|
220
|
+
- `ta-IN`: Tamil
|
|
221
|
+
- `te-IN`: Telugu
|
|
222
|
+
- `en-IN`: English
|
|
223
|
+
|
|
224
|
+
**Additional Options (saaras:v3 only):**
|
|
225
|
+
- `as-IN`: Assamese
|
|
226
|
+
- `ur-IN`: Urdu
|
|
227
|
+
- `ne-IN`: Nepali
|
|
228
|
+
- `kok-IN`: Konkani
|
|
229
|
+
- `ks-IN`: Kashmiri
|
|
230
|
+
- `sd-IN`: Sindhi
|
|
231
|
+
- `sa-IN`: Sanskrit
|
|
232
|
+
- `sat-IN`: Santali
|
|
233
|
+
- `mni-IN`: Manipuri
|
|
234
|
+
- `brx-IN`: Bodo
|
|
235
|
+
- `mai-IN`: Maithili
|
|
236
|
+
- `doi-IN`: Dogri
|
|
159
237
|
|
|
160
238
|
model : typing.Optional[SpeechToTextStreamingModel]
|
|
161
|
-
|
|
239
|
+
Specifies the model to use for speech-to-text conversion.
|
|
240
|
+
|
|
241
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
242
|
+
|
|
243
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
162
244
|
|
|
163
245
|
mode : typing.Optional[SpeechToTextStreamingMode]
|
|
164
|
-
Mode of operation
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
-
|
|
169
|
-
|
|
170
|
-
|
|
246
|
+
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
247
|
+
|
|
248
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
249
|
+
|
|
250
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
251
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
252
|
+
|
|
253
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
254
|
+
- Output: `My phone number is 9840950950`
|
|
255
|
+
|
|
256
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
257
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
258
|
+
|
|
259
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
260
|
+
- Output: `mera phone number hai 9840950950`
|
|
261
|
+
|
|
262
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
263
|
+
- Output: `मेरा phone number है 9840950950`
|
|
171
264
|
|
|
172
265
|
sample_rate : typing.Optional[str]
|
|
173
266
|
Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
|
|
@@ -181,6 +274,10 @@ class AsyncRawSpeechToTextStreamingClient:
|
|
|
181
274
|
flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
|
|
182
275
|
Signal to flush the audio buffer and finalize transcription
|
|
183
276
|
|
|
277
|
+
input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
|
|
278
|
+
Audio codec/format of the input stream. Use this when sending raw PCM audio.
|
|
279
|
+
Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
|
|
280
|
+
|
|
184
281
|
api_subscription_key : typing.Optional[str]
|
|
185
282
|
API subscription key for authentication
|
|
186
283
|
|
|
@@ -207,6 +304,8 @@ class AsyncRawSpeechToTextStreamingClient:
|
|
|
207
304
|
query_params = query_params.add("vad_signals", vad_signals)
|
|
208
305
|
if flush_signal is not None:
|
|
209
306
|
query_params = query_params.add("flush_signal", flush_signal)
|
|
307
|
+
if input_audio_codec is not None:
|
|
308
|
+
query_params = query_params.add("input_audio_codec", input_audio_codec)
|
|
210
309
|
ws_url = ws_url + f"?{query_params}"
|
|
211
310
|
headers = self._client_wrapper.get_headers()
|
|
212
311
|
if api_subscription_key is not None:
|
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
from .speech_to_text_streaming_flush_signal import SpeechToTextStreamingFlushSignal
|
|
6
6
|
from .speech_to_text_streaming_high_vad_sensitivity import SpeechToTextStreamingHighVadSensitivity
|
|
7
|
+
from .speech_to_text_streaming_input_audio_codec import SpeechToTextStreamingInputAudioCodec
|
|
7
8
|
from .speech_to_text_streaming_language_code import SpeechToTextStreamingLanguageCode
|
|
8
9
|
from .speech_to_text_streaming_mode import SpeechToTextStreamingMode
|
|
9
10
|
from .speech_to_text_streaming_model import SpeechToTextStreamingModel
|
|
@@ -12,6 +13,7 @@ from .speech_to_text_streaming_vad_signals import SpeechToTextStreamingVadSignal
|
|
|
12
13
|
__all__ = [
|
|
13
14
|
"SpeechToTextStreamingFlushSignal",
|
|
14
15
|
"SpeechToTextStreamingHighVadSensitivity",
|
|
16
|
+
"SpeechToTextStreamingInputAudioCodec",
|
|
15
17
|
"SpeechToTextStreamingLanguageCode",
|
|
16
18
|
"SpeechToTextStreamingMode",
|
|
17
19
|
"SpeechToTextStreamingModel",
|
|
@@ -3,6 +3,30 @@
|
|
|
3
3
|
import typing
|
|
4
4
|
|
|
5
5
|
SpeechToTextStreamingLanguageCode = typing.Union[
|
|
6
|
-
typing.Literal[
|
|
6
|
+
typing.Literal[
|
|
7
|
+
"en-IN",
|
|
8
|
+
"hi-IN",
|
|
9
|
+
"bn-IN",
|
|
10
|
+
"gu-IN",
|
|
11
|
+
"kn-IN",
|
|
12
|
+
"ml-IN",
|
|
13
|
+
"mr-IN",
|
|
14
|
+
"od-IN",
|
|
15
|
+
"pa-IN",
|
|
16
|
+
"ta-IN",
|
|
17
|
+
"te-IN",
|
|
18
|
+
"as-IN",
|
|
19
|
+
"ur-IN",
|
|
20
|
+
"ne-IN",
|
|
21
|
+
"kok-IN",
|
|
22
|
+
"ks-IN",
|
|
23
|
+
"sd-IN",
|
|
24
|
+
"sa-IN",
|
|
25
|
+
"sat-IN",
|
|
26
|
+
"mni-IN",
|
|
27
|
+
"brx-IN",
|
|
28
|
+
"mai-IN",
|
|
29
|
+
"doi-IN",
|
|
30
|
+
],
|
|
7
31
|
typing.Any,
|
|
8
32
|
]
|
|
@@ -5,11 +5,13 @@
|
|
|
5
5
|
from .types import (
|
|
6
6
|
SpeechToTextTranslateStreamingFlushSignal,
|
|
7
7
|
SpeechToTextTranslateStreamingHighVadSensitivity,
|
|
8
|
+
SpeechToTextTranslateStreamingInputAudioCodec,
|
|
8
9
|
SpeechToTextTranslateStreamingVadSignals,
|
|
9
10
|
)
|
|
10
11
|
|
|
11
12
|
__all__ = [
|
|
12
13
|
"SpeechToTextTranslateStreamingFlushSignal",
|
|
13
14
|
"SpeechToTextTranslateStreamingHighVadSensitivity",
|
|
15
|
+
"SpeechToTextTranslateStreamingInputAudioCodec",
|
|
14
16
|
"SpeechToTextTranslateStreamingVadSignals",
|
|
15
17
|
]
|
|
@@ -15,6 +15,7 @@ from .types.speech_to_text_translate_streaming_flush_signal import SpeechToTextT
|
|
|
15
15
|
from .types.speech_to_text_translate_streaming_high_vad_sensitivity import (
|
|
16
16
|
SpeechToTextTranslateStreamingHighVadSensitivity,
|
|
17
17
|
)
|
|
18
|
+
from .types.speech_to_text_translate_streaming_input_audio_codec import SpeechToTextTranslateStreamingInputAudioCodec
|
|
18
19
|
from .types.speech_to_text_translate_streaming_vad_signals import SpeechToTextTranslateStreamingVadSignals
|
|
19
20
|
|
|
20
21
|
try:
|
|
@@ -47,6 +48,7 @@ class SpeechToTextTranslateStreamingClient:
|
|
|
47
48
|
high_vad_sensitivity: typing.Optional[SpeechToTextTranslateStreamingHighVadSensitivity] = None,
|
|
48
49
|
vad_signals: typing.Optional[SpeechToTextTranslateStreamingVadSignals] = None,
|
|
49
50
|
flush_signal: typing.Optional[SpeechToTextTranslateStreamingFlushSignal] = None,
|
|
51
|
+
input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
|
|
50
52
|
api_subscription_key: typing.Optional[str] = None,
|
|
51
53
|
request_options: typing.Optional[RequestOptions] = None,
|
|
52
54
|
) -> typing.Iterator[SpeechToTextTranslateStreamingSocketClient]:
|
|
@@ -60,7 +62,10 @@ class SpeechToTextTranslateStreamingClient:
|
|
|
60
62
|
Parameters
|
|
61
63
|
----------
|
|
62
64
|
model : typing.Optional[typing.Literal["saaras:v2.5"]]
|
|
63
|
-
|
|
65
|
+
Model to be used for speech to text translation.
|
|
66
|
+
|
|
67
|
+
- **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
|
|
68
|
+
- Example: Hindi audio → English text output
|
|
64
69
|
|
|
65
70
|
sample_rate : typing.Optional[str]
|
|
66
71
|
Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
|
|
@@ -74,6 +79,10 @@ class SpeechToTextTranslateStreamingClient:
|
|
|
74
79
|
flush_signal : typing.Optional[SpeechToTextTranslateStreamingFlushSignal]
|
|
75
80
|
Signal to flush the audio buffer and finalize transcription and translation
|
|
76
81
|
|
|
82
|
+
input_audio_codec : typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec]
|
|
83
|
+
Audio codec/format of the input stream. Use this when sending raw PCM audio.
|
|
84
|
+
Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
|
|
85
|
+
|
|
77
86
|
api_subscription_key : typing.Optional[str]
|
|
78
87
|
API subscription key for authentication
|
|
79
88
|
|
|
@@ -96,6 +105,8 @@ class SpeechToTextTranslateStreamingClient:
|
|
|
96
105
|
query_params = query_params.add("vad_signals", vad_signals)
|
|
97
106
|
if flush_signal is not None:
|
|
98
107
|
query_params = query_params.add("flush_signal", flush_signal)
|
|
108
|
+
if input_audio_codec is not None:
|
|
109
|
+
query_params = query_params.add("input_audio_codec", input_audio_codec)
|
|
99
110
|
ws_url = ws_url + f"?{query_params}"
|
|
100
111
|
headers = self._raw_client._client_wrapper.get_headers()
|
|
101
112
|
if api_subscription_key is not None:
|
|
@@ -144,6 +155,7 @@ class AsyncSpeechToTextTranslateStreamingClient:
|
|
|
144
155
|
high_vad_sensitivity: typing.Optional[SpeechToTextTranslateStreamingHighVadSensitivity] = None,
|
|
145
156
|
vad_signals: typing.Optional[SpeechToTextTranslateStreamingVadSignals] = None,
|
|
146
157
|
flush_signal: typing.Optional[SpeechToTextTranslateStreamingFlushSignal] = None,
|
|
158
|
+
input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
|
|
147
159
|
api_subscription_key: typing.Optional[str] = None,
|
|
148
160
|
request_options: typing.Optional[RequestOptions] = None,
|
|
149
161
|
) -> typing.AsyncIterator[AsyncSpeechToTextTranslateStreamingSocketClient]:
|
|
@@ -157,7 +169,10 @@ class AsyncSpeechToTextTranslateStreamingClient:
|
|
|
157
169
|
Parameters
|
|
158
170
|
----------
|
|
159
171
|
model : typing.Optional[typing.Literal["saaras:v2.5"]]
|
|
160
|
-
|
|
172
|
+
Model to be used for speech to text translation.
|
|
173
|
+
|
|
174
|
+
- **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
|
|
175
|
+
- Example: Hindi audio → English text output
|
|
161
176
|
|
|
162
177
|
sample_rate : typing.Optional[str]
|
|
163
178
|
Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
|
|
@@ -171,6 +186,10 @@ class AsyncSpeechToTextTranslateStreamingClient:
|
|
|
171
186
|
flush_signal : typing.Optional[SpeechToTextTranslateStreamingFlushSignal]
|
|
172
187
|
Signal to flush the audio buffer and finalize transcription and translation
|
|
173
188
|
|
|
189
|
+
input_audio_codec : typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec]
|
|
190
|
+
Audio codec/format of the input stream. Use this when sending raw PCM audio.
|
|
191
|
+
Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
|
|
192
|
+
|
|
174
193
|
api_subscription_key : typing.Optional[str]
|
|
175
194
|
API subscription key for authentication
|
|
176
195
|
|
|
@@ -193,6 +212,8 @@ class AsyncSpeechToTextTranslateStreamingClient:
|
|
|
193
212
|
query_params = query_params.add("vad_signals", vad_signals)
|
|
194
213
|
if flush_signal is not None:
|
|
195
214
|
query_params = query_params.add("flush_signal", flush_signal)
|
|
215
|
+
if input_audio_codec is not None:
|
|
216
|
+
query_params = query_params.add("input_audio_codec", input_audio_codec)
|
|
196
217
|
ws_url = ws_url + f"?{query_params}"
|
|
197
218
|
headers = self._raw_client._client_wrapper.get_headers()
|
|
198
219
|
if api_subscription_key is not None:
|