sarvamai 0.1.23a4__py3-none-any.whl → 0.1.23a6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. sarvamai/__init__.py +4 -0
  2. sarvamai/core/client_wrapper.py +2 -2
  3. sarvamai/requests/speech_to_text_job_parameters.py +37 -5
  4. sarvamai/requests/speech_to_text_response.py +6 -14
  5. sarvamai/requests/speech_to_text_transcription_data.py +14 -0
  6. sarvamai/requests/speech_to_text_translate_job_parameters.py +4 -1
  7. sarvamai/requests/speech_to_text_translate_response.py +6 -9
  8. sarvamai/requests/speech_to_text_translate_transcription_data.py +13 -0
  9. sarvamai/speech_to_text/client.py +84 -26
  10. sarvamai/speech_to_text/raw_client.py +84 -26
  11. sarvamai/speech_to_text_streaming/__init__.py +2 -0
  12. sarvamai/speech_to_text_streaming/client.py +117 -18
  13. sarvamai/speech_to_text_streaming/raw_client.py +117 -18
  14. sarvamai/speech_to_text_streaming/types/__init__.py +2 -0
  15. sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_input_audio_codec.py +7 -0
  16. sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_language_code.py +25 -1
  17. sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_mode.py +1 -1
  18. sarvamai/speech_to_text_translate_streaming/__init__.py +2 -0
  19. sarvamai/speech_to_text_translate_streaming/client.py +23 -2
  20. sarvamai/speech_to_text_translate_streaming/raw_client.py +23 -2
  21. sarvamai/speech_to_text_translate_streaming/types/__init__.py +2 -0
  22. sarvamai/speech_to_text_translate_streaming/types/speech_to_text_translate_streaming_input_audio_codec.py +7 -0
  23. sarvamai/types/mode.py +1 -3
  24. sarvamai/types/speech_to_text_job_parameters.py +37 -5
  25. sarvamai/types/speech_to_text_language.py +24 -1
  26. sarvamai/types/speech_to_text_model.py +1 -3
  27. sarvamai/types/speech_to_text_response.py +6 -14
  28. sarvamai/types/speech_to_text_transcription_data.py +14 -0
  29. sarvamai/types/speech_to_text_translate_job_parameters.py +4 -1
  30. sarvamai/types/speech_to_text_translate_language.py +25 -1
  31. sarvamai/types/speech_to_text_translate_model.py +1 -1
  32. sarvamai/types/speech_to_text_translate_response.py +6 -9
  33. sarvamai/types/speech_to_text_translate_transcription_data.py +13 -0
  34. {sarvamai-0.1.23a4.dist-info → sarvamai-0.1.23a6.dist-info}/METADATA +1 -1
  35. {sarvamai-0.1.23a4.dist-info → sarvamai-0.1.23a6.dist-info}/RECORD +36 -34
  36. {sarvamai-0.1.23a4.dist-info → sarvamai-0.1.23a6.dist-info}/WHEEL +0 -0
@@ -13,6 +13,7 @@ from .raw_client import AsyncRawSpeechToTextStreamingClient, RawSpeechToTextStre
13
13
  from .socket_client import AsyncSpeechToTextStreamingSocketClient, SpeechToTextStreamingSocketClient
14
14
  from .types.speech_to_text_streaming_flush_signal import SpeechToTextStreamingFlushSignal
15
15
  from .types.speech_to_text_streaming_high_vad_sensitivity import SpeechToTextStreamingHighVadSensitivity
16
+ from .types.speech_to_text_streaming_input_audio_codec import SpeechToTextStreamingInputAudioCodec
16
17
  from .types.speech_to_text_streaming_language_code import SpeechToTextStreamingLanguageCode
17
18
  from .types.speech_to_text_streaming_mode import SpeechToTextStreamingMode
18
19
  from .types.speech_to_text_streaming_model import SpeechToTextStreamingModel
@@ -50,6 +51,7 @@ class SpeechToTextStreamingClient:
50
51
  high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
51
52
  vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
52
53
  flush_signal: typing.Optional[SpeechToTextStreamingFlushSignal] = None,
54
+ input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
53
55
  api_subscription_key: typing.Optional[str] = None,
54
56
  request_options: typing.Optional[RequestOptions] = None,
55
57
  ) -> typing.Iterator[SpeechToTextStreamingSocketClient]:
@@ -63,19 +65,61 @@ class SpeechToTextStreamingClient:
63
65
  Parameters
64
66
  ----------
65
67
  language_code : SpeechToTextStreamingLanguageCode
66
- Language code for speech recognition (BCP-47 format)
68
+ Specifies the language of the input audio in BCP-47 format.
69
+
70
+ **Available Options (saarika:v2.5):**
71
+ - `hi-IN`: Hindi
72
+ - `bn-IN`: Bengali
73
+ - `gu-IN`: Gujarati
74
+ - `kn-IN`: Kannada
75
+ - `ml-IN`: Malayalam
76
+ - `mr-IN`: Marathi
77
+ - `od-IN`: Odia
78
+ - `pa-IN`: Punjabi
79
+ - `ta-IN`: Tamil
80
+ - `te-IN`: Telugu
81
+ - `en-IN`: English
82
+
83
+ **Additional Options (saaras:v3 only):**
84
+ - `as-IN`: Assamese
85
+ - `ur-IN`: Urdu
86
+ - `ne-IN`: Nepali
87
+ - `kok-IN`: Konkani
88
+ - `ks-IN`: Kashmiri
89
+ - `sd-IN`: Sindhi
90
+ - `sa-IN`: Sanskrit
91
+ - `sat-IN`: Santali
92
+ - `mni-IN`: Manipuri
93
+ - `brx-IN`: Bodo
94
+ - `mai-IN`: Maithili
95
+ - `doi-IN`: Dogri
67
96
 
68
97
  model : typing.Optional[SpeechToTextStreamingModel]
69
- Speech to text model to use
98
+ Specifies the model to use for speech-to-text conversion.
99
+
100
+ - **saarika:v2.5** (default): Transcribes audio in the spoken language.
101
+
102
+ - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
70
103
 
71
104
  mode : typing.Optional[SpeechToTextStreamingMode]
72
- Mode of operation for saaras:v3 model. Only applicable when model is 'saaras:v3'.
73
- - transcribe: Standard Whisper transcription
74
- - translate: Standard Whisper translation to English
75
- - indic-en: Translate Indic languages to English
76
- - verbatim: Exact transcription in original script
77
- - translit: Transliteration to Latin script
78
- - codemix: Code-mixed output (native + English)
105
+ Mode of operation. **Only applicable when using saaras:v3 model.**
106
+
107
+ Example audio: 'मेरा फोन नंबर है 9840950950'
108
+
109
+ - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
110
+ - Output: `मेरा फोन नंबर है 9840950950`
111
+
112
+ - **translate**: Translates speech from any supported Indic language to English.
113
+ - Output: `My phone number is 9840950950`
114
+
115
+ - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
116
+ - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
117
+
118
+ - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
119
+ - Output: `mera phone number hai 9840950950`
120
+
121
+ - **codemix**: Code-mixed text with English words in English and Indic words in native script.
122
+ - Output: `मेरा phone number है 9840950950`
79
123
 
80
124
  sample_rate : typing.Optional[str]
81
125
  Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
@@ -89,6 +133,10 @@ class SpeechToTextStreamingClient:
89
133
  flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
90
134
  Signal to flush the audio buffer and finalize transcription
91
135
 
136
+ input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
137
+ Audio codec/format of the input stream. Use this when sending raw PCM audio.
138
+ Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
139
+
92
140
  api_subscription_key : typing.Optional[str]
93
141
  API subscription key for authentication
94
142
 
@@ -115,6 +163,8 @@ class SpeechToTextStreamingClient:
115
163
  query_params = query_params.add("vad_signals", vad_signals)
116
164
  if flush_signal is not None:
117
165
  query_params = query_params.add("flush_signal", flush_signal)
166
+ if input_audio_codec is not None:
167
+ query_params = query_params.add("input_audio_codec", input_audio_codec)
118
168
  ws_url = ws_url + f"?{query_params}"
119
169
  headers = self._raw_client._client_wrapper.get_headers()
120
170
  if api_subscription_key is not None:
@@ -165,6 +215,7 @@ class AsyncSpeechToTextStreamingClient:
165
215
  high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
166
216
  vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
167
217
  flush_signal: typing.Optional[SpeechToTextStreamingFlushSignal] = None,
218
+ input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
168
219
  api_subscription_key: typing.Optional[str] = None,
169
220
  request_options: typing.Optional[RequestOptions] = None,
170
221
  ) -> typing.AsyncIterator[AsyncSpeechToTextStreamingSocketClient]:
@@ -178,19 +229,61 @@ class AsyncSpeechToTextStreamingClient:
178
229
  Parameters
179
230
  ----------
180
231
  language_code : SpeechToTextStreamingLanguageCode
181
- Language code for speech recognition (BCP-47 format)
232
+ Specifies the language of the input audio in BCP-47 format.
233
+
234
+ **Available Options (saarika:v2.5):**
235
+ - `hi-IN`: Hindi
236
+ - `bn-IN`: Bengali
237
+ - `gu-IN`: Gujarati
238
+ - `kn-IN`: Kannada
239
+ - `ml-IN`: Malayalam
240
+ - `mr-IN`: Marathi
241
+ - `od-IN`: Odia
242
+ - `pa-IN`: Punjabi
243
+ - `ta-IN`: Tamil
244
+ - `te-IN`: Telugu
245
+ - `en-IN`: English
246
+
247
+ **Additional Options (saaras:v3 only):**
248
+ - `as-IN`: Assamese
249
+ - `ur-IN`: Urdu
250
+ - `ne-IN`: Nepali
251
+ - `kok-IN`: Konkani
252
+ - `ks-IN`: Kashmiri
253
+ - `sd-IN`: Sindhi
254
+ - `sa-IN`: Sanskrit
255
+ - `sat-IN`: Santali
256
+ - `mni-IN`: Manipuri
257
+ - `brx-IN`: Bodo
258
+ - `mai-IN`: Maithili
259
+ - `doi-IN`: Dogri
182
260
 
183
261
  model : typing.Optional[SpeechToTextStreamingModel]
184
- Speech to text model to use
262
+ Specifies the model to use for speech-to-text conversion.
263
+
264
+ - **saarika:v2.5** (default): Transcribes audio in the spoken language.
265
+
266
+ - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
185
267
 
186
268
  mode : typing.Optional[SpeechToTextStreamingMode]
187
- Mode of operation for saaras:v3 model. Only applicable when model is 'saaras:v3'.
188
- - transcribe: Standard Whisper transcription
189
- - translate: Standard Whisper translation to English
190
- - indic-en: Translate Indic languages to English
191
- - verbatim: Exact transcription in original script
192
- - translit: Transliteration to Latin script
193
- - codemix: Code-mixed output (native + English)
269
+ Mode of operation. **Only applicable when using saaras:v3 model.**
270
+
271
+ Example audio: 'मेरा फोन नंबर है 9840950950'
272
+
273
+ - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
274
+ - Output: `मेरा फोन नंबर है 9840950950`
275
+
276
+ - **translate**: Translates speech from any supported Indic language to English.
277
+ - Output: `My phone number is 9840950950`
278
+
279
+ - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
280
+ - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
281
+
282
+ - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
283
+ - Output: `mera phone number hai 9840950950`
284
+
285
+ - **codemix**: Code-mixed text with English words in English and Indic words in native script.
286
+ - Output: `मेरा phone number है 9840950950`
194
287
 
195
288
  sample_rate : typing.Optional[str]
196
289
  Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
@@ -204,6 +297,10 @@ class AsyncSpeechToTextStreamingClient:
204
297
  flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
205
298
  Signal to flush the audio buffer and finalize transcription
206
299
 
300
+ input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
301
+ Audio codec/format of the input stream. Use this when sending raw PCM audio.
302
+ Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
303
+
207
304
  api_subscription_key : typing.Optional[str]
208
305
  API subscription key for authentication
209
306
 
@@ -230,6 +327,8 @@ class AsyncSpeechToTextStreamingClient:
230
327
  query_params = query_params.add("vad_signals", vad_signals)
231
328
  if flush_signal is not None:
232
329
  query_params = query_params.add("flush_signal", flush_signal)
330
+ if input_audio_codec is not None:
331
+ query_params = query_params.add("input_audio_codec", input_audio_codec)
233
332
  ws_url = ws_url + f"?{query_params}"
234
333
  headers = self._raw_client._client_wrapper.get_headers()
235
334
  if api_subscription_key is not None:
@@ -12,6 +12,7 @@ from ..core.request_options import RequestOptions
12
12
  from .socket_client import AsyncSpeechToTextStreamingSocketClient, SpeechToTextStreamingSocketClient
13
13
  from .types.speech_to_text_streaming_flush_signal import SpeechToTextStreamingFlushSignal
14
14
  from .types.speech_to_text_streaming_high_vad_sensitivity import SpeechToTextStreamingHighVadSensitivity
15
+ from .types.speech_to_text_streaming_input_audio_codec import SpeechToTextStreamingInputAudioCodec
15
16
  from .types.speech_to_text_streaming_language_code import SpeechToTextStreamingLanguageCode
16
17
  from .types.speech_to_text_streaming_mode import SpeechToTextStreamingMode
17
18
  from .types.speech_to_text_streaming_model import SpeechToTextStreamingModel
@@ -38,6 +39,7 @@ class RawSpeechToTextStreamingClient:
38
39
  high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
39
40
  vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
40
41
  flush_signal: typing.Optional[SpeechToTextStreamingFlushSignal] = None,
42
+ input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
41
43
  api_subscription_key: typing.Optional[str] = None,
42
44
  request_options: typing.Optional[RequestOptions] = None,
43
45
  ) -> typing.Iterator[SpeechToTextStreamingSocketClient]:
@@ -51,19 +53,61 @@ class RawSpeechToTextStreamingClient:
51
53
  Parameters
52
54
  ----------
53
55
  language_code : SpeechToTextStreamingLanguageCode
54
- Language code for speech recognition (BCP-47 format)
56
+ Specifies the language of the input audio in BCP-47 format.
57
+
58
+ **Available Options (saarika:v2.5):**
59
+ - `hi-IN`: Hindi
60
+ - `bn-IN`: Bengali
61
+ - `gu-IN`: Gujarati
62
+ - `kn-IN`: Kannada
63
+ - `ml-IN`: Malayalam
64
+ - `mr-IN`: Marathi
65
+ - `od-IN`: Odia
66
+ - `pa-IN`: Punjabi
67
+ - `ta-IN`: Tamil
68
+ - `te-IN`: Telugu
69
+ - `en-IN`: English
70
+
71
+ **Additional Options (saaras:v3 only):**
72
+ - `as-IN`: Assamese
73
+ - `ur-IN`: Urdu
74
+ - `ne-IN`: Nepali
75
+ - `kok-IN`: Konkani
76
+ - `ks-IN`: Kashmiri
77
+ - `sd-IN`: Sindhi
78
+ - `sa-IN`: Sanskrit
79
+ - `sat-IN`: Santali
80
+ - `mni-IN`: Manipuri
81
+ - `brx-IN`: Bodo
82
+ - `mai-IN`: Maithili
83
+ - `doi-IN`: Dogri
55
84
 
56
85
  model : typing.Optional[SpeechToTextStreamingModel]
57
- Speech to text model to use
86
+ Specifies the model to use for speech-to-text conversion.
87
+
88
+ - **saarika:v2.5** (default): Transcribes audio in the spoken language.
89
+
90
+ - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
58
91
 
59
92
  mode : typing.Optional[SpeechToTextStreamingMode]
60
- Mode of operation for saaras:v3 model. Only applicable when model is 'saaras:v3'.
61
- - transcribe: Standard Whisper transcription
62
- - translate: Standard Whisper translation to English
63
- - indic-en: Translate Indic languages to English
64
- - verbatim: Exact transcription in original script
65
- - translit: Transliteration to Latin script
66
- - codemix: Code-mixed output (native + English)
93
+ Mode of operation. **Only applicable when using saaras:v3 model.**
94
+
95
+ Example audio: 'मेरा फोन नंबर है 9840950950'
96
+
97
+ - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
98
+ - Output: `मेरा फोन नंबर है 9840950950`
99
+
100
+ - **translate**: Translates speech from any supported Indic language to English.
101
+ - Output: `My phone number is 9840950950`
102
+
103
+ - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
104
+ - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
105
+
106
+ - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
107
+ - Output: `mera phone number hai 9840950950`
108
+
109
+ - **codemix**: Code-mixed text with English words in English and Indic words in native script.
110
+ - Output: `मेरा phone number है 9840950950`
67
111
 
68
112
  sample_rate : typing.Optional[str]
69
113
  Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
@@ -77,6 +121,10 @@ class RawSpeechToTextStreamingClient:
77
121
  flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
78
122
  Signal to flush the audio buffer and finalize transcription
79
123
 
124
+ input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
125
+ Audio codec/format of the input stream. Use this when sending raw PCM audio.
126
+ Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
127
+
80
128
  api_subscription_key : typing.Optional[str]
81
129
  API subscription key for authentication
82
130
 
@@ -103,6 +151,8 @@ class RawSpeechToTextStreamingClient:
103
151
  query_params = query_params.add("vad_signals", vad_signals)
104
152
  if flush_signal is not None:
105
153
  query_params = query_params.add("flush_signal", flush_signal)
154
+ if input_audio_codec is not None:
155
+ query_params = query_params.add("input_audio_codec", input_audio_codec)
106
156
  ws_url = ws_url + f"?{query_params}"
107
157
  headers = self._client_wrapper.get_headers()
108
158
  if api_subscription_key is not None:
@@ -142,6 +192,7 @@ class AsyncRawSpeechToTextStreamingClient:
142
192
  high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
143
193
  vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
144
194
  flush_signal: typing.Optional[SpeechToTextStreamingFlushSignal] = None,
195
+ input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
145
196
  api_subscription_key: typing.Optional[str] = None,
146
197
  request_options: typing.Optional[RequestOptions] = None,
147
198
  ) -> typing.AsyncIterator[AsyncSpeechToTextStreamingSocketClient]:
@@ -155,19 +206,61 @@ class AsyncRawSpeechToTextStreamingClient:
155
206
  Parameters
156
207
  ----------
157
208
  language_code : SpeechToTextStreamingLanguageCode
158
- Language code for speech recognition (BCP-47 format)
209
+ Specifies the language of the input audio in BCP-47 format.
210
+
211
+ **Available Options (saarika:v2.5):**
212
+ - `hi-IN`: Hindi
213
+ - `bn-IN`: Bengali
214
+ - `gu-IN`: Gujarati
215
+ - `kn-IN`: Kannada
216
+ - `ml-IN`: Malayalam
217
+ - `mr-IN`: Marathi
218
+ - `od-IN`: Odia
219
+ - `pa-IN`: Punjabi
220
+ - `ta-IN`: Tamil
221
+ - `te-IN`: Telugu
222
+ - `en-IN`: English
223
+
224
+ **Additional Options (saaras:v3 only):**
225
+ - `as-IN`: Assamese
226
+ - `ur-IN`: Urdu
227
+ - `ne-IN`: Nepali
228
+ - `kok-IN`: Konkani
229
+ - `ks-IN`: Kashmiri
230
+ - `sd-IN`: Sindhi
231
+ - `sa-IN`: Sanskrit
232
+ - `sat-IN`: Santali
233
+ - `mni-IN`: Manipuri
234
+ - `brx-IN`: Bodo
235
+ - `mai-IN`: Maithili
236
+ - `doi-IN`: Dogri
159
237
 
160
238
  model : typing.Optional[SpeechToTextStreamingModel]
161
- Speech to text model to use
239
+ Specifies the model to use for speech-to-text conversion.
240
+
241
+ - **saarika:v2.5** (default): Transcribes audio in the spoken language.
242
+
243
+ - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
162
244
 
163
245
  mode : typing.Optional[SpeechToTextStreamingMode]
164
- Mode of operation for saaras:v3 model. Only applicable when model is 'saaras:v3'.
165
- - transcribe: Standard Whisper transcription
166
- - translate: Standard Whisper translation to English
167
- - indic-en: Translate Indic languages to English
168
- - verbatim: Exact transcription in original script
169
- - translit: Transliteration to Latin script
170
- - codemix: Code-mixed output (native + English)
246
+ Mode of operation. **Only applicable when using saaras:v3 model.**
247
+
248
+ Example audio: 'मेरा फोन नंबर है 9840950950'
249
+
250
+ - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
251
+ - Output: `मेरा फोन नंबर है 9840950950`
252
+
253
+ - **translate**: Translates speech from any supported Indic language to English.
254
+ - Output: `My phone number is 9840950950`
255
+
256
+ - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
257
+ - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
258
+
259
+ - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
260
+ - Output: `mera phone number hai 9840950950`
261
+
262
+ - **codemix**: Code-mixed text with English words in English and Indic words in native script.
263
+ - Output: `मेरा phone number है 9840950950`
171
264
 
172
265
  sample_rate : typing.Optional[str]
173
266
  Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
@@ -181,6 +274,10 @@ class AsyncRawSpeechToTextStreamingClient:
181
274
  flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
182
275
  Signal to flush the audio buffer and finalize transcription
183
276
 
277
+ input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
278
+ Audio codec/format of the input stream. Use this when sending raw PCM audio.
279
+ Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
280
+
184
281
  api_subscription_key : typing.Optional[str]
185
282
  API subscription key for authentication
186
283
 
@@ -207,6 +304,8 @@ class AsyncRawSpeechToTextStreamingClient:
207
304
  query_params = query_params.add("vad_signals", vad_signals)
208
305
  if flush_signal is not None:
209
306
  query_params = query_params.add("flush_signal", flush_signal)
307
+ if input_audio_codec is not None:
308
+ query_params = query_params.add("input_audio_codec", input_audio_codec)
210
309
  ws_url = ws_url + f"?{query_params}"
211
310
  headers = self._client_wrapper.get_headers()
212
311
  if api_subscription_key is not None:
@@ -4,6 +4,7 @@
4
4
 
5
5
  from .speech_to_text_streaming_flush_signal import SpeechToTextStreamingFlushSignal
6
6
  from .speech_to_text_streaming_high_vad_sensitivity import SpeechToTextStreamingHighVadSensitivity
7
+ from .speech_to_text_streaming_input_audio_codec import SpeechToTextStreamingInputAudioCodec
7
8
  from .speech_to_text_streaming_language_code import SpeechToTextStreamingLanguageCode
8
9
  from .speech_to_text_streaming_mode import SpeechToTextStreamingMode
9
10
  from .speech_to_text_streaming_model import SpeechToTextStreamingModel
@@ -12,6 +13,7 @@ from .speech_to_text_streaming_vad_signals import SpeechToTextStreamingVadSignal
12
13
  __all__ = [
13
14
  "SpeechToTextStreamingFlushSignal",
14
15
  "SpeechToTextStreamingHighVadSensitivity",
16
+ "SpeechToTextStreamingInputAudioCodec",
15
17
  "SpeechToTextStreamingLanguageCode",
16
18
  "SpeechToTextStreamingMode",
17
19
  "SpeechToTextStreamingModel",
@@ -0,0 +1,7 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ import typing
4
+
5
+ SpeechToTextStreamingInputAudioCodec = typing.Union[
6
+ typing.Literal["wav", "pcm_s16le", "pcm_l16", "pcm_raw"], typing.Any
7
+ ]
@@ -3,6 +3,30 @@
3
3
  import typing
4
4
 
5
5
  SpeechToTextStreamingLanguageCode = typing.Union[
6
- typing.Literal["en-IN", "hi-IN", "bn-IN", "gu-IN", "kn-IN", "ml-IN", "mr-IN", "od-IN", "pa-IN", "ta-IN", "te-IN"],
6
+ typing.Literal[
7
+ "en-IN",
8
+ "hi-IN",
9
+ "bn-IN",
10
+ "gu-IN",
11
+ "kn-IN",
12
+ "ml-IN",
13
+ "mr-IN",
14
+ "od-IN",
15
+ "pa-IN",
16
+ "ta-IN",
17
+ "te-IN",
18
+ "as-IN",
19
+ "ur-IN",
20
+ "ne-IN",
21
+ "kok-IN",
22
+ "ks-IN",
23
+ "sd-IN",
24
+ "sa-IN",
25
+ "sat-IN",
26
+ "mni-IN",
27
+ "brx-IN",
28
+ "mai-IN",
29
+ "doi-IN",
30
+ ],
7
31
  typing.Any,
8
32
  ]
@@ -3,5 +3,5 @@
3
3
  import typing
4
4
 
5
5
  SpeechToTextStreamingMode = typing.Union[
6
- typing.Literal["transcribe", "translate", "indic-en", "verbatim", "translit", "codemix"], typing.Any
6
+ typing.Literal["transcribe", "translate", "verbatim", "translit", "codemix"], typing.Any
7
7
  ]
@@ -5,11 +5,13 @@
5
5
  from .types import (
6
6
  SpeechToTextTranslateStreamingFlushSignal,
7
7
  SpeechToTextTranslateStreamingHighVadSensitivity,
8
+ SpeechToTextTranslateStreamingInputAudioCodec,
8
9
  SpeechToTextTranslateStreamingVadSignals,
9
10
  )
10
11
 
11
12
  __all__ = [
12
13
  "SpeechToTextTranslateStreamingFlushSignal",
13
14
  "SpeechToTextTranslateStreamingHighVadSensitivity",
15
+ "SpeechToTextTranslateStreamingInputAudioCodec",
14
16
  "SpeechToTextTranslateStreamingVadSignals",
15
17
  ]
@@ -15,6 +15,7 @@ from .types.speech_to_text_translate_streaming_flush_signal import SpeechToTextT
15
15
  from .types.speech_to_text_translate_streaming_high_vad_sensitivity import (
16
16
  SpeechToTextTranslateStreamingHighVadSensitivity,
17
17
  )
18
+ from .types.speech_to_text_translate_streaming_input_audio_codec import SpeechToTextTranslateStreamingInputAudioCodec
18
19
  from .types.speech_to_text_translate_streaming_vad_signals import SpeechToTextTranslateStreamingVadSignals
19
20
 
20
21
  try:
@@ -47,6 +48,7 @@ class SpeechToTextTranslateStreamingClient:
47
48
  high_vad_sensitivity: typing.Optional[SpeechToTextTranslateStreamingHighVadSensitivity] = None,
48
49
  vad_signals: typing.Optional[SpeechToTextTranslateStreamingVadSignals] = None,
49
50
  flush_signal: typing.Optional[SpeechToTextTranslateStreamingFlushSignal] = None,
51
+ input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
50
52
  api_subscription_key: typing.Optional[str] = None,
51
53
  request_options: typing.Optional[RequestOptions] = None,
52
54
  ) -> typing.Iterator[SpeechToTextTranslateStreamingSocketClient]:
@@ -60,7 +62,10 @@ class SpeechToTextTranslateStreamingClient:
60
62
  Parameters
61
63
  ----------
62
64
  model : typing.Optional[typing.Literal["saaras:v2.5"]]
63
- Speech to text model to use (defaults to "saaras:v2.5" if not specified)
65
+ Model to be used for speech to text translation.
66
+
67
+ - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
68
+ - Example: Hindi audio → English text output
64
69
 
65
70
  sample_rate : typing.Optional[str]
66
71
  Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
@@ -74,6 +79,10 @@ class SpeechToTextTranslateStreamingClient:
74
79
  flush_signal : typing.Optional[SpeechToTextTranslateStreamingFlushSignal]
75
80
  Signal to flush the audio buffer and finalize transcription and translation
76
81
 
82
+ input_audio_codec : typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec]
83
+ Audio codec/format of the input stream. Use this when sending raw PCM audio.
84
+ Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
85
+
77
86
  api_subscription_key : typing.Optional[str]
78
87
  API subscription key for authentication
79
88
 
@@ -96,6 +105,8 @@ class SpeechToTextTranslateStreamingClient:
96
105
  query_params = query_params.add("vad_signals", vad_signals)
97
106
  if flush_signal is not None:
98
107
  query_params = query_params.add("flush_signal", flush_signal)
108
+ if input_audio_codec is not None:
109
+ query_params = query_params.add("input_audio_codec", input_audio_codec)
99
110
  ws_url = ws_url + f"?{query_params}"
100
111
  headers = self._raw_client._client_wrapper.get_headers()
101
112
  if api_subscription_key is not None:
@@ -144,6 +155,7 @@ class AsyncSpeechToTextTranslateStreamingClient:
144
155
  high_vad_sensitivity: typing.Optional[SpeechToTextTranslateStreamingHighVadSensitivity] = None,
145
156
  vad_signals: typing.Optional[SpeechToTextTranslateStreamingVadSignals] = None,
146
157
  flush_signal: typing.Optional[SpeechToTextTranslateStreamingFlushSignal] = None,
158
+ input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
147
159
  api_subscription_key: typing.Optional[str] = None,
148
160
  request_options: typing.Optional[RequestOptions] = None,
149
161
  ) -> typing.AsyncIterator[AsyncSpeechToTextTranslateStreamingSocketClient]:
@@ -157,7 +169,10 @@ class AsyncSpeechToTextTranslateStreamingClient:
157
169
  Parameters
158
170
  ----------
159
171
  model : typing.Optional[typing.Literal["saaras:v2.5"]]
160
- Speech to text model to use (defaults to "saaras:v2.5" if not specified)
172
+ Model to be used for speech to text translation.
173
+
174
+ - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
175
+ - Example: Hindi audio → English text output
161
176
 
162
177
  sample_rate : typing.Optional[str]
163
178
  Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
@@ -171,6 +186,10 @@ class AsyncSpeechToTextTranslateStreamingClient:
171
186
  flush_signal : typing.Optional[SpeechToTextTranslateStreamingFlushSignal]
172
187
  Signal to flush the audio buffer and finalize transcription and translation
173
188
 
189
+ input_audio_codec : typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec]
190
+ Audio codec/format of the input stream. Use this when sending raw PCM audio.
191
+ Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
192
+
174
193
  api_subscription_key : typing.Optional[str]
175
194
  API subscription key for authentication
176
195
 
@@ -193,6 +212,8 @@ class AsyncSpeechToTextTranslateStreamingClient:
193
212
  query_params = query_params.add("vad_signals", vad_signals)
194
213
  if flush_signal is not None:
195
214
  query_params = query_params.add("flush_signal", flush_signal)
215
+ if input_audio_codec is not None:
216
+ query_params = query_params.add("input_audio_codec", input_audio_codec)
196
217
  ws_url = ws_url + f"?{query_params}"
197
218
  headers = self._raw_client._client_wrapper.get_headers()
198
219
  if api_subscription_key is not None: