sarvamai 0.1.22a4__py3-none-any.whl → 0.1.22a8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. sarvamai/__init__.py +62 -3
  2. sarvamai/client.py +3 -0
  3. sarvamai/core/client_wrapper.py +2 -2
  4. sarvamai/doc_digitization_job/__init__.py +4 -0
  5. sarvamai/doc_digitization_job/client.py +775 -0
  6. sarvamai/doc_digitization_job/job.py +496 -0
  7. sarvamai/doc_digitization_job/raw_client.py +1176 -0
  8. sarvamai/requests/__init__.py +20 -0
  9. sarvamai/requests/audio_data.py +0 -6
  10. sarvamai/requests/configure_connection.py +4 -0
  11. sarvamai/requests/configure_connection_data.py +40 -11
  12. sarvamai/requests/doc_digitization_create_job_response.py +25 -0
  13. sarvamai/requests/doc_digitization_download_files_response.py +37 -0
  14. sarvamai/requests/doc_digitization_error_details.py +21 -0
  15. sarvamai/requests/doc_digitization_error_message.py +11 -0
  16. sarvamai/requests/doc_digitization_job_detail.py +64 -0
  17. sarvamai/requests/doc_digitization_job_parameters.py +21 -0
  18. sarvamai/requests/doc_digitization_job_status_response.py +65 -0
  19. sarvamai/requests/doc_digitization_page_error.py +24 -0
  20. sarvamai/requests/doc_digitization_upload_files_response.py +34 -0
  21. sarvamai/requests/doc_digitization_webhook_callback.py +19 -0
  22. sarvamai/requests/speech_to_text_job_parameters.py +43 -2
  23. sarvamai/requests/speech_to_text_translate_job_parameters.py +4 -1
  24. sarvamai/speech_to_text/client.py +95 -10
  25. sarvamai/speech_to_text/raw_client.py +95 -10
  26. sarvamai/speech_to_text_job/client.py +60 -15
  27. sarvamai/speech_to_text_streaming/__init__.py +4 -0
  28. sarvamai/speech_to_text_streaming/client.py +102 -18
  29. sarvamai/speech_to_text_streaming/raw_client.py +102 -18
  30. sarvamai/speech_to_text_streaming/types/__init__.py +4 -0
  31. sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_input_audio_codec.py +1 -27
  32. sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_mode.py +7 -0
  33. sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_model.py +5 -0
  34. sarvamai/speech_to_text_translate_streaming/client.py +20 -12
  35. sarvamai/speech_to_text_translate_streaming/raw_client.py +20 -12
  36. sarvamai/speech_to_text_translate_streaming/types/speech_to_text_translate_streaming_input_audio_codec.py +1 -27
  37. sarvamai/text/client.py +0 -12
  38. sarvamai/text/raw_client.py +0 -12
  39. sarvamai/text_to_speech/client.py +116 -14
  40. sarvamai/text_to_speech/raw_client.py +116 -14
  41. sarvamai/text_to_speech_streaming/__init__.py +2 -2
  42. sarvamai/text_to_speech_streaming/client.py +19 -6
  43. sarvamai/text_to_speech_streaming/raw_client.py +19 -6
  44. sarvamai/text_to_speech_streaming/types/__init__.py +2 -1
  45. sarvamai/text_to_speech_streaming/types/text_to_speech_streaming_model.py +5 -0
  46. sarvamai/types/__init__.py +34 -2
  47. sarvamai/types/audio_data.py +0 -6
  48. sarvamai/types/configure_connection.py +4 -0
  49. sarvamai/types/configure_connection_data.py +40 -11
  50. sarvamai/types/configure_connection_data_model.py +5 -0
  51. sarvamai/types/configure_connection_data_speaker.py +35 -1
  52. sarvamai/types/doc_digitization_create_job_response.py +37 -0
  53. sarvamai/types/doc_digitization_download_files_response.py +47 -0
  54. sarvamai/types/doc_digitization_error_code.py +15 -0
  55. sarvamai/types/doc_digitization_error_details.py +33 -0
  56. sarvamai/types/doc_digitization_error_message.py +23 -0
  57. sarvamai/types/doc_digitization_job_detail.py +74 -0
  58. sarvamai/types/doc_digitization_job_detail_state.py +7 -0
  59. sarvamai/types/doc_digitization_job_parameters.py +33 -0
  60. sarvamai/types/doc_digitization_job_state.py +7 -0
  61. sarvamai/types/doc_digitization_job_status_response.py +75 -0
  62. sarvamai/types/doc_digitization_output_format.py +5 -0
  63. sarvamai/types/doc_digitization_page_error.py +36 -0
  64. sarvamai/types/doc_digitization_supported_language.py +32 -0
  65. sarvamai/types/doc_digitization_upload_files_response.py +44 -0
  66. sarvamai/types/doc_digitization_webhook_callback.py +31 -0
  67. sarvamai/types/mode.py +5 -0
  68. sarvamai/types/speech_to_text_job_parameters.py +43 -2
  69. sarvamai/types/speech_to_text_model.py +1 -1
  70. sarvamai/types/speech_to_text_translate_job_parameters.py +4 -1
  71. sarvamai/types/text_to_speech_model.py +1 -1
  72. sarvamai/types/text_to_speech_speaker.py +35 -1
  73. {sarvamai-0.1.22a4.dist-info → sarvamai-0.1.22a8.dist-info}/METADATA +1 -1
  74. {sarvamai-0.1.22a4.dist-info → sarvamai-0.1.22a8.dist-info}/RECORD +75 -42
  75. sarvamai/types/audio_data_input_audio_codec.py +0 -33
  76. {sarvamai-0.1.22a4.dist-info → sarvamai-0.1.22a8.dist-info}/WHEEL +0 -0
@@ -15,6 +15,8 @@ from .types.speech_to_text_streaming_flush_signal import SpeechToTextStreamingFl
15
15
  from .types.speech_to_text_streaming_high_vad_sensitivity import SpeechToTextStreamingHighVadSensitivity
16
16
  from .types.speech_to_text_streaming_input_audio_codec import SpeechToTextStreamingInputAudioCodec
17
17
  from .types.speech_to_text_streaming_language_code import SpeechToTextStreamingLanguageCode
18
+ from .types.speech_to_text_streaming_mode import SpeechToTextStreamingMode
19
+ from .types.speech_to_text_streaming_model import SpeechToTextStreamingModel
18
20
  from .types.speech_to_text_streaming_vad_signals import SpeechToTextStreamingVadSignals
19
21
 
20
22
  try:
@@ -43,12 +45,13 @@ class SpeechToTextStreamingClient:
43
45
  self,
44
46
  *,
45
47
  language_code: SpeechToTextStreamingLanguageCode,
46
- model: typing.Optional[typing.Literal["saarika:v2.5"]] = None,
47
- input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
48
+ model: typing.Optional[SpeechToTextStreamingModel] = None,
49
+ mode: typing.Optional[SpeechToTextStreamingMode] = None,
48
50
  sample_rate: typing.Optional[str] = None,
49
51
  high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
50
52
  vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
51
53
  flush_signal: typing.Optional[SpeechToTextStreamingFlushSignal] = None,
54
+ input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
52
55
  api_subscription_key: typing.Optional[str] = None,
53
56
  request_options: typing.Optional[RequestOptions] = None,
54
57
  ) -> typing.Iterator[SpeechToTextStreamingSocketClient]:
@@ -62,13 +65,47 @@ class SpeechToTextStreamingClient:
62
65
  Parameters
63
66
  ----------
64
67
  language_code : SpeechToTextStreamingLanguageCode
65
- Language code for speech recognition
68
+ Specifies the language of the input audio in BCP-47 format.
66
69
 
67
- model : typing.Optional[typing.Literal["saarika:v2.5"]]
68
- Speech to text model to use
70
+ **Available Options:**
71
+ - `hi-IN`: Hindi
72
+ - `bn-IN`: Bengali
73
+ - `gu-IN`: Gujarati
74
+ - `kn-IN`: Kannada
75
+ - `ml-IN`: Malayalam
76
+ - `mr-IN`: Marathi
77
+ - `od-IN`: Odia
78
+ - `pa-IN`: Punjabi
79
+ - `ta-IN`: Tamil
80
+ - `te-IN`: Telugu
81
+ - `en-IN`: English
69
82
 
70
- input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
71
- Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files supports sample rate 16000 and 8000.
83
+ model : typing.Optional[SpeechToTextStreamingModel]
84
+ Specifies the model to use for speech-to-text conversion.
85
+
86
+ - **saarika:v2.5** (default): Transcribes audio in the spoken language.
87
+
88
+ - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
89
+
90
+ mode : typing.Optional[SpeechToTextStreamingMode]
91
+ Mode of operation. **Only applicable when using saaras:v3 model.**
92
+
93
+ Example audio: 'मेरा फोन नंबर है 9840950950'
94
+
95
+ - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
96
+ - Output: `मेरा फोन नंबर है 9840950950`
97
+
98
+ - **translate**: Translates speech from any supported Indic language to English.
99
+ - Output: `My phone number is 9840950950`
100
+
101
+ - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
102
+ - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
103
+
104
+ - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
105
+ - Output: `mera phone number hai 9840950950`
106
+
107
+ - **codemix**: Code-mixed text with English words in English and Indic words in native script.
108
+ - Output: `मेरा phone number है 9840950950`
72
109
 
73
110
  sample_rate : typing.Optional[str]
74
111
  Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
@@ -82,6 +119,10 @@ class SpeechToTextStreamingClient:
82
119
  flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
83
120
  Signal to flush the audio buffer and finalize transcription
84
121
 
122
+ input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
123
+ Audio codec/format of the input stream. Use this when sending raw PCM audio.
124
+ Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
125
+
85
126
  api_subscription_key : typing.Optional[str]
86
127
  API subscription key for authentication
87
128
 
@@ -98,8 +139,8 @@ class SpeechToTextStreamingClient:
98
139
  query_params = query_params.add("language-code", language_code)
99
140
  if model is not None:
100
141
  query_params = query_params.add("model", model)
101
- if input_audio_codec is not None:
102
- query_params = query_params.add("input_audio_codec", input_audio_codec)
142
+ if mode is not None:
143
+ query_params = query_params.add("mode", mode)
103
144
  if sample_rate is not None:
104
145
  query_params = query_params.add("sample_rate", sample_rate)
105
146
  if high_vad_sensitivity is not None:
@@ -108,6 +149,8 @@ class SpeechToTextStreamingClient:
108
149
  query_params = query_params.add("vad_signals", vad_signals)
109
150
  if flush_signal is not None:
110
151
  query_params = query_params.add("flush_signal", flush_signal)
152
+ if input_audio_codec is not None:
153
+ query_params = query_params.add("input_audio_codec", input_audio_codec)
111
154
  ws_url = ws_url + f"?{query_params}"
112
155
  headers = self._raw_client._client_wrapper.get_headers()
113
156
  if api_subscription_key is not None:
@@ -152,12 +195,13 @@ class AsyncSpeechToTextStreamingClient:
152
195
  self,
153
196
  *,
154
197
  language_code: SpeechToTextStreamingLanguageCode,
155
- model: typing.Optional[typing.Literal["saarika:v2.5"]] = None,
156
- input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
198
+ model: typing.Optional[SpeechToTextStreamingModel] = None,
199
+ mode: typing.Optional[SpeechToTextStreamingMode] = None,
157
200
  sample_rate: typing.Optional[str] = None,
158
201
  high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
159
202
  vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
160
203
  flush_signal: typing.Optional[SpeechToTextStreamingFlushSignal] = None,
204
+ input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
161
205
  api_subscription_key: typing.Optional[str] = None,
162
206
  request_options: typing.Optional[RequestOptions] = None,
163
207
  ) -> typing.AsyncIterator[AsyncSpeechToTextStreamingSocketClient]:
@@ -171,13 +215,47 @@ class AsyncSpeechToTextStreamingClient:
171
215
  Parameters
172
216
  ----------
173
217
  language_code : SpeechToTextStreamingLanguageCode
174
- Language code for speech recognition
218
+ Specifies the language of the input audio in BCP-47 format.
175
219
 
176
- model : typing.Optional[typing.Literal["saarika:v2.5"]]
177
- Speech to text model to use
220
+ **Available Options:**
221
+ - `hi-IN`: Hindi
222
+ - `bn-IN`: Bengali
223
+ - `gu-IN`: Gujarati
224
+ - `kn-IN`: Kannada
225
+ - `ml-IN`: Malayalam
226
+ - `mr-IN`: Marathi
227
+ - `od-IN`: Odia
228
+ - `pa-IN`: Punjabi
229
+ - `ta-IN`: Tamil
230
+ - `te-IN`: Telugu
231
+ - `en-IN`: English
178
232
 
179
- input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
180
- Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files supports sample rate 16000 and 8000.
233
+ model : typing.Optional[SpeechToTextStreamingModel]
234
+ Specifies the model to use for speech-to-text conversion.
235
+
236
+ - **saarika:v2.5** (default): Transcribes audio in the spoken language.
237
+
238
+ - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
239
+
240
+ mode : typing.Optional[SpeechToTextStreamingMode]
241
+ Mode of operation. **Only applicable when using saaras:v3 model.**
242
+
243
+ Example audio: 'मेरा फोन नंबर है 9840950950'
244
+
245
+ - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
246
+ - Output: `मेरा फोन नंबर है 9840950950`
247
+
248
+ - **translate**: Translates speech from any supported Indic language to English.
249
+ - Output: `My phone number is 9840950950`
250
+
251
+ - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
252
+ - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
253
+
254
+ - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
255
+ - Output: `mera phone number hai 9840950950`
256
+
257
+ - **codemix**: Code-mixed text with English words in English and Indic words in native script.
258
+ - Output: `मेरा phone number है 9840950950`
181
259
 
182
260
  sample_rate : typing.Optional[str]
183
261
  Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
@@ -191,6 +269,10 @@ class AsyncSpeechToTextStreamingClient:
191
269
  flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
192
270
  Signal to flush the audio buffer and finalize transcription
193
271
 
272
+ input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
273
+ Audio codec/format of the input stream. Use this when sending raw PCM audio.
274
+ Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
275
+
194
276
  api_subscription_key : typing.Optional[str]
195
277
  API subscription key for authentication
196
278
 
@@ -207,8 +289,8 @@ class AsyncSpeechToTextStreamingClient:
207
289
  query_params = query_params.add("language-code", language_code)
208
290
  if model is not None:
209
291
  query_params = query_params.add("model", model)
210
- if input_audio_codec is not None:
211
- query_params = query_params.add("input_audio_codec", input_audio_codec)
292
+ if mode is not None:
293
+ query_params = query_params.add("mode", mode)
212
294
  if sample_rate is not None:
213
295
  query_params = query_params.add("sample_rate", sample_rate)
214
296
  if high_vad_sensitivity is not None:
@@ -217,6 +299,8 @@ class AsyncSpeechToTextStreamingClient:
217
299
  query_params = query_params.add("vad_signals", vad_signals)
218
300
  if flush_signal is not None:
219
301
  query_params = query_params.add("flush_signal", flush_signal)
302
+ if input_audio_codec is not None:
303
+ query_params = query_params.add("input_audio_codec", input_audio_codec)
220
304
  ws_url = ws_url + f"?{query_params}"
221
305
  headers = self._raw_client._client_wrapper.get_headers()
222
306
  if api_subscription_key is not None:
@@ -14,6 +14,8 @@ from .types.speech_to_text_streaming_flush_signal import SpeechToTextStreamingFl
14
14
  from .types.speech_to_text_streaming_high_vad_sensitivity import SpeechToTextStreamingHighVadSensitivity
15
15
  from .types.speech_to_text_streaming_input_audio_codec import SpeechToTextStreamingInputAudioCodec
16
16
  from .types.speech_to_text_streaming_language_code import SpeechToTextStreamingLanguageCode
17
+ from .types.speech_to_text_streaming_mode import SpeechToTextStreamingMode
18
+ from .types.speech_to_text_streaming_model import SpeechToTextStreamingModel
17
19
  from .types.speech_to_text_streaming_vad_signals import SpeechToTextStreamingVadSignals
18
20
 
19
21
  try:
@@ -31,12 +33,13 @@ class RawSpeechToTextStreamingClient:
31
33
  self,
32
34
  *,
33
35
  language_code: SpeechToTextStreamingLanguageCode,
34
- model: typing.Optional[typing.Literal["saarika:v2.5"]] = None,
35
- input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
36
+ model: typing.Optional[SpeechToTextStreamingModel] = None,
37
+ mode: typing.Optional[SpeechToTextStreamingMode] = None,
36
38
  sample_rate: typing.Optional[str] = None,
37
39
  high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
38
40
  vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
39
41
  flush_signal: typing.Optional[SpeechToTextStreamingFlushSignal] = None,
42
+ input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
40
43
  api_subscription_key: typing.Optional[str] = None,
41
44
  request_options: typing.Optional[RequestOptions] = None,
42
45
  ) -> typing.Iterator[SpeechToTextStreamingSocketClient]:
@@ -50,13 +53,47 @@ class RawSpeechToTextStreamingClient:
50
53
  Parameters
51
54
  ----------
52
55
  language_code : SpeechToTextStreamingLanguageCode
53
- Language code for speech recognition
56
+ Specifies the language of the input audio in BCP-47 format.
54
57
 
55
- model : typing.Optional[typing.Literal["saarika:v2.5"]]
56
- Speech to text model to use
58
+ **Available Options:**
59
+ - `hi-IN`: Hindi
60
+ - `bn-IN`: Bengali
61
+ - `gu-IN`: Gujarati
62
+ - `kn-IN`: Kannada
63
+ - `ml-IN`: Malayalam
64
+ - `mr-IN`: Marathi
65
+ - `od-IN`: Odia
66
+ - `pa-IN`: Punjabi
67
+ - `ta-IN`: Tamil
68
+ - `te-IN`: Telugu
69
+ - `en-IN`: English
57
70
 
58
- input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
59
- Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files supports sample rate 16000 and 8000.
71
+ model : typing.Optional[SpeechToTextStreamingModel]
72
+ Specifies the model to use for speech-to-text conversion.
73
+
74
+ - **saarika:v2.5** (default): Transcribes audio in the spoken language.
75
+
76
+ - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
77
+
78
+ mode : typing.Optional[SpeechToTextStreamingMode]
79
+ Mode of operation. **Only applicable when using saaras:v3 model.**
80
+
81
+ Example audio: 'मेरा फोन नंबर है 9840950950'
82
+
83
+ - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
84
+ - Output: `मेरा फोन नंबर है 9840950950`
85
+
86
+ - **translate**: Translates speech from any supported Indic language to English.
87
+ - Output: `My phone number is 9840950950`
88
+
89
+ - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
90
+ - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
91
+
92
+ - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
93
+ - Output: `mera phone number hai 9840950950`
94
+
95
+ - **codemix**: Code-mixed text with English words in English and Indic words in native script.
96
+ - Output: `मेरा phone number है 9840950950`
60
97
 
61
98
  sample_rate : typing.Optional[str]
62
99
  Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
@@ -70,6 +107,10 @@ class RawSpeechToTextStreamingClient:
70
107
  flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
71
108
  Signal to flush the audio buffer and finalize transcription
72
109
 
110
+ input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
111
+ Audio codec/format of the input stream. Use this when sending raw PCM audio.
112
+ Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
113
+
73
114
  api_subscription_key : typing.Optional[str]
74
115
  API subscription key for authentication
75
116
 
@@ -86,8 +127,8 @@ class RawSpeechToTextStreamingClient:
86
127
  query_params = query_params.add("language-code", language_code)
87
128
  if model is not None:
88
129
  query_params = query_params.add("model", model)
89
- if input_audio_codec is not None:
90
- query_params = query_params.add("input_audio_codec", input_audio_codec)
130
+ if mode is not None:
131
+ query_params = query_params.add("mode", mode)
91
132
  if sample_rate is not None:
92
133
  query_params = query_params.add("sample_rate", sample_rate)
93
134
  if high_vad_sensitivity is not None:
@@ -96,6 +137,8 @@ class RawSpeechToTextStreamingClient:
96
137
  query_params = query_params.add("vad_signals", vad_signals)
97
138
  if flush_signal is not None:
98
139
  query_params = query_params.add("flush_signal", flush_signal)
140
+ if input_audio_codec is not None:
141
+ query_params = query_params.add("input_audio_codec", input_audio_codec)
99
142
  ws_url = ws_url + f"?{query_params}"
100
143
  headers = self._client_wrapper.get_headers()
101
144
  if api_subscription_key is not None:
@@ -129,12 +172,13 @@ class AsyncRawSpeechToTextStreamingClient:
129
172
  self,
130
173
  *,
131
174
  language_code: SpeechToTextStreamingLanguageCode,
132
- model: typing.Optional[typing.Literal["saarika:v2.5"]] = None,
133
- input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
175
+ model: typing.Optional[SpeechToTextStreamingModel] = None,
176
+ mode: typing.Optional[SpeechToTextStreamingMode] = None,
134
177
  sample_rate: typing.Optional[str] = None,
135
178
  high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
136
179
  vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
137
180
  flush_signal: typing.Optional[SpeechToTextStreamingFlushSignal] = None,
181
+ input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
138
182
  api_subscription_key: typing.Optional[str] = None,
139
183
  request_options: typing.Optional[RequestOptions] = None,
140
184
  ) -> typing.AsyncIterator[AsyncSpeechToTextStreamingSocketClient]:
@@ -148,13 +192,47 @@ class AsyncRawSpeechToTextStreamingClient:
148
192
  Parameters
149
193
  ----------
150
194
  language_code : SpeechToTextStreamingLanguageCode
151
- Language code for speech recognition
195
+ Specifies the language of the input audio in BCP-47 format.
152
196
 
153
- model : typing.Optional[typing.Literal["saarika:v2.5"]]
154
- Speech to text model to use
197
+ **Available Options:**
198
+ - `hi-IN`: Hindi
199
+ - `bn-IN`: Bengali
200
+ - `gu-IN`: Gujarati
201
+ - `kn-IN`: Kannada
202
+ - `ml-IN`: Malayalam
203
+ - `mr-IN`: Marathi
204
+ - `od-IN`: Odia
205
+ - `pa-IN`: Punjabi
206
+ - `ta-IN`: Tamil
207
+ - `te-IN`: Telugu
208
+ - `en-IN`: English
155
209
 
156
- input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
157
- Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files supports sample rate 16000 and 8000.
210
+ model : typing.Optional[SpeechToTextStreamingModel]
211
+ Specifies the model to use for speech-to-text conversion.
212
+
213
+ - **saarika:v2.5** (default): Transcribes audio in the spoken language.
214
+
215
+ - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
216
+
217
+ mode : typing.Optional[SpeechToTextStreamingMode]
218
+ Mode of operation. **Only applicable when using saaras:v3 model.**
219
+
220
+ Example audio: 'मेरा फोन नंबर है 9840950950'
221
+
222
+ - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
223
+ - Output: `मेरा फोन नंबर है 9840950950`
224
+
225
+ - **translate**: Translates speech from any supported Indic language to English.
226
+ - Output: `My phone number is 9840950950`
227
+
228
+ - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
229
+ - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
230
+
231
+ - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
232
+ - Output: `mera phone number hai 9840950950`
233
+
234
+ - **codemix**: Code-mixed text with English words in English and Indic words in native script.
235
+ - Output: `मेरा phone number है 9840950950`
158
236
 
159
237
  sample_rate : typing.Optional[str]
160
238
  Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
@@ -168,6 +246,10 @@ class AsyncRawSpeechToTextStreamingClient:
168
246
  flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
169
247
  Signal to flush the audio buffer and finalize transcription
170
248
 
249
+ input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
250
+ Audio codec/format of the input stream. Use this when sending raw PCM audio.
251
+ Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
252
+
171
253
  api_subscription_key : typing.Optional[str]
172
254
  API subscription key for authentication
173
255
 
@@ -184,8 +266,8 @@ class AsyncRawSpeechToTextStreamingClient:
184
266
  query_params = query_params.add("language-code", language_code)
185
267
  if model is not None:
186
268
  query_params = query_params.add("model", model)
187
- if input_audio_codec is not None:
188
- query_params = query_params.add("input_audio_codec", input_audio_codec)
269
+ if mode is not None:
270
+ query_params = query_params.add("mode", mode)
189
271
  if sample_rate is not None:
190
272
  query_params = query_params.add("sample_rate", sample_rate)
191
273
  if high_vad_sensitivity is not None:
@@ -194,6 +276,8 @@ class AsyncRawSpeechToTextStreamingClient:
194
276
  query_params = query_params.add("vad_signals", vad_signals)
195
277
  if flush_signal is not None:
196
278
  query_params = query_params.add("flush_signal", flush_signal)
279
+ if input_audio_codec is not None:
280
+ query_params = query_params.add("input_audio_codec", input_audio_codec)
197
281
  ws_url = ws_url + f"?{query_params}"
198
282
  headers = self._client_wrapper.get_headers()
199
283
  if api_subscription_key is not None:
@@ -6,6 +6,8 @@ from .speech_to_text_streaming_flush_signal import SpeechToTextStreamingFlushSig
6
6
  from .speech_to_text_streaming_high_vad_sensitivity import SpeechToTextStreamingHighVadSensitivity
7
7
  from .speech_to_text_streaming_input_audio_codec import SpeechToTextStreamingInputAudioCodec
8
8
  from .speech_to_text_streaming_language_code import SpeechToTextStreamingLanguageCode
9
+ from .speech_to_text_streaming_mode import SpeechToTextStreamingMode
10
+ from .speech_to_text_streaming_model import SpeechToTextStreamingModel
9
11
  from .speech_to_text_streaming_vad_signals import SpeechToTextStreamingVadSignals
10
12
 
11
13
  __all__ = [
@@ -13,5 +15,7 @@ __all__ = [
13
15
  "SpeechToTextStreamingHighVadSensitivity",
14
16
  "SpeechToTextStreamingInputAudioCodec",
15
17
  "SpeechToTextStreamingLanguageCode",
18
+ "SpeechToTextStreamingMode",
19
+ "SpeechToTextStreamingModel",
16
20
  "SpeechToTextStreamingVadSignals",
17
21
  ]
@@ -3,31 +3,5 @@
3
3
  import typing
4
4
 
5
5
  SpeechToTextStreamingInputAudioCodec = typing.Union[
6
- typing.Literal[
7
- "wav",
8
- "x-wav",
9
- "wave",
10
- "mp3",
11
- "mpeg",
12
- "mpeg3",
13
- "x-mp3",
14
- "x-mpeg-3",
15
- "aac",
16
- "x-aac",
17
- "aiff",
18
- "x-aiff",
19
- "ogg",
20
- "opus",
21
- "flac",
22
- "x-flac",
23
- "mp4",
24
- "x-m4a",
25
- "amr",
26
- "x-ms-wma",
27
- "webm",
28
- "pcm_s16le",
29
- "pcm_l16",
30
- "pcm_raw",
31
- ],
32
- typing.Any,
6
+ typing.Literal["wav", "pcm_s16le", "pcm_l16", "pcm_raw"], typing.Any
33
7
  ]
@@ -0,0 +1,7 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ import typing
4
+
5
+ SpeechToTextStreamingMode = typing.Union[
6
+ typing.Literal["transcribe", "translate", "verbatim", "translit", "codemix"], typing.Any
7
+ ]
@@ -0,0 +1,5 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ import typing
4
+
5
+ SpeechToTextStreamingModel = typing.Union[typing.Literal["saarika:v2.5", "saaras:v3"], typing.Any]
@@ -44,11 +44,11 @@ class SpeechToTextTranslateStreamingClient:
44
44
  self,
45
45
  *,
46
46
  model: typing.Optional[typing.Literal["saaras:v2.5"]] = None,
47
- input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
48
47
  sample_rate: typing.Optional[str] = None,
49
48
  high_vad_sensitivity: typing.Optional[SpeechToTextTranslateStreamingHighVadSensitivity] = None,
50
49
  vad_signals: typing.Optional[SpeechToTextTranslateStreamingVadSignals] = None,
51
50
  flush_signal: typing.Optional[SpeechToTextTranslateStreamingFlushSignal] = None,
51
+ input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
52
52
  api_subscription_key: typing.Optional[str] = None,
53
53
  request_options: typing.Optional[RequestOptions] = None,
54
54
  ) -> typing.Iterator[SpeechToTextTranslateStreamingSocketClient]:
@@ -62,10 +62,10 @@ class SpeechToTextTranslateStreamingClient:
62
62
  Parameters
63
63
  ----------
64
64
  model : typing.Optional[typing.Literal["saaras:v2.5"]]
65
- Speech to text model to use (defaults to "saaras:v2.5" if not specified)
65
+ Model to be used for speech to text translation.
66
66
 
67
- input_audio_codec : typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec]
68
- Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files supports sample rate 16000 and 8000.
67
+ - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
68
+ - Example: Hindi audio English text output
69
69
 
70
70
  sample_rate : typing.Optional[str]
71
71
  Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
@@ -79,6 +79,10 @@ class SpeechToTextTranslateStreamingClient:
79
79
  flush_signal : typing.Optional[SpeechToTextTranslateStreamingFlushSignal]
80
80
  Signal to flush the audio buffer and finalize transcription and translation
81
81
 
82
+ input_audio_codec : typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec]
83
+ Audio codec/format of the input stream. Use this when sending raw PCM audio.
84
+ Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
85
+
82
86
  api_subscription_key : typing.Optional[str]
83
87
  API subscription key for authentication
84
88
 
@@ -93,8 +97,6 @@ class SpeechToTextTranslateStreamingClient:
93
97
  query_params = httpx.QueryParams()
94
98
  if model is not None:
95
99
  query_params = query_params.add("model", model)
96
- if input_audio_codec is not None:
97
- query_params = query_params.add("input_audio_codec", input_audio_codec)
98
100
  if sample_rate is not None:
99
101
  query_params = query_params.add("sample_rate", sample_rate)
100
102
  if high_vad_sensitivity is not None:
@@ -103,6 +105,8 @@ class SpeechToTextTranslateStreamingClient:
103
105
  query_params = query_params.add("vad_signals", vad_signals)
104
106
  if flush_signal is not None:
105
107
  query_params = query_params.add("flush_signal", flush_signal)
108
+ if input_audio_codec is not None:
109
+ query_params = query_params.add("input_audio_codec", input_audio_codec)
106
110
  ws_url = ws_url + f"?{query_params}"
107
111
  headers = self._raw_client._client_wrapper.get_headers()
108
112
  if api_subscription_key is not None:
@@ -147,11 +151,11 @@ class AsyncSpeechToTextTranslateStreamingClient:
147
151
  self,
148
152
  *,
149
153
  model: typing.Optional[typing.Literal["saaras:v2.5"]] = None,
150
- input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
151
154
  sample_rate: typing.Optional[str] = None,
152
155
  high_vad_sensitivity: typing.Optional[SpeechToTextTranslateStreamingHighVadSensitivity] = None,
153
156
  vad_signals: typing.Optional[SpeechToTextTranslateStreamingVadSignals] = None,
154
157
  flush_signal: typing.Optional[SpeechToTextTranslateStreamingFlushSignal] = None,
158
+ input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
155
159
  api_subscription_key: typing.Optional[str] = None,
156
160
  request_options: typing.Optional[RequestOptions] = None,
157
161
  ) -> typing.AsyncIterator[AsyncSpeechToTextTranslateStreamingSocketClient]:
@@ -165,10 +169,10 @@ class AsyncSpeechToTextTranslateStreamingClient:
165
169
  Parameters
166
170
  ----------
167
171
  model : typing.Optional[typing.Literal["saaras:v2.5"]]
168
- Speech to text model to use (defaults to "saaras:v2.5" if not specified)
172
+ Model to be used for speech to text translation.
169
173
 
170
- input_audio_codec : typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec]
171
- Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files supports sample rate 16000 and 8000.
174
+ - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
175
+ - Example: Hindi audio → English text output
172
176
 
173
177
  sample_rate : typing.Optional[str]
174
178
  Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
@@ -182,6 +186,10 @@ class AsyncSpeechToTextTranslateStreamingClient:
182
186
  flush_signal : typing.Optional[SpeechToTextTranslateStreamingFlushSignal]
183
187
  Signal to flush the audio buffer and finalize transcription and translation
184
188
 
189
+ input_audio_codec : typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec]
190
+ Audio codec/format of the input stream. Use this when sending raw PCM audio.
191
+ Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
192
+
185
193
  api_subscription_key : typing.Optional[str]
186
194
  API subscription key for authentication
187
195
 
@@ -196,8 +204,6 @@ class AsyncSpeechToTextTranslateStreamingClient:
196
204
  query_params = httpx.QueryParams()
197
205
  if model is not None:
198
206
  query_params = query_params.add("model", model)
199
- if input_audio_codec is not None:
200
- query_params = query_params.add("input_audio_codec", input_audio_codec)
201
207
  if sample_rate is not None:
202
208
  query_params = query_params.add("sample_rate", sample_rate)
203
209
  if high_vad_sensitivity is not None:
@@ -206,6 +212,8 @@ class AsyncSpeechToTextTranslateStreamingClient:
206
212
  query_params = query_params.add("vad_signals", vad_signals)
207
213
  if flush_signal is not None:
208
214
  query_params = query_params.add("flush_signal", flush_signal)
215
+ if input_audio_codec is not None:
216
+ query_params = query_params.add("input_audio_codec", input_audio_codec)
209
217
  ws_url = ws_url + f"?{query_params}"
210
218
  headers = self._raw_client._client_wrapper.get_headers()
211
219
  if api_subscription_key is not None: