sarvamai 0.1.22a3__py3-none-any.whl → 0.1.22a7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. sarvamai/__init__.py +62 -9
  2. sarvamai/client.py +3 -0
  3. sarvamai/core/client_wrapper.py +2 -2
  4. sarvamai/doc_digitization_job/__init__.py +4 -0
  5. sarvamai/doc_digitization_job/client.py +776 -0
  6. sarvamai/doc_digitization_job/job.py +496 -0
  7. sarvamai/doc_digitization_job/raw_client.py +1176 -0
  8. sarvamai/requests/__init__.py +20 -0
  9. sarvamai/requests/audio_data.py +0 -6
  10. sarvamai/requests/configure_connection.py +4 -0
  11. sarvamai/requests/configure_connection_data.py +40 -11
  12. sarvamai/requests/doc_digitization_create_job_response.py +25 -0
  13. sarvamai/requests/doc_digitization_download_files_response.py +37 -0
  14. sarvamai/requests/doc_digitization_error_details.py +21 -0
  15. sarvamai/requests/doc_digitization_error_message.py +11 -0
  16. sarvamai/requests/doc_digitization_job_detail.py +64 -0
  17. sarvamai/requests/doc_digitization_job_parameters.py +21 -0
  18. sarvamai/requests/doc_digitization_job_status_response.py +65 -0
  19. sarvamai/requests/doc_digitization_page_error.py +24 -0
  20. sarvamai/requests/doc_digitization_upload_files_response.py +34 -0
  21. sarvamai/requests/doc_digitization_webhook_callback.py +19 -0
  22. sarvamai/requests/speech_to_text_job_parameters.py +43 -2
  23. sarvamai/requests/speech_to_text_transcription_data.py +0 -6
  24. sarvamai/requests/speech_to_text_translate_job_parameters.py +4 -1
  25. sarvamai/requests/speech_to_text_translate_transcription_data.py +0 -6
  26. sarvamai/speech_to_text/client.py +95 -10
  27. sarvamai/speech_to_text/raw_client.py +95 -10
  28. sarvamai/speech_to_text_job/client.py +60 -15
  29. sarvamai/speech_to_text_job/job.py +100 -2
  30. sarvamai/speech_to_text_job/raw_client.py +14 -10
  31. sarvamai/speech_to_text_streaming/__init__.py +4 -2
  32. sarvamai/speech_to_text_streaming/client.py +100 -47
  33. sarvamai/speech_to_text_streaming/raw_client.py +100 -47
  34. sarvamai/speech_to_text_streaming/types/__init__.py +4 -2
  35. sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_input_audio_codec.py +1 -27
  36. sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_mode.py +7 -0
  37. sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_model.py +5 -0
  38. sarvamai/speech_to_text_translate_job/job.py +100 -2
  39. sarvamai/speech_to_text_translate_job/raw_client.py +14 -10
  40. sarvamai/speech_to_text_translate_streaming/__init__.py +0 -2
  41. sarvamai/speech_to_text_translate_streaming/client.py +18 -41
  42. sarvamai/speech_to_text_translate_streaming/raw_client.py +18 -41
  43. sarvamai/speech_to_text_translate_streaming/types/__init__.py +0 -4
  44. sarvamai/speech_to_text_translate_streaming/types/speech_to_text_translate_streaming_input_audio_codec.py +1 -27
  45. sarvamai/text/client.py +0 -12
  46. sarvamai/text/raw_client.py +0 -12
  47. sarvamai/text_to_speech/client.py +116 -14
  48. sarvamai/text_to_speech/raw_client.py +116 -14
  49. sarvamai/text_to_speech_streaming/__init__.py +2 -2
  50. sarvamai/text_to_speech_streaming/client.py +19 -6
  51. sarvamai/text_to_speech_streaming/raw_client.py +19 -6
  52. sarvamai/text_to_speech_streaming/types/__init__.py +2 -1
  53. sarvamai/text_to_speech_streaming/types/text_to_speech_streaming_model.py +5 -0
  54. sarvamai/types/__init__.py +34 -4
  55. sarvamai/types/audio_data.py +0 -6
  56. sarvamai/types/completion_event_flag.py +3 -1
  57. sarvamai/types/configure_connection.py +4 -0
  58. sarvamai/types/configure_connection_data.py +40 -11
  59. sarvamai/types/configure_connection_data_model.py +5 -0
  60. sarvamai/types/configure_connection_data_speaker.py +35 -1
  61. sarvamai/types/doc_digitization_create_job_response.py +37 -0
  62. sarvamai/types/doc_digitization_download_files_response.py +47 -0
  63. sarvamai/types/doc_digitization_error_code.py +15 -0
  64. sarvamai/types/doc_digitization_error_details.py +33 -0
  65. sarvamai/types/doc_digitization_error_message.py +23 -0
  66. sarvamai/types/doc_digitization_job_detail.py +74 -0
  67. sarvamai/types/doc_digitization_job_detail_state.py +7 -0
  68. sarvamai/types/doc_digitization_job_parameters.py +33 -0
  69. sarvamai/types/doc_digitization_job_state.py +7 -0
  70. sarvamai/types/doc_digitization_job_status_response.py +75 -0
  71. sarvamai/types/doc_digitization_output_format.py +5 -0
  72. sarvamai/types/doc_digitization_page_error.py +36 -0
  73. sarvamai/types/doc_digitization_supported_language.py +32 -0
  74. sarvamai/types/doc_digitization_upload_files_response.py +44 -0
  75. sarvamai/types/doc_digitization_webhook_callback.py +31 -0
  76. sarvamai/types/mode.py +5 -0
  77. sarvamai/types/speech_to_text_job_parameters.py +43 -2
  78. sarvamai/types/speech_to_text_model.py +1 -1
  79. sarvamai/types/speech_to_text_transcription_data.py +0 -6
  80. sarvamai/types/speech_to_text_translate_job_parameters.py +4 -1
  81. sarvamai/types/speech_to_text_translate_transcription_data.py +0 -6
  82. sarvamai/types/text_to_speech_model.py +1 -1
  83. sarvamai/types/text_to_speech_speaker.py +35 -1
  84. {sarvamai-0.1.22a3.dist-info → sarvamai-0.1.22a7.dist-info}/METADATA +1 -1
  85. {sarvamai-0.1.22a3.dist-info → sarvamai-0.1.22a7.dist-info}/RECORD +86 -56
  86. sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_stream_ongoing_speech_results.py +0 -5
  87. sarvamai/speech_to_text_translate_streaming/types/speech_to_text_translate_streaming_stream_ongoing_speech_results.py +0 -5
  88. sarvamai/types/audio_data_input_audio_codec.py +0 -33
  89. sarvamai/types/response_speech_state.py +0 -7
  90. {sarvamai-0.1.22a3.dist-info → sarvamai-0.1.22a7.dist-info}/WHEEL +0 -0
@@ -39,7 +39,7 @@ class RawSpeechToTextJobClient:
39
39
  request_options: typing.Optional[RequestOptions] = None,
40
40
  ) -> HttpResponse[BulkJobInitResponseV1]:
41
41
  """
42
- Get a job uuid, and storage folder details for speech to text bulk job v1
42
+ Create a new speech to text bulk job and receive a job UUID and storage folder details for processing multiple audio files
43
43
 
44
44
  Parameters
45
45
  ----------
@@ -160,7 +160,9 @@ class RawSpeechToTextJobClient:
160
160
  self, job_id: str, *, request_options: typing.Optional[RequestOptions] = None
161
161
  ) -> HttpResponse[JobStatusV1Response]:
162
162
  """
163
- Get the status of a speech to text bulk job V1
163
+ Retrieve the current status and details of a speech to text bulk job, including progress and file-level information.
164
+
165
+ **Rate Limiting Best Practice:** To prevent rate limit errors and ensure optimal server performance, we recommend implementing a minimum 5-second delay between consecutive status polling requests. This helps maintain system stability while still providing timely status updates.
164
166
 
165
167
  Parameters
166
168
  ----------
@@ -270,7 +272,7 @@ class RawSpeechToTextJobClient:
270
272
  request_options: typing.Optional[RequestOptions] = None,
271
273
  ) -> HttpResponse[JobStatusV1Response]:
272
274
  """
273
- Start a speech to text bulk job V1
275
+ Start processing a speech to text bulk job after all audio files have been uploaded
274
276
 
275
277
  Parameters
276
278
  ----------
@@ -381,7 +383,7 @@ class RawSpeechToTextJobClient:
381
383
  self, *, job_id: str, files: typing.Sequence[str], request_options: typing.Optional[RequestOptions] = None
382
384
  ) -> HttpResponse[FilesUploadResponse]:
383
385
  """
384
- Start a speech to text bulk job V1
386
+ Generate presigned upload URLs for audio files that will be processed in a speech to text bulk job
385
387
 
386
388
  Parameters
387
389
  ----------
@@ -496,7 +498,7 @@ class RawSpeechToTextJobClient:
496
498
  self, *, job_id: str, files: typing.Sequence[str], request_options: typing.Optional[RequestOptions] = None
497
499
  ) -> HttpResponse[FilesDownloadResponse]:
498
500
  """
499
- Start a speech to text bulk job V1
501
+ Generate presigned download URLs for the transcription output files of a completed speech to text bulk job
500
502
 
501
503
  Parameters
502
504
  ----------
@@ -620,7 +622,7 @@ class AsyncRawSpeechToTextJobClient:
620
622
  request_options: typing.Optional[RequestOptions] = None,
621
623
  ) -> AsyncHttpResponse[BulkJobInitResponseV1]:
622
624
  """
623
- Get a job uuid, and storage folder details for speech to text bulk job v1
625
+ Create a new speech to text bulk job and receive a job UUID and storage folder details for processing multiple audio files
624
626
 
625
627
  Parameters
626
628
  ----------
@@ -741,7 +743,9 @@ class AsyncRawSpeechToTextJobClient:
741
743
  self, job_id: str, *, request_options: typing.Optional[RequestOptions] = None
742
744
  ) -> AsyncHttpResponse[JobStatusV1Response]:
743
745
  """
744
- Get the status of a speech to text bulk job V1
746
+ Retrieve the current status and details of a speech to text bulk job, including progress and file-level information.
747
+
748
+ **Rate Limiting Best Practice:** To prevent rate limit errors and ensure optimal server performance, we recommend implementing a minimum 5-second delay between consecutive status polling requests. This helps maintain system stability while still providing timely status updates.
745
749
 
746
750
  Parameters
747
751
  ----------
@@ -851,7 +855,7 @@ class AsyncRawSpeechToTextJobClient:
851
855
  request_options: typing.Optional[RequestOptions] = None,
852
856
  ) -> AsyncHttpResponse[JobStatusV1Response]:
853
857
  """
854
- Start a speech to text bulk job V1
858
+ Start processing a speech to text bulk job after all audio files have been uploaded
855
859
 
856
860
  Parameters
857
861
  ----------
@@ -962,7 +966,7 @@ class AsyncRawSpeechToTextJobClient:
962
966
  self, *, job_id: str, files: typing.Sequence[str], request_options: typing.Optional[RequestOptions] = None
963
967
  ) -> AsyncHttpResponse[FilesUploadResponse]:
964
968
  """
965
- Start a speech to text bulk job V1
969
+ Generate presigned upload URLs for audio files that will be processed in a speech to text bulk job
966
970
 
967
971
  Parameters
968
972
  ----------
@@ -1077,7 +1081,7 @@ class AsyncRawSpeechToTextJobClient:
1077
1081
  self, *, job_id: str, files: typing.Sequence[str], request_options: typing.Optional[RequestOptions] = None
1078
1082
  ) -> AsyncHttpResponse[FilesDownloadResponse]:
1079
1083
  """
1080
- Start a speech to text bulk job V1
1084
+ Generate presigned download URLs for the transcription output files of a completed speech to text bulk job
1081
1085
 
1082
1086
  Parameters
1083
1087
  ----------
@@ -7,7 +7,8 @@ from .types import (
7
7
  SpeechToTextStreamingHighVadSensitivity,
8
8
  SpeechToTextStreamingInputAudioCodec,
9
9
  SpeechToTextStreamingLanguageCode,
10
- SpeechToTextStreamingStreamOngoingSpeechResults,
10
+ SpeechToTextStreamingMode,
11
+ SpeechToTextStreamingModel,
11
12
  SpeechToTextStreamingVadSignals,
12
13
  )
13
14
 
@@ -16,6 +17,7 @@ __all__ = [
16
17
  "SpeechToTextStreamingHighVadSensitivity",
17
18
  "SpeechToTextStreamingInputAudioCodec",
18
19
  "SpeechToTextStreamingLanguageCode",
19
- "SpeechToTextStreamingStreamOngoingSpeechResults",
20
+ "SpeechToTextStreamingMode",
21
+ "SpeechToTextStreamingModel",
20
22
  "SpeechToTextStreamingVadSignals",
21
23
  ]
@@ -15,9 +15,8 @@ from .types.speech_to_text_streaming_flush_signal import SpeechToTextStreamingFl
15
15
  from .types.speech_to_text_streaming_high_vad_sensitivity import SpeechToTextStreamingHighVadSensitivity
16
16
  from .types.speech_to_text_streaming_input_audio_codec import SpeechToTextStreamingInputAudioCodec
17
17
  from .types.speech_to_text_streaming_language_code import SpeechToTextStreamingLanguageCode
18
- from .types.speech_to_text_streaming_stream_ongoing_speech_results import (
19
- SpeechToTextStreamingStreamOngoingSpeechResults,
20
- )
18
+ from .types.speech_to_text_streaming_mode import SpeechToTextStreamingMode
19
+ from .types.speech_to_text_streaming_model import SpeechToTextStreamingModel
21
20
  from .types.speech_to_text_streaming_vad_signals import SpeechToTextStreamingVadSignals
22
21
 
23
22
  try:
@@ -46,14 +45,13 @@ class SpeechToTextStreamingClient:
46
45
  self,
47
46
  *,
48
47
  language_code: SpeechToTextStreamingLanguageCode,
49
- model: typing.Optional[typing.Literal["saarika:v2.5"]] = None,
50
- input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
48
+ model: typing.Optional[SpeechToTextStreamingModel] = None,
49
+ mode: typing.Optional[SpeechToTextStreamingMode] = None,
51
50
  sample_rate: typing.Optional[str] = None,
52
51
  high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
53
52
  vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
54
53
  flush_signal: typing.Optional[SpeechToTextStreamingFlushSignal] = None,
55
- stream_ongoing_speech_results: typing.Optional[SpeechToTextStreamingStreamOngoingSpeechResults] = None,
56
- streaming_ongoing_requests_frame_size: typing.Optional[str] = None,
54
+ input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
57
55
  api_subscription_key: typing.Optional[str] = None,
58
56
  request_options: typing.Optional[RequestOptions] = None,
59
57
  ) -> typing.Iterator[SpeechToTextStreamingSocketClient]:
@@ -67,13 +65,47 @@ class SpeechToTextStreamingClient:
67
65
  Parameters
68
66
  ----------
69
67
  language_code : SpeechToTextStreamingLanguageCode
70
- Language code for speech recognition
68
+ Specifies the language of the input audio in BCP-47 format.
71
69
 
72
- model : typing.Optional[typing.Literal["saarika:v2.5"]]
73
- Speech to text model to use
70
+ **Available Options:**
71
+ - `hi-IN`: Hindi
72
+ - `bn-IN`: Bengali
73
+ - `gu-IN`: Gujarati
74
+ - `kn-IN`: Kannada
75
+ - `ml-IN`: Malayalam
76
+ - `mr-IN`: Marathi
77
+ - `od-IN`: Odia
78
+ - `pa-IN`: Punjabi
79
+ - `ta-IN`: Tamil
80
+ - `te-IN`: Telugu
81
+ - `en-IN`: English
74
82
 
75
- input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
76
- Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files supports sample rate 16000 and 8000.
83
+ model : typing.Optional[SpeechToTextStreamingModel]
84
+ Specifies the model to use for speech-to-text conversion.
85
+
86
+ - **saarika:v2.5** (default): Transcribes audio in the spoken language.
87
+
88
+ - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
89
+
90
+ mode : typing.Optional[SpeechToTextStreamingMode]
91
+ Mode of operation. **Only applicable when using saaras:v3 model.**
92
+
93
+ Example audio: 'मेरा फोन नंबर है 9840950950'
94
+
95
+ - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
96
+ - Output: `मेरा फोन नंबर है 9840950950`
97
+
98
+ - **translate**: Translates speech from any supported Indic language to English.
99
+ - Output: `My phone number is 9840950950`
100
+
101
+ - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
102
+ - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
103
+
104
+ - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
105
+ - Output: `mera phone number hai 9840950950`
106
+
107
+ - **codemix**: Code-mixed text with English words in English and Indic words in native script.
108
+ - Output: `मेरा phone number है 9840950950`
77
109
 
78
110
  sample_rate : typing.Optional[str]
79
111
  Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
@@ -87,11 +119,9 @@ class SpeechToTextStreamingClient:
87
119
  flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
88
120
  Signal to flush the audio buffer and finalize transcription
89
121
 
90
- stream_ongoing_speech_results : typing.Optional[SpeechToTextStreamingStreamOngoingSpeechResults]
91
- Enable streaming of ongoing speech results during active speech
92
-
93
- streaming_ongoing_requests_frame_size : typing.Optional[str]
94
- Frame size for streaming ongoing speech results (1-100)
122
+ input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
123
+ Audio codec/format of the input stream. Use this when sending raw PCM audio.
124
+ Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
95
125
 
96
126
  api_subscription_key : typing.Optional[str]
97
127
  API subscription key for authentication
@@ -109,8 +139,8 @@ class SpeechToTextStreamingClient:
109
139
  query_params = query_params.add("language-code", language_code)
110
140
  if model is not None:
111
141
  query_params = query_params.add("model", model)
112
- if input_audio_codec is not None:
113
- query_params = query_params.add("input_audio_codec", input_audio_codec)
142
+ if mode is not None:
143
+ query_params = query_params.add("mode", mode)
114
144
  if sample_rate is not None:
115
145
  query_params = query_params.add("sample_rate", sample_rate)
116
146
  if high_vad_sensitivity is not None:
@@ -119,12 +149,8 @@ class SpeechToTextStreamingClient:
119
149
  query_params = query_params.add("vad_signals", vad_signals)
120
150
  if flush_signal is not None:
121
151
  query_params = query_params.add("flush_signal", flush_signal)
122
- if stream_ongoing_speech_results is not None:
123
- query_params = query_params.add("stream_ongoing_speech_results", stream_ongoing_speech_results)
124
- if streaming_ongoing_requests_frame_size is not None:
125
- query_params = query_params.add(
126
- "streaming_ongoing_requests_frame_size", streaming_ongoing_requests_frame_size
127
- )
152
+ if input_audio_codec is not None:
153
+ query_params = query_params.add("input_audio_codec", input_audio_codec)
128
154
  ws_url = ws_url + f"?{query_params}"
129
155
  headers = self._raw_client._client_wrapper.get_headers()
130
156
  if api_subscription_key is not None:
@@ -169,14 +195,13 @@ class AsyncSpeechToTextStreamingClient:
169
195
  self,
170
196
  *,
171
197
  language_code: SpeechToTextStreamingLanguageCode,
172
- model: typing.Optional[typing.Literal["saarika:v2.5"]] = None,
173
- input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
198
+ model: typing.Optional[SpeechToTextStreamingModel] = None,
199
+ mode: typing.Optional[SpeechToTextStreamingMode] = None,
174
200
  sample_rate: typing.Optional[str] = None,
175
201
  high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
176
202
  vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
177
203
  flush_signal: typing.Optional[SpeechToTextStreamingFlushSignal] = None,
178
- stream_ongoing_speech_results: typing.Optional[SpeechToTextStreamingStreamOngoingSpeechResults] = None,
179
- streaming_ongoing_requests_frame_size: typing.Optional[str] = None,
204
+ input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
180
205
  api_subscription_key: typing.Optional[str] = None,
181
206
  request_options: typing.Optional[RequestOptions] = None,
182
207
  ) -> typing.AsyncIterator[AsyncSpeechToTextStreamingSocketClient]:
@@ -190,13 +215,47 @@ class AsyncSpeechToTextStreamingClient:
190
215
  Parameters
191
216
  ----------
192
217
  language_code : SpeechToTextStreamingLanguageCode
193
- Language code for speech recognition
218
+ Specifies the language of the input audio in BCP-47 format.
194
219
 
195
- model : typing.Optional[typing.Literal["saarika:v2.5"]]
196
- Speech to text model to use
220
+ **Available Options:**
221
+ - `hi-IN`: Hindi
222
+ - `bn-IN`: Bengali
223
+ - `gu-IN`: Gujarati
224
+ - `kn-IN`: Kannada
225
+ - `ml-IN`: Malayalam
226
+ - `mr-IN`: Marathi
227
+ - `od-IN`: Odia
228
+ - `pa-IN`: Punjabi
229
+ - `ta-IN`: Tamil
230
+ - `te-IN`: Telugu
231
+ - `en-IN`: English
197
232
 
198
- input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
199
- Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files supports sample rate 16000 and 8000.
233
+ model : typing.Optional[SpeechToTextStreamingModel]
234
+ Specifies the model to use for speech-to-text conversion.
235
+
236
+ - **saarika:v2.5** (default): Transcribes audio in the spoken language.
237
+
238
+ - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
239
+
240
+ mode : typing.Optional[SpeechToTextStreamingMode]
241
+ Mode of operation. **Only applicable when using saaras:v3 model.**
242
+
243
+ Example audio: 'मेरा फोन नंबर है 9840950950'
244
+
245
+ - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
246
+ - Output: `मेरा फोन नंबर है 9840950950`
247
+
248
+ - **translate**: Translates speech from any supported Indic language to English.
249
+ - Output: `My phone number is 9840950950`
250
+
251
+ - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
252
+ - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
253
+
254
+ - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
255
+ - Output: `mera phone number hai 9840950950`
256
+
257
+ - **codemix**: Code-mixed text with English words in English and Indic words in native script.
258
+ - Output: `मेरा phone number है 9840950950`
200
259
 
201
260
  sample_rate : typing.Optional[str]
202
261
  Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
@@ -210,11 +269,9 @@ class AsyncSpeechToTextStreamingClient:
210
269
  flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
211
270
  Signal to flush the audio buffer and finalize transcription
212
271
 
213
- stream_ongoing_speech_results : typing.Optional[SpeechToTextStreamingStreamOngoingSpeechResults]
214
- Enable streaming of ongoing speech results during active speech
215
-
216
- streaming_ongoing_requests_frame_size : typing.Optional[str]
217
- Frame size for streaming ongoing speech results (1-100)
272
+ input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
273
+ Audio codec/format of the input stream. Use this when sending raw PCM audio.
274
+ Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
218
275
 
219
276
  api_subscription_key : typing.Optional[str]
220
277
  API subscription key for authentication
@@ -232,8 +289,8 @@ class AsyncSpeechToTextStreamingClient:
232
289
  query_params = query_params.add("language-code", language_code)
233
290
  if model is not None:
234
291
  query_params = query_params.add("model", model)
235
- if input_audio_codec is not None:
236
- query_params = query_params.add("input_audio_codec", input_audio_codec)
292
+ if mode is not None:
293
+ query_params = query_params.add("mode", mode)
237
294
  if sample_rate is not None:
238
295
  query_params = query_params.add("sample_rate", sample_rate)
239
296
  if high_vad_sensitivity is not None:
@@ -242,12 +299,8 @@ class AsyncSpeechToTextStreamingClient:
242
299
  query_params = query_params.add("vad_signals", vad_signals)
243
300
  if flush_signal is not None:
244
301
  query_params = query_params.add("flush_signal", flush_signal)
245
- if stream_ongoing_speech_results is not None:
246
- query_params = query_params.add("stream_ongoing_speech_results", stream_ongoing_speech_results)
247
- if streaming_ongoing_requests_frame_size is not None:
248
- query_params = query_params.add(
249
- "streaming_ongoing_requests_frame_size", streaming_ongoing_requests_frame_size
250
- )
302
+ if input_audio_codec is not None:
303
+ query_params = query_params.add("input_audio_codec", input_audio_codec)
251
304
  ws_url = ws_url + f"?{query_params}"
252
305
  headers = self._raw_client._client_wrapper.get_headers()
253
306
  if api_subscription_key is not None:
@@ -14,9 +14,8 @@ from .types.speech_to_text_streaming_flush_signal import SpeechToTextStreamingFl
14
14
  from .types.speech_to_text_streaming_high_vad_sensitivity import SpeechToTextStreamingHighVadSensitivity
15
15
  from .types.speech_to_text_streaming_input_audio_codec import SpeechToTextStreamingInputAudioCodec
16
16
  from .types.speech_to_text_streaming_language_code import SpeechToTextStreamingLanguageCode
17
- from .types.speech_to_text_streaming_stream_ongoing_speech_results import (
18
- SpeechToTextStreamingStreamOngoingSpeechResults,
19
- )
17
+ from .types.speech_to_text_streaming_mode import SpeechToTextStreamingMode
18
+ from .types.speech_to_text_streaming_model import SpeechToTextStreamingModel
20
19
  from .types.speech_to_text_streaming_vad_signals import SpeechToTextStreamingVadSignals
21
20
 
22
21
  try:
@@ -34,14 +33,13 @@ class RawSpeechToTextStreamingClient:
34
33
  self,
35
34
  *,
36
35
  language_code: SpeechToTextStreamingLanguageCode,
37
- model: typing.Optional[typing.Literal["saarika:v2.5"]] = None,
38
- input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
36
+ model: typing.Optional[SpeechToTextStreamingModel] = None,
37
+ mode: typing.Optional[SpeechToTextStreamingMode] = None,
39
38
  sample_rate: typing.Optional[str] = None,
40
39
  high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
41
40
  vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
42
41
  flush_signal: typing.Optional[SpeechToTextStreamingFlushSignal] = None,
43
- stream_ongoing_speech_results: typing.Optional[SpeechToTextStreamingStreamOngoingSpeechResults] = None,
44
- streaming_ongoing_requests_frame_size: typing.Optional[str] = None,
42
+ input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
45
43
  api_subscription_key: typing.Optional[str] = None,
46
44
  request_options: typing.Optional[RequestOptions] = None,
47
45
  ) -> typing.Iterator[SpeechToTextStreamingSocketClient]:
@@ -55,13 +53,47 @@ class RawSpeechToTextStreamingClient:
55
53
  Parameters
56
54
  ----------
57
55
  language_code : SpeechToTextStreamingLanguageCode
58
- Language code for speech recognition
56
+ Specifies the language of the input audio in BCP-47 format.
59
57
 
60
- model : typing.Optional[typing.Literal["saarika:v2.5"]]
61
- Speech to text model to use
58
+ **Available Options:**
59
+ - `hi-IN`: Hindi
60
+ - `bn-IN`: Bengali
61
+ - `gu-IN`: Gujarati
62
+ - `kn-IN`: Kannada
63
+ - `ml-IN`: Malayalam
64
+ - `mr-IN`: Marathi
65
+ - `od-IN`: Odia
66
+ - `pa-IN`: Punjabi
67
+ - `ta-IN`: Tamil
68
+ - `te-IN`: Telugu
69
+ - `en-IN`: English
62
70
 
63
- input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
64
- Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files supports sample rate 16000 and 8000.
71
+ model : typing.Optional[SpeechToTextStreamingModel]
72
+ Specifies the model to use for speech-to-text conversion.
73
+
74
+ - **saarika:v2.5** (default): Transcribes audio in the spoken language.
75
+
76
+ - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
77
+
78
+ mode : typing.Optional[SpeechToTextStreamingMode]
79
+ Mode of operation. **Only applicable when using saaras:v3 model.**
80
+
81
+ Example audio: 'मेरा फोन नंबर है 9840950950'
82
+
83
+ - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
84
+ - Output: `मेरा फोन नंबर है 9840950950`
85
+
86
+ - **translate**: Translates speech from any supported Indic language to English.
87
+ - Output: `My phone number is 9840950950`
88
+
89
+ - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
90
+ - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
91
+
92
+ - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
93
+ - Output: `mera phone number hai 9840950950`
94
+
95
+ - **codemix**: Code-mixed text with English words in English and Indic words in native script.
96
+ - Output: `मेरा phone number है 9840950950`
65
97
 
66
98
  sample_rate : typing.Optional[str]
67
99
  Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
@@ -75,11 +107,9 @@ class RawSpeechToTextStreamingClient:
75
107
  flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
76
108
  Signal to flush the audio buffer and finalize transcription
77
109
 
78
- stream_ongoing_speech_results : typing.Optional[SpeechToTextStreamingStreamOngoingSpeechResults]
79
- Enable streaming of ongoing speech results during active speech
80
-
81
- streaming_ongoing_requests_frame_size : typing.Optional[str]
82
- Frame size for streaming ongoing speech results (1-100)
110
+ input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
111
+ Audio codec/format of the input stream. Use this when sending raw PCM audio.
112
+ Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
83
113
 
84
114
  api_subscription_key : typing.Optional[str]
85
115
  API subscription key for authentication
@@ -97,8 +127,8 @@ class RawSpeechToTextStreamingClient:
97
127
  query_params = query_params.add("language-code", language_code)
98
128
  if model is not None:
99
129
  query_params = query_params.add("model", model)
100
- if input_audio_codec is not None:
101
- query_params = query_params.add("input_audio_codec", input_audio_codec)
130
+ if mode is not None:
131
+ query_params = query_params.add("mode", mode)
102
132
  if sample_rate is not None:
103
133
  query_params = query_params.add("sample_rate", sample_rate)
104
134
  if high_vad_sensitivity is not None:
@@ -107,12 +137,8 @@ class RawSpeechToTextStreamingClient:
107
137
  query_params = query_params.add("vad_signals", vad_signals)
108
138
  if flush_signal is not None:
109
139
  query_params = query_params.add("flush_signal", flush_signal)
110
- if stream_ongoing_speech_results is not None:
111
- query_params = query_params.add("stream_ongoing_speech_results", stream_ongoing_speech_results)
112
- if streaming_ongoing_requests_frame_size is not None:
113
- query_params = query_params.add(
114
- "streaming_ongoing_requests_frame_size", streaming_ongoing_requests_frame_size
115
- )
140
+ if input_audio_codec is not None:
141
+ query_params = query_params.add("input_audio_codec", input_audio_codec)
116
142
  ws_url = ws_url + f"?{query_params}"
117
143
  headers = self._client_wrapper.get_headers()
118
144
  if api_subscription_key is not None:
@@ -146,14 +172,13 @@ class AsyncRawSpeechToTextStreamingClient:
146
172
  self,
147
173
  *,
148
174
  language_code: SpeechToTextStreamingLanguageCode,
149
- model: typing.Optional[typing.Literal["saarika:v2.5"]] = None,
150
- input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
175
+ model: typing.Optional[SpeechToTextStreamingModel] = None,
176
+ mode: typing.Optional[SpeechToTextStreamingMode] = None,
151
177
  sample_rate: typing.Optional[str] = None,
152
178
  high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
153
179
  vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
154
180
  flush_signal: typing.Optional[SpeechToTextStreamingFlushSignal] = None,
155
- stream_ongoing_speech_results: typing.Optional[SpeechToTextStreamingStreamOngoingSpeechResults] = None,
156
- streaming_ongoing_requests_frame_size: typing.Optional[str] = None,
181
+ input_audio_codec: typing.Optional[SpeechToTextStreamingInputAudioCodec] = None,
157
182
  api_subscription_key: typing.Optional[str] = None,
158
183
  request_options: typing.Optional[RequestOptions] = None,
159
184
  ) -> typing.AsyncIterator[AsyncSpeechToTextStreamingSocketClient]:
@@ -167,13 +192,47 @@ class AsyncRawSpeechToTextStreamingClient:
167
192
  Parameters
168
193
  ----------
169
194
  language_code : SpeechToTextStreamingLanguageCode
170
- Language code for speech recognition
195
+ Specifies the language of the input audio in BCP-47 format.
171
196
 
172
- model : typing.Optional[typing.Literal["saarika:v2.5"]]
173
- Speech to text model to use
197
+ **Available Options:**
198
+ - `hi-IN`: Hindi
199
+ - `bn-IN`: Bengali
200
+ - `gu-IN`: Gujarati
201
+ - `kn-IN`: Kannada
202
+ - `ml-IN`: Malayalam
203
+ - `mr-IN`: Marathi
204
+ - `od-IN`: Odia
205
+ - `pa-IN`: Punjabi
206
+ - `ta-IN`: Tamil
207
+ - `te-IN`: Telugu
208
+ - `en-IN`: English
174
209
 
175
- input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
176
- Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files supports sample rate 16000 and 8000.
210
+ model : typing.Optional[SpeechToTextStreamingModel]
211
+ Specifies the model to use for speech-to-text conversion.
212
+
213
+ - **saarika:v2.5** (default): Transcribes audio in the spoken language.
214
+
215
+ - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
216
+
217
+ mode : typing.Optional[SpeechToTextStreamingMode]
218
+ Mode of operation. **Only applicable when using saaras:v3 model.**
219
+
220
+ Example audio: 'मेरा फोन नंबर है 9840950950'
221
+
222
+ - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
223
+ - Output: `मेरा फोन नंबर है 9840950950`
224
+
225
+ - **translate**: Translates speech from any supported Indic language to English.
226
+ - Output: `My phone number is 9840950950`
227
+
228
+ - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
229
+ - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
230
+
231
+ - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
232
+ - Output: `mera phone number hai 9840950950`
233
+
234
+ - **codemix**: Code-mixed text with English words in English and Indic words in native script.
235
+ - Output: `मेरा phone number है 9840950950`
177
236
 
178
237
  sample_rate : typing.Optional[str]
179
238
  Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
@@ -187,11 +246,9 @@ class AsyncRawSpeechToTextStreamingClient:
187
246
  flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
188
247
  Signal to flush the audio buffer and finalize transcription
189
248
 
190
- stream_ongoing_speech_results : typing.Optional[SpeechToTextStreamingStreamOngoingSpeechResults]
191
- Enable streaming of ongoing speech results during active speech
192
-
193
- streaming_ongoing_requests_frame_size : typing.Optional[str]
194
- Frame size for streaming ongoing speech results (1-100)
249
+ input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
250
+ Audio codec/format of the input stream. Use this when sending raw PCM audio.
251
+ Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
195
252
 
196
253
  api_subscription_key : typing.Optional[str]
197
254
  API subscription key for authentication
@@ -209,8 +266,8 @@ class AsyncRawSpeechToTextStreamingClient:
209
266
  query_params = query_params.add("language-code", language_code)
210
267
  if model is not None:
211
268
  query_params = query_params.add("model", model)
212
- if input_audio_codec is not None:
213
- query_params = query_params.add("input_audio_codec", input_audio_codec)
269
+ if mode is not None:
270
+ query_params = query_params.add("mode", mode)
214
271
  if sample_rate is not None:
215
272
  query_params = query_params.add("sample_rate", sample_rate)
216
273
  if high_vad_sensitivity is not None:
@@ -219,12 +276,8 @@ class AsyncRawSpeechToTextStreamingClient:
219
276
  query_params = query_params.add("vad_signals", vad_signals)
220
277
  if flush_signal is not None:
221
278
  query_params = query_params.add("flush_signal", flush_signal)
222
- if stream_ongoing_speech_results is not None:
223
- query_params = query_params.add("stream_ongoing_speech_results", stream_ongoing_speech_results)
224
- if streaming_ongoing_requests_frame_size is not None:
225
- query_params = query_params.add(
226
- "streaming_ongoing_requests_frame_size", streaming_ongoing_requests_frame_size
227
- )
279
+ if input_audio_codec is not None:
280
+ query_params = query_params.add("input_audio_codec", input_audio_codec)
228
281
  ws_url = ws_url + f"?{query_params}"
229
282
  headers = self._client_wrapper.get_headers()
230
283
  if api_subscription_key is not None:
@@ -6,7 +6,8 @@ from .speech_to_text_streaming_flush_signal import SpeechToTextStreamingFlushSig
6
6
  from .speech_to_text_streaming_high_vad_sensitivity import SpeechToTextStreamingHighVadSensitivity
7
7
  from .speech_to_text_streaming_input_audio_codec import SpeechToTextStreamingInputAudioCodec
8
8
  from .speech_to_text_streaming_language_code import SpeechToTextStreamingLanguageCode
9
- from .speech_to_text_streaming_stream_ongoing_speech_results import SpeechToTextStreamingStreamOngoingSpeechResults
9
+ from .speech_to_text_streaming_mode import SpeechToTextStreamingMode
10
+ from .speech_to_text_streaming_model import SpeechToTextStreamingModel
10
11
  from .speech_to_text_streaming_vad_signals import SpeechToTextStreamingVadSignals
11
12
 
12
13
  __all__ = [
@@ -14,6 +15,7 @@ __all__ = [
14
15
  "SpeechToTextStreamingHighVadSensitivity",
15
16
  "SpeechToTextStreamingInputAudioCodec",
16
17
  "SpeechToTextStreamingLanguageCode",
17
- "SpeechToTextStreamingStreamOngoingSpeechResults",
18
+ "SpeechToTextStreamingMode",
19
+ "SpeechToTextStreamingModel",
18
20
  "SpeechToTextStreamingVadSignals",
19
21
  ]
@@ -3,31 +3,5 @@
3
3
  import typing
4
4
 
5
5
  SpeechToTextStreamingInputAudioCodec = typing.Union[
6
- typing.Literal[
7
- "wav",
8
- "x-wav",
9
- "wave",
10
- "mp3",
11
- "mpeg",
12
- "mpeg3",
13
- "x-mp3",
14
- "x-mpeg-3",
15
- "aac",
16
- "x-aac",
17
- "aiff",
18
- "x-aiff",
19
- "ogg",
20
- "opus",
21
- "flac",
22
- "x-flac",
23
- "mp4",
24
- "x-m4a",
25
- "amr",
26
- "x-ms-wma",
27
- "webm",
28
- "pcm_s16le",
29
- "pcm_l16",
30
- "pcm_raw",
31
- ],
32
- typing.Any,
6
+ typing.Literal["wav", "pcm_s16le", "pcm_l16", "pcm_raw"], typing.Any
33
7
  ]
@@ -0,0 +1,7 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ import typing
4
+
5
+ SpeechToTextStreamingMode = typing.Union[
6
+ typing.Literal["transcribe", "translate", "verbatim", "translit", "codemix"], typing.Any
7
+ ]
@@ -0,0 +1,5 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ import typing
4
+
5
+ SpeechToTextStreamingModel = typing.Union[typing.Literal["saarika:v2.5", "saaras:v3"], typing.Any]