sarvamai 0.1.22a3__py3-none-any.whl → 0.1.22a7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. sarvamai/__init__.py +62 -9
  2. sarvamai/client.py +3 -0
  3. sarvamai/core/client_wrapper.py +2 -2
  4. sarvamai/doc_digitization_job/__init__.py +4 -0
  5. sarvamai/doc_digitization_job/client.py +776 -0
  6. sarvamai/doc_digitization_job/job.py +496 -0
  7. sarvamai/doc_digitization_job/raw_client.py +1176 -0
  8. sarvamai/requests/__init__.py +20 -0
  9. sarvamai/requests/audio_data.py +0 -6
  10. sarvamai/requests/configure_connection.py +4 -0
  11. sarvamai/requests/configure_connection_data.py +40 -11
  12. sarvamai/requests/doc_digitization_create_job_response.py +25 -0
  13. sarvamai/requests/doc_digitization_download_files_response.py +37 -0
  14. sarvamai/requests/doc_digitization_error_details.py +21 -0
  15. sarvamai/requests/doc_digitization_error_message.py +11 -0
  16. sarvamai/requests/doc_digitization_job_detail.py +64 -0
  17. sarvamai/requests/doc_digitization_job_parameters.py +21 -0
  18. sarvamai/requests/doc_digitization_job_status_response.py +65 -0
  19. sarvamai/requests/doc_digitization_page_error.py +24 -0
  20. sarvamai/requests/doc_digitization_upload_files_response.py +34 -0
  21. sarvamai/requests/doc_digitization_webhook_callback.py +19 -0
  22. sarvamai/requests/speech_to_text_job_parameters.py +43 -2
  23. sarvamai/requests/speech_to_text_transcription_data.py +0 -6
  24. sarvamai/requests/speech_to_text_translate_job_parameters.py +4 -1
  25. sarvamai/requests/speech_to_text_translate_transcription_data.py +0 -6
  26. sarvamai/speech_to_text/client.py +95 -10
  27. sarvamai/speech_to_text/raw_client.py +95 -10
  28. sarvamai/speech_to_text_job/client.py +60 -15
  29. sarvamai/speech_to_text_job/job.py +100 -2
  30. sarvamai/speech_to_text_job/raw_client.py +14 -10
  31. sarvamai/speech_to_text_streaming/__init__.py +4 -2
  32. sarvamai/speech_to_text_streaming/client.py +100 -47
  33. sarvamai/speech_to_text_streaming/raw_client.py +100 -47
  34. sarvamai/speech_to_text_streaming/types/__init__.py +4 -2
  35. sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_input_audio_codec.py +1 -27
  36. sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_mode.py +7 -0
  37. sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_model.py +5 -0
  38. sarvamai/speech_to_text_translate_job/job.py +100 -2
  39. sarvamai/speech_to_text_translate_job/raw_client.py +14 -10
  40. sarvamai/speech_to_text_translate_streaming/__init__.py +0 -2
  41. sarvamai/speech_to_text_translate_streaming/client.py +18 -41
  42. sarvamai/speech_to_text_translate_streaming/raw_client.py +18 -41
  43. sarvamai/speech_to_text_translate_streaming/types/__init__.py +0 -4
  44. sarvamai/speech_to_text_translate_streaming/types/speech_to_text_translate_streaming_input_audio_codec.py +1 -27
  45. sarvamai/text/client.py +0 -12
  46. sarvamai/text/raw_client.py +0 -12
  47. sarvamai/text_to_speech/client.py +116 -14
  48. sarvamai/text_to_speech/raw_client.py +116 -14
  49. sarvamai/text_to_speech_streaming/__init__.py +2 -2
  50. sarvamai/text_to_speech_streaming/client.py +19 -6
  51. sarvamai/text_to_speech_streaming/raw_client.py +19 -6
  52. sarvamai/text_to_speech_streaming/types/__init__.py +2 -1
  53. sarvamai/text_to_speech_streaming/types/text_to_speech_streaming_model.py +5 -0
  54. sarvamai/types/__init__.py +34 -4
  55. sarvamai/types/audio_data.py +0 -6
  56. sarvamai/types/completion_event_flag.py +3 -1
  57. sarvamai/types/configure_connection.py +4 -0
  58. sarvamai/types/configure_connection_data.py +40 -11
  59. sarvamai/types/configure_connection_data_model.py +5 -0
  60. sarvamai/types/configure_connection_data_speaker.py +35 -1
  61. sarvamai/types/doc_digitization_create_job_response.py +37 -0
  62. sarvamai/types/doc_digitization_download_files_response.py +47 -0
  63. sarvamai/types/doc_digitization_error_code.py +15 -0
  64. sarvamai/types/doc_digitization_error_details.py +33 -0
  65. sarvamai/types/doc_digitization_error_message.py +23 -0
  66. sarvamai/types/doc_digitization_job_detail.py +74 -0
  67. sarvamai/types/doc_digitization_job_detail_state.py +7 -0
  68. sarvamai/types/doc_digitization_job_parameters.py +33 -0
  69. sarvamai/types/doc_digitization_job_state.py +7 -0
  70. sarvamai/types/doc_digitization_job_status_response.py +75 -0
  71. sarvamai/types/doc_digitization_output_format.py +5 -0
  72. sarvamai/types/doc_digitization_page_error.py +36 -0
  73. sarvamai/types/doc_digitization_supported_language.py +32 -0
  74. sarvamai/types/doc_digitization_upload_files_response.py +44 -0
  75. sarvamai/types/doc_digitization_webhook_callback.py +31 -0
  76. sarvamai/types/mode.py +5 -0
  77. sarvamai/types/speech_to_text_job_parameters.py +43 -2
  78. sarvamai/types/speech_to_text_model.py +1 -1
  79. sarvamai/types/speech_to_text_transcription_data.py +0 -6
  80. sarvamai/types/speech_to_text_translate_job_parameters.py +4 -1
  81. sarvamai/types/speech_to_text_translate_transcription_data.py +0 -6
  82. sarvamai/types/text_to_speech_model.py +1 -1
  83. sarvamai/types/text_to_speech_speaker.py +35 -1
  84. {sarvamai-0.1.22a3.dist-info → sarvamai-0.1.22a7.dist-info}/METADATA +1 -1
  85. {sarvamai-0.1.22a3.dist-info → sarvamai-0.1.22a7.dist-info}/RECORD +86 -56
  86. sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_stream_ongoing_speech_results.py +0 -5
  87. sarvamai/speech_to_text_translate_streaming/types/speech_to_text_translate_streaming_stream_ongoing_speech_results.py +0 -5
  88. sarvamai/types/audio_data_input_audio_codec.py +0 -33
  89. sarvamai/types/response_speech_state.py +0 -7
  90. {sarvamai-0.1.22a3.dist-info → sarvamai-0.1.22a7.dist-info}/WHEEL +0 -0
@@ -150,9 +150,58 @@ class AsyncSpeechToTextTranslateJob:
150
150
  "output_file": detail.outputs[0].file_name,
151
151
  }
152
152
  for detail in (job_status.job_details or [])
153
- if detail.inputs and detail.outputs
153
+ if detail.inputs and detail.outputs and detail.state == "Success"
154
154
  ]
155
155
 
156
+ async def get_file_results(
157
+ self,
158
+ ) -> typing.Dict[str, typing.List[typing.Dict[str, typing.Any]]]:
159
+ """
160
+ Get detailed results for each file in the batch job.
161
+
162
+ Returns
163
+ -------
164
+ Dict[str, List[Dict[str, Any]]]
165
+ Dictionary with 'successful' and 'failed' keys, each containing a list of file details.
166
+ Each file detail includes:
167
+ - 'file_name': Name of the input file
168
+ - 'status': Status of processing ('Success' or 'Failed')
169
+ - 'error_message': Error message if failed (None if successful)
170
+ - 'output_file': Name of output file if successful (None if failed)
171
+ """
172
+ job_status = await self.get_status()
173
+ results: typing.Dict[str, typing.List[typing.Dict[str, typing.Any]]] = {
174
+ "successful": [],
175
+ "failed": [],
176
+ }
177
+
178
+ for detail in job_status.job_details or []:
179
+ # Check for empty lists explicitly
180
+ if not detail.inputs or len(detail.inputs) == 0:
181
+ continue
182
+
183
+ try:
184
+ file_info = {
185
+ "file_name": detail.inputs[0].file_name,
186
+ "status": detail.state,
187
+ "error_message": detail.error_message,
188
+ "output_file": (
189
+ detail.outputs[0].file_name
190
+ if detail.outputs and len(detail.outputs) > 0
191
+ else None
192
+ ),
193
+ }
194
+
195
+ if detail.state == "Success":
196
+ results["successful"].append(file_info)
197
+ else:
198
+ results["failed"].append(file_info)
199
+ except (IndexError, AttributeError):
200
+ # Skip malformed job details
201
+ continue
202
+
203
+ return results
204
+
156
205
  async def download_outputs(self, output_dir: str) -> bool:
157
206
  """
158
207
  Download output files to the specified directory.
@@ -395,9 +444,58 @@ class SpeechToTextTranslateJob:
395
444
  "output_file": detail.outputs[0].file_name,
396
445
  }
397
446
  for detail in (job_status.job_details or [])
398
- if detail.inputs and detail.outputs
447
+ if detail.inputs and detail.outputs and detail.state == "Success"
399
448
  ]
400
449
 
450
+ def get_file_results(
451
+ self,
452
+ ) -> typing.Dict[str, typing.List[typing.Dict[str, typing.Any]]]:
453
+ """
454
+ Get detailed results for each file in the batch job.
455
+
456
+ Returns
457
+ -------
458
+ Dict[str, List[Dict[str, Any]]]
459
+ Dictionary with 'successful' and 'failed' keys, each containing a list of file details.
460
+ Each file detail includes:
461
+ - 'file_name': Name of the input file
462
+ - 'status': Status of processing ('Success' or 'Failed')
463
+ - 'error_message': Error message if failed (None if successful)
464
+ - 'output_file': Name of output file if successful (None if failed)
465
+ """
466
+ job_status = self.get_status()
467
+ results: typing.Dict[str, typing.List[typing.Dict[str, typing.Any]]] = {
468
+ "successful": [],
469
+ "failed": [],
470
+ }
471
+
472
+ for detail in job_status.job_details or []:
473
+ # Check for empty lists explicitly
474
+ if not detail.inputs or len(detail.inputs) == 0:
475
+ continue
476
+
477
+ try:
478
+ file_info = {
479
+ "file_name": detail.inputs[0].file_name,
480
+ "status": detail.state,
481
+ "error_message": detail.error_message,
482
+ "output_file": (
483
+ detail.outputs[0].file_name
484
+ if detail.outputs and len(detail.outputs) > 0
485
+ else None
486
+ ),
487
+ }
488
+
489
+ if detail.state == "Success":
490
+ results["successful"].append(file_info)
491
+ else:
492
+ results["failed"].append(file_info)
493
+ except (IndexError, AttributeError):
494
+ # Skip malformed job details
495
+ continue
496
+
497
+ return results
498
+
401
499
  def download_outputs(self, output_dir: str) -> bool:
402
500
  """
403
501
  Download output files to the specified directory.
@@ -40,7 +40,7 @@ class RawSpeechToTextTranslateJobClient:
40
40
  request_options: typing.Optional[RequestOptions] = None,
41
41
  ) -> HttpResponse[BulkJobInitResponseV1]:
42
42
  """
43
- Get a job uuid, and storage folder details for speech to text tranlsate bulk job v1
43
+ Create a new speech to text translate bulk job and receive a job UUID and storage folder details for processing multiple audio files with translation
44
44
 
45
45
  Parameters
46
46
  ----------
@@ -166,7 +166,9 @@ class RawSpeechToTextTranslateJobClient:
166
166
  self, job_id: str, *, request_options: typing.Optional[RequestOptions] = None
167
167
  ) -> HttpResponse[JobStatusV1Response]:
168
168
  """
169
- Get the status of a speech to text translate bulk job V1
169
+ Retrieve the current status and details of a speech to text translate bulk job, including progress and file-level information.
170
+
171
+ **Rate Limiting Best Practice:** To prevent rate limit errors and ensure optimal server performance, we recommend implementing a minimum 5-millisecond delay between consecutive status polling requests. This helps maintain system stability while still providing timely status updates.
170
172
 
171
173
  Parameters
172
174
  ----------
@@ -276,7 +278,7 @@ class RawSpeechToTextTranslateJobClient:
276
278
  request_options: typing.Optional[RequestOptions] = None,
277
279
  ) -> HttpResponse[JobStatusV1Response]:
278
280
  """
279
- Start a speech to text translate bulk job V1
281
+ Start processing a speech to text translate bulk job after all audio files have been uploaded
280
282
 
281
283
  Parameters
282
284
  ----------
@@ -392,7 +394,7 @@ class RawSpeechToTextTranslateJobClient:
392
394
  request_options: typing.Optional[RequestOptions] = None,
393
395
  ) -> HttpResponse[FilesUploadResponse]:
394
396
  """
395
- Start a speech to text bulk job V1
397
+ Generate presigned upload URLs for audio files that will be processed in a speech to text translate bulk job
396
398
 
397
399
  Parameters
398
400
  ----------
@@ -517,7 +519,7 @@ class RawSpeechToTextTranslateJobClient:
517
519
  request_options: typing.Optional[RequestOptions] = None,
518
520
  ) -> HttpResponse[FilesDownloadResponse]:
519
521
  """
520
- Start a speech to text bulk job V1
522
+ Generate presigned download URLs for the translated transcription output files of a completed speech to text translate bulk job
521
523
 
522
524
  Parameters
523
525
  ----------
@@ -647,7 +649,7 @@ class AsyncRawSpeechToTextTranslateJobClient:
647
649
  request_options: typing.Optional[RequestOptions] = None,
648
650
  ) -> AsyncHttpResponse[BulkJobInitResponseV1]:
649
651
  """
650
- Get a job uuid, and storage folder details for speech to text tranlsate bulk job v1
652
+ Create a new speech to text translate bulk job and receive a job UUID and storage folder details for processing multiple audio files with translation
651
653
 
652
654
  Parameters
653
655
  ----------
@@ -773,7 +775,9 @@ class AsyncRawSpeechToTextTranslateJobClient:
773
775
  self, job_id: str, *, request_options: typing.Optional[RequestOptions] = None
774
776
  ) -> AsyncHttpResponse[JobStatusV1Response]:
775
777
  """
776
- Get the status of a speech to text translate bulk job V1
778
+ Retrieve the current status and details of a speech to text translate bulk job, including progress and file-level information.
779
+
780
+ **Rate Limiting Best Practice:** To prevent rate limit errors and ensure optimal server performance, we recommend implementing a minimum 5-millisecond delay between consecutive status polling requests. This helps maintain system stability while still providing timely status updates.
777
781
 
778
782
  Parameters
779
783
  ----------
@@ -883,7 +887,7 @@ class AsyncRawSpeechToTextTranslateJobClient:
883
887
  request_options: typing.Optional[RequestOptions] = None,
884
888
  ) -> AsyncHttpResponse[JobStatusV1Response]:
885
889
  """
886
- Start a speech to text translate bulk job V1
890
+ Start processing a speech to text translate bulk job after all audio files have been uploaded
887
891
 
888
892
  Parameters
889
893
  ----------
@@ -999,7 +1003,7 @@ class AsyncRawSpeechToTextTranslateJobClient:
999
1003
  request_options: typing.Optional[RequestOptions] = None,
1000
1004
  ) -> AsyncHttpResponse[FilesUploadResponse]:
1001
1005
  """
1002
- Start a speech to text bulk job V1
1006
+ Generate presigned upload URLs for audio files that will be processed in a speech to text translate bulk job
1003
1007
 
1004
1008
  Parameters
1005
1009
  ----------
@@ -1124,7 +1128,7 @@ class AsyncRawSpeechToTextTranslateJobClient:
1124
1128
  request_options: typing.Optional[RequestOptions] = None,
1125
1129
  ) -> AsyncHttpResponse[FilesDownloadResponse]:
1126
1130
  """
1127
- Start a speech to text bulk job V1
1131
+ Generate presigned download URLs for the translated transcription output files of a completed speech to text translate bulk job
1128
1132
 
1129
1133
  Parameters
1130
1134
  ----------
@@ -6,7 +6,6 @@ from .types import (
6
6
  SpeechToTextTranslateStreamingFlushSignal,
7
7
  SpeechToTextTranslateStreamingHighVadSensitivity,
8
8
  SpeechToTextTranslateStreamingInputAudioCodec,
9
- SpeechToTextTranslateStreamingStreamOngoingSpeechResults,
10
9
  SpeechToTextTranslateStreamingVadSignals,
11
10
  )
12
11
 
@@ -14,6 +13,5 @@ __all__ = [
14
13
  "SpeechToTextTranslateStreamingFlushSignal",
15
14
  "SpeechToTextTranslateStreamingHighVadSensitivity",
16
15
  "SpeechToTextTranslateStreamingInputAudioCodec",
17
- "SpeechToTextTranslateStreamingStreamOngoingSpeechResults",
18
16
  "SpeechToTextTranslateStreamingVadSignals",
19
17
  ]
@@ -16,9 +16,6 @@ from .types.speech_to_text_translate_streaming_high_vad_sensitivity import (
16
16
  SpeechToTextTranslateStreamingHighVadSensitivity,
17
17
  )
18
18
  from .types.speech_to_text_translate_streaming_input_audio_codec import SpeechToTextTranslateStreamingInputAudioCodec
19
- from .types.speech_to_text_translate_streaming_stream_ongoing_speech_results import (
20
- SpeechToTextTranslateStreamingStreamOngoingSpeechResults,
21
- )
22
19
  from .types.speech_to_text_translate_streaming_vad_signals import SpeechToTextTranslateStreamingVadSignals
23
20
 
24
21
  try:
@@ -47,13 +44,11 @@ class SpeechToTextTranslateStreamingClient:
47
44
  self,
48
45
  *,
49
46
  model: typing.Optional[typing.Literal["saaras:v2.5"]] = None,
50
- input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
51
47
  sample_rate: typing.Optional[str] = None,
52
48
  high_vad_sensitivity: typing.Optional[SpeechToTextTranslateStreamingHighVadSensitivity] = None,
53
49
  vad_signals: typing.Optional[SpeechToTextTranslateStreamingVadSignals] = None,
54
50
  flush_signal: typing.Optional[SpeechToTextTranslateStreamingFlushSignal] = None,
55
- stream_ongoing_speech_results: typing.Optional[SpeechToTextTranslateStreamingStreamOngoingSpeechResults] = None,
56
- streaming_ongoing_requests_frame_size: typing.Optional[str] = None,
51
+ input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
57
52
  api_subscription_key: typing.Optional[str] = None,
58
53
  request_options: typing.Optional[RequestOptions] = None,
59
54
  ) -> typing.Iterator[SpeechToTextTranslateStreamingSocketClient]:
@@ -67,10 +62,10 @@ class SpeechToTextTranslateStreamingClient:
67
62
  Parameters
68
63
  ----------
69
64
  model : typing.Optional[typing.Literal["saaras:v2.5"]]
70
- Speech to text model to use (defaults to "saaras:v2.5" if not specified)
65
+ Model to be used for speech to text translation.
71
66
 
72
- input_audio_codec : typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec]
73
- Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files supports sample rate 16000 and 8000.
67
+ - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
68
+ - Example: Hindi audio → English text output
74
69
 
75
70
  sample_rate : typing.Optional[str]
76
71
  Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
@@ -84,11 +79,9 @@ class SpeechToTextTranslateStreamingClient:
84
79
  flush_signal : typing.Optional[SpeechToTextTranslateStreamingFlushSignal]
85
80
  Signal to flush the audio buffer and finalize transcription and translation
86
81
 
87
- stream_ongoing_speech_results : typing.Optional[SpeechToTextTranslateStreamingStreamOngoingSpeechResults]
88
- Enable streaming of ongoing speech results during active speech
89
-
90
- streaming_ongoing_requests_frame_size : typing.Optional[str]
91
- Frame size for streaming ongoing speech results (1-100)
82
+ input_audio_codec : typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec]
83
+ Audio codec/format of the input stream. Use this when sending raw PCM audio.
84
+ Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
92
85
 
93
86
  api_subscription_key : typing.Optional[str]
94
87
  API subscription key for authentication
@@ -104,8 +97,6 @@ class SpeechToTextTranslateStreamingClient:
104
97
  query_params = httpx.QueryParams()
105
98
  if model is not None:
106
99
  query_params = query_params.add("model", model)
107
- if input_audio_codec is not None:
108
- query_params = query_params.add("input_audio_codec", input_audio_codec)
109
100
  if sample_rate is not None:
110
101
  query_params = query_params.add("sample_rate", sample_rate)
111
102
  if high_vad_sensitivity is not None:
@@ -114,12 +105,8 @@ class SpeechToTextTranslateStreamingClient:
114
105
  query_params = query_params.add("vad_signals", vad_signals)
115
106
  if flush_signal is not None:
116
107
  query_params = query_params.add("flush_signal", flush_signal)
117
- if stream_ongoing_speech_results is not None:
118
- query_params = query_params.add("stream_ongoing_speech_results", stream_ongoing_speech_results)
119
- if streaming_ongoing_requests_frame_size is not None:
120
- query_params = query_params.add(
121
- "streaming_ongoing_requests_frame_size", streaming_ongoing_requests_frame_size
122
- )
108
+ if input_audio_codec is not None:
109
+ query_params = query_params.add("input_audio_codec", input_audio_codec)
123
110
  ws_url = ws_url + f"?{query_params}"
124
111
  headers = self._raw_client._client_wrapper.get_headers()
125
112
  if api_subscription_key is not None:
@@ -164,13 +151,11 @@ class AsyncSpeechToTextTranslateStreamingClient:
164
151
  self,
165
152
  *,
166
153
  model: typing.Optional[typing.Literal["saaras:v2.5"]] = None,
167
- input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
168
154
  sample_rate: typing.Optional[str] = None,
169
155
  high_vad_sensitivity: typing.Optional[SpeechToTextTranslateStreamingHighVadSensitivity] = None,
170
156
  vad_signals: typing.Optional[SpeechToTextTranslateStreamingVadSignals] = None,
171
157
  flush_signal: typing.Optional[SpeechToTextTranslateStreamingFlushSignal] = None,
172
- stream_ongoing_speech_results: typing.Optional[SpeechToTextTranslateStreamingStreamOngoingSpeechResults] = None,
173
- streaming_ongoing_requests_frame_size: typing.Optional[str] = None,
158
+ input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
174
159
  api_subscription_key: typing.Optional[str] = None,
175
160
  request_options: typing.Optional[RequestOptions] = None,
176
161
  ) -> typing.AsyncIterator[AsyncSpeechToTextTranslateStreamingSocketClient]:
@@ -184,10 +169,10 @@ class AsyncSpeechToTextTranslateStreamingClient:
184
169
  Parameters
185
170
  ----------
186
171
  model : typing.Optional[typing.Literal["saaras:v2.5"]]
187
- Speech to text model to use (defaults to "saaras:v2.5" if not specified)
172
+ Model to be used for speech to text translation.
188
173
 
189
- input_audio_codec : typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec]
190
- Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files supports sample rate 16000 and 8000.
174
+ - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
175
+ - Example: Hindi audio → English text output
191
176
 
192
177
  sample_rate : typing.Optional[str]
193
178
  Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
@@ -201,11 +186,9 @@ class AsyncSpeechToTextTranslateStreamingClient:
201
186
  flush_signal : typing.Optional[SpeechToTextTranslateStreamingFlushSignal]
202
187
  Signal to flush the audio buffer and finalize transcription and translation
203
188
 
204
- stream_ongoing_speech_results : typing.Optional[SpeechToTextTranslateStreamingStreamOngoingSpeechResults]
205
- Enable streaming of ongoing speech results during active speech
206
-
207
- streaming_ongoing_requests_frame_size : typing.Optional[str]
208
- Frame size for streaming ongoing speech results (1-100)
189
+ input_audio_codec : typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec]
190
+ Audio codec/format of the input stream. Use this when sending raw PCM audio.
191
+ Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
209
192
 
210
193
  api_subscription_key : typing.Optional[str]
211
194
  API subscription key for authentication
@@ -221,8 +204,6 @@ class AsyncSpeechToTextTranslateStreamingClient:
221
204
  query_params = httpx.QueryParams()
222
205
  if model is not None:
223
206
  query_params = query_params.add("model", model)
224
- if input_audio_codec is not None:
225
- query_params = query_params.add("input_audio_codec", input_audio_codec)
226
207
  if sample_rate is not None:
227
208
  query_params = query_params.add("sample_rate", sample_rate)
228
209
  if high_vad_sensitivity is not None:
@@ -231,12 +212,8 @@ class AsyncSpeechToTextTranslateStreamingClient:
231
212
  query_params = query_params.add("vad_signals", vad_signals)
232
213
  if flush_signal is not None:
233
214
  query_params = query_params.add("flush_signal", flush_signal)
234
- if stream_ongoing_speech_results is not None:
235
- query_params = query_params.add("stream_ongoing_speech_results", stream_ongoing_speech_results)
236
- if streaming_ongoing_requests_frame_size is not None:
237
- query_params = query_params.add(
238
- "streaming_ongoing_requests_frame_size", streaming_ongoing_requests_frame_size
239
- )
215
+ if input_audio_codec is not None:
216
+ query_params = query_params.add("input_audio_codec", input_audio_codec)
240
217
  ws_url = ws_url + f"?{query_params}"
241
218
  headers = self._raw_client._client_wrapper.get_headers()
242
219
  if api_subscription_key is not None:
@@ -15,9 +15,6 @@ from .types.speech_to_text_translate_streaming_high_vad_sensitivity import (
15
15
  SpeechToTextTranslateStreamingHighVadSensitivity,
16
16
  )
17
17
  from .types.speech_to_text_translate_streaming_input_audio_codec import SpeechToTextTranslateStreamingInputAudioCodec
18
- from .types.speech_to_text_translate_streaming_stream_ongoing_speech_results import (
19
- SpeechToTextTranslateStreamingStreamOngoingSpeechResults,
20
- )
21
18
  from .types.speech_to_text_translate_streaming_vad_signals import SpeechToTextTranslateStreamingVadSignals
22
19
 
23
20
  try:
@@ -35,13 +32,11 @@ class RawSpeechToTextTranslateStreamingClient:
35
32
  self,
36
33
  *,
37
34
  model: typing.Optional[typing.Literal["saaras:v2.5"]] = None,
38
- input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
39
35
  sample_rate: typing.Optional[str] = None,
40
36
  high_vad_sensitivity: typing.Optional[SpeechToTextTranslateStreamingHighVadSensitivity] = None,
41
37
  vad_signals: typing.Optional[SpeechToTextTranslateStreamingVadSignals] = None,
42
38
  flush_signal: typing.Optional[SpeechToTextTranslateStreamingFlushSignal] = None,
43
- stream_ongoing_speech_results: typing.Optional[SpeechToTextTranslateStreamingStreamOngoingSpeechResults] = None,
44
- streaming_ongoing_requests_frame_size: typing.Optional[str] = None,
39
+ input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
45
40
  api_subscription_key: typing.Optional[str] = None,
46
41
  request_options: typing.Optional[RequestOptions] = None,
47
42
  ) -> typing.Iterator[SpeechToTextTranslateStreamingSocketClient]:
@@ -55,10 +50,10 @@ class RawSpeechToTextTranslateStreamingClient:
55
50
  Parameters
56
51
  ----------
57
52
  model : typing.Optional[typing.Literal["saaras:v2.5"]]
58
- Speech to text model to use (defaults to "saaras:v2.5" if not specified)
53
+ Model to be used for speech to text translation.
59
54
 
60
- input_audio_codec : typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec]
61
- Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files supports sample rate 16000 and 8000.
55
+ - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
56
+ - Example: Hindi audio → English text output
62
57
 
63
58
  sample_rate : typing.Optional[str]
64
59
  Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
@@ -72,11 +67,9 @@ class RawSpeechToTextTranslateStreamingClient:
72
67
  flush_signal : typing.Optional[SpeechToTextTranslateStreamingFlushSignal]
73
68
  Signal to flush the audio buffer and finalize transcription and translation
74
69
 
75
- stream_ongoing_speech_results : typing.Optional[SpeechToTextTranslateStreamingStreamOngoingSpeechResults]
76
- Enable streaming of ongoing speech results during active speech
77
-
78
- streaming_ongoing_requests_frame_size : typing.Optional[str]
79
- Frame size for streaming ongoing speech results (1-100)
70
+ input_audio_codec : typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec]
71
+ Audio codec/format of the input stream. Use this when sending raw PCM audio.
72
+ Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
80
73
 
81
74
  api_subscription_key : typing.Optional[str]
82
75
  API subscription key for authentication
@@ -92,8 +85,6 @@ class RawSpeechToTextTranslateStreamingClient:
92
85
  query_params = httpx.QueryParams()
93
86
  if model is not None:
94
87
  query_params = query_params.add("model", model)
95
- if input_audio_codec is not None:
96
- query_params = query_params.add("input_audio_codec", input_audio_codec)
97
88
  if sample_rate is not None:
98
89
  query_params = query_params.add("sample_rate", sample_rate)
99
90
  if high_vad_sensitivity is not None:
@@ -102,12 +93,8 @@ class RawSpeechToTextTranslateStreamingClient:
102
93
  query_params = query_params.add("vad_signals", vad_signals)
103
94
  if flush_signal is not None:
104
95
  query_params = query_params.add("flush_signal", flush_signal)
105
- if stream_ongoing_speech_results is not None:
106
- query_params = query_params.add("stream_ongoing_speech_results", stream_ongoing_speech_results)
107
- if streaming_ongoing_requests_frame_size is not None:
108
- query_params = query_params.add(
109
- "streaming_ongoing_requests_frame_size", streaming_ongoing_requests_frame_size
110
- )
96
+ if input_audio_codec is not None:
97
+ query_params = query_params.add("input_audio_codec", input_audio_codec)
111
98
  ws_url = ws_url + f"?{query_params}"
112
99
  headers = self._client_wrapper.get_headers()
113
100
  if api_subscription_key is not None:
@@ -141,13 +128,11 @@ class AsyncRawSpeechToTextTranslateStreamingClient:
141
128
  self,
142
129
  *,
143
130
  model: typing.Optional[typing.Literal["saaras:v2.5"]] = None,
144
- input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
145
131
  sample_rate: typing.Optional[str] = None,
146
132
  high_vad_sensitivity: typing.Optional[SpeechToTextTranslateStreamingHighVadSensitivity] = None,
147
133
  vad_signals: typing.Optional[SpeechToTextTranslateStreamingVadSignals] = None,
148
134
  flush_signal: typing.Optional[SpeechToTextTranslateStreamingFlushSignal] = None,
149
- stream_ongoing_speech_results: typing.Optional[SpeechToTextTranslateStreamingStreamOngoingSpeechResults] = None,
150
- streaming_ongoing_requests_frame_size: typing.Optional[str] = None,
135
+ input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
151
136
  api_subscription_key: typing.Optional[str] = None,
152
137
  request_options: typing.Optional[RequestOptions] = None,
153
138
  ) -> typing.AsyncIterator[AsyncSpeechToTextTranslateStreamingSocketClient]:
@@ -161,10 +146,10 @@ class AsyncRawSpeechToTextTranslateStreamingClient:
161
146
  Parameters
162
147
  ----------
163
148
  model : typing.Optional[typing.Literal["saaras:v2.5"]]
164
- Speech to text model to use (defaults to "saaras:v2.5" if not specified)
149
+ Model to be used for speech to text translation.
165
150
 
166
- input_audio_codec : typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec]
167
- Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files supports sample rate 16000 and 8000.
151
+ - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
152
+ - Example: Hindi audio → English text output
168
153
 
169
154
  sample_rate : typing.Optional[str]
170
155
  Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
@@ -178,11 +163,9 @@ class AsyncRawSpeechToTextTranslateStreamingClient:
178
163
  flush_signal : typing.Optional[SpeechToTextTranslateStreamingFlushSignal]
179
164
  Signal to flush the audio buffer and finalize transcription and translation
180
165
 
181
- stream_ongoing_speech_results : typing.Optional[SpeechToTextTranslateStreamingStreamOngoingSpeechResults]
182
- Enable streaming of ongoing speech results during active speech
183
-
184
- streaming_ongoing_requests_frame_size : typing.Optional[str]
185
- Frame size for streaming ongoing speech results (1-100)
166
+ input_audio_codec : typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec]
167
+ Audio codec/format of the input stream. Use this when sending raw PCM audio.
168
+ Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
186
169
 
187
170
  api_subscription_key : typing.Optional[str]
188
171
  API subscription key for authentication
@@ -198,8 +181,6 @@ class AsyncRawSpeechToTextTranslateStreamingClient:
198
181
  query_params = httpx.QueryParams()
199
182
  if model is not None:
200
183
  query_params = query_params.add("model", model)
201
- if input_audio_codec is not None:
202
- query_params = query_params.add("input_audio_codec", input_audio_codec)
203
184
  if sample_rate is not None:
204
185
  query_params = query_params.add("sample_rate", sample_rate)
205
186
  if high_vad_sensitivity is not None:
@@ -208,12 +189,8 @@ class AsyncRawSpeechToTextTranslateStreamingClient:
208
189
  query_params = query_params.add("vad_signals", vad_signals)
209
190
  if flush_signal is not None:
210
191
  query_params = query_params.add("flush_signal", flush_signal)
211
- if stream_ongoing_speech_results is not None:
212
- query_params = query_params.add("stream_ongoing_speech_results", stream_ongoing_speech_results)
213
- if streaming_ongoing_requests_frame_size is not None:
214
- query_params = query_params.add(
215
- "streaming_ongoing_requests_frame_size", streaming_ongoing_requests_frame_size
216
- )
192
+ if input_audio_codec is not None:
193
+ query_params = query_params.add("input_audio_codec", input_audio_codec)
217
194
  ws_url = ws_url + f"?{query_params}"
218
195
  headers = self._client_wrapper.get_headers()
219
196
  if api_subscription_key is not None:
@@ -5,15 +5,11 @@
5
5
  from .speech_to_text_translate_streaming_flush_signal import SpeechToTextTranslateStreamingFlushSignal
6
6
  from .speech_to_text_translate_streaming_high_vad_sensitivity import SpeechToTextTranslateStreamingHighVadSensitivity
7
7
  from .speech_to_text_translate_streaming_input_audio_codec import SpeechToTextTranslateStreamingInputAudioCodec
8
- from .speech_to_text_translate_streaming_stream_ongoing_speech_results import (
9
- SpeechToTextTranslateStreamingStreamOngoingSpeechResults,
10
- )
11
8
  from .speech_to_text_translate_streaming_vad_signals import SpeechToTextTranslateStreamingVadSignals
12
9
 
13
10
  __all__ = [
14
11
  "SpeechToTextTranslateStreamingFlushSignal",
15
12
  "SpeechToTextTranslateStreamingHighVadSensitivity",
16
13
  "SpeechToTextTranslateStreamingInputAudioCodec",
17
- "SpeechToTextTranslateStreamingStreamOngoingSpeechResults",
18
14
  "SpeechToTextTranslateStreamingVadSignals",
19
15
  ]
@@ -3,31 +3,5 @@
3
3
  import typing
4
4
 
5
5
  SpeechToTextTranslateStreamingInputAudioCodec = typing.Union[
6
- typing.Literal[
7
- "wav",
8
- "x-wav",
9
- "wave",
10
- "mp3",
11
- "mpeg",
12
- "mpeg3",
13
- "x-mp3",
14
- "x-mpeg-3",
15
- "aac",
16
- "x-aac",
17
- "aiff",
18
- "x-aiff",
19
- "ogg",
20
- "opus",
21
- "flac",
22
- "x-flac",
23
- "mp4",
24
- "x-m4a",
25
- "amr",
26
- "x-ms-wma",
27
- "webm",
28
- "pcm_s16le",
29
- "pcm_l16",
30
- "pcm_raw",
31
- ],
32
- typing.Any,
6
+ typing.Literal["wav", "pcm_s16le", "pcm_l16", "pcm_raw"], typing.Any
33
7
  ]
sarvamai/text/client.py CHANGED
@@ -47,7 +47,6 @@ class TextClient:
47
47
  speaker_gender: typing.Optional[TranslateSpeakerGender] = OMIT,
48
48
  mode: typing.Optional[TranslateMode] = OMIT,
49
49
  model: typing.Optional[TranslateModel] = OMIT,
50
- enable_preprocessing: typing.Optional[bool] = OMIT,
51
50
  output_script: typing.Optional[TransliterateMode] = OMIT,
52
51
  numerals_format: typing.Optional[NumeralsFormat] = OMIT,
53
52
  request_options: typing.Optional[RequestOptions] = None,
@@ -125,10 +124,6 @@ class TextClient:
125
124
  - mayura:v1: Supports 12 languages with all modes, output scripts, and automatic language detection.
126
125
  - sarvam-translate:v1: Supports all 22 scheduled languages of India, formal mode only.
127
126
 
128
- enable_preprocessing : typing.Optional[bool]
129
- This will enable custom preprocessing of the input text which can result in better translations.
130
- Recommendation- You can switch on whenever there is some complex text with difficult vocabulary and sentences, for which you want simple translations that people can understand.
131
-
132
127
  output_script : typing.Optional[TransliterateMode]
133
128
  **output_script**: This is an optional parameter which controls the transliteration style applied to the output text.
134
129
 
@@ -186,7 +181,6 @@ class TextClient:
186
181
  speaker_gender=speaker_gender,
187
182
  mode=mode,
188
183
  model=model,
189
- enable_preprocessing=enable_preprocessing,
190
184
  output_script=output_script,
191
185
  numerals_format=numerals_format,
192
186
  request_options=request_options,
@@ -371,7 +365,6 @@ class AsyncTextClient:
371
365
  speaker_gender: typing.Optional[TranslateSpeakerGender] = OMIT,
372
366
  mode: typing.Optional[TranslateMode] = OMIT,
373
367
  model: typing.Optional[TranslateModel] = OMIT,
374
- enable_preprocessing: typing.Optional[bool] = OMIT,
375
368
  output_script: typing.Optional[TransliterateMode] = OMIT,
376
369
  numerals_format: typing.Optional[NumeralsFormat] = OMIT,
377
370
  request_options: typing.Optional[RequestOptions] = None,
@@ -449,10 +442,6 @@ class AsyncTextClient:
449
442
  - mayura:v1: Supports 12 languages with all modes, output scripts, and automatic language detection.
450
443
  - sarvam-translate:v1: Supports all 22 scheduled languages of India, formal mode only.
451
444
 
452
- enable_preprocessing : typing.Optional[bool]
453
- This will enable custom preprocessing of the input text which can result in better translations.
454
- Recommendation- You can switch on whenever there is some complex text with difficult vocabulary and sentences, for which you want simple translations that people can understand.
455
-
456
445
  output_script : typing.Optional[TransliterateMode]
457
446
  **output_script**: This is an optional parameter which controls the transliteration style applied to the output text.
458
447
 
@@ -518,7 +507,6 @@ class AsyncTextClient:
518
507
  speaker_gender=speaker_gender,
519
508
  mode=mode,
520
509
  model=model,
521
- enable_preprocessing=enable_preprocessing,
522
510
  output_script=output_script,
523
511
  numerals_format=numerals_format,
524
512
  request_options=request_options,