sarvamai 0.1.22a3__py3-none-any.whl → 0.1.22a7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sarvamai/__init__.py +62 -9
- sarvamai/client.py +3 -0
- sarvamai/core/client_wrapper.py +2 -2
- sarvamai/doc_digitization_job/__init__.py +4 -0
- sarvamai/doc_digitization_job/client.py +776 -0
- sarvamai/doc_digitization_job/job.py +496 -0
- sarvamai/doc_digitization_job/raw_client.py +1176 -0
- sarvamai/requests/__init__.py +20 -0
- sarvamai/requests/audio_data.py +0 -6
- sarvamai/requests/configure_connection.py +4 -0
- sarvamai/requests/configure_connection_data.py +40 -11
- sarvamai/requests/doc_digitization_create_job_response.py +25 -0
- sarvamai/requests/doc_digitization_download_files_response.py +37 -0
- sarvamai/requests/doc_digitization_error_details.py +21 -0
- sarvamai/requests/doc_digitization_error_message.py +11 -0
- sarvamai/requests/doc_digitization_job_detail.py +64 -0
- sarvamai/requests/doc_digitization_job_parameters.py +21 -0
- sarvamai/requests/doc_digitization_job_status_response.py +65 -0
- sarvamai/requests/doc_digitization_page_error.py +24 -0
- sarvamai/requests/doc_digitization_upload_files_response.py +34 -0
- sarvamai/requests/doc_digitization_webhook_callback.py +19 -0
- sarvamai/requests/speech_to_text_job_parameters.py +43 -2
- sarvamai/requests/speech_to_text_transcription_data.py +0 -6
- sarvamai/requests/speech_to_text_translate_job_parameters.py +4 -1
- sarvamai/requests/speech_to_text_translate_transcription_data.py +0 -6
- sarvamai/speech_to_text/client.py +95 -10
- sarvamai/speech_to_text/raw_client.py +95 -10
- sarvamai/speech_to_text_job/client.py +60 -15
- sarvamai/speech_to_text_job/job.py +100 -2
- sarvamai/speech_to_text_job/raw_client.py +14 -10
- sarvamai/speech_to_text_streaming/__init__.py +4 -2
- sarvamai/speech_to_text_streaming/client.py +100 -47
- sarvamai/speech_to_text_streaming/raw_client.py +100 -47
- sarvamai/speech_to_text_streaming/types/__init__.py +4 -2
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_input_audio_codec.py +1 -27
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_mode.py +7 -0
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_model.py +5 -0
- sarvamai/speech_to_text_translate_job/job.py +100 -2
- sarvamai/speech_to_text_translate_job/raw_client.py +14 -10
- sarvamai/speech_to_text_translate_streaming/__init__.py +0 -2
- sarvamai/speech_to_text_translate_streaming/client.py +18 -41
- sarvamai/speech_to_text_translate_streaming/raw_client.py +18 -41
- sarvamai/speech_to_text_translate_streaming/types/__init__.py +0 -4
- sarvamai/speech_to_text_translate_streaming/types/speech_to_text_translate_streaming_input_audio_codec.py +1 -27
- sarvamai/text/client.py +0 -12
- sarvamai/text/raw_client.py +0 -12
- sarvamai/text_to_speech/client.py +116 -14
- sarvamai/text_to_speech/raw_client.py +116 -14
- sarvamai/text_to_speech_streaming/__init__.py +2 -2
- sarvamai/text_to_speech_streaming/client.py +19 -6
- sarvamai/text_to_speech_streaming/raw_client.py +19 -6
- sarvamai/text_to_speech_streaming/types/__init__.py +2 -1
- sarvamai/text_to_speech_streaming/types/text_to_speech_streaming_model.py +5 -0
- sarvamai/types/__init__.py +34 -4
- sarvamai/types/audio_data.py +0 -6
- sarvamai/types/completion_event_flag.py +3 -1
- sarvamai/types/configure_connection.py +4 -0
- sarvamai/types/configure_connection_data.py +40 -11
- sarvamai/types/configure_connection_data_model.py +5 -0
- sarvamai/types/configure_connection_data_speaker.py +35 -1
- sarvamai/types/doc_digitization_create_job_response.py +37 -0
- sarvamai/types/doc_digitization_download_files_response.py +47 -0
- sarvamai/types/doc_digitization_error_code.py +15 -0
- sarvamai/types/doc_digitization_error_details.py +33 -0
- sarvamai/types/doc_digitization_error_message.py +23 -0
- sarvamai/types/doc_digitization_job_detail.py +74 -0
- sarvamai/types/doc_digitization_job_detail_state.py +7 -0
- sarvamai/types/doc_digitization_job_parameters.py +33 -0
- sarvamai/types/doc_digitization_job_state.py +7 -0
- sarvamai/types/doc_digitization_job_status_response.py +75 -0
- sarvamai/types/doc_digitization_output_format.py +5 -0
- sarvamai/types/doc_digitization_page_error.py +36 -0
- sarvamai/types/doc_digitization_supported_language.py +32 -0
- sarvamai/types/doc_digitization_upload_files_response.py +44 -0
- sarvamai/types/doc_digitization_webhook_callback.py +31 -0
- sarvamai/types/mode.py +5 -0
- sarvamai/types/speech_to_text_job_parameters.py +43 -2
- sarvamai/types/speech_to_text_model.py +1 -1
- sarvamai/types/speech_to_text_transcription_data.py +0 -6
- sarvamai/types/speech_to_text_translate_job_parameters.py +4 -1
- sarvamai/types/speech_to_text_translate_transcription_data.py +0 -6
- sarvamai/types/text_to_speech_model.py +1 -1
- sarvamai/types/text_to_speech_speaker.py +35 -1
- {sarvamai-0.1.22a3.dist-info → sarvamai-0.1.22a7.dist-info}/METADATA +1 -1
- {sarvamai-0.1.22a3.dist-info → sarvamai-0.1.22a7.dist-info}/RECORD +86 -56
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_stream_ongoing_speech_results.py +0 -5
- sarvamai/speech_to_text_translate_streaming/types/speech_to_text_translate_streaming_stream_ongoing_speech_results.py +0 -5
- sarvamai/types/audio_data_input_audio_codec.py +0 -33
- sarvamai/types/response_speech_state.py +0 -7
- {sarvamai-0.1.22a3.dist-info → sarvamai-0.1.22a7.dist-info}/WHEEL +0 -0
sarvamai/speech_to_text_translate_job/job.py
CHANGED

@@ -150,9 +150,58 @@ class AsyncSpeechToTextTranslateJob:
                 "output_file": detail.outputs[0].file_name,
             }
             for detail in (job_status.job_details or [])
-            if detail.inputs and detail.outputs
+            if detail.inputs and detail.outputs and detail.state == "Success"
         ]

+    async def get_file_results(
+        self,
+    ) -> typing.Dict[str, typing.List[typing.Dict[str, typing.Any]]]:
+        """
+        Get detailed results for each file in the batch job.
+
+        Returns
+        -------
+        Dict[str, List[Dict[str, Any]]]
+            Dictionary with 'successful' and 'failed' keys, each containing a list of file details.
+            Each file detail includes:
+            - 'file_name': Name of the input file
+            - 'status': Status of processing ('Success' or 'Failed')
+            - 'error_message': Error message if failed (None if successful)
+            - 'output_file': Name of output file if successful (None if failed)
+        """
+        job_status = await self.get_status()
+        results: typing.Dict[str, typing.List[typing.Dict[str, typing.Any]]] = {
+            "successful": [],
+            "failed": [],
+        }
+
+        for detail in job_status.job_details or []:
+            # Check for empty lists explicitly
+            if not detail.inputs or len(detail.inputs) == 0:
+                continue
+
+            try:
+                file_info = {
+                    "file_name": detail.inputs[0].file_name,
+                    "status": detail.state,
+                    "error_message": detail.error_message,
+                    "output_file": (
+                        detail.outputs[0].file_name
+                        if detail.outputs and len(detail.outputs) > 0
+                        else None
+                    ),
+                }
+
+                if detail.state == "Success":
+                    results["successful"].append(file_info)
+                else:
+                    results["failed"].append(file_info)
+            except (IndexError, AttributeError):
+                # Skip malformed job details
+                continue
+
+        return results
+
     async def download_outputs(self, output_dir: str) -> bool:
         """
         Download output files to the specified directory.

@@ -395,9 +444,58 @@ class SpeechToTextTranslateJob:
                 "output_file": detail.outputs[0].file_name,
             }
             for detail in (job_status.job_details or [])
-            if detail.inputs and detail.outputs
+            if detail.inputs and detail.outputs and detail.state == "Success"
         ]

+    def get_file_results(
+        self,
+    ) -> typing.Dict[str, typing.List[typing.Dict[str, typing.Any]]]:
+        """
+        Get detailed results for each file in the batch job.
+
+        Returns
+        -------
+        Dict[str, List[Dict[str, Any]]]
+            Dictionary with 'successful' and 'failed' keys, each containing a list of file details.
+            Each file detail includes:
+            - 'file_name': Name of the input file
+            - 'status': Status of processing ('Success' or 'Failed')
+            - 'error_message': Error message if failed (None if successful)
+            - 'output_file': Name of output file if successful (None if failed)
+        """
+        job_status = self.get_status()
+        results: typing.Dict[str, typing.List[typing.Dict[str, typing.Any]]] = {
+            "successful": [],
+            "failed": [],
+        }
+
+        for detail in job_status.job_details or []:
+            # Check for empty lists explicitly
+            if not detail.inputs or len(detail.inputs) == 0:
+                continue
+
+            try:
+                file_info = {
+                    "file_name": detail.inputs[0].file_name,
+                    "status": detail.state,
+                    "error_message": detail.error_message,
+                    "output_file": (
+                        detail.outputs[0].file_name
+                        if detail.outputs and len(detail.outputs) > 0
+                        else None
+                    ),
+                }
+
+                if detail.state == "Success":
+                    results["successful"].append(file_info)
+                else:
+                    results["failed"].append(file_info)
+            except (IndexError, AttributeError):
+                # Skip malformed job details
+                continue
+
+        return results
+
     def download_outputs(self, output_dir: str) -> bool:
         """
         Download output files to the specified directory.
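The new get_file_results() helper gives per-file success/failure triage for a batch job. A minimal usage sketch (how the `job` handle is obtained is assumed here; only get_file_results() and its return shape come from the diff above):

    # Sketch: triage a finished batch job. `job` is assumed to be a
    # SpeechToTextTranslateJob handle from the SDK's batch workflow;
    # get_file_results() and the dict shape are taken from the diff above.
    results = job.get_file_results()

    for item in results["successful"]:
        print(f"ok: {item['file_name']} -> {item['output_file']}")

    for item in results["failed"]:
        print(f"failed: {item['file_name']}: {item['error_message']}")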
sarvamai/speech_to_text_translate_job/raw_client.py
CHANGED

@@ -40,7 +40,7 @@ class RawSpeechToTextTranslateJobClient:
         request_options: typing.Optional[RequestOptions] = None,
     ) -> HttpResponse[BulkJobInitResponseV1]:
         """
-
+        Create a new speech to text translate bulk job and receive a job UUID and storage folder details for processing multiple audio files with translation

         Parameters
         ----------

@@ -166,7 +166,9 @@ class RawSpeechToTextTranslateJobClient:
         self, job_id: str, *, request_options: typing.Optional[RequestOptions] = None
     ) -> HttpResponse[JobStatusV1Response]:
         """
-
+        Retrieve the current status and details of a speech to text translate bulk job, including progress and file-level information.
+
+        **Rate Limiting Best Practice:** To prevent rate limit errors and ensure optimal server performance, we recommend implementing a minimum 5-millisecond delay between consecutive status polling requests. This helps maintain system stability while still providing timely status updates.

         Parameters
         ----------

@@ -276,7 +278,7 @@ class RawSpeechToTextTranslateJobClient:
         request_options: typing.Optional[RequestOptions] = None,
     ) -> HttpResponse[JobStatusV1Response]:
         """
-        Start a speech to text translate bulk job
+        Start processing a speech to text translate bulk job after all audio files have been uploaded

         Parameters
         ----------

@@ -392,7 +394,7 @@ class RawSpeechToTextTranslateJobClient:
         request_options: typing.Optional[RequestOptions] = None,
     ) -> HttpResponse[FilesUploadResponse]:
         """
-
+        Generate presigned upload URLs for audio files that will be processed in a speech to text translate bulk job

         Parameters
         ----------

@@ -517,7 +519,7 @@ class RawSpeechToTextTranslateJobClient:
         request_options: typing.Optional[RequestOptions] = None,
     ) -> HttpResponse[FilesDownloadResponse]:
         """
-
+        Generate presigned download URLs for the translated transcription output files of a completed speech to text translate bulk job

         Parameters
         ----------

@@ -647,7 +649,7 @@ class AsyncRawSpeechToTextTranslateJobClient:
         request_options: typing.Optional[RequestOptions] = None,
     ) -> AsyncHttpResponse[BulkJobInitResponseV1]:
         """
-
+        Create a new speech to text translate bulk job and receive a job UUID and storage folder details for processing multiple audio files with translation

         Parameters
         ----------

@@ -773,7 +775,9 @@ class AsyncRawSpeechToTextTranslateJobClient:
         self, job_id: str, *, request_options: typing.Optional[RequestOptions] = None
     ) -> AsyncHttpResponse[JobStatusV1Response]:
         """
-
+        Retrieve the current status and details of a speech to text translate bulk job, including progress and file-level information.
+
+        **Rate Limiting Best Practice:** To prevent rate limit errors and ensure optimal server performance, we recommend implementing a minimum 5-millisecond delay between consecutive status polling requests. This helps maintain system stability while still providing timely status updates.

         Parameters
         ----------

@@ -883,7 +887,7 @@ class AsyncRawSpeechToTextTranslateJobClient:
         request_options: typing.Optional[RequestOptions] = None,
     ) -> AsyncHttpResponse[JobStatusV1Response]:
         """
-        Start a speech to text translate bulk job
+        Start processing a speech to text translate bulk job after all audio files have been uploaded

         Parameters
         ----------

@@ -999,7 +1003,7 @@ class AsyncRawSpeechToTextTranslateJobClient:
         request_options: typing.Optional[RequestOptions] = None,
     ) -> AsyncHttpResponse[FilesUploadResponse]:
         """
-
+        Generate presigned upload URLs for audio files that will be processed in a speech to text translate bulk job

         Parameters
         ----------

@@ -1124,7 +1128,7 @@ class AsyncRawSpeechToTextTranslateJobClient:
         request_options: typing.Optional[RequestOptions] = None,
     ) -> AsyncHttpResponse[FilesDownloadResponse]:
         """
-
+        Generate presigned download URLs for the translated transcription output files of a completed speech to text translate bulk job

         Parameters
         ----------
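The status docstrings above recommend at least a 5-millisecond gap between consecutive polls. A hedged polling sketch (the method name get_job_status, the response fields, and the terminal state names are assumptions; only the JobStatusV1Response type and the 5 ms guidance appear in the diff):

    import time

    def wait_for_completion(client, job_id: str, interval_s: float = 1.0):
        # Poll a bulk job until it reaches a terminal state. Method and field
        # names below are hypothetical; the >= 5 ms spacing follows the
        # docstring's rate-limiting guidance (a 1 s interval is far coarser).
        while True:
            response = client.get_job_status(job_id)           # hypothetical method name
            state = getattr(response.data, "job_state", None)  # hypothetical field
            if state in ("Completed", "Failed"):               # hypothetical state names
                return response
            time.sleep(max(interval_s, 0.005))  # never poll faster than 5 ms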
sarvamai/speech_to_text_translate_streaming/__init__.py
CHANGED

@@ -6,7 +6,6 @@ from .types import (
     SpeechToTextTranslateStreamingFlushSignal,
     SpeechToTextTranslateStreamingHighVadSensitivity,
     SpeechToTextTranslateStreamingInputAudioCodec,
-    SpeechToTextTranslateStreamingStreamOngoingSpeechResults,
     SpeechToTextTranslateStreamingVadSignals,
 )

@@ -14,6 +13,5 @@ __all__ = [
     "SpeechToTextTranslateStreamingFlushSignal",
     "SpeechToTextTranslateStreamingHighVadSensitivity",
     "SpeechToTextTranslateStreamingInputAudioCodec",
-    "SpeechToTextTranslateStreamingStreamOngoingSpeechResults",
     "SpeechToTextTranslateStreamingVadSignals",
 ]
sarvamai/speech_to_text_translate_streaming/client.py
CHANGED

@@ -16,9 +16,6 @@ from .types.speech_to_text_translate_streaming_high_vad_sensitivity import (
     SpeechToTextTranslateStreamingHighVadSensitivity,
 )
 from .types.speech_to_text_translate_streaming_input_audio_codec import SpeechToTextTranslateStreamingInputAudioCodec
-from .types.speech_to_text_translate_streaming_stream_ongoing_speech_results import (
-    SpeechToTextTranslateStreamingStreamOngoingSpeechResults,
-)
 from .types.speech_to_text_translate_streaming_vad_signals import SpeechToTextTranslateStreamingVadSignals

 try:

@@ -47,13 +44,11 @@ class SpeechToTextTranslateStreamingClient:
         self,
         *,
         model: typing.Optional[typing.Literal["saaras:v2.5"]] = None,
-        input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
         sample_rate: typing.Optional[str] = None,
         high_vad_sensitivity: typing.Optional[SpeechToTextTranslateStreamingHighVadSensitivity] = None,
         vad_signals: typing.Optional[SpeechToTextTranslateStreamingVadSignals] = None,
         flush_signal: typing.Optional[SpeechToTextTranslateStreamingFlushSignal] = None,
-
-        streaming_ongoing_requests_frame_size: typing.Optional[str] = None,
+        input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
         api_subscription_key: typing.Optional[str] = None,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> typing.Iterator[SpeechToTextTranslateStreamingSocketClient]:

@@ -67,10 +62,10 @@ class SpeechToTextTranslateStreamingClient:
         Parameters
         ----------
         model : typing.Optional[typing.Literal["saaras:v2.5"]]
-
+            Model to be used for speech to text translation.

-
-
+            - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
+            - Example: Hindi audio → English text output

         sample_rate : typing.Optional[str]
             Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.

@@ -84,11 +79,9 @@ class SpeechToTextTranslateStreamingClient:
         flush_signal : typing.Optional[SpeechToTextTranslateStreamingFlushSignal]
             Signal to flush the audio buffer and finalize transcription and translation

-
-
-
-        streaming_ongoing_requests_frame_size : typing.Optional[str]
-            Frame size for streaming ongoing speech results (1-100)
+        input_audio_codec : typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec]
+            Audio codec/format of the input stream. Use this when sending raw PCM audio.
+            Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.

         api_subscription_key : typing.Optional[str]
             API subscription key for authentication

@@ -104,8 +97,6 @@ class SpeechToTextTranslateStreamingClient:
         query_params = httpx.QueryParams()
         if model is not None:
             query_params = query_params.add("model", model)
-        if input_audio_codec is not None:
-            query_params = query_params.add("input_audio_codec", input_audio_codec)
         if sample_rate is not None:
             query_params = query_params.add("sample_rate", sample_rate)
         if high_vad_sensitivity is not None:

@@ -114,12 +105,8 @@ class SpeechToTextTranslateStreamingClient:
             query_params = query_params.add("vad_signals", vad_signals)
         if flush_signal is not None:
             query_params = query_params.add("flush_signal", flush_signal)
-        if
-            query_params = query_params.add("
-        if streaming_ongoing_requests_frame_size is not None:
-            query_params = query_params.add(
-                "streaming_ongoing_requests_frame_size", streaming_ongoing_requests_frame_size
-            )
+        if input_audio_codec is not None:
+            query_params = query_params.add("input_audio_codec", input_audio_codec)
         ws_url = ws_url + f"?{query_params}"
         headers = self._raw_client._client_wrapper.get_headers()
         if api_subscription_key is not None:

@@ -164,13 +151,11 @@ class AsyncSpeechToTextTranslateStreamingClient:
         self,
         *,
         model: typing.Optional[typing.Literal["saaras:v2.5"]] = None,
-        input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
         sample_rate: typing.Optional[str] = None,
         high_vad_sensitivity: typing.Optional[SpeechToTextTranslateStreamingHighVadSensitivity] = None,
         vad_signals: typing.Optional[SpeechToTextTranslateStreamingVadSignals] = None,
         flush_signal: typing.Optional[SpeechToTextTranslateStreamingFlushSignal] = None,
-
-        streaming_ongoing_requests_frame_size: typing.Optional[str] = None,
+        input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
         api_subscription_key: typing.Optional[str] = None,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> typing.AsyncIterator[AsyncSpeechToTextTranslateStreamingSocketClient]:

@@ -184,10 +169,10 @@ class AsyncSpeechToTextTranslateStreamingClient:
         Parameters
         ----------
         model : typing.Optional[typing.Literal["saaras:v2.5"]]
-
+            Model to be used for speech to text translation.

-
-
+            - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
+            - Example: Hindi audio → English text output

         sample_rate : typing.Optional[str]
             Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.

@@ -201,11 +186,9 @@ class AsyncSpeechToTextTranslateStreamingClient:
         flush_signal : typing.Optional[SpeechToTextTranslateStreamingFlushSignal]
             Signal to flush the audio buffer and finalize transcription and translation

-
-
-
-        streaming_ongoing_requests_frame_size : typing.Optional[str]
-            Frame size for streaming ongoing speech results (1-100)
+        input_audio_codec : typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec]
+            Audio codec/format of the input stream. Use this when sending raw PCM audio.
+            Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.

         api_subscription_key : typing.Optional[str]
             API subscription key for authentication

@@ -221,8 +204,6 @@ class AsyncSpeechToTextTranslateStreamingClient:
         query_params = httpx.QueryParams()
         if model is not None:
             query_params = query_params.add("model", model)
-        if input_audio_codec is not None:
-            query_params = query_params.add("input_audio_codec", input_audio_codec)
         if sample_rate is not None:
             query_params = query_params.add("sample_rate", sample_rate)
         if high_vad_sensitivity is not None:

@@ -231,12 +212,8 @@ class AsyncSpeechToTextTranslateStreamingClient:
             query_params = query_params.add("vad_signals", vad_signals)
         if flush_signal is not None:
             query_params = query_params.add("flush_signal", flush_signal)
-        if
-            query_params = query_params.add("
-        if streaming_ongoing_requests_frame_size is not None:
-            query_params = query_params.add(
-                "streaming_ongoing_requests_frame_size", streaming_ongoing_requests_frame_size
-            )
+        if input_audio_codec is not None:
+            query_params = query_params.add("input_audio_codec", input_audio_codec)
         ws_url = ws_url + f"?{query_params}"
         headers = self._raw_client._client_wrapper.get_headers()
         if api_subscription_key is not None:
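Net effect of the streaming client changes: the half-written streaming_ongoing_requests_frame_size parameter (note the truncated "if" / "query_params.add(" lines it left in 0.1.22a3) is removed, input_audio_codec moves to the end of the keyword list, and the model and codec docstrings are filled in. A hedged connection sketch (the connect() entry point and context-manager usage are assumptions inferred from the Iterator[SpeechToTextTranslateStreamingSocketClient] return type; the keyword names come from the signature above):

    # Hypothetical wiring; only the keyword arguments are from the diff.
    with client.speech_to_text_translate_streaming.connect(  # name assumed
        model="saaras:v2.5",            # default Indic-to-English translation model
        sample_rate="16000",            # 16 kHz; 8 kHz is also accepted here
        input_audio_codec="pcm_s16le",  # wav, pcm_l16, pcm_raw also supported
    ) as socket:
        pass  # stream audio chunks and read translated transcripts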
sarvamai/speech_to_text_translate_streaming/raw_client.py
CHANGED

@@ -15,9 +15,6 @@ from .types.speech_to_text_translate_streaming_high_vad_sensitivity import (
     SpeechToTextTranslateStreamingHighVadSensitivity,
 )
 from .types.speech_to_text_translate_streaming_input_audio_codec import SpeechToTextTranslateStreamingInputAudioCodec
-from .types.speech_to_text_translate_streaming_stream_ongoing_speech_results import (
-    SpeechToTextTranslateStreamingStreamOngoingSpeechResults,
-)
 from .types.speech_to_text_translate_streaming_vad_signals import SpeechToTextTranslateStreamingVadSignals

 try:

@@ -35,13 +32,11 @@ class RawSpeechToTextTranslateStreamingClient:
         self,
         *,
         model: typing.Optional[typing.Literal["saaras:v2.5"]] = None,
-        input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
         sample_rate: typing.Optional[str] = None,
         high_vad_sensitivity: typing.Optional[SpeechToTextTranslateStreamingHighVadSensitivity] = None,
         vad_signals: typing.Optional[SpeechToTextTranslateStreamingVadSignals] = None,
         flush_signal: typing.Optional[SpeechToTextTranslateStreamingFlushSignal] = None,
-
-        streaming_ongoing_requests_frame_size: typing.Optional[str] = None,
+        input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
         api_subscription_key: typing.Optional[str] = None,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> typing.Iterator[SpeechToTextTranslateStreamingSocketClient]:

@@ -55,10 +50,10 @@ class RawSpeechToTextTranslateStreamingClient:
         Parameters
         ----------
         model : typing.Optional[typing.Literal["saaras:v2.5"]]
-
+            Model to be used for speech to text translation.

-
-
+            - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
+            - Example: Hindi audio → English text output

         sample_rate : typing.Optional[str]
             Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.

@@ -72,11 +67,9 @@ class RawSpeechToTextTranslateStreamingClient:
         flush_signal : typing.Optional[SpeechToTextTranslateStreamingFlushSignal]
             Signal to flush the audio buffer and finalize transcription and translation

-
-
-
-        streaming_ongoing_requests_frame_size : typing.Optional[str]
-            Frame size for streaming ongoing speech results (1-100)
+        input_audio_codec : typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec]
+            Audio codec/format of the input stream. Use this when sending raw PCM audio.
+            Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.

         api_subscription_key : typing.Optional[str]
             API subscription key for authentication

@@ -92,8 +85,6 @@ class RawSpeechToTextTranslateStreamingClient:
         query_params = httpx.QueryParams()
         if model is not None:
             query_params = query_params.add("model", model)
-        if input_audio_codec is not None:
-            query_params = query_params.add("input_audio_codec", input_audio_codec)
         if sample_rate is not None:
             query_params = query_params.add("sample_rate", sample_rate)
         if high_vad_sensitivity is not None:

@@ -102,12 +93,8 @@ class RawSpeechToTextTranslateStreamingClient:
             query_params = query_params.add("vad_signals", vad_signals)
         if flush_signal is not None:
             query_params = query_params.add("flush_signal", flush_signal)
-        if
-            query_params = query_params.add("
-        if streaming_ongoing_requests_frame_size is not None:
-            query_params = query_params.add(
-                "streaming_ongoing_requests_frame_size", streaming_ongoing_requests_frame_size
-            )
+        if input_audio_codec is not None:
+            query_params = query_params.add("input_audio_codec", input_audio_codec)
         ws_url = ws_url + f"?{query_params}"
         headers = self._client_wrapper.get_headers()
         if api_subscription_key is not None:

@@ -141,13 +128,11 @@ class AsyncRawSpeechToTextTranslateStreamingClient:
         self,
         *,
         model: typing.Optional[typing.Literal["saaras:v2.5"]] = None,
-        input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
         sample_rate: typing.Optional[str] = None,
         high_vad_sensitivity: typing.Optional[SpeechToTextTranslateStreamingHighVadSensitivity] = None,
         vad_signals: typing.Optional[SpeechToTextTranslateStreamingVadSignals] = None,
         flush_signal: typing.Optional[SpeechToTextTranslateStreamingFlushSignal] = None,
-
-        streaming_ongoing_requests_frame_size: typing.Optional[str] = None,
+        input_audio_codec: typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec] = None,
         api_subscription_key: typing.Optional[str] = None,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> typing.AsyncIterator[AsyncSpeechToTextTranslateStreamingSocketClient]:

@@ -161,10 +146,10 @@ class AsyncRawSpeechToTextTranslateStreamingClient:
         Parameters
         ----------
         model : typing.Optional[typing.Literal["saaras:v2.5"]]
-
+            Model to be used for speech to text translation.

-
-
+            - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
+            - Example: Hindi audio → English text output

         sample_rate : typing.Optional[str]
             Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.

@@ -178,11 +163,9 @@ class AsyncRawSpeechToTextTranslateStreamingClient:
         flush_signal : typing.Optional[SpeechToTextTranslateStreamingFlushSignal]
             Signal to flush the audio buffer and finalize transcription and translation

-
-
-
-        streaming_ongoing_requests_frame_size : typing.Optional[str]
-            Frame size for streaming ongoing speech results (1-100)
+        input_audio_codec : typing.Optional[SpeechToTextTranslateStreamingInputAudioCodec]
+            Audio codec/format of the input stream. Use this when sending raw PCM audio.
+            Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.

         api_subscription_key : typing.Optional[str]
             API subscription key for authentication

@@ -198,8 +181,6 @@ class AsyncRawSpeechToTextTranslateStreamingClient:
         query_params = httpx.QueryParams()
         if model is not None:
             query_params = query_params.add("model", model)
-        if input_audio_codec is not None:
-            query_params = query_params.add("input_audio_codec", input_audio_codec)
         if sample_rate is not None:
             query_params = query_params.add("sample_rate", sample_rate)
         if high_vad_sensitivity is not None:

@@ -208,12 +189,8 @@ class AsyncRawSpeechToTextTranslateStreamingClient:
             query_params = query_params.add("vad_signals", vad_signals)
         if flush_signal is not None:
             query_params = query_params.add("flush_signal", flush_signal)
-        if
-            query_params = query_params.add("
-        if streaming_ongoing_requests_frame_size is not None:
-            query_params = query_params.add(
-                "streaming_ongoing_requests_frame_size", streaming_ongoing_requests_frame_size
-            )
+        if input_audio_codec is not None:
+            query_params = query_params.add("input_audio_codec", input_audio_codec)
         ws_url = ws_url + f"?{query_params}"
         headers = self._client_wrapper.get_headers()
         if api_subscription_key is not None:
sarvamai/speech_to_text_translate_streaming/types/__init__.py
CHANGED

@@ -5,15 +5,11 @@
 from .speech_to_text_translate_streaming_flush_signal import SpeechToTextTranslateStreamingFlushSignal
 from .speech_to_text_translate_streaming_high_vad_sensitivity import SpeechToTextTranslateStreamingHighVadSensitivity
 from .speech_to_text_translate_streaming_input_audio_codec import SpeechToTextTranslateStreamingInputAudioCodec
-from .speech_to_text_translate_streaming_stream_ongoing_speech_results import (
-    SpeechToTextTranslateStreamingStreamOngoingSpeechResults,
-)
 from .speech_to_text_translate_streaming_vad_signals import SpeechToTextTranslateStreamingVadSignals

 __all__ = [
     "SpeechToTextTranslateStreamingFlushSignal",
     "SpeechToTextTranslateStreamingHighVadSensitivity",
     "SpeechToTextTranslateStreamingInputAudioCodec",
-    "SpeechToTextTranslateStreamingStreamOngoingSpeechResults",
     "SpeechToTextTranslateStreamingVadSignals",
 ]
sarvamai/speech_to_text_translate_streaming/types/speech_to_text_translate_streaming_input_audio_codec.py
CHANGED

@@ -3,31 +3,5 @@
 import typing

 SpeechToTextTranslateStreamingInputAudioCodec = typing.Union[
-    typing.Literal[
-        "wav",
-        "x-wav",
-        "wave",
-        "mp3",
-        "mpeg",
-        "mpeg3",
-        "x-mp3",
-        "x-mpeg-3",
-        "aac",
-        "x-aac",
-        "aiff",
-        "x-aiff",
-        "ogg",
-        "opus",
-        "flac",
-        "x-flac",
-        "mp4",
-        "x-m4a",
-        "amr",
-        "x-ms-wma",
-        "webm",
-        "pcm_s16le",
-        "pcm_l16",
-        "pcm_raw",
-    ],
-    typing.Any,
+    typing.Literal["wav", "pcm_s16le", "pcm_l16", "pcm_raw"], typing.Any
 ]
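The codec alias drops the container formats (mp3, ogg, flac, and so on) and keeps only WAV and the raw-PCM values, matching the "raw PCM" wording in the new docstrings. Because the union still carries a typing.Any arm, the narrowing documents intent rather than enforcing it statically; a short illustration of that design choice:

    import typing

    Codec = typing.Union[typing.Literal["wav", "pcm_s16le", "pcm_l16", "pcm_raw"], typing.Any]

    good: Codec = "pcm_s16le"  # one of the four documented values
    other: Codec = "mp3"       # still type-checks because of the typing.Any arm,
                               # but is no longer an advertised codec in 0.1.22a7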
sarvamai/text/client.py
CHANGED

@@ -47,7 +47,6 @@ class TextClient:
         speaker_gender: typing.Optional[TranslateSpeakerGender] = OMIT,
         mode: typing.Optional[TranslateMode] = OMIT,
         model: typing.Optional[TranslateModel] = OMIT,
-        enable_preprocessing: typing.Optional[bool] = OMIT,
         output_script: typing.Optional[TransliterateMode] = OMIT,
         numerals_format: typing.Optional[NumeralsFormat] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,

@@ -125,10 +124,6 @@ class TextClient:
         - mayura:v1: Supports 12 languages with all modes, output scripts, and automatic language detection.
         - sarvam-translate:v1: Supports all 22 scheduled languages of India, formal mode only.

-        enable_preprocessing : typing.Optional[bool]
-            This will enable custom preprocessing of the input text which can result in better translations.
-            Recommendation- You can switch on whenever there is some complex text with difficult vocabulary and sentences, for which you want simple translations that people can understand.
-
         output_script : typing.Optional[TransliterateMode]
             **output_script**: This is an optional parameter which controls the transliteration style applied to the output text.

@@ -186,7 +181,6 @@ class TextClient:
             speaker_gender=speaker_gender,
             mode=mode,
             model=model,
-            enable_preprocessing=enable_preprocessing,
             output_script=output_script,
             numerals_format=numerals_format,
             request_options=request_options,

@@ -371,7 +365,6 @@ class AsyncTextClient:
         speaker_gender: typing.Optional[TranslateSpeakerGender] = OMIT,
         mode: typing.Optional[TranslateMode] = OMIT,
         model: typing.Optional[TranslateModel] = OMIT,
-        enable_preprocessing: typing.Optional[bool] = OMIT,
         output_script: typing.Optional[TransliterateMode] = OMIT,
         numerals_format: typing.Optional[NumeralsFormat] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,

@@ -449,10 +442,6 @@ class AsyncTextClient:
         - mayura:v1: Supports 12 languages with all modes, output scripts, and automatic language detection.
         - sarvam-translate:v1: Supports all 22 scheduled languages of India, formal mode only.

-        enable_preprocessing : typing.Optional[bool]
-            This will enable custom preprocessing of the input text which can result in better translations.
-            Recommendation- You can switch on whenever there is some complex text with difficult vocabulary and sentences, for which you want simple translations that people can understand.
-
         output_script : typing.Optional[TransliterateMode]
             **output_script**: This is an optional parameter which controls the transliteration style applied to the output text.

@@ -518,7 +507,6 @@ class AsyncTextClient:
             speaker_gender=speaker_gender,
             mode=mode,
             model=model,
-            enable_preprocessing=enable_preprocessing,
             output_script=output_script,
             numerals_format=numerals_format,
             request_options=request_options,
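Since enable_preprocessing is removed from both TextClient and AsyncTextClient, callers upgrading from 0.1.22a3 must drop the keyword or they will hit a TypeError. A hedged migration sketch (the translate method name and the other keyword names are assumptions based on the Translate* types above; only the removal itself is established by the diff):

    from sarvamai import SarvamAI  # client class name assumed

    client = SarvamAI(api_subscription_key="...")

    # 0.1.22a3 call shape (no longer valid):
    #   client.text.translate(..., enable_preprocessing=True, ...)

    # 0.1.22a7 call shape: simply omit the removed keyword.
    response = client.text.translate(    # method name assumed
        input="यह एक उदाहरण है",          # kwargs here are illustrative
        source_language_code="hi-IN",
        target_language_code="en-IN",
        model="mayura:v1",
    )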