sarvamai 0.1.23a3__py3-none-any.whl → 0.1.23a5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sarvamai/__init__.py +203 -405
- sarvamai/chat/raw_client.py +20 -20
- sarvamai/client.py +34 -186
- sarvamai/core/__init__.py +21 -76
- sarvamai/core/client_wrapper.py +3 -19
- sarvamai/core/force_multipart.py +2 -4
- sarvamai/core/http_client.py +97 -217
- sarvamai/core/http_response.py +1 -1
- sarvamai/core/jsonable_encoder.py +0 -8
- sarvamai/core/pydantic_utilities.py +4 -110
- sarvamai/errors/__init__.py +6 -40
- sarvamai/errors/bad_request_error.py +1 -1
- sarvamai/errors/forbidden_error.py +1 -1
- sarvamai/errors/internal_server_error.py +1 -1
- sarvamai/errors/service_unavailable_error.py +1 -1
- sarvamai/errors/too_many_requests_error.py +1 -1
- sarvamai/errors/unprocessable_entity_error.py +1 -1
- sarvamai/requests/__init__.py +62 -150
- sarvamai/requests/configure_connection.py +4 -0
- sarvamai/requests/configure_connection_data.py +40 -11
- sarvamai/requests/error_response_data.py +1 -1
- sarvamai/requests/file_signed_url_details.py +1 -1
- sarvamai/requests/speech_to_text_job_parameters.py +43 -2
- sarvamai/requests/speech_to_text_transcription_data.py +2 -2
- sarvamai/requests/speech_to_text_translate_job_parameters.py +4 -1
- sarvamai/speech_to_text/client.py +95 -10
- sarvamai/speech_to_text/raw_client.py +147 -64
- sarvamai/speech_to_text_job/client.py +60 -15
- sarvamai/speech_to_text_job/raw_client.py +120 -120
- sarvamai/speech_to_text_streaming/__init__.py +10 -38
- sarvamai/speech_to_text_streaming/client.py +90 -8
- sarvamai/speech_to_text_streaming/raw_client.py +90 -8
- sarvamai/speech_to_text_streaming/types/__init__.py +8 -36
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_mode.py +7 -0
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_model.py +5 -0
- sarvamai/speech_to_text_translate_job/raw_client.py +120 -120
- sarvamai/speech_to_text_translate_streaming/__init__.py +5 -36
- sarvamai/speech_to_text_translate_streaming/client.py +8 -2
- sarvamai/speech_to_text_translate_streaming/raw_client.py +8 -2
- sarvamai/speech_to_text_translate_streaming/types/__init__.py +3 -36
- sarvamai/text/raw_client.py +60 -60
- sarvamai/text_to_speech/client.py +100 -16
- sarvamai/text_to_speech/raw_client.py +120 -36
- sarvamai/text_to_speech_streaming/__init__.py +2 -29
- sarvamai/text_to_speech_streaming/client.py +19 -6
- sarvamai/text_to_speech_streaming/raw_client.py +19 -6
- sarvamai/text_to_speech_streaming/types/__init__.py +3 -31
- sarvamai/text_to_speech_streaming/types/text_to_speech_streaming_model.py +5 -0
- sarvamai/types/__init__.py +102 -222
- sarvamai/types/chat_completion_request_message.py +2 -6
- sarvamai/types/configure_connection.py +4 -0
- sarvamai/types/configure_connection_data.py +40 -11
- sarvamai/types/configure_connection_data_model.py +5 -0
- sarvamai/types/configure_connection_data_speaker.py +35 -1
- sarvamai/types/error_response_data.py +1 -1
- sarvamai/types/file_signed_url_details.py +1 -1
- sarvamai/types/mode.py +5 -0
- sarvamai/types/speech_to_text_job_parameters.py +43 -2
- sarvamai/types/speech_to_text_model.py +1 -1
- sarvamai/types/speech_to_text_transcription_data.py +2 -2
- sarvamai/types/speech_to_text_translate_job_parameters.py +4 -1
- sarvamai/types/text_to_speech_model.py +1 -1
- sarvamai/types/text_to_speech_speaker.py +35 -1
- {sarvamai-0.1.23a3.dist-info → sarvamai-0.1.23a5.dist-info}/METADATA +1 -2
- {sarvamai-0.1.23a3.dist-info → sarvamai-0.1.23a5.dist-info}/RECORD +66 -66
- sarvamai/core/http_sse/__init__.py +0 -42
- sarvamai/core/http_sse/_api.py +0 -112
- sarvamai/core/http_sse/_decoders.py +0 -61
- sarvamai/core/http_sse/_exceptions.py +0 -7
- sarvamai/core/http_sse/_models.py +0 -17
- {sarvamai-0.1.23a3.dist-info → sarvamai-0.1.23a5.dist-info}/WHEEL +0 -0
|
@@ -2,48 +2,20 @@
|
|
|
2
2
|
|
|
3
3
|
# isort: skip_file
|
|
4
4
|
|
|
5
|
-
import
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
SpeechToTextStreamingVadSignals,
|
|
14
|
-
)
|
|
15
|
-
_dynamic_imports: typing.Dict[str, str] = {
|
|
16
|
-
"SpeechToTextStreamingFlushSignal": ".types",
|
|
17
|
-
"SpeechToTextStreamingHighVadSensitivity": ".types",
|
|
18
|
-
"SpeechToTextStreamingLanguageCode": ".types",
|
|
19
|
-
"SpeechToTextStreamingVadSignals": ".types",
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
def __getattr__(attr_name: str) -> typing.Any:
|
|
24
|
-
module_name = _dynamic_imports.get(attr_name)
|
|
25
|
-
if module_name is None:
|
|
26
|
-
raise AttributeError(f"No {attr_name} found in _dynamic_imports for module name -> {__name__}")
|
|
27
|
-
try:
|
|
28
|
-
module = import_module(module_name, __package__)
|
|
29
|
-
if module_name == f".{attr_name}":
|
|
30
|
-
return module
|
|
31
|
-
else:
|
|
32
|
-
return getattr(module, attr_name)
|
|
33
|
-
except ImportError as e:
|
|
34
|
-
raise ImportError(f"Failed to import {attr_name} from {module_name}: {e}") from e
|
|
35
|
-
except AttributeError as e:
|
|
36
|
-
raise AttributeError(f"Failed to get {attr_name} from {module_name}: {e}") from e
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
def __dir__():
|
|
40
|
-
lazy_attrs = list(_dynamic_imports.keys())
|
|
41
|
-
return sorted(lazy_attrs)
|
|
42
|
-
|
|
5
|
+
from .types import (
|
|
6
|
+
SpeechToTextStreamingFlushSignal,
|
|
7
|
+
SpeechToTextStreamingHighVadSensitivity,
|
|
8
|
+
SpeechToTextStreamingLanguageCode,
|
|
9
|
+
SpeechToTextStreamingMode,
|
|
10
|
+
SpeechToTextStreamingModel,
|
|
11
|
+
SpeechToTextStreamingVadSignals,
|
|
12
|
+
)
|
|
43
13
|
|
|
44
14
|
__all__ = [
|
|
45
15
|
"SpeechToTextStreamingFlushSignal",
|
|
46
16
|
"SpeechToTextStreamingHighVadSensitivity",
|
|
47
17
|
"SpeechToTextStreamingLanguageCode",
|
|
18
|
+
"SpeechToTextStreamingMode",
|
|
19
|
+
"SpeechToTextStreamingModel",
|
|
48
20
|
"SpeechToTextStreamingVadSignals",
|
|
49
21
|
]
|
|
@@ -14,6 +14,8 @@ from .socket_client import AsyncSpeechToTextStreamingSocketClient, SpeechToTextS
|
|
|
14
14
|
from .types.speech_to_text_streaming_flush_signal import SpeechToTextStreamingFlushSignal
|
|
15
15
|
from .types.speech_to_text_streaming_high_vad_sensitivity import SpeechToTextStreamingHighVadSensitivity
|
|
16
16
|
from .types.speech_to_text_streaming_language_code import SpeechToTextStreamingLanguageCode
|
|
17
|
+
from .types.speech_to_text_streaming_mode import SpeechToTextStreamingMode
|
|
18
|
+
from .types.speech_to_text_streaming_model import SpeechToTextStreamingModel
|
|
17
19
|
from .types.speech_to_text_streaming_vad_signals import SpeechToTextStreamingVadSignals
|
|
18
20
|
|
|
19
21
|
try:
|
|
@@ -42,7 +44,8 @@ class SpeechToTextStreamingClient:
|
|
|
42
44
|
self,
|
|
43
45
|
*,
|
|
44
46
|
language_code: SpeechToTextStreamingLanguageCode,
|
|
45
|
-
model: typing.Optional[
|
|
47
|
+
model: typing.Optional[SpeechToTextStreamingModel] = None,
|
|
48
|
+
mode: typing.Optional[SpeechToTextStreamingMode] = None,
|
|
46
49
|
sample_rate: typing.Optional[str] = None,
|
|
47
50
|
high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
|
|
48
51
|
vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
|
|
@@ -60,10 +63,47 @@ class SpeechToTextStreamingClient:
|
|
|
60
63
|
Parameters
|
|
61
64
|
----------
|
|
62
65
|
language_code : SpeechToTextStreamingLanguageCode
|
|
63
|
-
|
|
66
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
64
67
|
|
|
65
|
-
|
|
66
|
-
|
|
68
|
+
**Available Options:**
|
|
69
|
+
- `hi-IN`: Hindi
|
|
70
|
+
- `bn-IN`: Bengali
|
|
71
|
+
- `gu-IN`: Gujarati
|
|
72
|
+
- `kn-IN`: Kannada
|
|
73
|
+
- `ml-IN`: Malayalam
|
|
74
|
+
- `mr-IN`: Marathi
|
|
75
|
+
- `od-IN`: Odia
|
|
76
|
+
- `pa-IN`: Punjabi
|
|
77
|
+
- `ta-IN`: Tamil
|
|
78
|
+
- `te-IN`: Telugu
|
|
79
|
+
- `en-IN`: English
|
|
80
|
+
|
|
81
|
+
model : typing.Optional[SpeechToTextStreamingModel]
|
|
82
|
+
Specifies the model to use for speech-to-text conversion.
|
|
83
|
+
|
|
84
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
85
|
+
|
|
86
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
87
|
+
|
|
88
|
+
mode : typing.Optional[SpeechToTextStreamingMode]
|
|
89
|
+
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
90
|
+
|
|
91
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
92
|
+
|
|
93
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
94
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
95
|
+
|
|
96
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
97
|
+
- Output: `My phone number is 9840950950`
|
|
98
|
+
|
|
99
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
100
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
101
|
+
|
|
102
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
103
|
+
- Output: `mera phone number hai 9840950950`
|
|
104
|
+
|
|
105
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
106
|
+
- Output: `मेरा phone number है 9840950950`
|
|
67
107
|
|
|
68
108
|
sample_rate : typing.Optional[str]
|
|
69
109
|
Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
|
|
@@ -93,6 +133,8 @@ class SpeechToTextStreamingClient:
|
|
|
93
133
|
query_params = query_params.add("language-code", language_code)
|
|
94
134
|
if model is not None:
|
|
95
135
|
query_params = query_params.add("model", model)
|
|
136
|
+
if mode is not None:
|
|
137
|
+
query_params = query_params.add("mode", mode)
|
|
96
138
|
if sample_rate is not None:
|
|
97
139
|
query_params = query_params.add("sample_rate", sample_rate)
|
|
98
140
|
if high_vad_sensitivity is not None:
|
|
@@ -145,7 +187,8 @@ class AsyncSpeechToTextStreamingClient:
|
|
|
145
187
|
self,
|
|
146
188
|
*,
|
|
147
189
|
language_code: SpeechToTextStreamingLanguageCode,
|
|
148
|
-
model: typing.Optional[
|
|
190
|
+
model: typing.Optional[SpeechToTextStreamingModel] = None,
|
|
191
|
+
mode: typing.Optional[SpeechToTextStreamingMode] = None,
|
|
149
192
|
sample_rate: typing.Optional[str] = None,
|
|
150
193
|
high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
|
|
151
194
|
vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
|
|
@@ -163,10 +206,47 @@ class AsyncSpeechToTextStreamingClient:
|
|
|
163
206
|
Parameters
|
|
164
207
|
----------
|
|
165
208
|
language_code : SpeechToTextStreamingLanguageCode
|
|
166
|
-
|
|
209
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
210
|
+
|
|
211
|
+
**Available Options:**
|
|
212
|
+
- `hi-IN`: Hindi
|
|
213
|
+
- `bn-IN`: Bengali
|
|
214
|
+
- `gu-IN`: Gujarati
|
|
215
|
+
- `kn-IN`: Kannada
|
|
216
|
+
- `ml-IN`: Malayalam
|
|
217
|
+
- `mr-IN`: Marathi
|
|
218
|
+
- `od-IN`: Odia
|
|
219
|
+
- `pa-IN`: Punjabi
|
|
220
|
+
- `ta-IN`: Tamil
|
|
221
|
+
- `te-IN`: Telugu
|
|
222
|
+
- `en-IN`: English
|
|
223
|
+
|
|
224
|
+
model : typing.Optional[SpeechToTextStreamingModel]
|
|
225
|
+
Specifies the model to use for speech-to-text conversion.
|
|
226
|
+
|
|
227
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
228
|
+
|
|
229
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
230
|
+
|
|
231
|
+
mode : typing.Optional[SpeechToTextStreamingMode]
|
|
232
|
+
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
233
|
+
|
|
234
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
235
|
+
|
|
236
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
237
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
238
|
+
|
|
239
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
240
|
+
- Output: `My phone number is 9840950950`
|
|
241
|
+
|
|
242
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
243
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
244
|
+
|
|
245
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
246
|
+
- Output: `mera phone number hai 9840950950`
|
|
167
247
|
|
|
168
|
-
|
|
169
|
-
|
|
248
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
249
|
+
- Output: `मेरा phone number है 9840950950`
|
|
170
250
|
|
|
171
251
|
sample_rate : typing.Optional[str]
|
|
172
252
|
Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
|
|
@@ -196,6 +276,8 @@ class AsyncSpeechToTextStreamingClient:
|
|
|
196
276
|
query_params = query_params.add("language-code", language_code)
|
|
197
277
|
if model is not None:
|
|
198
278
|
query_params = query_params.add("model", model)
|
|
279
|
+
if mode is not None:
|
|
280
|
+
query_params = query_params.add("mode", mode)
|
|
199
281
|
if sample_rate is not None:
|
|
200
282
|
query_params = query_params.add("sample_rate", sample_rate)
|
|
201
283
|
if high_vad_sensitivity is not None:
|
|
@@ -13,6 +13,8 @@ from .socket_client import AsyncSpeechToTextStreamingSocketClient, SpeechToTextS
|
|
|
13
13
|
from .types.speech_to_text_streaming_flush_signal import SpeechToTextStreamingFlushSignal
|
|
14
14
|
from .types.speech_to_text_streaming_high_vad_sensitivity import SpeechToTextStreamingHighVadSensitivity
|
|
15
15
|
from .types.speech_to_text_streaming_language_code import SpeechToTextStreamingLanguageCode
|
|
16
|
+
from .types.speech_to_text_streaming_mode import SpeechToTextStreamingMode
|
|
17
|
+
from .types.speech_to_text_streaming_model import SpeechToTextStreamingModel
|
|
16
18
|
from .types.speech_to_text_streaming_vad_signals import SpeechToTextStreamingVadSignals
|
|
17
19
|
|
|
18
20
|
try:
|
|
@@ -30,7 +32,8 @@ class RawSpeechToTextStreamingClient:
|
|
|
30
32
|
self,
|
|
31
33
|
*,
|
|
32
34
|
language_code: SpeechToTextStreamingLanguageCode,
|
|
33
|
-
model: typing.Optional[
|
|
35
|
+
model: typing.Optional[SpeechToTextStreamingModel] = None,
|
|
36
|
+
mode: typing.Optional[SpeechToTextStreamingMode] = None,
|
|
34
37
|
sample_rate: typing.Optional[str] = None,
|
|
35
38
|
high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
|
|
36
39
|
vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
|
|
@@ -48,10 +51,47 @@ class RawSpeechToTextStreamingClient:
|
|
|
48
51
|
Parameters
|
|
49
52
|
----------
|
|
50
53
|
language_code : SpeechToTextStreamingLanguageCode
|
|
51
|
-
|
|
54
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
52
55
|
|
|
53
|
-
|
|
54
|
-
|
|
56
|
+
**Available Options:**
|
|
57
|
+
- `hi-IN`: Hindi
|
|
58
|
+
- `bn-IN`: Bengali
|
|
59
|
+
- `gu-IN`: Gujarati
|
|
60
|
+
- `kn-IN`: Kannada
|
|
61
|
+
- `ml-IN`: Malayalam
|
|
62
|
+
- `mr-IN`: Marathi
|
|
63
|
+
- `od-IN`: Odia
|
|
64
|
+
- `pa-IN`: Punjabi
|
|
65
|
+
- `ta-IN`: Tamil
|
|
66
|
+
- `te-IN`: Telugu
|
|
67
|
+
- `en-IN`: English
|
|
68
|
+
|
|
69
|
+
model : typing.Optional[SpeechToTextStreamingModel]
|
|
70
|
+
Specifies the model to use for speech-to-text conversion.
|
|
71
|
+
|
|
72
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
73
|
+
|
|
74
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
75
|
+
|
|
76
|
+
mode : typing.Optional[SpeechToTextStreamingMode]
|
|
77
|
+
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
78
|
+
|
|
79
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
80
|
+
|
|
81
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
82
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
83
|
+
|
|
84
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
85
|
+
- Output: `My phone number is 9840950950`
|
|
86
|
+
|
|
87
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
88
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
89
|
+
|
|
90
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
91
|
+
- Output: `mera phone number hai 9840950950`
|
|
92
|
+
|
|
93
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
94
|
+
- Output: `मेरा phone number है 9840950950`
|
|
55
95
|
|
|
56
96
|
sample_rate : typing.Optional[str]
|
|
57
97
|
Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
|
|
@@ -81,6 +121,8 @@ class RawSpeechToTextStreamingClient:
|
|
|
81
121
|
query_params = query_params.add("language-code", language_code)
|
|
82
122
|
if model is not None:
|
|
83
123
|
query_params = query_params.add("model", model)
|
|
124
|
+
if mode is not None:
|
|
125
|
+
query_params = query_params.add("mode", mode)
|
|
84
126
|
if sample_rate is not None:
|
|
85
127
|
query_params = query_params.add("sample_rate", sample_rate)
|
|
86
128
|
if high_vad_sensitivity is not None:
|
|
@@ -122,7 +164,8 @@ class AsyncRawSpeechToTextStreamingClient:
|
|
|
122
164
|
self,
|
|
123
165
|
*,
|
|
124
166
|
language_code: SpeechToTextStreamingLanguageCode,
|
|
125
|
-
model: typing.Optional[
|
|
167
|
+
model: typing.Optional[SpeechToTextStreamingModel] = None,
|
|
168
|
+
mode: typing.Optional[SpeechToTextStreamingMode] = None,
|
|
126
169
|
sample_rate: typing.Optional[str] = None,
|
|
127
170
|
high_vad_sensitivity: typing.Optional[SpeechToTextStreamingHighVadSensitivity] = None,
|
|
128
171
|
vad_signals: typing.Optional[SpeechToTextStreamingVadSignals] = None,
|
|
@@ -140,10 +183,47 @@ class AsyncRawSpeechToTextStreamingClient:
|
|
|
140
183
|
Parameters
|
|
141
184
|
----------
|
|
142
185
|
language_code : SpeechToTextStreamingLanguageCode
|
|
143
|
-
|
|
186
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
187
|
+
|
|
188
|
+
**Available Options:**
|
|
189
|
+
- `hi-IN`: Hindi
|
|
190
|
+
- `bn-IN`: Bengali
|
|
191
|
+
- `gu-IN`: Gujarati
|
|
192
|
+
- `kn-IN`: Kannada
|
|
193
|
+
- `ml-IN`: Malayalam
|
|
194
|
+
- `mr-IN`: Marathi
|
|
195
|
+
- `od-IN`: Odia
|
|
196
|
+
- `pa-IN`: Punjabi
|
|
197
|
+
- `ta-IN`: Tamil
|
|
198
|
+
- `te-IN`: Telugu
|
|
199
|
+
- `en-IN`: English
|
|
200
|
+
|
|
201
|
+
model : typing.Optional[SpeechToTextStreamingModel]
|
|
202
|
+
Specifies the model to use for speech-to-text conversion.
|
|
203
|
+
|
|
204
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
205
|
+
|
|
206
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
207
|
+
|
|
208
|
+
mode : typing.Optional[SpeechToTextStreamingMode]
|
|
209
|
+
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
210
|
+
|
|
211
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
212
|
+
|
|
213
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
214
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
215
|
+
|
|
216
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
217
|
+
- Output: `My phone number is 9840950950`
|
|
218
|
+
|
|
219
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
220
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
221
|
+
|
|
222
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
223
|
+
- Output: `mera phone number hai 9840950950`
|
|
144
224
|
|
|
145
|
-
|
|
146
|
-
|
|
225
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
226
|
+
- Output: `मेरा phone number है 9840950950`
|
|
147
227
|
|
|
148
228
|
sample_rate : typing.Optional[str]
|
|
149
229
|
Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.
|
|
@@ -173,6 +253,8 @@ class AsyncRawSpeechToTextStreamingClient:
|
|
|
173
253
|
query_params = query_params.add("language-code", language_code)
|
|
174
254
|
if model is not None:
|
|
175
255
|
query_params = query_params.add("model", model)
|
|
256
|
+
if mode is not None:
|
|
257
|
+
query_params = query_params.add("mode", mode)
|
|
176
258
|
if sample_rate is not None:
|
|
177
259
|
query_params = query_params.add("sample_rate", sample_rate)
|
|
178
260
|
if high_vad_sensitivity is not None:
|
|
@@ -2,46 +2,18 @@
|
|
|
2
2
|
|
|
3
3
|
# isort: skip_file
|
|
4
4
|
|
|
5
|
-
import
|
|
6
|
-
from
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
from .speech_to_text_streaming_language_code import SpeechToTextStreamingLanguageCode
|
|
12
|
-
from .speech_to_text_streaming_vad_signals import SpeechToTextStreamingVadSignals
|
|
13
|
-
_dynamic_imports: typing.Dict[str, str] = {
|
|
14
|
-
"SpeechToTextStreamingFlushSignal": ".speech_to_text_streaming_flush_signal",
|
|
15
|
-
"SpeechToTextStreamingHighVadSensitivity": ".speech_to_text_streaming_high_vad_sensitivity",
|
|
16
|
-
"SpeechToTextStreamingLanguageCode": ".speech_to_text_streaming_language_code",
|
|
17
|
-
"SpeechToTextStreamingVadSignals": ".speech_to_text_streaming_vad_signals",
|
|
18
|
-
}
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
def __getattr__(attr_name: str) -> typing.Any:
|
|
22
|
-
module_name = _dynamic_imports.get(attr_name)
|
|
23
|
-
if module_name is None:
|
|
24
|
-
raise AttributeError(f"No {attr_name} found in _dynamic_imports for module name -> {__name__}")
|
|
25
|
-
try:
|
|
26
|
-
module = import_module(module_name, __package__)
|
|
27
|
-
if module_name == f".{attr_name}":
|
|
28
|
-
return module
|
|
29
|
-
else:
|
|
30
|
-
return getattr(module, attr_name)
|
|
31
|
-
except ImportError as e:
|
|
32
|
-
raise ImportError(f"Failed to import {attr_name} from {module_name}: {e}") from e
|
|
33
|
-
except AttributeError as e:
|
|
34
|
-
raise AttributeError(f"Failed to get {attr_name} from {module_name}: {e}") from e
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
def __dir__():
|
|
38
|
-
lazy_attrs = list(_dynamic_imports.keys())
|
|
39
|
-
return sorted(lazy_attrs)
|
|
40
|
-
|
|
5
|
+
from .speech_to_text_streaming_flush_signal import SpeechToTextStreamingFlushSignal
|
|
6
|
+
from .speech_to_text_streaming_high_vad_sensitivity import SpeechToTextStreamingHighVadSensitivity
|
|
7
|
+
from .speech_to_text_streaming_language_code import SpeechToTextStreamingLanguageCode
|
|
8
|
+
from .speech_to_text_streaming_mode import SpeechToTextStreamingMode
|
|
9
|
+
from .speech_to_text_streaming_model import SpeechToTextStreamingModel
|
|
10
|
+
from .speech_to_text_streaming_vad_signals import SpeechToTextStreamingVadSignals
|
|
41
11
|
|
|
42
12
|
__all__ = [
|
|
43
13
|
"SpeechToTextStreamingFlushSignal",
|
|
44
14
|
"SpeechToTextStreamingHighVadSensitivity",
|
|
45
15
|
"SpeechToTextStreamingLanguageCode",
|
|
16
|
+
"SpeechToTextStreamingMode",
|
|
17
|
+
"SpeechToTextStreamingModel",
|
|
46
18
|
"SpeechToTextStreamingVadSignals",
|
|
47
19
|
]
|