sarvamai 0.1.23a4__py3-none-any.whl → 0.1.23a6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sarvamai/__init__.py +4 -0
- sarvamai/core/client_wrapper.py +2 -2
- sarvamai/requests/speech_to_text_job_parameters.py +37 -5
- sarvamai/requests/speech_to_text_response.py +6 -14
- sarvamai/requests/speech_to_text_transcription_data.py +14 -0
- sarvamai/requests/speech_to_text_translate_job_parameters.py +4 -1
- sarvamai/requests/speech_to_text_translate_response.py +6 -9
- sarvamai/requests/speech_to_text_translate_transcription_data.py +13 -0
- sarvamai/speech_to_text/client.py +84 -26
- sarvamai/speech_to_text/raw_client.py +84 -26
- sarvamai/speech_to_text_streaming/__init__.py +2 -0
- sarvamai/speech_to_text_streaming/client.py +117 -18
- sarvamai/speech_to_text_streaming/raw_client.py +117 -18
- sarvamai/speech_to_text_streaming/types/__init__.py +2 -0
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_input_audio_codec.py +7 -0
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_language_code.py +25 -1
- sarvamai/speech_to_text_streaming/types/speech_to_text_streaming_mode.py +1 -1
- sarvamai/speech_to_text_translate_streaming/__init__.py +2 -0
- sarvamai/speech_to_text_translate_streaming/client.py +23 -2
- sarvamai/speech_to_text_translate_streaming/raw_client.py +23 -2
- sarvamai/speech_to_text_translate_streaming/types/__init__.py +2 -0
- sarvamai/speech_to_text_translate_streaming/types/speech_to_text_translate_streaming_input_audio_codec.py +7 -0
- sarvamai/types/mode.py +1 -3
- sarvamai/types/speech_to_text_job_parameters.py +37 -5
- sarvamai/types/speech_to_text_language.py +24 -1
- sarvamai/types/speech_to_text_model.py +1 -3
- sarvamai/types/speech_to_text_response.py +6 -14
- sarvamai/types/speech_to_text_transcription_data.py +14 -0
- sarvamai/types/speech_to_text_translate_job_parameters.py +4 -1
- sarvamai/types/speech_to_text_translate_language.py +25 -1
- sarvamai/types/speech_to_text_translate_model.py +1 -1
- sarvamai/types/speech_to_text_translate_response.py +6 -9
- sarvamai/types/speech_to_text_translate_transcription_data.py +13 -0
- {sarvamai-0.1.23a4.dist-info → sarvamai-0.1.23a6.dist-info}/METADATA +1 -1
- {sarvamai-0.1.23a4.dist-info → sarvamai-0.1.23a6.dist-info}/RECORD +36 -34
- {sarvamai-0.1.23a4.dist-info → sarvamai-0.1.23a6.dist-info}/WHEEL +0 -0
sarvamai/__init__.py
CHANGED
|
@@ -188,6 +188,7 @@ from .requests import (
|
|
|
188
188
|
from .speech_to_text_streaming import (
|
|
189
189
|
SpeechToTextStreamingFlushSignal,
|
|
190
190
|
SpeechToTextStreamingHighVadSensitivity,
|
|
191
|
+
SpeechToTextStreamingInputAudioCodec,
|
|
191
192
|
SpeechToTextStreamingLanguageCode,
|
|
192
193
|
SpeechToTextStreamingMode,
|
|
193
194
|
SpeechToTextStreamingModel,
|
|
@@ -196,6 +197,7 @@ from .speech_to_text_streaming import (
|
|
|
196
197
|
from .speech_to_text_translate_streaming import (
|
|
197
198
|
SpeechToTextTranslateStreamingFlushSignal,
|
|
198
199
|
SpeechToTextTranslateStreamingHighVadSensitivity,
|
|
200
|
+
SpeechToTextTranslateStreamingInputAudioCodec,
|
|
199
201
|
SpeechToTextTranslateStreamingVadSignals,
|
|
200
202
|
)
|
|
201
203
|
from .text_to_speech_streaming import TextToSpeechStreamingModel, TextToSpeechStreamingSendCompletionEvent
|
|
@@ -320,6 +322,7 @@ __all__ = [
|
|
|
320
322
|
"SpeechToTextResponseParams",
|
|
321
323
|
"SpeechToTextStreamingFlushSignal",
|
|
322
324
|
"SpeechToTextStreamingHighVadSensitivity",
|
|
325
|
+
"SpeechToTextStreamingInputAudioCodec",
|
|
323
326
|
"SpeechToTextStreamingLanguageCode",
|
|
324
327
|
"SpeechToTextStreamingMode",
|
|
325
328
|
"SpeechToTextStreamingModel",
|
|
@@ -338,6 +341,7 @@ __all__ = [
|
|
|
338
341
|
"SpeechToTextTranslateResponseParams",
|
|
339
342
|
"SpeechToTextTranslateStreamingFlushSignal",
|
|
340
343
|
"SpeechToTextTranslateStreamingHighVadSensitivity",
|
|
344
|
+
"SpeechToTextTranslateStreamingInputAudioCodec",
|
|
341
345
|
"SpeechToTextTranslateStreamingResponse",
|
|
342
346
|
"SpeechToTextTranslateStreamingResponseParams",
|
|
343
347
|
"SpeechToTextTranslateStreamingVadSignals",
|
sarvamai/core/client_wrapper.py
CHANGED
|
@@ -23,10 +23,10 @@ class BaseClientWrapper:
|
|
|
23
23
|
|
|
24
24
|
def get_headers(self) -> typing.Dict[str, str]:
|
|
25
25
|
headers: typing.Dict[str, str] = {
|
|
26
|
-
"User-Agent": "sarvamai/0.1.23a4",
|
|
26
|
+
"User-Agent": "sarvamai/0.1.23a6",
|
|
27
27
|
"X-Fern-Language": "Python",
|
|
28
28
|
"X-Fern-SDK-Name": "sarvamai",
|
|
29
|
-
"X-Fern-SDK-Version": "0.1.23a4",
|
|
29
|
+
"X-Fern-SDK-Version": "0.1.23a6",
|
|
30
30
|
**(self.get_custom_headers() or {}),
|
|
31
31
|
}
|
|
32
32
|
headers["api-subscription-key"] = self.api_subscription_key
|
|
@@ -9,20 +9,52 @@ from ..types.speech_to_text_translate_language import SpeechToTextTranslateLangu
|
|
|
9
9
|
class SpeechToTextJobParametersParams(typing_extensions.TypedDict):
|
|
10
10
|
language_code: typing_extensions.NotRequired[SpeechToTextTranslateLanguage]
|
|
11
11
|
"""
|
|
12
|
-
|
|
12
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
13
|
+
|
|
14
|
+
**Available Options:**
|
|
15
|
+
- `unknown` (default): Use when the language is not known; the API will auto-detect.
|
|
16
|
+
- `hi-IN`: Hindi
|
|
17
|
+
- `bn-IN`: Bengali
|
|
18
|
+
- `kn-IN`: Kannada
|
|
19
|
+
- `ml-IN`: Malayalam
|
|
20
|
+
- `mr-IN`: Marathi
|
|
21
|
+
- `od-IN`: Odia
|
|
22
|
+
- `pa-IN`: Punjabi
|
|
23
|
+
- `ta-IN`: Tamil
|
|
24
|
+
- `te-IN`: Telugu
|
|
25
|
+
- `en-IN`: English
|
|
26
|
+
- `gu-IN`: Gujarati
|
|
13
27
|
"""
|
|
14
28
|
|
|
15
29
|
model: typing_extensions.NotRequired[SpeechToTextModel]
|
|
16
30
|
"""
|
|
17
31
|
Model to be used for speech to text.
|
|
18
|
-
|
|
19
|
-
- **saarika:
|
|
20
|
-
|
|
32
|
+
|
|
33
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
34
|
+
|
|
35
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
21
36
|
"""
|
|
22
37
|
|
|
23
38
|
mode: typing_extensions.NotRequired[Mode]
|
|
24
39
|
"""
|
|
25
|
-
Mode of operation. Only applicable
|
|
40
|
+
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
41
|
+
|
|
42
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
43
|
+
|
|
44
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
45
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
46
|
+
|
|
47
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
48
|
+
- Output: `My phone number is 9840950950`
|
|
49
|
+
|
|
50
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
51
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
52
|
+
|
|
53
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
54
|
+
- Output: `mera phone number hai 9840950950`
|
|
55
|
+
|
|
56
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
57
|
+
- Output: `मेरा phone number है 9840950950`
|
|
26
58
|
"""
|
|
27
59
|
|
|
28
60
|
with_timestamps: typing_extensions.NotRequired[bool]
|
|
@@ -1,28 +1,20 @@
|
|
|
1
1
|
# This file was auto-generated by Fern from our API Definition.
|
|
2
2
|
|
|
3
3
|
import typing_extensions
|
|
4
|
-
from .diarized_transcript import DiarizedTranscriptParams
|
|
5
|
-
from .timestamps_model import TimestampsModelParams
|
|
6
4
|
|
|
7
5
|
|
|
8
6
|
class SpeechToTextResponseParams(typing_extensions.TypedDict):
|
|
9
|
-
request_id:
|
|
10
|
-
transcript: str
|
|
7
|
+
request_id: str
|
|
11
8
|
"""
|
|
12
|
-
|
|
9
|
+
Unique identifier for the request
|
|
13
10
|
"""
|
|
14
11
|
|
|
15
|
-
|
|
16
|
-
"""
|
|
17
|
-
Contains timestamps for the transcribed text. This field is included only if with_timestamps is set to true
|
|
18
|
-
"""
|
|
19
|
-
|
|
20
|
-
diarized_transcript: typing_extensions.NotRequired[DiarizedTranscriptParams]
|
|
12
|
+
transcript: str
|
|
21
13
|
"""
|
|
22
|
-
|
|
14
|
+
The transcribed text from the provided audio file.
|
|
23
15
|
"""
|
|
24
16
|
|
|
25
|
-
language_code:
|
|
17
|
+
language_code: str
|
|
26
18
|
"""
|
|
27
|
-
|
|
19
|
+
The BCP-47 code of language spoken in the input (e.g., hi-IN, en-IN). If multiple languages are detected, returns the most predominant spoken language.
|
|
28
20
|
"""
|
|
@@ -32,4 +32,18 @@ class SpeechToTextTranscriptionDataParams(typing_extensions.TypedDict):
|
|
|
32
32
|
BCP-47 code of detected language
|
|
33
33
|
"""
|
|
34
34
|
|
|
35
|
+
language_probability: typing_extensions.NotRequired[float]
|
|
36
|
+
"""
|
|
37
|
+
Float value (0.0 to 1.0) indicating the probability of the detected language being correct. Higher values indicate higher confidence.
|
|
38
|
+
|
|
39
|
+
**When it returns a value:**
|
|
40
|
+
- When `language_code` is not provided in the request
|
|
41
|
+
- When `language_code` is set to `unknown`
|
|
42
|
+
|
|
43
|
+
**When it returns null:**
|
|
44
|
+
- When a specific `language_code` is provided (language detection is skipped)
|
|
45
|
+
|
|
46
|
+
The parameter is always present in the response.
|
|
47
|
+
"""
|
|
48
|
+
|
|
35
49
|
metrics: TranscriptionMetricsParams
|
|
@@ -12,7 +12,10 @@ class SpeechToTextTranslateJobParametersParams(typing_extensions.TypedDict):
|
|
|
12
12
|
|
|
13
13
|
model: typing_extensions.NotRequired[SpeechToTextTranslateModel]
|
|
14
14
|
"""
|
|
15
|
-
Model to be used for
|
|
15
|
+
Model to be used for speech to text translation.
|
|
16
|
+
|
|
17
|
+
- **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
|
|
18
|
+
- Example: Hindi audio → English text output
|
|
16
19
|
"""
|
|
17
20
|
|
|
18
21
|
with_diarization: typing_extensions.NotRequired[bool]
|
|
@@ -1,23 +1,20 @@
|
|
|
1
1
|
# This file was auto-generated by Fern from our API Definition.
|
|
2
2
|
|
|
3
3
|
import typing_extensions
|
|
4
|
-
from ..types.speech_to_text_translate_language import SpeechToTextTranslateLanguage
|
|
5
|
-
from .diarized_transcript import DiarizedTranscriptParams
|
|
6
4
|
|
|
7
5
|
|
|
8
6
|
class SpeechToTextTranslateResponseParams(typing_extensions.TypedDict):
|
|
9
|
-
request_id:
|
|
10
|
-
transcript: str
|
|
7
|
+
request_id: str
|
|
11
8
|
"""
|
|
12
|
-
|
|
9
|
+
Unique identifier for the request
|
|
13
10
|
"""
|
|
14
11
|
|
|
15
|
-
|
|
12
|
+
transcript: str
|
|
16
13
|
"""
|
|
17
|
-
|
|
14
|
+
English translation of the provided speech
|
|
18
15
|
"""
|
|
19
16
|
|
|
20
|
-
|
|
17
|
+
language_code: str
|
|
21
18
|
"""
|
|
22
|
-
|
|
19
|
+
The BCP-47 code of the detected source language spoken in the input (e.g., hi-IN, kn-IN).
|
|
23
20
|
"""
|
|
@@ -20,4 +20,17 @@ class SpeechToTextTranslateTranscriptionDataParams(typing_extensions.TypedDict):
|
|
|
20
20
|
BCP-47 code of detected source language (null when language detection is in progress)
|
|
21
21
|
"""
|
|
22
22
|
|
|
23
|
+
language_probability: typing_extensions.NotRequired[float]
|
|
24
|
+
"""
|
|
25
|
+
Float value (0.0 to 1.0) indicating the probability of the detected source language being correct. Higher values indicate higher confidence.
|
|
26
|
+
|
|
27
|
+
**When it returns a value:**
|
|
28
|
+
- Always returns a value as source language is auto-detected for translation
|
|
29
|
+
|
|
30
|
+
**When it returns null:**
|
|
31
|
+
- When language detection confidence is unavailable
|
|
32
|
+
|
|
33
|
+
The parameter is always present in the response.
|
|
34
|
+
"""
|
|
35
|
+
|
|
23
36
|
metrics: TranscriptionMetricsParams
|
|
@@ -65,23 +65,49 @@ class SpeechToTextClient:
|
|
|
65
65
|
|
|
66
66
|
model : typing.Optional[SpeechToTextModel]
|
|
67
67
|
Specifies the model to use for speech-to-text conversion.
|
|
68
|
-
|
|
69
|
-
- **saarika:
|
|
70
|
-
|
|
68
|
+
|
|
69
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
70
|
+
|
|
71
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
71
72
|
|
|
72
73
|
mode : typing.Optional[Mode]
|
|
73
74
|
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
- **
|
|
78
|
-
|
|
79
|
-
|
|
75
|
+
|
|
76
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
77
|
+
|
|
78
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
79
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
80
|
+
|
|
81
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
82
|
+
- Output: `My phone number is 9840950950`
|
|
83
|
+
|
|
84
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
85
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
86
|
+
|
|
87
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
88
|
+
- Output: `mera phone number hai 9840950950`
|
|
89
|
+
|
|
90
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
91
|
+
- Output: `मेरा phone number है 9840950950`
|
|
80
92
|
|
|
81
93
|
language_code : typing.Optional[SpeechToTextLanguage]
|
|
82
|
-
Specifies the language of the input audio.
|
|
83
|
-
|
|
84
|
-
|
|
94
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
95
|
+
|
|
96
|
+
**Note:** This parameter is optional for `saarika:v2.5` model.
|
|
97
|
+
|
|
98
|
+
**Available Options:**
|
|
99
|
+
- `unknown`: Use when the language is not known; the API will auto-detect.
|
|
100
|
+
- `hi-IN`: Hindi
|
|
101
|
+
- `bn-IN`: Bengali
|
|
102
|
+
- `kn-IN`: Kannada
|
|
103
|
+
- `ml-IN`: Malayalam
|
|
104
|
+
- `mr-IN`: Marathi
|
|
105
|
+
- `od-IN`: Odia
|
|
106
|
+
- `pa-IN`: Punjabi
|
|
107
|
+
- `ta-IN`: Tamil
|
|
108
|
+
- `te-IN`: Telugu
|
|
109
|
+
- `en-IN`: English
|
|
110
|
+
- `gu-IN`: Gujarati
|
|
85
111
|
|
|
86
112
|
input_audio_codec : typing.Optional[InputAudioCodec]
|
|
87
113
|
Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.
|
|
@@ -146,7 +172,10 @@ class SpeechToTextClient:
|
|
|
146
172
|
Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.
|
|
147
173
|
|
|
148
174
|
model : typing.Optional[SpeechToTextTranslateModel]
|
|
149
|
-
Model to be used for
|
|
175
|
+
Model to be used for speech to text translation.
|
|
176
|
+
|
|
177
|
+
- **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
|
|
178
|
+
- Example: Hindi audio → English text output
|
|
150
179
|
|
|
151
180
|
input_audio_codec : typing.Optional[InputAudioCodec]
|
|
152
181
|
Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.
|
|
@@ -221,23 +250,49 @@ class AsyncSpeechToTextClient:
|
|
|
221
250
|
|
|
222
251
|
model : typing.Optional[SpeechToTextModel]
|
|
223
252
|
Specifies the model to use for speech-to-text conversion.
|
|
224
|
-
|
|
225
|
-
- **saarika:
|
|
226
|
-
|
|
253
|
+
|
|
254
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
255
|
+
|
|
256
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
227
257
|
|
|
228
258
|
mode : typing.Optional[Mode]
|
|
229
259
|
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
- **
|
|
234
|
-
|
|
235
|
-
|
|
260
|
+
|
|
261
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
262
|
+
|
|
263
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
264
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
265
|
+
|
|
266
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
267
|
+
- Output: `My phone number is 9840950950`
|
|
268
|
+
|
|
269
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
270
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
271
|
+
|
|
272
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
273
|
+
- Output: `mera phone number hai 9840950950`
|
|
274
|
+
|
|
275
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
276
|
+
- Output: `मेरा phone number है 9840950950`
|
|
236
277
|
|
|
237
278
|
language_code : typing.Optional[SpeechToTextLanguage]
|
|
238
|
-
Specifies the language of the input audio.
|
|
239
|
-
|
|
240
|
-
|
|
279
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
280
|
+
|
|
281
|
+
**Note:** This parameter is optional for `saarika:v2.5` model.
|
|
282
|
+
|
|
283
|
+
**Available Options:**
|
|
284
|
+
- `unknown`: Use when the language is not known; the API will auto-detect.
|
|
285
|
+
- `hi-IN`: Hindi
|
|
286
|
+
- `bn-IN`: Bengali
|
|
287
|
+
- `kn-IN`: Kannada
|
|
288
|
+
- `ml-IN`: Malayalam
|
|
289
|
+
- `mr-IN`: Marathi
|
|
290
|
+
- `od-IN`: Odia
|
|
291
|
+
- `pa-IN`: Punjabi
|
|
292
|
+
- `ta-IN`: Tamil
|
|
293
|
+
- `te-IN`: Telugu
|
|
294
|
+
- `en-IN`: English
|
|
295
|
+
- `gu-IN`: Gujarati
|
|
241
296
|
|
|
242
297
|
input_audio_codec : typing.Optional[InputAudioCodec]
|
|
243
298
|
Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.
|
|
@@ -310,7 +365,10 @@ class AsyncSpeechToTextClient:
|
|
|
310
365
|
Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.
|
|
311
366
|
|
|
312
367
|
model : typing.Optional[SpeechToTextTranslateModel]
|
|
313
|
-
Model to be used for
|
|
368
|
+
Model to be used for speech to text translation.
|
|
369
|
+
|
|
370
|
+
- **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
|
|
371
|
+
- Example: Hindi audio → English text output
|
|
314
372
|
|
|
315
373
|
input_audio_codec : typing.Optional[InputAudioCodec]
|
|
316
374
|
Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.
|
|
@@ -63,23 +63,49 @@ class RawSpeechToTextClient:
|
|
|
63
63
|
|
|
64
64
|
model : typing.Optional[SpeechToTextModel]
|
|
65
65
|
Specifies the model to use for speech-to-text conversion.
|
|
66
|
-
|
|
67
|
-
- **saarika:
|
|
68
|
-
|
|
66
|
+
|
|
67
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
68
|
+
|
|
69
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
69
70
|
|
|
70
71
|
mode : typing.Optional[Mode]
|
|
71
72
|
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
- **
|
|
76
|
-
|
|
77
|
-
|
|
73
|
+
|
|
74
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
75
|
+
|
|
76
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
77
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
78
|
+
|
|
79
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
80
|
+
- Output: `My phone number is 9840950950`
|
|
81
|
+
|
|
82
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
83
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
84
|
+
|
|
85
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
86
|
+
- Output: `mera phone number hai 9840950950`
|
|
87
|
+
|
|
88
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
89
|
+
- Output: `मेरा phone number है 9840950950`
|
|
78
90
|
|
|
79
91
|
language_code : typing.Optional[SpeechToTextLanguage]
|
|
80
|
-
Specifies the language of the input audio.
|
|
81
|
-
|
|
82
|
-
|
|
92
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
93
|
+
|
|
94
|
+
**Note:** This parameter is optional for `saarika:v2.5` model.
|
|
95
|
+
|
|
96
|
+
**Available Options:**
|
|
97
|
+
- `unknown`: Use when the language is not known; the API will auto-detect.
|
|
98
|
+
- `hi-IN`: Hindi
|
|
99
|
+
- `bn-IN`: Bengali
|
|
100
|
+
- `kn-IN`: Kannada
|
|
101
|
+
- `ml-IN`: Malayalam
|
|
102
|
+
- `mr-IN`: Marathi
|
|
103
|
+
- `od-IN`: Odia
|
|
104
|
+
- `pa-IN`: Punjabi
|
|
105
|
+
- `ta-IN`: Tamil
|
|
106
|
+
- `te-IN`: Telugu
|
|
107
|
+
- `en-IN`: English
|
|
108
|
+
- `gu-IN`: Gujarati
|
|
83
109
|
|
|
84
110
|
input_audio_codec : typing.Optional[InputAudioCodec]
|
|
85
111
|
Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.
|
|
@@ -223,7 +249,10 @@ class RawSpeechToTextClient:
|
|
|
223
249
|
Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.
|
|
224
250
|
|
|
225
251
|
model : typing.Optional[SpeechToTextTranslateModel]
|
|
226
|
-
Model to be used for
|
|
252
|
+
Model to be used for speech to text translation.
|
|
253
|
+
|
|
254
|
+
- **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
|
|
255
|
+
- Example: Hindi audio → English text output
|
|
227
256
|
|
|
228
257
|
input_audio_codec : typing.Optional[InputAudioCodec]
|
|
229
258
|
Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.
|
|
@@ -370,23 +399,49 @@ class AsyncRawSpeechToTextClient:
|
|
|
370
399
|
|
|
371
400
|
model : typing.Optional[SpeechToTextModel]
|
|
372
401
|
Specifies the model to use for speech-to-text conversion.
|
|
373
|
-
|
|
374
|
-
- **saarika:
|
|
375
|
-
|
|
402
|
+
|
|
403
|
+
- **saarika:v2.5** (default): Transcribes audio in the spoken language.
|
|
404
|
+
|
|
405
|
+
- **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.
|
|
376
406
|
|
|
377
407
|
mode : typing.Optional[Mode]
|
|
378
408
|
Mode of operation. **Only applicable when using saaras:v3 model.**
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
- **
|
|
383
|
-
|
|
384
|
-
|
|
409
|
+
|
|
410
|
+
Example audio: 'मेरा फोन नंबर है 9840950950'
|
|
411
|
+
|
|
412
|
+
- **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
|
|
413
|
+
- Output: `मेरा फोन नंबर है 9840950950`
|
|
414
|
+
|
|
415
|
+
- **translate**: Translates speech from any supported Indic language to English.
|
|
416
|
+
- Output: `My phone number is 9840950950`
|
|
417
|
+
|
|
418
|
+
- **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
|
|
419
|
+
- Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`
|
|
420
|
+
|
|
421
|
+
- **translit**: Romanization - Transliterates speech to Latin/Roman script only.
|
|
422
|
+
- Output: `mera phone number hai 9840950950`
|
|
423
|
+
|
|
424
|
+
- **codemix**: Code-mixed text with English words in English and Indic words in native script.
|
|
425
|
+
- Output: `मेरा phone number है 9840950950`
|
|
385
426
|
|
|
386
427
|
language_code : typing.Optional[SpeechToTextLanguage]
|
|
387
|
-
Specifies the language of the input audio.
|
|
388
|
-
|
|
389
|
-
|
|
428
|
+
Specifies the language of the input audio in BCP-47 format.
|
|
429
|
+
|
|
430
|
+
**Note:** This parameter is optional for `saarika:v2.5` model.
|
|
431
|
+
|
|
432
|
+
**Available Options:**
|
|
433
|
+
- `unknown`: Use when the language is not known; the API will auto-detect.
|
|
434
|
+
- `hi-IN`: Hindi
|
|
435
|
+
- `bn-IN`: Bengali
|
|
436
|
+
- `kn-IN`: Kannada
|
|
437
|
+
- `ml-IN`: Malayalam
|
|
438
|
+
- `mr-IN`: Marathi
|
|
439
|
+
- `od-IN`: Odia
|
|
440
|
+
- `pa-IN`: Punjabi
|
|
441
|
+
- `ta-IN`: Tamil
|
|
442
|
+
- `te-IN`: Telugu
|
|
443
|
+
- `en-IN`: English
|
|
444
|
+
- `gu-IN`: Gujarati
|
|
390
445
|
|
|
391
446
|
input_audio_codec : typing.Optional[InputAudioCodec]
|
|
392
447
|
Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.
|
|
@@ -530,7 +585,10 @@ class AsyncRawSpeechToTextClient:
|
|
|
530
585
|
Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.
|
|
531
586
|
|
|
532
587
|
model : typing.Optional[SpeechToTextTranslateModel]
|
|
533
|
-
Model to be used for
|
|
588
|
+
Model to be used for speech to text translation.
|
|
589
|
+
|
|
590
|
+
- **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
|
|
591
|
+
- Example: Hindi audio → English text output
|
|
534
592
|
|
|
535
593
|
input_audio_codec : typing.Optional[InputAudioCodec]
|
|
536
594
|
Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.
|
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
from .types import (
|
|
6
6
|
SpeechToTextStreamingFlushSignal,
|
|
7
7
|
SpeechToTextStreamingHighVadSensitivity,
|
|
8
|
+
SpeechToTextStreamingInputAudioCodec,
|
|
8
9
|
SpeechToTextStreamingLanguageCode,
|
|
9
10
|
SpeechToTextStreamingMode,
|
|
10
11
|
SpeechToTextStreamingModel,
|
|
@@ -14,6 +15,7 @@ from .types import (
|
|
|
14
15
|
__all__ = [
|
|
15
16
|
"SpeechToTextStreamingFlushSignal",
|
|
16
17
|
"SpeechToTextStreamingHighVadSensitivity",
|
|
18
|
+
"SpeechToTextStreamingInputAudioCodec",
|
|
17
19
|
"SpeechToTextStreamingLanguageCode",
|
|
18
20
|
"SpeechToTextStreamingMode",
|
|
19
21
|
"SpeechToTextStreamingModel",
|