sarvamai 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sarvamai/__init__.py +56 -1
- sarvamai/client.py +3 -0
- sarvamai/core/client_wrapper.py +2 -2
- sarvamai/requests/__init__.py +20 -0
- sarvamai/requests/audio_output.py +11 -0
- sarvamai/requests/audio_output_data.py +15 -0
- sarvamai/requests/configure_connection.py +18 -0
- sarvamai/requests/configure_connection_data.py +83 -0
- sarvamai/requests/error_response.py +11 -0
- sarvamai/requests/error_response_data.py +18 -0
- sarvamai/requests/flush_signal.py +14 -0
- sarvamai/requests/ping_signal.py +14 -0
- sarvamai/requests/send_text.py +11 -0
- sarvamai/requests/send_text_data.py +7 -0
- sarvamai/text_to_speech_streaming/__init__.py +4 -0
- sarvamai/text_to_speech_streaming/client.py +153 -0
- sarvamai/text_to_speech_streaming/raw_client.py +130 -0
- sarvamai/text_to_speech_streaming/socket_client.py +309 -0
- sarvamai/types/__init__.py +26 -0
- sarvamai/types/audio_output.py +21 -0
- sarvamai/types/audio_output_data.py +27 -0
- sarvamai/types/configure_connection.py +28 -0
- sarvamai/types/configure_connection_data.py +93 -0
- sarvamai/types/configure_connection_data_output_audio_bitrate.py +7 -0
- sarvamai/types/configure_connection_data_speaker.py +7 -0
- sarvamai/types/configure_connection_data_target_language_code.py +8 -0
- sarvamai/types/error_response.py +21 -0
- sarvamai/types/error_response_data.py +28 -0
- sarvamai/types/flush_signal.py +24 -0
- sarvamai/types/ping_signal.py +24 -0
- sarvamai/types/send_text.py +21 -0
- sarvamai/types/send_text_data.py +19 -0
- {sarvamai-0.1.7.dist-info → sarvamai-0.1.8.dist-info}/METADATA +2 -2
- {sarvamai-0.1.7.dist-info → sarvamai-0.1.8.dist-info}/RECORD +35 -8
- {sarvamai-0.1.7.dist-info → sarvamai-0.1.8.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
|
2
|
+
|
|
3
|
+
import typing
|
|
4
|
+
from contextlib import asynccontextmanager, contextmanager
|
|
5
|
+
|
|
6
|
+
import httpx
|
|
7
|
+
import websockets
|
|
8
|
+
import websockets.sync.client as websockets_sync_client
|
|
9
|
+
from ..core.api_error import ApiError
|
|
10
|
+
from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
|
|
11
|
+
from ..core.request_options import RequestOptions
|
|
12
|
+
from .socket_client import AsyncTextToSpeechStreamingSocketClient, TextToSpeechStreamingSocketClient
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class RawTextToSpeechStreamingClient:
    """Synchronous low-level entry point for the text-to-speech streaming WebSocket."""

    def __init__(self, *, client_wrapper: SyncClientWrapper):
        self._client_wrapper = client_wrapper

    @contextmanager
    def connect(
        self,
        *,
        model: typing.Optional[typing.Literal["bulbul:v2"]] = None,
        api_subscription_key: typing.Optional[str] = None,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> typing.Iterator[TextToSpeechStreamingSocketClient]:
        """
        Bidirectional WebSocket channel for real-time TTS synthesis.
        Supports streaming, flushing, config updates, and audio playback.

        Parameters
        ----------
        model : typing.Optional[typing.Literal["bulbul:v2"]]
            Text to speech model to use

        api_subscription_key : typing.Optional[str]
            API subscription key for authentication

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration.

        Returns
        -------
        TextToSpeechStreamingSocketClient

        Raises
        ------
        ApiError
            If the server rejects the WebSocket handshake with a non-101 HTTP
            status. A 401 is reported as invalid credentials; any other status
            as an unexpected initialization error.
        """
        ws_url = self._client_wrapper.get_environment().production + "/text-to-speech/ws"
        query_params = httpx.QueryParams()
        if model is not None:
            query_params = query_params.add("model", model)
        ws_url = ws_url + f"?{query_params}"
        headers = self._client_wrapper.get_headers()
        if api_subscription_key is not None:
            headers["Api-Subscription-Key"] = str(api_subscription_key)
        if request_options and "additional_headers" in request_options:
            headers.update(request_options["additional_headers"])
        try:
            with websockets_sync_client.connect(ws_url, additional_headers=headers) as protocol:
                yield TextToSpeechStreamingSocketClient(websocket=protocol)
        except (
            # The threading-based client reports a rejected handshake as
            # InvalidStatus (status code on ``exc.response``). The legacy
            # InvalidStatusCode (status code on ``exc.status_code``) is still
            # caught so behavior under the legacy implementation is unchanged.
            websockets.exceptions.InvalidStatus,
            websockets.exceptions.InvalidStatusCode,
        ) as exc:
            rejected_response = getattr(exc, "response", None)
            status_code: int = (
                rejected_response.status_code if rejected_response is not None else exc.status_code
            )
            if status_code == 401:
                raise ApiError(
                    status_code=status_code,
                    headers=dict(headers),
                    body="Websocket initialized with invalid credentials.",
                )
            raise ApiError(
                status_code=status_code,
                headers=dict(headers),
                body="Unexpected error when initializing websocket connection.",
            )
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class AsyncRawTextToSpeechStreamingClient:
    """Asynchronous low-level entry point for the text-to-speech streaming WebSocket."""

    def __init__(self, *, client_wrapper: AsyncClientWrapper):
        self._client_wrapper = client_wrapper

    @asynccontextmanager
    async def connect(
        self,
        *,
        model: typing.Optional[typing.Literal["bulbul:v2"]] = None,
        api_subscription_key: typing.Optional[str] = None,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> typing.AsyncIterator[AsyncTextToSpeechStreamingSocketClient]:
        """
        Open the real-time TTS WebSocket and yield a socket client bound to it.
        The channel is bidirectional: streaming, flushing, config updates, and
        audio playback all go over the same connection.

        Parameters
        ----------
        model : typing.Optional[typing.Literal["bulbul:v2"]]
            Text to speech model to use; sent as the ``model`` query parameter
            when given.

        api_subscription_key : typing.Optional[str]
            API subscription key for authentication; when given it is sent as
            the ``Api-Subscription-Key`` header.

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration. Only ``additional_headers`` is read
            here; those headers are merged last.

        Returns
        -------
        AsyncTextToSpeechStreamingSocketClient

        Raises
        ------
        ApiError
            When ``websockets.exceptions.InvalidStatusCode`` is raised while
            connecting; the message distinguishes a 401 from any other status.
        """
        base_url = self._client_wrapper.get_environment().production
        params = httpx.QueryParams()
        if model is not None:
            params = params.add("model", model)
        # The "?" separator is appended even when no query parameter was added.
        endpoint = base_url + "/text-to-speech/ws" + f"?{params}"

        handshake_headers = self._client_wrapper.get_headers()
        if api_subscription_key is not None:
            handshake_headers["Api-Subscription-Key"] = str(api_subscription_key)
        if request_options:
            if "additional_headers" in request_options:
                handshake_headers.update(request_options["additional_headers"])

        try:
            async with websockets.connect(endpoint, extra_headers=handshake_headers) as connection:
                yield AsyncTextToSpeechStreamingSocketClient(websocket=connection)
        except websockets.exceptions.InvalidStatusCode as exc:
            status_code: int = exc.status_code
            if status_code == 401:
                detail = "Websocket initialized with invalid credentials."
            else:
                detail = "Unexpected error when initializing websocket connection."
            raise ApiError(
                status_code=status_code,
                headers=dict(handshake_headers),
                body=detail,
            )
|
|
@@ -0,0 +1,309 @@
|
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import typing
|
|
5
|
+
|
|
6
|
+
import websockets
|
|
7
|
+
import websockets.sync.connection as websockets_sync_connection
|
|
8
|
+
from ..core.events import EventEmitterMixin, EventType
|
|
9
|
+
from ..core.pydantic_utilities import parse_obj_as
|
|
10
|
+
from ..types.audio_output import AudioOutput
|
|
11
|
+
from ..types.flush_signal import FlushSignal
|
|
12
|
+
from ..types.error_response import ErrorResponse
|
|
13
|
+
from ..types.configure_connection import ConfigureConnection
|
|
14
|
+
from ..types.configure_connection_data import ConfigureConnectionData
|
|
15
|
+
from ..types.ping_signal import PingSignal
|
|
16
|
+
from ..types.send_text import SendText
|
|
17
|
+
from ..types.send_text_data import SendTextData
|
|
18
|
+
|
|
19
|
+
# Union of the message models that incoming socket payloads are parsed against
# (see the ``parse_obj_as`` calls in the socket clients below).
TextToSpeechStreamingSocketClientResponse = typing.Union[AudioOutput, ErrorResponse]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class AsyncTextToSpeechStreamingSocketClient(EventEmitterMixin):
    """Asynchronous helper around an already-open TTS streaming WebSocket."""

    def __init__(self, *, websocket: websockets.WebSocketClientProtocol):
        super().__init__()
        self._websocket = websocket

    async def __aiter__(self):
        """Yield every incoming frame parsed as ``AudioOutput`` or ``ErrorResponse``."""
        async for frame in self._websocket:
            payload = frame
            if isinstance(frame, str):
                # Text frames carry JSON; anything else is handed to the parser untouched.
                payload = json.loads(frame)
            yield parse_obj_as(TextToSpeechStreamingSocketClientResponse, payload)  # type: ignore

    async def start_listening(self):
        """
        Consume the websocket until it ends, re-publishing activity as events.

        Event sequence:
        - EventType.OPEN once, before the first message is read
        - EventType.MESSAGE for each parsed message
        - EventType.ERROR if a ``websockets.WebSocketException`` is raised
        - EventType.CLOSE always, as the final event
        """
        self._emit(EventType.OPEN, None)
        try:
            async for frame in self._websocket:
                payload = frame
                if isinstance(frame, str):
                    payload = json.loads(frame)
                response = parse_obj_as(TextToSpeechStreamingSocketClientResponse, payload)  # type: ignore
                self._emit(EventType.MESSAGE, response)
        except websockets.WebSocketException as error:
            self._emit(EventType.ERROR, error)
        finally:
            self._emit(EventType.CLOSE, None)

    async def configure(
        self,
        target_language_code: str,
        speaker: str = "anushka",
        pitch: float = 0.0,
        pace: float = 1.0,
        loudness: float = 1.0,
        speech_sample_rate: int = 22050,
        enable_preprocessing: bool = False,
        output_audio_codec: str = "mp3",
        output_audio_bitrate: str = "128k",
        min_buffer_size: int = 50,
        max_chunk_length: int = 150,
    ) -> None:
        """
        Send the configuration message for this connection.

        A config message is required as the first message after the WebSocket is
        established. It initializes the TTS parameters and may be sent again at
        any point in the connection's lifetime; on an update, text still in the
        buffer is flushed and processed before the new configuration applies.

        :param target_language_code: Language of the text, in BCP-47 format
        :param speaker: Voice used for the output audio. Default: Anushka.
                       Model Compatibility (bulbul:v2): Female: Anushka, Manisha, Vidya, Arya;
                       Male: Abhilash, Karun, Hitesh
        :param pitch: Pitch of the audio. Lower is deeper, higher is sharper.
                     Suitable range: -0.75 to 0.75. Default is 0.0.
        :param pace: Speed of the audio. Lower is slower, higher is faster.
                    Suitable range: 0.5 to 2.0. Default is 1.0.
        :param loudness: Loudness of the audio. Lower is quieter, higher is louder.
                        Suitable range: 0.3 to 3.0. Default is 1.0.
        :param speech_sample_rate: Sample rate of the output audio. Supported values are
                                  8000, 16000, 22050, 24000 Hz. Default is 22050 Hz.
        :param enable_preprocessing: Whether English words and numeric entities (e.g., numbers,
                                    dates) are normalized. Set to true for better handling of
                                    mixed-language text. Default is false.
        :param output_audio_codec: Audio codec (currently supports MP3 only, optimized for real-time playback)
        :param output_audio_bitrate: Audio bitrate (choose from 5 supported bitrate options)
        :param min_buffer_size: Minimum character length that triggers buffer flushing for TTS model processing
        :param max_chunk_length: Maximum length for sentence splitting (adjust based on content length)
        """
        settings = dict(
            target_language_code=target_language_code,
            speaker=speaker,
            pitch=pitch,
            pace=pace,
            loudness=loudness,
            speech_sample_rate=speech_sample_rate,
            enable_preprocessing=enable_preprocessing,
            output_audio_codec=output_audio_codec,
            output_audio_bitrate=output_audio_bitrate,
            min_buffer_size=min_buffer_size,
            max_chunk_length=max_chunk_length,
        )
        config_message = ConfigureConnection(data=ConfigureConnectionData(**settings))
        await self._send_model(config_message)

    async def convert(self, text: str) -> None:
        """
        Submit text for speech synthesis. Text length should be 1-2500 characters.
        Recommended: <500 characters for optimal streaming performance.
        Real-time endpoints perform better with longer character counts.

        :param text: Text to be synthesized (1-2500 characters, recommended <500)
        """
        text_message = SendText(data=SendTextData(text=text))
        await self._send_model(text_message)

    async def flush(self) -> None:
        """
        Send a flush signal so buffered text is processed immediately, regardless
        of the min_buffer_size threshold. Useful for remaining text that has not
        reached the minimum buffer size.
        """
        await self._send_model(FlushSignal())

    async def ping(self) -> None:
        """
        Send a ping signal to keep the WebSocket connection alive. The connection
        automatically closes after one minute of inactivity.
        """
        await self._send_model(PingSignal())

    async def recv(self) -> TextToSpeechStreamingSocketClientResponse:
        """
        Wait for the next message on the websocket and return it parsed.
        """
        frame = await self._websocket.recv()
        payload = frame
        if isinstance(frame, str):
            payload = json.loads(frame)
        return parse_obj_as(TextToSpeechStreamingSocketClientResponse, payload)  # type: ignore

    async def _send(self, data: typing.Any) -> None:
        """
        Write a message to the websocket; dicts are JSON-encoded first, any other
        value is sent as given.
        """
        outgoing = json.dumps(data) if isinstance(data, dict) else data
        await self._websocket.send(outgoing)

    async def _send_model(self, data: typing.Any) -> None:
        """
        Convert a Pydantic model to a dict and send it over the websocket.
        """
        as_dict = data.dict()
        await self._send(as_dict)
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
class TextToSpeechStreamingSocketClient(EventEmitterMixin):
    """Synchronous helper around an already-open TTS streaming WebSocket."""

    def __init__(self, *, websocket: websockets_sync_connection.Connection):
        super().__init__()
        self._websocket = websocket

    def __iter__(self):
        """Yield every incoming frame parsed as ``AudioOutput`` or ``ErrorResponse``."""
        for frame in self._websocket:
            payload = frame
            if isinstance(frame, str):
                # Text frames carry JSON; anything else is handed to the parser untouched.
                payload = json.loads(frame)
            yield parse_obj_as(TextToSpeechStreamingSocketClientResponse, payload)  # type: ignore

    def start_listening(self):
        """
        Consume the websocket until it ends, re-publishing activity as events.

        Event sequence:
        - EventType.OPEN once, before the first message is read
        - EventType.MESSAGE for each parsed message
        - EventType.ERROR if a ``websockets.WebSocketException`` is raised
        - EventType.CLOSE always, as the final event
        """
        self._emit(EventType.OPEN, None)
        try:
            for frame in self._websocket:
                payload = frame
                if isinstance(frame, str):
                    payload = json.loads(frame)
                response = parse_obj_as(TextToSpeechStreamingSocketClientResponse, payload)  # type: ignore
                self._emit(EventType.MESSAGE, response)
        except websockets.WebSocketException as error:
            self._emit(EventType.ERROR, error)
        finally:
            self._emit(EventType.CLOSE, None)

    def configure(
        self,
        target_language_code: str,
        speaker: str = "anushka",
        pitch: float = 0.0,
        pace: float = 1.0,
        loudness: float = 1.0,
        speech_sample_rate: int = 22050,
        enable_preprocessing: bool = False,
        output_audio_codec: str = "mp3",
        output_audio_bitrate: str = "128k",
        min_buffer_size: int = 50,
        max_chunk_length: int = 150,
    ) -> None:
        """
        Send the configuration message for this connection.

        A config message is required as the first message after the WebSocket is
        established. It initializes the TTS parameters and may be sent again at
        any point in the connection's lifetime; on an update, text still in the
        buffer is flushed and processed before the new configuration applies.

        :param target_language_code: Language of the text, in BCP-47 format
        :param speaker: Voice used for the output audio. Default: Anushka.
                       Model Compatibility (bulbul:v2): Female: Anushka, Manisha, Vidya, Arya;
                       Male: Abhilash, Karun, Hitesh
        :param pitch: Pitch of the audio. Lower is deeper, higher is sharper.
                     Suitable range: -0.75 to 0.75. Default is 0.0.
        :param pace: Speed of the audio. Lower is slower, higher is faster.
                    Suitable range: 0.5 to 2.0. Default is 1.0.
        :param loudness: Loudness of the audio. Lower is quieter, higher is louder.
                        Suitable range: 0.3 to 3.0. Default is 1.0.
        :param speech_sample_rate: Sample rate of the output audio. Supported values are
                                  8000, 16000, 22050, 24000 Hz. Default is 22050 Hz.
        :param enable_preprocessing: Whether English words and numeric entities (e.g., numbers,
                                    dates) are normalized. Set to true for better handling of
                                    mixed-language text. Default is false.
        :param output_audio_codec: Audio codec (currently supports MP3 only, optimized for real-time playback)
        :param output_audio_bitrate: Audio bitrate (choose from 5 supported bitrate options)
        :param min_buffer_size: Minimum character length that triggers buffer flushing for TTS model processing
        :param max_chunk_length: Maximum length for sentence splitting (adjust based on content length)
        """
        settings = dict(
            target_language_code=target_language_code,
            speaker=speaker,
            pitch=pitch,
            pace=pace,
            loudness=loudness,
            speech_sample_rate=speech_sample_rate,
            enable_preprocessing=enable_preprocessing,
            output_audio_codec=output_audio_codec,
            output_audio_bitrate=output_audio_bitrate,
            min_buffer_size=min_buffer_size,
            max_chunk_length=max_chunk_length,
        )
        config_message = ConfigureConnection(data=ConfigureConnectionData(**settings))
        self._send_model(config_message)

    def convert(self, text: str) -> None:
        """
        Submit text for speech synthesis. Text length should be 1-2500 characters.
        Recommended: <500 characters for optimal streaming performance.
        Real-time endpoints perform better with longer character counts.

        :param text: Text to be synthesized (1-2500 characters, recommended <500)
        """
        text_message = SendText(data=SendTextData(text=text))
        self._send_model(text_message)

    def flush(self) -> None:
        """
        Send a flush signal so buffered text is processed immediately, regardless
        of the min_buffer_size threshold. Useful for remaining text that has not
        reached the minimum buffer size.
        """
        self._send_model(FlushSignal())

    def ping(self) -> None:
        """
        Send a ping signal to keep the WebSocket connection alive. The connection
        automatically closes after one minute of inactivity.
        """
        self._send_model(PingSignal())

    def recv(self) -> TextToSpeechStreamingSocketClientResponse:
        """
        Block for the next message on the websocket and return it parsed.
        """
        frame = self._websocket.recv()
        payload = frame
        if isinstance(frame, str):
            payload = json.loads(frame)
        return parse_obj_as(TextToSpeechStreamingSocketClientResponse, payload)  # type: ignore

    def _send(self, data: typing.Any) -> None:
        """
        Write a message to the websocket; dicts are JSON-encoded first, any other
        value is sent as given.
        """
        outgoing = json.dumps(data) if isinstance(data, dict) else data
        self._websocket.send(outgoing)

    def _send_model(self, data: typing.Any) -> None:
        """
        Convert a Pydantic model to a dict and send it over the websocket.
        """
        as_dict = data.dict()
        self._send(as_dict)
|
sarvamai/types/__init__.py
CHANGED
|
@@ -4,6 +4,8 @@
|
|
|
4
4
|
|
|
5
5
|
from .audio_data import AudioData
|
|
6
6
|
from .audio_message import AudioMessage
|
|
7
|
+
from .audio_output import AudioOutput
|
|
8
|
+
from .audio_output_data import AudioOutputData
|
|
7
9
|
from .chat_completion_request_assistant_message import ChatCompletionRequestAssistantMessage
|
|
8
10
|
from .chat_completion_request_message import (
|
|
9
11
|
ChatCompletionRequestMessage,
|
|
@@ -17,6 +19,11 @@ from .chat_completion_response_message import ChatCompletionResponseMessage
|
|
|
17
19
|
from .choice import Choice
|
|
18
20
|
from .completion_usage import CompletionUsage
|
|
19
21
|
from .config_message import ConfigMessage
|
|
22
|
+
from .configure_connection import ConfigureConnection
|
|
23
|
+
from .configure_connection_data import ConfigureConnectionData
|
|
24
|
+
from .configure_connection_data_output_audio_bitrate import ConfigureConnectionDataOutputAudioBitrate
|
|
25
|
+
from .configure_connection_data_speaker import ConfigureConnectionDataSpeaker
|
|
26
|
+
from .configure_connection_data_target_language_code import ConfigureConnectionDataTargetLanguageCode
|
|
20
27
|
from .create_chat_completion_response import CreateChatCompletionResponse
|
|
21
28
|
from .diarized_entry import DiarizedEntry
|
|
22
29
|
from .diarized_transcript import DiarizedTranscript
|
|
@@ -24,15 +31,21 @@ from .error_code import ErrorCode
|
|
|
24
31
|
from .error_data import ErrorData
|
|
25
32
|
from .error_details import ErrorDetails
|
|
26
33
|
from .error_message import ErrorMessage
|
|
34
|
+
from .error_response import ErrorResponse
|
|
35
|
+
from .error_response_data import ErrorResponseData
|
|
27
36
|
from .events_data import EventsData
|
|
28
37
|
from .finish_reason import FinishReason
|
|
38
|
+
from .flush_signal import FlushSignal
|
|
29
39
|
from .format import Format
|
|
30
40
|
from .language_identification_response import LanguageIdentificationResponse
|
|
31
41
|
from .numerals_format import NumeralsFormat
|
|
42
|
+
from .ping_signal import PingSignal
|
|
32
43
|
from .reasoning_effort import ReasoningEffort
|
|
33
44
|
from .response_type import ResponseType
|
|
34
45
|
from .role import Role
|
|
35
46
|
from .sarvam_model_ids import SarvamModelIds
|
|
47
|
+
from .send_text import SendText
|
|
48
|
+
from .send_text_data import SendTextData
|
|
36
49
|
from .speech_sample_rate import SpeechSampleRate
|
|
37
50
|
from .speech_to_text_language import SpeechToTextLanguage
|
|
38
51
|
from .speech_to_text_model import SpeechToTextModel
|
|
@@ -68,6 +81,8 @@ from .transliteration_response import TransliterationResponse
|
|
|
68
81
|
__all__ = [
|
|
69
82
|
"AudioData",
|
|
70
83
|
"AudioMessage",
|
|
84
|
+
"AudioOutput",
|
|
85
|
+
"AudioOutputData",
|
|
71
86
|
"ChatCompletionRequestAssistantMessage",
|
|
72
87
|
"ChatCompletionRequestMessage",
|
|
73
88
|
"ChatCompletionRequestMessage_Assistant",
|
|
@@ -79,6 +94,11 @@ __all__ = [
|
|
|
79
94
|
"Choice",
|
|
80
95
|
"CompletionUsage",
|
|
81
96
|
"ConfigMessage",
|
|
97
|
+
"ConfigureConnection",
|
|
98
|
+
"ConfigureConnectionData",
|
|
99
|
+
"ConfigureConnectionDataOutputAudioBitrate",
|
|
100
|
+
"ConfigureConnectionDataSpeaker",
|
|
101
|
+
"ConfigureConnectionDataTargetLanguageCode",
|
|
82
102
|
"CreateChatCompletionResponse",
|
|
83
103
|
"DiarizedEntry",
|
|
84
104
|
"DiarizedTranscript",
|
|
@@ -86,15 +106,21 @@ __all__ = [
|
|
|
86
106
|
"ErrorData",
|
|
87
107
|
"ErrorDetails",
|
|
88
108
|
"ErrorMessage",
|
|
109
|
+
"ErrorResponse",
|
|
110
|
+
"ErrorResponseData",
|
|
89
111
|
"EventsData",
|
|
90
112
|
"FinishReason",
|
|
113
|
+
"FlushSignal",
|
|
91
114
|
"Format",
|
|
92
115
|
"LanguageIdentificationResponse",
|
|
93
116
|
"NumeralsFormat",
|
|
117
|
+
"PingSignal",
|
|
94
118
|
"ReasoningEffort",
|
|
95
119
|
"ResponseType",
|
|
96
120
|
"Role",
|
|
97
121
|
"SarvamModelIds",
|
|
122
|
+
"SendText",
|
|
123
|
+
"SendTextData",
|
|
98
124
|
"SpeechSampleRate",
|
|
99
125
|
"SpeechToTextLanguage",
|
|
100
126
|
"SpeechToTextModel",
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
|
2
|
+
|
|
3
|
+
import typing
|
|
4
|
+
|
|
5
|
+
import pydantic
|
|
6
|
+
from ..core.pydantic_utilities import IS_PYDANTIC_V2, UniversalBaseModel
|
|
7
|
+
from .audio_output_data import AudioOutputData
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Message model whose ``type`` is fixed to "audio" and whose ``data`` is an
# ``AudioOutputData`` payload.
class AudioOutput(UniversalBaseModel):
    type: typing.Literal["audio"] = "audio"
    data: AudioOutputData

    # Pydantic v1 reads the nested ``Config`` class, v2 reads ``model_config``;
    # both variants freeze the model and allow extra fields.
    if not IS_PYDANTIC_V2:

        class Config:
            frozen = True
            smart_union = True
            extra = pydantic.Extra.allow

    else:
        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
|
2
|
+
|
|
3
|
+
import typing
|
|
4
|
+
|
|
5
|
+
import pydantic
|
|
6
|
+
from ..core.pydantic_utilities import IS_PYDANTIC_V2, UniversalBaseModel
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# Payload model carrying an audio MIME type and the audio content as a string.
class AudioOutputData(UniversalBaseModel):
    content_type: str = pydantic.Field()
    """
    MIME type describing the audio content (e.g., 'audio/mp3', 'audio/wav')
    """

    audio: str = pydantic.Field()
    """
    Audio data, Base64-encoded, ready for playback or download
    """

    # Pydantic v1 reads the nested ``Config`` class, v2 reads ``model_config``;
    # both variants freeze the model and allow extra fields.
    if not IS_PYDANTIC_V2:

        class Config:
            frozen = True
            smart_union = True
            extra = pydantic.Extra.allow

    else:
        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
|
2
|
+
|
|
3
|
+
import typing
|
|
4
|
+
|
|
5
|
+
import pydantic
|
|
6
|
+
from ..core.pydantic_utilities import IS_PYDANTIC_V2, UniversalBaseModel
|
|
7
|
+
from .configure_connection_data import ConfigureConnectionData
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ConfigureConnection(UniversalBaseModel):
    """
    Configuration message required as the first message after establishing the WebSocket connection.
    This initializes TTS parameters and can be updated at any time during the WebSocket lifecycle
    by sending a new config message. When a config update is sent, any text currently in the buffer
    will be automatically flushed and processed before applying the new configuration.
    """

    # ``type`` is fixed to "config"; ``data`` holds the TTS settings.
    type: typing.Literal["config"] = "config"
    data: ConfigureConnectionData

    # Pydantic v1 reads the nested ``Config`` class, v2 reads ``model_config``;
    # both variants freeze the model and allow extra fields.
    if not IS_PYDANTIC_V2:

        class Config:
            frozen = True
            smart_union = True
            extra = pydantic.Extra.allow

    else:
        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
|