sarvamai 0.1.7a0__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sarvamai/__init__.py +48 -52
- sarvamai/client.py +3 -3
- sarvamai/core/client_wrapper.py +2 -2
- sarvamai/errors/service_unavailable_error.py +2 -1
- sarvamai/requests/__init__.py +20 -22
- sarvamai/requests/audio_output.py +11 -0
- sarvamai/requests/audio_output_data.py +15 -0
- sarvamai/requests/configure_connection.py +18 -0
- sarvamai/requests/configure_connection_data.py +83 -0
- sarvamai/requests/error_response.py +11 -0
- sarvamai/requests/error_response_data.py +18 -0
- sarvamai/requests/flush_signal.py +14 -0
- sarvamai/requests/ping_signal.py +14 -0
- sarvamai/requests/send_text.py +11 -0
- sarvamai/requests/{base_job_parameters.py → send_text_data.py} +2 -2
- sarvamai/speech_to_text/raw_client.py +9 -8
- sarvamai/text_to_speech_streaming/client.py +153 -0
- sarvamai/text_to_speech_streaming/raw_client.py +130 -0
- sarvamai/text_to_speech_streaming/socket_client.py +309 -0
- sarvamai/types/__init__.py +26 -28
- sarvamai/types/{files_request.py → audio_output.py} +4 -3
- sarvamai/types/{bulk_job_callback.py → audio_output_data.py} +5 -5
- sarvamai/types/configure_connection.py +28 -0
- sarvamai/types/configure_connection_data.py +93 -0
- sarvamai/types/configure_connection_data_output_audio_bitrate.py +7 -0
- sarvamai/types/configure_connection_data_speaker.py +7 -0
- sarvamai/types/configure_connection_data_target_language_code.py +8 -0
- sarvamai/types/{file_signed_url_details.py → error_response.py} +4 -3
- sarvamai/types/{files_download_response.py → error_response_data.py} +11 -8
- sarvamai/types/flush_signal.py +24 -0
- sarvamai/types/ping_signal.py +24 -0
- sarvamai/types/{task_file_details.py → send_text.py} +4 -3
- sarvamai/types/{base_job_parameters.py → send_text_data.py} +3 -1
- {sarvamai-0.1.7a0.dist-info → sarvamai-0.1.8.dist-info}/METADATA +1 -1
- {sarvamai-0.1.7a0.dist-info → sarvamai-0.1.8.dist-info}/RECORD +37 -38
- sarvamai/requests/bulk_job_callback.py +0 -15
- sarvamai/requests/bulk_job_init_response_v_1.py +0 -27
- sarvamai/requests/file_signed_url_details.py +0 -10
- sarvamai/requests/files_download_response.py +0 -15
- sarvamai/requests/files_request.py +0 -10
- sarvamai/requests/files_upload_response.py +0 -15
- sarvamai/requests/job_status_v_1.py +0 -70
- sarvamai/requests/speech_to_text_job_parameters.py +0 -32
- sarvamai/requests/task_detail_v_1.py +0 -15
- sarvamai/requests/task_file_details.py +0 -8
- sarvamai/speech_to_text_job/client.py +0 -454
- sarvamai/speech_to_text_job/raw_client.py +0 -1189
- sarvamai/types/bulk_job_init_response_v_1.py +0 -39
- sarvamai/types/files_upload_response.py +0 -25
- sarvamai/types/job_state.py +0 -5
- sarvamai/types/job_status_v_1.py +0 -80
- sarvamai/types/speech_to_text_job_parameters.py +0 -44
- sarvamai/types/storage_container_type.py +0 -5
- sarvamai/types/task_detail_v_1.py +0 -25
- sarvamai/types/task_state.py +0 -5
- /sarvamai/{speech_to_text_job → text_to_speech_streaming}/__init__.py +0 -0
- {sarvamai-0.1.7a0.dist-info → sarvamai-0.1.8.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
|
2
|
+
|
|
3
|
+
import typing
|
|
4
|
+
from contextlib import asynccontextmanager, contextmanager
|
|
5
|
+
|
|
6
|
+
import httpx
|
|
7
|
+
import websockets
|
|
8
|
+
import websockets.sync.client as websockets_sync_client
|
|
9
|
+
from ..core.api_error import ApiError
|
|
10
|
+
from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
|
|
11
|
+
from ..core.request_options import RequestOptions
|
|
12
|
+
from .raw_client import AsyncRawTextToSpeechStreamingClient, RawTextToSpeechStreamingClient
|
|
13
|
+
from .socket_client import AsyncTextToSpeechStreamingSocketClient, TextToSpeechStreamingSocketClient
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class TextToSpeechStreamingClient:
    """
    Synchronous entry point for the text-to-speech streaming WebSocket API.

    All work is delegated to a ``RawTextToSpeechStreamingClient`` built from the
    same client wrapper, so connection set-up lives in exactly one place.
    """

    def __init__(self, *, client_wrapper: SyncClientWrapper):
        self._raw_client = RawTextToSpeechStreamingClient(client_wrapper=client_wrapper)

    @property
    def with_raw_response(self) -> RawTextToSpeechStreamingClient:
        """
        Retrieves a raw implementation of this client that returns raw responses.

        Returns
        -------
        RawTextToSpeechStreamingClient
        """
        return self._raw_client

    @contextmanager
    def connect(
        self,
        *,
        model: typing.Optional[typing.Literal["bulbul:v2"]] = None,
        api_subscription_key: typing.Optional[str] = None,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> typing.Iterator[TextToSpeechStreamingSocketClient]:
        """
        Bidirectional WebSocket channel for real-time TTS synthesis.
        Supports streaming, flushing, config updates, and audio playback.

        Parameters
        ----------
        model : typing.Optional[typing.Literal["bulbul:v2"]]
            Text to speech model to use

        api_subscription_key : typing.Optional[str]
            API subscription key for authentication

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration.

        Returns
        -------
        TextToSpeechStreamingSocketClient
        """
        # The raw client performs the identical URL/header construction and
        # handshake error mapping; delegating keeps the two from drifting apart.
        with self._raw_client.connect(
            model=model,
            api_subscription_key=api_subscription_key,
            request_options=request_options,
        ) as socket:
            yield socket
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class AsyncTextToSpeechStreamingClient:
    """
    Asynchronous entry point for the text-to-speech streaming WebSocket API.

    All work is delegated to an ``AsyncRawTextToSpeechStreamingClient`` built from
    the same client wrapper, so connection set-up lives in exactly one place.
    """

    def __init__(self, *, client_wrapper: AsyncClientWrapper):
        self._raw_client = AsyncRawTextToSpeechStreamingClient(client_wrapper=client_wrapper)

    @property
    def with_raw_response(self) -> AsyncRawTextToSpeechStreamingClient:
        """
        Retrieves a raw implementation of this client that returns raw responses.

        Returns
        -------
        AsyncRawTextToSpeechStreamingClient
        """
        return self._raw_client

    @asynccontextmanager
    async def connect(
        self,
        *,
        model: typing.Optional[typing.Literal["bulbul:v2"]] = None,
        api_subscription_key: typing.Optional[str] = None,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> typing.AsyncIterator[AsyncTextToSpeechStreamingSocketClient]:
        """
        Bidirectional WebSocket channel for real-time TTS synthesis.
        Supports streaming, flushing, config updates, and audio playback.

        Parameters
        ----------
        model : typing.Optional[typing.Literal["bulbul:v2"]]
            Text to speech model to use

        api_subscription_key : typing.Optional[str]
            API subscription key for authentication

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration.

        Returns
        -------
        AsyncTextToSpeechStreamingSocketClient
        """
        # The raw client performs the identical URL/header construction and
        # handshake error mapping; delegating keeps the two from drifting apart.
        async with self._raw_client.connect(
            model=model,
            api_subscription_key=api_subscription_key,
            request_options=request_options,
        ) as socket:
            yield socket
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
|
2
|
+
|
|
3
|
+
import typing
|
|
4
|
+
from contextlib import asynccontextmanager, contextmanager
|
|
5
|
+
|
|
6
|
+
import httpx
|
|
7
|
+
import websockets
|
|
8
|
+
import websockets.sync.client as websockets_sync_client
|
|
9
|
+
from ..core.api_error import ApiError
|
|
10
|
+
from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
|
|
11
|
+
from ..core.request_options import RequestOptions
|
|
12
|
+
from .socket_client import AsyncTextToSpeechStreamingSocketClient, TextToSpeechStreamingSocketClient
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class RawTextToSpeechStreamingClient:
    """Low-level synchronous client that opens the TTS streaming WebSocket."""

    def __init__(self, *, client_wrapper: SyncClientWrapper):
        self._client_wrapper = client_wrapper

    @staticmethod
    def _handshake_status_code(exc: BaseException) -> typing.Optional[int]:
        """
        Extract the HTTP status from a failed-handshake exception.

        Legacy ``InvalidStatusCode`` exposes ``status_code`` directly, while
        ``InvalidStatus`` carries it on ``response.status_code``. Returns ``None``
        when the exception has neither.
        """
        status_code = getattr(exc, "status_code", None)
        if status_code is None:
            status_code = getattr(getattr(exc, "response", None), "status_code", None)
        return status_code

    @contextmanager
    def connect(
        self,
        *,
        model: typing.Optional[typing.Literal["bulbul:v2"]] = None,
        api_subscription_key: typing.Optional[str] = None,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> typing.Iterator[TextToSpeechStreamingSocketClient]:
        """
        Bidirectional WebSocket channel for real-time TTS synthesis.
        Supports streaming, flushing, config updates, and audio playback.

        Parameters
        ----------
        model : typing.Optional[typing.Literal["bulbul:v2"]]
            Text to speech model to use

        api_subscription_key : typing.Optional[str]
            API subscription key for authentication

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration.

        Returns
        -------
        TextToSpeechStreamingSocketClient

        Raises
        ------
        ApiError
            If the server rejects the WebSocket handshake with an HTTP status.
        """
        ws_url = self._client_wrapper.get_environment().production + "/text-to-speech/ws"
        query_params = httpx.QueryParams()
        if model is not None:
            query_params = query_params.add("model", model)
        # Only append a query string when there is one, avoiding a dangling "?".
        query_string = str(query_params)
        if query_string:
            ws_url = f"{ws_url}?{query_string}"

        headers = self._client_wrapper.get_headers()
        if api_subscription_key is not None:
            headers["Api-Subscription-Key"] = str(api_subscription_key)
        if request_options and "additional_headers" in request_options:
            headers.update(request_options["additional_headers"])

        try:
            with websockets_sync_client.connect(ws_url, additional_headers=headers) as protocol:
                yield TextToSpeechStreamingSocketClient(websocket=protocol)
        except websockets.exceptions.InvalidHandshake as exc:
            # InvalidHandshake is the common base of InvalidStatus (raised by the
            # sync client) and the legacy InvalidStatusCode, so both are mapped.
            status_code = self._handshake_status_code(exc)
            if status_code is None:
                # Not an HTTP-status rejection; let the original error surface.
                raise
            if status_code == 401:
                body = "Websocket initialized with invalid credentials."
            else:
                body = "Unexpected error when initializing websocket connection."
            # NOTE(review): these are the *request* headers (including the API
            # subscription key), kept for backward compatibility — consider
            # redacting or switching to the response headers.
            raise ApiError(
                status_code=status_code,
                headers=dict(headers),
                body=body,
            ) from exc
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class AsyncRawTextToSpeechStreamingClient:
    """Low-level asynchronous client that opens the TTS streaming WebSocket."""

    def __init__(self, *, client_wrapper: AsyncClientWrapper):
        self._client_wrapper = client_wrapper

    @staticmethod
    def _handshake_status_code(exc: BaseException) -> typing.Optional[int]:
        """
        Extract the HTTP status from a failed-handshake exception.

        Legacy ``InvalidStatusCode`` exposes ``status_code`` directly, while
        ``InvalidStatus`` carries it on ``response.status_code``. Returns ``None``
        when the exception has neither.
        """
        status_code = getattr(exc, "status_code", None)
        if status_code is None:
            status_code = getattr(getattr(exc, "response", None), "status_code", None)
        return status_code

    @asynccontextmanager
    async def connect(
        self,
        *,
        model: typing.Optional[typing.Literal["bulbul:v2"]] = None,
        api_subscription_key: typing.Optional[str] = None,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> typing.AsyncIterator[AsyncTextToSpeechStreamingSocketClient]:
        """
        Bidirectional WebSocket channel for real-time TTS synthesis.
        Supports streaming, flushing, config updates, and audio playback.

        Parameters
        ----------
        model : typing.Optional[typing.Literal["bulbul:v2"]]
            Text to speech model to use

        api_subscription_key : typing.Optional[str]
            API subscription key for authentication

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration.

        Returns
        -------
        AsyncTextToSpeechStreamingSocketClient

        Raises
        ------
        ApiError
            If the server rejects the WebSocket handshake with an HTTP status.
        """
        ws_url = self._client_wrapper.get_environment().production + "/text-to-speech/ws"
        query_params = httpx.QueryParams()
        if model is not None:
            query_params = query_params.add("model", model)
        # Only append a query string when there is one, avoiding a dangling "?".
        query_string = str(query_params)
        if query_string:
            ws_url = f"{ws_url}?{query_string}"

        headers = self._client_wrapper.get_headers()
        if api_subscription_key is not None:
            headers["Api-Subscription-Key"] = str(api_subscription_key)
        if request_options and "additional_headers" in request_options:
            headers.update(request_options["additional_headers"])

        # ``extra_headers`` is the keyword of the legacy asyncio client. Newer
        # websockets releases point ``websockets.connect`` at a client that
        # expects ``additional_headers`` instead, so prefer the legacy client
        # explicitly and fall back to the new keyword only if it is unavailable.
        try:
            from websockets.legacy.client import connect as legacy_connect
        except ImportError:
            legacy_connect = None
        if legacy_connect is not None:
            connection = legacy_connect(ws_url, extra_headers=headers)
        else:
            connection = websockets.connect(ws_url, additional_headers=headers)

        try:
            async with connection as protocol:
                yield AsyncTextToSpeechStreamingSocketClient(websocket=protocol)
        except websockets.exceptions.InvalidHandshake as exc:
            # InvalidHandshake is the common base of the legacy InvalidStatusCode
            # and the newer InvalidStatus, so either implementation is mapped.
            status_code = self._handshake_status_code(exc)
            if status_code is None:
                # Not an HTTP-status rejection; let the original error surface.
                raise
            if status_code == 401:
                body = "Websocket initialized with invalid credentials."
            else:
                body = "Unexpected error when initializing websocket connection."
            # NOTE(review): these are the *request* headers (including the API
            # subscription key), kept for backward compatibility — consider
            # redacting or switching to the response headers.
            raise ApiError(
                status_code=status_code,
                headers=dict(headers),
                body=body,
            ) from exc
|
|
@@ -0,0 +1,309 @@
|
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import typing
|
|
5
|
+
|
|
6
|
+
import websockets
|
|
7
|
+
import websockets.sync.connection as websockets_sync_connection
|
|
8
|
+
from ..core.events import EventEmitterMixin, EventType
|
|
9
|
+
from ..core.pydantic_utilities import parse_obj_as
|
|
10
|
+
from ..types.audio_output import AudioOutput
|
|
11
|
+
from ..types.flush_signal import FlushSignal
|
|
12
|
+
from ..types.error_response import ErrorResponse
|
|
13
|
+
from ..types.configure_connection import ConfigureConnection
|
|
14
|
+
from ..types.configure_connection_data import ConfigureConnectionData
|
|
15
|
+
from ..types.ping_signal import PingSignal
|
|
16
|
+
from ..types.send_text import SendText
|
|
17
|
+
from ..types.send_text_data import SendTextData
|
|
18
|
+
|
|
19
|
+
# Every message read from the socket is parsed into one of these types.
TextToSpeechStreamingSocketClientResponse = typing.Union[AudioOutput, ErrorResponse]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class AsyncTextToSpeechStreamingSocketClient(EventEmitterMixin):
    """
    Asynchronous wrapper around an open TTS streaming WebSocket.

    Outgoing messages are built from the typed request models and sent as JSON;
    incoming messages are parsed into ``TextToSpeechStreamingSocketClientResponse``.
    """

    def __init__(self, *, websocket: websockets.WebSocketClientProtocol):
        super().__init__()
        self._websocket = websocket

    @staticmethod
    def _parse_message(raw_message: typing.Any) -> TextToSpeechStreamingSocketClientResponse:
        """
        Decode one incoming frame into a typed response.

        Text frames are JSON-decoded first; any other payload is handed to the
        parser unchanged.
        """
        payload = json.loads(raw_message) if isinstance(raw_message, str) else raw_message
        return parse_obj_as(TextToSpeechStreamingSocketClientResponse, payload)  # type: ignore

    async def __aiter__(self):
        """Yield each incoming message, parsed, until the connection closes."""
        async for message in self._websocket:
            yield self._parse_message(message)

    async def start_listening(self):
        """
        Start listening for messages on the websocket connection.

        Emits events in the following order:
        - EventType.OPEN when connection is established
        - EventType.MESSAGE for each message received
        - EventType.ERROR if an error occurs
        - EventType.CLOSE when connection is closed
        """
        self._emit(EventType.OPEN, None)
        try:
            async for raw_message in self._websocket:
                self._emit(EventType.MESSAGE, self._parse_message(raw_message))
        except (websockets.WebSocketException, json.JSONDecodeError) as exc:
            # A malformed text frame is reported through the ERROR event, like a
            # transport failure, instead of escaping the listener.
            self._emit(EventType.ERROR, exc)
        finally:
            self._emit(EventType.CLOSE, None)

    async def configure(
        self,
        target_language_code: str,
        speaker: str = "anushka",
        pitch: float = 0.0,
        pace: float = 1.0,
        loudness: float = 1.0,
        speech_sample_rate: int = 22050,
        enable_preprocessing: bool = False,
        output_audio_codec: str = "mp3",
        output_audio_bitrate: str = "128k",
        min_buffer_size: int = 50,
        max_chunk_length: int = 150,
    ) -> None:
        """
        Configuration message required as the first message after establishing the WebSocket connection.
        This initializes TTS parameters and can be updated at any time during the WebSocket lifecycle
        by sending a new config message. When a config update is sent, any text currently in the buffer
        will be automatically flushed and processed before applying the new configuration.

        :param target_language_code: The language of the text in BCP-47 format
        :param speaker: The speaker voice to be used for the output audio. Default: Anushka.
            Model Compatibility (bulbul:v2): Female: Anushka, Manisha, Vidya, Arya;
            Male: Abhilash, Karun, Hitesh
        :param pitch: Controls the pitch of the audio. Lower values result in a deeper voice,
            while higher values make it sharper. The suitable range is between -0.75
            and 0.75. Default is 0.0.
        :param pace: Controls the speed of the audio. Lower values result in slower speech,
            while higher values make it faster. The suitable range is between 0.5
            and 2.0. Default is 1.0.
        :param loudness: Controls the loudness of the audio. Lower values result in quieter audio,
            while higher values make it louder. The suitable range is between 0.3
            and 3.0. Default is 1.0.
        :param speech_sample_rate: Specifies the sample rate of the output audio. Supported values are
            8000, 16000, 22050, 24000 Hz. If not provided, the default is 22050 Hz.
        :param enable_preprocessing: Controls whether normalization of English words and numeric entities
            (e.g., numbers, dates) is performed. Set to true for better handling
            of mixed-language text. Default is false.
        :param output_audio_codec: Audio codec (currently supports MP3 only, optimized for real-time playback)
        :param output_audio_bitrate: Audio bitrate (choose from 5 supported bitrate options)
        :param min_buffer_size: Minimum character length that triggers buffer flushing for TTS model processing
        :param max_chunk_length: Maximum length for sentence splitting (adjust based on content length)
        """
        data = ConfigureConnectionData(
            target_language_code=target_language_code,
            speaker=speaker,
            pitch=pitch,
            pace=pace,
            loudness=loudness,
            speech_sample_rate=speech_sample_rate,
            enable_preprocessing=enable_preprocessing,
            output_audio_codec=output_audio_codec,
            output_audio_bitrate=output_audio_bitrate,
            min_buffer_size=min_buffer_size,
            max_chunk_length=max_chunk_length,
        )
        await self._send_model(ConfigureConnection(data=data))

    async def convert(self, text: str) -> None:
        """
        Send text to be converted to speech. Text length should be 1-2500 characters.
        Recommended: <500 characters for optimal streaming performance.
        Real-time endpoints perform better with longer character counts.

        :param text: Text to be synthesized (1-2500 characters, recommended <500)
        """
        await self._send_model(SendText(data=SendTextData(text=text)))

    async def flush(self) -> None:
        """
        Forces the text buffer to process immediately, regardless of the min_buffer_size threshold.
        Use this when you need to process remaining text that hasn't reached the minimum buffer size.
        """
        await self._send_model(FlushSignal())

    async def ping(self) -> None:
        """
        Send ping signal to keep the WebSocket connection alive. The connection automatically
        closes after one minute of inactivity.
        """
        await self._send_model(PingSignal())

    async def recv(self) -> TextToSpeechStreamingSocketClientResponse:
        """
        Receive a message from the websocket connection.
        """
        return self._parse_message(await self._websocket.recv())

    async def _send(self, data: typing.Any) -> None:
        """
        Send a message to the websocket connection.

        Dicts are serialized to JSON; anything else is sent as given.
        """
        if isinstance(data, dict):
            data = json.dumps(data)
        await self._websocket.send(data)

    async def _send_model(self, data: typing.Any) -> None:
        """
        Send a Pydantic model to the websocket connection.
        """
        await self._send(data.dict())
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
class TextToSpeechStreamingSocketClient(EventEmitterMixin):
    """
    Synchronous wrapper around an open TTS streaming WebSocket.

    Outgoing messages are built from the typed request models and sent as JSON;
    incoming messages are parsed into ``TextToSpeechStreamingSocketClientResponse``.
    """

    def __init__(self, *, websocket: websockets_sync_connection.Connection):
        super().__init__()
        self._websocket = websocket

    @staticmethod
    def _parse_message(raw_message: typing.Any) -> TextToSpeechStreamingSocketClientResponse:
        """
        Decode one incoming frame into a typed response.

        Text frames are JSON-decoded first; any other payload is handed to the
        parser unchanged.
        """
        payload = json.loads(raw_message) if isinstance(raw_message, str) else raw_message
        return parse_obj_as(TextToSpeechStreamingSocketClientResponse, payload)  # type: ignore

    def __iter__(self):
        """Yield each incoming message, parsed, until the connection closes."""
        for message in self._websocket:
            yield self._parse_message(message)

    def start_listening(self):
        """
        Start listening for messages on the websocket connection.

        Emits events in the following order:
        - EventType.OPEN when connection is established
        - EventType.MESSAGE for each message received
        - EventType.ERROR if an error occurs
        - EventType.CLOSE when connection is closed
        """
        self._emit(EventType.OPEN, None)
        try:
            for raw_message in self._websocket:
                self._emit(EventType.MESSAGE, self._parse_message(raw_message))
        except (websockets.WebSocketException, json.JSONDecodeError) as exc:
            # A malformed text frame is reported through the ERROR event, like a
            # transport failure, instead of escaping the listener.
            self._emit(EventType.ERROR, exc)
        finally:
            self._emit(EventType.CLOSE, None)

    def configure(
        self,
        target_language_code: str,
        speaker: str = "anushka",
        pitch: float = 0.0,
        pace: float = 1.0,
        loudness: float = 1.0,
        speech_sample_rate: int = 22050,
        enable_preprocessing: bool = False,
        output_audio_codec: str = "mp3",
        output_audio_bitrate: str = "128k",
        min_buffer_size: int = 50,
        max_chunk_length: int = 150,
    ) -> None:
        """
        Configuration message required as the first message after establishing the WebSocket connection.
        This initializes TTS parameters and can be updated at any time during the WebSocket lifecycle
        by sending a new config message. When a config update is sent, any text currently in the buffer
        will be automatically flushed and processed before applying the new configuration.

        :param target_language_code: The language of the text in BCP-47 format
        :param speaker: The speaker voice to be used for the output audio. Default: Anushka.
            Model Compatibility (bulbul:v2): Female: Anushka, Manisha, Vidya, Arya;
            Male: Abhilash, Karun, Hitesh
        :param pitch: Controls the pitch of the audio. Lower values result in a deeper voice,
            while higher values make it sharper. The suitable range is between -0.75
            and 0.75. Default is 0.0.
        :param pace: Controls the speed of the audio. Lower values result in slower speech,
            while higher values make it faster. The suitable range is between 0.5
            and 2.0. Default is 1.0.
        :param loudness: Controls the loudness of the audio. Lower values result in quieter audio,
            while higher values make it louder. The suitable range is between 0.3
            and 3.0. Default is 1.0.
        :param speech_sample_rate: Specifies the sample rate of the output audio. Supported values are
            8000, 16000, 22050, 24000 Hz. If not provided, the default is 22050 Hz.
        :param enable_preprocessing: Controls whether normalization of English words and numeric entities
            (e.g., numbers, dates) is performed. Set to true for better handling
            of mixed-language text. Default is false.
        :param output_audio_codec: Audio codec (currently supports MP3 only, optimized for real-time playback)
        :param output_audio_bitrate: Audio bitrate (choose from 5 supported bitrate options)
        :param min_buffer_size: Minimum character length that triggers buffer flushing for TTS model processing
        :param max_chunk_length: Maximum length for sentence splitting (adjust based on content length)
        """
        data = ConfigureConnectionData(
            target_language_code=target_language_code,
            speaker=speaker,
            pitch=pitch,
            pace=pace,
            loudness=loudness,
            speech_sample_rate=speech_sample_rate,
            enable_preprocessing=enable_preprocessing,
            output_audio_codec=output_audio_codec,
            output_audio_bitrate=output_audio_bitrate,
            min_buffer_size=min_buffer_size,
            max_chunk_length=max_chunk_length,
        )
        self._send_model(ConfigureConnection(data=data))

    def convert(self, text: str) -> None:
        """
        Send text to be converted to speech. Text length should be 1-2500 characters.
        Recommended: <500 characters for optimal streaming performance.
        Real-time endpoints perform better with longer character counts.

        :param text: Text to be synthesized (1-2500 characters, recommended <500)
        """
        self._send_model(SendText(data=SendTextData(text=text)))

    def flush(self) -> None:
        """
        Forces the text buffer to process immediately, regardless of the min_buffer_size threshold.
        Use this when you need to process remaining text that hasn't reached the minimum buffer size.
        """
        self._send_model(FlushSignal())

    def ping(self) -> None:
        """
        Send ping signal to keep the WebSocket connection alive. The connection automatically
        closes after one minute of inactivity.
        """
        self._send_model(PingSignal())

    def recv(self) -> TextToSpeechStreamingSocketClientResponse:
        """
        Receive a message from the websocket connection.
        """
        return self._parse_message(self._websocket.recv())

    def _send(self, data: typing.Any) -> None:
        """
        Send a message to the websocket connection.

        Dicts are serialized to JSON; anything else is sent as given.
        """
        if isinstance(data, dict):
            data = json.dumps(data)
        self._websocket.send(data)

    def _send_model(self, data: typing.Any) -> None:
        """
        Send a Pydantic model to the websocket connection.
        """
        self._send(data.dict())
|