livekit-plugins-cartesia 0.1.1__py3-none-any.whl → 0.2.0.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,39 +9,6 @@ TTSEncoding = Literal[
9
9
  ]
10
10
 
11
11
 
12
- TTSModels = Literal["upbeat-moon"]
13
-
14
-
15
- # fmt: off
16
- # Barbershop Man in upbeat-moon
17
- TTSDefaultVoiceEmbedding: list[float] = [
18
- -0.033633083, 0.072083704, -0.01807767, -0.083488315, -0.04407617, 0.0022592682, 0.070505895,
19
- 0.023946615, -0.04788024, -0.06388413, -0.0716355, -0.0022612812, -0.0053448505, -0.07848381,
20
- 0.0348162, -0.053745482, -0.092399485, -0.02950225, 0.028591828, -0.10556894, 0.023313355,
21
- 0.06224387, 0.0362463, 0.029258432, 0.10769641, 0.043595582, -0.058543224, -0.080402784,
22
- -0.0953816, -0.008988032, -0.0028981369, -0.004752721, -0.20742874, 0.058907595, 0.08813939,
23
- -0.06192675, 0.099082634, -0.09661578, -0.0077761724, -0.013982456, -0.025798267, 0.04467142,
24
- 0.026222011, 0.023023574, 0.011227064, -0.17462021, -0.09880612, -0.1521035, -0.060464993,
25
- -0.04735665, -0.09725187, -0.006127679, 0.15818526, -0.039493002, -0.067719474, 0.0066190436,
26
- -0.10636633, 0.17073768, -0.051717706, 0.03186961, -0.020547207, -0.02244247, 0.013196935,
27
- -0.06431055, -0.115360335, 0.016918058, -0.033195216, 0.11255181, 0.020366343, -0.041032124,
28
- 0.08780918, -0.040567942, 0.057276532, 0.05848221, -0.077479474, -0.073524915, -0.01913317,
29
- -0.029291833, 0.11210393, -0.09859328, 0.2152541, -0.022976823, 0.028627992, -0.039598297,
30
- 0.041829932, -0.05593181, -0.06444655, -0.018057477, -0.008098263, 0.05994528, 0.10430693,
31
- -0.13121894, -0.06512868, -0.026126215, 0.046727825, -0.17180993, -0.10577226, -0.08610466,
32
- 0.008862588, 0.09547498, -0.010965332, -0.061217085, -0.038954042, 0.019930292, -0.017192135,
33
- 0.007296275, 0.03273872, 0.04389937, -0.056483064, 0.003420891, -0.10319067, -0.015706042,
34
- 0.1308774, -0.0018035866, -0.03582506, 0.077131025, 0.013398928, 0.003188886, 0.12039741,
35
- -0.033974767, 0.06899378, -0.059775922, -0.026934423, 0.028482193, 0.100996524, 0.004498743,
36
- -0.02291186, 0.078752205, -0.0063796206, 0.04206536, 0.05721349, 0.06290694, 0.06130212,
37
- 0.096969016, -0.057664312, -0.16727506, -0.035220966, 0.090760484, 0.010039947, 0.06513242,
38
- 0.011055657, -0.004258431, -0.08316792, -0.15650468, -0.076931365, 0.11385587, -0.038372636,
39
- 0.015648656, -0.12029895, -0.06604956, 0.009441591, -0.11912808, 0.013378132, 0.029525978,
40
- -0.0056742397, -0.0075976513, 0.019999338, -0.05521377, -0.07650746, -0.017710293, -0.033986397,
41
- -0.047768556, 0.13857274, 0.099290825, 0.11736938, 0.017834296, -0.07140237, -0.052047748,
42
- -0.06398965, -0.037033975, -0.061061256, -0.03330076, -0.024472248, -0.059656, 0.05359946,
43
- -0.043915518, -0.086325996, 0.14189173, 0.021086395, 0.02945159, 0.1029604, 0.018490415,
44
- -0.028736332, -0.025272416, -0.06082937, -0.031339463, -0.0007249595, 0.025595888, 0.007144545,
45
- -0.16938712, -0.1160664, -0.0654145,
46
- ]
47
- # fmt: on
12
+ TTSModels = Literal["sonic-english", "sonic-multilingual"]
13
+ TTSLanguages = Literal["en", "es", "fr", "de", "pt", "zh", "ja"]
14
+ TTSDefaultVoiceId = "248be419-c632-4f23-adf1-5324ed7dbf1d"
@@ -14,18 +14,14 @@
14
14
 
15
15
  from __future__ import annotations
16
16
 
17
- import asyncio
18
- import contextlib
19
17
  import os
20
18
  from dataclasses import dataclass
21
- from typing import Optional
22
19
 
23
20
  import aiohttp
24
- from livekit import rtc
25
21
  from livekit.agents import tts, utils
26
22
 
27
23
  from .log import logger
28
- from .models import TTSDefaultVoiceEmbedding, TTSEncoding, TTSModels
24
+ from .models import TTSDefaultVoiceId, TTSEncoding, TTSModels
29
25
 
30
26
  API_AUTH_HEADER = "X-API-Key"
31
27
  API_VERSION_HEADER = "Cartesia-Version"
@@ -39,21 +35,23 @@ class _TTSOptions:
39
35
  sample_rate: int
40
36
  voice: str | list[float]
41
37
  api_key: str
38
+ language: str
42
39
 
43
40
 
44
41
  class TTS(tts.TTS):
45
42
  def __init__(
46
43
  self,
47
44
  *,
48
- model: TTSModels = "upbeat-moon",
45
+ model: TTSModels = "sonic-english",
46
+ language: str = "en",
49
47
  encoding: TTSEncoding = "pcm_s16le",
50
- voice: str | list[float] = TTSDefaultVoiceEmbedding,
48
+ voice: str | list[float] = TTSDefaultVoiceId,
51
49
  sample_rate: int = 24000,
52
50
  api_key: str | None = None,
53
51
  http_session: aiohttp.ClientSession | None = None,
54
52
  ) -> None:
55
53
  super().__init__(
56
- streaming_supported=False,
54
+ capabilities=tts.TTSCapabilities(streaming=False),
57
55
  sample_rate=sample_rate,
58
56
  num_channels=1,
59
57
  )
@@ -64,6 +62,7 @@ class TTS(tts.TTS):
64
62
 
65
63
  self._opts = _TTSOptions(
66
64
  model=model,
65
+ language=language,
67
66
  encoding=encoding,
68
67
  sample_rate=sample_rate,
69
68
  voice=voice,
@@ -73,14 +72,11 @@ class TTS(tts.TTS):
73
72
 
74
73
  def _ensure_session(self) -> aiohttp.ClientSession:
75
74
  if not self._session:
76
- self._session = utils.http_session()
75
+ self._session = utils.http_context.http_session()
77
76
 
78
77
  return self._session
79
78
 
80
- def synthesize(
81
- self,
82
- text: str,
83
- ) -> "ChunkedStream":
79
+ def synthesize(self, text: str) -> "ChunkedStream":
84
80
  return ChunkedStream(text, self._opts, self._ensure_session())
85
81
 
86
82
 
@@ -88,14 +84,17 @@ class ChunkedStream(tts.ChunkedStream):
88
84
  def __init__(
89
85
  self, text: str, opts: _TTSOptions, session: aiohttp.ClientSession
90
86
  ) -> None:
91
- self._opts = opts
92
- self._text = text
93
- self._session = session
94
- self._main_task: asyncio.Task | None = None
95
- self._queue = asyncio.Queue[Optional[tts.SynthesizedAudio]]()
87
+ super().__init__()
88
+ self._text, self._opts, self._session = text, opts, session
96
89
 
97
90
  @utils.log_exceptions(logger=logger)
98
- async def _run(self):
91
+ async def _main_task(self):
92
+ bstream = utils.audio.AudioByteStream(
93
+ sample_rate=self._opts.sample_rate, num_channels=1
94
+ )
95
+ request_id = utils.shortuuid()
96
+ segment_id = utils.shortuuid()
97
+
99
98
  voice = {}
100
99
  if isinstance(self._opts.voice, str):
101
100
  voice["mode"] = "id"
@@ -104,77 +103,37 @@ class ChunkedStream(tts.ChunkedStream):
104
103
  voice["mode"] = "embedding"
105
104
  voice["embedding"] = self._opts.voice
106
105
 
107
- try:
108
- async with self._session.post(
109
- "https://api.cartesia.ai/tts/bytes",
110
- headers={
111
- API_AUTH_HEADER: f"{self._opts.api_key}",
112
- API_VERSION_HEADER: API_VERSION,
113
- },
114
- json={
115
- "model_id": self._opts.model,
116
- "transcript": self._text,
117
- "voice": voice,
118
- "output_format": {
119
- "container": "raw",
120
- "encoding": self._opts.encoding,
121
- "sample_rate": self._opts.sample_rate,
122
- },
123
- },
124
- ) as resp:
125
- bytes_per_frame = (self._opts.sample_rate // 100) * 2
126
- buf = bytearray()
127
-
128
- async for data, _ in resp.content.iter_chunks():
129
- buf.extend(data)
130
-
131
- while len(buf) >= bytes_per_frame:
132
- frame_data = buf[:bytes_per_frame]
133
- buf = buf[bytes_per_frame:]
134
-
135
- self._queue.put_nowait(
136
- tts.SynthesizedAudio(
137
- text=self._text,
138
- data=rtc.AudioFrame(
139
- data=frame_data,
140
- sample_rate=self._opts.sample_rate,
141
- num_channels=1,
142
- samples_per_channel=len(frame_data) // 2,
143
- ),
144
- )
145
- )
146
-
147
- # send any remaining data
148
- if len(buf) > 0:
149
- self._queue.put_nowait(
106
+ data = {
107
+ "model_id": self._opts.model,
108
+ "transcript": self._text,
109
+ "voice": voice,
110
+ "output_format": {
111
+ "container": "raw",
112
+ "encoding": self._opts.encoding,
113
+ "sample_rate": self._opts.sample_rate,
114
+ },
115
+ "language": self._opts.language,
116
+ }
117
+
118
+ async with self._session.post(
119
+ "https://api.cartesia.ai/tts/bytes",
120
+ headers={
121
+ API_AUTH_HEADER: f"{self._opts.api_key}",
122
+ API_VERSION_HEADER: API_VERSION,
123
+ },
124
+ json=data,
125
+ ) as resp:
126
+ async for data, _ in resp.content.iter_chunks():
127
+ for frame in bstream.write(data):
128
+ self._event_ch.send_nowait(
150
129
  tts.SynthesizedAudio(
151
- text=self._text,
152
- data=rtc.AudioFrame(
153
- data=buf,
154
- sample_rate=self._opts.sample_rate,
155
- num_channels=1,
156
- samples_per_channel=len(buf) // 2,
157
- ),
130
+ request_id=request_id, segment_id=segment_id, frame=frame
158
131
  )
159
132
  )
160
133
 
161
- finally:
162
- self._queue.put_nowait(None)
163
-
164
- async def __anext__(self) -> tts.SynthesizedAudio:
165
- if not self._main_task:
166
- self._main_task = asyncio.create_task(self._run())
167
-
168
- frame = await self._queue.get()
169
- if frame is None:
170
- raise StopAsyncIteration
171
-
172
- return frame
173
-
174
- async def aclose(self) -> None:
175
- if not self._main_task:
176
- return
177
-
178
- self._main_task.cancel()
179
- with contextlib.suppress(asyncio.CancelledError):
180
- await self._main_task
134
+ for frame in bstream.flush():
135
+ self._event_ch.send_nowait(
136
+ tts.SynthesizedAudio(
137
+ request_id=request_id, segment_id=segment_id, frame=frame
138
+ )
139
+ )
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "0.1.1"
15
+ __version__ = "0.2.0-dev.1"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-cartesia
3
- Version: 0.1.1
3
+ Version: 0.2.0.dev1
4
4
  Summary: LiveKit Agents Plugin for Cartesia
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -0,0 +1,10 @@
1
+ livekit/plugins/cartesia/__init__.py,sha256=_a8u7qqya1pjZTV19gNOpMKTO7ccAVZAeCukiDKAG-U,937
2
+ livekit/plugins/cartesia/log.py,sha256=4Mnhjng_DU1dIWP9IWjIQGZ67EV3LnQhWMWCHVudJbo,71
3
+ livekit/plugins/cartesia/models.py,sha256=06S-Z-M90kB-kEOQsQk70xfQUD-TztU4ZIU_AfAyUMc,335
4
+ livekit/plugins/cartesia/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ livekit/plugins/cartesia/tts.py,sha256=S5BMSVtsbNI_c2PpgyFK6wvleudmJZLTUt3ZmGNKlRI,4319
6
+ livekit/plugins/cartesia/version.py,sha256=ypu6ttoYyC198vzZ_HCF0aB8kPNeygXXxDGxbrCf9s4,606
7
+ livekit_plugins_cartesia-0.2.0.dev1.dist-info/METADATA,sha256=fCqrA_MFJSMweSuHmtD29iZoSVYzFD1TROXmay8AWcE,1250
8
+ livekit_plugins_cartesia-0.2.0.dev1.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
9
+ livekit_plugins_cartesia-0.2.0.dev1.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
10
+ livekit_plugins_cartesia-0.2.0.dev1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.43.0)
2
+ Generator: setuptools (71.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,10 +0,0 @@
1
- livekit/plugins/cartesia/__init__.py,sha256=_a8u7qqya1pjZTV19gNOpMKTO7ccAVZAeCukiDKAG-U,937
2
- livekit/plugins/cartesia/log.py,sha256=4Mnhjng_DU1dIWP9IWjIQGZ67EV3LnQhWMWCHVudJbo,71
3
- livekit/plugins/cartesia/models.py,sha256=Qhl51ZScuB61bEzN1tBlHMuHO_kCXSzuVOicYa16EL8,2922
4
- livekit/plugins/cartesia/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- livekit/plugins/cartesia/tts.py,sha256=16BneZFQQsS-lB9Ug1HYj4QW7-VnNdpTJ0CW5A1b9EU,5725
6
- livekit/plugins/cartesia/version.py,sha256=3-nEcobvIJfZdV4yNIRuYpAGQ3svREnYIv2ivxoIZcQ,600
7
- livekit_plugins_cartesia-0.1.1.dist-info/METADATA,sha256=MfqyeBD4BF8NE4A8O9hIboC0WMmQ5EKo8RPzkGc8-a8,1245
8
- livekit_plugins_cartesia-0.1.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
9
- livekit_plugins_cartesia-0.1.1.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
10
- livekit_plugins_cartesia-0.1.1.dist-info/RECORD,,