livekit-plugins-rumik-ai 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,26 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ .eggs/
6
+ build/
7
+ dist/
8
+ .venv/
9
+ venv/
10
+
11
+ # Tooling caches
12
+ .pytest_cache/
13
+ .mypy_cache/
14
+ .ruff_cache/
15
+
16
+ # Env / secrets
17
+ .env
18
+ .env.local
19
+
20
+ # Example output
21
+ recordings/
22
+
23
+ # OS / editor
24
+ .DS_Store
25
+ .idea/
26
+ .vscode/
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or Derivative
95
+ Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and do
117
+ not modify the License. You may add Your own attribution notices
118
+ within Derivative Works that You distribute, alongside or as an
119
+ addendum to the NOTICE text from the Work, provided that such
120
+ additional attribution notices cannot be construed as modifying
121
+ the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2026 Rumik AI
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
@@ -0,0 +1,132 @@
1
+ Metadata-Version: 2.4
2
+ Name: livekit-plugins-rumik-ai
3
+ Version: 0.1.1
4
+ Summary: LiveKit Agents plugin for text-to-speech with Rumik AI (muga & mulberry).
5
+ Project-URL: Homepage, https://rumik.ai/
6
+ Project-URL: Source, https://github.com/rumik-ai/livekit-plugins-rumik-ai
7
+ Project-URL: Issues, https://github.com/rumik-ai/livekit-plugins-rumik-ai/issues
8
+ Project-URL: Documentation, https://docs.livekit.io/agents/integrations/tts/
9
+ Author-email: Rumik AI <hello@rumik.ai>
10
+ License-Expression: Apache-2.0
11
+ License-File: LICENSE
12
+ Keywords: audio,hinglish,livekit,realtime,rumik-ai,text-to-speech,tts,webrtc
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: Apache Software License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3 :: Only
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Multimedia :: Sound/Audio
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Requires-Python: >=3.10.0
23
+ Requires-Dist: livekit-agents[codecs]<2,>=1.5
24
+ Provides-Extra: dev
25
+ Requires-Dist: aiohttp; extra == 'dev'
26
+ Requires-Dist: mypy; extra == 'dev'
27
+ Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
28
+ Requires-Dist: pytest>=8; extra == 'dev'
29
+ Requires-Dist: ruff; extra == 'dev'
30
+ Description-Content-Type: text/markdown
31
+
32
+ # livekit-plugins-rumik-ai
33
+
34
+ [Rumik AI](https://rumik.ai/) text-to-speech plugin for [LiveKit Agents](https://github.com/livekit/agents).
35
+
36
+ Streams low-latency 24 kHz speech from Rumik's **Silk** models over a reusable WebSocket
37
+ session:
38
+
39
+ - **muga** — emotion-controlled via a leading `[tone]` tag (e.g. `[happy]`, `[sad]`) plus
40
+ optional `<laugh>`/`<chuckle>`/`<sigh>` events. Tuned for Romanized Hinglish.
41
+ - **mulberry** — steered by a natural-language voice `description` or a preset `speaker`,
42
+ with optional pitch shift (`f0_up_key`).
43
+
44
+ ## Install
45
+
46
+ ```bash
47
+ pip install livekit-plugins-rumik-ai
48
+ ```
49
+
50
+ This installs `livekit-agents[codecs]` (>=1.5, <2) as a dependency. Set your Rumik AI API key:
51
+
52
+ ```bash
53
+ export RUMIK_API_KEY="your-rumik-api-key"
54
+ ```
55
+
56
+ ## Quickstart
57
+
58
+ ```python
59
+ from livekit.agents import AgentSession
60
+ from livekit.plugins import rumik_ai
61
+
62
+ # muga: the LLM should start each reply with one tone tag, e.g. "[happy] ..."
63
+ session = AgentSession(
64
+ stt=...,
65
+ llm=...,
66
+ tts=rumik_ai.TTS(model="muga"),
67
+ )
68
+ ```
69
+
70
+ Mulberry, steered by a voice description (or a preset speaker):
71
+
72
+ ```python
73
+ tts = rumik_ai.TTS(
74
+ model="mulberry",
75
+ description="warm, gentle female friend",
76
+ # speaker="speaker_1", # optional preset, overrides description
77
+ # f0_up_key=2.0, # optional pitch shift, -12..12 semitones
78
+ )
79
+ ```
80
+
81
+ ### Changing the voice at runtime
82
+
83
+ `description`, `speaker`, `f0_up_key`, and the sampling params are sent on **every
84
+ request**, so you can change mulberry's voice between turns without reconnecting — the
85
+ pooled WebSocket is reused (only a `model` change re-mints the session):
86
+
87
+ ```python
88
+ tts.update_options(description="excited young man, fast and energetic")
89
+ # the next synthesis request uses the new voice
90
+ ```
91
+
92
+ ## Latency vs. smoothness
93
+
94
+ The default for `full_response_aggregation` is model-aware:
95
+
96
+ - **muga** buffers the full LLM reply and synthesizes it in one request, so its leading
97
+ `[tone]` tag conditions the whole utterance (and there are no per-request TTFB gaps).
98
+ - **mulberry** streams sentence-by-sentence for lower time-to-first-word, since it has
99
+ no tone tag to protect.
100
+
101
+ Override either with `full_response_aggregation`:
102
+
103
+ ```python
104
+ rumik_ai.TTS(model="muga", full_response_aggregation=False, tone="neutral") # muga, lower latency
105
+ rumik_ai.TTS(model="mulberry", full_response_aggregation=True) # mulberry, smoother
106
+ ```
107
+
108
+ When you turn aggregation **off for muga**, set a fallback `tone=` so every sentence
109
+ keeps a tone tag.
110
+
111
+ ## Configuration
112
+
113
+ | Argument | Models | Notes |
114
+ |---|---|---|
115
+ | `model` | both | `"muga"` (default) or `"mulberry"` |
116
+ | `tone` | muga | fallback tone when input is untagged |
117
+ | `description` | mulberry | natural-language voice description |
118
+ | `speaker` | mulberry | preset voice, `speaker_1`..`speaker_4`; overrides `description` |
119
+ | `f0_up_key` | mulberry | pitch shift in semitones, `-12`..`12` |
120
+ | `temperature`, `top_p`, `top_k`, `repetition_penalty`, `max_new_tokens` | both | omitted unless set (Rumik defaults apply) |
121
+ | `full_response_aggregation` | both | buffer the full reply (`True`) vs. stream per sentence (`False`). Default: `True` for muga, `False` for mulberry |
122
+ | `api_key` | — | defaults to the `RUMIK_API_KEY` environment variable |
123
+ | `base_url` | — | defaults to `https://silk-api.rumik.ai` |
124
+
125
+ ## Examples
126
+
127
+ See [`examples/`](./examples) for a full voice agent (`rumik_ai_agent.py`) and a
128
+ record-to-WAV demo (`rumik_ai_tts.py`).
129
+
130
+ ## License
131
+
132
+ Apache-2.0
@@ -0,0 +1,101 @@
1
+ # livekit-plugins-rumik-ai
2
+
3
+ [Rumik AI](https://rumik.ai/) text-to-speech plugin for [LiveKit Agents](https://github.com/livekit/agents).
4
+
5
+ Streams low-latency 24 kHz speech from Rumik's **Silk** models over a reusable WebSocket
6
+ session:
7
+
8
+ - **muga** — emotion-controlled via a leading `[tone]` tag (e.g. `[happy]`, `[sad]`) plus
9
+ optional `<laugh>`/`<chuckle>`/`<sigh>` events. Tuned for Romanized Hinglish.
10
+ - **mulberry** — steered by a natural-language voice `description` or a preset `speaker`,
11
+ with optional pitch shift (`f0_up_key`).
12
+
13
+ ## Install
14
+
15
+ ```bash
16
+ pip install livekit-plugins-rumik-ai
17
+ ```
18
+
19
+ This installs `livekit-agents[codecs]` (>=1.5, <2) as a dependency. Set your Rumik AI API key:
20
+
21
+ ```bash
22
+ export RUMIK_API_KEY="your-rumik-api-key"
23
+ ```
24
+
25
+ ## Quickstart
26
+
27
+ ```python
28
+ from livekit.agents import AgentSession
29
+ from livekit.plugins import rumik_ai
30
+
31
+ # muga: the LLM should start each reply with one tone tag, e.g. "[happy] ..."
32
+ session = AgentSession(
33
+ stt=...,
34
+ llm=...,
35
+ tts=rumik_ai.TTS(model="muga"),
36
+ )
37
+ ```
38
+
39
+ Mulberry, steered by a voice description (or a preset speaker):
40
+
41
+ ```python
42
+ tts = rumik_ai.TTS(
43
+ model="mulberry",
44
+ description="warm, gentle female friend",
45
+ # speaker="speaker_1", # optional preset, overrides description
46
+ # f0_up_key=2.0, # optional pitch shift, -12..12 semitones
47
+ )
48
+ ```
49
+
50
+ ### Changing the voice at runtime
51
+
52
+ `description`, `speaker`, `f0_up_key`, and the sampling params are sent on **every
53
+ request**, so you can change mulberry's voice between turns without reconnecting — the
54
+ pooled WebSocket is reused (only a `model` change re-mints the session):
55
+
56
+ ```python
57
+ tts.update_options(description="excited young man, fast and energetic")
58
+ # the next synthesis request uses the new voice
59
+ ```
60
+
61
+ ## Latency vs. smoothness
62
+
63
+ The default for `full_response_aggregation` is model-aware:
64
+
65
+ - **muga** buffers the full LLM reply and synthesizes it in one request, so its leading
66
+ `[tone]` tag conditions the whole utterance (and there are no per-request TTFB gaps).
67
+ - **mulberry** streams sentence-by-sentence for lower time-to-first-word, since it has
68
+ no tone tag to protect.
69
+
70
+ Override either with `full_response_aggregation`:
71
+
72
+ ```python
73
+ rumik_ai.TTS(model="muga", full_response_aggregation=False, tone="neutral") # muga, lower latency
74
+ rumik_ai.TTS(model="mulberry", full_response_aggregation=True) # mulberry, smoother
75
+ ```
76
+
77
+ When you turn aggregation **off for muga**, set a fallback `tone=` so every sentence
78
+ keeps a tone tag.
79
+
80
+ ## Configuration
81
+
82
+ | Argument | Models | Notes |
83
+ |---|---|---|
84
+ | `model` | both | `"muga"` (default) or `"mulberry"` |
85
+ | `tone` | muga | fallback tone when input is untagged |
86
+ | `description` | mulberry | natural-language voice description |
87
+ | `speaker` | mulberry | preset voice, `speaker_1`..`speaker_4`; overrides `description` |
88
+ | `f0_up_key` | mulberry | pitch shift in semitones, `-12`..`12` |
89
+ | `temperature`, `top_p`, `top_k`, `repetition_penalty`, `max_new_tokens` | both | omitted unless set (Rumik defaults apply) |
90
+ | `full_response_aggregation` | both | buffer the full reply (`True`) vs. stream per sentence (`False`). Default: `True` for muga, `False` for mulberry |
91
+ | `api_key` | — | defaults to the `RUMIK_API_KEY` environment variable |
92
+ | `base_url` | — | defaults to `https://silk-api.rumik.ai` |
93
+
94
+ ## Examples
95
+
96
+ See [`examples/`](./examples) for a full voice agent (`rumik_ai_agent.py`) and a
97
+ record-to-WAV demo (`rumik_ai_tts.py`).
98
+
99
+ ## License
100
+
101
+ Apache-2.0
@@ -0,0 +1,46 @@
1
+ # Copyright 2026 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Rumik AI plugin for LiveKit Agents.
16
+
17
+ Support for low-latency text-to-speech with Rumik AI Silk models.
18
+
19
+ For API access, visit https://rumik.ai/
20
+ """
21
+
22
+ from .tts import TTS
23
+ from .version import __version__
24
+
25
+ __all__ = ["TTS", "__version__"]
26
+
27
+ from livekit.agents import Plugin
28
+
29
+ from .log import logger
30
+
31
+
32
+ class RumikAIPlugin(Plugin):
33
+ def __init__(self) -> None:
34
+ super().__init__(__name__, __version__, __package__, logger)
35
+
36
+
37
+ Plugin.register_plugin(RumikAIPlugin())
38
+
39
+ # Cleanup docs of unexported modules
40
+ _module = dir()
41
+ NOT_IN_ALL = [m for m in _module if m not in __all__]
42
+
43
+ __pdoc__ = {}
44
+
45
+ for n in NOT_IN_ALL:
46
+ __pdoc__[n] = False
@@ -0,0 +1,17 @@
1
+ # Copyright 2026 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import logging
16
+
17
+ logger = logging.getLogger("livekit.plugins.rumik_ai")
@@ -0,0 +1,715 @@
1
+ # Copyright 2026 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import annotations
16
+
17
+ import asyncio
18
+ import json
19
+ import os
20
+ import re
21
+ import weakref
22
+ from collections.abc import Callable
23
+ from dataclasses import dataclass, replace
24
+ from typing import Any, Literal, cast
25
+ from urllib.parse import quote
26
+
27
+ import aiohttp
28
+
29
+ from livekit.agents import (
30
+ APIConnectionError,
31
+ APIConnectOptions,
32
+ APIStatusError,
33
+ APITimeoutError,
34
+ create_api_error_from_http,
35
+ tts,
36
+ utils,
37
+ )
38
+ from livekit.agents.types import DEFAULT_API_CONNECT_OPTIONS, NOT_GIVEN, NotGivenOr
39
+ from livekit.agents.utils import is_given
40
+
41
+ from .log import logger
42
+
43
+ Model = Literal["muga", "mulberry"]
44
+ MugaTone = Literal["happy", "excited", "sad", "angry", "neutral", "whisper"]
45
+ MulberrySpeaker = Literal["speaker_1", "speaker_2", "speaker_3", "speaker_4"]
46
+
47
+ DEFAULT_BASE_URL = "https://silk-api.rumik.ai"
48
+ DEFAULT_SAMPLE_RATE = 24000
49
+ NUM_CHANNELS = 1
50
+ MAX_TEXT_LENGTH = 2000
51
+
52
+ # Text used only to mint a reusable WebSocket session. The real text is sent per
53
+ # synthesis request over the same socket, so this value is never synthesized.
54
+ _INIT_TEXT = "init"
55
+ # Backstop cap on a pooled session's age. Heartbeat pings (see _WS_HEARTBEAT) keep
56
+ # the socket warm so Rumik rarely idle-closes it between turns, and stale sockets are
57
+ # still detected on checkout and re-minted -- so this is only a safety cap, not the
58
+ # primary refresh mechanism.
59
+ _MAX_SESSION_DURATION = 300.0
60
+ # aiohttp sends a WebSocket ping every _WS_HEARTBEAT seconds and drops the socket if
61
+ # no pong returns. This keeps the reused session alive across conversational pauses so
62
+ # the next turn does not pay a fresh mint, mirroring the Rumik Pipecat integration.
63
+ _WS_HEARTBEAT = 20.0
64
+
65
+ MUGA_TONES = {"happy", "excited", "sad", "angry", "neutral", "whisper"}
66
+ MUGA_EVENTS = {"laugh", "chuckle", "sigh"}
67
+ MUGA_EVENT_COMPATIBILITY = {
68
+ "happy": {"laugh", "chuckle"},
69
+ "excited": {"laugh", "chuckle"},
70
+ "sad": {"sigh"},
71
+ "angry": {"sigh"},
72
+ "neutral": {"laugh", "sigh"},
73
+ "whisper": {"chuckle", "sigh"},
74
+ }
75
+ MULBERRY_SPEAKERS = {"speaker_1", "speaker_2", "speaker_3", "speaker_4"}
76
+
77
+ _SQUARE_TAG_RE = re.compile(r"\[([^\]]+)\]")
78
+ _TONE_PREFIX_RE = re.compile(r"^\[([^\]]+)\](.*)$", re.DOTALL)
79
+ _EVENT_TAG_RE = re.compile(r"<([^>]+)>")
80
+ _DEVANAGARI_RE = re.compile(r"[\u0900-\u097F]")
81
+
82
+
83
+ class _StaleConnectionError(Exception):
84
+ """Raised when a pooled WebSocket is found closed; triggers a reconnect."""
85
+
86
+
87
+ @dataclass
88
+ class _TTSOptions:
89
+ model: Model | str
90
+ api_key: str
91
+ tone: MugaTone | str | None
92
+ description: NotGivenOr[str]
93
+ speaker: NotGivenOr[MulberrySpeaker | str]
94
+ f0_up_key: NotGivenOr[float]
95
+ temperature: NotGivenOr[float]
96
+ top_p: NotGivenOr[float]
97
+ top_k: NotGivenOr[int]
98
+ repetition_penalty: NotGivenOr[float]
99
+ max_new_tokens: NotGivenOr[int]
100
+ base_url: str
101
+
102
+
103
+ class TTS(tts.TTS):
104
+ def __init__(
105
+ self,
106
+ *,
107
+ model: Model | str = "muga",
108
+ tone: MugaTone | str | None = None,
109
+ description: NotGivenOr[str] = NOT_GIVEN,
110
+ speaker: NotGivenOr[MulberrySpeaker | str] = NOT_GIVEN,
111
+ f0_up_key: NotGivenOr[float] = NOT_GIVEN,
112
+ temperature: NotGivenOr[float] = NOT_GIVEN,
113
+ top_p: NotGivenOr[float] = NOT_GIVEN,
114
+ top_k: NotGivenOr[int] = NOT_GIVEN,
115
+ repetition_penalty: NotGivenOr[float] = NOT_GIVEN,
116
+ max_new_tokens: NotGivenOr[int] = NOT_GIVEN,
117
+ full_response_aggregation: NotGivenOr[bool] = NOT_GIVEN,
118
+ api_key: str | None = None,
119
+ base_url: str = DEFAULT_BASE_URL,
120
+ http_session: aiohttp.ClientSession | None = None,
121
+ ) -> None:
122
+ """Create a Rumik AI text-to-speech client.
123
+
124
+ The client maintains a reusable Rumik WebSocket session: it mints the
125
+ session once and streams every synthesis request over the same socket,
126
+ re-minting only when the socket goes stale. By default ``muga`` buffers the full
127
+ LLM response and synthesizes it in one request so its leading ``[tone]`` tag
128
+ conditions the whole utterance; ``mulberry`` has no tone tag, so it streams
129
+ sentence-by-sentence for lower latency (see ``full_response_aggregation``).
130
+
131
+ Args:
132
+ model: Rumik AI Silk model to use. Supports ``"muga"`` and ``"mulberry"``.
133
+ tone: Optional Muga fallback tone. When omitted, each Muga input must already
134
+ start with a valid tone tag such as ``[happy]`` or ``[sad]``. When provided,
135
+ untagged input is prefixed with this tone, and existing tags must match it.
136
+ description: Mulberry-only natural language voice description.
137
+ speaker: Optional Mulberry preset speaker, ``speaker_1`` through ``speaker_4``.
138
+ f0_up_key: Mulberry-only pitch shift in semitones, from -12 to 12.
139
+ temperature: Optional sampling temperature. Omitted unless set, so Rumik AI
140
+ applies its own default.
141
+ top_p: Optional nucleus sampling value. Omitted unless set.
142
+ top_k: Optional top-k sampling value. Omitted unless set.
143
+ repetition_penalty: Optional penalty for repeated tokens. Omitted unless set.
144
+ max_new_tokens: Optional output length cap. Omitted unless set.
145
+ full_response_aggregation: When True, buffer the complete LLM response and
146
+ synthesize it in one request to avoid sentence-level TTFB gaps. When
147
+ False, stream sentence-by-sentence via the framework's StreamAdapter for
148
+ lower latency; with muga, set a fallback ``tone`` so each sentence keeps
149
+ a tone tag. Defaults to True for muga (its ``[tone]`` tag must condition
150
+ the whole utterance) and False for mulberry (lower latency).
151
+ api_key: Rumik AI API key. If not provided, reads ``RUMIK_API_KEY``.
152
+ base_url: Rumik AI API base URL.
153
+ http_session: Existing aiohttp session to use.
154
+ """
155
+ # muga buffers the full LLM response and synthesizes it in one request
156
+ # (streaming=True) so its leading [tone] tag conditions the whole utterance and
157
+ # there are no audible gaps from Rumik's per-request TTFB. mulberry has no tone
158
+ # tag, so by default it streams sentence-by-sentence via the framework's
159
+ # StreamAdapter for lower time-to-first-word. An explicit value always wins.
160
+ if is_given(full_response_aggregation):
161
+ full_aggregation = full_response_aggregation
162
+ else:
163
+ full_aggregation = model != "mulberry"
164
+
165
+ super().__init__(
166
+ capabilities=tts.TTSCapabilities(streaming=full_aggregation, aligned_transcript=False),
167
+ sample_rate=DEFAULT_SAMPLE_RATE,
168
+ num_channels=NUM_CHANNELS,
169
+ )
170
+
171
+ api_key = api_key or os.environ.get("RUMIK_API_KEY")
172
+ if not api_key:
173
+ raise ValueError(
174
+ "Rumik AI API key is required, either as argument or set RUMIK_API_KEY"
175
+ )
176
+
177
+ opts = _TTSOptions(
178
+ model=model,
179
+ api_key=api_key,
180
+ tone=tone,
181
+ description=description,
182
+ speaker=speaker,
183
+ f0_up_key=f0_up_key,
184
+ temperature=temperature,
185
+ top_p=top_p,
186
+ top_k=top_k,
187
+ repetition_penalty=repetition_penalty,
188
+ max_new_tokens=max_new_tokens,
189
+ base_url=base_url.rstrip("/"),
190
+ )
191
+ _validate_options(opts)
192
+
193
+ self._opts = opts
194
+ self._session = http_session
195
+ self._streams = weakref.WeakSet[SynthesizeStream]()
196
+ # Mint the Rumik session once and reuse the socket across requests/turns.
197
+ self._pool = utils.ConnectionPool[aiohttp.ClientWebSocketResponse](
198
+ connect_cb=self._connect_ws,
199
+ close_cb=self._close_ws,
200
+ max_session_duration=_MAX_SESSION_DURATION,
201
+ )
202
+
203
+ @property
204
+ def model(self) -> str:
205
+ return self._opts.model
206
+
207
+ @property
208
+ def provider(self) -> str:
209
+ return "Rumik AI"
210
+
211
+ def _ensure_session(self) -> aiohttp.ClientSession:
212
+ if not self._session:
213
+ self._session = utils.http_context.http_session()
214
+ return self._session
215
+
216
async def _connect_ws(self, timeout: float) -> aiohttp.ClientWebSocketResponse:
    """Mint a Rumik session and open its WebSocket (the pool's connect callback).

    ``timeout`` is applied twice: once to the mint request and once to the
    socket open. Errors raised by ``_mint_ws_session`` propagate unchanged.

    Raises:
        APITimeoutError: opening the socket took longer than ``timeout``.
        APIConnectionError: any other failure while opening the socket.
    """
    session_url = await self._mint_ws_session(timeout)

    async def _dial() -> aiohttp.ClientWebSocketResponse:
        socket = await self._ensure_session().ws_connect(session_url, heartbeat=_WS_HEARTBEAT)
        # aiohttp's overloads narrow the autoping type parameter to ``bool`` when
        # ``heartbeat`` is passed; at runtime autoping stays True, so normalize
        # the static type here.
        return cast(aiohttp.ClientWebSocketResponse, socket)

    try:
        return await asyncio.wait_for(_dial(), timeout)
    except asyncio.TimeoutError:
        raise APITimeoutError() from None
    except Exception as exc:
        raise APIConnectionError() from exc
231
+
232
+ async def _close_ws(self, ws: aiohttp.ClientWebSocketResponse) -> None:
233
+ # Tell Rumik we're done so it can release the session, then close. Best-effort:
234
+ # the socket may already be gone (idle timeout, server close, interruption).
235
+ try:
236
+ if not ws.closed:
237
+ await ws.send_str(json.dumps({"type": "close"}))
238
+ except Exception:
239
+ pass
240
+ await ws.close()
241
+
242
async def _mint_ws_session(self, timeout: float) -> str:
    """POST to ``/v1/tts/ws-connect`` and return the tokenized WebSocket URL.

    The request carries the API key as a bearer token plus the configured
    model and ``_INIT_TEXT``. The response must be a JSON object with string
    ``ws_url`` and ``token`` fields; the URL-quoted token is appended to
    ``ws_url`` as a ``token`` query parameter.

    Raises:
        APITimeoutError: the request timed out.
        APIStatusError: HTTP status >= 400 (via ``create_api_error_from_http``),
            a non-JSON body, or a body missing ``ws_url``/``token``.
        APIConnectionError: any other failure.
    """
    try:
        endpoint = f"{self._opts.base_url}/v1/tts/ws-connect"
        async with self._ensure_session().post(
            endpoint,
            headers={"Authorization": f"Bearer {self._opts.api_key}"},
            json={"model": self._opts.model, "text": _INIT_TEXT},
            timeout=aiohttp.ClientTimeout(total=timeout),
        ) as resp:
            body = await _read_response_body(resp)
            status = resp.status
            if status >= 400:
                raise create_api_error_from_http(_error_message(body), status=status, body=body)

            if not isinstance(body, dict):
                raise APIStatusError(
                    "Rumik AI ws-connect returned a non-JSON response",
                    status_code=status,
                    body=body,
                    retryable=False,
                )

            ws_url, token = body.get("ws_url"), body.get("token")
            if not (isinstance(ws_url, str) and isinstance(token, str)):
                raise APIStatusError(
                    "Rumik AI ws-connect response is missing ws_url or token",
                    status_code=status,
                    body=body,
                    retryable=False,
                )

            # Respect a query string the server may already have put on the URL.
            joiner = "?" if "?" not in ws_url else "&"
            return f"{ws_url}{joiner}token={quote(token)}"
    except asyncio.TimeoutError:
        raise APITimeoutError() from None
    except APIStatusError:
        # Already the right shape; don't rewrap as a connection error.
        raise
    except Exception as exc:
        raise APIConnectionError() from exc
281
+
282
def update_options(
    self,
    *,
    model: NotGivenOr[Model | str] = NOT_GIVEN,
    tone: NotGivenOr[MugaTone | str | None] = NOT_GIVEN,
    description: NotGivenOr[str] = NOT_GIVEN,
    speaker: NotGivenOr[MulberrySpeaker | str] = NOT_GIVEN,
    f0_up_key: NotGivenOr[float] = NOT_GIVEN,
    temperature: NotGivenOr[float] = NOT_GIVEN,
    top_p: NotGivenOr[float] = NOT_GIVEN,
    top_k: NotGivenOr[int] = NOT_GIVEN,
    repetition_penalty: NotGivenOr[float] = NOT_GIVEN,
    max_new_tokens: NotGivenOr[int] = NOT_GIVEN,
) -> None:
    """Apply the given overrides to the TTS options.

    Only arguments that were actually passed are changed. The overrides are
    applied to a copy, validated with ``_validate_options``, and only then
    stored, so a rejected update leaves the current options untouched.

    Options travel with each synthesis request, so a change applies to the
    next request without reconnecting -- useful for varying mulberry's
    ``description``, ``speaker``, or ``f0_up_key`` between turns. ``model``
    is the exception: it is pinned when the WebSocket session is minted, so
    changing it invalidates the pooled connection and the next request
    re-mints.

    Raises:
        ValueError: the resulting option set fails validation.
    """
    candidate = replace(self._opts)
    model_changed = is_given(model) and model != candidate.model

    overrides = {
        "model": model,
        "tone": tone,
        "description": description,
        "speaker": speaker,
        "f0_up_key": f0_up_key,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "repetition_penalty": repetition_penalty,
        "max_new_tokens": max_new_tokens,
    }
    for field_name, value in overrides.items():
        if is_given(value):
            setattr(candidate, field_name, value)

    _validate_options(candidate)
    self._opts = candidate
    if model_changed:
        # The model is fixed at mint time, so existing pooled sockets are stale.
        self._pool.invalidate()
333
+
334
def synthesize(
    self,
    text: str,
    *,
    conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
) -> ChunkedStream:
    """Create a ``ChunkedStream`` that synthesizes ``text`` as one request."""
    chunked = ChunkedStream(tts=self, input_text=text, conn_options=conn_options)
    return chunked
341
+
342
def stream(
    self,
    *,
    conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
) -> SynthesizeStream:
    """Create a ``SynthesizeStream`` and register it so ``aclose`` can close it."""
    new_stream = SynthesizeStream(tts=self, conn_options=conn_options)
    self._streams.add(new_stream)
    return new_stream
350
+
351
def prewarm(self) -> None:
    """Ask the connection pool to prewarm, ahead of the first request."""
    pool = self._pool
    pool.prewarm()
354
+
355
async def aclose(self) -> None:
    """Close every tracked stream, forget them, then close the connection pool."""
    # Snapshot first: the tracked set must not change size while we iterate.
    open_streams = list(self._streams)
    for open_stream in open_streams:
        await open_stream.aclose()
    self._streams.clear()
    await self._pool.aclose()
360
+
361
async def _stream_synthesis(
    self,
    text: str,
    opts: _TTSOptions,
    conn_options: APIConnectOptions,
    output_emitter: tts.AudioEmitter,
    *,
    on_started: Callable[[], None] | None = None,
) -> None:
    """Synthesize one request over the pooled WebSocket and push PCM out.

    Shared by SynthesizeStream (full-response mode: one request for the whole
    reply) and ChunkedStream (per-sentence StreamAdapter mode, or a
    FallbackAdapter: one request per fixed piece of text).

    Args:
        text: Raw text to speak; it is normalized and validated by ``_prepare_text``.
        opts: Options snapshot used to prepare the text and build the payload.
        conn_options: Supplies the timeout for acquiring the connection and for
            each receive.
        output_emitter: Receives the audio frames.
        on_started: Optional callback invoked right before the request is sent.

    Raises:
        APIConnectionError: the text failed validation (``retryable=False``,
            carrying the validation message), the socket was stale on both
            attempts, or any other unexpected failure occurred.
        APITimeoutError: an operation timed out.
        APIStatusError: the provider or the HTTP layer reported an error status.
    """
    try:
        prepared_text = _prepare_text(text, opts)
        payload = _synthesis_payload(prepared_text, opts)
    except ValueError as e:
        # Validation failures are deterministic: retrying the same text can
        # never succeed. Keep the exception type callers already handle, but
        # mark it non-retryable and preserve the message for diagnosis.
        raise APIConnectionError(str(e), retryable=False) from e
    except Exception as e:
        raise APIConnectionError() from e

    try:
        # Reconnect-on-stale: a pooled socket may have been idle-closed by Rumik.
        # Detect that before sending and ask the pool for a connection once more.
        for attempt in range(2):
            try:
                async with self._pool.connection(timeout=conn_options.timeout) as ws:
                    if ws.closed:
                        raise _StaleConnectionError
                    await self._stream_on_ws(
                        ws, payload, conn_options, output_emitter, on_started
                    )
                    return
            except _StaleConnectionError:
                if attempt == 1:
                    raise APIConnectionError("Rumik AI WebSocket is unavailable") from None
    except asyncio.TimeoutError:
        raise APITimeoutError() from None
    except (APIConnectionError, APIStatusError, APITimeoutError):
        # Already in the framework's error vocabulary; pass through untouched.
        raise
    except aiohttp.ClientResponseError as e:
        raise APIStatusError(message=e.message, status_code=e.status, body=None) from None
    except Exception as e:
        raise APIConnectionError() from e
402
+
403
async def _stream_on_ws(
    self,
    ws: aiohttp.ClientWebSocketResponse,
    payload: dict[str, Any],
    conn_options: APIConnectOptions,
    output_emitter: tts.AudioEmitter,
    on_started: Callable[[], None] | None,
) -> None:
    """Send one synthesis request on ``ws`` and relay frames until it finishes.

    Binary frames are pushed to ``output_emitter``. Text frames are JSON
    events: ``done`` ends the request, an error event or a ``timeout`` event
    raises, and anything else is logged and ignored. A close frame ends the
    request successfully only if audio was already received and the close
    code is absent or 1000.

    Raises:
        APIStatusError: the provider sent an error event.
        APIConnectionError: socket error, idle-timeout event, malformed event,
            or an unexpected close.
    """
    closing_types = (
        aiohttp.WSMsgType.CLOSE,
        aiohttp.WSMsgType.CLOSED,
        aiohttp.WSMsgType.CLOSING,
    )

    if on_started is not None:
        on_started()
    await ws.send_str(json.dumps(payload))

    got_audio = False
    while True:
        msg = await ws.receive(timeout=conn_options.timeout)
        frame_type = msg.type

        if frame_type == aiohttp.WSMsgType.BINARY:
            got_audio = True
            output_emitter.push(msg.data)
            continue

        if frame_type == aiohttp.WSMsgType.ERROR:
            raise APIConnectionError(f"Rumik AI WebSocket error: {ws.exception()!r}")

        if frame_type in closing_types:
            close_code = getattr(ws, "close_code", None) or getattr(msg, "data", None)
            # The server closed the socket, so it cannot be reused.
            self._pool.remove(ws)
            if got_audio and close_code in (None, 1000):
                return
            raise APIConnectionError(
                f"Rumik AI WebSocket closed unexpectedly: code={close_code!r}"
            )

        if frame_type != aiohttp.WSMsgType.TEXT:
            # Any other frame type carries nothing for us; keep reading.
            continue

        event = _loads_event(msg.data)
        event_type = event.get("type")
        if event_type == "done":
            # rtf > 1.0 means Rumik generated slower than real time, which can
            # starve playback and sound robotic; surface it for diagnosis.
            logger.debug(
                "Rumik AI TTS synthesis complete",
                extra={
                    "audio_duration": event.get("audio_duration"),
                    "rtf": event.get("rtf"),
                },
            )
            return
        if event_type == "error" or "error" in event:
            raise _provider_error(event)
        if event_type == "timeout":
            # Server idle-closed the session; raising lets the caller discard it.
            raise APIConnectionError("Rumik AI WebSocket idle timeout")
        # "queued" and any other informational events are ignored.
        logger.debug("Ignoring Rumik AI TTS event", extra={"event": event})
457
+
458
+
459
class ChunkedStream(tts.ChunkedStream):
    """A single Rumik request for one fixed piece of text.

    Used for a StreamAdapter sentence or as a fallback synthesis path.
    """

    def __init__(self, *, tts: TTS, input_text: str, conn_options: APIConnectOptions) -> None:
        super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
        self._tts: TTS = tts
        # Work from a copy of the options as they are at construction time.
        self._opts = replace(tts._opts)

    async def _run(self, output_emitter: tts.AudioEmitter) -> None:
        """Initialize the emitter for raw PCM and run the synthesis request."""
        request_id = utils.shortuuid()
        output_emitter.initialize(
            request_id=request_id,
            sample_rate=DEFAULT_SAMPLE_RATE,
            num_channels=NUM_CHANNELS,
            mime_type="audio/pcm",
        )
        owner = self._tts
        await owner._stream_synthesis(
            self._input_text, self._opts, self._conn_options, output_emitter
        )
477
+
478
+
479
class SynthesizeStream(tts.SynthesizeStream):
    """Buffers the whole input and synthesizes it as one Rumik request (one segment)."""

    def __init__(self, *, tts: TTS, conn_options: APIConnectOptions) -> None:
        super().__init__(tts=tts, conn_options=conn_options)
        self._tts: TTS = tts
        # Work from a copy of the options as they are at construction time.
        self._opts = replace(tts._opts)

    async def _run(self, output_emitter: tts.AudioEmitter) -> None:
        """Drain the input channel, then synthesize the joined text in one request."""
        output_emitter.initialize(
            request_id=utils.shortuuid(),
            sample_rate=DEFAULT_SAMPLE_RATE,
            num_channels=NUM_CHANNELS,
            mime_type="audio/pcm",
            stream=True,
        )

        # The framework feeds one segment per stream, so the whole response is
        # synthesized in a single request once the input is drained. Only the
        # string items are kept; anything else on the channel is skipped.
        chunks = [item async for item in self._input_ch if isinstance(item, str)]
        text = "".join(chunks).strip()
        if not text:
            return

        output_emitter.start_segment(segment_id=utils.shortuuid())
        try:
            await self._tts._stream_synthesis(
                text,
                self._opts,
                self._conn_options,
                output_emitter,
                on_started=self._mark_started,
            )
        finally:
            # Close the segment even when synthesis fails part-way.
            output_emitter.end_segment()
518
+
519
+
520
def _validate_options(opts: _TTSOptions) -> None:
    """Reject option combinations the selected model does not accept.

    muga accepts ``tone`` (``None`` or a member of ``MUGA_TONES``) and rejects
    ``description``, ``speaker`` and ``f0_up_key``. mulberry rejects ``tone``,
    requires ``speaker`` (when given) to be in ``MULBERRY_SPEAKERS``, and
    requires ``f0_up_key`` (when given) to lie in [-12, 12].

    Raises:
        ValueError: on the first violated rule.
    """
    model = opts.model
    if model not in {"muga", "mulberry"}:
        raise ValueError("Rumik AI model must be 'muga' or 'mulberry'")

    if model != "muga":
        # mulberry rules
        if opts.tone is not None:
            raise ValueError("tone is only supported with Rumik AI Muga")
        if is_given(opts.speaker) and opts.speaker not in MULBERRY_SPEAKERS:
            raise ValueError(
                "Rumik AI Mulberry speaker must be speaker_1, speaker_2, speaker_3, or speaker_4"
            )
        if is_given(opts.f0_up_key) and not (-12 <= opts.f0_up_key <= 12):
            raise ValueError("Rumik AI Mulberry f0_up_key must be between -12 and 12")
        return

    # muga rules
    if opts.tone is not None and opts.tone not in MUGA_TONES:
        raise ValueError(f"Unsupported Rumik AI Muga tone: {opts.tone}")
    for mulberry_only in ("description", "speaker", "f0_up_key"):
        if is_given(getattr(opts, mulberry_only)):
            raise ValueError(f"{mulberry_only} is only supported with Rumik AI Mulberry")
543
+
544
+
545
def _prepare_text(text: str, opts: _TTSOptions) -> str:
    """Normalize ``text`` and apply the model-specific preparation.

    Args:
        text: Raw text to synthesize.
        opts: Options whose ``model`` selects the muga or mulberry preparation.

    Returns:
        The whitespace-normalized, model-prepared text.

    Raises:
        ValueError: the text is empty after normalization, the model-specific
            preparation rejects it, or the prepared text exceeds
            ``MAX_TEXT_LENGTH`` characters.
    """
    # Collapse every run of whitespace (including newlines from buffered LLM output)
    # to a single space: Rumik gets clean text, and muga's "exactly one space after
    # the [tone] tag" rule holds even when the model emits a newline after the tag.
    text = re.sub(r"\s+", " ", text).strip()
    if not text:
        raise ValueError("Rumik AI TTS text must not be empty")

    if opts.model == "muga":
        text = _prepare_muga_text(text, opts)
    elif opts.model == "mulberry":
        text = _prepare_mulberry_text(text)

    # Checked after preparation, so a tone tag added for muga counts toward the limit.
    if len(text) > MAX_TEXT_LENGTH:
        # Derive the message from the constant so the two cannot drift apart.
        raise ValueError(f"Rumik AI TTS text must be {MAX_TEXT_LENGTH} characters or fewer")

    return text
562
+
563
+
564
def _prepare_muga_text(text: str, opts: _TTSOptions) -> str:
    """Validate muga text and make sure it starts with exactly one tone tag.

    A tone tag already leading the text is validated and used. Otherwise the
    configured ``opts.tone`` is prepended as ``[tone] ``. Event tags are then
    checked against ``MUGA_EVENTS``, against the tone's entry in
    ``MUGA_EVENT_COMPATIBILITY``, and against the stacking limit.

    Raises:
        ValueError: on the first violated rule.
    """
    if _DEVANAGARI_RE.search(text):
        raise ValueError("Rumik AI Muga expects Hinglish in Roman script")

    bracket_tags = _SQUARE_TAG_RE.findall(text)
    if len(bracket_tags) > 1:
        raise ValueError("Rumik AI Muga text must contain exactly one global tone tag")

    tone = opts.tone
    prefix = _TONE_PREFIX_RE.match(text)
    if prefix is None:
        if bracket_tags:
            # There is a square tag, just not at the front.
            raise ValueError("Rumik AI Muga tone tag must be at the start of the text")
        if tone is None:
            raise ValueError(
                "Rumik AI Muga text must start with one global tone tag "
                "when no fallback tone is configured"
            )
        text = f"[{tone}] {text}"
    else:
        inline_tone, remainder = prefix.group(1), prefix.group(2)
        if inline_tone not in MUGA_TONES:
            raise ValueError(f"Unsupported Rumik AI Muga tone tag: [{inline_tone}]")
        if tone is not None and inline_tone != tone:
            raise ValueError("Rumik AI Muga text tone tag must match the configured tone")
        if not remainder.startswith(" "):
            raise ValueError("Rumik AI Muga tone tag must be followed by one space")
        tone = inline_tone

    assert tone is not None
    for event in _EVENT_TAG_RE.findall(text):
        if event not in MUGA_EVENTS:
            raise ValueError(f"Unsupported Rumik AI Muga event tag: <{event}>")
        if event not in MUGA_EVENT_COMPATIBILITY[tone]:
            raise ValueError(f"Rumik AI Muga event <{event}> is not compatible with [{tone}]")

    if _has_too_many_stacked_events(text):
        raise ValueError("Rumik AI Muga supports at most two stacked event tags")

    return text
606
+
607
+
608
def _prepare_mulberry_text(text: str) -> str:
    """Return ``text`` unchanged after rejecting muga-only markup.

    Raises:
        ValueError: the text starts with a muga tone tag or contains a muga
            event tag.
    """
    prefix = _TONE_PREFIX_RE.match(text)
    if prefix is not None and prefix.group(1) in MUGA_TONES:
        raise ValueError("Rumik AI Mulberry does not support Muga tone tags")

    if any(event in MUGA_EVENTS for event in _EVENT_TAG_RE.findall(text)):
        raise ValueError("Rumik AI Mulberry does not support Muga event tags")

    return text
619
+
620
+
621
def _has_too_many_stacked_events(text: str) -> bool:
    """Report whether more than two muga event tags appear back to back.

    Two tags are adjacent when only whitespace (or nothing) lies between
    them. A tag that is not in ``MUGA_EVENTS`` breaks the current run.
    """
    run_length = 0
    prev_end = -1
    for tag in _EVENT_TAG_RE.finditer(text):
        if tag.group(1) in MUGA_EVENTS:
            adjacent = prev_end >= 0 and text[prev_end : tag.start()].strip() == ""
            run_length = run_length + 1 if adjacent else 1
            if run_length > 2:
                return True
        else:
            run_length = 0
        prev_end = tag.end()

    return False
640
+
641
+
642
def _synthesis_payload(text: str, opts: _TTSOptions) -> dict[str, Any]:
    """Build the JSON payload for one synthesis request.

    The model is fixed when the session is minted, so it is not resent here.
    Only explicitly-set parameters are included; Rumik AI applies its own
    defaults for the rest.
    """
    optional_fields = (
        "description",
        "speaker",
        "f0_up_key",
        "temperature",
        "top_p",
        "top_k",
        "repetition_penalty",
        "max_new_tokens",
    )

    payload: dict[str, Any] = {"text": text}
    for field_name in optional_fields:
        value = getattr(opts, field_name)
        if is_given(value):
            payload[field_name] = value
    return payload
666
+
667
+
668
+ async def _read_response_body(resp: aiohttp.ClientResponse) -> object:
669
+ try:
670
+ return await resp.json()
671
+ except Exception:
672
+ return await resp.text()
673
+
674
+
675
+ def _loads_event(data: str) -> dict[str, Any]:
676
+ try:
677
+ event = json.loads(data)
678
+ except json.JSONDecodeError as e:
679
+ raise APIConnectionError("Rumik AI WebSocket returned invalid JSON") from e
680
+ if not isinstance(event, dict):
681
+ raise APIConnectionError("Rumik AI WebSocket returned an invalid event")
682
+ return event
683
+
684
+
685
def _provider_error(event: dict[str, Any]) -> APIStatusError:
    """Turn a provider error event into an ``APIStatusError``.

    The error is marked retryable only for status codes 408, 429, 499 and 503.
    The whole event is attached as the error body.
    """
    retryable_statuses = (408, 429, 499, 503)
    status_code = _event_status_code(event)
    detail = _error_message(event)
    return APIStatusError(
        message=f"Rumik AI TTS error: {detail}",
        status_code=status_code,
        body=event,
        retryable=status_code in retryable_statuses,
    )
694
+
695
+
696
+ def _error_message(body: object) -> str:
697
+ if isinstance(body, dict):
698
+ error = body.get("error")
699
+ code = body.get("code")
700
+ if error and code:
701
+ return f"{error} ({code})"
702
+ if error:
703
+ return str(error)
704
+ if isinstance(body, str) and body:
705
+ return body
706
+ return "Rumik AI request failed"
707
+
708
+
709
+ def _event_status_code(event: dict[str, Any]) -> int:
710
+ status = event.get("status") or event.get("status_code")
711
+ if isinstance(status, int):
712
+ return status
713
+ if isinstance(status, str) and status.isdecimal():
714
+ return int(status)
715
+ return -1
@@ -0,0 +1,15 @@
1
+ # Copyright 2026 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ __version__ = "0.1.1"
@@ -0,0 +1,68 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "livekit-plugins-rumik-ai"
7
+ dynamic = ["version"]
8
+ description = "LiveKit Agents plugin for text-to-speech with Rumik AI (muga & mulberry)."
9
+ readme = "README.md"
10
+ license = "Apache-2.0"
11
+ license-files = ["LICENSE"]
12
+ requires-python = ">=3.10.0"
13
+ authors = [{ name = "Rumik AI", email = "hello@rumik.ai" }]
14
+ keywords = [
15
+ "webrtc",
16
+ "realtime",
17
+ "audio",
18
+ "livekit",
19
+ "rumik-ai",
20
+ "tts",
21
+ "text-to-speech",
22
+ "hinglish",
23
+ ]
24
# The license is declared once, via the SPDX `license` expression in this table.
# PEP 639 deprecates repeating it as a "License ::" trove classifier, and build
# tools are allowed to reject metadata that carries both.
classifiers = [
    "Intended Audience :: Developers",
    "Topic :: Multimedia :: Sound/Audio",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3 :: Only",
]
35
+ dependencies = ["livekit-agents[codecs]>=1.5,<2"]
36
+
37
+ [project.optional-dependencies]
38
+ dev = ["pytest>=8", "pytest-asyncio>=0.23", "aiohttp", "ruff", "mypy"]
39
+
40
+ [project.urls]
41
+ Homepage = "https://rumik.ai/"
42
+ Source = "https://github.com/rumik-ai/livekit-plugins-rumik-ai"
43
+ Issues = "https://github.com/rumik-ai/livekit-plugins-rumik-ai/issues"
44
+ Documentation = "https://docs.livekit.io/agents/integrations/tts/"
45
+
46
+ [tool.hatch.version]
47
+ path = "livekit/plugins/rumik_ai/version.py"
48
+
49
+ [tool.hatch.build.targets.wheel]
50
+ packages = ["livekit"]
51
+
52
+ [tool.hatch.build.targets.sdist]
53
+ include = ["/livekit", "/README.md", "/LICENSE"]
54
+
55
+ [tool.pytest.ini_options]
56
+ asyncio_mode = "strict"
57
+ testpaths = ["tests"]
58
+
59
+ [tool.ruff]
60
+ line-length = 100
61
+ target-version = "py310"
62
+
63
+ [tool.mypy]
64
+ python_version = "3.10"
65
+ strict = true
66
+ # Match the livekit-agents monorepo config so this plugin type-checks identically.
67
+ disallow_any_generics = false
68
+ plugins = ["pydantic.mypy"]