livekit-plugins-sarvam 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,172 @@
1
+ **/.vscode
2
+ **/.DS_Store
3
+
4
+ # Byte-compiled / optimized / DLL files
5
+ __pycache__/
6
+ *.py[cod]
7
+ *$py.class
8
+
9
+ # C extensions
10
+ *.so
11
+
12
+ # Distribution / packaging
13
+ .Python
14
+ build/
15
+ develop-eggs/
16
+ dist/
17
+ downloads/
18
+ eggs/
19
+ .eggs/
20
+ lib/
21
+ lib64/
22
+ parts/
23
+ sdist/
24
+ var/
25
+ wheels/
26
+ share/python-wheels/
27
+ *.egg-info/
28
+ .installed.cfg
29
+ *.egg
30
+ MANIFEST
31
+
32
+ # PyInstaller
33
+ # Usually these files are written by a python script from a template
34
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
35
+ *.manifest
36
+ *.spec
37
+
38
+ # Installer logs
39
+ pip-log.txt
40
+ pip-delete-this-directory.txt
41
+
42
+ # Unit test / coverage reports
43
+ htmlcov/
44
+ .tox/
45
+ .nox/
46
+ .coverage
47
+ .coverage.*
48
+ .cache
49
+ nosetests.xml
50
+ coverage.xml
51
+ *.cover
52
+ *.py,cover
53
+ .hypothesis/
54
+ .pytest_cache/
55
+ cover/
56
+
57
+ # Translations
58
+ *.mo
59
+ *.pot
60
+
61
+ # Django stuff:
62
+ *.log
63
+ local_settings.py
64
+ db.sqlite3
65
+ db.sqlite3-journal
66
+
67
+ # Flask stuff:
68
+ instance/
69
+ .webassets-cache
70
+
71
+ # Scrapy stuff:
72
+ .scrapy
73
+
74
+ # Sphinx documentation
75
+ docs/_build/
76
+
77
+ # PyBuilder
78
+ .pybuilder/
79
+ target/
80
+
81
+ # Jupyter Notebook
82
+ .ipynb_checkpoints
83
+
84
+ # IPython
85
+ profile_default/
86
+ ipython_config.py
87
+
88
+ # pyenv
89
+ # For a library or package, you might want to ignore these files since the code is
90
+ # intended to run in multiple environments; otherwise, check them in:
91
+ # .python-version
92
+
93
+ # pipenv
94
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
95
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
96
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
97
+ # install all needed dependencies.
98
+ #Pipfile.lock
99
+
100
+ # poetry
101
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
102
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
103
+ # commonly ignored for libraries.
104
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
105
+ #poetry.lock
106
+
107
+ # pdm
108
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
109
+ #pdm.lock
110
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
111
+ # in version control.
112
+ # https://pdm.fming.dev/#use-with-ide
113
+ .pdm.toml
114
+
115
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
116
+ __pypackages__/
117
+
118
+ # Celery stuff
119
+ celerybeat-schedule
120
+ celerybeat.pid
121
+
122
+ # SageMath parsed files
123
+ *.sage.py
124
+
125
+ # Environments
126
+ .env
127
+ .venv
128
+ env/
129
+ venv/
130
+ ENV/
131
+ env.bak/
132
+ venv.bak/
133
+
134
+ # Spyder project settings
135
+ .spyderproject
136
+ .spyproject
137
+
138
+ # Rope project settings
139
+ .ropeproject
140
+
141
+ # mkdocs documentation
142
+ /site
143
+
144
+ # mypy
145
+ .mypy_cache/
146
+ .dmypy.json
147
+ dmypy.json
148
+
149
+ # trunk
150
+ .trunk/
151
+
152
+ # Pyre type checker
153
+ .pyre/
154
+
155
+ # pytype static type analyzer
156
+ .pytype/
157
+
158
+ # Cython debug symbols
159
+ cython_debug/
160
+
161
+ # PyCharm
162
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
163
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
164
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
165
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
166
+ .idea/
167
+
168
+ node_modules
169
+
170
+ credentials.json
171
+ pyrightconfig.json
172
+ docs/
@@ -0,0 +1,45 @@
1
+ Metadata-Version: 2.4
2
+ Name: livekit-plugins-sarvam
3
+ Version: 1.1.0
4
+ Summary: Agent Framework plugin for services using Sarvam.ai's API.
5
+ Project-URL: Documentation, https://docs.livekit.io
6
+ Project-URL: Website, https://livekit.io/
7
+ Project-URL: Source, https://github.com/livekit/agents
8
+ Author-email: LiveKit <hello@livekit.io>
9
+ License-Expression: Apache-2.0
10
+ Keywords: audio,indian-languages,indic,livekit,realtime,sarvam,stt,tts,video,webrtc
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: Apache Software License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3 :: Only
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Topic :: Multimedia :: Sound/Audio
18
+ Classifier: Topic :: Multimedia :: Video
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Requires-Python: >=3.9.0
21
+ Requires-Dist: livekit-agents[codecs]>=1.1.0
22
+ Requires-Dist: numpy>=1.26
23
+ Description-Content-Type: text/markdown
24
+
25
+ # Sarvam.ai Plugin for LiveKit Agents
26
+
27
+ This plugin provides integration with [Sarvam.ai](https://sarvam.ai) services for LiveKit Agents.
28
+
29
+ ## Features
30
+
31
+ - **Speech-to-Text (STT)**: Convert audio to text using Sarvam's "Saarika" models
32
+ - **Text-to-Speech (TTS)**: Convert text to audio using Sarvam's "Bulbul" models
33
+
34
+ Sarvam.ai specializes in high-quality STT and TTS for Indian languages, supporting multiple dialects and accents.
35
+
36
+ ## Requirements
37
+
38
+ - Sarvam.ai API key
39
+ - Python 3.9 or higher
40
+
41
+ ## Installation
42
+
43
+ ```bash
44
+ pip install livekit-plugins-sarvam
45
+ ```
@@ -0,0 +1,21 @@
1
+ # Sarvam.ai Plugin for LiveKit Agents
2
+
3
+ This plugin provides integration with [Sarvam.ai](https://sarvam.ai) services for LiveKit Agents.
4
+
5
+ ## Features
6
+
7
+ - **Speech-to-Text (STT)**: Convert audio to text using Sarvam's "Saarika" models
8
+ - **Text-to-Speech (TTS)**: Convert text to audio using Sarvam's "Bulbul" models
9
+
10
+ Sarvam.ai specializes in high-quality STT and TTS for Indian languages, supporting multiple dialects and accents.
11
+
12
+ ## Requirements
13
+
14
+ - Sarvam.ai API key
15
+ - Python 3.9 or higher
16
+
17
+ ## Installation
18
+
19
+ ```bash
20
+ pip install livekit-plugins-sarvam
21
+ ```
@@ -0,0 +1,50 @@
1
+ # Copyright 2025 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Sarvam.ai plugin for LiveKit Agents
16
+
17
+ Support for speech-to-text and text-to-speech with [Sarvam.ai](https://sarvam.ai/).
18
+
19
+ Sarvam.ai provides high-quality STT and TTS for Indian languages.
20
+
21
+ For API access, visit https://sarvam.ai/
22
+ """
23
+
24
+ from .stt import STT
25
+ from .tts import TTS
26
+ from .version import __version__
27
+
28
+ __all__ = ["STT", "TTS", "__version__"]
29
+
30
+
31
+ from livekit.agents import Plugin
32
+
33
+ from .log import logger
34
+
35
+
36
class SarvamPlugin(Plugin):
    """Plugin descriptor that announces the Sarvam.ai package to LiveKit Agents."""

    def __init__(self) -> None:
        # Identify the plugin by this module's name, version and package,
        # and hand over the package logger.
        plugin_title = __name__
        plugin_package = __package__
        super().__init__(plugin_title, __version__, plugin_package, logger)
39
+
40
+
41
# Register a plugin instance as a side effect of importing the package.
Plugin.register_plugin(SarvamPlugin())

# Cleanup docs of unexported modules: every module-level name that is not
# listed in __all__ is mapped to False in __pdoc__.
_module = dir()
NOT_IN_ALL = [name for name in _module if name not in __all__]

__pdoc__ = dict.fromkeys(NOT_IN_ALL, False)
@@ -0,0 +1,4 @@
1
+ # Simple logger for the plugin
2
+ import logging
3
+
4
+ logger = logging.getLogger(__name__)
@@ -0,0 +1,244 @@
1
+ # Copyright 2025 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Speech-to-Text implementation for Sarvam.ai
16
+
17
+ This module provides an STT implementation that uses the Sarvam.ai API.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import asyncio
23
+ import os
24
+ from dataclasses import dataclass
25
+ from typing import Literal
26
+
27
+ import aiohttp
28
+
29
+ from livekit import rtc
30
+ from livekit.agents import (
31
+ DEFAULT_API_CONNECT_OPTIONS,
32
+ APIConnectionError,
33
+ APIConnectOptions,
34
+ APIStatusError,
35
+ APITimeoutError,
36
+ stt,
37
+ utils,
38
+ )
39
+ from livekit.agents.types import NOT_GIVEN, NotGivenOr
40
+ from livekit.agents.utils import AudioBuffer
41
+
42
+ from .log import logger
43
+
44
+ # Sarvam API details
45
+ SARVAM_STT_BASE_URL = "https://api.sarvam.ai/speech-to-text"
46
+
47
+ # Supported Sarvam models
48
+ SarvamSTTModels = Literal["saarika:v1", "saarika:v2", "saarika:flash"]
49
+
50
+
51
@dataclass
class SarvamSTTOptions:
    """Settings bundle used for a Sarvam.ai speech-to-text request.

    Attributes:
        language: BCP-47 language code, e.g., "hi-IN", "en-IN".
        model: Sarvam STT model name; "saarika:v2" unless overridden.
        api_key: Sarvam.ai API key, or None when none was supplied.
        base_url: URL of the speech-to-text endpoint.
    """

    # The only field without a default, so it must always be supplied.
    language: str
    model: SarvamSTTModels | str = "saarika:v2"
    api_key: str | None = None
    base_url: str = SARVAM_STT_BASE_URL
66
+
67
+
68
class STT(stt.STT):
    """Sarvam.ai Speech-to-Text implementation.

    This class provides non-streaming speech-to-text functionality using the
    Sarvam.ai API. Sarvam.ai specializes in high-quality STT for Indian languages.

    Args:
        language: BCP-47 language code, e.g., "hi-IN", "en-IN"
        model: The Sarvam STT model to use
        api_key: Sarvam.ai API key (falls back to SARVAM_API_KEY env var)
        base_url: API endpoint URL
        http_session: Optional aiohttp session to use

    Raises:
        ValueError: If no API key is passed and SARVAM_API_KEY is unset or empty.
    """

    def __init__(
        self,
        *,
        language: str,
        model: SarvamSTTModels | str = "saarika:v2",
        api_key: str | None = None,
        base_url: str = SARVAM_STT_BASE_URL,
        http_session: aiohttp.ClientSession | None = None,
    ) -> None:
        super().__init__(capabilities=stt.STTCapabilities(streaming=False, interim_results=False))

        self._api_key = api_key or os.environ.get("SARVAM_API_KEY")
        if not self._api_key:
            raise ValueError(
                "Sarvam API key is required. "
                "Provide it directly or set SARVAM_API_KEY environment variable."
            )

        self._opts = SarvamSTTOptions(
            language=language,
            model=model,
            api_key=self._api_key,
            base_url=base_url,
        )
        self._session = http_session
        self._logger = logger.getChild(self.__class__.__name__)

    def _ensure_session(self) -> aiohttp.ClientSession:
        """Return the caller-supplied session, or lazily take the shared one."""
        if not self._session:
            self._session = utils.http_context.http_session()
        return self._session

    @staticmethod
    def _frame_duration_seconds(frame: rtc.AudioFrame) -> float:
        """Return the frame's length in seconds, or 0.0 if its sample rate is unusable."""
        if frame.sample_rate <= 0:
            return 0.0
        return frame.samples_per_channel / frame.sample_rate

    async def _recognize_impl(
        self,
        buffer: AudioBuffer,
        *,
        language: NotGivenOr[str] = NOT_GIVEN,
        model: NotGivenOr[SarvamSTTModels | str] = NOT_GIVEN,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    ) -> stt.SpeechEvent:
        """Recognize speech using Sarvam.ai API.

        Args:
            buffer: Audio buffer containing speech data
            language: BCP-47 language code (overrides the one set in constructor)
            model: Sarvam model to use (overrides the one set in constructor)
            conn_options: Connection options for API requests

        Returns:
            A SpeechEvent containing the transcription result

        Raises:
            APIStatusError: On API errors (non-200 status)
            APITimeoutError: On API timeout
            APIConnectionError: On network connection errors and on any other
                unexpected failure while handling the response
        """
        opts_language = self._opts.language if isinstance(language, type(NOT_GIVEN)) else language
        opts_model = self._opts.model if isinstance(model, type(NOT_GIVEN)) else model

        # Merge the buffer once: the combined frame is both the upload payload
        # and the source for the fallback duration estimate below.
        combined_frame = rtc.combine_audio_frames(buffer)
        wav_bytes = combined_frame.to_wav_bytes()

        form_data = aiohttp.FormData()
        form_data.add_field("file", wav_bytes, filename="audio.wav", content_type="audio/wav")

        # Add model and language_code to the form data if specified
        # Sarvam API docs state language_code is optional for saarika:v2 but mandatory for v1
        # Model is also optional, defaults to saarika:v2
        if opts_language:
            form_data.add_field("language_code", opts_language)
        if opts_model:
            form_data.add_field("model", str(opts_model))

        headers = {"api-subscription-key": self._opts.api_key}

        try:
            async with self._ensure_session().post(
                url=self._opts.base_url,
                data=form_data,
                headers=headers,
                timeout=aiohttp.ClientTimeout(
                    total=conn_options.timeout,
                    sock_connect=conn_options.timeout,
                ),
            ) as res:
                if res.status != 200:
                    error_text = await res.text()
                    self._logger.error(f"Sarvam API error: {res.status} - {error_text}")
                    raise APIStatusError(
                        message=f"Sarvam API Error: {error_text}",
                        status_code=res.status,
                    )

                response_json = await res.json()
                self._logger.debug(f"Sarvam API response: {response_json}")

                transcript_text = response_json.get("transcript", "")
                request_id = response_json.get("request_id", "")
                detected_language = response_json.get("language_code")
                if not isinstance(detected_language, str):
                    # Fall back to the requested language when none is reported
                    detected_language = opts_language or ""

                start_time = 0.0
                end_time = 0.0

                # Try to get timestamps if available
                timestamps_data = response_json.get("timestamps")
                if timestamps_data and isinstance(timestamps_data, dict):
                    words_ts_start = timestamps_data.get("start_time_seconds")
                    words_ts_end = timestamps_data.get("end_time_seconds")
                    if isinstance(words_ts_start, list) and len(words_ts_start) > 0:
                        start_time = words_ts_start[0]
                    if isinstance(words_ts_end, list) and len(words_ts_end) > 0:
                        end_time = words_ts_end[-1]

                # If start/end times are still 0, use the audio length as an
                # estimate for end_time. It is derived from sample count and
                # sample rate so the result is always in seconds.
                if start_time == 0.0 and end_time == 0.0:
                    end_time = self._frame_duration_seconds(combined_frame)

                alternatives = [
                    stt.SpeechData(
                        language=detected_language,
                        text=transcript_text,
                        start_time=start_time,
                        end_time=end_time,
                        confidence=1.0,  # Sarvam doesn't provide confidence score in this response
                    )
                ]

                return stt.SpeechEvent(
                    type=stt.SpeechEventType.FINAL_TRANSCRIPT,
                    request_id=request_id,
                    alternatives=alternatives,
                )

        except APIStatusError:
            # Already logged above; re-raise unchanged so the generic handler
            # below does not relabel it as a connection error and drop the
            # HTTP status code.
            raise
        except asyncio.TimeoutError as e:
            self._logger.error(f"Sarvam API timeout: {e}")
            raise APITimeoutError("Sarvam API request timed out") from e
        except aiohttp.ClientError as e:
            self._logger.error(f"Sarvam API client error: {e}")
            raise APIConnectionError(f"Sarvam API connection error: {e}") from e
        except Exception as e:
            self._logger.error(f"Error during Sarvam STT processing: {e}")
            raise APIConnectionError(f"Unexpected error in Sarvam STT: {e}") from e
@@ -0,0 +1,274 @@
1
+ # Copyright 2025 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Text-to-Speech implementation for Sarvam.ai
16
+
17
+ This module provides a TTS implementation that uses the Sarvam.ai API.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import asyncio
23
+ import base64
24
+ import os
25
+ from dataclasses import dataclass, replace
26
+ from typing import Literal
27
+
28
+ import aiohttp
29
+
30
+ from livekit.agents import (
31
+ DEFAULT_API_CONNECT_OPTIONS,
32
+ APIConnectionError,
33
+ APIConnectOptions,
34
+ APIStatusError,
35
+ APITimeoutError,
36
+ tts,
37
+ utils,
38
+ )
39
+
40
+ from .log import logger
41
+
42
+ SARVAM_TTS_BASE_URL = "https://api.sarvam.ai/text-to-speech"
43
+
44
+ # Sarvam TTS specific models and speakers
45
+ SarvamTTSModels = Literal["bulbul:v2"]
46
+
47
+ # Supported languages in BCP-47 format
48
+ SarvamTTSLanguages = Literal[
49
+ "bn-IN", # Bengali
50
+ "en-IN", # English (India)
51
+ "gu-IN", # Gujarati
52
+ "hi-IN", # Hindi
53
+ "kn-IN", # Kannada
54
+ "ml-IN", # Malayalam
55
+ "mr-IN", # Marathi
56
+ "od-IN", # Odia
57
+ "pa-IN", # Punjabi
58
+ "ta-IN", # Tamil
59
+ "te-IN", # Telugu
60
+ ]
61
+
62
+ SarvamTTSSpeakers = Literal[
63
+ # bulbul:v2 Female (lowercase)
64
+ "anushka",
65
+ "manisha",
66
+ "vidya",
67
+ "arya",
68
+ # bulbul:v2 Male (lowercase)
69
+ "abhilash",
70
+ "karun",
71
+ "hitesh",
72
+ ]
73
+
74
+ # Model-Speaker compatibility mapping
75
+ MODEL_SPEAKER_COMPATIBILITY = {
76
+ "bulbul:v2": {
77
+ "female": ["anushka", "manisha", "vidya", "arya"],
78
+ "male": ["abhilash", "karun", "hitesh"],
79
+ "all": ["anushka", "manisha", "vidya", "arya", "abhilash", "karun", "hitesh"],
80
+ },
81
+ }
82
+
83
+
84
def validate_model_speaker_compatibility(model: str, speaker: str) -> bool:
    """Check a speaker name against the voices listed for a model.

    Args:
        model: Model identifier looked up in MODEL_SPEAKER_COMPATIBILITY.
        speaker: Speaker name; compared case-insensitively.

    Returns:
        True when the speaker is listed for the model, or when the model is
        unknown (a warning is logged and the check is skipped). False when the
        model is known but the speaker is not listed (an error is logged).
    """
    speaker_table = MODEL_SPEAKER_COMPATIBILITY.get(model)
    if speaker_table is None:
        logger.warning(f"Unknown model '{model}', skipping compatibility check")
        return True

    compatible_speakers = speaker_table["all"]
    if speaker.lower() in compatible_speakers:
        return True

    logger.error(
        f"Speaker '{speaker}' is not compatible with model '{model}'. "
        f"Compatible speakers for {model}: {', '.join(compatible_speakers)}"
    )
    return False
98
+
99
+
100
@dataclass
class SarvamTTSOptions:
    """Settings bundle used for a Sarvam.ai text-to-speech request.

    Attributes:
        target_language_code: BCP-47 language code for supported Indian languages.
        api_key: Sarvam.ai API key.
        text: The text to synthesize (will be provided by stream adapter).
        speaker: Voice to use for synthesis.
        pitch: Voice pitch adjustment (-20.0 to 20.0).
        pace: Speech rate multiplier (0.5 to 2.0).
        loudness: Volume multiplier (0.5 to 2.0).
        speech_sample_rate: Audio sample rate (8000, 16000, 22050, or 24000).
        enable_preprocessing: Whether to use text preprocessing.
        model: The Sarvam TTS model to use.
        base_url: API endpoint URL.
    """

    # Required fields come first; everything below has a default.
    target_language_code: SarvamTTSLanguages | str
    api_key: str
    text: str | None = None
    speaker: SarvamTTSSpeakers | str = "manisha"  # a bulbul:v2 voice
    pitch: float = 0.0
    pace: float = 1.0
    loudness: float = 1.0
    speech_sample_rate: int = 22050  # Hz
    enable_preprocessing: bool = False
    model: SarvamTTSModels | str = "bulbul:v2"
    base_url: str = SARVAM_TTS_BASE_URL
129
+
130
+
131
class TTS(tts.TTS):
    """Sarvam.ai Text-to-Speech implementation.

    This class provides non-streaming text-to-speech functionality using the
    Sarvam.ai API. Sarvam.ai specializes in high-quality TTS for Indian languages.

    Args:
        target_language_code: BCP-47 language code for supported Indian languages
        model: Sarvam TTS model to use (only bulbul:v2 supported)
        speaker: Voice to use for synthesis
        speech_sample_rate: Audio sample rate in Hz
        num_channels: Number of audio channels (Sarvam outputs mono)
        pitch: Voice pitch adjustment (-20.0 to 20.0)
        pace: Speech rate multiplier (0.5 to 2.0)
        loudness: Volume multiplier (0.5 to 2.0)
        enable_preprocessing: Whether to use text preprocessing
        api_key: Sarvam.ai API key (falls back to SARVAM_API_KEY env var)
        base_url: API endpoint URL
        http_session: Optional aiohttp session to use

    Raises:
        ValueError: If no API key is passed and SARVAM_API_KEY is unset or
            empty, or if the speaker is not listed for a known model.
    """

    def __init__(
        self,
        *,
        target_language_code: SarvamTTSLanguages | str,
        model: SarvamTTSModels | str = "bulbul:v2",
        speaker: SarvamTTSSpeakers | str = "manisha",
        speech_sample_rate: int = 22050,
        num_channels: int = 1,  # Sarvam output is mono WAV
        pitch: float = 0.0,
        pace: float = 1.0,
        loudness: float = 1.0,
        enable_preprocessing: bool = False,
        # Optional so that the SARVAM_API_KEY fallback below is actually reachable.
        api_key: str | None = None,
        base_url: str = SARVAM_TTS_BASE_URL,
        http_session: aiohttp.ClientSession | None = None,
    ) -> None:
        super().__init__(
            capabilities=tts.TTSCapabilities(streaming=False),
            sample_rate=speech_sample_rate,
            num_channels=num_channels,
        )

        self._api_key = api_key or os.environ.get("SARVAM_API_KEY")
        if not self._api_key:
            raise ValueError(
                "Sarvam API key is required. Provide it directly or set SARVAM_API_KEY env var."
            )

        # Validate model-speaker compatibility
        if not validate_model_speaker_compatibility(model, speaker):
            compatible_speakers = MODEL_SPEAKER_COMPATIBILITY.get(model, {}).get("all", [])
            raise ValueError(
                f"Speaker '{speaker}' is not compatible with model '{model}'. "
                f"Please choose a compatible speaker from: {', '.join(compatible_speakers)}"
            )

        self._opts = SarvamTTSOptions(
            target_language_code=target_language_code,
            model=model,
            speaker=speaker,
            speech_sample_rate=speech_sample_rate,
            pitch=pitch,
            pace=pace,
            loudness=loudness,
            enable_preprocessing=enable_preprocessing,
            api_key=self._api_key,
            base_url=base_url,
        )
        self._session = http_session

    def _ensure_session(self) -> aiohttp.ClientSession:
        """Return the caller-supplied session, or lazily take the shared one."""
        if not self._session:
            self._session = utils.http_context.http_session()
        return self._session

    # Implement the abstract synthesize method
    def synthesize(
        self, text: str, *, conn_options: APIConnectOptions | None = None
    ) -> ChunkedStream:
        """Synthesize text to audio using Sarvam.ai TTS API.

        Args:
            text: The text to synthesize.
            conn_options: Connection options for the request;
                DEFAULT_API_CONNECT_OPTIONS is used when None.

        Returns:
            A ChunkedStream bound to this TTS instance and the given text.
        """
        if conn_options is None:
            conn_options = DEFAULT_API_CONNECT_OPTIONS
        return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)
215
+
216
+
217
class ChunkedStream(tts.ChunkedStream):
    """Single-request synthesis stream for the Sarvam.ai TTS endpoint.

    Args:
        tts: The TTS instance whose options and HTTP session are used.
        input_text: The text to synthesize.
        conn_options: Connection options; its timeout bounds the request.
    """

    def __init__(self, *, tts: TTS, input_text: str, conn_options: APIConnectOptions) -> None:
        super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
        self._tts: TTS = tts
        # Work on a copy of the TTS options taken when the stream is created.
        self._opts = replace(tts._opts)

    async def _run(self, output_emitter: tts.AudioEmitter) -> None:
        """Run the Sarvam.ai TTS request and emit audio via the output emitter.

        Raises:
            APIStatusError: On a non-200 response.
            APITimeoutError: When the request times out.
            APIConnectionError: On aiohttp client errors, or when the response
                carries no audio or audio that cannot be base64-decoded.
        """
        payload = {
            "target_language_code": self._opts.target_language_code,
            "text": self._input_text,
            "speaker": self._opts.speaker,
            "pitch": self._opts.pitch,
            "pace": self._opts.pace,
            "loudness": self._opts.loudness,
            "speech_sample_rate": self._opts.speech_sample_rate,
            "enable_preprocessing": self._opts.enable_preprocessing,
            "model": self._opts.model,
        }
        headers = {
            "api-subscription-key": self._opts.api_key,
            "Content-Type": "application/json",
        }
        try:
            async with self._tts._ensure_session().post(
                url=self._opts.base_url,
                json=payload,
                headers=headers,
                timeout=aiohttp.ClientTimeout(
                    total=self._conn_options.timeout,
                    sock_connect=self._conn_options.timeout,
                ),
            ) as res:
                if res.status != 200:
                    error_text = await res.text()
                    logger.error(f"Sarvam TTS API error: {res.status} - {error_text}")
                    raise APIStatusError(
                        message=f"Sarvam TTS API Error: {error_text}", status_code=res.status
                    )

                response_json = await res.json()
                request_id = response_json.get("request_id", "")
                audios = response_json.get("audios", [])
                if not audios or not isinstance(audios, list):
                    raise APIConnectionError("Sarvam TTS API response invalid: no audio data")

                # Only the first entry of "audios" is used. A malformed entry
                # is reported as an invalid response rather than leaking a raw
                # ValueError/TypeError (binascii.Error is a ValueError).
                try:
                    wav_bytes = base64.b64decode(audios[0])
                except (TypeError, ValueError) as e:
                    raise APIConnectionError(
                        "Sarvam TTS API response invalid: audio data is not valid base64"
                    ) from e

                output_emitter.initialize(
                    request_id=request_id or "unknown",
                    sample_rate=self._tts.sample_rate,
                    num_channels=self._tts.num_channels,
                    mime_type="audio/wav",
                )
                output_emitter.push(wav_bytes)
        except asyncio.TimeoutError as e:
            raise APITimeoutError("Sarvam TTS API request timed out") from e
        except aiohttp.ClientError as e:
            raise APIConnectionError(f"Sarvam TTS API connection error: {e}") from e
@@ -0,0 +1,15 @@
1
+ # Copyright 2025 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ __version__ = "1.1.0"
@@ -0,0 +1,39 @@
1
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "livekit-plugins-sarvam"
# The version is not written here; hatch reads it from the path under [tool.hatch.version].
dynamic = ["version"]
description = "Agent Framework plugin for services using Sarvam.ai's API."
readme = "README.md"
license = "Apache-2.0"
requires-python = ">=3.9.0"
authors = [{ name = "LiveKit", email = "hello@livekit.io" }]
keywords = [
    "webrtc",
    "realtime",
    "audio",
    "video",
    "livekit",
    "sarvam",
    "indian-languages",
    "indic",
    "tts",
    "stt",
]
classifiers = [
    "Intended Audience :: Developers",
    "License :: OSI Approved :: Apache Software License",
    "Topic :: Multimedia :: Sound/Audio",
    "Topic :: Multimedia :: Video",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3 :: Only",
]
dependencies = [
    "livekit-agents[codecs]>=1.1.0",
    "numpy>=1.26",
]

[project.urls]
Documentation = "https://docs.livekit.io"
Website = "https://livekit.io/"
Source = "https://github.com/livekit/agents"

[tool.hatch.version]
path = "livekit/plugins/sarvam/version.py"

[tool.hatch.build.targets.wheel]
packages = ["livekit"]

[tool.hatch.build.targets.sdist]
include = ["/livekit"]