livekit-plugins-gnani 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,36 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *$py.class
4
+ *.so
5
+
6
+ *.egg-info/
7
+ dist/
8
+ build/
9
+ .eggs/
10
+ *.egg
11
+
12
+ .venv/
13
+ venv/
14
+ env/
15
+
16
+ .env
17
+ .env.*
18
+ *.ini
19
+ *.cfg
20
+ !pyproject.toml
21
+
22
+ *.log
23
+ .mypy_cache/
24
+ .pytest_cache/
25
+ .ruff_cache/
26
+ .tox/
27
+ .coverage
28
+ htmlcov/
29
+
30
+ .idea/
31
+ .vscode/
32
+ *.swp
33
+ *.swo
34
+ *~
35
+ .DS_Store
36
+ Thumbs.db
@@ -0,0 +1,17 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ Licensed under the Apache License, Version 2.0 (the "License");
6
+ you may not use this file except in compliance with the License.
7
+ You may obtain a copy of the License at
8
+
9
+ http://www.apache.org/licenses/LICENSE-2.0
10
+
11
+ Unless required by applicable law or agreed to in writing, software
12
+ distributed under the License is distributed on an "AS IS" BASIS,
13
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ See the License for the specific language governing permissions and
15
+ limitations under the License.
16
+
17
+ Copyright 2025-2026 Gnani.ai
@@ -0,0 +1,148 @@
1
+ Metadata-Version: 2.4
2
+ Name: livekit-plugins-gnani
3
+ Version: 0.1.0
4
+ Summary: LiveKit Agents plugin for Gnani Vachana speech AI — STT & TTS for Indian languages
5
+ Project-URL: Homepage, https://gnani.ai
6
+ Project-URL: Documentation, https://docs.inya.ai/vachana
7
+ Project-URL: Repository, https://github.com/Gnani-AI-Mintlify/livekit-plugins-gnani
8
+ Project-URL: Issues, https://github.com/Gnani-AI-Mintlify/livekit-plugins-gnani/issues
9
+ Author-email: Genvoice <speechstack@gnani.ai>
10
+ License-Expression: Apache-2.0
11
+ License-File: LICENSE
12
+ Keywords: audio,gnani,indian-languages,indic,livekit,livekit-agents,multilingual,realtime,speech-to-text,streaming,stt,text-to-speech,tts,vachana,webrtc,websocket
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: Apache Software License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3 :: Only
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Programming Language :: Python :: 3.13
23
+ Classifier: Topic :: Multimedia :: Sound/Audio
24
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
25
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
26
+ Requires-Python: >=3.10
27
+ Requires-Dist: gnani-vachana<1.0,>=0.2.2
28
+ Requires-Dist: livekit-agents[codecs]>=1.5.8
29
+ Requires-Dist: websockets<16.0,>=13.1
30
+ Description-Content-Type: text/markdown
31
+
32
+ # livekit-plugins-gnani
33
+
34
+ [![PyPI](https://img.shields.io/pypi/v/livekit-plugins-gnani)](https://pypi.org/project/livekit-plugins-gnani/)
35
+ [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE)
36
+
37
+ [LiveKit Agents](https://github.com/livekit/agents) plugin for **[Gnani Vachana](https://gnani.ai/)** — high-accuracy Speech-to-Text and low-latency Text-to-Speech for Indian languages.
38
+
39
+ > **Vachana** is a production-ready speech AI platform by [Gnani.ai](https://gnani.ai) supporting 10+ Indian languages with real-time streaming, multilingual transcription, and code-switching capabilities.
40
+
41
+ ## Installation
42
+
43
+ ```bash
44
+ pip install livekit-plugins-gnani
45
+ ```
46
+
47
+ This will also install the [`gnani-vachana`](https://pypi.org/project/gnani-vachana/) core SDK as a dependency.
48
+
49
+ ## Prerequisites
50
+
51
+ You need a Gnani API key. Email **[speechstack@gnani.ai](mailto:speechstack@gnani.ai)** to get started — all new accounts receive free credits, no credit card required.
52
+
53
+ Set your credentials as environment variables:
54
+
55
+ ```bash
56
+ export GNANI_API_KEY="your-api-key"
57
+
58
+ # For REST STT only (optional):
59
+ export GNANI_ORGANIZATION_ID="your-org-id"
60
+ export GNANI_USER_ID="your-user-id"
61
+ ```
62
+
63
+ ## Quick Start
64
+
65
+ ### Speech-to-Text
66
+
67
+ ```python
68
+ from livekit.plugins.gnani import STT
69
+
70
+ stt = STT(language="hi-IN")
71
+
72
+ # Use with a LiveKit voice agent pipeline
73
+ ```
74
+
75
+ ### Text-to-Speech
76
+
77
+ ```python
78
+ from livekit.plugins.gnani import TTS
79
+
80
+ tts = TTS(voice="sia")
81
+
82
+ # Use with a LiveKit voice agent pipeline
83
+ ```
84
+
85
+ ## Features
86
+
87
+ ### STT
88
+
89
+ - **Batch recognition** — REST API (`POST /stt/v3`) for file-based transcription
90
+ - **Real-time streaming** — WebSocket API for live audio transcription with VAD
91
+ - **10 Indian languages** — bn-IN, en-IN, gu-IN, hi-IN, kn-IN, ml-IN, mr-IN, pa-IN, ta-IN, te-IN
92
+ - **Code-switching** — Hinglish (en-hi-IN-latn) and Hindi-English mixed (en-hi-in-cm) for streaming
93
+ - **Sample rates** — 8 kHz and 16 kHz
94
+
95
+ ### TTS
96
+
97
+ - **Chunked synthesis** — REST API for single-request audio generation
98
+ - **Real-time streaming** — WebSocket API for low-latency streaming synthesis
99
+ - **8 voices** — sia, raju, kanika, nikita, ravan, simran, karan, neha
100
+ - **Configurable output** — sample rate (8000–44100), encoding (linear_pcm, oggopus), container (raw, mp3, wav, mulaw, ogg)
101
+
102
+ ## Supported Languages
103
+
104
+ | Language | Code |
105
+ |-----------------|---------|
106
+ | Bengali | `bn-IN` |
107
+ | English (India) | `en-IN` |
108
+ | Gujarati | `gu-IN` |
109
+ | Hindi | `hi-IN` |
110
+ | Kannada | `kn-IN` |
111
+ | Malayalam | `ml-IN` |
112
+ | Marathi | `mr-IN` |
113
+ | Punjabi | `pa-IN` |
114
+ | Tamil | `ta-IN` |
115
+ | Telugu | `te-IN` |
116
+
117
+ ## Available Voices
118
+
119
+ | Voice | ID |
120
+ |---------|-----------|
121
+ | Sia | `sia` |
122
+ | Raju | `raju` |
123
+ | Kanika | `kanika` |
124
+ | Nikita | `nikita` |
125
+ | Ravan | `ravan` |
126
+ | Simran | `simran` |
127
+ | Karan | `karan` |
128
+ | Neha | `neha` |
129
+
130
+ ## Architecture
131
+
132
+ ```
133
+ gnani-vachana ← Core SDK (REST, WebSocket, SSE clients)
134
+
135
+ livekit-plugins-gnani ← This package (LiveKit Agents adapter)
136
+ ```
137
+
138
+ This plugin is a thin adapter that wraps the `gnani-vachana` SDK into LiveKit's `stt.STT` and `tts.TTS` base classes. All connection logic, authentication, and audio format handling live in the core SDK.
139
+
140
+ ## Documentation
141
+
142
+ - [Vachana API Docs](https://docs.inya.ai/vachana/introduction/introduction)
143
+ - [LiveKit Agents Docs](https://docs.livekit.io/agents/)
144
+ - [gnani-vachana SDK](https://pypi.org/project/gnani-vachana/)
145
+
146
+ ## License
147
+
148
+ Apache 2.0 — see [LICENSE](LICENSE).
@@ -0,0 +1,117 @@
1
+ # livekit-plugins-gnani
2
+
3
+ [![PyPI](https://img.shields.io/pypi/v/livekit-plugins-gnani)](https://pypi.org/project/livekit-plugins-gnani/)
4
+ [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE)
5
+
6
+ [LiveKit Agents](https://github.com/livekit/agents) plugin for **[Gnani Vachana](https://gnani.ai/)** — high-accuracy Speech-to-Text and low-latency Text-to-Speech for Indian languages.
7
+
8
+ > **Vachana** is a production-ready speech AI platform by [Gnani.ai](https://gnani.ai) supporting 10+ Indian languages with real-time streaming, multilingual transcription, and code-switching capabilities.
9
+
10
+ ## Installation
11
+
12
+ ```bash
13
+ pip install livekit-plugins-gnani
14
+ ```
15
+
16
+ This will also install the [`gnani-vachana`](https://pypi.org/project/gnani-vachana/) core SDK as a dependency.
17
+
18
+ ## Prerequisites
19
+
20
+ You need a Gnani API key. Email **[speechstack@gnani.ai](mailto:speechstack@gnani.ai)** to get started — all new accounts receive free credits, no credit card required.
21
+
22
+ Set your credentials as environment variables:
23
+
24
+ ```bash
25
+ export GNANI_API_KEY="your-api-key"
26
+
27
+ # For REST STT only (optional):
28
+ export GNANI_ORGANIZATION_ID="your-org-id"
29
+ export GNANI_USER_ID="your-user-id"
30
+ ```
31
+
32
+ ## Quick Start
33
+
34
+ ### Speech-to-Text
35
+
36
+ ```python
37
+ from livekit.plugins.gnani import STT
38
+
39
+ stt = STT(language="hi-IN")
40
+
41
+ # Use with a LiveKit voice agent pipeline
42
+ ```
43
+
44
+ ### Text-to-Speech
45
+
46
+ ```python
47
+ from livekit.plugins.gnani import TTS
48
+
49
+ tts = TTS(voice="sia")
50
+
51
+ # Use with a LiveKit voice agent pipeline
52
+ ```
53
+
54
+ ## Features
55
+
56
+ ### STT
57
+
58
+ - **Batch recognition** — REST API (`POST /stt/v3`) for file-based transcription
59
+ - **Real-time streaming** — WebSocket API for live audio transcription with VAD
60
+ - **10 Indian languages** — bn-IN, en-IN, gu-IN, hi-IN, kn-IN, ml-IN, mr-IN, pa-IN, ta-IN, te-IN
61
+ - **Code-switching** — Hinglish (en-hi-IN-latn) and Hindi-English mixed (en-hi-in-cm) for streaming
62
+ - **Sample rates** — 8 kHz and 16 kHz
63
+
64
+ ### TTS
65
+
66
+ - **Chunked synthesis** — REST API for single-request audio generation
67
+ - **Real-time streaming** — WebSocket API for low-latency streaming synthesis
68
+ - **8 voices** — sia, raju, kanika, nikita, ravan, simran, karan, neha
69
+ - **Configurable output** — sample rate (8000–44100), encoding (linear_pcm, oggopus), container (raw, mp3, wav, mulaw, ogg)
70
+
71
+ ## Supported Languages
72
+
73
+ | Language | Code |
74
+ |-----------------|---------|
75
+ | Bengali | `bn-IN` |
76
+ | English (India) | `en-IN` |
77
+ | Gujarati | `gu-IN` |
78
+ | Hindi | `hi-IN` |
79
+ | Kannada | `kn-IN` |
80
+ | Malayalam | `ml-IN` |
81
+ | Marathi | `mr-IN` |
82
+ | Punjabi | `pa-IN` |
83
+ | Tamil | `ta-IN` |
84
+ | Telugu | `te-IN` |
85
+
86
+ ## Available Voices
87
+
88
+ | Voice | ID |
89
+ |---------|-----------|
90
+ | Sia | `sia` |
91
+ | Raju | `raju` |
92
+ | Kanika | `kanika` |
93
+ | Nikita | `nikita` |
94
+ | Ravan | `ravan` |
95
+ | Simran | `simran` |
96
+ | Karan | `karan` |
97
+ | Neha | `neha` |
98
+
99
+ ## Architecture
100
+
101
+ ```
102
+ gnani-vachana ← Core SDK (REST, WebSocket, SSE clients)
103
+
104
+ livekit-plugins-gnani ← This package (LiveKit Agents adapter)
105
+ ```
106
+
107
+ This plugin is a thin adapter that wraps the `gnani-vachana` SDK into LiveKit's `stt.STT` and `tts.TTS` base classes. All connection logic, authentication, and audio format handling live in the core SDK.
108
+
109
+ ## Documentation
110
+
111
+ - [Vachana API Docs](https://docs.inya.ai/vachana/introduction/introduction)
112
+ - [LiveKit Agents Docs](https://docs.livekit.io/agents/)
113
+ - [gnani-vachana SDK](https://pypi.org/project/gnani-vachana/)
114
+
115
+ ## License
116
+
117
+ Apache 2.0 — see [LICENSE](LICENSE).
File without changes
@@ -0,0 +1,36 @@
1
+ """Gnani Vachana plugin for LiveKit Agents
2
+
3
+ Support for speech-to-text and text-to-speech with [Gnani's Vachana platform](https://gnani.ai/).
4
+
5
+ Vachana provides high-accuracy STT and low-latency TTS for Indian languages,
6
+ including multilingual and code-switching scenarios.
7
+
8
+ For API access, email speechstack@gnani.ai
9
+ """
10
+
11
+ from .stt import STT
12
+ from .tts import TTS
13
+ from .version import __version__
14
+
15
+ __all__ = ["STT", "TTS", "__version__"]
16
+
17
+
18
+ from livekit.agents import Plugin
19
+
20
+ from .log import logger
21
+
22
+
23
class GnaniPlugin(Plugin):
    """LiveKit Agents plugin descriptor for the Gnani Vachana integration."""

    def __init__(self) -> None:
        # Hand the base class this module's name, version, package and logger.
        super().__init__(__name__, __version__, __package__, logger)
26
+
27
+
28
+ Plugin.register_plugin(GnaniPlugin())
29
+
30
# Collect every module-level name that is not part of the public API ...
_module = dir()
NOT_IN_ALL = [m for m in _module if m not in __all__]

# ... and map each of them to False in __pdoc__ (the pdoc convention for
# excluding a name from generated documentation).
__pdoc__ = {}

for n in NOT_IN_ALL:
    __pdoc__[n] = False
@@ -0,0 +1,3 @@
1
+ import logging
2
+
3
+ logger = logging.getLogger(__name__)
@@ -0,0 +1,410 @@
1
+ """Speech-to-Text implementation for Gnani Vachana
2
+
3
+ This module provides an STT implementation that uses the Gnani Vachana API,
4
+ supporting both batch recognition (REST) and real-time streaming (WebSocket).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import asyncio
10
+ import json
11
+ import os
12
+ from dataclasses import dataclass
13
+ from typing import Literal
14
+
15
+ from livekit import rtc
16
+ from livekit.agents import (
17
+ DEFAULT_API_CONNECT_OPTIONS,
18
+ APIConnectionError,
19
+ APIConnectOptions,
20
+ APIStatusError,
21
+ APITimeoutError,
22
+ stt,
23
+ utils,
24
+ )
25
+ from livekit.agents.types import NOT_GIVEN, NotGivenOr
26
+ from livekit.agents.utils import AudioBuffer
27
+ from livekit.agents.utils.misc import is_given
28
+
29
+ from .log import logger
30
+
31
+ GNANI_STT_BASE_URL = "https://api.vachana.ai"
32
+
33
+ GnaniSTTLanguages = Literal[
34
+ "bn-IN", "en-IN", "gu-IN", "hi-IN", "kn-IN",
35
+ "ml-IN", "mr-IN", "pa-IN", "ta-IN", "te-IN",
36
+ ]
37
+
38
+ SUPPORTED_LANGUAGES: set[str] = {
39
+ "bn-IN", "en-IN", "gu-IN", "hi-IN", "kn-IN",
40
+ "ml-IN", "mr-IN", "pa-IN", "ta-IN", "te-IN",
41
+ }
42
+
43
+ STREAM_SUPPORTED_LANGUAGES: set[str] = SUPPORTED_LANGUAGES | {
44
+ "en-hi-IN-latn", "en-hi-in-cm",
45
+ }
46
+
47
+ SAMPLE_RATE_16K = 16000
48
+ SAMPLE_RATE_8K = 8000
49
+ STREAM_CHUNK_BYTES = 1024
50
+
51
+
52
@dataclass
class GnaniSTTOptions:
    """Resolved configuration shared by the REST and WebSocket STT paths."""

    # API key; sent as the X-API-Key-ID (REST) / x-api-key-id (WebSocket) header.
    api_key: str
    # Language code; sent as the `language_code` form field (REST) or the
    # `lang_code` header (WebSocket).
    language: str
    # Audio sample rate in Hz; STT.__init__ only accepts 8000 or 16000.
    sample_rate: int = SAMPLE_RATE_16K
    # HTTP(S) base URL of the API; the streaming URL is derived from it.
    base_url: str = GNANI_STT_BASE_URL
    # Optional identifiers, forwarded as headers on REST requests only.
    organization_id: str | None = None
    user_id: str | None = None
60
+
61
+
62
class STT(stt.STT):
    """Gnani Vachana Speech-to-Text implementation.

    Provides speech-to-text functionality using Gnani's Vachana platform.
    Supports batch recognition via REST API and real-time streaming via WebSocket.

    Args:
        language: BCP-47 language code (e.g. "hi-IN", "en-IN").
        api_key: Gnani API key (falls back to GNANI_API_KEY env var).
        sample_rate: Audio sample rate for streaming (8000 or 16000).
        base_url: Vachana API base URL.
        organization_id: Organization ID for REST API (falls back to GNANI_ORGANIZATION_ID).
        user_id: User ID for REST API (falls back to GNANI_USER_ID).
        http_session: Optional existing HTTP client session to reuse for REST
            requests. When omitted, the session from LiveKit's HTTP context is
            obtained lazily on first use.

    Raises:
        ValueError: If no API key can be resolved, or ``sample_rate`` is not
            8000 or 16000.
    """

    def __init__(
        self,
        *,
        language: str = "en-IN",
        api_key: str | None = None,
        sample_rate: int = SAMPLE_RATE_16K,
        base_url: str = GNANI_STT_BASE_URL,
        organization_id: str | None = None,
        user_id: str | None = None,
        http_session: utils.aiohttp.ClientSession | None = None,
    ) -> None:
        super().__init__(
            capabilities=stt.STTCapabilities(
                streaming=True,
                interim_results=False,
                aligned_transcript=False,
            )
        )

        self._api_key = api_key or os.environ.get("GNANI_API_KEY")
        if not self._api_key:
            raise ValueError(
                "Gnani API key is required. "
                "Provide it directly or set GNANI_API_KEY environment variable."
            )

        if sample_rate not in (SAMPLE_RATE_8K, SAMPLE_RATE_16K):
            raise ValueError("sample_rate must be 8000 or 16000")

        self._opts = GnaniSTTOptions(
            api_key=self._api_key,
            language=language,
            sample_rate=sample_rate,
            base_url=base_url,
            organization_id=organization_id or os.environ.get("GNANI_ORGANIZATION_ID"),
            user_id=user_id or os.environ.get("GNANI_USER_ID"),
        )
        # Honor a caller-supplied session instead of silently discarding it;
        # _ensure_session() only falls back to the shared one when this is None.
        self._session: utils.aiohttp.ClientSession | None = http_session

    @property
    def model(self) -> str:
        """Model identifier reported for this STT."""
        return "vachana-stt-v3"

    @property
    def provider(self) -> str:
        """Provider name reported for this STT."""
        return "Gnani"

    def _ensure_session(self) -> utils.aiohttp.ClientSession:
        """Return the HTTP session, fetching the shared one on first use."""
        if not self._session:
            self._session = utils.http_context.http_session()
        return self._session

    @staticmethod
    def _single_attempt(conn_options: APIConnectOptions) -> APIConnectOptions:
        """Copy ``conn_options`` with retries disabled (``max_retry=0``)."""
        return APIConnectOptions(
            max_retry=0,
            retry_interval=conn_options.retry_interval,
            timeout=conn_options.timeout,
        )

    async def recognize(
        self,
        buffer: AudioBuffer,
        *,
        language: NotGivenOr[str] = NOT_GIVEN,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    ) -> stt.SpeechEvent:
        """Recognize a complete audio buffer, making a single attempt only."""
        return await super().recognize(
            buffer,
            language=language,
            conn_options=self._single_attempt(conn_options),
        )

    async def _recognize_impl(
        self,
        buffer: AudioBuffer,
        *,
        language: NotGivenOr[str] = NOT_GIVEN,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    ) -> stt.SpeechEvent:
        """Upload the buffer as a WAV file to ``POST /stt/v3`` and return the transcript.

        Raises:
            APIStatusError: The API answered with a non-200 status.
            APITimeoutError: The request timed out.
            APIConnectionError: Any other failure while talking to the API.
        """
        import aiohttp

        lang = language if is_given(language) else self._opts.language

        wav_bytes = rtc.combine_audio_frames(buffer).to_wav_bytes()

        form_data = aiohttp.FormData()
        form_data.add_field(
            "audio_file", wav_bytes, filename="audio.wav", content_type="audio/wav"
        )
        form_data.add_field("language_code", lang)

        headers: dict[str, str] = {
            "X-API-Key-ID": self._opts.api_key,
        }
        if self._opts.organization_id:
            headers["X-Organization-ID"] = self._opts.organization_id
        if self._opts.user_id:
            headers["X-API-User-ID"] = self._opts.user_id

        # Tolerate a base URL configured with a trailing slash.
        url = f"{self._opts.base_url.rstrip('/')}/stt/v3"

        try:
            async with self._ensure_session().post(
                url=url,
                data=form_data,
                headers=headers,
                timeout=aiohttp.ClientTimeout(
                    total=conn_options.timeout,
                    sock_connect=conn_options.timeout,
                ),
            ) as res:
                if res.status != 200:
                    error_text = await res.text()
                    logger.error(f"Gnani STT API error: {res.status} - {error_text}")
                    raise APIStatusError(
                        message=f"Gnani STT API Error ({res.status}): {error_text}",
                        status_code=res.status,
                        body=error_text,
                    )

                response_json = await res.json()
                transcript = response_json.get("transcript", "")
                request_id = response_json.get("request_id", "")

                return stt.SpeechEvent(
                    type=stt.SpeechEventType.FINAL_TRANSCRIPT,
                    request_id=request_id,
                    alternatives=[
                        stt.SpeechData(
                            language=lang,
                            text=transcript,
                            # The response confidence is not read; a fixed
                            # value is reported.
                            confidence=1.0,
                        )
                    ],
                )

        except asyncio.TimeoutError as e:
            raise APITimeoutError("Gnani STT API request timed out") from e
        except (APIStatusError, APIConnectionError, APITimeoutError):
            raise
        except Exception as e:
            raise APIConnectionError(f"Gnani STT error: {e}") from e

    def stream(
        self,
        *,
        language: NotGivenOr[str] = NOT_GIVEN,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    ) -> SpeechStream:
        """Open a streaming recognition session with retries disabled.

        The stream receives its own copy of the options, with ``language``
        overridden when one is given.
        """
        lang = language if is_given(language) else self._opts.language
        return SpeechStream(
            stt=self,
            opts=GnaniSTTOptions(
                api_key=self._opts.api_key,
                language=lang,
                sample_rate=self._opts.sample_rate,
                base_url=self._opts.base_url,
                organization_id=self._opts.organization_id,
                user_id=self._opts.user_id,
            ),
            conn_options=self._single_attempt(conn_options),
        )

    async def aclose(self) -> None:
        """No-op: this class does not close the HTTP session."""
        pass
241
+
242
+
243
class SpeechStream(stt.RecognizeStream):
    """WebSocket-based streaming STT for Gnani Vachana.

    Connects to wss://api.vachana.ai/stt/v3/stream and sends raw PCM audio
    in 1024-byte chunks (512 samples, 16-bit mono).
    """

    def __init__(
        self,
        *,
        stt: STT,
        opts: GnaniSTTOptions,
        conn_options: APIConnectOptions,
    ) -> None:
        super().__init__(
            stt=stt,
            conn_options=conn_options,
            sample_rate=opts.sample_rate,
        )
        self._opts = opts

    def _build_ws_url(self) -> str:
        """Derive the streaming WebSocket URL from the configured base URL.

        ``https://`` maps to ``wss://`` and ``http://`` to ``ws://``. A base URL
        that already uses a WebSocket scheme is kept as is, a scheme-less host
        defaults to ``wss://``, and trailing slashes are dropped so the path is
        never doubled.
        """
        base = self._opts.base_url.rstrip("/")
        if base.startswith("https://"):
            ws_base = "wss://" + base[len("https://"):]
        elif base.startswith("http://"):
            ws_base = "ws://" + base[len("http://"):]
        elif base.startswith(("ws://", "wss://")):
            ws_base = base
        else:
            ws_base = "wss://" + base
        return f"{ws_base}/stt/v3/stream"

    async def _run(self) -> None:
        """Connect, then pump audio out and transcripts in until either side ends.

        Raises:
            APIConnectionError: The socket closed unexpectedly or another error occurred.
            APITimeoutError: Connecting or waiting for the first message timed out.
            APIStatusError: The server reported an error message on the stream.
        """
        # Import the asyncio client explicitly: `additional_headers` belongs to
        # this implementation, whereas the top-level `websockets.connect` alias
        # only points at it from websockets 14 onwards.
        from websockets.asyncio.client import connect
        from websockets.exceptions import ConnectionClosed

        ws_url = self._build_ws_url()
        headers = {
            "x-api-key-id": self._opts.api_key,
            "lang_code": self._opts.language,
        }

        try:
            async with connect(
                ws_url,
                additional_headers=headers,
                ping_interval=20,
                ping_timeout=20,
                close_timeout=10,
            ) as ws:
                # The first message is expected to be a "connected" message;
                # anything else is logged but not treated as fatal.
                connected_msg = await asyncio.wait_for(ws.recv(), timeout=10)
                connected_data = json.loads(connected_msg)
                if connected_data.get("type") != "connected":
                    logger.warning(
                        f"Unexpected first message from Gnani STT: {connected_data}"
                    )

                send_task = asyncio.create_task(
                    self._send_audio(ws), name="gnani-stt-send"
                )
                recv_task = asyncio.create_task(
                    self._recv_messages(ws), name="gnani-stt-recv"
                )

                try:
                    await asyncio.gather(send_task, recv_task)
                finally:
                    # Stop whichever task is still running and wait for both.
                    # Their outcome is discarded here: the first failure has
                    # already propagated from the gather above.
                    send_task.cancel()
                    recv_task.cancel()
                    await asyncio.gather(send_task, recv_task, return_exceptions=True)

        except ConnectionClosed as e:
            raise APIConnectionError(
                f"Gnani STT WebSocket closed unexpectedly: {e}"
            ) from e
        # asyncio.TimeoutError only became an alias of the builtin in Python 3.11.
        except (asyncio.TimeoutError, TimeoutError) as e:
            raise APITimeoutError("Gnani STT WebSocket connection timed out") from e
        except (APIConnectionError, APIStatusError, APITimeoutError):
            raise
        except Exception as e:
            raise APIConnectionError(f"Gnani STT WebSocket error: {e}") from e

    async def _send_audio(self, ws) -> None:
        """Forward input frames to the socket in STREAM_CHUNK_BYTES-sized messages.

        A flush sentinel sends whatever partial chunk is buffered. Once the
        input channel is exhausted the remainder is sent and the socket closed.
        """
        audio_buffer = bytearray()

        async for data in self._input_ch:
            if isinstance(data, self._FlushSentinel):
                if audio_buffer:
                    await ws.send(bytes(audio_buffer))
                    audio_buffer.clear()
                continue

            frame: rtc.AudioFrame = data
            audio_buffer.extend(frame.data.tobytes())

            while len(audio_buffer) >= STREAM_CHUNK_BYTES:
                chunk = bytes(audio_buffer[:STREAM_CHUNK_BYTES])
                # Trim in place rather than rebinding to a freshly copied tail.
                del audio_buffer[:STREAM_CHUNK_BYTES]
                await ws.send(chunk)

        if audio_buffer:
            await ws.send(bytes(audio_buffer))

        await ws.close()

    async def _recv_messages(self, ws) -> None:
        """Translate server messages into speech events on the event channel.

        Binary messages, empty transcripts, "processing" messages and unknown
        message types are ignored.

        Raises:
            APIStatusError: The server sent an "error" message.
            APIConnectionError: Receiving or decoding a message failed.
        """
        try:
            async for msg in ws:
                if isinstance(msg, bytes):
                    continue

                data = json.loads(msg)
                msg_type = data.get("type", "")

                if msg_type == "transcript":
                    text = data.get("text", "")
                    if not text:
                        continue

                    self._event_ch.send_nowait(
                        stt.SpeechEvent(
                            type=stt.SpeechEventType.FINAL_TRANSCRIPT,
                            alternatives=[
                                stt.SpeechData(
                                    language=self._opts.language,
                                    text=text,
                                    confidence=1.0,
                                )
                            ],
                        )
                    )

                elif msg_type in ("speech_start", "vad_start"):
                    self._event_ch.send_nowait(
                        stt.SpeechEvent(
                            type=stt.SpeechEventType.START_OF_SPEECH,
                        )
                    )

                elif msg_type in ("speech_end", "vad_end"):
                    self._event_ch.send_nowait(
                        stt.SpeechEvent(
                            type=stt.SpeechEventType.END_OF_SPEECH,
                        )
                    )

                elif msg_type == "processing":
                    pass

                elif msg_type == "error":
                    error_msg = data.get("message", "Unknown error")
                    logger.error(f"Gnani STT stream error: {error_msg}")
                    raise APIStatusError(
                        message=f"Gnani STT stream error: {error_msg}",
                        status_code=500,
                        body=error_msg,
                    )

        except asyncio.CancelledError:
            raise
        except (APIStatusError, APIConnectionError):
            raise
        except Exception as e:
            raise APIConnectionError(
                f"Error receiving Gnani STT messages: {e}"
            ) from e
@@ -0,0 +1,376 @@
1
+ """Text-to-Speech implementation for Gnani Vachana
2
+
3
+ This module provides a TTS implementation that uses the Gnani Vachana API,
4
+ supporting both chunked synthesis (REST) and real-time streaming (WebSocket).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import asyncio
10
+ import base64
11
+ import json
12
+ import os
13
+ from dataclasses import dataclass, replace
14
+ from typing import Literal
15
+
16
+ from livekit.agents import (
17
+ DEFAULT_API_CONNECT_OPTIONS,
18
+ APIConnectionError,
19
+ APIConnectOptions,
20
+ APIStatusError,
21
+ APITimeoutError,
22
+ tokenize,
23
+ tts,
24
+ utils,
25
+ )
26
+
27
+ from .log import logger
28
+
29
+ GNANI_TTS_BASE_URL = "https://api.vachana.ai"
30
+
31
+ GnaniTTSVoices = Literal[
32
+ "sia", "raju", "kanika", "nikita", "ravan", "simran", "karan", "neha",
33
+ ]
34
+
35
+ SUPPORTED_VOICES: set[str] = {
36
+ "sia", "raju", "kanika", "nikita", "ravan", "simran", "karan", "neha",
37
+ }
38
+
39
+ GnaniTTSEncodings = Literal["linear_pcm", "oggopus"]
40
+ GnaniTTSContainers = Literal["raw", "mp3", "wav", "mulaw", "ogg"]
41
+
42
+
43
@dataclass
class GnaniTTSOptions:
    """Resolved configuration shared by the REST and WebSocket TTS paths."""

    # API key; sent as the X-API-Key-ID header.
    api_key: str
    # Voice identifier; TTS validates it against SUPPORTED_VOICES.
    voice: str = "sia"
    # Model name sent with every synthesis request.
    model: str = "vachana-voice-v2"
    # The next five fields make up the `audio_config` object of a request.
    sample_rate: int = 24000
    encoding: str = "linear_pcm"
    container: str = "wav"
    num_channels: int = 1
    # Not exposed by TTS.__init__, so it always keeps this default.
    # NOTE(review): presumably bytes per sample — confirm against the API docs.
    sample_width: int = 2
    # HTTP(S) base URL of the API; the WebSocket URL is derived from it.
    base_url: str = GNANI_TTS_BASE_URL
    # Language code; only included in WebSocket requests.
    language: str = "IND-IN"
55
+
56
+
57
class TTS(tts.TTS):
    """Text-to-Speech backed by Gnani's Vachana platform.

    Offers single-request synthesis over REST (``synthesize``) and incremental
    synthesis over WebSocket (``stream``).

    Args:
        voice: Voice to use for synthesis (sia, raju, kanika, etc.).
        model: TTS model name (default: vachana-voice-v2).
        sample_rate: Audio output sample rate (8000-44100).
        num_channels: Number of output audio channels.
        encoding: Audio encoding (linear_pcm or oggopus).
        container: Audio container format (raw, mp3, wav, mulaw, ogg).
        api_key: Gnani API key (falls back to GNANI_API_KEY env var).
        base_url: Vachana API base URL.
        language: Language code for WebSocket TTS (default: IND-IN).

    Raises:
        ValueError: If no API key can be resolved or the voice is unknown.
    """

    def __init__(
        self,
        *,
        voice: GnaniTTSVoices | str = "sia",
        model: str = "vachana-voice-v2",
        sample_rate: int = 24000,
        num_channels: int = 1,
        encoding: GnaniTTSEncodings | str = "linear_pcm",
        container: GnaniTTSContainers | str = "wav",
        api_key: str | None = None,
        base_url: str = GNANI_TTS_BASE_URL,
        language: str = "IND-IN",
    ) -> None:
        capabilities = tts.TTSCapabilities(streaming=True)
        super().__init__(
            capabilities=capabilities,
            sample_rate=sample_rate,
            num_channels=num_channels,
        )

        resolved_key = api_key or os.environ.get("GNANI_API_KEY")
        self._api_key = resolved_key
        if not resolved_key:
            raise ValueError(
                "Gnani API key is required. "
                "Provide it directly or set GNANI_API_KEY environment variable."
            )

        self._require_supported_voice(voice)

        self._opts = GnaniTTSOptions(
            api_key=resolved_key,
            voice=voice,
            model=model,
            sample_rate=sample_rate,
            encoding=encoding,
            container=container,
            num_channels=num_channels,
            base_url=base_url,
            language=language,
        )
        self._session = None

    @staticmethod
    def _require_supported_voice(voice: str) -> None:
        """Raise ValueError unless ``voice`` is one of SUPPORTED_VOICES."""
        if voice in SUPPORTED_VOICES:
            return
        raise ValueError(
            f"Voice '{voice}' not supported. Choose from: {sorted(SUPPORTED_VOICES)}"
        )

    @property
    def model(self) -> str:
        """Currently configured model name."""
        return self._opts.model

    @property
    def provider(self) -> str:
        """Provider name reported for this TTS."""
        return "Gnani"

    def _ensure_session(self):
        """Return the HTTP session, fetching the shared one on first use."""
        session = self._session
        if not session:
            session = utils.http_context.http_session()
            self._session = session
        return session

    def synthesize(
        self, text: str, *, conn_options: APIConnectOptions | None = None
    ) -> ChunkedStream:
        """Synthesize ``text`` with a single REST request."""
        effective_options = (
            conn_options if conn_options is not None else DEFAULT_API_CONNECT_OPTIONS
        )
        return ChunkedStream(tts=self, input_text=text, conn_options=effective_options)

    def stream(
        self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
    ) -> SynthesizeStream:
        """Open a WebSocket-backed incremental synthesis stream."""
        return SynthesizeStream(tts=self, conn_options=conn_options)

    def update_options(
        self,
        *,
        voice: str | None = None,
        model: str | None = None,
        language: str | None = None,
    ) -> None:
        """Change voice, model and/or language; ``None`` leaves a value untouched.

        Raises:
            ValueError: If ``voice`` is given but not supported. Nothing is
                modified in that case.
        """
        if voice is not None:
            self._require_supported_voice(voice)
            self._opts.voice = voice

        # The remaining options are applied without validation.
        for field_name, new_value in (("model", model), ("language", language)):
            if new_value is not None:
                setattr(self._opts, field_name, new_value)

    async def aclose(self) -> None:
        """No-op: this class does not close the HTTP session."""
        pass
163
+
164
+
165
class ChunkedStream(tts.ChunkedStream):
    """Single-request (REST) synthesis for Gnani Vachana.

    Posts the whole input text to /api/v1/tts/inference and emits the response
    body as one audio payload.
    """

    def __init__(
        self, *, tts: TTS, input_text: str, conn_options: APIConnectOptions
    ) -> None:
        super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
        self._tts: TTS = tts
        # Copy the options taken from the parent TTS at construction time.
        self._opts = replace(tts._opts)

    async def _run(self, output_emitter: tts.AudioEmitter) -> None:
        """Send the synthesis request and push the returned audio to the emitter.

        Raises:
            APIStatusError: The API answered with a non-200 status.
            APITimeoutError: The request timed out.
            APIConnectionError: Any other failure while talking to the API.
        """
        import aiohttp

        opts = self._opts

        audio_config = {
            "sample_rate": opts.sample_rate,
            "encoding": opts.encoding,
            "num_channels": opts.num_channels,
            "sample_width": opts.sample_width,
            "container": opts.container,
        }
        payload = {
            "text": self._input_text,
            "voice": opts.voice,
            "model": opts.model,
            "audio_config": audio_config,
        }

        headers = {
            "X-API-Key-ID": opts.api_key,
            "Content-Type": "application/json",
        }

        # The "raw" container is reported as PCM; every other container name is
        # used directly as the MIME subtype.
        mime_type = "audio/pcm" if opts.container == "raw" else f"audio/{opts.container}"

        request_timeout = aiohttp.ClientTimeout(
            total=self._conn_options.timeout,
            sock_connect=self._conn_options.timeout,
        )
        endpoint = f"{opts.base_url}/api/v1/tts/inference"

        try:
            async with self._tts._ensure_session().post(
                url=endpoint,
                json=payload,
                headers=headers,
                timeout=request_timeout,
            ) as res:
                if res.status != 200:
                    error_text = await res.text()
                    logger.error(f"Gnani TTS API error: {res.status} - {error_text}")
                    raise APIStatusError(
                        message=f"Gnani TTS API Error ({res.status}): {error_text}",
                        status_code=res.status,
                        body=error_text,
                    )

                audio_bytes = await res.read()

                # The emitter is only initialized once the body has been read.
                output_emitter.initialize(
                    request_id="gnani-tts",
                    sample_rate=self._tts.sample_rate,
                    num_channels=self._tts.num_channels,
                    mime_type=mime_type,
                )
                output_emitter.push(audio_bytes)

        except asyncio.TimeoutError as e:
            raise APITimeoutError("Gnani TTS API request timed out") from e
        except (APIStatusError, APIConnectionError, APITimeoutError):
            raise
        except Exception as e:
            raise APIConnectionError(f"Gnani TTS error: {e}") from e
238
+
239
+
240
class SynthesizeStream(tts.SynthesizeStream):
    """WebSocket-based streaming TTS for Gnani Vachana.

    Input text is split into sentences; each sentence is synthesized over its
    own WebSocket connection to wss://api.vachana.ai/api/v1/tts and the audio
    is pushed to the emitter as it arrives.
    """

    def __init__(self, *, tts: TTS, conn_options: APIConnectOptions):
        super().__init__(tts=tts, conn_options=conn_options)
        self._tts: TTS = tts
        # Copy the options taken from the parent TTS at construction time.
        self._opts = replace(tts._opts)

    def _build_ws_url(self) -> str:
        """Derive the TTS WebSocket URL from the configured base URL.

        ``https://`` maps to ``wss://`` and ``http://`` to ``ws://``. A base URL
        that already uses a WebSocket scheme is kept as is, a scheme-less host
        defaults to ``wss://``, and trailing slashes are dropped so the path is
        never doubled.
        """
        base = self._opts.base_url.rstrip("/")
        if base.startswith("https://"):
            ws_base = "wss://" + base[len("https://"):]
        elif base.startswith("http://"):
            ws_base = "ws://" + base[len("http://"):]
        elif base.startswith(("ws://", "wss://")):
            ws_base = base
        else:
            ws_base = "wss://" + base
        return f"{ws_base}/api/v1/tts"

    def _mime_type(self) -> str:
        """MIME type announced to the emitter for the configured container."""
        if self._opts.container == "raw":
            return "audio/pcm"
        return f"audio/{self._opts.container}"

    async def _run(self, output_emitter: tts.AudioEmitter) -> None:
        """Tokenize incoming text into sentences and synthesize them in order.

        Raises:
            APIStatusError, APITimeoutError, APIConnectionError: Propagated from
                synthesizing a sentence.
        """
        sentence_stream = tokenize.basic.SentenceTokenizer().stream()

        async def _forward_input() -> None:
            # Feed pushed text and flush markers into the tokenizer, then signal
            # end of input so the consumer loop below can terminate.
            async for data in self._input_ch:
                if isinstance(data, str):
                    sentence_stream.push_text(data)
                elif isinstance(data, self._FlushSentinel):
                    sentence_stream.flush()
            sentence_stream.end_input()

        async def _synthesize_sentences() -> None:
            emitter_ready = False
            async for ev in sentence_stream:
                text = ev.token
                if not text.strip():
                    continue
                if not emitter_ready:
                    # Initialize exactly once, before the first audio is pushed,
                    # instead of once per sentence.
                    # NOTE(review): other LiveKit plugins pass stream=True and
                    # delimit segments on the emitter — confirm whether that is
                    # required here.
                    output_emitter.initialize(
                        request_id="gnani-tts-stream",
                        sample_rate=self._tts.sample_rate,
                        num_channels=self._tts.num_channels,
                        mime_type=self._mime_type(),
                    )
                    emitter_ready = True
                await self._synthesize_segment(text, output_emitter)

        input_task = asyncio.create_task(_forward_input(), name="gnani-tts-input")
        synth_task = asyncio.create_task(
            _synthesize_sentences(), name="gnani-tts-synthesize"
        )

        try:
            # Await both so a failure while reading input surfaces here instead
            # of leaving the tokenizer loop waiting forever.
            await asyncio.gather(input_task, synth_task)
        finally:
            input_task.cancel()
            synth_task.cancel()
            # Outcomes are discarded: the first failure has already propagated.
            await asyncio.gather(input_task, synth_task, return_exceptions=True)

    async def _synthesize_segment(
        self, text: str, output_emitter: tts.AudioEmitter
    ) -> None:
        """Synthesize one sentence over a fresh WebSocket connection.

        Expects ``output_emitter`` to be initialized already. Audio arrives
        either as binary messages or base64-encoded inside "audio"/"complete"
        JSON messages; a "complete" message ends the segment.

        Raises:
            APIStatusError: The server sent an "error" message.
            APITimeoutError: The connection timed out.
            APIConnectionError: The socket closed or another error occurred.
        """
        # Import the asyncio client explicitly: `additional_headers` belongs to
        # this implementation, whereas the top-level `websockets.connect` alias
        # only points at it from websockets 14 onwards.
        from websockets.asyncio.client import connect
        from websockets.exceptions import ConnectionClosed

        ws_url = self._build_ws_url()
        headers = {
            "Content-Type": "application/json",
            "X-API-Key-ID": self._opts.api_key,
        }

        try:
            async with connect(
                ws_url,
                additional_headers=headers,
                ping_interval=20,
                ping_timeout=20,
                close_timeout=10,
            ) as ws:
                request_body = {
                    "text": text,
                    "voice": self._opts.voice,
                    "model": self._opts.model,
                    "language": self._opts.language,
                    "audio_config": {
                        "sample_rate": self._opts.sample_rate,
                        "encoding": self._opts.encoding,
                        "num_channels": self._opts.num_channels,
                        "sample_width": self._opts.sample_width,
                        "container": self._opts.container,
                    },
                }
                await ws.send(json.dumps(request_body))

                async for msg in ws:
                    if isinstance(msg, bytes):
                        output_emitter.push(msg)
                        continue

                    data = json.loads(msg)
                    msg_type = data.get("type", "")

                    if msg_type == "audio":
                        audio_b64 = data.get("audio", "")
                        if audio_b64:
                            output_emitter.push(base64.b64decode(audio_b64))

                    elif msg_type == "complete":
                        # A final message may still carry a last audio payload.
                        audio_b64 = data.get("audio", "")
                        if audio_b64:
                            output_emitter.push(base64.b64decode(audio_b64))
                        break

                    elif msg_type == "error":
                        error_msg = data.get("message", "Unknown error")
                        logger.error(f"Gnani TTS stream error: {error_msg}")
                        raise APIStatusError(
                            message=f"Gnani TTS stream error: {error_msg}",
                            status_code=500,
                            body=error_msg,
                        )

        except ConnectionClosed as e:
            raise APIConnectionError(
                f"Gnani TTS WebSocket closed: {e}"
            ) from e
        # asyncio.TimeoutError only became an alias of the builtin in Python 3.11.
        except (asyncio.TimeoutError, TimeoutError) as e:
            raise APITimeoutError(
                "Gnani TTS WebSocket timed out"
            ) from e
        except (APIStatusError, APIConnectionError, APITimeoutError):
            raise
        except Exception as e:
            raise APIConnectionError(f"Gnani TTS WebSocket error: {e}") from e
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
@@ -0,0 +1,78 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "livekit-plugins-gnani"
7
+ version = "0.1.0"
8
+ description = "LiveKit Agents plugin for Gnani Vachana speech AI — STT & TTS for Indian languages"
9
+ readme = "README.md"
10
+ license = "Apache-2.0"
11
+ requires-python = ">=3.10"
12
+ authors = [
13
+ { name = "Genvoice", email = "speechstack@gnani.ai" },
14
+ ]
15
+ keywords = [
16
+ "webrtc", "realtime", "audio", "livekit", "livekit-agents",
17
+ "gnani", "vachana", "indian-languages", "indic",
18
+ "stt", "tts", "speech-to-text", "text-to-speech",
19
+ "multilingual", "streaming", "websocket",
20
+ ]
21
+ classifiers = [
22
+ "Development Status :: 4 - Beta",
23
+ "Intended Audience :: Developers",
24
+ "License :: OSI Approved :: Apache Software License",
25
+ "Operating System :: OS Independent",
26
+ "Programming Language :: Python :: 3",
27
+ "Programming Language :: Python :: 3.10",
28
+ "Programming Language :: Python :: 3.11",
29
+ "Programming Language :: Python :: 3.12",
30
+ "Programming Language :: Python :: 3.13",
31
+ "Programming Language :: Python :: 3 :: Only",
32
+ "Topic :: Multimedia :: Sound/Audio",
33
+ "Topic :: Multimedia :: Sound/Audio :: Speech",
34
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
35
+ ]
36
+ dependencies = [
37
+ "livekit-agents[codecs]>=1.5.8",
38
+ "gnani-vachana>=0.2.2,<1.0",
39
+ "websockets>=14.0,<16.0",  # floor is 14.0: the TTS stream calls websockets.connect(additional_headers=...), which the default client only accepts from 14.0 (13.x legacy client uses extra_headers)
40
+ ]
41
+
42
+ [project.urls]
43
+ Homepage = "https://gnani.ai"
44
+ Documentation = "https://docs.inya.ai/vachana"
45
+ Repository = "https://github.com/Gnani-AI-Mintlify/livekit-plugins-gnani"
46
+ Issues = "https://github.com/Gnani-AI-Mintlify/livekit-plugins-gnani/issues"
47
+
48
+ [tool.hatch.build.targets.wheel]
49
+ packages = ["livekit"]
50
+
51
+ [tool.hatch.build.targets.sdist]
52
+ include = ["/livekit"]
53
+
54
+ # ---------------------------------------------------------------------------
55
+ # Ruff
56
+ # ---------------------------------------------------------------------------
57
+ [tool.ruff]
58
+ target-version = "py310"
59
+ line-length = 100
60
+
61
+ [tool.ruff.lint]
62
+ select = ["E", "W", "F", "I", "N", "UP", "B", "SIM", "TCH", "RUF"]
63
+ ignore = ["E501"]
64
+
65
+ [tool.ruff.lint.isort]
66
+ known-first-party = ["livekit"]
67
+
68
+ [tool.ruff.format]
69
+ quote-style = "double"
70
+ indent-style = "space"
71
+
72
+ # ---------------------------------------------------------------------------
73
+ # Mypy
74
+ # ---------------------------------------------------------------------------
75
+ [tool.mypy]
76
+ python_version = "3.10"
77
+ warn_return_any = true
78
+ warn_unused_configs = true