livekit-plugins-gnani 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit_plugins_gnani-0.1.0/.gitignore +36 -0
- livekit_plugins_gnani-0.1.0/LICENSE +17 -0
- livekit_plugins_gnani-0.1.0/PKG-INFO +148 -0
- livekit_plugins_gnani-0.1.0/README.md +117 -0
- livekit_plugins_gnani-0.1.0/livekit/__init__.py +0 -0
- livekit_plugins_gnani-0.1.0/livekit/plugins/__init__.py +0 -0
- livekit_plugins_gnani-0.1.0/livekit/plugins/gnani/__init__.py +36 -0
- livekit_plugins_gnani-0.1.0/livekit/plugins/gnani/log.py +3 -0
- livekit_plugins_gnani-0.1.0/livekit/plugins/gnani/py.typed +0 -0
- livekit_plugins_gnani-0.1.0/livekit/plugins/gnani/stt.py +410 -0
- livekit_plugins_gnani-0.1.0/livekit/plugins/gnani/tts.py +376 -0
- livekit_plugins_gnani-0.1.0/livekit/plugins/gnani/version.py +1 -0
- livekit_plugins_gnani-0.1.0/pyproject.toml +78 -0
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
__pycache__/
|
|
2
|
+
*.py[cod]
|
|
3
|
+
*$py.class
|
|
4
|
+
*.so
|
|
5
|
+
|
|
6
|
+
*.egg-info/
|
|
7
|
+
dist/
|
|
8
|
+
build/
|
|
9
|
+
.eggs/
|
|
10
|
+
*.egg
|
|
11
|
+
|
|
12
|
+
.venv/
|
|
13
|
+
venv/
|
|
14
|
+
env/
|
|
15
|
+
|
|
16
|
+
.env
|
|
17
|
+
.env.*
|
|
18
|
+
*.ini
|
|
19
|
+
*.cfg
|
|
20
|
+
!pyproject.toml
|
|
21
|
+
|
|
22
|
+
*.log
|
|
23
|
+
.mypy_cache/
|
|
24
|
+
.pytest_cache/
|
|
25
|
+
.ruff_cache/
|
|
26
|
+
.tox/
|
|
27
|
+
.coverage
|
|
28
|
+
htmlcov/
|
|
29
|
+
|
|
30
|
+
.idea/
|
|
31
|
+
.vscode/
|
|
32
|
+
*.swp
|
|
33
|
+
*.swo
|
|
34
|
+
*~
|
|
35
|
+
.DS_Store
|
|
36
|
+
Thumbs.db
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
Apache License
|
|
2
|
+
Version 2.0, January 2004
|
|
3
|
+
http://www.apache.org/licenses/
|
|
4
|
+
|
|
5
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
6
|
+
you may not use this file except in compliance with the License.
|
|
7
|
+
You may obtain a copy of the License at
|
|
8
|
+
|
|
9
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
|
|
11
|
+
Unless required by applicable law or agreed to in writing, software
|
|
12
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
13
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
|
+
See the License for the specific language governing permissions and
|
|
15
|
+
limitations under the License.
|
|
16
|
+
|
|
17
|
+
Copyright 2025-2026 Gnani.ai
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: livekit-plugins-gnani
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: LiveKit Agents plugin for Gnani Vachana speech AI — STT & TTS for Indian languages
|
|
5
|
+
Project-URL: Homepage, https://gnani.ai
|
|
6
|
+
Project-URL: Documentation, https://docs.inya.ai/vachana
|
|
7
|
+
Project-URL: Repository, https://github.com/Gnani-AI-Mintlify/livekit-plugins-gnani
|
|
8
|
+
Project-URL: Issues, https://github.com/Gnani-AI-Mintlify/livekit-plugins-gnani/issues
|
|
9
|
+
Author-email: Genvoice <speechstack@gnani.ai>
|
|
10
|
+
License-Expression: Apache-2.0
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: audio,gnani,indian-languages,indic,livekit,livekit-agents,multilingual,realtime,speech-to-text,streaming,stt,text-to-speech,tts,vachana,webrtc,websocket
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
23
|
+
Classifier: Topic :: Multimedia :: Sound/Audio
|
|
24
|
+
Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
|
|
25
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
26
|
+
Requires-Python: >=3.10
|
|
27
|
+
Requires-Dist: gnani-vachana<1.0,>=0.2.2
|
|
28
|
+
Requires-Dist: livekit-agents[codecs]>=1.5.8
|
|
29
|
+
Requires-Dist: websockets<16.0,>=13.1
|
|
30
|
+
Description-Content-Type: text/markdown
|
|
31
|
+
|
|
32
|
+
# livekit-plugins-gnani
|
|
33
|
+
|
|
34
|
+
[PyPI](https://pypi.org/project/livekit-plugins-gnani/)
|
|
35
|
+
[License: Apache 2.0](LICENSE)
|
|
36
|
+
|
|
37
|
+
[LiveKit Agents](https://github.com/livekit/agents) plugin for **[Gnani Vachana](https://gnani.ai/)** — high-accuracy Speech-to-Text and low-latency Text-to-Speech for Indian languages.
|
|
38
|
+
|
|
39
|
+
> **Vachana** is a production-ready speech AI platform by [Gnani.ai](https://gnani.ai) supporting 10+ Indian languages with real-time streaming, multilingual transcription, and code-switching capabilities.
|
|
40
|
+
|
|
41
|
+
## Installation
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
pip install livekit-plugins-gnani
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
This will also install the [`gnani-vachana`](https://pypi.org/project/gnani-vachana/) core SDK as a dependency.
|
|
48
|
+
|
|
49
|
+
## Prerequisites
|
|
50
|
+
|
|
51
|
+
You need a Gnani API key. Email **[speechstack@gnani.ai](mailto:speechstack@gnani.ai)** to get started — all new accounts receive free credits, no credit card required.
|
|
52
|
+
|
|
53
|
+
Set your credentials as environment variables:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
export GNANI_API_KEY="your-api-key"
|
|
57
|
+
|
|
58
|
+
# For REST STT only (optional):
|
|
59
|
+
export GNANI_ORGANIZATION_ID="your-org-id"
|
|
60
|
+
export GNANI_USER_ID="your-user-id"
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Quick Start
|
|
64
|
+
|
|
65
|
+
### Speech-to-Text
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
from livekit.plugins.gnani import STT
|
|
69
|
+
|
|
70
|
+
stt = STT(language="hi-IN")
|
|
71
|
+
|
|
72
|
+
# Use with a LiveKit voice agent pipeline
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
### Text-to-Speech
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
from livekit.plugins.gnani import TTS
|
|
79
|
+
|
|
80
|
+
tts = TTS(voice="sia")
|
|
81
|
+
|
|
82
|
+
# Use with a LiveKit voice agent pipeline
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Features
|
|
86
|
+
|
|
87
|
+
### STT
|
|
88
|
+
|
|
89
|
+
- **Batch recognition** — REST API (`POST /stt/v3`) for file-based transcription
|
|
90
|
+
- **Real-time streaming** — WebSocket API for live audio transcription with VAD
|
|
91
|
+
- **10 Indian languages** — bn-IN, en-IN, gu-IN, hi-IN, kn-IN, ml-IN, mr-IN, pa-IN, ta-IN, te-IN
|
|
92
|
+
- **Code-switching** — Hinglish (en-hi-IN-latn) and Hindi-English mixed (en-hi-in-cm) for streaming
|
|
93
|
+
- **Sample rates** — 8 kHz and 16 kHz
|
|
94
|
+
|
|
95
|
+
### TTS
|
|
96
|
+
|
|
97
|
+
- **Chunked synthesis** — REST API for single-request audio generation
|
|
98
|
+
- **Real-time streaming** — WebSocket API for low-latency streaming synthesis
|
|
99
|
+
- **8 voices** — sia, raju, kanika, nikita, ravan, simran, karan, neha
|
|
100
|
+
- **Configurable output** — sample rate (8000–44100), encoding (linear_pcm, oggopus), container (raw, mp3, wav, mulaw, ogg)
|
|
101
|
+
|
|
102
|
+
## Supported Languages
|
|
103
|
+
|
|
104
|
+
| Language | Code |
|
|
105
|
+
|-----------------|---------|
|
|
106
|
+
| Bengali | `bn-IN` |
|
|
107
|
+
| English (India) | `en-IN` |
|
|
108
|
+
| Gujarati | `gu-IN` |
|
|
109
|
+
| Hindi | `hi-IN` |
|
|
110
|
+
| Kannada | `kn-IN` |
|
|
111
|
+
| Malayalam | `ml-IN` |
|
|
112
|
+
| Marathi | `mr-IN` |
|
|
113
|
+
| Punjabi | `pa-IN` |
|
|
114
|
+
| Tamil | `ta-IN` |
|
|
115
|
+
| Telugu | `te-IN` |
|
|
116
|
+
|
|
117
|
+
## Available Voices
|
|
118
|
+
|
|
119
|
+
| Voice | ID |
|
|
120
|
+
|---------|-----------|
|
|
121
|
+
| Sia | `sia` |
|
|
122
|
+
| Raju | `raju` |
|
|
123
|
+
| Kanika | `kanika` |
|
|
124
|
+
| Nikita | `nikita` |
|
|
125
|
+
| Ravan | `ravan` |
|
|
126
|
+
| Simran | `simran` |
|
|
127
|
+
| Karan | `karan` |
|
|
128
|
+
| Neha | `neha` |
|
|
129
|
+
|
|
130
|
+
## Architecture
|
|
131
|
+
|
|
132
|
+
```
|
|
133
|
+
gnani-vachana ← Core SDK (REST, WebSocket, SSE clients)
|
|
134
|
+
↑
|
|
135
|
+
livekit-plugins-gnani ← This package (LiveKit Agents adapter)
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
This plugin is a thin adapter that wraps the `gnani-vachana` SDK into LiveKit's `stt.STT` and `tts.TTS` base classes. All connection logic, authentication, and audio format handling live in the core SDK.
|
|
139
|
+
|
|
140
|
+
## Documentation
|
|
141
|
+
|
|
142
|
+
- [Vachana API Docs](https://docs.inya.ai/vachana/introduction/introduction)
|
|
143
|
+
- [LiveKit Agents Docs](https://docs.livekit.io/agents/)
|
|
144
|
+
- [gnani-vachana SDK](https://pypi.org/project/gnani-vachana/)
|
|
145
|
+
|
|
146
|
+
## License
|
|
147
|
+
|
|
148
|
+
Apache 2.0 — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# livekit-plugins-gnani
|
|
2
|
+
|
|
3
|
+
[PyPI](https://pypi.org/project/livekit-plugins-gnani/)
|
|
4
|
+
[License: Apache 2.0](LICENSE)
|
|
5
|
+
|
|
6
|
+
[LiveKit Agents](https://github.com/livekit/agents) plugin for **[Gnani Vachana](https://gnani.ai/)** — high-accuracy Speech-to-Text and low-latency Text-to-Speech for Indian languages.
|
|
7
|
+
|
|
8
|
+
> **Vachana** is a production-ready speech AI platform by [Gnani.ai](https://gnani.ai) supporting 10+ Indian languages with real-time streaming, multilingual transcription, and code-switching capabilities.
|
|
9
|
+
|
|
10
|
+
## Installation
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
pip install livekit-plugins-gnani
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
This will also install the [`gnani-vachana`](https://pypi.org/project/gnani-vachana/) core SDK as a dependency.
|
|
17
|
+
|
|
18
|
+
## Prerequisites
|
|
19
|
+
|
|
20
|
+
You need a Gnani API key. Email **[speechstack@gnani.ai](mailto:speechstack@gnani.ai)** to get started — all new accounts receive free credits, no credit card required.
|
|
21
|
+
|
|
22
|
+
Set your credentials as environment variables:
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
export GNANI_API_KEY="your-api-key"
|
|
26
|
+
|
|
27
|
+
# For REST STT only (optional):
|
|
28
|
+
export GNANI_ORGANIZATION_ID="your-org-id"
|
|
29
|
+
export GNANI_USER_ID="your-user-id"
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Quick Start
|
|
33
|
+
|
|
34
|
+
### Speech-to-Text
|
|
35
|
+
|
|
36
|
+
```python
|
|
37
|
+
from livekit.plugins.gnani import STT
|
|
38
|
+
|
|
39
|
+
stt = STT(language="hi-IN")
|
|
40
|
+
|
|
41
|
+
# Use with a LiveKit voice agent pipeline
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### Text-to-Speech
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
from livekit.plugins.gnani import TTS
|
|
48
|
+
|
|
49
|
+
tts = TTS(voice="sia")
|
|
50
|
+
|
|
51
|
+
# Use with a LiveKit voice agent pipeline
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Features
|
|
55
|
+
|
|
56
|
+
### STT
|
|
57
|
+
|
|
58
|
+
- **Batch recognition** — REST API (`POST /stt/v3`) for file-based transcription
|
|
59
|
+
- **Real-time streaming** — WebSocket API for live audio transcription with VAD
|
|
60
|
+
- **10 Indian languages** — bn-IN, en-IN, gu-IN, hi-IN, kn-IN, ml-IN, mr-IN, pa-IN, ta-IN, te-IN
|
|
61
|
+
- **Code-switching** — Hinglish (en-hi-IN-latn) and Hindi-English mixed (en-hi-in-cm) for streaming
|
|
62
|
+
- **Sample rates** — 8 kHz and 16 kHz
|
|
63
|
+
|
|
64
|
+
### TTS
|
|
65
|
+
|
|
66
|
+
- **Chunked synthesis** — REST API for single-request audio generation
|
|
67
|
+
- **Real-time streaming** — WebSocket API for low-latency streaming synthesis
|
|
68
|
+
- **8 voices** — sia, raju, kanika, nikita, ravan, simran, karan, neha
|
|
69
|
+
- **Configurable output** — sample rate (8000–44100), encoding (linear_pcm, oggopus), container (raw, mp3, wav, mulaw, ogg)
|
|
70
|
+
|
|
71
|
+
## Supported Languages
|
|
72
|
+
|
|
73
|
+
| Language | Code |
|
|
74
|
+
|-----------------|---------|
|
|
75
|
+
| Bengali | `bn-IN` |
|
|
76
|
+
| English (India) | `en-IN` |
|
|
77
|
+
| Gujarati | `gu-IN` |
|
|
78
|
+
| Hindi | `hi-IN` |
|
|
79
|
+
| Kannada | `kn-IN` |
|
|
80
|
+
| Malayalam | `ml-IN` |
|
|
81
|
+
| Marathi | `mr-IN` |
|
|
82
|
+
| Punjabi | `pa-IN` |
|
|
83
|
+
| Tamil | `ta-IN` |
|
|
84
|
+
| Telugu | `te-IN` |
|
|
85
|
+
|
|
86
|
+
## Available Voices
|
|
87
|
+
|
|
88
|
+
| Voice | ID |
|
|
89
|
+
|---------|-----------|
|
|
90
|
+
| Sia | `sia` |
|
|
91
|
+
| Raju | `raju` |
|
|
92
|
+
| Kanika | `kanika` |
|
|
93
|
+
| Nikita | `nikita` |
|
|
94
|
+
| Ravan | `ravan` |
|
|
95
|
+
| Simran | `simran` |
|
|
96
|
+
| Karan | `karan` |
|
|
97
|
+
| Neha | `neha` |
|
|
98
|
+
|
|
99
|
+
## Architecture
|
|
100
|
+
|
|
101
|
+
```
|
|
102
|
+
gnani-vachana ← Core SDK (REST, WebSocket, SSE clients)
|
|
103
|
+
↑
|
|
104
|
+
livekit-plugins-gnani ← This package (LiveKit Agents adapter)
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
This plugin is a thin adapter that wraps the `gnani-vachana` SDK into LiveKit's `stt.STT` and `tts.TTS` base classes. All connection logic, authentication, and audio format handling live in the core SDK.
|
|
108
|
+
|
|
109
|
+
## Documentation
|
|
110
|
+
|
|
111
|
+
- [Vachana API Docs](https://docs.inya.ai/vachana/introduction/introduction)
|
|
112
|
+
- [LiveKit Agents Docs](https://docs.livekit.io/agents/)
|
|
113
|
+
- [gnani-vachana SDK](https://pypi.org/project/gnani-vachana/)
|
|
114
|
+
|
|
115
|
+
## License
|
|
116
|
+
|
|
117
|
+
Apache 2.0 — see [LICENSE](LICENSE).
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""Gnani Vachana plugin for LiveKit Agents
|
|
2
|
+
|
|
3
|
+
Support for speech-to-text and text-to-speech with [Gnani's Vachana platform](https://gnani.ai/).
|
|
4
|
+
|
|
5
|
+
Vachana provides high-accuracy STT and low-latency TTS for Indian languages,
|
|
6
|
+
including multilingual and code-switching scenarios.
|
|
7
|
+
|
|
8
|
+
For API access, email speechstack@gnani.ai
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from .stt import STT
|
|
12
|
+
from .tts import TTS
|
|
13
|
+
from .version import __version__
|
|
14
|
+
|
|
15
|
+
__all__ = ["STT", "TTS", "__version__"]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
from livekit.agents import Plugin
|
|
19
|
+
|
|
20
|
+
from .log import logger
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class GnaniPlugin(Plugin):
    """Plugin descriptor that identifies this package to LiveKit Agents."""

    def __init__(self) -> None:
        # Identify the plugin by this module's name, version, package and logger.
        super().__init__(
            __name__,
            __version__,
            __package__,
            logger,
        )
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Register a plugin instance as a side effect of importing this package.
Plugin.register_plugin(GnaniPlugin())

# Snapshot the module namespace before the helper names below are created,
# then keep only the names that are not part of the public API.
_module = dir()
NOT_IN_ALL = [name for name in _module if name not in __all__]

# Map every non-public name to False (presumably read by pdoc to hide
# those names from generated documentation -- confirm against the docs build).
__pdoc__ = {}

for n in NOT_IN_ALL:
    __pdoc__[n] = False
|
|
File without changes
|
|
@@ -0,0 +1,410 @@
|
|
|
1
|
+
"""Speech-to-Text implementation for Gnani Vachana
|
|
2
|
+
|
|
3
|
+
This module provides an STT implementation that uses the Gnani Vachana API,
|
|
4
|
+
supporting both batch recognition (REST) and real-time streaming (WebSocket).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import asyncio
|
|
10
|
+
import json
|
|
11
|
+
import os
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
from typing import Literal
|
|
14
|
+
|
|
15
|
+
from livekit import rtc
|
|
16
|
+
from livekit.agents import (
|
|
17
|
+
DEFAULT_API_CONNECT_OPTIONS,
|
|
18
|
+
APIConnectionError,
|
|
19
|
+
APIConnectOptions,
|
|
20
|
+
APIStatusError,
|
|
21
|
+
APITimeoutError,
|
|
22
|
+
stt,
|
|
23
|
+
utils,
|
|
24
|
+
)
|
|
25
|
+
from livekit.agents.types import NOT_GIVEN, NotGivenOr
|
|
26
|
+
from livekit.agents.utils import AudioBuffer
|
|
27
|
+
from livekit.agents.utils.misc import is_given
|
|
28
|
+
|
|
29
|
+
from .log import logger
|
|
30
|
+
|
|
31
|
+
# Default API root; REST and WebSocket endpoints are both derived from it.
GNANI_STT_BASE_URL = "https://api.vachana.ai"

# Language codes offered as a typing aid for callers.
GnaniSTTLanguages = Literal[
    "bn-IN",
    "en-IN",
    "gu-IN",
    "hi-IN",
    "kn-IN",
    "ml-IN",
    "mr-IN",
    "pa-IN",
    "ta-IN",
    "te-IN",
]

# Runtime counterpart of the Literal above.
SUPPORTED_LANGUAGES: set[str] = {
    "bn-IN",
    "en-IN",
    "gu-IN",
    "hi-IN",
    "kn-IN",
    "ml-IN",
    "mr-IN",
    "pa-IN",
    "ta-IN",
    "te-IN",
}

# Streaming accepts the base set plus two code-switching variants.
STREAM_SUPPORTED_LANGUAGES: set[str] = SUPPORTED_LANGUAGES.union(
    {
        "en-hi-IN-latn",
        "en-hi-in-cm",
    }
)

# Sample rates (Hz) accepted by the STT constructor.
SAMPLE_RATE_16K = 16000
SAMPLE_RATE_8K = 8000

# Size, in bytes, of each audio message sent over the streaming socket.
STREAM_CHUNK_BYTES = 1024
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@dataclass
class GnaniSTTOptions:
    """Settings bundle read by both the REST recognizer and the streaming session."""

    # Required: credential sent in the API-key header and the language code.
    api_key: str
    language: str

    # Sample rate (Hz) handed to the streaming session.
    sample_rate: int = SAMPLE_RATE_16K

    # API root from which the REST and WebSocket URLs are built.
    base_url: str = GNANI_STT_BASE_URL

    # Optional identifiers; the REST path sends each as a header only when set.
    organization_id: str | None = None
    user_id: str | None = None
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class STT(stt.STT):
    """Gnani Vachana Speech-to-Text implementation.

    Provides speech-to-text functionality using Gnani's Vachana platform.
    Supports batch recognition via REST API and real-time streaming via WebSocket.

    Args:
        language: BCP-47 language code (e.g. "hi-IN", "en-IN").
        api_key: Gnani API key (falls back to GNANI_API_KEY env var).
        sample_rate: Audio sample rate for streaming (8000 or 16000).
        base_url: Vachana API base URL.
        organization_id: Organization ID for REST API (falls back to GNANI_ORGANIZATION_ID).
        user_id: User ID for REST API (falls back to GNANI_USER_ID).
        http_session: Optional existing HTTP session used for REST requests.
            When omitted, a session is taken from ``utils.http_context`` on
            first use.

    Raises:
        ValueError: If no API key is available, or if ``sample_rate`` is
            neither 8000 nor 16000.
    """

    def __init__(
        self,
        *,
        language: str = "en-IN",
        api_key: str | None = None,
        sample_rate: int = SAMPLE_RATE_16K,
        base_url: str = GNANI_STT_BASE_URL,
        organization_id: str | None = None,
        user_id: str | None = None,
        http_session: utils.aiohttp.ClientSession | None = None,
    ) -> None:
        super().__init__(
            capabilities=stt.STTCapabilities(
                streaming=True,
                interim_results=False,
                aligned_transcript=False,
            )
        )

        self._api_key = api_key or os.environ.get("GNANI_API_KEY")
        if not self._api_key:
            raise ValueError(
                "Gnani API key is required. "
                "Provide it directly or set GNANI_API_KEY environment variable."
            )

        if sample_rate not in (SAMPLE_RATE_8K, SAMPLE_RATE_16K):
            raise ValueError("sample_rate must be 8000 or 16000")

        self._opts = GnaniSTTOptions(
            api_key=self._api_key,
            language=language,
            sample_rate=sample_rate,
            base_url=base_url,
            # Explicit arguments win; the environment is only a fallback.
            organization_id=organization_id or os.environ.get("GNANI_ORGANIZATION_ID"),
            user_id=user_id or os.environ.get("GNANI_USER_ID"),
        )
        # Keep the caller-supplied session (previously this argument was
        # silently dropped); _ensure_session() fills it in lazily otherwise.
        self._session: utils.aiohttp.ClientSession | None = http_session

    @property
    def model(self) -> str:
        """Fixed model identifier reported for this STT."""
        return "vachana-stt-v3"

    @property
    def provider(self) -> str:
        """Provider label reported for this STT."""
        return "Gnani"

    def _ensure_session(self) -> utils.aiohttp.ClientSession:
        """Return the HTTP session, fetching one from ``utils.http_context`` if unset."""
        if not self._session:
            self._session = utils.http_context.http_session()
        return self._session

    @staticmethod
    def _single_attempt(conn_options: APIConnectOptions) -> APIConnectOptions:
        """Copy ``conn_options`` with ``max_retry`` forced to 0.

        The retry interval and timeout are carried over unchanged.
        """
        return APIConnectOptions(
            max_retry=0,
            retry_interval=conn_options.retry_interval,
            timeout=conn_options.timeout,
        )

    async def recognize(
        self,
        buffer: AudioBuffer,
        *,
        language: NotGivenOr[str] = NOT_GIVEN,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    ) -> stt.SpeechEvent:
        """Run batch recognition through the base class with retries disabled."""
        return await super().recognize(
            buffer,
            language=language,
            conn_options=self._single_attempt(conn_options),
        )

    async def _recognize_impl(
        self,
        buffer: AudioBuffer,
        *,
        language: NotGivenOr[str] = NOT_GIVEN,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    ) -> stt.SpeechEvent:
        """Upload the buffered audio as WAV to ``<base_url>/stt/v3`` and return the transcript.

        Args:
            buffer: Audio frames to transcribe; they are merged into one WAV payload.
            language: Per-call language override; defaults to the configured language.
            conn_options: Supplies the request timeout.

        Returns:
            A ``FINAL_TRANSCRIPT`` event holding a single alternative.

        Raises:
            APIStatusError: The service answered with a non-200 status.
            APITimeoutError: The request timed out.
            APIConnectionError: Any other failure while performing the request.
        """
        import aiohttp

        lang = language if is_given(language) else self._opts.language

        wav_bytes = rtc.combine_audio_frames(buffer).to_wav_bytes()

        form_data = aiohttp.FormData()
        form_data.add_field(
            "audio_file", wav_bytes, filename="audio.wav", content_type="audio/wav"
        )
        form_data.add_field("language_code", lang)

        headers: dict[str, str] = {
            "X-API-Key-ID": self._opts.api_key,
        }
        # The organization and user headers are only sent when configured.
        if self._opts.organization_id:
            headers["X-Organization-ID"] = self._opts.organization_id
        if self._opts.user_id:
            headers["X-API-User-ID"] = self._opts.user_id

        # Tolerate a trailing slash on the configured base URL so the request
        # path never becomes "//stt/v3".
        url = f"{self._opts.base_url.rstrip('/')}/stt/v3"

        try:
            async with self._ensure_session().post(
                url=url,
                data=form_data,
                headers=headers,
                timeout=aiohttp.ClientTimeout(
                    total=conn_options.timeout,
                    sock_connect=conn_options.timeout,
                ),
            ) as res:
                if res.status != 200:
                    error_text = await res.text()
                    logger.error("Gnani STT API error: %s - %s", res.status, error_text)
                    raise APIStatusError(
                        message=f"Gnani STT API Error ({res.status}): {error_text}",
                        status_code=res.status,
                        body=error_text,
                    )

                response_json = await res.json()
                transcript = response_json.get("transcript", "")
                request_id = response_json.get("request_id", "")

                return stt.SpeechEvent(
                    type=stt.SpeechEventType.FINAL_TRANSCRIPT,
                    request_id=request_id,
                    alternatives=[
                        stt.SpeechData(
                            language=lang,
                            text=transcript,
                            # No confidence is read from the response; a
                            # constant is reported instead.
                            confidence=1.0,
                        )
                    ],
                )

        except asyncio.TimeoutError as e:
            raise APITimeoutError("Gnani STT API request timed out") from e
        except (APIStatusError, APIConnectionError, APITimeoutError):
            # Already in the framework's error vocabulary; pass through untouched.
            raise
        except Exception as e:
            raise APIConnectionError(f"Gnani STT error: {e}") from e

    def stream(
        self,
        *,
        language: NotGivenOr[str] = NOT_GIVEN,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    ) -> SpeechStream:
        """Create a streaming session with its own copy of the options.

        Args:
            language: Per-stream language override; defaults to the configured language.
            conn_options: Connection options; retries are disabled before they
                are handed to the stream.
        """
        lang = language if is_given(language) else self._opts.language
        return SpeechStream(
            stt=self,
            opts=GnaniSTTOptions(
                api_key=self._opts.api_key,
                language=lang,
                sample_rate=self._opts.sample_rate,
                base_url=self._opts.base_url,
                organization_id=self._opts.organization_id,
                user_id=self._opts.user_id,
            ),
            conn_options=self._single_attempt(conn_options),
        )

    async def aclose(self) -> None:
        """No-op: this class does not close the HTTP session it uses."""
        pass
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
class SpeechStream(stt.RecognizeStream):
    """WebSocket-based streaming STT for Gnani Vachana.

    Connects to ``<base_url>/stt/v3/stream`` (wss://api.vachana.ai/stt/v3/stream
    with the default base URL) and sends raw PCM audio in 1024-byte chunks
    (512 samples, 16-bit mono).
    """

    def __init__(
        self,
        *,
        stt: STT,
        opts: GnaniSTTOptions,
        conn_options: APIConnectOptions,
    ) -> None:
        super().__init__(
            stt=stt,
            conn_options=conn_options,
            sample_rate=opts.sample_rate,
        )
        self._opts = opts

    def _build_ws_url(self) -> str:
        """Translate the configured HTTP(S) base URL into the streaming WebSocket URL.

        ``https://`` maps to ``wss://`` and ``http://`` to ``ws://``; a base
        without a recognised scheme is prefixed with ``wss://``. A trailing
        slash on the base is ignored so the path never starts with ``//``.
        """
        base = self._opts.base_url.rstrip("/")
        if base.startswith("https://"):
            ws_base = "wss://" + base[len("https://"):]
        elif base.startswith("http://"):
            ws_base = "ws://" + base[len("http://"):]
        else:
            ws_base = "wss://" + base
        return f"{ws_base}/stt/v3/stream"

    async def _run(self) -> None:
        """Open the socket, wait for the greeting, then pump audio and events.

        Raises:
            APIStatusError: The service reported an error (during the greeting
                or later in the stream).
            APITimeoutError: The greeting did not arrive in time.
            APIConnectionError: The socket closed unexpectedly or any other
                failure occurred.
        """
        import websockets

        ws_url = self._build_ws_url()
        headers = {
            "x-api-key-id": self._opts.api_key,
            "lang_code": self._opts.language,
        }

        try:
            async with websockets.connect(
                ws_url,
                additional_headers=headers,
                ping_interval=20,
                ping_timeout=20,
                close_timeout=10,
            ) as ws:
                connected_msg = await asyncio.wait_for(ws.recv(), timeout=10)
                connected_data = json.loads(connected_msg)
                first_type = connected_data.get("type")
                if first_type == "error":
                    # Treat an error greeting exactly like an error received
                    # mid-stream instead of streaming audio into a session
                    # the service has already rejected.
                    error_msg = connected_data.get("message", "Unknown error")
                    logger.error("Gnani STT stream error: %s", error_msg)
                    raise APIStatusError(
                        message=f"Gnani STT stream error: {error_msg}",
                        status_code=500,
                        body=error_msg,
                    )
                if first_type != "connected":
                    logger.warning(
                        "Unexpected first message from Gnani STT: %s", connected_data
                    )

                send_task = asyncio.create_task(
                    self._send_audio(ws), name="gnani-stt-send"
                )
                recv_task = asyncio.create_task(
                    self._recv_messages(ws), name="gnani-stt-recv"
                )

                try:
                    await asyncio.gather(send_task, recv_task)
                finally:
                    # Stop whichever side is still running and wait for both
                    # to settle. return_exceptions=True keeps the cleanup from
                    # replacing the exception that is already propagating.
                    send_task.cancel()
                    recv_task.cancel()
                    await asyncio.gather(send_task, recv_task, return_exceptions=True)

        except websockets.exceptions.ConnectionClosed as e:
            raise APIConnectionError(
                f"Gnani STT WebSocket closed unexpectedly: {e}"
            ) from e
        except asyncio.TimeoutError as e:
            raise APITimeoutError("Gnani STT WebSocket connection timed out") from e
        except (APIConnectionError, APIStatusError, APITimeoutError):
            # Already in the framework's error vocabulary; pass through untouched.
            raise
        except Exception as e:
            raise APIConnectionError(f"Gnani STT WebSocket error: {e}") from e

    async def _send_audio(self, ws) -> None:
        """Forward queued audio to the socket in ``STREAM_CHUNK_BYTES`` pieces.

        A flush marker sends whatever partial chunk is pending. When the input
        channel ends, the remainder is sent and the socket is closed.
        """
        audio_buffer = bytearray()

        async for data in self._input_ch:
            if isinstance(data, self._FlushSentinel):
                if audio_buffer:
                    await ws.send(bytes(audio_buffer))
                    audio_buffer.clear()
                continue

            frame: rtc.AudioFrame = data
            audio_buffer.extend(frame.data.tobytes())

            while len(audio_buffer) >= STREAM_CHUNK_BYTES:
                chunk = bytes(audio_buffer[:STREAM_CHUNK_BYTES])
                # Drop the sent prefix in place rather than copying the tail
                # into a new bytearray on every iteration.
                del audio_buffer[:STREAM_CHUNK_BYTES]
                await ws.send(chunk)

        if audio_buffer:
            await ws.send(bytes(audio_buffer))

        # NOTE(review): the socket is closed right after the last audio is
        # sent; confirm the service delivers any trailing transcript before
        # it completes the close handshake.
        await ws.close()

    async def _recv_messages(self, ws) -> None:
        """Convert JSON messages from the socket into speech events.

        Binary messages and ``processing`` messages are ignored, as are
        transcripts with empty text.

        Raises:
            APIStatusError: The service sent a message of type ``error``.
            APIConnectionError: Any other failure while reading or decoding.
        """
        try:
            async for msg in ws:
                if isinstance(msg, bytes):
                    continue

                data = json.loads(msg)
                msg_type = data.get("type", "")

                if msg_type == "transcript":
                    text = data.get("text", "")
                    if not text:
                        continue

                    self._event_ch.send_nowait(
                        stt.SpeechEvent(
                            type=stt.SpeechEventType.FINAL_TRANSCRIPT,
                            alternatives=[
                                stt.SpeechData(
                                    language=self._opts.language,
                                    text=text,
                                    # No confidence is read from the message;
                                    # a constant is reported instead.
                                    confidence=1.0,
                                )
                            ],
                        )
                    )

                elif msg_type in ("speech_start", "vad_start"):
                    self._event_ch.send_nowait(
                        stt.SpeechEvent(
                            type=stt.SpeechEventType.START_OF_SPEECH,
                        )
                    )

                elif msg_type in ("speech_end", "vad_end"):
                    self._event_ch.send_nowait(
                        stt.SpeechEvent(
                            type=stt.SpeechEventType.END_OF_SPEECH,
                        )
                    )

                elif msg_type == "processing":
                    pass

                elif msg_type == "error":
                    error_msg = data.get("message", "Unknown error")
                    logger.error("Gnani STT stream error: %s", error_msg)
                    raise APIStatusError(
                        message=f"Gnani STT stream error: {error_msg}",
                        status_code=500,
                        body=error_msg,
                    )

        except asyncio.CancelledError:
            raise
        except (APIStatusError, APIConnectionError):
            raise
        except Exception as e:
            raise APIConnectionError(
                f"Error receiving Gnani STT messages: {e}"
            ) from e
|
|
@@ -0,0 +1,376 @@
|
|
|
1
|
+
"""Text-to-Speech implementation for Gnani Vachana
|
|
2
|
+
|
|
3
|
+
This module provides a TTS implementation that uses the Gnani Vachana API,
|
|
4
|
+
supporting both chunked synthesis (REST) and real-time streaming (WebSocket).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import asyncio
|
|
10
|
+
import base64
|
|
11
|
+
import json
|
|
12
|
+
import os
|
|
13
|
+
from dataclasses import dataclass, replace
|
|
14
|
+
from typing import Literal
|
|
15
|
+
|
|
16
|
+
from livekit.agents import (
|
|
17
|
+
DEFAULT_API_CONNECT_OPTIONS,
|
|
18
|
+
APIConnectionError,
|
|
19
|
+
APIConnectOptions,
|
|
20
|
+
APIStatusError,
|
|
21
|
+
APITimeoutError,
|
|
22
|
+
tokenize,
|
|
23
|
+
tts,
|
|
24
|
+
utils,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
from .log import logger
|
|
28
|
+
|
|
29
|
+
# Default API root used when the caller does not override base_url.
GNANI_TTS_BASE_URL = "https://api.vachana.ai"

# Voice identifiers offered as a typing aid for callers.
GnaniTTSVoices = Literal[
    "sia",
    "raju",
    "kanika",
    "nikita",
    "ravan",
    "simran",
    "karan",
    "neha",
]

# Runtime counterpart of the Literal above; the TTS constructor checks against it.
SUPPORTED_VOICES: set[str] = {
    "sia",
    "raju",
    "kanika",
    "nikita",
    "ravan",
    "simran",
    "karan",
    "neha",
}

# Typing aids for the encoding and container options.
GnaniTTSEncodings = Literal[
    "linear_pcm",
    "oggopus",
]
GnaniTTSContainers = Literal[
    "raw",
    "mp3",
    "wav",
    "mulaw",
    "ogg",
]
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class GnaniTTSOptions:
    """Settings bundle captured by the TTS constructor."""

    # Required credential.
    api_key: str

    # Voice and model selection.
    voice: str = "sia"
    model: str = "vachana-voice-v2"

    # Requested output format.
    sample_rate: int = 24000
    encoding: str = "linear_pcm"
    container: str = "wav"
    num_channels: int = 1
    sample_width: int = 2  # presumably bytes per sample (16-bit) -- confirm

    # API root and the language code passed along with requests.
    base_url: str = GNANI_TTS_BASE_URL
    language: str = "IND-IN"
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class TTS(tts.TTS):
    """LiveKit text-to-speech plugin backed by Gnani's Vachana platform.

    ``synthesize()`` returns a REST-based :class:`ChunkedStream`, while
    ``stream()`` returns a WebSocket-based :class:`SynthesizeStream`.

    Args:
        voice: Voice name; must be one of ``SUPPORTED_VOICES``.
        model: TTS model name (default: vachana-voice-v2).
        sample_rate: Audio output sample rate (8000-44100).
        num_channels: Number of audio channels requested.
        encoding: Audio encoding (linear_pcm or oggopus).
        container: Audio container format (raw, mp3, wav, mulaw, ogg).
        api_key: Gnani API key (falls back to GNANI_API_KEY env var).
        base_url: Vachana API base URL.
        language: Language code for WebSocket TTS (default: IND-IN).

    Raises:
        ValueError: If no API key can be resolved or the voice is unknown.
    """

    def __init__(
        self,
        *,
        voice: GnaniTTSVoices | str = "sia",
        model: str = "vachana-voice-v2",
        sample_rate: int = 24000,
        num_channels: int = 1,
        encoding: GnaniTTSEncodings | str = "linear_pcm",
        container: GnaniTTSContainers | str = "wav",
        api_key: str | None = None,
        base_url: str = GNANI_TTS_BASE_URL,
        language: str = "IND-IN",
    ) -> None:
        capabilities = tts.TTSCapabilities(streaming=True)
        super().__init__(
            capabilities=capabilities,
            sample_rate=sample_rate,
            num_channels=num_channels,
        )

        # An explicit argument wins; otherwise fall back to the environment.
        resolved_key = api_key or os.environ.get("GNANI_API_KEY")
        self._api_key = resolved_key
        if not resolved_key:
            raise ValueError(
                "Gnani API key is required. "
                "Provide it directly or set GNANI_API_KEY environment variable."
            )

        self._require_supported_voice(voice)

        self._opts = GnaniTTSOptions(
            api_key=resolved_key,
            base_url=base_url,
            voice=voice,
            model=model,
            language=language,
            sample_rate=sample_rate,
            num_channels=num_channels,
            encoding=encoding,
            container=container,
        )
        # HTTP session is looked up lazily by _ensure_session().
        self._session = None

    @staticmethod
    def _require_supported_voice(voice: str) -> None:
        """Raise ``ValueError`` unless *voice* is listed in ``SUPPORTED_VOICES``."""
        if voice in SUPPORTED_VOICES:
            return
        raise ValueError(
            f"Voice '{voice}' not supported. Choose from: {sorted(SUPPORTED_VOICES)}"
        )

    @property
    def model(self) -> str:
        """Name of the currently configured TTS model."""
        return self._opts.model

    @property
    def provider(self) -> str:
        """Provider label for this plugin."""
        return "Gnani"

    def _ensure_session(self):
        """Return the cached HTTP session, fetching it on first use."""
        session = self._session
        if not session:
            session = utils.http_context.http_session()
            self._session = session
        return session

    def synthesize(
        self, text: str, *, conn_options: APIConnectOptions | None = None
    ) -> ChunkedStream:
        """Create a REST-based stream that synthesizes *text* in one request."""
        effective = DEFAULT_API_CONNECT_OPTIONS if conn_options is None else conn_options
        return ChunkedStream(tts=self, input_text=text, conn_options=effective)

    def stream(
        self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
    ) -> SynthesizeStream:
        """Create a WebSocket-based stream for incrementally pushed text."""
        return SynthesizeStream(tts=self, conn_options=conn_options)

    def update_options(
        self,
        *,
        voice: str | None = None,
        model: str | None = None,
        language: str | None = None,
    ) -> None:
        """Change voice, model and/or language for streams created afterwards.

        Arguments left as ``None`` keep their current value. The voice is
        validated first, so an unsupported voice raises ``ValueError`` before
        any option is modified.
        """
        opts = self._opts
        if voice is not None:
            self._require_supported_voice(voice)
            opts.voice = voice
        if model is not None:
            opts.model = model
        if language is not None:
            opts.language = language

    async def aclose(self) -> None:
        """No-op: this class does not release anything on close."""
        return None
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
class ChunkedStream(tts.ChunkedStream):
    """One-shot synthesis against the Gnani Vachana REST API.

    The complete input text is sent in a single
    ``POST /api/v1/tts/inference`` request and the response body is handed
    to the audio emitter in one piece.
    """

    def __init__(
        self, *, tts: TTS, input_text: str, conn_options: APIConnectOptions
    ) -> None:
        super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
        self._tts: TTS = tts
        # Work on a private copy so later TTS.update_options() calls do not
        # change a request that has already been created.
        self._opts = replace(tts._opts)

    async def _run(self, output_emitter: tts.AudioEmitter) -> None:
        """Perform the HTTP request and push the returned audio.

        Raises:
            APIStatusError: The server answered with a non-200 status.
            APITimeoutError: The request timed out.
            APIConnectionError: Any other failure while talking to the API.
        """
        import aiohttp

        opts = self._opts

        audio_config = {
            "sample_rate": opts.sample_rate,
            "encoding": opts.encoding,
            "num_channels": opts.num_channels,
            "sample_width": opts.sample_width,
            "container": opts.container,
        }
        body = {
            "text": self._input_text,
            "voice": opts.voice,
            "model": opts.model,
            "audio_config": audio_config,
        }
        request_headers = {
            "X-API-Key-ID": opts.api_key,
            "Content-Type": "application/json",
        }

        # "raw" output is tagged as PCM; every other container uses its own name.
        mime_type = "audio/pcm" if opts.container == "raw" else f"audio/{opts.container}"
        endpoint = f"{opts.base_url}/api/v1/tts/inference"

        try:
            http = self._tts._ensure_session()
            limits = aiohttp.ClientTimeout(
                total=self._conn_options.timeout,
                sock_connect=self._conn_options.timeout,
            )
            async with http.post(
                url=endpoint,
                json=body,
                headers=request_headers,
                timeout=limits,
            ) as res:
                if res.status != 200:
                    error_text = await res.text()
                    logger.error(f"Gnani TTS API error: {res.status} - {error_text}")
                    raise APIStatusError(
                        message=f"Gnani TTS API Error ({res.status}): {error_text}",
                        status_code=res.status,
                        body=error_text,
                    )

                audio_bytes = await res.read()

                # The emitter is only initialised once a successful response
                # has been fully read.
                output_emitter.initialize(
                    request_id="gnani-tts",
                    sample_rate=self._tts.sample_rate,
                    num_channels=self._tts.num_channels,
                    mime_type=mime_type,
                )
                output_emitter.push(audio_bytes)

        except asyncio.TimeoutError as e:
            raise APITimeoutError("Gnani TTS API request timed out") from e
        except (APIStatusError, APIConnectionError, APITimeoutError):
            # Already one of the plugin's API error types: pass through untouched.
            raise
        except Exception as e:
            raise APIConnectionError(f"Gnani TTS error: {e}") from e
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
class SynthesizeStream(tts.SynthesizeStream):
    """WebSocket-based streaming TTS for Gnani Vachana.

    Pushed text is split into sentences. Every sentence is synthesized over
    its own WebSocket connection to ``<base_url>/api/v1/tts`` and the audio
    the server sends back is forwarded to the emitter as it arrives.
    """

    def __init__(self, *, tts: TTS, conn_options: APIConnectOptions):
        super().__init__(tts=tts, conn_options=conn_options)
        self._tts: TTS = tts
        # Work on a private copy so later TTS.update_options() calls do not
        # affect a stream that is already running.
        self._opts = replace(tts._opts)

    def _build_ws_url(self) -> str:
        """Derive the WebSocket endpoint URL from the configured base URL.

        ``https://`` maps to ``wss://`` and ``http://`` to ``ws://``. A base
        that already uses a WebSocket scheme is kept as is, and a base without
        any scheme is assumed to be secure. A trailing slash on the base URL
        is dropped so the path is not doubled.
        """
        base = self._opts.base_url.rstrip("/")
        if base.startswith(("ws://", "wss://")):
            ws_base = base
        elif base.startswith("https://"):
            ws_base = "wss://" + base[len("https://"):]
        elif base.startswith("http://"):
            ws_base = "ws://" + base[len("http://"):]
        else:
            ws_base = "wss://" + base
        return f"{ws_base}/api/v1/tts"

    def _mime_type(self) -> str:
        """Return the MIME type announced to the emitter for the container."""
        if self._opts.container == "raw":
            return "audio/pcm"
        return f"audio/{self._opts.container}"

    async def _run(self, output_emitter: tts.AudioEmitter) -> None:
        """Tokenize pushed text into sentences and synthesize them in order."""
        import contextlib

        sentence_stream = tokenize.basic.SentenceTokenizer().stream()

        # NOTE(review): the emitter is initialised exactly once per run.
        # The previous code called initialize() for every sentence, which
        # re-initialised an already started emitter on multi-sentence input.
        # With stream=True each sentence is delimited by start/end_segment.
        output_emitter.initialize(
            request_id="gnani-tts-stream",
            sample_rate=self._tts.sample_rate,
            num_channels=self._tts.num_channels,
            mime_type=self._mime_type(),
            stream=True,
        )

        async def _input_task() -> None:
            # Forward text and flush markers from the input channel to the
            # sentence tokenizer, then signal that no more text will come.
            async for data in self._input:
                if isinstance(data, str):
                    sentence_stream.push_text(data)
                elif isinstance(data, self._FlushSentinel):
                    sentence_stream.flush()
            sentence_stream.end_input()

        input_task = asyncio.create_task(_input_task(), name="gnani-tts-input")

        try:
            async for ev in sentence_stream:
                text = ev.token
                if not text.strip():
                    continue
                output_emitter.start_segment(segment_id=utils.shortuuid())
                await self._synthesize_segment(text, output_emitter)
                output_emitter.end_segment()
        finally:
            input_task.cancel()
            with contextlib.suppress(asyncio.CancelledError):
                await input_task
            # Release the tokenizer stream; it was previously left open.
            await sentence_stream.aclose()

    async def _synthesize_segment(
        self, text: str, output_emitter: tts.AudioEmitter
    ) -> None:
        """Synthesize one sentence over a fresh WebSocket connection.

        The caller must already have initialised the emitter and opened a
        segment; this method only pushes audio into it.

        Raises:
            APIStatusError: The server sent an ``error`` message.
            APITimeoutError: Connecting or receiving timed out.
            APIConnectionError: The socket closed abnormally or another
                unexpected failure occurred.
        """
        # Import the asyncio client explicitly: on websockets 13.x the
        # top-level ``websockets.connect`` is the legacy client, which does
        # not accept ``additional_headers``.
        from websockets.asyncio.client import connect as ws_connect
        from websockets.exceptions import ConnectionClosed

        headers = {
            "Content-Type": "application/json",
            "X-API-Key-ID": self._opts.api_key,
        }
        request_body = {
            "text": text,
            "voice": self._opts.voice,
            "model": self._opts.model,
            "language": self._opts.language,
            "audio_config": {
                "sample_rate": self._opts.sample_rate,
                "encoding": self._opts.encoding,
                "num_channels": self._opts.num_channels,
                "sample_width": self._opts.sample_width,
                "container": self._opts.container,
            },
        }

        try:
            async with ws_connect(
                self._build_ws_url(),
                additional_headers=headers,
                # Honour the caller's connect options for the handshake.
                open_timeout=self._conn_options.timeout,
                ping_interval=20,
                ping_timeout=20,
                close_timeout=10,
            ) as ws:
                await ws.send(json.dumps(request_body))

                async for msg in ws:
                    # Binary frames are pushed to the emitter unchanged.
                    if isinstance(msg, bytes):
                        output_emitter.push(msg)
                        continue

                    data = json.loads(msg)
                    msg_type = data.get("type", "")

                    if msg_type in ("audio", "complete"):
                        # Both message types may carry base64-encoded audio;
                        # "complete" additionally ends the sentence.
                        audio_b64 = data.get("audio", "")
                        if audio_b64:
                            output_emitter.push(base64.b64decode(audio_b64))
                        if msg_type == "complete":
                            break

                    elif msg_type == "error":
                        error_msg = data.get("message", "Unknown error")
                        logger.error(f"Gnani TTS stream error: {error_msg}")
                        raise APIStatusError(
                            message=f"Gnani TTS stream error: {error_msg}",
                            status_code=500,
                            body=error_msg,
                        )

        except ConnectionClosed as e:
            raise APIConnectionError(
                f"Gnani TTS WebSocket closed: {e}"
            ) from e
        except asyncio.TimeoutError as e:
            raise APITimeoutError(
                "Gnani TTS WebSocket timed out"
            ) from e
        except (APIStatusError, APIConnectionError, APITimeoutError):
            # Already one of the plugin's API error types: pass through untouched.
            raise
        except Exception as e:
            raise APIConnectionError(f"Gnani TTS WebSocket error: {e}") from e
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "livekit-plugins-gnani"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "LiveKit Agents plugin for Gnani Vachana speech AI — STT & TTS for Indian languages"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "Apache-2.0"
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Genvoice", email = "speechstack@gnani.ai" },
|
|
14
|
+
]
|
|
15
|
+
keywords = [
|
|
16
|
+
"webrtc", "realtime", "audio", "livekit", "livekit-agents",
|
|
17
|
+
"gnani", "vachana", "indian-languages", "indic",
|
|
18
|
+
"stt", "tts", "speech-to-text", "text-to-speech",
|
|
19
|
+
"multilingual", "streaming", "websocket",
|
|
20
|
+
]
|
|
21
|
+
classifiers = [
|
|
22
|
+
"Development Status :: 4 - Beta",
|
|
23
|
+
"Intended Audience :: Developers",
|
|
24
|
+
"License :: OSI Approved :: Apache Software License",
|
|
25
|
+
"Operating System :: OS Independent",
|
|
26
|
+
"Programming Language :: Python :: 3",
|
|
27
|
+
"Programming Language :: Python :: 3.10",
|
|
28
|
+
"Programming Language :: Python :: 3.11",
|
|
29
|
+
"Programming Language :: Python :: 3.12",
|
|
30
|
+
"Programming Language :: Python :: 3.13",
|
|
31
|
+
"Programming Language :: Python :: 3 :: Only",
|
|
32
|
+
"Topic :: Multimedia :: Sound/Audio",
|
|
33
|
+
"Topic :: Multimedia :: Sound/Audio :: Speech",
|
|
34
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
35
|
+
]
|
|
36
|
+
dependencies = [
|
|
37
|
+
"livekit-agents[codecs]>=1.5.8",
|
|
38
|
+
"gnani-vachana>=0.2.2,<1.0",
|
|
39
|
+
"websockets>=13.1,<16.0",
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
[project.urls]
|
|
43
|
+
Homepage = "https://gnani.ai"
|
|
44
|
+
Documentation = "https://docs.inya.ai/vachana"
|
|
45
|
+
Repository = "https://github.com/Gnani-AI-Mintlify/livekit-plugins-gnani"
|
|
46
|
+
Issues = "https://github.com/Gnani-AI-Mintlify/livekit-plugins-gnani/issues"
|
|
47
|
+
|
|
48
|
+
[tool.hatch.build.targets.wheel]
|
|
49
|
+
packages = ["livekit"]
|
|
50
|
+
|
|
51
|
+
[tool.hatch.build.targets.sdist]
|
|
52
|
+
include = ["/livekit"]
|
|
53
|
+
|
|
54
|
+
# ---------------------------------------------------------------------------
|
|
55
|
+
# Ruff
|
|
56
|
+
# ---------------------------------------------------------------------------
|
|
57
|
+
[tool.ruff]
|
|
58
|
+
target-version = "py310"
|
|
59
|
+
line-length = 100
|
|
60
|
+
|
|
61
|
+
[tool.ruff.lint]
|
|
62
|
+
select = ["E", "W", "F", "I", "N", "UP", "B", "SIM", "TCH", "RUF"]
|
|
63
|
+
ignore = ["E501"]
|
|
64
|
+
|
|
65
|
+
[tool.ruff.lint.isort]
|
|
66
|
+
known-first-party = ["livekit"]
|
|
67
|
+
|
|
68
|
+
[tool.ruff.format]
|
|
69
|
+
quote-style = "double"
|
|
70
|
+
indent-style = "space"
|
|
71
|
+
|
|
72
|
+
# ---------------------------------------------------------------------------
|
|
73
|
+
# Mypy
|
|
74
|
+
# ---------------------------------------------------------------------------
|
|
75
|
+
[tool.mypy]
|
|
76
|
+
python_version = "3.10"
|
|
77
|
+
warn_return_any = true
|
|
78
|
+
warn_unused_configs = true
|