dwani 0.1.2__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dwani-0.1.4/PKG-INFO +70 -0
- dwani-0.1.4/README.md +34 -0
- dwani-0.1.2/dwani/vision.py → dwani-0.1.4/dwani/asr.py +7 -8
- {dwani-0.1.2 → dwani-0.1.4}/dwani/audio.py +6 -6
- dwani-0.1.4/dwani/chat.py +25 -0
- {dwani-0.1.2 → dwani-0.1.4}/dwani/client.py +6 -7
- dwani-0.1.4/dwani/vision.py +31 -0
- dwani-0.1.4/dwani.egg-info/PKG-INFO +70 -0
- dwani-0.1.4/pyproject.toml +23 -0
- dwani-0.1.2/PKG-INFO +0 -188
- dwani-0.1.2/README.md +0 -152
- dwani-0.1.2/dwani/asr.py +0 -37
- dwani-0.1.2/dwani/chat.py +0 -17
- dwani-0.1.2/dwani.egg-info/PKG-INFO +0 -188
- dwani-0.1.2/pyproject.toml +0 -23
- {dwani-0.1.2 → dwani-0.1.4}/LICENSE +0 -0
- {dwani-0.1.2 → dwani-0.1.4}/dwani/__init__.py +0 -0
- {dwani-0.1.2 → dwani-0.1.4}/dwani/docs.py +0 -0
- {dwani-0.1.2 → dwani-0.1.4}/dwani/exceptions.py +0 -0
- {dwani-0.1.2 → dwani-0.1.4}/dwani.egg-info/SOURCES.txt +0 -0
- {dwani-0.1.2 → dwani-0.1.4}/dwani.egg-info/dependency_links.txt +0 -0
- {dwani-0.1.2 → dwani-0.1.4}/dwani.egg-info/requires.txt +0 -0
- {dwani-0.1.2 → dwani-0.1.4}/dwani.egg-info/top_level.txt +0 -0
- {dwani-0.1.2 → dwani-0.1.4}/setup.cfg +0 -0
dwani-0.1.4/PKG-INFO
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: dwani
|
3
|
+
Version: 0.1.4
|
4
|
+
Summary: Multimodal API for Indian languages (speech, vision, LLMs, TTS, ASR, etc.)
|
5
|
+
Author-email: sachin <python@dwani.ai>
|
6
|
+
License: MIT License
|
7
|
+
|
8
|
+
Copyright (c) 2025 Sachin Shetty
|
9
|
+
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
12
|
+
in the Software without restriction, including without limitation the rights
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
15
|
+
furnished to do so, subject to the following conditions:
|
16
|
+
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
18
|
+
copies or substantial portions of the Software.
|
19
|
+
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
26
|
+
SOFTWARE.
|
27
|
+
|
28
|
+
Project-URL: Homepage, https://github.com/dwani-ai/dwani-python
|
29
|
+
Project-URL: Source, https://github.com/dwani-ai/dwani-python
|
30
|
+
Project-URL: Issues, https://github.com/dwani-ai/dwani-python/issues
|
31
|
+
Requires-Python: >=3.8
|
32
|
+
Description-Content-Type: text/markdown
|
33
|
+
License-File: LICENSE
|
34
|
+
Requires-Dist: requests>=2.25.0
|
35
|
+
Dynamic: license-file
|
36
|
+
|
37
|
+
# dwani.ai - python library
|
38
|
+
|
39
|
+
|
40
|
+
```bash
|
41
|
+
pip install dwani
|
42
|
+
```
|
43
|
+
|
44
|
+
|
45
|
+
|
46
|
+
```python
|
47
|
+
import dwani
|
48
|
+
import os
|
49
|
+
|
50
|
+
dwani.api_key = os.getenv("DWANI_API_KEY")
|
51
|
+
|
52
|
+
dwani.api_base = os.getenv("DWANI_API_BASE_URL")
|
53
|
+
|
54
|
+
resp = dwani.Chat.create("Hello!", "eng_Latn", "kan_Knda")
|
55
|
+
print(resp)
|
56
|
+
```
|
57
|
+
|
58
|
+
|
59
|
+
<!--
|
60
|
+
## local development
|
61
|
+
pip install -e .
|
62
|
+
|
63
|
+
|
64
|
+
pip install twine build
|
65
|
+
rm -rf dist/
|
66
|
+
python -m build
|
67
|
+
|
68
|
+
python -m twine upload dist/*
|
69
|
+
|
70
|
+
-->
|
dwani-0.1.4/README.md
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# dwani.ai - python library
|
2
|
+
|
3
|
+
|
4
|
+
```bash
|
5
|
+
pip install dwani
|
6
|
+
```
|
7
|
+
|
8
|
+
|
9
|
+
|
10
|
+
```python
|
11
|
+
import dwani
|
12
|
+
import os
|
13
|
+
|
14
|
+
dwani.api_key = os.getenv("DWANI_API_KEY")
|
15
|
+
|
16
|
+
dwani.api_base = os.getenv("DWANI_API_BASE_URL")
|
17
|
+
|
18
|
+
resp = dwani.Chat.create("Hello!", "eng_Latn", "kan_Knda")
|
19
|
+
print(resp)
|
20
|
+
```
|
21
|
+
|
22
|
+
|
23
|
+
<!--
|
24
|
+
## local development
|
25
|
+
pip install -e .
|
26
|
+
|
27
|
+
|
28
|
+
pip install twine build
|
29
|
+
rm -rf dist/
|
30
|
+
python -m build
|
31
|
+
|
32
|
+
python -m twine upload dist/*
|
33
|
+
|
34
|
+
-->
|
@@ -1,21 +1,20 @@
|
|
1
1
|
from .exceptions import DhwaniAPIError
|
2
2
|
import requests
|
3
|
-
def
|
3
|
+
def asr_transcribe(client, file_path, language):
|
4
4
|
with open(file_path, "rb") as f:
|
5
5
|
files = {"file": f}
|
6
|
-
data = {"length": length}
|
7
6
|
resp = requests.post(
|
8
|
-
f"{client.api_base}/
|
7
|
+
f"{client.api_base}/v1/transcribe/?language={language}",
|
9
8
|
headers=client._headers(),
|
10
|
-
files=files
|
11
|
-
data=data
|
9
|
+
files=files
|
12
10
|
)
|
13
11
|
if resp.status_code != 200:
|
14
12
|
raise DhwaniAPIError(resp)
|
15
13
|
return resp.json()
|
16
14
|
|
17
|
-
class
|
15
|
+
class ASR:
|
18
16
|
@staticmethod
|
19
|
-
def
|
17
|
+
def transcribe(*args, **kwargs):
|
20
18
|
from . import _get_client
|
21
|
-
return _get_client().
|
19
|
+
return _get_client().transcribe(*args, **kwargs)
|
20
|
+
|
@@ -1,16 +1,16 @@
|
|
1
1
|
from .exceptions import DhwaniAPIError
|
2
2
|
import requests
|
3
|
-
|
4
|
-
|
3
|
+
|
4
|
+
def audio_speech(client, input, response_format="mp3", output_file=None):
|
5
|
+
params = {
|
5
6
|
"input": input,
|
6
|
-
"voice": voice,
|
7
|
-
"model": model,
|
8
7
|
"response_format": response_format
|
9
8
|
}
|
10
9
|
resp = requests.post(
|
11
10
|
f"{client.api_base}/v1/audio/speech",
|
12
|
-
headers={**client._headers(), "
|
13
|
-
|
11
|
+
headers={**client._headers(), "accept": "application/json"},
|
12
|
+
params=params,
|
13
|
+
data='', # Empty body, as in the curl example
|
14
14
|
stream=True
|
15
15
|
)
|
16
16
|
if resp.status_code != 200:
|
@@ -0,0 +1,25 @@
|
|
1
|
+
from .exceptions import DhwaniAPIError
|
2
|
+
import requests
|
3
|
+
|
4
|
+
def chat_create(client, prompt, src_lang, tgt_lang, **kwargs):
|
5
|
+
url = f"{client.api_base}/v1/indic_chat"
|
6
|
+
payload = {
|
7
|
+
"prompt": prompt,
|
8
|
+
"src_lang": src_lang,
|
9
|
+
"tgt_lang": tgt_lang
|
10
|
+
}
|
11
|
+
payload.update(kwargs)
|
12
|
+
resp = requests.post(
|
13
|
+
url,
|
14
|
+
headers={**client._headers(), "Content-Type": "application/json"},
|
15
|
+
json=payload
|
16
|
+
)
|
17
|
+
if resp.status_code != 200:
|
18
|
+
raise DhwaniAPIError(resp)
|
19
|
+
return resp.json()
|
20
|
+
|
21
|
+
class Chat:
|
22
|
+
@staticmethod
|
23
|
+
def create(prompt, src_lang, tgt_lang, **kwargs):
|
24
|
+
from . import _get_client
|
25
|
+
return _get_client().chat(prompt, src_lang, tgt_lang, **kwargs)
|
@@ -4,26 +4,25 @@ from .exceptions import DhwaniAPIError
|
|
4
4
|
|
5
5
|
class DhwaniClient:
|
6
6
|
def __init__(self, api_key=None, api_base=None):
|
7
|
-
self.api_key = api_key or os.getenv("
|
8
|
-
self.api_base = "
|
9
|
-
#self.api_base = api_base or os.getenv("DHWANI_API_BASE", "http://localhost:7860")
|
7
|
+
self.api_key = api_key or os.getenv("DWANI_API_KEY")
|
8
|
+
self.api_base = api_base or os.getenv("DWANI_API_BASE_URL", "http://localhost:7860")
|
10
9
|
if not self.api_key:
|
11
10
|
raise ValueError("DHWANI_API_KEY not set")
|
12
11
|
|
13
12
|
def _headers(self):
|
14
13
|
return {"X-API-Key": self.api_key}
|
15
14
|
|
16
|
-
def chat(self, prompt, **kwargs):
|
15
|
+
def chat(self, prompt, src_lang, tgt_lang, **kwargs):
|
17
16
|
from .chat import chat_create
|
18
|
-
return chat_create(self, prompt, **kwargs)
|
17
|
+
return chat_create(self, prompt, src_lang, tgt_lang, **kwargs)
|
19
18
|
|
20
19
|
def speech(self, *args, **kwargs):
|
21
20
|
from .audio import audio_speech
|
22
21
|
return audio_speech(self, *args, **kwargs)
|
23
22
|
|
24
|
-
def caption(self,
|
23
|
+
def caption(self, file_path, query="describe the image", src_lang="eng_Latn", tgt_lang="kan_Knda"):
|
25
24
|
from .vision import vision_caption
|
26
|
-
return vision_caption(self,
|
25
|
+
return vision_caption(self, file_path, query, src_lang, tgt_lang)
|
27
26
|
|
28
27
|
def transcribe(self, *args, **kwargs):
|
29
28
|
from .asr import asr_transcribe
|
@@ -0,0 +1,31 @@
|
|
1
|
+
from .exceptions import DhwaniAPIError
|
2
|
+
import requests
|
3
|
+
def vision_caption(client, file_path, query="describe the image", src_lang="eng_Latn", tgt_lang="kan_Knda"):
|
4
|
+
# Build the endpoint using the client's api_base
|
5
|
+
url = (
|
6
|
+
f"{client.api_base}/v1/indic_visual_query"
|
7
|
+
f"?src_lang={src_lang}&tgt_lang={tgt_lang}"
|
8
|
+
)
|
9
|
+
headers = {
|
10
|
+
**client._headers(),
|
11
|
+
"accept": "application/json"
|
12
|
+
# Note: 'Content-Type' will be set automatically by requests when using 'files'
|
13
|
+
}
|
14
|
+
with open(file_path, "rb") as f:
|
15
|
+
files = {"file": (file_path, f, "image/png")}
|
16
|
+
data = {"query": query}
|
17
|
+
resp = requests.post(
|
18
|
+
url,
|
19
|
+
headers=headers,
|
20
|
+
files=files,
|
21
|
+
data=data
|
22
|
+
)
|
23
|
+
if resp.status_code != 200:
|
24
|
+
raise DhwaniAPIError(resp)
|
25
|
+
return resp.json()
|
26
|
+
|
27
|
+
class Vision:
|
28
|
+
@staticmethod
|
29
|
+
def caption(*args, **kwargs):
|
30
|
+
from . import _get_client
|
31
|
+
return _get_client().caption(*args, **kwargs)
|
@@ -0,0 +1,70 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: dwani
|
3
|
+
Version: 0.1.4
|
4
|
+
Summary: Multimodal API for Indian languages (speech, vision, LLMs, TTS, ASR, etc.)
|
5
|
+
Author-email: sachin <python@dwani.ai>
|
6
|
+
License: MIT License
|
7
|
+
|
8
|
+
Copyright (c) 2025 Sachin Shetty
|
9
|
+
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
12
|
+
in the Software without restriction, including without limitation the rights
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
15
|
+
furnished to do so, subject to the following conditions:
|
16
|
+
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
18
|
+
copies or substantial portions of the Software.
|
19
|
+
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
26
|
+
SOFTWARE.
|
27
|
+
|
28
|
+
Project-URL: Homepage, https://github.com/dwani-ai/dwani-python
|
29
|
+
Project-URL: Source, https://github.com/dwani-ai/dwani-python
|
30
|
+
Project-URL: Issues, https://github.com/dwani-ai/dwani-python/issues
|
31
|
+
Requires-Python: >=3.8
|
32
|
+
Description-Content-Type: text/markdown
|
33
|
+
License-File: LICENSE
|
34
|
+
Requires-Dist: requests>=2.25.0
|
35
|
+
Dynamic: license-file
|
36
|
+
|
37
|
+
# dwani.ai - python library
|
38
|
+
|
39
|
+
|
40
|
+
```bash
|
41
|
+
pip install dwani
|
42
|
+
```
|
43
|
+
|
44
|
+
|
45
|
+
|
46
|
+
```python
|
47
|
+
import dwani
|
48
|
+
import os
|
49
|
+
|
50
|
+
dwani.api_key = os.getenv("DWANI_API_KEY")
|
51
|
+
|
52
|
+
dwani.api_base = os.getenv("DWANI_API_BASE_URL")
|
53
|
+
|
54
|
+
resp = dwani.Chat.create("Hello!", "eng_Latn", "kan_Knda")
|
55
|
+
print(resp)
|
56
|
+
```
|
57
|
+
|
58
|
+
|
59
|
+
<!--
|
60
|
+
## local development
|
61
|
+
pip install -e .
|
62
|
+
|
63
|
+
|
64
|
+
pip install twine build
|
65
|
+
rm -rf dist/
|
66
|
+
python -m build
|
67
|
+
|
68
|
+
python -m twine upload dist/*
|
69
|
+
|
70
|
+
-->
|
@@ -0,0 +1,23 @@
|
|
1
|
+
[build-system]
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
3
|
+
build-backend = "setuptools.build_meta"
|
4
|
+
|
5
|
+
[project]
|
6
|
+
name = "dwani"
|
7
|
+
version = "0.1.4"
|
8
|
+
description = "Multimodal API for Indian languages (speech, vision, LLMs, TTS, ASR, etc.)"
|
9
|
+
authors = [
|
10
|
+
{ name="sachin", email="python@dwani.ai" }
|
11
|
+
]
|
12
|
+
readme = "README.md"
|
13
|
+
license = { file = "LICENSE" }
|
14
|
+
requires-python = ">=3.8"
|
15
|
+
|
16
|
+
dependencies = [
|
17
|
+
"requests>=2.25.0",
|
18
|
+
]
|
19
|
+
|
20
|
+
[project.urls]
|
21
|
+
Homepage = "https://github.com/dwani-ai/dwani-python"
|
22
|
+
Source = "https://github.com/dwani-ai/dwani-python"
|
23
|
+
Issues = "https://github.com/dwani-ai/dwani-python/issues"
|
dwani-0.1.2/PKG-INFO
DELETED
@@ -1,188 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.4
|
2
|
-
Name: dwani
|
3
|
-
Version: 0.1.2
|
4
|
-
Summary: Multimodal AI server for Indian languages (speech, vision, LLMs, TTS, ASR, etc.)
|
5
|
-
Author-email: sachin <python@dwani.ai>
|
6
|
-
License: MIT License
|
7
|
-
|
8
|
-
Copyright (c) 2025 Sachin Shetty
|
9
|
-
|
10
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
11
|
-
of this software and associated documentation files (the "Software"), to deal
|
12
|
-
in the Software without restriction, including without limitation the rights
|
13
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
14
|
-
copies of the Software, and to permit persons to whom the Software is
|
15
|
-
furnished to do so, subject to the following conditions:
|
16
|
-
|
17
|
-
The above copyright notice and this permission notice shall be included in all
|
18
|
-
copies or substantial portions of the Software.
|
19
|
-
|
20
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
21
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
22
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
23
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
24
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
25
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
26
|
-
SOFTWARE.
|
27
|
-
|
28
|
-
Project-URL: Homepage, https://github.com/dwani-ai/dwani-server
|
29
|
-
Project-URL: Source, https://github.com/dwani-ai/dwani-server
|
30
|
-
Project-URL: Issues, https://github.com/dwani-ai/dwani-server/issues
|
31
|
-
Requires-Python: >=3.10
|
32
|
-
Description-Content-Type: text/markdown
|
33
|
-
License-File: LICENSE
|
34
|
-
Requires-Dist: requests>=2.25.0
|
35
|
-
Dynamic: license-file
|
36
|
-
|
37
|
-
# Dhwani Server
|
38
|
-
|
39
|
-
Dhwani API is a FastAPI-based application providing AI-powered services for Indian languages, including text-to-speech (TTS), language model (LLM) chat, vision-language model (VLM) capabilities, and automatic speech recognition (ASR). It supports lazy loading of models for fast startup and includes endpoints for various tasks.
|
40
|
-
|
41
|
-
## Features
|
42
|
-
- **Text-to-Speech (TTS)**: Generate audio from text in Indian languages using Parler TTS.
|
43
|
-
- **Chat**: Process Kannada prompts and respond in Kannada via translation and LLM.
|
44
|
-
- **Vision-Language Model (VLM)**: Caption images, answer visual queries, detect, and point objects.
|
45
|
-
- **Automatic Speech Recognition (ASR)**: Transcribe audio files in multiple Indian languages.
|
46
|
-
- **Lazy Loading**: Models load on-demand or via an explicit endpoint for fast startup.
|
47
|
-
|
48
|
-
## Prerequisites
|
49
|
-
- **System Requirements - User**:
|
50
|
-
- **Python**: 3.10
|
51
|
-
- Ubuntu 22.04
|
52
|
-
- git
|
53
|
-
- vscode
|
54
|
-
- **System Requirements - Server**:
|
55
|
-
- Ubuntu with sufficient RAM (16GB+ recommended for models).
|
56
|
-
- Optional: NVIDIA GPU with CUDA support for faster inference.
|
57
|
-
- **FFmpeg**: Required for audio processing (ASR).
|
58
|
-
|
59
|
-
- Server Setup
|
60
|
-
```bash
|
61
|
-
export HF_HOME=/home/ubuntu/data-dhwani-models
|
62
|
-
export HF_TOKEN='Your-HF-token'
|
63
|
-
python src/server/main.py --host 0.0.0.0 --port 7860 --config config_two
|
64
|
-
```
|
65
|
-
## Installation
|
66
|
-
|
67
|
-
1. **Clone the Repository**:
|
68
|
-
```bash
|
69
|
-
git clone https://github.com/slabstech/dhwani-server
|
70
|
-
cd dhwani-server
|
71
|
-
```
|
72
|
-
|
73
|
-
2. Install Libraries:
|
74
|
-
- On Ubuntu: ```sudo apt-get install ffmpeg build-essential```
|
75
|
-
|
76
|
-
3. Set Up Virtual Environment:
|
77
|
-
```bash
|
78
|
-
python -m venv venv
|
79
|
-
source venv/bin/activate
|
80
|
-
```
|
81
|
-
4. Install Dependencies:
|
82
|
-
```bash
|
83
|
-
sudo apt-get install -y ffmpeg build-essential
|
84
|
-
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --no-modify-path --profile minimal
|
85
|
-
. "$HOME/.cargo/env"
|
86
|
-
export CC=/usr/bin/gcc
|
87
|
-
export CXX=/usr/bin/g++
|
88
|
-
```
|
89
|
-
```bash
|
90
|
-
pip install --no-cache-dir --upgrade pip setuptools psutil setuptools-rust torch==2.6.0
|
91
|
-
pip install --no-cache-dir flash-attn --no-build-isolation
|
92
|
-
```
|
93
|
-
|
94
|
-
```bash
|
95
|
-
pip install -r requirements.txt
|
96
|
-
```
|
97
|
-
|
98
|
-
4. Set Environment Variable:
|
99
|
-
Create a .env file in the root directory and add your API key:
|
100
|
-
plaintext
|
101
|
-
```bash
|
102
|
-
API_KEY=your_secret_key
|
103
|
-
```
|
104
|
-
|
105
|
-
5. Running the Server
|
106
|
-
- Start the Server:
|
107
|
-
```bash
|
108
|
-
python src/server/main.py --host 0.0.0.0 --port 7860 --config config_two
|
109
|
-
```
|
110
|
-
|
111
|
-
- The server starts with models loaded on start
|
112
|
-
- Access the interactive API docs at http://localhost:7860/docs.
|
113
|
-
|
114
|
-
- (Optional) Load All Models:
|
115
|
-
Preload all models (LLM, Translation, TTS, VLM, ASR) with:
|
116
|
-
-
|
117
|
-
```bash
|
118
|
-
curl -X POST "http://localhost:7860/load_all_models" -H "X-API-Key: your_secret_key"
|
119
|
-
```
|
120
|
-
|
121
|
-
- Usage
|
122
|
-
- Endpoints
|
123
|
-
- All endpoints require the X-API-Key header with the value from your .env file.
|
124
|
-
|
125
|
-
- Health Check: GET /health
|
126
|
-
```bash
|
127
|
-
curl "http://localhost:7860/health"
|
128
|
-
```
|
129
|
-
- Response:
|
130
|
-
```bash
|
131
|
-
{"status": "healthy", "model": "Qwen/Qwen2.5-3B-Instruct"}
|
132
|
-
```
|
133
|
-
|
134
|
-
- Text-to-Speech: POST /v1/audio/speech
|
135
|
-
``` bash
|
136
|
-
curl -X POST "http://localhost:7860/v1/audio/speech" -H "X-API-Key: your_secret_key" -H "Content-Type: application/json" -d '{"input": "ನಮಸ್ಕಾರ", "voice": "Female voice", "model": "ai4bharat/indic-parler-tts", "response_format": "mp3"}' --output speech.mp3
|
137
|
-
```
|
138
|
-
- Chat: POST /chat
|
139
|
-
``` bash
|
140
|
-
curl -X POST "http://localhost:7860/chat" -H "X-API-Key: your_secret_key" -H "Content-Type: application/json" -d '{"prompt": "ನೀವು ಹೇಗಿದ್ದೀರಿ?"}'
|
141
|
-
```
|
142
|
-
|
143
|
-
- Response:
|
144
|
-
```{"response": "<Kannada response>"}```
|
145
|
-
- Image Captioning: POST /caption/
|
146
|
-
```bash
|
147
|
-
curl -X POST "http://localhost:7860/caption/" -H "X-API-Key: your_secret_key" -F "file=@image.jpg" -F "length=short"
|
148
|
-
```
|
149
|
-
- Response: ```{"caption": "<short caption>"}```
|
150
|
-
- Visual Query: POST /visual_query/
|
151
|
-
```bash
|
152
|
-
curl -X POST "http://localhost:7860/visual_query/" -H "X-API-Key: your_secret_key" -F "file=@image.jpg" -F "query=What is this?"
|
153
|
-
```
|
154
|
-
- Response: ```{"answer": "<answer>"}```
|
155
|
-
- Object Detection: POST /detect/
|
156
|
-
```bash
|
157
|
-
curl -X POST "http://localhost:7860/detect/" -H "X-API-Key: your_secret_key" -F "file=@image.jpg" -F "object_type=face"
|
158
|
-
```
|
159
|
-
- Response: ```{"objects": [<list of detected objects>]}```
|
160
|
-
- Object Pointing: POST /point/
|
161
|
-
```bash
|
162
|
-
|
163
|
-
curl -X POST "http://localhost:7860/point/" -H "X-API-Key: your_secret_key" -F "file=@image.jpg" -F "object_type=person"
|
164
|
-
```
|
165
|
-
- Response: ```{"points": [<list of points>]}```
|
166
|
-
- Transcription: POST /transcribe/
|
167
|
-
```bash
|
168
|
-
curl -X POST "http://localhost:7860/transcribe/?language=kannada" -H "X-API-Key: your_secret_key" -F "file=@audio.wav"
|
169
|
-
```
|
170
|
-
- Response: ```{"text": "<transcribed text>"}```
|
171
|
-
- Batch Transcription: POST /transcribe_batch/
|
172
|
-
```bash
|
173
|
-
curl -X POST "http://localhost:7860/transcribe_batch/?language=kannada" -H "X-API-Key: your_secret_key" -F "files=@audio1.wav" -F "files=@audio2.mp3"
|
174
|
-
```
|
175
|
-
- Response: ```{"transcriptions": ["<text1>", "<text2>"]}```
|
176
|
-
|
177
|
-
- Notes
|
178
|
-
- Lazy Loading: Models load on first use or via /load_all_models. Expect a delay on the first request for each model type.
|
179
|
-
Supported Languages: ASR supports multiple Indian languages (e.g., kannada, hindi, tamil); see models/asr.py for the full list.
|
180
|
-
Logs: Check dhwani_api.log for detailed logs (rotated at 10MB, 5 backups).
|
181
|
-
Performance: Use a GPU with flash-attn installed for faster TTS and ASR inference.
|
182
|
-
|
183
|
-
- Troubleshooting
|
184
|
-
|
185
|
-
- Module Errors: Ensure all dependencies are installed. Re-run pip install if needed.
|
186
|
-
FFmpeg Not Found: Install FFmpeg and ensure it’s in your PATH.
|
187
|
-
Permission Denied: Run with sudo if accessing restricted ports (e.g., < 1024).
|
188
|
-
|
dwani-0.1.2/README.md
DELETED
@@ -1,152 +0,0 @@
|
|
1
|
-
# Dhwani Server
|
2
|
-
|
3
|
-
Dhwani API is a FastAPI-based application providing AI-powered services for Indian languages, including text-to-speech (TTS), language model (LLM) chat, vision-language model (VLM) capabilities, and automatic speech recognition (ASR). It supports lazy loading of models for fast startup and includes endpoints for various tasks.
|
4
|
-
|
5
|
-
## Features
|
6
|
-
- **Text-to-Speech (TTS)**: Generate audio from text in Indian languages using Parler TTS.
|
7
|
-
- **Chat**: Process Kannada prompts and respond in Kannada via translation and LLM.
|
8
|
-
- **Vision-Language Model (VLM)**: Caption images, answer visual queries, detect, and point objects.
|
9
|
-
- **Automatic Speech Recognition (ASR)**: Transcribe audio files in multiple Indian languages.
|
10
|
-
- **Lazy Loading**: Models load on-demand or via an explicit endpoint for fast startup.
|
11
|
-
|
12
|
-
## Prerequisites
|
13
|
-
- **System Requirements - User**:
|
14
|
-
- **Python**: 3.10
|
15
|
-
- Ubuntu 22.04
|
16
|
-
- git
|
17
|
-
- vscode
|
18
|
-
- **System Requirements - Server**:
|
19
|
-
- Ubuntu with sufficient RAM (16GB+ recommended for models).
|
20
|
-
- Optional: NVIDIA GPU with CUDA support for faster inference.
|
21
|
-
- **FFmpeg**: Required for audio processing (ASR).
|
22
|
-
|
23
|
-
- Server Setup
|
24
|
-
```bash
|
25
|
-
export HF_HOME=/home/ubuntu/data-dhwani-models
|
26
|
-
export HF_TOKEN='Your-HF-token'
|
27
|
-
python src/server/main.py --host 0.0.0.0 --port 7860 --config config_two
|
28
|
-
```
|
29
|
-
## Installation
|
30
|
-
|
31
|
-
1. **Clone the Repository**:
|
32
|
-
```bash
|
33
|
-
git clone https://github.com/slabstech/dhwani-server
|
34
|
-
cd dhwani-server
|
35
|
-
```
|
36
|
-
|
37
|
-
2. Install Libraries:
|
38
|
-
- On Ubuntu: ```sudo apt-get install ffmpeg build-essential```
|
39
|
-
|
40
|
-
3. Set Up Virtual Environment:
|
41
|
-
```bash
|
42
|
-
python -m venv venv
|
43
|
-
source venv/bin/activate
|
44
|
-
```
|
45
|
-
4. Install Dependencies:
|
46
|
-
```bash
|
47
|
-
sudo apt-get install -y ffmpeg build-essential
|
48
|
-
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --no-modify-path --profile minimal
|
49
|
-
. "$HOME/.cargo/env"
|
50
|
-
export CC=/usr/bin/gcc
|
51
|
-
export CXX=/usr/bin/g++
|
52
|
-
```
|
53
|
-
```bash
|
54
|
-
pip install --no-cache-dir --upgrade pip setuptools psutil setuptools-rust torch==2.6.0
|
55
|
-
pip install --no-cache-dir flash-attn --no-build-isolation
|
56
|
-
```
|
57
|
-
|
58
|
-
```bash
|
59
|
-
pip install -r requirements.txt
|
60
|
-
```
|
61
|
-
|
62
|
-
4. Set Environment Variable:
|
63
|
-
Create a .env file in the root directory and add your API key:
|
64
|
-
plaintext
|
65
|
-
```bash
|
66
|
-
API_KEY=your_secret_key
|
67
|
-
```
|
68
|
-
|
69
|
-
5. Running the Server
|
70
|
-
- Start the Server:
|
71
|
-
```bash
|
72
|
-
python src/server/main.py --host 0.0.0.0 --port 7860 --config config_two
|
73
|
-
```
|
74
|
-
|
75
|
-
- The server starts with models loaded on start
|
76
|
-
- Access the interactive API docs at http://localhost:7860/docs.
|
77
|
-
|
78
|
-
- (Optional) Load All Models:
|
79
|
-
Preload all models (LLM, Translation, TTS, VLM, ASR) with:
|
80
|
-
-
|
81
|
-
```bash
|
82
|
-
curl -X POST "http://localhost:7860/load_all_models" -H "X-API-Key: your_secret_key"
|
83
|
-
```
|
84
|
-
|
85
|
-
- Usage
|
86
|
-
- Endpoints
|
87
|
-
- All endpoints require the X-API-Key header with the value from your .env file.
|
88
|
-
|
89
|
-
- Health Check: GET /health
|
90
|
-
```bash
|
91
|
-
curl "http://localhost:7860/health"
|
92
|
-
```
|
93
|
-
- Response:
|
94
|
-
```bash
|
95
|
-
{"status": "healthy", "model": "Qwen/Qwen2.5-3B-Instruct"}
|
96
|
-
```
|
97
|
-
|
98
|
-
- Text-to-Speech: POST /v1/audio/speech
|
99
|
-
``` bash
|
100
|
-
curl -X POST "http://localhost:7860/v1/audio/speech" -H "X-API-Key: your_secret_key" -H "Content-Type: application/json" -d '{"input": "ನಮಸ್ಕಾರ", "voice": "Female voice", "model": "ai4bharat/indic-parler-tts", "response_format": "mp3"}' --output speech.mp3
|
101
|
-
```
|
102
|
-
- Chat: POST /chat
|
103
|
-
``` bash
|
104
|
-
curl -X POST "http://localhost:7860/chat" -H "X-API-Key: your_secret_key" -H "Content-Type: application/json" -d '{"prompt": "ನೀವು ಹೇಗಿದ್ದೀರಿ?"}'
|
105
|
-
```
|
106
|
-
|
107
|
-
- Response:
|
108
|
-
```{"response": "<Kannada response>"}```
|
109
|
-
- Image Captioning: POST /caption/
|
110
|
-
```bash
|
111
|
-
curl -X POST "http://localhost:7860/caption/" -H "X-API-Key: your_secret_key" -F "file=@image.jpg" -F "length=short"
|
112
|
-
```
|
113
|
-
- Response: ```{"caption": "<short caption>"}```
|
114
|
-
- Visual Query: POST /visual_query/
|
115
|
-
```bash
|
116
|
-
curl -X POST "http://localhost:7860/visual_query/" -H "X-API-Key: your_secret_key" -F "file=@image.jpg" -F "query=What is this?"
|
117
|
-
```
|
118
|
-
- Response: ```{"answer": "<answer>"}```
|
119
|
-
- Object Detection: POST /detect/
|
120
|
-
```bash
|
121
|
-
curl -X POST "http://localhost:7860/detect/" -H "X-API-Key: your_secret_key" -F "file=@image.jpg" -F "object_type=face"
|
122
|
-
```
|
123
|
-
- Response: ```{"objects": [<list of detected objects>]}```
|
124
|
-
- Object Pointing: POST /point/
|
125
|
-
```bash
|
126
|
-
|
127
|
-
curl -X POST "http://localhost:7860/point/" -H "X-API-Key: your_secret_key" -F "file=@image.jpg" -F "object_type=person"
|
128
|
-
```
|
129
|
-
- Response: ```{"points": [<list of points>]}```
|
130
|
-
- Transcription: POST /transcribe/
|
131
|
-
```bash
|
132
|
-
curl -X POST "http://localhost:7860/transcribe/?language=kannada" -H "X-API-Key: your_secret_key" -F "file=@audio.wav"
|
133
|
-
```
|
134
|
-
- Response: ```{"text": "<transcribed text>"}```
|
135
|
-
- Batch Transcription: POST /transcribe_batch/
|
136
|
-
```bash
|
137
|
-
curl -X POST "http://localhost:7860/transcribe_batch/?language=kannada" -H "X-API-Key: your_secret_key" -F "files=@audio1.wav" -F "files=@audio2.mp3"
|
138
|
-
```
|
139
|
-
- Response: ```{"transcriptions": ["<text1>", "<text2>"]}```
|
140
|
-
|
141
|
-
- Notes
|
142
|
-
- Lazy Loading: Models load on first use or via /load_all_models. Expect a delay on the first request for each model type.
|
143
|
-
Supported Languages: ASR supports multiple Indian languages (e.g., kannada, hindi, tamil); see models/asr.py for the full list.
|
144
|
-
Logs: Check dhwani_api.log for detailed logs (rotated at 10MB, 5 backups).
|
145
|
-
Performance: Use a GPU with flash-attn installed for faster TTS and ASR inference.
|
146
|
-
|
147
|
-
- Troubleshooting
|
148
|
-
|
149
|
-
- Module Errors: Ensure all dependencies are installed. Re-run pip install if needed.
|
150
|
-
FFmpeg Not Found: Install FFmpeg and ensure it’s in your PATH.
|
151
|
-
Permission Denied: Run with sudo if accessing restricted ports (e.g., < 1024).
|
152
|
-
|
dwani-0.1.2/dwani/asr.py
DELETED
@@ -1,37 +0,0 @@
|
|
1
|
-
from .exceptions import DhwaniAPIError
|
2
|
-
import requests
|
3
|
-
def asr_transcribe(client, file_path, language):
|
4
|
-
with open(file_path, "rb") as f:
|
5
|
-
files = {"file": f}
|
6
|
-
resp = requests.post(
|
7
|
-
f"{client.api_base}/transcribe/?language={language}",
|
8
|
-
headers=client._headers(),
|
9
|
-
files=files
|
10
|
-
)
|
11
|
-
if resp.status_code != 200:
|
12
|
-
raise DhwaniAPIError(resp)
|
13
|
-
return resp.json()
|
14
|
-
|
15
|
-
class ASR:
|
16
|
-
@staticmethod
|
17
|
-
def transcribe(*args, **kwargs):
|
18
|
-
from . import _get_client
|
19
|
-
return _get_client().transcribe(*args, **kwargs)
|
20
|
-
|
21
|
-
|
22
|
-
'''
|
23
|
-
from .docs import Documents
|
24
|
-
|
25
|
-
class documents:
|
26
|
-
@staticmethod
|
27
|
-
def ocr(file_path, language=None):
|
28
|
-
return _get_client().document_ocr(file_path, language)
|
29
|
-
|
30
|
-
@staticmethod
|
31
|
-
def translate(file_path, src_lang, tgt_lang):
|
32
|
-
return _get_client().document_translate(file_path, src_lang, tgt_lang)
|
33
|
-
|
34
|
-
@staticmethod
|
35
|
-
def summarize(file_path, language=None):
|
36
|
-
return _get_client().document_summarize(file_path, language)
|
37
|
-
'''
|
dwani-0.1.2/dwani/chat.py
DELETED
@@ -1,17 +0,0 @@
|
|
1
|
-
from .exceptions import DhwaniAPIError
|
2
|
-
import requests
|
3
|
-
def chat_create(client, prompt, **kwargs):
|
4
|
-
resp = requests.post(
|
5
|
-
f"{client.api_base}/chat",
|
6
|
-
headers={**client._headers(), "Content-Type": "application/json"},
|
7
|
-
json={"prompt": prompt, **kwargs}
|
8
|
-
)
|
9
|
-
if resp.status_code != 200:
|
10
|
-
raise DhwaniAPIError(resp)
|
11
|
-
return resp.json()
|
12
|
-
|
13
|
-
class Chat:
|
14
|
-
@staticmethod
|
15
|
-
def create(prompt, **kwargs):
|
16
|
-
from . import _get_client
|
17
|
-
return _get_client().chat(prompt, **kwargs)
|
@@ -1,188 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.4
|
2
|
-
Name: dwani
|
3
|
-
Version: 0.1.2
|
4
|
-
Summary: Multimodal AI server for Indian languages (speech, vision, LLMs, TTS, ASR, etc.)
|
5
|
-
Author-email: sachin <python@dwani.ai>
|
6
|
-
License: MIT License
|
7
|
-
|
8
|
-
Copyright (c) 2025 Sachin Shetty
|
9
|
-
|
10
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
11
|
-
of this software and associated documentation files (the "Software"), to deal
|
12
|
-
in the Software without restriction, including without limitation the rights
|
13
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
14
|
-
copies of the Software, and to permit persons to whom the Software is
|
15
|
-
furnished to do so, subject to the following conditions:
|
16
|
-
|
17
|
-
The above copyright notice and this permission notice shall be included in all
|
18
|
-
copies or substantial portions of the Software.
|
19
|
-
|
20
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
21
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
22
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
23
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
24
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
25
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
26
|
-
SOFTWARE.
|
27
|
-
|
28
|
-
Project-URL: Homepage, https://github.com/dwani-ai/dwani-server
|
29
|
-
Project-URL: Source, https://github.com/dwani-ai/dwani-server
|
30
|
-
Project-URL: Issues, https://github.com/dwani-ai/dwani-server/issues
|
31
|
-
Requires-Python: >=3.10
|
32
|
-
Description-Content-Type: text/markdown
|
33
|
-
License-File: LICENSE
|
34
|
-
Requires-Dist: requests>=2.25.0
|
35
|
-
Dynamic: license-file
|
36
|
-
|
37
|
-
# Dhwani Server
|
38
|
-
|
39
|
-
Dhwani API is a FastAPI-based application providing AI-powered services for Indian languages, including text-to-speech (TTS), language model (LLM) chat, vision-language model (VLM) capabilities, and automatic speech recognition (ASR). It supports lazy loading of models for fast startup and includes endpoints for various tasks.
|
40
|
-
|
41
|
-
## Features
|
42
|
-
- **Text-to-Speech (TTS)**: Generate audio from text in Indian languages using Parler TTS.
|
43
|
-
- **Chat**: Process Kannada prompts and respond in Kannada via translation and LLM.
|
44
|
-
- **Vision-Language Model (VLM)**: Caption images, answer visual queries, detect, and point objects.
|
45
|
-
- **Automatic Speech Recognition (ASR)**: Transcribe audio files in multiple Indian languages.
|
46
|
-
- **Lazy Loading**: Models load on-demand or via an explicit endpoint for fast startup.
|
47
|
-
|
48
|
-
## Prerequisites
|
49
|
-
- **System Requirements - User**:
|
50
|
-
- **Python**: 3.10
|
51
|
-
- Ubuntu 22.04
|
52
|
-
- git
|
53
|
-
- vscode
|
54
|
-
- **System Requirements - Server**:
|
55
|
-
- Ubuntu with sufficient RAM (16GB+ recommended for models).
|
56
|
-
- Optional: NVIDIA GPU with CUDA support for faster inference.
|
57
|
-
- **FFmpeg**: Required for audio processing (ASR).
|
58
|
-
|
59
|
-
- Server Setup
|
60
|
-
```bash
|
61
|
-
export HF_HOME=/home/ubuntu/data-dhwani-models
|
62
|
-
export HF_TOKEN='your-HF-token'
|
63
|
-
python src/server/main.py --host 0.0.0.0 --port 7860 --config config_two
|
64
|
-
```
|
65
|
-
## Installation
|
66
|
-
|
67
|
-
1. **Clone the Repository**:
|
68
|
-
```bash
|
69
|
-
git clone https://github.com/slabstech/dhwani-server
|
70
|
-
cd dhwani-server
|
71
|
-
```
|
72
|
-
|
73
|
-
2. Install Libraries:
|
74
|
-
- On Ubuntu: ```sudo apt-get install ffmpeg build-essential```
|
75
|
-
|
76
|
-
3. Set Up Virtual Environment:
|
77
|
-
```bash
|
78
|
-
python -m venv venv
|
79
|
-
source venv/bin/activate
|
80
|
-
```
|
81
|
-
4. Install Dependencies:
|
82
|
-
```bash
|
83
|
-
sudo apt-get install -y ffmpeg build-essential
|
84
|
-
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --no-modify-path --profile minimal
|
85
|
-
. "$HOME/.cargo/env"
|
86
|
-
export CC=/usr/bin/gcc
|
87
|
-
export CXX=/usr/bin/g++
|
88
|
-
```
|
89
|
-
```bash
|
90
|
-
pip install --no-cache-dir --upgrade pip setuptools psutil setuptools-rust torch==2.6.0
|
91
|
-
pip install --no-cache-dir flash-attn --no-build-isolation
|
92
|
-
```
|
93
|
-
|
94
|
-
```bash
|
95
|
-
pip install -r requirements.txt
|
96
|
-
```
|
97
|
-
|
98
|
-
4. Set Environment Variable:
|
99
|
-
Create a .env file in the root directory and add your API key:
|
100
|
-
plaintext
|
101
|
-
```bash
|
102
|
-
API_KEY=your_secret_key
|
103
|
-
```
|
104
|
-
|
105
|
-
5. Running the Server
|
106
|
-
- Start the Server:
|
107
|
-
```bash
|
108
|
-
python src/server/main.py --host 0.0.0.0 --port 7860 --config config_two
|
109
|
-
```
|
110
|
-
|
111
|
-
- The server starts with models loaded on start
|
112
|
-
- Access the interactive API docs at http://localhost:7860/docs.
|
113
|
-
|
114
|
-
- (Optional) Load All Models:
|
115
|
-
Preload all models (LLM, Translation, TTS, VLM, ASR) with:
|
116
|
-
-
|
117
|
-
```bash
|
118
|
-
curl -X POST "http://localhost:7860/load_all_models" -H "X-API-Key: your_secret_key"
|
119
|
-
```
|
120
|
-
|
121
|
-
- Usage
|
122
|
-
- Endpoints
|
123
|
-
- All endpoints require the X-API-Key header with the value from your .env file.
|
124
|
-
|
125
|
-
- Health Check: GET /health
|
126
|
-
```bash
|
127
|
-
curl "http://localhost:7860/health"
|
128
|
-
```
|
129
|
-
- Response:
|
130
|
-
```bash
|
131
|
-
{"status": "healthy", "model": "Qwen/Qwen2.5-3B-Instruct"}
|
132
|
-
```
|
133
|
-
|
134
|
-
- Text-to-Speech: POST /v1/audio/speech
|
135
|
-
``` bash
|
136
|
-
curl -X POST "http://localhost:7860/v1/audio/speech" -H "X-API-Key: your_secret_key" -H "Content-Type: application/json" -d '{"input": "ನಮಸ್ಕಾರ", "voice": "Female voice", "model": "ai4bharat/indic-parler-tts", "response_format": "mp3"}' --output speech.mp3
|
137
|
-
```
|
138
|
-
- Chat: POST /chat
|
139
|
-
``` bash
|
140
|
-
curl -X POST "http://localhost:7860/chat" -H "X-API-Key: your_secret_key" -H "Content-Type: application/json" -d '{"prompt": "ನೀವು ಹೇಗಿದ್ದೀರಿ?"}'
|
141
|
-
```
|
142
|
-
|
143
|
-
- Response:
|
144
|
-
```{"response": "<Kannada response>"}```
|
145
|
-
- Image Captioning: POST /caption/
|
146
|
-
```bash
|
147
|
-
curl -X POST "http://localhost:7860/caption/" -H "X-API-Key: your_secret_key" -F "file=@image.jpg" -F "length=short"
|
148
|
-
```
|
149
|
-
- Response:``` {"caption": "<short caption>"}```
|
150
|
-
- Visual Query: POST /visual_query/
|
151
|
-
```bash
|
152
|
-
curl -X POST "http://localhost:7860/visual_query/" -H "X-API-Key: your_secret_key" -F "file=@image.jpg" -F "query=What is this?"
|
153
|
-
```
|
154
|
-
- Response: ```{"answer": "<answer>"}```
|
155
|
-
- Object Detection: POST /detect/
|
156
|
-
```bash
|
157
|
-
curl -X POST "http://localhost:7860/detect/" -H "X-API-Key: your_secret_key" -F "file=@image.jpg" -F "object_type=face"
|
158
|
-
```
|
159
|
-
- Response: ```{"objects": [<list of detected objects>]}```
|
160
|
-
- Object Pointing: POST /point/
|
161
|
-
```bash
|
162
|
-
|
163
|
-
curl -X POST "http://localhost:7860/point/" -H "X-API-Key: your_secret_key" -F "file=@image.jpg" -F "object_type=person"
|
164
|
-
```
|
165
|
-
- Response: ```{"points": [<list of points>]}```
|
166
|
-
- Transcription: POST /transcribe/
|
167
|
-
```bash
|
168
|
-
curl -X POST "http://localhost:7860/transcribe/?language=kannada" -H "X-API-Key: your_secret_key" -F "file=@audio.wav"
|
169
|
-
```
|
170
|
-
- Response: ```{"text": "<transcribed text>"}```
|
171
|
-
- Batch Transcription: POST /transcribe_batch/
|
172
|
-
```bash
|
173
|
-
curl -X POST "http://localhost:7860/transcribe_batch/?language=kannada" -H "X-API-Key: your_secret_key" -F "files=@audio1.wav" -F "files=@audio2.mp3"
|
174
|
-
```
|
175
|
-
- Response: ```{"transcriptions": ["<text1>", "<text2>"]}```
|
176
|
-
|
177
|
-
- Notes
|
178
|
-
- Lazy Loading: Models load on first use or via /load_all_models. Expect a delay on the first request for each model type.
|
179
|
-
- Supported Languages: ASR supports multiple Indian languages (e.g., kannada, hindi, tamil); see models/asr.py for the full list.
|
180
|
-
- Logs: Check dhwani_api.log for detailed logs (rotated at 10MB, 5 backups).
|
181
|
-
- Performance: Use a GPU with flash-attn installed for faster TTS and ASR inference.
|
182
|
-
|
183
|
-
- Troubleshooting
|
184
|
-
|
185
|
-
- Module Errors: Ensure all dependencies are installed. Re-run pip install if needed.
|
186
|
-
- FFmpeg Not Found: Install FFmpeg and ensure it’s in your PATH.
|
187
|
-
- Permission Denied: Run with sudo if accessing restricted ports (e.g., < 1024).
|
188
|
-
|
dwani-0.1.2/pyproject.toml
DELETED
@@ -1,23 +0,0 @@
|
|
1
|
-
[build-system]
|
2
|
-
requires = ["setuptools>=61.0", "wheel"]
|
3
|
-
build-backend = "setuptools.build_meta"
|
4
|
-
|
5
|
-
[project]
|
6
|
-
name = "dwani"
|
7
|
-
version = "0.1.2"
|
8
|
-
description = "Multimodal AI server for Indian languages (speech, vision, LLMs, TTS, ASR, etc.)"
|
9
|
-
authors = [
|
10
|
-
{ name="sachin", email="python@dwani.ai" }
|
11
|
-
]
|
12
|
-
readme = "README.md"
|
13
|
-
license = { file = "LICENSE" }
|
14
|
-
requires-python = ">=3.10"
|
15
|
-
|
16
|
-
dependencies = [
|
17
|
-
"requests>=2.25.0",
|
18
|
-
]
|
19
|
-
|
20
|
-
[project.urls]
|
21
|
-
Homepage = "https://github.com/dwani-ai/dwani-server"
|
22
|
-
Source = "https://github.com/dwani-ai/dwani-server"
|
23
|
-
Issues = "https://github.com/dwani-ai/dwani-server/issues"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|