dwani 0.1.1__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dwani-0.1.3/PKG-INFO ADDED
@@ -0,0 +1,70 @@
+ Metadata-Version: 2.4
+ Name: dwani
+ Version: 0.1.3
+ Summary: Multimodal AI server for Indian languages (speech, vision, LLMs, TTS, ASR, etc.)
+ Author-email: sachin <python@dwani.ai>
+ License: MIT License
+
+ Copyright (c) 2025 Sachin Shetty
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+ Project-URL: Homepage, https://github.com/dwani-ai/dwani-python
+ Project-URL: Source, https://github.com/dwani-ai/dwani-python
+ Project-URL: Issues, https://github.com/dwani-ai/dwani-python/issues
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: requests>=2.25.0
+ Dynamic: license-file
+
+ # dwani.ai - python library
+
+
+ ```bash
+ pip install dwani
+ ```
+
+
+
+ ```python
+ import dwani
+ import os
+
+ dwani.api_key = os.getenv("DWANI_API_KEY")
+
+ dwani.api_base = os.getenv("DWANI_API_BASE")
+
+ resp = dwani.Chat.create("Hello!", "eng_Latn", "kan_Knda")
+ print(resp)
+ ```
+
+
+ <!--
+ ## local development
+ pip install -e .
+
+
+ pip install twine build
+ rm -rf dist/
+ python -m build
+
+ python -m twine upload dist/*
+
+ -->
dwani-0.1.3/README.md ADDED
@@ -0,0 +1,34 @@
+ # dwani.ai - python library
+
+
+ ```bash
+ pip install dwani
+ ```
+
+
+
+ ```python
+ import dwani
+ import os
+
+ dwani.api_key = os.getenv("DWANI_API_KEY")
+
+ dwani.api_base = os.getenv("DWANI_API_BASE")
+
+ resp = dwani.Chat.create("Hello!", "eng_Latn", "kan_Knda")
+ print(resp)
+ ```
+
+
+ <!--
+ ## local development
+ pip install -e .
+
+
+ pip install twine build
+ rm -rf dist/
+ python -m build
+
+ python -m twine upload dist/*
+
+ -->
@@ -1,5 +1,5 @@
  from .exceptions import DhwaniAPIError
-
+ import requests
  def asr_transcribe(client, file_path, language):
      with open(file_path, "rb") as f:
          files = {"file": f}
@@ -18,6 +18,8 @@ class ASR:
          from . import _get_client
          return _get_client().transcribe(*args, **kwargs)

+
+ '''
  from .docs import Documents

  class documents:
@@ -32,3 +34,4 @@ class documents:
      @staticmethod
      def summarize(file_path, language=None):
          return _get_client().document_summarize(file_path, language)
+ '''
@@ -1,5 +1,5 @@
  from .exceptions import DhwaniAPIError
-
+ import requests
  def audio_speech(client, input, voice, model, response_format="mp3", output_file=None):
      data = {
          "input": input,
@@ -0,0 +1,25 @@
+ from .exceptions import DhwaniAPIError
+ import requests
+
+ def chat_create(client, prompt, src_lang, tgt_lang, **kwargs):
+     url = f"{client.api_base}/v1/indic_chat"
+     payload = {
+         "prompt": prompt,
+         "src_lang": src_lang,
+         "tgt_lang": tgt_lang
+     }
+     payload.update(kwargs)
+     resp = requests.post(
+         url,
+         headers={**client._headers(), "Content-Type": "application/json"},
+         json=payload
+     )
+     if resp.status_code != 200:
+         raise DhwaniAPIError(resp)
+     return resp.json()
+
+ class Chat:
+     @staticmethod
+     def create(prompt, src_lang, tgt_lang, **kwargs):
+         from . import _get_client
+         return _get_client().chat(prompt, src_lang, tgt_lang, **kwargs)
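The new chat module posts to the `/v1/indic_chat` endpoint and makes `src_lang`/`tgt_lang` required. As a rough illustration of the request `chat_create()` builds (reconstructed from the hunk above; the base URL and API key below are placeholders, not values shipped with the package):

```python
# Illustrative reconstruction of what chat_create() sends in 0.1.3.
# Endpoint path, payload keys, and the X-API-Key header come from the
# diff; the base URL and key here are placeholder assumptions.
import requests

api_base = "http://localhost:7860"   # client.py's default DWANI_API_BASE
headers = {
    "X-API-Key": "your_secret_key",  # normally taken from DWANI_API_KEY
    "Content-Type": "application/json",
}
payload = {"prompt": "Hello!", "src_lang": "eng_Latn", "tgt_lang": "kan_Knda"}

resp = requests.post(f"{api_base}/v1/indic_chat", headers=headers, json=payload)
resp.raise_for_status()              # chat_create() raises DhwaniAPIError instead
print(resp.json())
```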
@@ -4,17 +4,18 @@ from .exceptions import DhwaniAPIError

  class DhwaniClient:
      def __init__(self, api_key=None, api_base=None):
-         self.api_key = api_key or os.getenv("DHWANI_API_KEY")
-         self.api_base = api_base or os.getenv("DHWANI_API_BASE", "http://localhost:7860")
+         self.api_key = api_key or os.getenv("DWANI_API_KEY")
+         self.api_base = api_base or os.getenv("DWANI_API_BASE", "http://localhost:7860")
          if not self.api_key:
              raise ValueError("DHWANI_API_KEY not set")

      def _headers(self):
          return {"X-API-Key": self.api_key}

-     def chat(self, prompt, **kwargs):
+     def chat(self, prompt, src_lang, tgt_lang, **kwargs):
          from .chat import chat_create
-         return chat_create(self, prompt, **kwargs)
+         return chat_create(self, prompt, src_lang, tgt_lang, **kwargs)
+

      def speech(self, *args, **kwargs):
          from .audio import audio_speech
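`DhwaniClient` now reads `DWANI_API_KEY`/`DWANI_API_BASE` instead of the old `DHWANI_*` names (note the error message in the unchanged context line still says `DHWANI_API_KEY`), and `chat()` forwards the new language arguments. A minimal sketch of driving the client directly, assuming it is importable as `dwani.client.DhwaniClient` (the path suggested by the new SOURCES.txt):

```python
# Sketch only: exercising DhwaniClient with the renamed environment
# variables. The import path is inferred from dwani/client.py in
# SOURCES.txt; the key below is a placeholder.
import os
from dwani.client import DhwaniClient

os.environ.setdefault("DWANI_API_KEY", "your_secret_key")

client = DhwaniClient()  # falls back to DWANI_API_KEY / DWANI_API_BASE
print(client.chat("Hello!", "eng_Latn", "kan_Knda"))
```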
@@ -1,5 +1,5 @@
  from .exceptions import DhwaniAPIError
-
+ import requests
  def vision_caption(client, file_path, length="short"):
      with open(file_path, "rb") as f:
          files = {"file": f}
@@ -0,0 +1,16 @@
+ LICENSE
+ README.md
+ pyproject.toml
+ dwani/__init__.py
+ dwani/asr.py
+ dwani/audio.py
+ dwani/chat.py
+ dwani/client.py
+ dwani/docs.py
+ dwani/exceptions.py
+ dwani/vision.py
+ dwani.egg-info/PKG-INFO
+ dwani.egg-info/SOURCES.txt
+ dwani.egg-info/dependency_links.txt
+ dwani.egg-info/requires.txt
+ dwani.egg-info/top_level.txt
@@ -0,0 +1 @@
+ requests>=2.25.0
@@ -0,0 +1 @@
+ dwani
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "dwani"
- version = "0.1.1"
+ version = "0.1.3"
  description = "Multimodal AI server for Indian languages (speech, vision, LLMs, TTS, ASR, etc.)"
  authors = [
      { name="sachin", email="python@dwani.ai" }
@@ -14,15 +14,10 @@ license = { file = "LICENSE" }
  requires-python = ">=3.10"

  dependencies = [
-     "fastapi>=0.95.0",
-     "uvicorn[standard]>=0.22.0",
-     "pydantic>=2.0.0",
      "requests>=2.25.0",
-     "python-multipart>=0.0.5",
-     "pydantic-settings>=2.0.0"
  ]

  [project.urls]
- Homepage = "https://github.com/dwani-ai/dwani-server"
- Source = "https://github.com/dwani-ai/dwani-server"
- Issues = "https://github.com/dwani-ai/dwani-server/issues"
+ Homepage = "https://github.com/dwani-ai/dwani-python"
+ Source = "https://github.com/dwani-ai/dwani-python"
+ Issues = "https://github.com/dwani-ai/dwani-python/issues"
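With the server-side dependencies (fastapi, uvicorn, pydantic, python-multipart, pydantic-settings) dropped, the 0.1.3 client pulls in only `requests`. A small sanity check of the installed metadata, assuming dwani 0.1.3 is installed in the current environment:

```python
# Verify the slimmed-down dependency set recorded in the 0.1.3 metadata.
# Expected output is based on the PKG-INFO shown above.
import importlib.metadata as md

print(md.version("dwani"))   # expected: 0.1.3
print(md.requires("dwani"))  # expected: ['requests>=2.25.0']
```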
dwani-0.1.1/PKG-INFO DELETED
@@ -1,193 +0,0 @@
- Metadata-Version: 2.4
- Name: dwani
- Version: 0.1.1
- Summary: Multimodal AI server for Indian languages (speech, vision, LLMs, TTS, ASR, etc.)
- Author-email: sachin <python@dwani.ai>
- License: MIT License
-
- Copyright (c) 2025 Sachin Shetty
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-
- Project-URL: Homepage, https://github.com/dwani-ai/dwani-server
- Project-URL: Source, https://github.com/dwani-ai/dwani-server
- Project-URL: Issues, https://github.com/dwani-ai/dwani-server/issues
- Requires-Python: >=3.10
- Description-Content-Type: text/markdown
- License-File: LICENSE
- Requires-Dist: fastapi>=0.95.0
- Requires-Dist: uvicorn[standard]>=0.22.0
- Requires-Dist: pydantic>=2.0.0
- Requires-Dist: requests>=2.25.0
- Requires-Dist: python-multipart>=0.0.5
- Requires-Dist: pydantic-settings>=2.0.0
- Dynamic: license-file
-
- # Dhwani Server
-
- Dhwani API is a FastAPI-based application providing AI-powered services for Indian languages, including text-to-speech (TTS), language model (LLM) chat, vision-language model (VLM) capabilities, and automatic speech recognition (ASR). It supports lazy loading of models for fast startup and includes endpoints for various tasks.
-
- ## Features
- - **Text-to-Speech (TTS)**: Generate audio from text in Indian languages using Parler TTS.
- - **Chat**: Process Kannada prompts and respond in Kannada via translation and LLM.
- - **Vision-Language Model (VLM)**: Caption images, answer visual queries, detect, and point objects.
- - **Automatic Speech Recognition (ASR)**: Transcribe audio files in multiple Indian languages.
- - **Lazy Loading**: Models load on-demand or via an explicit endpoint for fast startup.
-
- ## Prerequisites
- - **System Requirements - User **:
- - **Python**: 3.10
- - Ubuntu 22.04
- - git
- - vscode
- - **System Requirements - Server **:
- - Ubuntu with sufficient RAM (16GB+ recommended for models).
- - Optional: NVIDIA GPU with CUDA support for faster inference.
- - **FFmpeg**: Required for audio processing (ASR).
-
- - Server Setup
- ```bash
- export HF_HOME=/home/ubuntu/data-dhwani-models
- export HF_TOKEN='YOur-HF-token'
- python src/server/main.py --host 0.0.0.0 --port 7860 --config config_two
- ```
- ## Installation
-
- 1. **Clone the Repository**:
- ```bash
- git clone https://github.com/slabstech/dhwani-server
- cd dhwani-server
- ```
-
- 2. Install Libraries:
- - On Ubuntu: ```sudo apt-get install ffmpeg build-essential```
-
- 3. Set Up Virtual Environment:
- ```bash
- python -m venv venv
- source venv/bin/activate
- ```
- 4. Install Dependencies:
- ```bash
- sudo apt-get install -y ffmpeg build-essential
- curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --no-modify-path --profile minimal
- . "$HOME/.cargo/env"
- export CC=/usr/bin/gcc
- export ENV CXX=/usr/bin/g++
- ```
- ```bash
- pip install --no-cache-dir --upgrade pip setuptools psutil setuptools-rust torch==2.6.0
- pip install --no-cache-dir flash-attn --no-build-isolation
- ```
-
- ```bash
- pip install -r requirements.txt
- ```
-
- 4. Set Environment Variable:
- Create a .env file in the root directory and add your API key:
- plaintext
- ```bash
- API_KEY=your_secret_key
- ```
-
- 5. Running the Server
- - Start the Server:
- ```bash
- python src/server/main.py --host 0.0.0.0 --port 7860 --config config_two
- ```
-
- - The server starts with models loaded on start
- - Access the interactive API docs at http://localhost:7860/docs.
-
- - (Optional) Load All Models:
- Preload all models (LLM, Translation, TTS, VLM, ASR) with:
- -
- ```bash
- curl -X POST "http://localhost:7860/load_all_models" -H "X-API-Key: your_secret_key"
- ```
-
- - Usage
- - Endpoints
- - All endpoints require the X-API-Key header with the value from your .env file.
-
- - Health Check: GET /health
- ```bash
- curl "http://localhost:7860/health"
- ```
- - Response:
- ```bash
- {"status": "healthy", "model": "Qwen/Qwen2.5-3B-Instruct"}
- ```
-
- - Text-to-Speech: POST /v1/audio/speech
- ``` bash
- curl -X POST "http://localhost:7860/v1/audio/speech" -H "X-API-Key: your_secret_key" -H "Content-Type: application/json" -d '{"input": "ನಮಸ್ಕಾರ", "voice": "Female voice", "model": "ai4bharat/indic-parler-tts", "response_format": "mp3"}' --output speech.mp3
- ```
- - Chat: POST /chat
- ``` bash
- curl -X POST "http://localhost:7860/chat" -H "X-API-Key: your_secret_key" -H "Content-Type: application/json" -d '{"prompt": "ನೀವು ಹೇಗಿದ್ದೀರಿ?"}'
- ```
-
- - Response:
- ```{"response": "<Kannada response>"}```
- - Image Captioning: POST /caption/
- ```bash
- curl -X POST "http://localhost:7860/caption/" -H "X-API-Key: your_secret_key" -F "file=@image.jpg" -F "length=short"
- ```
- - Response:``` {"caption": "<short caption>"}```
- - Visual Query: POST /visual_query/
- ```bash
- curl -X POST "http://localhost:7860/visual_query/" -H "X-API-Key: your_secret_key" -F "file=@image.jpg" -F "query=What is this?"
- ```
- - Response: ```{"answer": "<answer>"}```
- - Object Detection: POST /detect/
- ```bash
- curl -X POST "http://localhost:7860/detect/" -H "X-API-Key: your_secret_key" -F "file=@image.jpg" -F "object_type=face"
- ```
- - Response: ```{"objects": [<list of detected objects>]}```
- - Object Pointing: POST /point/
- ```bash
-
- curl -X POST "http://localhost:7860/point/" -H "X-API-Key: your_secret_key" -F "file=@image.jpg" -F "object_type=person"
- ```
- - Response: ```{"points": [<list of points>]}```
- - Transcription: POST /transcribe/
- ```bash
- curl -X POST "http://localhost:7860/transcribe/?language=kannada" -H "X-API-Key: your_secret_key" -F "file=@audio.wav"
- ```
- - Response: ```{"text": "<transcribed text>"}```
- - Batch Transcription: POST /transcribe_batch/
- ```bash
- curl -X POST "http://localhost:7860/transcribe_batch/?language=kannada" -H "X-API-Key: your_secret_key" -F "files=@audio1.wav" -F "files=@audio2.mp3"
- ```
- - Response: ```{"transcriptions": ["<text1>", "<text2>"]}```
-
- - Notes
- - Lazy Loading: Models load on first use or via /load_all_models. Expect a delay on the first request for each model type.
- Supported Languages: ASR supports multiple Indian languages (e.g., kannada, hindi, tamil); see models/asr.py for the full list.
- Logs: Check dhwani_api.log for detailed logs (rotated at 10MB, 5 backups).
- Performance: Use a GPU with flash-attn installed for faster TTS and ASR inference.
-
- - Troubleshooting
-
- - Module Errors: Ensure all dependencies are installed. Re-run pip install if needed.
- FFmpeg Not Found: Install FFmpeg and ensure it’s in your PATH.
- Permission Denied: Run with sudo if accessing restricted ports (e.g., < 1024).
-
dwani-0.1.1/README.md DELETED
@@ -1,152 +0,0 @@
- # Dhwani Server
-
- Dhwani API is a FastAPI-based application providing AI-powered services for Indian languages, including text-to-speech (TTS), language model (LLM) chat, vision-language model (VLM) capabilities, and automatic speech recognition (ASR). It supports lazy loading of models for fast startup and includes endpoints for various tasks.
-
- ## Features
- - **Text-to-Speech (TTS)**: Generate audio from text in Indian languages using Parler TTS.
- - **Chat**: Process Kannada prompts and respond in Kannada via translation and LLM.
- - **Vision-Language Model (VLM)**: Caption images, answer visual queries, detect, and point objects.
- - **Automatic Speech Recognition (ASR)**: Transcribe audio files in multiple Indian languages.
- - **Lazy Loading**: Models load on-demand or via an explicit endpoint for fast startup.
-
- ## Prerequisites
- - **System Requirements - User **:
- - **Python**: 3.10
- - Ubuntu 22.04
- - git
- - vscode
- - **System Requirements - Server **:
- - Ubuntu with sufficient RAM (16GB+ recommended for models).
- - Optional: NVIDIA GPU with CUDA support for faster inference.
- - **FFmpeg**: Required for audio processing (ASR).
-
- - Server Setup
- ```bash
- export HF_HOME=/home/ubuntu/data-dhwani-models
- export HF_TOKEN='YOur-HF-token'
- python src/server/main.py --host 0.0.0.0 --port 7860 --config config_two
- ```
- ## Installation
-
- 1. **Clone the Repository**:
- ```bash
- git clone https://github.com/slabstech/dhwani-server
- cd dhwani-server
- ```
-
- 2. Install Libraries:
- - On Ubuntu: ```sudo apt-get install ffmpeg build-essential```
-
- 3. Set Up Virtual Environment:
- ```bash
- python -m venv venv
- source venv/bin/activate
- ```
- 4. Install Dependencies:
- ```bash
- sudo apt-get install -y ffmpeg build-essential
- curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --no-modify-path --profile minimal
- . "$HOME/.cargo/env"
- export CC=/usr/bin/gcc
- export ENV CXX=/usr/bin/g++
- ```
- ```bash
- pip install --no-cache-dir --upgrade pip setuptools psutil setuptools-rust torch==2.6.0
- pip install --no-cache-dir flash-attn --no-build-isolation
- ```
-
- ```bash
- pip install -r requirements.txt
- ```
-
- 4. Set Environment Variable:
- Create a .env file in the root directory and add your API key:
- plaintext
- ```bash
- API_KEY=your_secret_key
- ```
-
- 5. Running the Server
- - Start the Server:
- ```bash
- python src/server/main.py --host 0.0.0.0 --port 7860 --config config_two
- ```
-
- - The server starts with models loaded on start
- - Access the interactive API docs at http://localhost:7860/docs.
-
- - (Optional) Load All Models:
- Preload all models (LLM, Translation, TTS, VLM, ASR) with:
- -
- ```bash
- curl -X POST "http://localhost:7860/load_all_models" -H "X-API-Key: your_secret_key"
- ```
-
- - Usage
- - Endpoints
- - All endpoints require the X-API-Key header with the value from your .env file.
-
- - Health Check: GET /health
- ```bash
- curl "http://localhost:7860/health"
- ```
- - Response:
- ```bash
- {"status": "healthy", "model": "Qwen/Qwen2.5-3B-Instruct"}
- ```
-
- - Text-to-Speech: POST /v1/audio/speech
- ``` bash
- curl -X POST "http://localhost:7860/v1/audio/speech" -H "X-API-Key: your_secret_key" -H "Content-Type: application/json" -d '{"input": "ನಮಸ್ಕಾರ", "voice": "Female voice", "model": "ai4bharat/indic-parler-tts", "response_format": "mp3"}' --output speech.mp3
- ```
- - Chat: POST /chat
- ``` bash
- curl -X POST "http://localhost:7860/chat" -H "X-API-Key: your_secret_key" -H "Content-Type: application/json" -d '{"prompt": "ನೀವು ಹೇಗಿದ್ದೀರಿ?"}'
- ```
-
- - Response:
- ```{"response": "<Kannada response>"}```
- - Image Captioning: POST /caption/
- ```bash
- curl -X POST "http://localhost:7860/caption/" -H "X-API-Key: your_secret_key" -F "file=@image.jpg" -F "length=short"
- ```
- - Response:``` {"caption": "<short caption>"}```
- - Visual Query: POST /visual_query/
- ```bash
- curl -X POST "http://localhost:7860/visual_query/" -H "X-API-Key: your_secret_key" -F "file=@image.jpg" -F "query=What is this?"
- ```
- - Response: ```{"answer": "<answer>"}```
- - Object Detection: POST /detect/
- ```bash
- curl -X POST "http://localhost:7860/detect/" -H "X-API-Key: your_secret_key" -F "file=@image.jpg" -F "object_type=face"
- ```
- - Response: ```{"objects": [<list of detected objects>]}```
- - Object Pointing: POST /point/
- ```bash
-
- curl -X POST "http://localhost:7860/point/" -H "X-API-Key: your_secret_key" -F "file=@image.jpg" -F "object_type=person"
- ```
- - Response: ```{"points": [<list of points>]}```
- - Transcription: POST /transcribe/
- ```bash
- curl -X POST "http://localhost:7860/transcribe/?language=kannada" -H "X-API-Key: your_secret_key" -F "file=@audio.wav"
- ```
- - Response: ```{"text": "<transcribed text>"}```
- - Batch Transcription: POST /transcribe_batch/
- ```bash
- curl -X POST "http://localhost:7860/transcribe_batch/?language=kannada" -H "X-API-Key: your_secret_key" -F "files=@audio1.wav" -F "files=@audio2.mp3"
- ```
- - Response: ```{"transcriptions": ["<text1>", "<text2>"]}```
-
- - Notes
- - Lazy Loading: Models load on first use or via /load_all_models. Expect a delay on the first request for each model type.
- Supported Languages: ASR supports multiple Indian languages (e.g., kannada, hindi, tamil); see models/asr.py for the full list.
- Logs: Check dhwani_api.log for detailed logs (rotated at 10MB, 5 backups).
- Performance: Use a GPU with flash-attn installed for faster TTS and ASR inference.
-
- - Troubleshooting
-
- - Module Errors: Ensure all dependencies are installed. Re-run pip install if needed.
- FFmpeg Not Found: Install FFmpeg and ensure it’s in your PATH.
- Permission Denied: Run with sudo if accessing restricted ports (e.g., < 1024).
-
dwani-0.1.1/src/chat.py DELETED
@@ -1,17 +0,0 @@
- from .exceptions import DhwaniAPIError
-
- def chat_create(client, prompt, **kwargs):
-     resp = requests.post(
-         f"{client.api_base}/chat",
-         headers={**client._headers(), "Content-Type": "application/json"},
-         json={"prompt": prompt, **kwargs}
-     )
-     if resp.status_code != 200:
-         raise DhwaniAPIError(resp)
-     return resp.json()
-
- class Chat:
-     @staticmethod
-     def create(prompt, **kwargs):
-         from . import _get_client
-         return _get_client().chat(prompt, **kwargs)
@@ -1,16 +0,0 @@
- LICENSE
- README.md
- pyproject.toml
- src/__init__.py
- src/asr.py
- src/audio.py
- src/chat.py
- src/client.py
- src/docs.py
- src/exceptions.py
- src/vision.py
- src/dwani.egg-info/PKG-INFO
- src/dwani.egg-info/SOURCES.txt
- src/dwani.egg-info/dependency_links.txt
- src/dwani.egg-info/requires.txt
- src/dwani.egg-info/top_level.txt
@@ -1,6 +0,0 @@
- fastapi>=0.95.0
- uvicorn[standard]>=0.22.0
- pydantic>=2.0.0
- requests>=2.25.0
- python-multipart>=0.0.5
- pydantic-settings>=2.0.0
@@ -1,8 +0,0 @@
- __init__
- asr
- audio
- chat
- client
- docs
- exceptions
- vision
5 files without changes