voicepipe 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- voicepipe-0.1.0/LICENSE +21 -0
- voicepipe-0.1.0/PKG-INFO +207 -0
- voicepipe-0.1.0/README.md +172 -0
- voicepipe-0.1.0/pyproject.toml +69 -0
- voicepipe-0.1.0/setup.cfg +4 -0
- voicepipe-0.1.0/src/voicepipe/__init__.py +18 -0
- voicepipe-0.1.0/src/voicepipe/audio.py +156 -0
- voicepipe-0.1.0/src/voicepipe/cli.py +79 -0
- voicepipe-0.1.0/src/voicepipe/stt.py +178 -0
- voicepipe-0.1.0/src/voicepipe/tts.py +230 -0
- voicepipe-0.1.0/src/voicepipe/voice_pipeline.py +251 -0
- voicepipe-0.1.0/src/voicepipe.egg-info/PKG-INFO +207 -0
- voicepipe-0.1.0/src/voicepipe.egg-info/SOURCES.txt +15 -0
- voicepipe-0.1.0/src/voicepipe.egg-info/dependency_links.txt +1 -0
- voicepipe-0.1.0/src/voicepipe.egg-info/entry_points.txt +2 -0
- voicepipe-0.1.0/src/voicepipe.egg-info/requires.txt +14 -0
- voicepipe-0.1.0/src/voicepipe.egg-info/top_level.txt +1 -0
voicepipe-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 DanLab
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
voicepipe-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: voicepipe
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: One-command STT + TTS for any app
|
|
5
|
+
Author-email: DanLab <dan@danlab.dev>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/danlab-ai/voicepipe
|
|
8
|
+
Project-URL: Documentation, https://voicepipe.readthedocs.io
|
|
9
|
+
Project-URL: Repository, https://github.com/danlab-ai/voicepipe
|
|
10
|
+
Project-URL: Issues, https://github.com/danlab-ai/voicepipe/issues
|
|
11
|
+
Keywords: stt,tts,speech,voice,whisper,kittentts,ai
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: numpy>=1.20.0
|
|
24
|
+
Provides-Extra: kittentts
|
|
25
|
+
Requires-Dist: kittentts; extra == "kittentts"
|
|
26
|
+
Provides-Extra: gtts
|
|
27
|
+
Requires-Dist: gtts; extra == "gtts"
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
30
|
+
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
|
|
31
|
+
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
32
|
+
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
33
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
34
|
+
Dynamic: license-file
|
|
35
|
+
|
|
36
|
+
# VoicePipe
|
|
37
|
+
|
|
38
|
+
<p align="center">
|
|
39
|
+
<strong>One-command voice integration for any app</strong>
|
|
40
|
+
</p>
|
|
41
|
+
|
|
42
|
+
<p align="center">
|
|
43
|
+
<a href="https://pypi.org/project/voicepipe/">
|
|
44
|
+
<img src="https://img.shields.io/pypi/v/voicepipe.svg" alt="PyPI version">
|
|
45
|
+
</a>
|
|
46
|
+
<a href="https://pypi.org/project/voicepipe/">
|
|
47
|
+
<img src="https://img.shields.io/pypi/pyversions/voicepipe.svg" alt="Python versions">
|
|
48
|
+
</a>
|
|
49
|
+
<a href="https://github.com/danlab-ai/voicepipe/blob/main/LICENSE">
|
|
50
|
+
<img src="https://img.shields.io/github/license/danlab-ai/voicepipe.svg" alt="License">
|
|
51
|
+
</a>
|
|
52
|
+
</p>
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## Overview
|
|
57
|
+
|
|
58
|
+
VoicePipe provides **one-command** STT (Speech-to-Text) + TTS (Text-to-Speech) for any application.
|
|
59
|
+
|
|
60
|
+
- **STT**: whisper.cpp - fastest local speech recognition
|
|
61
|
+
- **TTS**: KittenTTS - smallest neural TTS (15-80MB)
|
|
62
|
+
|
|
63
|
+
## Installation
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pip install voicepipe
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Quick Start
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
from voicepipe import VoicePipeline
|
|
73
|
+
|
|
74
|
+
# Initialize (auto-downloads models)
|
|
75
|
+
voice = VoicePipeline()
|
|
76
|
+
|
|
77
|
+
# Speech to Text
|
|
78
|
+
text = voice.speech_to_text("audio.wav")
|
|
79
|
+
print(f"You said: {text}")
|
|
80
|
+
|
|
81
|
+
# Text to Speech
|
|
82
|
+
audio = voice.text_to_speech("Hello, world!")
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Requirements
|
|
86
|
+
|
|
87
|
+
- Python 3.8+
|
|
88
|
+
- FFmpeg (for audio processing)
|
|
89
|
+
|
|
90
|
+
## Install FFmpeg
|
|
91
|
+
|
|
92
|
+
**macOS:**
|
|
93
|
+
```bash
|
|
94
|
+
brew install ffmpeg
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
**Linux:**
|
|
98
|
+
```bash
|
|
99
|
+
sudo apt install ffmpeg
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
**Windows:**
|
|
103
|
+
```powershell
|
|
104
|
+
choco install ffmpeg
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## Configuration
|
|
108
|
+
|
|
109
|
+
```python
|
|
110
|
+
voice = VoicePipeline(
|
|
111
|
+
stt_model="tiny", # tiny, base, small
|
|
112
|
+
tts_model="nano", # nano, micro, mini
|
|
113
|
+
tts_voice="Bella", # 8 voices available
|
|
114
|
+
tts_speed=1.0, # 0.5 - 2.0
|
|
115
|
+
language="en", # or "auto"
|
|
116
|
+
cache_dir="~/.voicepipe" # model cache
|
|
117
|
+
)
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
## Available Voices
|
|
121
|
+
|
|
122
|
+
- Bella, Jasper, Luna, Bruno, Rosie, Hugo, Kiki, Leo
|
|
123
|
+
|
|
124
|
+
## Models
|
|
125
|
+
|
|
126
|
+
### STT (whisper.cpp)
|
|
127
|
+
| Model | Size | RAM | Speed |
|
|
128
|
+
|-------|------|-----|-------|
|
|
129
|
+
| tiny | 75MB | ~500MB | 10x realtime |
|
|
130
|
+
| base | 142MB | ~1GB | 5x realtime |
|
|
131
|
+
| small | 466MB | ~2GB | 2x realtime |
|
|
132
|
+
|
|
133
|
+
### TTS (KittenTTS)
|
|
134
|
+
| Model | Size | Quality |
|
|
135
|
+
|-------|------|---------|
|
|
136
|
+
| nano | 15MB | Good |
|
|
137
|
+
| micro | 40MB | Better |
|
|
138
|
+
| mini | 80MB | Best |
|
|
139
|
+
|
|
140
|
+
## Use Cases
|
|
141
|
+
|
|
142
|
+
### Chatbot with Voice
|
|
143
|
+
```python
|
|
144
|
+
@app.post("/voice/chat")
|
|
145
|
+
async def voice_chat(audio: bytes):
|
|
146
|
+
# Convert speech to text
|
|
147
|
+
text = voice.speech_to_text_bytes(audio)
|
|
148
|
+
|
|
149
|
+
# Get chatbot response
|
|
150
|
+
response = await chatbot.chat(text)
|
|
151
|
+
|
|
152
|
+
# Convert response to speech
|
|
153
|
+
audio_response = voice.text_to_speech(response)
|
|
154
|
+
|
|
155
|
+
return {"audio": audio_response}
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
### Voice Assistant
|
|
159
|
+
```python
|
|
160
|
+
async def run_assistant():
|
|
161
|
+
while True:
|
|
162
|
+
# Continuously listen and respond
|
|
163
|
+
text = await voice.speech_to_text_async(microphone_stream)
|
|
164
|
+
response = await assistant.respond(text)
|
|
165
|
+
voice.text_to_speech(response, play=True)
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
## API Reference
|
|
169
|
+
|
|
170
|
+
### VoicePipeline
|
|
171
|
+
|
|
172
|
+
| Method | Description |
|
|
173
|
+
|--------|-------------|
|
|
174
|
+
| `speech_to_text(audio_path)` | Convert audio file to text |
|
|
175
|
+
| `speech_to_text_bytes(audio_data)` | Convert raw audio to text |
|
|
176
|
+
| `text_to_speech(text)` | Convert text to audio bytes |
|
|
177
|
+
| `text_to_speech_file(text, path)` | Convert text to audio file |
|
|
178
|
+
| `list_voices()` | Get available TTS voices |
|
|
179
|
+
| `get_status()` | Get pipeline status |
|
|
180
|
+
|
|
181
|
+
## Development
|
|
182
|
+
|
|
183
|
+
```bash
|
|
184
|
+
# Clone repository
|
|
185
|
+
git clone https://github.com/danlab-ai/voicepipe.git
|
|
186
|
+
cd voicepipe
|
|
187
|
+
|
|
188
|
+
# Install in development mode
|
|
189
|
+
pip install -e ".[dev]"
|
|
190
|
+
|
|
191
|
+
# Run tests
|
|
192
|
+
pytest
|
|
193
|
+
|
|
194
|
+
# Format code
|
|
195
|
+
black src/voicepipe
|
|
196
|
+
ruff check src/voicepipe
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
## License
|
|
200
|
+
|
|
201
|
+
MIT License - see [LICENSE](LICENSE)
|
|
202
|
+
|
|
203
|
+
---
|
|
204
|
+
|
|
205
|
+
<p align="center">
|
|
206
|
+
Built by <a href="https://danlab.dev">DanLab</a>
|
|
207
|
+
</p>
|
voicepipe-0.1.0/README.md
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
# VoicePipe
|
|
2
|
+
|
|
3
|
+
<p align="center">
|
|
4
|
+
<strong>One-command voice integration for any app</strong>
|
|
5
|
+
</p>
|
|
6
|
+
|
|
7
|
+
<p align="center">
|
|
8
|
+
<a href="https://pypi.org/project/voicepipe/">
|
|
9
|
+
<img src="https://img.shields.io/pypi/v/voicepipe.svg" alt="PyPI version">
|
|
10
|
+
</a>
|
|
11
|
+
<a href="https://pypi.org/project/voicepipe/">
|
|
12
|
+
<img src="https://img.shields.io/pypi/pyversions/voicepipe.svg" alt="Python versions">
|
|
13
|
+
</a>
|
|
14
|
+
<a href="https://github.com/danlab-ai/voicepipe/blob/main/LICENSE">
|
|
15
|
+
<img src="https://img.shields.io/github/license/danlab-ai/voicepipe.svg" alt="License">
|
|
16
|
+
</a>
|
|
17
|
+
</p>
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
## Overview
|
|
22
|
+
|
|
23
|
+
VoicePipe provides **one-command** STT (Speech-to-Text) + TTS (Text-to-Speech) for any application.
|
|
24
|
+
|
|
25
|
+
- **STT**: whisper.cpp - fastest local speech recognition
|
|
26
|
+
- **TTS**: KittenTTS - smallest neural TTS (15-80MB)
|
|
27
|
+
|
|
28
|
+
## Installation
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
pip install voicepipe
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Quick Start
|
|
35
|
+
|
|
36
|
+
```python
|
|
37
|
+
from voicepipe import VoicePipeline
|
|
38
|
+
|
|
39
|
+
# Initialize (auto-downloads models)
|
|
40
|
+
voice = VoicePipeline()
|
|
41
|
+
|
|
42
|
+
# Speech to Text
|
|
43
|
+
text = voice.speech_to_text("audio.wav")
|
|
44
|
+
print(f"You said: {text}")
|
|
45
|
+
|
|
46
|
+
# Text to Speech
|
|
47
|
+
audio = voice.text_to_speech("Hello, world!")
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Requirements
|
|
51
|
+
|
|
52
|
+
- Python 3.8+
|
|
53
|
+
- FFmpeg (for audio processing)
|
|
54
|
+
|
|
55
|
+
## Install FFmpeg
|
|
56
|
+
|
|
57
|
+
**macOS:**
|
|
58
|
+
```bash
|
|
59
|
+
brew install ffmpeg
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
**Linux:**
|
|
63
|
+
```bash
|
|
64
|
+
sudo apt install ffmpeg
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
**Windows:**
|
|
68
|
+
```powershell
|
|
69
|
+
choco install ffmpeg
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## Configuration
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
voice = VoicePipeline(
|
|
76
|
+
stt_model="tiny", # tiny, base, small
|
|
77
|
+
tts_model="nano", # nano, micro, mini
|
|
78
|
+
tts_voice="Bella", # 8 voices available
|
|
79
|
+
tts_speed=1.0, # 0.5 - 2.0
|
|
80
|
+
language="en", # or "auto"
|
|
81
|
+
cache_dir="~/.voicepipe" # model cache
|
|
82
|
+
)
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Available Voices
|
|
86
|
+
|
|
87
|
+
- Bella, Jasper, Luna, Bruno, Rosie, Hugo, Kiki, Leo
|
|
88
|
+
|
|
89
|
+
## Models
|
|
90
|
+
|
|
91
|
+
### STT (whisper.cpp)
|
|
92
|
+
| Model | Size | RAM | Speed |
|
|
93
|
+
|-------|------|-----|-------|
|
|
94
|
+
| tiny | 75MB | ~500MB | 10x realtime |
|
|
95
|
+
| base | 142MB | ~1GB | 5x realtime |
|
|
96
|
+
| small | 466MB | ~2GB | 2x realtime |
|
|
97
|
+
|
|
98
|
+
### TTS (KittenTTS)
|
|
99
|
+
| Model | Size | Quality |
|
|
100
|
+
|-------|------|---------|
|
|
101
|
+
| nano | 15MB | Good |
|
|
102
|
+
| micro | 40MB | Better |
|
|
103
|
+
| mini | 80MB | Best |
|
|
104
|
+
|
|
105
|
+
## Use Cases
|
|
106
|
+
|
|
107
|
+
### Chatbot with Voice
|
|
108
|
+
```python
|
|
109
|
+
@app.post("/voice/chat")
|
|
110
|
+
async def voice_chat(audio: bytes):
|
|
111
|
+
# Convert speech to text
|
|
112
|
+
text = voice.speech_to_text_bytes(audio)
|
|
113
|
+
|
|
114
|
+
# Get chatbot response
|
|
115
|
+
response = await chatbot.chat(text)
|
|
116
|
+
|
|
117
|
+
# Convert response to speech
|
|
118
|
+
audio_response = voice.text_to_speech(response)
|
|
119
|
+
|
|
120
|
+
return {"audio": audio_response}
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### Voice Assistant
|
|
124
|
+
```python
|
|
125
|
+
async def run_assistant():
|
|
126
|
+
while True:
|
|
127
|
+
# Continuously listen and respond
|
|
128
|
+
text = await voice.speech_to_text_async(microphone_stream)
|
|
129
|
+
response = await assistant.respond(text)
|
|
130
|
+
voice.text_to_speech(response, play=True)
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
## API Reference
|
|
134
|
+
|
|
135
|
+
### VoicePipeline
|
|
136
|
+
|
|
137
|
+
| Method | Description |
|
|
138
|
+
|--------|-------------|
|
|
139
|
+
| `speech_to_text(audio_path)` | Convert audio file to text |
|
|
140
|
+
| `speech_to_text_bytes(audio_data)` | Convert raw audio to text |
|
|
141
|
+
| `text_to_speech(text)` | Convert text to audio bytes |
|
|
142
|
+
| `text_to_speech_file(text, path)` | Convert text to audio file |
|
|
143
|
+
| `list_voices()` | Get available TTS voices |
|
|
144
|
+
| `get_status()` | Get pipeline status |
|
|
145
|
+
|
|
146
|
+
## Development
|
|
147
|
+
|
|
148
|
+
```bash
|
|
149
|
+
# Clone repository
|
|
150
|
+
git clone https://github.com/danlab-ai/voicepipe.git
|
|
151
|
+
cd voicepipe
|
|
152
|
+
|
|
153
|
+
# Install in development mode
|
|
154
|
+
pip install -e ".[dev]"
|
|
155
|
+
|
|
156
|
+
# Run tests
|
|
157
|
+
pytest
|
|
158
|
+
|
|
159
|
+
# Format code
|
|
160
|
+
black src/voicepipe
|
|
161
|
+
ruff check src/voicepipe
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
## License
|
|
165
|
+
|
|
166
|
+
MIT License - see [LICENSE](LICENSE)
|
|
167
|
+
|
|
168
|
+
---
|
|
169
|
+
|
|
170
|
+
<p align="center">
|
|
171
|
+
Built by <a href="https://danlab.dev">DanLab</a>
|
|
172
|
+
</p>
|
voicepipe-0.1.0/pyproject.toml
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "voicepipe"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "One-command STT + TTS for any app"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
authors = [
|
|
12
|
+
{name = "DanLab", email = "dan@danlab.dev"},
|
|
13
|
+
]
|
|
14
|
+
keywords = ["stt", "tts", "speech", "voice", "whisper", "kittentts", "ai"]
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Development Status :: 3 - Alpha",
|
|
17
|
+
"Intended Audience :: Developers",
|
|
18
|
+
"License :: OSI Approved :: MIT License",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.8",
|
|
21
|
+
"Programming Language :: Python :: 3.9",
|
|
22
|
+
"Programming Language :: Python :: 3.10",
|
|
23
|
+
"Programming Language :: Python :: 3.11",
|
|
24
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
dependencies = [
|
|
28
|
+
"numpy>=1.20.0",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
[project.optional-dependencies]
|
|
32
|
+
kittentts = ["kittentts"]
|
|
33
|
+
gtts = ["gtts"]
|
|
34
|
+
dev = [
|
|
35
|
+
"pytest>=7.0.0",
|
|
36
|
+
"pytest-asyncio>=0.21.0",
|
|
37
|
+
"black>=23.0.0",
|
|
38
|
+
"mypy>=1.0.0",
|
|
39
|
+
"ruff>=0.1.0",
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
[project.urls]
|
|
43
|
+
Homepage = "https://github.com/danlab-ai/voicepipe"
|
|
44
|
+
Documentation = "https://voicepipe.readthedocs.io"
|
|
45
|
+
Repository = "https://github.com/danlab-ai/voicepipe"
|
|
46
|
+
Issues = "https://github.com/danlab-ai/voicepipe/issues"
|
|
47
|
+
|
|
48
|
+
[project.scripts]
|
|
49
|
+
voicepipe = "voicepipe.cli:main"
|
|
50
|
+
|
|
51
|
+
[tool.setuptools.packages.find]
|
|
52
|
+
where = ["src"]
|
|
53
|
+
|
|
54
|
+
[tool.setuptools.package-data]
|
|
55
|
+
voicepipe = ["py.typed"]
|
|
56
|
+
|
|
57
|
+
[tool.black]
|
|
58
|
+
line-length = 100
|
|
59
|
+
target-version = ["py38"]
|
|
60
|
+
|
|
61
|
+
[tool.ruff]
|
|
62
|
+
line-length = 100
|
|
63
|
+
target-version = "py38"
|
|
64
|
+
|
|
65
|
+
[tool.mypy]
|
|
66
|
+
python_version = "3.8"
|
|
67
|
+
warn_return_any = true
|
|
68
|
+
warn_unused_configs = true
|
|
69
|
+
disallow_untyped_defs = false
|
voicepipe-0.1.0/src/voicepipe/__init__.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""
|
|
2
|
+
VoicePipe - Universal Voice Pipeline
|
|
3
|
+
One-command STT + TTS for any app
|
|
4
|
+
|
|
5
|
+
Install: pip install voicepipe
|
|
6
|
+
Usage:
|
|
7
|
+
from voicepipe import VoicePipeline
|
|
8
|
+
voice = VoicePipeline()
|
|
9
|
+
text = voice.speech_to_text("audio.wav")
|
|
10
|
+
audio = voice.text_to_speech("Hello!")
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
__version__ = "0.1.0"
|
|
14
|
+
__author__ = "DanLab"
|
|
15
|
+
|
|
16
|
+
from voicepipe.voice_pipeline import VoicePipeline
|
|
17
|
+
|
|
18
|
+
__all__ = ["VoicePipeline", "__version__"]
|
voicepipe-0.1.0/src/voicepipe/audio.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Audio utilities for VoicePipe
|
|
3
|
+
"""
|
|
4
|
+
import subprocess
|
|
5
|
+
import tempfile
|
|
6
|
+
import os
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class AudioUtils:
    """Stateless audio helpers that shell out to ``ffmpeg`` / ``ffprobe``.

    Every public method is a ``@staticmethod`` that builds an argument list
    and runs the executable through ``subprocess.run`` (no shell). A non-zero
    exit status is reported as ``RuntimeError`` carrying the tool's stderr.
    If the executable cannot be started at all, the ``OSError`` raised by
    ``subprocess.run`` (for example ``FileNotFoundError``) propagates
    unchanged.
    """

    @staticmethod
    def _run_tool(cmd: list, failure_message: str) -> subprocess.CompletedProcess:
        """Run *cmd* and raise ``RuntimeError`` if it exits with a non-zero status.

        Args:
            cmd: Full argument vector, executable first.
            failure_message: Prefix of the error message; the captured stderr
                is appended after ``": "``.

        Returns:
            The completed process, with stdout/stderr captured as text.

        Raises:
            RuntimeError: If the process exits with a non-zero status.
        """
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            raise RuntimeError(f"{failure_message}: {result.stderr}")
        return result

    @staticmethod
    def _convert_command(
        input_path: str,
        output_path: str,
        sample_rate: int,
        mono: bool,
        format: str,
    ) -> list:
        """Build the ffmpeg argument vector used by :meth:`convert_audio`."""
        cmd = ["ffmpeg", "-y", "-i", input_path, "-ar", str(sample_rate)]
        if mono:
            # Option and value must be separate argv elements: the list is
            # passed without a shell, so "-ac 1" would reach ffmpeg as one
            # unknown option, and an empty string would be taken as a file name.
            cmd.extend(["-ac", "1"])
        codec = "pcm_s16le" if format == "wav" else format
        cmd.extend(["-c:a", codec, output_path])
        return cmd

    @staticmethod
    def _channel_paths(input_path: str) -> tuple:
        """Return the ``(left, right)`` output paths used by :meth:`split_stereo`.

        ``_left`` / ``_right`` is inserted before the final extension only, so
        the outputs can never collide with the input, whatever its extension.
        A path without an extension gets ``.wav``.
        """
        root, ext = os.path.splitext(input_path)
        ext = ext or ".wav"
        return f"{root}_left{ext}", f"{root}_right{ext}"

    @staticmethod
    def _concat_entry(audio_file: str) -> str:
        """Return one ``file '...'`` line for an ffmpeg concat list.

        The path is made absolute because the list lives in a temporary
        directory and the concat demuxer resolves relative entries against
        the list file's location. Single quotes are escaped as ``'\\''`` so
        they do not terminate the quoted path.
        """
        absolute = os.path.abspath(audio_file)
        escaped = absolute.replace("'", "'\\''")
        return f"file '{escaped}'\n"

    @staticmethod
    def convert_audio(
        input_path: str,
        output_path: str,
        sample_rate: int = 16000,
        mono: bool = True,
        format: str = "wav",
    ) -> str:
        """
        Convert audio file to different format.

        Args:
            input_path: Input audio file
            output_path: Output audio file (overwritten if it exists)
            sample_rate: Target sample rate
            mono: Convert to mono
            format: Output format; ``"wav"`` selects the ``pcm_s16le`` codec,
                any other value is passed to ffmpeg as the audio codec name

        Returns:
            Path to converted file

        Raises:
            RuntimeError: If ffmpeg exits with a non-zero status.
        """
        cmd = AudioUtils._convert_command(input_path, output_path, sample_rate, mono, format)
        AudioUtils._run_tool(cmd, "Audio conversion failed")
        return output_path

    @staticmethod
    def get_audio_duration(audio_path: str) -> float:
        """Get duration of audio file in seconds.

        Args:
            audio_path: Audio file to inspect with ffprobe

        Returns:
            Container duration reported by ffprobe, in seconds

        Raises:
            RuntimeError: If ffprobe exits with a non-zero status.
            ValueError: If ffprobe's output cannot be parsed as a number.
        """
        cmd = [
            "ffprobe",
            "-v", "error",
            "-show_entries", "format=duration",
            # Print the bare value only, without section wrappers or the key.
            "-of", "default=noprint_wrappers=1:nokey=1",
            audio_path,
        ]
        result = AudioUtils._run_tool(cmd, "Failed to get duration")
        return float(result.stdout.strip())

    @staticmethod
    def normalize_audio(input_path: str, output_path: str) -> str:
        """Normalize audio volume.

        Applies ffmpeg's ``loudnorm`` filter with ``I=-16:TP=-1.5:LRA=11``.

        Args:
            input_path: Input audio file
            output_path: Output audio file (overwritten if it exists)

        Returns:
            Path to normalized file

        Raises:
            RuntimeError: If ffmpeg exits with a non-zero status.
        """
        cmd = [
            "ffmpeg", "-y",
            "-i", input_path,
            "-af", "loudnorm=I=-16:TP=-1.5:LRA=11",
            output_path,
        ]
        AudioUtils._run_tool(cmd, "Audio normalization failed")
        return output_path

    @staticmethod
    def trim_silence(
        input_path: str,
        output_path: str,
        threshold: float = -40,
        min_duration: float = 0.5,
    ) -> str:
        """Trim silence from audio.

        Uses ffmpeg's ``silenceremove`` filter with ``start_periods=1``, i.e.
        only silence at the start of the file is removed.

        Args:
            input_path: Input audio file
            output_path: Output audio file (overwritten if it exists)
            threshold: Silence threshold in dB
            min_duration: Value passed as ``start_duration``, in seconds

        Returns:
            Path to trimmed file

        Raises:
            RuntimeError: If ffmpeg exits with a non-zero status.
        """
        # "speech" is not a detection mode documented for silenceremove;
        # "rms" is the filter's documented default.
        silence_filter = (
            "silenceremove=start_periods=1"
            f":start_duration={min_duration}"
            f":start_threshold={threshold}dB"
            ":detection=rms"
        )
        cmd = [
            "ffmpeg", "-y",
            "-i", input_path,
            "-af", silence_filter,
            output_path,
        ]
        AudioUtils._run_tool(cmd, "Silence trim failed")
        return output_path

    @staticmethod
    def split_stereo(input_path: str) -> tuple:
        """Split stereo audio to two mono files.

        The outputs are written next to the input as ``<name>_left<ext>`` and
        ``<name>_right<ext>``.

        Args:
            input_path: Stereo input audio file

        Returns:
            Tuple ``(left_path, right_path)``

        Raises:
            RuntimeError: If ffmpeg exits with a non-zero status for either
                channel.
        """
        left_path, right_path = AudioUtils._channel_paths(input_path)

        # c0 is the first (left) input channel, c1 the second (right).
        for source_channel, target_path in (("c0", left_path), ("c1", right_path)):
            cmd = [
                "ffmpeg", "-y", "-i", input_path,
                "-af", f"pan=mono|c0={source_channel}",
                target_path,
            ]
            AudioUtils._run_tool(cmd, "Stereo split failed")

        return left_path, right_path

    @staticmethod
    def concatenate_audio(audio_files: list, output_path: str) -> str:
        """Concatenate multiple audio files.

        Writes a temporary list file for ffmpeg's concat demuxer and copies
        the streams without re-encoding (``-c copy``), so the inputs are
        expected to share the same codec and parameters.

        Args:
            audio_files: Paths of the files to join, in output order
            output_path: Output audio file (overwritten if it exists)

        Returns:
            Path to concatenated file

        Raises:
            RuntimeError: If ffmpeg exits with a non-zero status.
        """
        list_file = tempfile.NamedTemporaryFile(
            mode="w", suffix=".txt", delete=False, encoding="utf-8"
        )
        try:
            with list_file:
                list_file.writelines(
                    AudioUtils._concat_entry(audio_file) for audio_file in audio_files
                )

            cmd = [
                "ffmpeg", "-y",
                "-f", "concat",
                # Absolute paths are rejected unless safe mode is turned off.
                "-safe", "0",
                "-i", list_file.name,
                "-c", "copy",
                output_path,
            ]
            AudioUtils._run_tool(cmd, "Audio concatenation failed")
        finally:
            # Remove the list even when writing it or launching ffmpeg fails.
            os.unlink(list_file.name)

        return output_path
|