speaker-encoder-pipeline 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- speaker_encoder_pipeline-0.1.0/LICENSE +21 -0
- speaker_encoder_pipeline-0.1.0/MANIFEST.in +19 -0
- speaker_encoder_pipeline-0.1.0/PKG-INFO +321 -0
- speaker_encoder_pipeline-0.1.0/README.md +270 -0
- speaker_encoder_pipeline-0.1.0/data/__init__.py +0 -0
- speaker_encoder_pipeline-0.1.0/data/augmentations.py +67 -0
- speaker_encoder_pipeline-0.1.0/data/dataset.py +73 -0
- speaker_encoder_pipeline-0.1.0/data/preprocessor.py +622 -0
- speaker_encoder_pipeline-0.1.0/data/sampler.py +75 -0
- speaker_encoder_pipeline-0.1.0/model/__init__.py +0 -0
- speaker_encoder_pipeline-0.1.0/model/cadence_branch.py +93 -0
- speaker_encoder_pipeline-0.1.0/model/encoder.py +98 -0
- speaker_encoder_pipeline-0.1.0/model/fusion.py +83 -0
- speaker_encoder_pipeline-0.1.0/model/loss.py +176 -0
- speaker_encoder_pipeline-0.1.0/model/ssl_branch.py +67 -0
- speaker_encoder_pipeline-0.1.0/model/timbre_branch.py +137 -0
- speaker_encoder_pipeline-0.1.0/pyproject.toml +129 -0
- speaker_encoder_pipeline-0.1.0/pytest.ini +41 -0
- speaker_encoder_pipeline-0.1.0/requirements-dev.txt +22 -0
- speaker_encoder_pipeline-0.1.0/requirements.txt +20 -0
- speaker_encoder_pipeline-0.1.0/setup.cfg +4 -0
- speaker_encoder_pipeline-0.1.0/setup.py +65 -0
- speaker_encoder_pipeline-0.1.0/speaker_encoder_pipeline.egg-info/PKG-INFO +321 -0
- speaker_encoder_pipeline-0.1.0/speaker_encoder_pipeline.egg-info/SOURCES.txt +36 -0
- speaker_encoder_pipeline-0.1.0/speaker_encoder_pipeline.egg-info/dependency_links.txt +1 -0
- speaker_encoder_pipeline-0.1.0/speaker_encoder_pipeline.egg-info/entry_points.txt +5 -0
- speaker_encoder_pipeline-0.1.0/speaker_encoder_pipeline.egg-info/requires.txt +24 -0
- speaker_encoder_pipeline-0.1.0/speaker_encoder_pipeline.egg-info/top_level.txt +4 -0
- speaker_encoder_pipeline-0.1.0/tests/conftest.py +52 -0
- speaker_encoder_pipeline-0.1.0/tests/test_data.py +130 -0
- speaker_encoder_pipeline-0.1.0/tests/test_model.py +165 -0
- speaker_encoder_pipeline-0.1.0/tests/test_package.py +112 -0
- speaker_encoder_pipeline-0.1.0/tests/test_train.py +145 -0
- speaker_encoder_pipeline-0.1.0/train/__init__.py +0 -0
- speaker_encoder_pipeline-0.1.0/train/dataloader.py +169 -0
- speaker_encoder_pipeline-0.1.0/utils/__init__.py +0 -0
- speaker_encoder_pipeline-0.1.0/utils/audio_utils.py +105 -0
- speaker_encoder_pipeline-0.1.0/version.txt +1 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Speaker Encoder Pipeline Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
include README.md
|
|
2
|
+
include LICENSE
|
|
3
|
+
include version.txt
|
|
4
|
+
include requirements.txt
|
|
5
|
+
include requirements-dev.txt
|
|
6
|
+
include pytest.ini
|
|
7
|
+
|
|
8
|
+
recursive-include model *.py
|
|
9
|
+
recursive-include data *.py
|
|
10
|
+
recursive-include train *.py
|
|
11
|
+
recursive-include utils *.py
|
|
12
|
+
recursive-include tests *.py
|
|
13
|
+
|
|
14
|
+
recursive-exclude * __pycache__
|
|
15
|
+
recursive-exclude * *.pyc
|
|
16
|
+
recursive-exclude * *.pyo
|
|
17
|
+
recursive-exclude * .DS_Store
|
|
18
|
+
recursive-exclude wandb *
|
|
19
|
+
recursive-exclude models *
|
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: speaker-encoder-pipeline
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Multilingual Speaker Encoder with Multi-branch Architecture for Few-shot Voice Synthesis
|
|
5
|
+
Home-page: https://github.com/yourusername/Zero_shotVoiceClone
|
|
6
|
+
Author: Your Name
|
|
7
|
+
Author-email: Your Name <your.email@example.com>
|
|
8
|
+
License: MIT
|
|
9
|
+
Project-URL: Homepage, https://github.com/yourusername/Zero_shotVoiceClone
|
|
10
|
+
Project-URL: Bug Reports, https://github.com/yourusername/Zero_shotVoiceClone/issues
|
|
11
|
+
Project-URL: Source Code, https://github.com/yourusername/Zero_shotVoiceClone
|
|
12
|
+
Keywords: speaker-encoder,voice-synthesis,multilingual,few-shot,deep-learning
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
|
|
22
|
+
Requires-Python: >=3.9
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: torch>=2.0.0
|
|
26
|
+
Requires-Dist: torchaudio>=2.0.1
|
|
27
|
+
Requires-Dist: librosa>=0.9.0
|
|
28
|
+
Requires-Dist: numpy>=1.21.0
|
|
29
|
+
Requires-Dist: pandas>=1.3.0
|
|
30
|
+
Requires-Dist: scikit-learn>=1.0.0
|
|
31
|
+
Requires-Dist: tqdm>=4.60.0
|
|
32
|
+
Requires-Dist: wandb>=0.13.0
|
|
33
|
+
Provides-Extra: dev
|
|
34
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
35
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
36
|
+
Requires-Dist: pytest-xdist>=3.0.0; extra == "dev"
|
|
37
|
+
Requires-Dist: black>=22.0.0; extra == "dev"
|
|
38
|
+
Requires-Dist: flake8>=4.0.0; extra == "dev"
|
|
39
|
+
Requires-Dist: isort>=5.0.0; extra == "dev"
|
|
40
|
+
Requires-Dist: mypy>=0.990; extra == "dev"
|
|
41
|
+
Requires-Dist: build>=0.10.0; extra == "dev"
|
|
42
|
+
Requires-Dist: twine>=4.0.0; extra == "dev"
|
|
43
|
+
Provides-Extra: test
|
|
44
|
+
Requires-Dist: pytest>=7.0.0; extra == "test"
|
|
45
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "test"
|
|
46
|
+
Requires-Dist: pytest-xdist>=3.0.0; extra == "test"
|
|
47
|
+
Dynamic: author
|
|
48
|
+
Dynamic: home-page
|
|
49
|
+
Dynamic: license-file
|
|
50
|
+
Dynamic: requires-python
|
|
51
|
+
|
|
52
|
+
# Speaker Encoder Pipeline
|
|
53
|
+
|
|
54
|
+
A multilingual speaker encoder with multi-branch architecture for few-shot voice synthesis. This package provides a robust speaker encoding system that captures timbre, cadence, and self-supervised learning features.
|
|
55
|
+
|
|
56
|
+
## Features
|
|
57
|
+
|
|
58
|
+
- **Multi-branch Architecture**: Combines timbre, cadence, and SSL features
|
|
59
|
+
- **Multilingual Support**: English and Tamil language support
|
|
60
|
+
- **Few-shot Learning**: Efficient speaker encoding with minimal audio samples
|
|
61
|
+
- **Modular Design**: Easy to extend and customize
|
|
62
|
+
- **CI/CD Ready**: Fully configured with GitHub Actions
|
|
63
|
+
|
|
64
|
+
## Installation
|
|
65
|
+
|
|
66
|
+
### From PyPI (when published)
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
pip install speaker-encoder-pipeline
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### From Source
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
git clone https://github.com/yourusername/Zero_shotVoiceClone.git
|
|
76
|
+
cd Zero_shotVoiceClone
|
|
77
|
+
|
|
78
|
+
# Install in development mode
|
|
79
|
+
pip install -e ".[dev]"
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Quick Start
|
|
83
|
+
|
|
84
|
+
### Basic Usage
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
from model.encoder import MultilingualSpeakerEncoder
|
|
88
|
+
from config import SAMPLE_RATE
|
|
89
|
+
import torch
|
|
90
|
+
|
|
91
|
+
# Initialize model
|
|
92
|
+
encoder = MultilingualSpeakerEncoder(config)
|
|
93
|
+
|
|
94
|
+
# Prepare audio (mono, 16kHz)
|
|
95
|
+
audio = torch.randn(batch_size, SAMPLE_RATE * duration)
|
|
96
|
+
|
|
97
|
+
# Get speaker embedding
|
|
98
|
+
embedding = encoder(audio)
|
|
99
|
+
print(embedding.shape) # (batch_size, embedding_dim)
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### Command-line Interface
|
|
103
|
+
|
|
104
|
+
```bash
|
|
105
|
+
# Prepare data
|
|
106
|
+
speaker-encoder-prepare --data-dir ./data
|
|
107
|
+
|
|
108
|
+
# Train model
|
|
109
|
+
speaker-encoder-train --epochs 100 --batch-size 32
|
|
110
|
+
|
|
111
|
+
# Validate speakers
|
|
112
|
+
speaker-encoder-validate --model-path ./models/best_model.pt
|
|
113
|
+
|
|
114
|
+
# Evaluate performance
|
|
115
|
+
speaker-encoder-evaluate --test-dir ./test_data
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## Development Setup
|
|
119
|
+
|
|
120
|
+
### Installation with Dev Dependencies
|
|
121
|
+
|
|
122
|
+
```bash
|
|
123
|
+
pip install -e ".[dev]"
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
### Running Tests
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
# Run all tests
|
|
130
|
+
pytest
|
|
131
|
+
|
|
132
|
+
# Run with verbose output
|
|
133
|
+
pytest -v
|
|
134
|
+
|
|
135
|
+
# Run specific test file
|
|
136
|
+
pytest tests/test_model.py
|
|
137
|
+
|
|
138
|
+
# Run specific test
|
|
139
|
+
pytest tests/test_model.py::TestMultilingualSpeakerEncoder::test_forward_pass
|
|
140
|
+
|
|
141
|
+
# Run with coverage report
|
|
142
|
+
pytest --cov=model --cov=data --cov=train --cov=utils --cov-report=html
|
|
143
|
+
|
|
144
|
+
# Run only unit tests
|
|
145
|
+
pytest -m unit
|
|
146
|
+
|
|
147
|
+
# Run without slow tests
|
|
148
|
+
pytest -m "not slow"
|
|
149
|
+
|
|
150
|
+
# Run in parallel (faster)
|
|
151
|
+
pytest -n auto
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
### Code Quality
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
# Format code with black
|
|
158
|
+
black model/ data/ train/ utils/
|
|
159
|
+
|
|
160
|
+
# Sort imports with isort
|
|
161
|
+
isort model/ data/ train/ utils/
|
|
162
|
+
|
|
163
|
+
# Lint with flake8
|
|
164
|
+
flake8 model/ data/ train/ utils/
|
|
165
|
+
|
|
166
|
+
# Type checking with mypy
|
|
167
|
+
mypy model/ data/ train/ utils/
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
## Package Structure
|
|
171
|
+
|
|
172
|
+
```
|
|
173
|
+
speaker_encoder_pipeline/
|
|
174
|
+
├── model/
|
|
175
|
+
│ ├── encoder.py # Main encoder architecture
|
|
176
|
+
│ ├── timbre_branch.py # Timbre extraction
|
|
177
|
+
│ ├── cadence_branch.py # Cadence extraction
|
|
178
|
+
│ ├── ssl_branch.py # SSL features
|
|
179
|
+
│ ├── fusion.py # Feature fusion
|
|
180
|
+
│ └── loss.py # Loss functions
|
|
181
|
+
├── data/
|
|
182
|
+
│ ├── dataset.py # Dataset classes
|
|
183
|
+
│ ├── preprocessor.py # Audio preprocessing
|
|
184
|
+
│ ├── augmentations.py # Data augmentation
|
|
185
|
+
│ ├── sampler.py # Sampling strategies
|
|
186
|
+
│ └── __init__.py
|
|
187
|
+
├── train/
|
|
188
|
+
│ ├── dataloader.py # Data loading
|
|
189
|
+
│ └── __init__.py
|
|
190
|
+
├── utils/
|
|
191
|
+
│ ├── audio_utils.py # Audio utilities
|
|
192
|
+
│ └── __init__.py
|
|
193
|
+
├── tests/
|
|
194
|
+
│ ├── test_model.py # Model tests
|
|
195
|
+
│ ├── test_data.py # Data tests
|
|
196
|
+
│ ├── test_train.py # Training tests
|
|
197
|
+
│ ├── test_package.py # Package structure tests
|
|
198
|
+
│ └── conftest.py # Pytest configuration
|
|
199
|
+
├── config.py # Configuration
|
|
200
|
+
├── setup.py # Package setup
|
|
201
|
+
├── pyproject.toml # Modern Python packaging
|
|
202
|
+
├── pytest.ini # Pytest configuration
|
|
203
|
+
├── requirements.txt # Core dependencies
|
|
204
|
+
├── requirements-dev.txt # Development dependencies
|
|
205
|
+
└── README.md # This file
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
## Configuration
|
|
209
|
+
|
|
210
|
+
Edit `config.py` to customize:
|
|
211
|
+
|
|
212
|
+
- Audio parameters (sample rate, channels, format)
|
|
213
|
+
- Dataset paths and sources
|
|
214
|
+
- Model architecture parameters
|
|
215
|
+
- Training hyperparameters
|
|
216
|
+
|
|
217
|
+
## Publishing to PyPI
|
|
218
|
+
|
|
219
|
+
### 1. Update Version
|
|
220
|
+
|
|
221
|
+
```bash
|
|
222
|
+
# Update version in version.txt
|
|
223
|
+
echo "0.2.0" > version.txt
|
|
224
|
+
|
|
225
|
+
# Commit changes
|
|
226
|
+
git add version.txt
|
|
227
|
+
git commit -m "Bump version to 0.2.0"
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
### 2. Create a Git Tag
|
|
231
|
+
|
|
232
|
+
```bash
|
|
233
|
+
git tag -a v0.2.0 -m "Release version 0.2.0"
|
|
234
|
+
git push origin v0.2.0
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
### 3. GitHub Actions will automatically:
|
|
238
|
+
- Run all tests on multiple Python versions and OSes
|
|
239
|
+
- Build distribution packages (wheel and sdist)
|
|
240
|
+
- Publish to PyPI (requires `PYPI_API_TOKEN` secret in GitHub)
|
|
241
|
+
- Create a GitHub Release
|
|
242
|
+
|
|
243
|
+
### 4. Configure PyPI Token (one-time setup)
|
|
244
|
+
|
|
245
|
+
In your GitHub repository:
|
|
246
|
+
1. Go to **Settings → Secrets and variables → Actions**
|
|
247
|
+
2. Add a new secret: `PYPI_API_TOKEN`
|
|
248
|
+
3. Generate token at https://pypi.org/manage/account/token/
|
|
249
|
+
|
|
250
|
+
## Testing
|
|
251
|
+
|
|
252
|
+
The project uses pytest with the following markers:
|
|
253
|
+
|
|
254
|
+
- `@pytest.mark.unit` - Unit tests
|
|
255
|
+
- `@pytest.mark.integration` - Integration tests
|
|
256
|
+
- `@pytest.mark.slow` - Slow tests
|
|
257
|
+
- `@pytest.mark.gpu` - GPU-required tests
|
|
258
|
+
- `@pytest.mark.model` - Model-specific tests
|
|
259
|
+
- `@pytest.mark.data` - Data processing tests
|
|
260
|
+
- `@pytest.mark.train` - Training tests
|
|
261
|
+
|
|
262
|
+
## CI/CD Workflow
|
|
263
|
+
|
|
264
|
+
The GitHub Actions workflow (`python-package.yml`):
|
|
265
|
+
|
|
266
|
+
1. **Test Job**: Runs on every push and pull request
|
|
267
|
+
- Tests on Ubuntu, Windows, and macOS
|
|
268
|
+
- Python 3.9, 3.10, 3.11
|
|
269
|
+
- Linting with flake8
|
|
270
|
+
- Code formatting with black
|
|
271
|
+
- Import sorting with isort
|
|
272
|
+
- Coverage reports to Codecov
|
|
273
|
+
|
|
274
|
+
2. **Build and Publish Job**: Runs on version tags
|
|
275
|
+
- Builds distribution packages
|
|
276
|
+
- Validates with twine
|
|
277
|
+
- Publishes to PyPI
|
|
278
|
+
- Creates GitHub Release
|
|
279
|
+
|
|
280
|
+
## Contributing
|
|
281
|
+
|
|
282
|
+
1. Fork the repository
|
|
283
|
+
2. Create a feature branch: `git checkout -b feature/your-feature`
|
|
284
|
+
3. Make changes and write tests
|
|
285
|
+
4. Ensure all tests pass: `pytest`
|
|
286
|
+
5. Format code: `black` and `isort`
|
|
287
|
+
6. Commit and push to your fork
|
|
288
|
+
7. Create a Pull Request
|
|
289
|
+
|
|
290
|
+
## License
|
|
291
|
+
|
|
292
|
+
MIT License - see LICENSE file for details
|
|
293
|
+
|
|
294
|
+
## Citation
|
|
295
|
+
|
|
296
|
+
If you use this package in your research, please cite:
|
|
297
|
+
|
|
298
|
+
```bibtex
|
|
299
|
+
@software{speaker_encoder_2025,
|
|
300
|
+
title={Speaker Encoder Pipeline},
|
|
301
|
+
author={Your Name},
|
|
302
|
+
year={2025},
|
|
303
|
+
url={https://github.com/yourusername/Zero_shotVoiceClone}
|
|
304
|
+
}
|
|
305
|
+
```
|
|
306
|
+
|
|
307
|
+
## Support
|
|
308
|
+
|
|
309
|
+
For issues and questions:
|
|
310
|
+
- Open an issue on GitHub
|
|
311
|
+
- Check existing documentation
|
|
312
|
+
- Review test cases for usage examples
|
|
313
|
+
|
|
314
|
+
## Changelog
|
|
315
|
+
|
|
316
|
+
### Version 0.1.0 (Initial Release)
|
|
317
|
+
- Multi-branch speaker encoder
|
|
318
|
+
- Multilingual support (English, Tamil)
|
|
319
|
+
- Complete training pipeline
|
|
320
|
+
- CI/CD with GitHub Actions
|
|
321
|
+
- PyPI packaging support
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
# Speaker Encoder Pipeline
|
|
2
|
+
|
|
3
|
+
A multilingual speaker encoder with multi-branch architecture for few-shot voice synthesis. This package provides a robust speaker encoding system that captures timbre, cadence, and self-supervised learning features.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Multi-branch Architecture**: Combines timbre, cadence, and SSL features
|
|
8
|
+
- **Multilingual Support**: English and Tamil language support
|
|
9
|
+
- **Few-shot Learning**: Efficient speaker encoding with minimal audio samples
|
|
10
|
+
- **Modular Design**: Easy to extend and customize
|
|
11
|
+
- **CI/CD Ready**: Fully configured with GitHub Actions
|
|
12
|
+
|
|
13
|
+
## Installation
|
|
14
|
+
|
|
15
|
+
### From PyPI (when published)
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install speaker-encoder-pipeline
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
### From Source
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
git clone https://github.com/yourusername/Zero_shotVoiceClone.git
|
|
25
|
+
cd Zero_shotVoiceClone
|
|
26
|
+
|
|
27
|
+
# Install in development mode
|
|
28
|
+
pip install -e ".[dev]"
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Quick Start
|
|
32
|
+
|
|
33
|
+
### Basic Usage
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
from model.encoder import MultilingualSpeakerEncoder
|
|
37
|
+
from config import SAMPLE_RATE
|
|
38
|
+
import torch
|
|
39
|
+
|
|
40
|
+
# Initialize model
|
|
41
|
+
encoder = MultilingualSpeakerEncoder(config)
|
|
42
|
+
|
|
43
|
+
# Prepare audio (mono, 16kHz)
|
|
44
|
+
audio = torch.randn(batch_size, SAMPLE_RATE * duration)
|
|
45
|
+
|
|
46
|
+
# Get speaker embedding
|
|
47
|
+
embedding = encoder(audio)
|
|
48
|
+
print(embedding.shape) # (batch_size, embedding_dim)
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### Command-line Interface
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
# Prepare data
|
|
55
|
+
speaker-encoder-prepare --data-dir ./data
|
|
56
|
+
|
|
57
|
+
# Train model
|
|
58
|
+
speaker-encoder-train --epochs 100 --batch-size 32
|
|
59
|
+
|
|
60
|
+
# Validate speakers
|
|
61
|
+
speaker-encoder-validate --model-path ./models/best_model.pt
|
|
62
|
+
|
|
63
|
+
# Evaluate performance
|
|
64
|
+
speaker-encoder-evaluate --test-dir ./test_data
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Development Setup
|
|
68
|
+
|
|
69
|
+
### Installation with Dev Dependencies
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
pip install -e ".[dev]"
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
### Running Tests
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
# Run all tests
|
|
79
|
+
pytest
|
|
80
|
+
|
|
81
|
+
# Run with verbose output
|
|
82
|
+
pytest -v
|
|
83
|
+
|
|
84
|
+
# Run specific test file
|
|
85
|
+
pytest tests/test_model.py
|
|
86
|
+
|
|
87
|
+
# Run specific test
|
|
88
|
+
pytest tests/test_model.py::TestMultilingualSpeakerEncoder::test_forward_pass
|
|
89
|
+
|
|
90
|
+
# Run with coverage report
|
|
91
|
+
pytest --cov=model --cov=data --cov=train --cov=utils --cov-report=html
|
|
92
|
+
|
|
93
|
+
# Run only unit tests
|
|
94
|
+
pytest -m unit
|
|
95
|
+
|
|
96
|
+
# Run without slow tests
|
|
97
|
+
pytest -m "not slow"
|
|
98
|
+
|
|
99
|
+
# Run in parallel (faster)
|
|
100
|
+
pytest -n auto
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Code Quality
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
# Format code with black
|
|
107
|
+
black model/ data/ train/ utils/
|
|
108
|
+
|
|
109
|
+
# Sort imports with isort
|
|
110
|
+
isort model/ data/ train/ utils/
|
|
111
|
+
|
|
112
|
+
# Lint with flake8
|
|
113
|
+
flake8 model/ data/ train/ utils/
|
|
114
|
+
|
|
115
|
+
# Type checking with mypy
|
|
116
|
+
mypy model/ data/ train/ utils/
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
## Package Structure
|
|
120
|
+
|
|
121
|
+
```
|
|
122
|
+
speaker_encoder_pipeline/
|
|
123
|
+
├── model/
|
|
124
|
+
│ ├── encoder.py # Main encoder architecture
|
|
125
|
+
│ ├── timbre_branch.py # Timbre extraction
|
|
126
|
+
│ ├── cadence_branch.py # Cadence extraction
|
|
127
|
+
│ ├── ssl_branch.py # SSL features
|
|
128
|
+
│ ├── fusion.py # Feature fusion
|
|
129
|
+
│ └── loss.py # Loss functions
|
|
130
|
+
├── data/
|
|
131
|
+
│ ├── dataset.py # Dataset classes
|
|
132
|
+
│ ├── preprocessor.py # Audio preprocessing
|
|
133
|
+
│ ├── augmentations.py # Data augmentation
|
|
134
|
+
│ ├── sampler.py # Sampling strategies
|
|
135
|
+
│ └── __init__.py
|
|
136
|
+
├── train/
|
|
137
|
+
│ ├── dataloader.py # Data loading
|
|
138
|
+
│ └── __init__.py
|
|
139
|
+
├── utils/
|
|
140
|
+
│ ├── audio_utils.py # Audio utilities
|
|
141
|
+
│ └── __init__.py
|
|
142
|
+
├── tests/
|
|
143
|
+
│ ├── test_model.py # Model tests
|
|
144
|
+
│ ├── test_data.py # Data tests
|
|
145
|
+
│ ├── test_train.py # Training tests
|
|
146
|
+
│ ├── test_package.py # Package structure tests
|
|
147
|
+
│ └── conftest.py # Pytest configuration
|
|
148
|
+
├── config.py # Configuration
|
|
149
|
+
├── setup.py # Package setup
|
|
150
|
+
├── pyproject.toml # Modern Python packaging
|
|
151
|
+
├── pytest.ini # Pytest configuration
|
|
152
|
+
├── requirements.txt # Core dependencies
|
|
153
|
+
├── requirements-dev.txt # Development dependencies
|
|
154
|
+
└── README.md # This file
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
## Configuration
|
|
158
|
+
|
|
159
|
+
Edit `config.py` to customize:
|
|
160
|
+
|
|
161
|
+
- Audio parameters (sample rate, channels, format)
|
|
162
|
+
- Dataset paths and sources
|
|
163
|
+
- Model architecture parameters
|
|
164
|
+
- Training hyperparameters
|
|
165
|
+
|
|
166
|
+
## Publishing to PyPI
|
|
167
|
+
|
|
168
|
+
### 1. Update Version
|
|
169
|
+
|
|
170
|
+
```bash
|
|
171
|
+
# Update version in version.txt
|
|
172
|
+
echo "0.2.0" > version.txt
|
|
173
|
+
|
|
174
|
+
# Commit changes
|
|
175
|
+
git add version.txt
|
|
176
|
+
git commit -m "Bump version to 0.2.0"
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
### 2. Create a Git Tag
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
git tag -a v0.2.0 -m "Release version 0.2.0"
|
|
183
|
+
git push origin v0.2.0
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
### 3. GitHub Actions will automatically:
|
|
187
|
+
- Run all tests on multiple Python versions and OSes
|
|
188
|
+
- Build distribution packages (wheel and sdist)
|
|
189
|
+
- Publish to PyPI (requires `PYPI_API_TOKEN` secret in GitHub)
|
|
190
|
+
- Create a GitHub Release
|
|
191
|
+
|
|
192
|
+
### 4. Configure PyPI Token (one-time setup)
|
|
193
|
+
|
|
194
|
+
In your GitHub repository:
|
|
195
|
+
1. Go to **Settings → Secrets and variables → Actions**
|
|
196
|
+
2. Add a new secret: `PYPI_API_TOKEN`
|
|
197
|
+
3. Generate token at https://pypi.org/manage/account/token/
|
|
198
|
+
|
|
199
|
+
## Testing
|
|
200
|
+
|
|
201
|
+
The project uses pytest with the following markers:
|
|
202
|
+
|
|
203
|
+
- `@pytest.mark.unit` - Unit tests
|
|
204
|
+
- `@pytest.mark.integration` - Integration tests
|
|
205
|
+
- `@pytest.mark.slow` - Slow tests
|
|
206
|
+
- `@pytest.mark.gpu` - GPU-required tests
|
|
207
|
+
- `@pytest.mark.model` - Model-specific tests
|
|
208
|
+
- `@pytest.mark.data` - Data processing tests
|
|
209
|
+
- `@pytest.mark.train` - Training tests
|
|
210
|
+
|
|
211
|
+
## CI/CD Workflow
|
|
212
|
+
|
|
213
|
+
The GitHub Actions workflow (`python-package.yml`):
|
|
214
|
+
|
|
215
|
+
1. **Test Job**: Runs on every push and pull request
|
|
216
|
+
- Tests on Ubuntu, Windows, and macOS
|
|
217
|
+
- Python 3.9, 3.10, 3.11
|
|
218
|
+
- Linting with flake8
|
|
219
|
+
- Code formatting with black
|
|
220
|
+
- Import sorting with isort
|
|
221
|
+
- Coverage reports to Codecov
|
|
222
|
+
|
|
223
|
+
2. **Build and Publish Job**: Runs on version tags
|
|
224
|
+
- Builds distribution packages
|
|
225
|
+
- Validates with twine
|
|
226
|
+
- Publishes to PyPI
|
|
227
|
+
- Creates GitHub Release
|
|
228
|
+
|
|
229
|
+
## Contributing
|
|
230
|
+
|
|
231
|
+
1. Fork the repository
|
|
232
|
+
2. Create a feature branch: `git checkout -b feature/your-feature`
|
|
233
|
+
3. Make changes and write tests
|
|
234
|
+
4. Ensure all tests pass: `pytest`
|
|
235
|
+
5. Format code: `black` and `isort`
|
|
236
|
+
6. Commit and push to your fork
|
|
237
|
+
7. Create a Pull Request
|
|
238
|
+
|
|
239
|
+
## License
|
|
240
|
+
|
|
241
|
+
MIT License - see LICENSE file for details
|
|
242
|
+
|
|
243
|
+
## Citation
|
|
244
|
+
|
|
245
|
+
If you use this package in your research, please cite:
|
|
246
|
+
|
|
247
|
+
```bibtex
|
|
248
|
+
@software{speaker_encoder_2025,
|
|
249
|
+
title={Speaker Encoder Pipeline},
|
|
250
|
+
author={Your Name},
|
|
251
|
+
year={2025},
|
|
252
|
+
url={https://github.com/yourusername/Zero_shotVoiceClone}
|
|
253
|
+
}
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
## Support
|
|
257
|
+
|
|
258
|
+
For issues and questions:
|
|
259
|
+
- Open an issue on GitHub
|
|
260
|
+
- Check existing documentation
|
|
261
|
+
- Review test cases for usage examples
|
|
262
|
+
|
|
263
|
+
## Changelog
|
|
264
|
+
|
|
265
|
+
### Version 0.1.0 (Initial Release)
|
|
266
|
+
- Multi-branch speaker encoder
|
|
267
|
+
- Multilingual support (English, Tamil)
|
|
268
|
+
- Complete training pipeline
|
|
269
|
+
- CI/CD with GitHub Actions
|
|
270
|
+
- PyPI packaging support
|
|
File without changes
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import torch
|
|
2
|
+
import torchaudio
|
|
3
|
+
import random
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
class AudioAugmentations:
    """On-the-fly audio augmentations for speaker encoder training.

    Calling an instance applies, in order, an optional speed perturbation,
    additive Gaussian noise at a random SNR, and an optional single-echo
    "reverb".  Every augmentation operates along the last (time) axis of the
    tensor, so 1-D waveforms and tensors with leading channel/batch axes are
    both handled.

    The settings are read from the following attributes of ``config``:
    ``NOISE_SNR_MIN``, ``NOISE_SNR_MAX``, ``REVERB_PROB``,
    ``SPEED_PERTURB_PROB`` and ``SPEED_FACTORS``.
    """

    def __init__(self, config):
        """Cache the augmentation settings found on ``config``."""
        self.config = config
        self.noise_snr_min = config.NOISE_SNR_MIN
        self.noise_snr_max = config.NOISE_SNR_MAX
        self.reverb_prob = config.REVERB_PROB
        self.speed_prob = config.SPEED_PERTURB_PROB
        self.speed_factors = config.SPEED_FACTORS

    def __call__(self, audio):
        """Return an augmented copy of ``audio``; the input is not modified.

        Args:
            audio: Waveform tensor whose last axis is time.

        Returns:
            A new tensor.  Its time axis may have a different length than the
            input when speed perturbation was applied.
        """
        audio = audio.clone()

        # Speed perturbation, applied with probability ``speed_prob``.
        if random.random() < self.speed_prob:
            factor = random.choice(self.speed_factors)
            audio = self._speed_perturb(audio, factor)

        # Noise is added on every call as long as the configured minimum SNR
        # is positive; a non-positive NOISE_SNR_MIN switches it off.
        if self.noise_snr_min > 0:
            audio = self._add_noise(audio)

        # Reverb (simplified - can be enhanced with actual RIRs).
        if random.random() < self.reverb_prob:
            audio = self._add_reverb(audio)

        return audio

    def _speed_perturb(self, audio, factor):
        """Change playback speed by nearest-lower-sample resampling.

        Samples are taken at positions ``0, factor, 2 * factor, ...`` (floored
        to integers) along the last axis, so ``factor > 1`` shortens the
        signal and ``factor < 1`` lengthens it.  No interpolation or
        anti-alias filtering is performed.

        Args:
            audio: Waveform tensor whose last axis is time.
            factor: Positive step between consecutive source positions.

        Returns:
            The resampled tensor; leading axes are preserved.
        """
        num_samples = audio.shape[-1]
        positions = torch.arange(0, num_samples, factor, device=audio.device)
        # A float position just below ``num_samples`` can round up to it, so
        # keep the bounds filter before truncating to integer indices.
        positions = positions[positions < num_samples].long()
        return audio[..., positions]

    def _add_noise(self, audio):
        """Add white Gaussian noise at a random signal-to-noise ratio.

        The SNR in dB is drawn uniformly from
        ``[noise_snr_min, noise_snr_max]`` and the noise is rescaled so that
        ``mean(audio ** 2) / mean(noise ** 2)`` matches it.  Powers are
        averaged over the whole tensor.

        Args:
            audio: Waveform tensor.

        Returns:
            ``audio + noise`` with the same shape as ``audio``.
        """
        if audio.numel() == 0:
            # Mean power of an empty tensor is NaN; nothing to add noise to.
            return audio

        noise = torch.randn_like(audio)

        snr_db = random.uniform(self.noise_snr_min, self.noise_snr_max)
        snr_power_ratio = 10 ** (snr_db / 10)

        # Scale the noise so the signal/noise power ratio hits the target.
        audio_power = torch.mean(audio ** 2)
        noise_power = torch.mean(noise ** 2)
        noise = noise * torch.sqrt(audio_power / (noise_power * snr_power_ratio))

        return audio + noise

    def _add_reverb(self, audio, delay_len=160, gain=0.3):
        """Simple reverb simulation: mix in one delayed, attenuated echo.

        This is a placeholder for a real impulse response.

        Args:
            audio: Waveform tensor whose last axis is time.
            delay_len: Echo delay in samples.  The default of 160 samples is
                10 ms at a 16 kHz sample rate.
            gain: Amplitude multiplier applied to the echo.

        Returns:
            ``audio`` plus its delayed copy, or ``audio`` unchanged when the
            signal is not longer than ``delay_len``.
        """
        num_samples = audio.shape[-1]
        if num_samples > delay_len:
            delayed = torch.zeros_like(audio)
            delayed[..., delay_len:] = audio[..., :num_samples - delay_len] * gain
            return audio + delayed
        return audio
|