speaker-encoder-pipeline 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. speaker_encoder_pipeline-0.1.0/LICENSE +21 -0
  2. speaker_encoder_pipeline-0.1.0/MANIFEST.in +19 -0
  3. speaker_encoder_pipeline-0.1.0/PKG-INFO +321 -0
  4. speaker_encoder_pipeline-0.1.0/README.md +270 -0
  5. speaker_encoder_pipeline-0.1.0/data/__init__.py +0 -0
  6. speaker_encoder_pipeline-0.1.0/data/augmentations.py +67 -0
  7. speaker_encoder_pipeline-0.1.0/data/dataset.py +73 -0
  8. speaker_encoder_pipeline-0.1.0/data/preprocessor.py +622 -0
  9. speaker_encoder_pipeline-0.1.0/data/sampler.py +75 -0
  10. speaker_encoder_pipeline-0.1.0/model/__init__.py +0 -0
  11. speaker_encoder_pipeline-0.1.0/model/cadence_branch.py +93 -0
  12. speaker_encoder_pipeline-0.1.0/model/encoder.py +98 -0
  13. speaker_encoder_pipeline-0.1.0/model/fusion.py +83 -0
  14. speaker_encoder_pipeline-0.1.0/model/loss.py +176 -0
  15. speaker_encoder_pipeline-0.1.0/model/ssl_branch.py +67 -0
  16. speaker_encoder_pipeline-0.1.0/model/timbre_branch.py +137 -0
  17. speaker_encoder_pipeline-0.1.0/pyproject.toml +129 -0
  18. speaker_encoder_pipeline-0.1.0/pytest.ini +41 -0
  19. speaker_encoder_pipeline-0.1.0/requirements-dev.txt +22 -0
  20. speaker_encoder_pipeline-0.1.0/requirements.txt +20 -0
  21. speaker_encoder_pipeline-0.1.0/setup.cfg +4 -0
  22. speaker_encoder_pipeline-0.1.0/setup.py +65 -0
  23. speaker_encoder_pipeline-0.1.0/speaker_encoder_pipeline.egg-info/PKG-INFO +321 -0
  24. speaker_encoder_pipeline-0.1.0/speaker_encoder_pipeline.egg-info/SOURCES.txt +36 -0
  25. speaker_encoder_pipeline-0.1.0/speaker_encoder_pipeline.egg-info/dependency_links.txt +1 -0
  26. speaker_encoder_pipeline-0.1.0/speaker_encoder_pipeline.egg-info/entry_points.txt +5 -0
  27. speaker_encoder_pipeline-0.1.0/speaker_encoder_pipeline.egg-info/requires.txt +24 -0
  28. speaker_encoder_pipeline-0.1.0/speaker_encoder_pipeline.egg-info/top_level.txt +4 -0
  29. speaker_encoder_pipeline-0.1.0/tests/conftest.py +52 -0
  30. speaker_encoder_pipeline-0.1.0/tests/test_data.py +130 -0
  31. speaker_encoder_pipeline-0.1.0/tests/test_model.py +165 -0
  32. speaker_encoder_pipeline-0.1.0/tests/test_package.py +112 -0
  33. speaker_encoder_pipeline-0.1.0/tests/test_train.py +145 -0
  34. speaker_encoder_pipeline-0.1.0/train/__init__.py +0 -0
  35. speaker_encoder_pipeline-0.1.0/train/dataloader.py +169 -0
  36. speaker_encoder_pipeline-0.1.0/utils/__init__.py +0 -0
  37. speaker_encoder_pipeline-0.1.0/utils/audio_utils.py +105 -0
  38. speaker_encoder_pipeline-0.1.0/version.txt +1 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Speaker Encoder Pipeline Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,19 @@
1
+ include README.md
2
+ include LICENSE
3
+ include version.txt
4
+ include requirements.txt
5
+ include requirements-dev.txt
6
+ include pytest.ini
7
+
8
+ recursive-include model *.py
9
+ recursive-include data *.py
10
+ recursive-include train *.py
11
+ recursive-include utils *.py
12
+ recursive-include tests *.py
13
+
14
+ recursive-exclude * __pycache__
15
+ recursive-exclude * *.pyc
16
+ recursive-exclude * *.pyo
17
+ recursive-exclude * .DS_Store
18
+ recursive-exclude wandb *
19
+ recursive-exclude models *
@@ -0,0 +1,321 @@
1
+ Metadata-Version: 2.4
2
+ Name: speaker-encoder-pipeline
3
+ Version: 0.1.0
4
+ Summary: Multilingual Speaker Encoder with Multi-branch Architecture for Few-shot Voice Synthesis
5
+ Home-page: https://github.com/yourusername/Zero_shotVoiceClone
6
+ Author: Your Name
7
+ Author-email: Your Name <your.email@example.com>
8
+ License: MIT
9
+ Project-URL: Homepage, https://github.com/yourusername/Zero_shotVoiceClone
10
+ Project-URL: Bug Reports, https://github.com/yourusername/Zero_shotVoiceClone/issues
11
+ Project-URL: Source Code, https://github.com/yourusername/Zero_shotVoiceClone
12
+ Keywords: speaker-encoder,voice-synthesis,multilingual,few-shot,deep-learning
13
+ Classifier: Development Status :: 3 - Alpha
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
21
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
22
+ Requires-Python: >=3.9
23
+ Description-Content-Type: text/markdown
24
+ License-File: LICENSE
25
+ Requires-Dist: torch>=2.0.0
26
+ Requires-Dist: torchaudio>=2.0.1
27
+ Requires-Dist: librosa>=0.9.0
28
+ Requires-Dist: numpy>=1.21.0
29
+ Requires-Dist: pandas>=1.3.0
30
+ Requires-Dist: scikit-learn>=1.0.0
31
+ Requires-Dist: tqdm>=4.60.0
32
+ Requires-Dist: wandb>=0.13.0
33
+ Provides-Extra: dev
34
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
35
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
36
+ Requires-Dist: pytest-xdist>=3.0.0; extra == "dev"
37
+ Requires-Dist: black>=22.0.0; extra == "dev"
38
+ Requires-Dist: flake8>=4.0.0; extra == "dev"
39
+ Requires-Dist: isort>=5.0.0; extra == "dev"
40
+ Requires-Dist: mypy>=0.990; extra == "dev"
41
+ Requires-Dist: build>=0.10.0; extra == "dev"
42
+ Requires-Dist: twine>=4.0.0; extra == "dev"
43
+ Provides-Extra: test
44
+ Requires-Dist: pytest>=7.0.0; extra == "test"
45
+ Requires-Dist: pytest-cov>=4.0.0; extra == "test"
46
+ Requires-Dist: pytest-xdist>=3.0.0; extra == "test"
47
+ Dynamic: author
48
+ Dynamic: home-page
49
+ Dynamic: license-file
50
+ Dynamic: requires-python
51
+
52
+ # Speaker Encoder Pipeline
53
+
54
+ A multilingual speaker encoder with multi-branch architecture for few-shot voice synthesis. This package provides a robust speaker encoding system that captures timbre, cadence, and self-supervised learning features.
55
+
56
+ ## Features
57
+
58
+ - **Multi-branch Architecture**: Combines timbre, cadence, and SSL features
59
+ - **Multilingual Support**: English and Tamil language support
60
+ - **Few-shot Learning**: Efficient speaker encoding with minimal audio samples
61
+ - **Modular Design**: Easy to extend and customize
62
+ - **CI/CD Ready**: Fully configured with GitHub Actions
63
+
64
+ ## Installation
65
+
66
+ ### From PyPI (when published)
67
+
68
+ ```bash
69
+ pip install speaker-encoder-pipeline
70
+ ```
71
+
72
+ ### From Source
73
+
74
+ ```bash
75
+ git clone https://github.com/yourusername/Zero_shotVoiceClone.git
76
+ cd speaker_encoder_pipeline
77
+
78
+ # Install in development mode
79
+ pip install -e ".[dev]"
80
+ ```
81
+
82
+ ## Quick Start
83
+
84
+ ### Basic Usage
85
+
86
+ ```python
87
+ from model.encoder import MultilingualSpeakerEncoder
88
+ from config import SAMPLE_RATE
89
+ import torch
90
+
91
+ # Initialize model
92
+ encoder = MultilingualSpeakerEncoder(config)
93
+
94
+ # Prepare audio (mono, 16 kHz); batch_size and duration (whole seconds) are placeholders you must define
95
+ audio = torch.randn(batch_size, SAMPLE_RATE * duration)
96
+
97
+ # Get speaker embedding
98
+ embedding = encoder(audio)
99
+ print(embedding.shape) # (batch_size, embedding_dim)
100
+ ```
101
+
102
+ ### Command-line Interface
103
+
104
+ ```bash
105
+ # Prepare data
106
+ speaker-encoder-prepare --data-dir ./data
107
+
108
+ # Train model
109
+ speaker-encoder-train --epochs 100 --batch-size 32
110
+
111
+ # Validate speakers
112
+ speaker-encoder-validate --model-path ./models/best_model.pt
113
+
114
+ # Evaluate performance
115
+ speaker-encoder-evaluate --test-dir ./test_data
116
+ ```
117
+
118
+ ## Development Setup
119
+
120
+ ### Installation with Dev Dependencies
121
+
122
+ ```bash
123
+ pip install -e ".[dev]"
124
+ ```
125
+
126
+ ### Running Tests
127
+
128
+ ```bash
129
+ # Run all tests
130
+ pytest
131
+
132
+ # Run with verbose output
133
+ pytest -v
134
+
135
+ # Run specific test file
136
+ pytest tests/test_model.py
137
+
138
+ # Run specific test
139
+ pytest tests/test_model.py::TestMultilingualSpeakerEncoder::test_forward_pass
140
+
141
+ # Run with coverage report
142
+ pytest --cov=model --cov=data --cov=train --cov=utils --cov-report=html
143
+
144
+ # Run only unit tests
145
+ pytest -m unit
146
+
147
+ # Run without slow tests
148
+ pytest -m "not slow"
149
+
150
+ # Run in parallel (faster)
151
+ pytest -n auto
152
+ ```
153
+
154
+ ### Code Quality
155
+
156
+ ```bash
157
+ # Format code with black
158
+ black model/ data/ train/ utils/
159
+
160
+ # Sort imports with isort (add --check-only to verify without modifying files)
161
+ isort model/ data/ train/ utils/
162
+
163
+ # Lint with flake8
164
+ flake8 model/ data/ train/ utils/
165
+
166
+ # Type checking with mypy
167
+ mypy model/ data/ train/ utils/
168
+ ```
169
+
170
+ ## Package Structure
171
+
172
+ ```
173
+ speaker_encoder_pipeline/
174
+ ├── model/
175
+ │ ├── encoder.py # Main encoder architecture
176
+ │ ├── timbre_branch.py # Timbre extraction
177
+ │ ├── cadence_branch.py # Cadence extraction
178
+ │ ├── ssl_branch.py # SSL features
179
+ │ ├── fusion.py # Feature fusion
180
+ │ └── loss.py # Loss functions
181
+ ├── data/
182
+ │ ├── dataset.py # Dataset classes
183
+ │ ├── preprocessor.py # Audio preprocessing
184
+ │ ├── augmentations.py # Data augmentation
185
+ │ ├── sampler.py # Sampling strategies
186
+ │ └── __init__.py
187
+ ├── train/
188
+ │ ├── dataloader.py # Data loading
189
+ │ └── __init__.py
190
+ ├── utils/
191
+ │ ├── audio_utils.py # Audio utilities
192
+ │ └── __init__.py
193
+ ├── tests/
194
+ │ ├── test_model.py # Model tests
195
+ │ ├── test_data.py # Data tests
196
+ │ ├── test_train.py # Training tests
197
+ │ ├── test_package.py # Package structure tests
198
+ │ └── conftest.py # Pytest configuration
199
+ ├── config.py # Configuration
200
+ ├── setup.py # Package setup
201
+ ├── pyproject.toml # Modern Python packaging
202
+ ├── pytest.ini # Pytest configuration
203
+ ├── requirements.txt # Core dependencies
204
+ ├── requirements-dev.txt # Development dependencies
205
+ └── README.md # This file
206
+ ```
207
+
208
+ ## Configuration
209
+
210
+ Edit `config.py` to customize:
211
+
212
+ - Audio parameters (sample rate, channels, format)
213
+ - Dataset paths and sources
214
+ - Model architecture parameters
215
+ - Training hyperparameters
216
+
217
+ ## Publishing to PyPI
218
+
219
+ ### 1. Update Version
220
+
221
+ ```bash
222
+ # Update version in version.txt
223
+ echo "0.2.0" > version.txt
224
+
225
+ # Commit changes
226
+ git add version.txt
227
+ git commit -m "Bump version to 0.2.0"
228
+ ```
229
+
230
+ ### 2. Create a Git Tag
231
+
232
+ ```bash
233
+ git tag -a v0.2.0 -m "Release version 0.2.0"
234
+ git push origin v0.2.0
235
+ ```
236
+
237
+ ### 3. GitHub Actions will automatically:
238
+ - Run all tests on multiple Python versions and OSes
239
+ - Build distribution packages (wheel and sdist)
240
+ - Publish to PyPI (requires `PYPI_API_TOKEN` secret in GitHub)
241
+ - Create a GitHub Release
242
+
243
+ ### 4. Configure PyPI Token (one-time setup)
244
+
245
+ In your GitHub repository:
246
+ 1. Go to **Settings → Secrets and variables → Actions**
247
+ 2. Add a new secret: `PYPI_API_TOKEN`
248
+ 3. Generate token at https://pypi.org/manage/account/token/
249
+
250
+ ## Testing
251
+
252
+ The project uses pytest with the following markers:
253
+
254
+ - `@pytest.mark.unit` - Unit tests
255
+ - `@pytest.mark.integration` - Integration tests
256
+ - `@pytest.mark.slow` - Slow tests
257
+ - `@pytest.mark.gpu` - GPU-required tests
258
+ - `@pytest.mark.model` - Model-specific tests
259
+ - `@pytest.mark.data` - Data processing tests
260
+ - `@pytest.mark.train` - Training tests
261
+
262
+ ## CI/CD Workflow
263
+
264
+ The GitHub Actions workflow (`python-package.yml`):
265
+
266
+ 1. **Test Job**: Runs on every push and pull request
267
+ - Tests on Ubuntu, Windows, and macOS
268
+ - Python 3.9, 3.10, 3.11
269
+ - Linting with flake8
270
+ - Code formatting with black
271
+ - Import sorting with isort
272
+ - Coverage reports to Codecov
273
+
274
+ 2. **Build and Publish Job**: Runs on version tags
275
+ - Builds distribution packages
276
+ - Validates with twine
277
+ - Publishes to PyPI
278
+ - Creates GitHub Release
279
+
280
+ ## Contributing
281
+
282
+ 1. Fork the repository
283
+ 2. Create a feature branch: `git checkout -b feature/your-feature`
284
+ 3. Make changes and write tests
285
+ 4. Ensure all tests pass: `pytest`
286
+ 5. Format code: `black` and `isort`
287
+ 6. Commit and push to your fork
288
+ 7. Create a Pull Request
289
+
290
+ ## License
291
+
292
+ MIT License - see LICENSE file for details
293
+
294
+ ## Citation
295
+
296
+ If you use this package in your research, please cite:
297
+
298
+ ```bibtex
299
+ @software{speaker_encoder_2025,
300
+ title={Speaker Encoder Pipeline},
301
+ author={Your Name},
302
+ year={2025},
303
+ url={https://github.com/yourusername/Zero_shotVoiceClone}
304
+ }
305
+ ```
306
+
307
+ ## Support
308
+
309
+ For issues and questions:
310
+ - Open an issue on GitHub
311
+ - Check existing documentation
312
+ - Review test cases for usage examples
313
+
314
+ ## Changelog
315
+
316
+ ### Version 0.1.0 (Initial Release)
317
+ - Multi-branch speaker encoder
318
+ - Multilingual support (English, Tamil)
319
+ - Complete training pipeline
320
+ - CI/CD with GitHub Actions
321
+ - PyPI packaging support
@@ -0,0 +1,270 @@
1
+ # Speaker Encoder Pipeline
2
+
3
+ A multilingual speaker encoder with multi-branch architecture for few-shot voice synthesis. This package provides a robust speaker encoding system that captures timbre, cadence, and self-supervised learning features.
4
+
5
+ ## Features
6
+
7
+ - **Multi-branch Architecture**: Combines timbre, cadence, and SSL features
8
+ - **Multilingual Support**: English and Tamil language support
9
+ - **Few-shot Learning**: Efficient speaker encoding with minimal audio samples
10
+ - **Modular Design**: Easy to extend and customize
11
+ - **CI/CD Ready**: Fully configured with GitHub Actions
12
+
13
+ ## Installation
14
+
15
+ ### From PyPI (when published)
16
+
17
+ ```bash
18
+ pip install speaker-encoder-pipeline
19
+ ```
20
+
21
+ ### From Source
22
+
23
+ ```bash
24
+ git clone https://github.com/yourusername/Zero_shotVoiceClone.git
25
+ cd speaker_encoder_pipeline
26
+
27
+ # Install in development mode
28
+ pip install -e ".[dev]"
29
+ ```
30
+
31
+ ## Quick Start
32
+
33
+ ### Basic Usage
34
+
35
+ ```python
36
+ from model.encoder import MultilingualSpeakerEncoder
37
+ from config import SAMPLE_RATE
38
+ import torch
39
+
40
+ # Initialize model
41
+ encoder = MultilingualSpeakerEncoder(config)
42
+
43
+ # Prepare audio (mono, 16 kHz); batch_size and duration (whole seconds) are placeholders you must define
44
+ audio = torch.randn(batch_size, SAMPLE_RATE * duration)
45
+
46
+ # Get speaker embedding
47
+ embedding = encoder(audio)
48
+ print(embedding.shape) # (batch_size, embedding_dim)
49
+ ```
50
+
51
+ ### Command-line Interface
52
+
53
+ ```bash
54
+ # Prepare data
55
+ speaker-encoder-prepare --data-dir ./data
56
+
57
+ # Train model
58
+ speaker-encoder-train --epochs 100 --batch-size 32
59
+
60
+ # Validate speakers
61
+ speaker-encoder-validate --model-path ./models/best_model.pt
62
+
63
+ # Evaluate performance
64
+ speaker-encoder-evaluate --test-dir ./test_data
65
+ ```
66
+
67
+ ## Development Setup
68
+
69
+ ### Installation with Dev Dependencies
70
+
71
+ ```bash
72
+ pip install -e ".[dev]"
73
+ ```
74
+
75
+ ### Running Tests
76
+
77
+ ```bash
78
+ # Run all tests
79
+ pytest
80
+
81
+ # Run with verbose output
82
+ pytest -v
83
+
84
+ # Run specific test file
85
+ pytest tests/test_model.py
86
+
87
+ # Run specific test
88
+ pytest tests/test_model.py::TestMultilingualSpeakerEncoder::test_forward_pass
89
+
90
+ # Run with coverage report
91
+ pytest --cov=model --cov=data --cov=train --cov=utils --cov-report=html
92
+
93
+ # Run only unit tests
94
+ pytest -m unit
95
+
96
+ # Run without slow tests
97
+ pytest -m "not slow"
98
+
99
+ # Run in parallel (faster)
100
+ pytest -n auto
101
+ ```
102
+
103
+ ### Code Quality
104
+
105
+ ```bash
106
+ # Format code with black
107
+ black model/ data/ train/ utils/
108
+
109
+ # Sort imports with isort (add --check-only to verify without modifying files)
110
+ isort model/ data/ train/ utils/
111
+
112
+ # Lint with flake8
113
+ flake8 model/ data/ train/ utils/
114
+
115
+ # Type checking with mypy
116
+ mypy model/ data/ train/ utils/
117
+ ```
118
+
119
+ ## Package Structure
120
+
121
+ ```
122
+ speaker_encoder_pipeline/
123
+ ├── model/
124
+ │ ├── encoder.py # Main encoder architecture
125
+ │ ├── timbre_branch.py # Timbre extraction
126
+ │ ├── cadence_branch.py # Cadence extraction
127
+ │ ├── ssl_branch.py # SSL features
128
+ │ ├── fusion.py # Feature fusion
129
+ │ └── loss.py # Loss functions
130
+ ├── data/
131
+ │ ├── dataset.py # Dataset classes
132
+ │ ├── preprocessor.py # Audio preprocessing
133
+ │ ├── augmentations.py # Data augmentation
134
+ │ ├── sampler.py # Sampling strategies
135
+ │ └── __init__.py
136
+ ├── train/
137
+ │ ├── dataloader.py # Data loading
138
+ │ └── __init__.py
139
+ ├── utils/
140
+ │ ├── audio_utils.py # Audio utilities
141
+ │ └── __init__.py
142
+ ├── tests/
143
+ │ ├── test_model.py # Model tests
144
+ │ ├── test_data.py # Data tests
145
+ │ ├── test_train.py # Training tests
146
+ │ ├── test_package.py # Package structure tests
147
+ │ └── conftest.py # Pytest configuration
148
+ ├── config.py # Configuration
149
+ ├── setup.py # Package setup
150
+ ├── pyproject.toml # Modern Python packaging
151
+ ├── pytest.ini # Pytest configuration
152
+ ├── requirements.txt # Core dependencies
153
+ ├── requirements-dev.txt # Development dependencies
154
+ └── README.md # This file
155
+ ```
156
+
157
+ ## Configuration
158
+
159
+ Edit `config.py` to customize:
160
+
161
+ - Audio parameters (sample rate, channels, format)
162
+ - Dataset paths and sources
163
+ - Model architecture parameters
164
+ - Training hyperparameters
165
+
166
+ ## Publishing to PyPI
167
+
168
+ ### 1. Update Version
169
+
170
+ ```bash
171
+ # Update version in version.txt
172
+ echo "0.2.0" > version.txt
173
+
174
+ # Commit changes
175
+ git add version.txt
176
+ git commit -m "Bump version to 0.2.0"
177
+ ```
178
+
179
+ ### 2. Create a Git Tag
180
+
181
+ ```bash
182
+ git tag -a v0.2.0 -m "Release version 0.2.0"
183
+ git push origin v0.2.0
184
+ ```
185
+
186
+ ### 3. GitHub Actions will automatically:
187
+ - Run all tests on multiple Python versions and OSes
188
+ - Build distribution packages (wheel and sdist)
189
+ - Publish to PyPI (requires `PYPI_API_TOKEN` secret in GitHub)
190
+ - Create a GitHub Release
191
+
192
+ ### 4. Configure PyPI Token (one-time setup)
193
+
194
+ In your GitHub repository:
195
+ 1. Go to **Settings → Secrets and variables → Actions**
196
+ 2. Add a new secret: `PYPI_API_TOKEN`
197
+ 3. Generate token at https://pypi.org/manage/account/token/
198
+
199
+ ## Testing
200
+
201
+ The project uses pytest with the following markers:
202
+
203
+ - `@pytest.mark.unit` - Unit tests
204
+ - `@pytest.mark.integration` - Integration tests
205
+ - `@pytest.mark.slow` - Slow tests
206
+ - `@pytest.mark.gpu` - GPU-required tests
207
+ - `@pytest.mark.model` - Model-specific tests
208
+ - `@pytest.mark.data` - Data processing tests
209
+ - `@pytest.mark.train` - Training tests
210
+
211
+ ## CI/CD Workflow
212
+
213
+ The GitHub Actions workflow (`python-package.yml`):
214
+
215
+ 1. **Test Job**: Runs on every push and pull request
216
+ - Tests on Ubuntu, Windows, and macOS
217
+ - Python 3.9, 3.10, 3.11
218
+ - Linting with flake8
219
+ - Code formatting with black
220
+ - Import sorting with isort
221
+ - Coverage reports to Codecov
222
+
223
+ 2. **Build and Publish Job**: Runs on version tags
224
+ - Builds distribution packages
225
+ - Validates with twine
226
+ - Publishes to PyPI
227
+ - Creates GitHub Release
228
+
229
+ ## Contributing
230
+
231
+ 1. Fork the repository
232
+ 2. Create a feature branch: `git checkout -b feature/your-feature`
233
+ 3. Make changes and write tests
234
+ 4. Ensure all tests pass: `pytest`
235
+ 5. Format code: `black` and `isort`
236
+ 6. Commit and push to your fork
237
+ 7. Create a Pull Request
238
+
239
+ ## License
240
+
241
+ MIT License - see LICENSE file for details
242
+
243
+ ## Citation
244
+
245
+ If you use this package in your research, please cite:
246
+
247
+ ```bibtex
248
+ @software{speaker_encoder_2025,
249
+ title={Speaker Encoder Pipeline},
250
+ author={Your Name},
251
+ year={2025},
252
+ url={https://github.com/yourusername/Zero_shotVoiceClone}
253
+ }
254
+ ```
255
+
256
+ ## Support
257
+
258
+ For issues and questions:
259
+ - Open an issue on GitHub
260
+ - Check existing documentation
261
+ - Review test cases for usage examples
262
+
263
+ ## Changelog
264
+
265
+ ### Version 0.1.0 (Initial Release)
266
+ - Multi-branch speaker encoder
267
+ - Multilingual support (English, Tamil)
268
+ - Complete training pipeline
269
+ - CI/CD with GitHub Actions
270
+ - PyPI packaging support
File without changes
@@ -0,0 +1,67 @@
1
+ import torch
2
+ import torchaudio
3
+ import random
4
+ import numpy as np
5
+
6
+ class AudioAugmentations:
7
+ """On-the-fly audio augmentations for speaker encoder training."""
8
+
9
+ def __init__(self, config):
10
+ self.config = config
11
+ self.noise_snr_min = config.NOISE_SNR_MIN
12
+ self.noise_snr_max = config.NOISE_SNR_MAX
13
+ self.reverb_prob = config.REVERB_PROB
14
+ self.speed_prob = config.SPEED_PERTURB_PROB
15
+ self.speed_factors = config.SPEED_FACTORS
16
+
17
+ def __call__(self, audio):
18
+ """Apply augmentations to audio tensor."""
19
+ audio = audio.clone()
20
+
21
+ # Speed perturbation
22
+ if random.random() < self.speed_prob:
23
+ factor = random.choice(self.speed_factors)
24
+ audio = self._speed_perturb(audio, factor)
25
+
26
+ # Add noise
27
+ if self.noise_snr_min > 0:
28
+ audio = self._add_noise(audio)
29
+
30
+ # Reverb (simplified - can be enhanced with actual RIRs)
31
+ if random.random() < self.reverb_prob:
32
+ audio = self._add_reverb(audio)
33
+
34
+ return audio
35
+
36
+ def _speed_perturb(self, audio, factor):
37
+ """Change playback speed."""
38
+ # Simple resampling
39
+ indices = torch.arange(0, len(audio), factor)
40
+ indices = indices[indices < len(audio)].long()
41
+ return audio[indices]
42
+
43
+ def _add_noise(self, audio):
44
+ """Add random noise with specified SNR."""
45
+ # Generate random noise
46
+ noise = torch.randn_like(audio) * 0.01
47
+
48
+ # Calculate target SNR
49
+ snr_db = random.uniform(self.noise_snr_min, self.noise_snr_max)
50
+ snr = 10 ** (snr_db / 20)
51
+
52
+ # Scale noise to achieve target SNR
53
+ audio_power = torch.mean(audio ** 2)
54
+ noise_power = torch.mean(noise ** 2)
55
+ noise = noise * torch.sqrt(audio_power / (noise_power * snr ** 2))
56
+
57
+ return audio + noise
58
+
59
+ def _add_reverb(self, audio):
60
+ """Simple reverb simulation (impulse response placeholder)."""
61
+ # Simple delay effect
62
+ delay_len = 160 # 10ms at 16kHz
63
+ if len(audio) > delay_len:
64
+ delayed = torch.zeros_like(audio)
65
+ delayed[delay_len:] = audio[:-delay_len] * 0.3
66
+ return audio + delayed
67
+ return audio