mlx-speech 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlx_speech-0.1.0/LICENSE +21 -0
- mlx_speech-0.1.0/PKG-INFO +154 -0
- mlx_speech-0.1.0/README.md +119 -0
- mlx_speech-0.1.0/pyproject.toml +36 -0
- mlx_speech-0.1.0/src/mlx_speech/__init__.py +5 -0
- mlx_speech-0.1.0/src/mlx_speech/audio/__init__.py +21 -0
- mlx_speech-0.1.0/src/mlx_speech/audio/io.py +179 -0
- mlx_speech-0.1.0/src/mlx_speech/checkpoints/__init__.py +21 -0
- mlx_speech-0.1.0/src/mlx_speech/checkpoints/layout.py +85 -0
- mlx_speech-0.1.0/src/mlx_speech/checkpoints/sharded.py +123 -0
- mlx_speech-0.1.0/src/mlx_speech/generation/__init__.py +42 -0
- mlx_speech-0.1.0/src/mlx_speech/generation/cohere_asr.py +187 -0
- mlx_speech-0.1.0/src/mlx_speech/generation/moss_delay.py +907 -0
- mlx_speech-0.1.0/src/mlx_speech/generation/moss_local.py +703 -0
- mlx_speech-0.1.0/src/mlx_speech/generation/vibevoice.py +414 -0
- mlx_speech-0.1.0/src/mlx_speech/models/__init__.py +1 -0
- mlx_speech-0.1.0/src/mlx_speech/models/cohere_asr/__init__.py +31 -0
- mlx_speech-0.1.0/src/mlx_speech/models/cohere_asr/checkpoint.py +400 -0
- mlx_speech-0.1.0/src/mlx_speech/models/cohere_asr/config.py +254 -0
- mlx_speech-0.1.0/src/mlx_speech/models/cohere_asr/decoder.py +317 -0
- mlx_speech-0.1.0/src/mlx_speech/models/cohere_asr/encoder.py +426 -0
- mlx_speech-0.1.0/src/mlx_speech/models/cohere_asr/feature_extraction.py +363 -0
- mlx_speech-0.1.0/src/mlx_speech/models/cohere_asr/tokenizer.py +116 -0
- mlx_speech-0.1.0/src/mlx_speech/models/moss_audio_tokenizer/__init__.py +49 -0
- mlx_speech-0.1.0/src/mlx_speech/models/moss_audio_tokenizer/checkpoint.py +329 -0
- mlx_speech-0.1.0/src/mlx_speech/models/moss_audio_tokenizer/config.py +157 -0
- mlx_speech-0.1.0/src/mlx_speech/models/moss_audio_tokenizer/model.py +702 -0
- mlx_speech-0.1.0/src/mlx_speech/models/moss_common/__init__.py +45 -0
- mlx_speech-0.1.0/src/mlx_speech/models/moss_common/cache.py +8 -0
- mlx_speech-0.1.0/src/mlx_speech/models/moss_common/config.py +5 -0
- mlx_speech-0.1.0/src/mlx_speech/models/moss_common/model.py +21 -0
- mlx_speech-0.1.0/src/mlx_speech/models/moss_common/processor.py +21 -0
- mlx_speech-0.1.0/src/mlx_speech/models/moss_common/tokenizer.py +7 -0
- mlx_speech-0.1.0/src/mlx_speech/models/moss_delay/__init__.py +92 -0
- mlx_speech-0.1.0/src/mlx_speech/models/moss_delay/checkpoint.py +253 -0
- mlx_speech-0.1.0/src/mlx_speech/models/moss_delay/config.py +105 -0
- mlx_speech-0.1.0/src/mlx_speech/models/moss_delay/dialogue.py +384 -0
- mlx_speech-0.1.0/src/mlx_speech/models/moss_delay/model.py +166 -0
- mlx_speech-0.1.0/src/mlx_speech/models/moss_delay/processor.py +338 -0
- mlx_speech-0.1.0/src/mlx_speech/models/moss_delay/sound_effect.py +44 -0
- mlx_speech-0.1.0/src/mlx_speech/models/moss_delay/tokenizer.py +7 -0
- mlx_speech-0.1.0/src/mlx_speech/models/moss_local/__init__.py +77 -0
- mlx_speech-0.1.0/src/mlx_speech/models/moss_local/cache.py +137 -0
- mlx_speech-0.1.0/src/mlx_speech/models/moss_local/checkpoint.py +369 -0
- mlx_speech-0.1.0/src/mlx_speech/models/moss_local/config.py +156 -0
- mlx_speech-0.1.0/src/mlx_speech/models/moss_local/model.py +808 -0
- mlx_speech-0.1.0/src/mlx_speech/models/moss_local/processor.py +652 -0
- mlx_speech-0.1.0/src/mlx_speech/models/moss_local/tokenizer.py +95 -0
- mlx_speech-0.1.0/src/mlx_speech/models/vibevoice/__init__.py +1 -0
- mlx_speech-0.1.0/src/mlx_speech/models/vibevoice/acoustic.py +609 -0
- mlx_speech-0.1.0/src/mlx_speech/models/vibevoice/checkpoint.py +341 -0
- mlx_speech-0.1.0/src/mlx_speech/models/vibevoice/config.py +263 -0
- mlx_speech-0.1.0/src/mlx_speech/models/vibevoice/connector.py +30 -0
- mlx_speech-0.1.0/src/mlx_speech/models/vibevoice/diffusion.py +421 -0
- mlx_speech-0.1.0/src/mlx_speech/models/vibevoice/model.py +208 -0
- mlx_speech-0.1.0/src/mlx_speech/models/vibevoice/qwen2.py +220 -0
- mlx_speech-0.1.0/src/mlx_speech/models/vibevoice/tokenizer.py +77 -0
- mlx_speech-0.1.0/src/mlx_speech/py.typed +1 -0
mlx_speech-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 AppAutomaton swarm of agents
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: mlx-speech
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: MLX-native speech library for Apple Silicon.
|
|
5
|
+
Author: AppAutomaton
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 AppAutomaton swarm of agents
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
Requires-Dist: mlx>=0.31.1
|
|
28
|
+
Requires-Dist: numpy
|
|
29
|
+
Requires-Dist: safetensors
|
|
30
|
+
Requires-Dist: soundfile>=0.13.1
|
|
31
|
+
Requires-Dist: tokenizers>=0.22.2
|
|
32
|
+
Requires-Python: >=3.13
|
|
33
|
+
Project-URL: Repository, https://github.com/appautomaton/mlx-speech
|
|
34
|
+
Description-Content-Type: text/markdown
|
|
35
|
+
|
|
36
|
+
# mlx-speech
|
|
37
|
+
|
|
38
|
+
[](LICENSE)
|
|
39
|
+
[](https://www.python.org/downloads/)
|
|
40
|
+
[](https://developer.apple.com/documentation/apple-silicon)
|
|
41
|
+
|
|
42
|
+
Local speech synthesis on Apple Silicon, running pure MLX. No cloud, no PyTorch.
|
|
43
|
+
|
|
44
|
+
| Model | Best for |
|
|
45
|
+
| --- | --- |
|
|
46
|
+
| MossTTSLocal | shorter TTS, voice cloning, continuation |
|
|
47
|
+
| MOSS-TTSD | multi-speaker dialogue |
|
|
48
|
+
| MOSS-SoundEffect | text-to-sound-effect |
|
|
49
|
+
| VibeVoice | long-form speech, voice-conditioned generation |
|
|
50
|
+
|
|
51
|
+
## Requirements
|
|
52
|
+
|
|
53
|
+
- Apple Silicon Mac (M1 or later)
|
|
54
|
+
- Python 3.13+
|
|
55
|
+
- [uv](https://docs.astral.sh/uv/)
|
|
56
|
+
|
|
57
|
+
## Installation
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
git clone https://github.com/appautomaton/mlx-speech.git
|
|
61
|
+
cd mlx-speech
|
|
62
|
+
uv sync
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
> PyPI package (`pip install mlx-speech`) coming soon.
|
|
66
|
+
|
|
67
|
+
Convert the checkpoints you want to use — each model family has a `scripts/convert_*.py` entry point:
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
python scripts/convert_moss_local.py
|
|
71
|
+
python scripts/convert_moss_audio_tokenizer.py
|
|
72
|
+
python scripts/convert_moss_ttsd.py
|
|
73
|
+
python scripts/convert_moss_sound_effect.py
|
|
74
|
+
python scripts/convert_vibevoice.py
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Usage
|
|
78
|
+
|
|
79
|
+
**Generate speech:**
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
python scripts/generate_moss_local.py \
|
|
83
|
+
--text "Hello, this is a test." \
|
|
84
|
+
--output outputs/moss_local.wav
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
**Clone a voice:**
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
python scripts/generate_moss_local.py \
|
|
91
|
+
--mode clone \
|
|
92
|
+
--text "Hello, this is a cloned sample." \
|
|
93
|
+
--reference-audio reference.wav \
|
|
94
|
+
--output outputs/moss_local_clone.wav
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
**Multi-speaker dialogue:**
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
python scripts/generate_moss_ttsd.py \
|
|
101
|
+
--text "[S1] Watson, we should go now." \
|
|
102
|
+
--output outputs/ttsd.wav
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
**Sound effect:**
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
python scripts/generate_moss_sound_effect.py \
|
|
109
|
+
--ambient-sound "rolling thunder with steady rainfall on a metal roof" \
|
|
110
|
+
--duration-seconds 8 \
|
|
111
|
+
--output outputs/thunder.wav
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
**VibeVoice:**
|
|
115
|
+
|
|
116
|
+
```bash
|
|
117
|
+
python scripts/generate_vibevoice.py \
|
|
118
|
+
--text "Hello from VibeVoice." \
|
|
119
|
+
--output outputs/vibevoice.wav
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
## Exploring the Codebase
|
|
123
|
+
|
|
124
|
+
The PyPI package is still in progress. The best way to explore right now is to drop the repo into an agentic coding tool like [Claude Code](https://claude.ai/code) or Cursor — the codebase is structured and self-describing, and an agent can walk you through it quickly.
|
|
125
|
+
|
|
126
|
+
## Model Guides
|
|
127
|
+
|
|
128
|
+
Each family has a doc covering behavior, flags, and known limitations:
|
|
129
|
+
|
|
130
|
+
- [MossTTSLocal](./docs/moss-local.md)
|
|
131
|
+
- [MOSS-TTSD](./docs/moss-ttsd.md)
|
|
132
|
+
- [MOSS-SoundEffect](./docs/moss-sound-effect.md)
|
|
133
|
+
- [VibeVoice](./docs/vibevoice.md)
|
|
134
|
+
|
|
135
|
+
## Development
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
uv run pytest
|
|
139
|
+
uv run ruff check .
|
|
140
|
+
uv build --no-sources
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
```text
|
|
144
|
+
mlx-speech/
|
|
145
|
+
src/mlx_speech/ library code
|
|
146
|
+
scripts/ conversion and generation entry points
|
|
147
|
+
models/ local checkpoints (not in git)
|
|
148
|
+
tests/ unit and integration tests
|
|
149
|
+
docs/ model-family behavior guides
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
## License
|
|
153
|
+
|
|
154
|
+
MIT — see [LICENSE](LICENSE)
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
# mlx-speech
|
|
2
|
+
|
|
3
|
+
[](LICENSE)
|
|
4
|
+
[](https://www.python.org/downloads/)
|
|
5
|
+
[](https://developer.apple.com/documentation/apple-silicon)
|
|
6
|
+
|
|
7
|
+
Local speech synthesis on Apple Silicon, running pure MLX. No cloud, no PyTorch.
|
|
8
|
+
|
|
9
|
+
| Model | Best for |
|
|
10
|
+
| --- | --- |
|
|
11
|
+
| MossTTSLocal | shorter TTS, voice cloning, continuation |
|
|
12
|
+
| MOSS-TTSD | multi-speaker dialogue |
|
|
13
|
+
| MOSS-SoundEffect | text-to-sound-effect |
|
|
14
|
+
| VibeVoice | long-form speech, voice-conditioned generation |
|
|
15
|
+
|
|
16
|
+
## Requirements
|
|
17
|
+
|
|
18
|
+
- Apple Silicon Mac (M1 or later)
|
|
19
|
+
- Python 3.13+
|
|
20
|
+
- [uv](https://docs.astral.sh/uv/)
|
|
21
|
+
|
|
22
|
+
## Installation
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
git clone https://github.com/appautomaton/mlx-speech.git
|
|
26
|
+
cd mlx-speech
|
|
27
|
+
uv sync
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
> PyPI package (`pip install mlx-speech`) coming soon.
|
|
31
|
+
|
|
32
|
+
Convert the checkpoints you want to use — each model family has a `scripts/convert_*.py` entry point:
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
python scripts/convert_moss_local.py
|
|
36
|
+
python scripts/convert_moss_audio_tokenizer.py
|
|
37
|
+
python scripts/convert_moss_ttsd.py
|
|
38
|
+
python scripts/convert_moss_sound_effect.py
|
|
39
|
+
python scripts/convert_vibevoice.py
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Usage
|
|
43
|
+
|
|
44
|
+
**Generate speech:**
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
python scripts/generate_moss_local.py \
|
|
48
|
+
--text "Hello, this is a test." \
|
|
49
|
+
--output outputs/moss_local.wav
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
**Clone a voice:**
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
python scripts/generate_moss_local.py \
|
|
56
|
+
--mode clone \
|
|
57
|
+
--text "Hello, this is a cloned sample." \
|
|
58
|
+
--reference-audio reference.wav \
|
|
59
|
+
--output outputs/moss_local_clone.wav
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
**Multi-speaker dialogue:**
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
python scripts/generate_moss_ttsd.py \
|
|
66
|
+
--text "[S1] Watson, we should go now." \
|
|
67
|
+
--output outputs/ttsd.wav
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
**Sound effect:**
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
python scripts/generate_moss_sound_effect.py \
|
|
74
|
+
--ambient-sound "rolling thunder with steady rainfall on a metal roof" \
|
|
75
|
+
--duration-seconds 8 \
|
|
76
|
+
--output outputs/thunder.wav
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
**VibeVoice:**
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
python scripts/generate_vibevoice.py \
|
|
83
|
+
--text "Hello from VibeVoice." \
|
|
84
|
+
--output outputs/vibevoice.wav
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## Exploring the Codebase
|
|
88
|
+
|
|
89
|
+
The PyPI package is still in progress. The best way to explore right now is to drop the repo into an agentic coding tool like [Claude Code](https://claude.ai/code) or Cursor — the codebase is structured and self-describing, and an agent can walk you through it quickly.
|
|
90
|
+
|
|
91
|
+
## Model Guides
|
|
92
|
+
|
|
93
|
+
Each family has a doc covering behavior, flags, and known limitations:
|
|
94
|
+
|
|
95
|
+
- [MossTTSLocal](./docs/moss-local.md)
|
|
96
|
+
- [MOSS-TTSD](./docs/moss-ttsd.md)
|
|
97
|
+
- [MOSS-SoundEffect](./docs/moss-sound-effect.md)
|
|
98
|
+
- [VibeVoice](./docs/vibevoice.md)
|
|
99
|
+
|
|
100
|
+
## Development
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
uv run pytest
|
|
104
|
+
uv run ruff check .
|
|
105
|
+
uv build --no-sources
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
```text
|
|
109
|
+
mlx-speech/
|
|
110
|
+
src/mlx_speech/ library code
|
|
111
|
+
scripts/ conversion and generation entry points
|
|
112
|
+
models/ local checkpoints (not in git)
|
|
113
|
+
tests/ unit and integration tests
|
|
114
|
+
docs/ model-family behavior guides
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
## License
|
|
118
|
+
|
|
119
|
+
MIT — see [LICENSE](LICENSE)
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "mlx-speech"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "MLX-native speech library for Apple Silicon."
|
|
5
|
+
license = { file = "LICENSE" }
|
|
6
|
+
authors = [{ name = "AppAutomaton" }]
|
|
7
|
+
urls = { Repository = "https://github.com/appautomaton/mlx-speech" }
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
requires-python = ">=3.13"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"mlx>=0.31.1",
|
|
12
|
+
"numpy",
|
|
13
|
+
"safetensors",
|
|
14
|
+
"soundfile>=0.13.1",
|
|
15
|
+
"tokenizers>=0.22.2",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
[dependency-groups]
|
|
19
|
+
dev = [
|
|
20
|
+
"pytest>=8.3,<9",
|
|
21
|
+
"ruff>=0.11,<0.12",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
[build-system]
|
|
25
|
+
requires = ["uv_build>=0.11.2,<0.12"]
|
|
26
|
+
build-backend = "uv_build"
|
|
27
|
+
|
|
28
|
+
[tool.pytest.ini_options]
|
|
29
|
+
testpaths = ["tests"]
|
|
30
|
+
markers = [
|
|
31
|
+
"local_integration: test requires local model artifacts or repo-specific runtime assets",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
[tool.ruff]
|
|
35
|
+
target-version = "py313"
|
|
36
|
+
exclude = [".references", ".venv"]
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Audio utilities for mlx-voice."""
|
|
2
|
+
|
|
3
|
+
from .io import (
|
|
4
|
+
load_audio,
|
|
5
|
+
loudness_normalize,
|
|
6
|
+
mix_down_mono,
|
|
7
|
+
normalize_peak,
|
|
8
|
+
resample_audio,
|
|
9
|
+
trim_leading_silence,
|
|
10
|
+
write_wav,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"load_audio",
|
|
15
|
+
"loudness_normalize",
|
|
16
|
+
"mix_down_mono",
|
|
17
|
+
"normalize_peak",
|
|
18
|
+
"resample_audio",
|
|
19
|
+
"trim_leading_silence",
|
|
20
|
+
"write_wav",
|
|
21
|
+
]
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
"""Small audio I/O helpers for local v0 validation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import wave
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import mlx.core as mx
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
import soundfile as sf
|
|
13
|
+
except ModuleNotFoundError: # pragma: no cover - exercised only in lean envs
|
|
14
|
+
sf = None
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def mix_down_mono(samples: mx.array) -> mx.array:
    """Collapse (samples,) or (samples, channels) audio into a mono float32 array.

    1-D input is passed through (converted to float32); 2-D input is averaged
    across the channel axis. Any other rank raises ``ValueError``.
    """
    data = np.asarray(samples, dtype=np.float32)
    if data.ndim == 2:
        # Equal-weight channel average keeps overall level comparable to the source.
        return mx.array(data.mean(axis=1, dtype=np.float32), dtype=mx.float32)
    if data.ndim == 1:
        return mx.array(data, dtype=mx.float32)
    raise ValueError(
        f"Expected waveform with shape (samples,) or (samples, channels), got {data.shape}."
    )
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def resample_audio(
    samples: mx.array,
    *,
    orig_sample_rate: int,
    target_sample_rate: int,
) -> mx.array:
    """Resample a mono waveform to ``target_sample_rate`` via linear interpolation.

    Raises ``ValueError`` for non-positive rates or non-1-D input. Empty input,
    or already-matching rates, is returned as-is (converted to float32).
    """
    if min(orig_sample_rate, target_sample_rate) <= 0:
        raise ValueError("Sample rates must be positive.")
    wav = np.asarray(samples, dtype=np.float32)
    if wav.ndim != 1:
        raise ValueError(f"Expected mono waveform with shape (samples,), got {wav.shape}.")
    if wav.size == 0 or orig_sample_rate == target_sample_rate:
        return mx.array(wav, dtype=mx.float32)

    # Output length covering the same duration at the new rate (always >= 1 sample).
    out_len = max(1, int(round(wav.shape[0] / float(orig_sample_rate) * target_sample_rate)))
    # Both grids use endpoint=False so sample i sits at fractional position i/n.
    src_pos = np.linspace(0.0, 1.0, num=wav.shape[0], endpoint=False, dtype=np.float32)
    dst_pos = np.linspace(0.0, 1.0, num=out_len, endpoint=False, dtype=np.float32)
    return mx.array(np.interp(dst_pos, src_pos, wav).astype(np.float32), dtype=mx.float32)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def loudness_normalize(
    samples: mx.array,
    *,
    target_dbfs: float = -20.0,
    gain_range: tuple[float, float] = (-3.0, 3.0),
) -> mx.array:
    """Nudge overall loudness toward ``target_dbfs``, clamping gain to ``gain_range``.

    Empty input is returned unchanged (as float32).
    """
    wav = samples.astype(mx.float32)
    if wav.size == 0:
        return wav

    # Mean-square power -> dBFS; the epsilon guards log10(0) on pure silence.
    mean_power = float(mx.mean(wav * wav).item())
    measured_dbfs = 10.0 * np.log10(mean_power + 1e-9)
    low, high = gain_range
    gain_db = min(high, max(low, target_dbfs - measured_dbfs))
    return wav * (10.0 ** (gain_db / 20.0))
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def load_audio(
    path: str | Path,
    *,
    sample_rate: int | None = None,
    mono: bool = True,
) -> tuple[mx.array, int]:
    """Load local audio from disk and return ``(samples, sample_rate)``.

    Prefers soundfile when installed; otherwise falls back to the stdlib
    ``wave`` reader, which supports 16-bit PCM WAV only. Optionally mixes
    down to mono and resamples to ``sample_rate``.
    """
    if sf is None:
        # Lean-environment fallback: stdlib WAV reader, 16-bit PCM only.
        with wave.open(str(path), "rb") as wav_file:
            loaded_sample_rate = wav_file.getframerate()
            channels = wav_file.getnchannels()
            sample_width = wav_file.getsampwidth()
            frames = wav_file.readframes(wav_file.getnframes())
        if sample_width != 2:
            raise RuntimeError("WAV fallback supports 16-bit PCM only.")
        waveform = np.frombuffer(frames, dtype=np.int16).astype(np.float32) / 32767.0
        if channels > 1:
            # Interleaved frames -> (samples, channels) for the mono mixdown below.
            waveform = waveform.reshape(-1, channels)
        samples = mx.array(waveform, dtype=mx.float32)
    else:
        waveform, loaded_sample_rate = sf.read(str(path), always_2d=False, dtype="float32")
        samples = mx.array(waveform, dtype=mx.float32)

    if mono:
        samples = mix_down_mono(samples)
    if sample_rate is not None and int(loaded_sample_rate) != int(sample_rate):
        samples = resample_audio(
            samples,
            orig_sample_rate=int(loaded_sample_rate),
            target_sample_rate=int(sample_rate),
        )
        loaded_sample_rate = int(sample_rate)
    return samples, int(loaded_sample_rate)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def trim_leading_silence(
    samples: mx.array,
    *,
    sample_rate: int,
    threshold: float = 0.003,
    frame_ms: float = 20.0,
    keep_ms: float = 80.0,
) -> mx.array:
    """Drop leading low-energy audio, keeping a short run-up before the first sound.

    Scans fixed-size frames and cuts everything before the first frame whose
    RMS reaches ``threshold``, minus a ``keep_ms`` cushion. Returns the input
    unchanged when it is empty or entirely below the threshold. Raises
    ``ValueError`` for non-1-D input.
    """
    wav = np.asarray(samples, dtype=np.float32)
    if wav.ndim != 1:
        raise ValueError(f"Expected mono waveform with shape (samples,), got {wav.shape}.")
    if wav.size == 0:
        return samples

    frame_len = max(1, int(sample_rate * frame_ms / 1000.0))
    cushion = max(0, int(sample_rate * keep_ms / 1000.0))

    offset = 0
    while offset < wav.size:
        frame = wav[offset : offset + frame_len]
        rms = float(np.sqrt(np.mean(frame * frame)))
        if rms >= threshold:
            # Keep a little pre-roll so the onset is not clipped abruptly.
            cut = max(0, offset - cushion)
            return mx.array(wav[cut:], dtype=samples.dtype)
        offset += frame_len
    # Never crossed the threshold: nothing to trim.
    return samples
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def normalize_peak(
    samples: mx.array,
    *,
    target_peak: float = 0.95,
    max_gain: float = 4.0,
) -> mx.array:
    """Boost a quiet waveform so its peak amplitude approaches ``target_peak``.

    Only amplifies (gain > 1) and never attenuates; the boost is capped at
    ``max_gain``. Empty or silent input is returned as float32 unchanged.
    """
    wav = samples.astype(mx.float32)
    peak = 0.0 if wav.size == 0 else float(mx.max(mx.abs(wav)).item())
    if peak <= 0.0:
        return wav
    boost = min(max_gain, target_peak / peak)
    return wav * boost if boost > 1.0 else wav
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def write_wav(path: str | Path, samples: mx.array, *, sample_rate: int) -> Path:
    """Write a mono float waveform to a 16-bit PCM WAV file and return its path.

    Values are clipped to [-1.0, 1.0] before quantization; parent directories
    are created as needed. Raises ``ValueError`` for non-1-D input.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)

    wav = np.asarray(samples, dtype=np.float32)
    if wav.ndim != 1:
        raise ValueError(f"Expected mono waveform with shape (samples,), got {wav.shape}.")
    # Clip then scale with the same 32767 factor the loader divides by.
    quantized = (np.clip(wav, -1.0, 1.0) * 32767.0).astype(np.int16)

    with wave.open(str(target), "wb") as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)  # 16-bit PCM
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(quantized.tobytes())

    return target
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Checkpoint loading and remapping helpers for mlx-voice."""
|
|
2
|
+
|
|
3
|
+
from .layout import ModelArtifactLayout, OpenMossV0Layouts, get_openmoss_v0_layouts
|
|
4
|
+
from .sharded import (
|
|
5
|
+
INDEX_FILENAME,
|
|
6
|
+
LoadedStateDict,
|
|
7
|
+
ShardedCheckpointIndex,
|
|
8
|
+
load_state_dict,
|
|
9
|
+
summarize_prefixes,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"INDEX_FILENAME",
|
|
14
|
+
"LoadedStateDict",
|
|
15
|
+
"ModelArtifactLayout",
|
|
16
|
+
"OpenMossV0Layouts",
|
|
17
|
+
"ShardedCheckpointIndex",
|
|
18
|
+
"get_openmoss_v0_layouts",
|
|
19
|
+
"load_state_dict",
|
|
20
|
+
"summarize_prefixes",
|
|
21
|
+
]
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
"""Local checkpoint layout helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
REPO_ROOT = Path(__file__).resolve().parents[3]
|
|
10
|
+
MODELS_ROOT = REPO_ROOT / "models"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass(frozen=True)
class ModelArtifactLayout:
    """Filesystem layout for one model family."""

    family: str  # layout group, e.g. "openmoss"
    model_name: str  # directory name under the family root
    repo_id: str  # upstream repo id, e.g. "OpenMOSS-Team/MOSS-Audio-Tokenizer"
    root_dir: Path  # models_root / family / model_name
    original_dir: Path  # upstream checkpoint location
    mlx_int8_dir: Path  # converted int8 MLX checkpoint location

    def ensure(self) -> "ModelArtifactLayout":
        """Create the checkpoint directories if missing and return self."""
        for directory in (self.original_dir, self.mlx_int8_dir):
            directory.mkdir(parents=True, exist_ok=True)
        return self
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass(frozen=True)
class OpenMossV0Layouts:
    """Grouped layout for the v0 OpenMOSS assets."""

    moss_tts_local: ModelArtifactLayout
    audio_tokenizer: ModelArtifactLayout
    moss_sound_effect: ModelArtifactLayout

    def ensure(self) -> "OpenMossV0Layouts":
        """Create the directories of every contained layout and return self."""
        for layout in (self.moss_tts_local, self.audio_tokenizer, self.moss_sound_effect):
            layout.ensure()
        return self
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _build_model_layout(
    models_root: Path,
    family: str,
    model_name: str,
    repo_id: str,
) -> ModelArtifactLayout:
    """Assemble a ModelArtifactLayout rooted at ``models_root/family/model_name``."""
    base = models_root / family / model_name
    return ModelArtifactLayout(
        family=family,
        model_name=model_name,
        repo_id=repo_id,
        root_dir=base,
        # Conventional subdirectories: upstream weights vs. converted int8 MLX weights.
        original_dir=base / "original",
        mlx_int8_dir=base / "mlx-int8",
    )
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def get_openmoss_v0_layouts(models_root: Path | None = None) -> OpenMossV0Layouts:
    """Return the local model layout used by the current v0 plan.

    Falls back to the repo-level ``models/`` directory when ``models_root``
    is None.
    """
    root = MODELS_ROOT if models_root is None else Path(models_root)
    # (model_name, repo_id) for each v0 asset, all under the "openmoss" family.
    specs = (
        ("moss_tts_local", "OpenMOSS-Team/MOSS-TTS-Local-Transformer"),
        ("moss_audio_tokenizer", "OpenMOSS-Team/MOSS-Audio-Tokenizer"),
        ("moss_sound_effect", "OpenMOSS-Team/MOSS-SoundEffect"),
    )
    tts, tokenizer, sound_effect = (
        _build_model_layout(
            models_root=root,
            family="openmoss",
            model_name=name,
            repo_id=repo,
        )
        for name, repo in specs
    )
    return OpenMossV0Layouts(
        moss_tts_local=tts,
        audio_tokenizer=tokenizer,
        moss_sound_effect=sound_effect,
    )
|