mlx-speech 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. mlx_speech-0.1.0/LICENSE +21 -0
  2. mlx_speech-0.1.0/PKG-INFO +154 -0
  3. mlx_speech-0.1.0/README.md +119 -0
  4. mlx_speech-0.1.0/pyproject.toml +36 -0
  5. mlx_speech-0.1.0/src/mlx_speech/__init__.py +5 -0
  6. mlx_speech-0.1.0/src/mlx_speech/audio/__init__.py +21 -0
  7. mlx_speech-0.1.0/src/mlx_speech/audio/io.py +179 -0
  8. mlx_speech-0.1.0/src/mlx_speech/checkpoints/__init__.py +21 -0
  9. mlx_speech-0.1.0/src/mlx_speech/checkpoints/layout.py +85 -0
  10. mlx_speech-0.1.0/src/mlx_speech/checkpoints/sharded.py +123 -0
  11. mlx_speech-0.1.0/src/mlx_speech/generation/__init__.py +42 -0
  12. mlx_speech-0.1.0/src/mlx_speech/generation/cohere_asr.py +187 -0
  13. mlx_speech-0.1.0/src/mlx_speech/generation/moss_delay.py +907 -0
  14. mlx_speech-0.1.0/src/mlx_speech/generation/moss_local.py +703 -0
  15. mlx_speech-0.1.0/src/mlx_speech/generation/vibevoice.py +414 -0
  16. mlx_speech-0.1.0/src/mlx_speech/models/__init__.py +1 -0
  17. mlx_speech-0.1.0/src/mlx_speech/models/cohere_asr/__init__.py +31 -0
  18. mlx_speech-0.1.0/src/mlx_speech/models/cohere_asr/checkpoint.py +400 -0
  19. mlx_speech-0.1.0/src/mlx_speech/models/cohere_asr/config.py +254 -0
  20. mlx_speech-0.1.0/src/mlx_speech/models/cohere_asr/decoder.py +317 -0
  21. mlx_speech-0.1.0/src/mlx_speech/models/cohere_asr/encoder.py +426 -0
  22. mlx_speech-0.1.0/src/mlx_speech/models/cohere_asr/feature_extraction.py +363 -0
  23. mlx_speech-0.1.0/src/mlx_speech/models/cohere_asr/tokenizer.py +116 -0
  24. mlx_speech-0.1.0/src/mlx_speech/models/moss_audio_tokenizer/__init__.py +49 -0
  25. mlx_speech-0.1.0/src/mlx_speech/models/moss_audio_tokenizer/checkpoint.py +329 -0
  26. mlx_speech-0.1.0/src/mlx_speech/models/moss_audio_tokenizer/config.py +157 -0
  27. mlx_speech-0.1.0/src/mlx_speech/models/moss_audio_tokenizer/model.py +702 -0
  28. mlx_speech-0.1.0/src/mlx_speech/models/moss_common/__init__.py +45 -0
  29. mlx_speech-0.1.0/src/mlx_speech/models/moss_common/cache.py +8 -0
  30. mlx_speech-0.1.0/src/mlx_speech/models/moss_common/config.py +5 -0
  31. mlx_speech-0.1.0/src/mlx_speech/models/moss_common/model.py +21 -0
  32. mlx_speech-0.1.0/src/mlx_speech/models/moss_common/processor.py +21 -0
  33. mlx_speech-0.1.0/src/mlx_speech/models/moss_common/tokenizer.py +7 -0
  34. mlx_speech-0.1.0/src/mlx_speech/models/moss_delay/__init__.py +92 -0
  35. mlx_speech-0.1.0/src/mlx_speech/models/moss_delay/checkpoint.py +253 -0
  36. mlx_speech-0.1.0/src/mlx_speech/models/moss_delay/config.py +105 -0
  37. mlx_speech-0.1.0/src/mlx_speech/models/moss_delay/dialogue.py +384 -0
  38. mlx_speech-0.1.0/src/mlx_speech/models/moss_delay/model.py +166 -0
  39. mlx_speech-0.1.0/src/mlx_speech/models/moss_delay/processor.py +338 -0
  40. mlx_speech-0.1.0/src/mlx_speech/models/moss_delay/sound_effect.py +44 -0
  41. mlx_speech-0.1.0/src/mlx_speech/models/moss_delay/tokenizer.py +7 -0
  42. mlx_speech-0.1.0/src/mlx_speech/models/moss_local/__init__.py +77 -0
  43. mlx_speech-0.1.0/src/mlx_speech/models/moss_local/cache.py +137 -0
  44. mlx_speech-0.1.0/src/mlx_speech/models/moss_local/checkpoint.py +369 -0
  45. mlx_speech-0.1.0/src/mlx_speech/models/moss_local/config.py +156 -0
  46. mlx_speech-0.1.0/src/mlx_speech/models/moss_local/model.py +808 -0
  47. mlx_speech-0.1.0/src/mlx_speech/models/moss_local/processor.py +652 -0
  48. mlx_speech-0.1.0/src/mlx_speech/models/moss_local/tokenizer.py +95 -0
  49. mlx_speech-0.1.0/src/mlx_speech/models/vibevoice/__init__.py +1 -0
  50. mlx_speech-0.1.0/src/mlx_speech/models/vibevoice/acoustic.py +609 -0
  51. mlx_speech-0.1.0/src/mlx_speech/models/vibevoice/checkpoint.py +341 -0
  52. mlx_speech-0.1.0/src/mlx_speech/models/vibevoice/config.py +263 -0
  53. mlx_speech-0.1.0/src/mlx_speech/models/vibevoice/connector.py +30 -0
  54. mlx_speech-0.1.0/src/mlx_speech/models/vibevoice/diffusion.py +421 -0
  55. mlx_speech-0.1.0/src/mlx_speech/models/vibevoice/model.py +208 -0
  56. mlx_speech-0.1.0/src/mlx_speech/models/vibevoice/qwen2.py +220 -0
  57. mlx_speech-0.1.0/src/mlx_speech/models/vibevoice/tokenizer.py +77 -0
  58. mlx_speech-0.1.0/src/mlx_speech/py.typed +1 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 AppAutomaton swarm of agents
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,154 @@
1
+ Metadata-Version: 2.3
2
+ Name: mlx-speech
3
+ Version: 0.1.0
4
+ Summary: MLX-native speech library for Apple Silicon.
5
+ Author: AppAutomaton
6
+ License: MIT License
7
+
8
+ Copyright (c) 2026 AppAutomaton swarm of agents
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+ Requires-Dist: mlx>=0.31.1
28
+ Requires-Dist: numpy
29
+ Requires-Dist: safetensors
30
+ Requires-Dist: soundfile>=0.13.1
31
+ Requires-Dist: tokenizers>=0.22.2
32
+ Requires-Python: >=3.13
33
+ Project-URL: Repository, https://github.com/appautomaton/mlx-speech
34
+ Description-Content-Type: text/markdown
35
+
36
+ # mlx-speech
37
+
38
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
39
+ [![Python 3.13+](https://img.shields.io/badge/python-3.13+-blue.svg)](https://www.python.org/downloads/)
40
+ [![Platform](https://img.shields.io/badge/platform-Apple%20Silicon-black?logo=apple)](https://developer.apple.com/documentation/apple-silicon)
41
+
42
+ Local speech synthesis on Apple Silicon, running pure MLX. No cloud, no PyTorch.
43
+
44
+ | Model | Best for |
45
+ | --- | --- |
46
+ | MossTTSLocal | shorter TTS, voice cloning, continuation |
47
+ | MOSS-TTSD | multi-speaker dialogue |
48
+ | MOSS-SoundEffect | text-to-sound-effect |
49
+ | VibeVoice | long-form speech, voice-conditioned generation |
50
+
51
+ ## Requirements
52
+
53
+ - Apple Silicon Mac (M1 or later)
54
+ - Python 3.13+
55
+ - [uv](https://docs.astral.sh/uv/)
56
+
57
+ ## Installation
58
+
59
+ ```bash
60
+ git clone https://github.com/appautomaton/mlx-speech.git
61
+ cd mlx-speech
62
+ uv sync
63
+ ```
64
+
65
+ > PyPI package (`pip install mlx-speech`) coming soon.
66
+
67
+ Convert the checkpoints you want to use — each model family has a `scripts/convert_*.py` entry point:
68
+
69
+ ```bash
70
+ python scripts/convert_moss_local.py
71
+ python scripts/convert_moss_audio_tokenizer.py
72
+ python scripts/convert_moss_ttsd.py
73
+ python scripts/convert_moss_sound_effect.py
74
+ python scripts/convert_vibevoice.py
75
+ ```
76
+
77
+ ## Usage
78
+
79
+ **Generate speech:**
80
+
81
+ ```bash
82
+ python scripts/generate_moss_local.py \
83
+ --text "Hello, this is a test." \
84
+ --output outputs/moss_local.wav
85
+ ```
86
+
87
+ **Clone a voice:**
88
+
89
+ ```bash
90
+ python scripts/generate_moss_local.py \
91
+ --mode clone \
92
+ --text "Hello, this is a cloned sample." \
93
+ --reference-audio reference.wav \
94
+ --output outputs/moss_local_clone.wav
95
+ ```
96
+
97
+ **Multi-speaker dialogue:**
98
+
99
+ ```bash
100
+ python scripts/generate_moss_ttsd.py \
101
+ --text "[S1] Watson, we should go now." \
102
+ --output outputs/ttsd.wav
103
+ ```
104
+
105
+ **Sound effect:**
106
+
107
+ ```bash
108
+ python scripts/generate_moss_sound_effect.py \
109
+ --ambient-sound "rolling thunder with steady rainfall on a metal roof" \
110
+ --duration-seconds 8 \
111
+ --output outputs/thunder.wav
112
+ ```
113
+
114
+ **VibeVoice:**
115
+
116
+ ```bash
117
+ python scripts/generate_vibevoice.py \
118
+ --text "Hello from VibeVoice." \
119
+ --output outputs/vibevoice.wav
120
+ ```
121
+
122
+ ## Exploring the Codebase
123
+
124
+ The PyPI package is still in progress. The best way to explore right now is to drop the repo into an agentic coding tool like [Claude Code](https://claude.ai/code) or Cursor — the codebase is structured and self-describing, and an agent can walk you through it quickly.
125
+
126
+ ## Model Guides
127
+
128
+ Each family has a doc covering behavior, flags, and known limitations:
129
+
130
+ - [MossTTSLocal](./docs/moss-local.md)
131
+ - [MOSS-TTSD](./docs/moss-ttsd.md)
132
+ - [MOSS-SoundEffect](./docs/moss-sound-effect.md)
133
+ - [VibeVoice](./docs/vibevoice.md)
134
+
135
+ ## Development
136
+
137
+ ```bash
138
+ uv run pytest
139
+ uv run ruff check .
140
+ uv build --no-sources
141
+ ```
142
+
143
+ ```text
144
+ mlx-speech/
145
+ src/mlx_speech/ library code
146
+ scripts/ conversion and generation entry points
147
+ models/ local checkpoints (not in git)
148
+ tests/ unit and integration tests
149
+ docs/ model-family behavior guides
150
+ ```
151
+
152
+ ## License
153
+
154
+ MIT — see [LICENSE](LICENSE)
@@ -0,0 +1,119 @@
1
+ # mlx-speech
2
+
3
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
4
+ [![Python 3.13+](https://img.shields.io/badge/python-3.13+-blue.svg)](https://www.python.org/downloads/)
5
+ [![Platform](https://img.shields.io/badge/platform-Apple%20Silicon-black?logo=apple)](https://developer.apple.com/documentation/apple-silicon)
6
+
7
+ Local speech synthesis on Apple Silicon, running pure MLX. No cloud, no PyTorch.
8
+
9
+ | Model | Best for |
10
+ | --- | --- |
11
+ | MossTTSLocal | shorter TTS, voice cloning, continuation |
12
+ | MOSS-TTSD | multi-speaker dialogue |
13
+ | MOSS-SoundEffect | text-to-sound-effect |
14
+ | VibeVoice | long-form speech, voice-conditioned generation |
15
+
16
+ ## Requirements
17
+
18
+ - Apple Silicon Mac (M1 or later)
19
+ - Python 3.13+
20
+ - [uv](https://docs.astral.sh/uv/)
21
+
22
+ ## Installation
23
+
24
+ ```bash
25
+ git clone https://github.com/appautomaton/mlx-speech.git
26
+ cd mlx-speech
27
+ uv sync
28
+ ```
29
+
30
+ > PyPI package (`pip install mlx-speech`) coming soon.
31
+
32
+ Convert the checkpoints you want to use — each model family has a `scripts/convert_*.py` entry point:
33
+
34
+ ```bash
35
+ python scripts/convert_moss_local.py
36
+ python scripts/convert_moss_audio_tokenizer.py
37
+ python scripts/convert_moss_ttsd.py
38
+ python scripts/convert_moss_sound_effect.py
39
+ python scripts/convert_vibevoice.py
40
+ ```
41
+
42
+ ## Usage
43
+
44
+ **Generate speech:**
45
+
46
+ ```bash
47
+ python scripts/generate_moss_local.py \
48
+ --text "Hello, this is a test." \
49
+ --output outputs/moss_local.wav
50
+ ```
51
+
52
+ **Clone a voice:**
53
+
54
+ ```bash
55
+ python scripts/generate_moss_local.py \
56
+ --mode clone \
57
+ --text "Hello, this is a cloned sample." \
58
+ --reference-audio reference.wav \
59
+ --output outputs/moss_local_clone.wav
60
+ ```
61
+
62
+ **Multi-speaker dialogue:**
63
+
64
+ ```bash
65
+ python scripts/generate_moss_ttsd.py \
66
+ --text "[S1] Watson, we should go now." \
67
+ --output outputs/ttsd.wav
68
+ ```
69
+
70
+ **Sound effect:**
71
+
72
+ ```bash
73
+ python scripts/generate_moss_sound_effect.py \
74
+ --ambient-sound "rolling thunder with steady rainfall on a metal roof" \
75
+ --duration-seconds 8 \
76
+ --output outputs/thunder.wav
77
+ ```
78
+
79
+ **VibeVoice:**
80
+
81
+ ```bash
82
+ python scripts/generate_vibevoice.py \
83
+ --text "Hello from VibeVoice." \
84
+ --output outputs/vibevoice.wav
85
+ ```
86
+
87
+ ## Exploring the Codebase
88
+
89
+ The PyPI package is still in progress. The best way to explore right now is to drop the repo into an agentic coding tool like [Claude Code](https://claude.ai/code) or Cursor — the codebase is structured and self-describing, and an agent can walk you through it quickly.
90
+
91
+ ## Model Guides
92
+
93
+ Each family has a doc covering behavior, flags, and known limitations:
94
+
95
+ - [MossTTSLocal](./docs/moss-local.md)
96
+ - [MOSS-TTSD](./docs/moss-ttsd.md)
97
+ - [MOSS-SoundEffect](./docs/moss-sound-effect.md)
98
+ - [VibeVoice](./docs/vibevoice.md)
99
+
100
+ ## Development
101
+
102
+ ```bash
103
+ uv run pytest
104
+ uv run ruff check .
105
+ uv build --no-sources
106
+ ```
107
+
108
+ ```text
109
+ mlx-speech/
110
+ src/mlx_speech/ library code
111
+ scripts/ conversion and generation entry points
112
+ models/ local checkpoints (not in git)
113
+ tests/ unit and integration tests
114
+ docs/ model-family behavior guides
115
+ ```
116
+
117
+ ## License
118
+
119
+ MIT — see [LICENSE](LICENSE)
@@ -0,0 +1,36 @@
1
+ [project]
2
+ name = "mlx-speech"
3
+ version = "0.1.0"
4
+ description = "MLX-native speech library for Apple Silicon."
5
+ license = { file = "LICENSE" }
6
+ authors = [{ name = "AppAutomaton" }]
7
+ urls = { Repository = "https://github.com/appautomaton/mlx-speech" }
8
+ readme = "README.md"
9
+ requires-python = ">=3.13"
10
+ dependencies = [
11
+ "mlx>=0.31.1",
12
+ "numpy",
13
+ "safetensors",
14
+ "soundfile>=0.13.1",
15
+ "tokenizers>=0.22.2",
16
+ ]
17
+
18
+ [dependency-groups]
19
+ dev = [
20
+ "pytest>=8.3,<9",
21
+ "ruff>=0.11,<0.12",
22
+ ]
23
+
24
+ [build-system]
25
+ requires = ["uv_build>=0.11.2,<0.12"]
26
+ build-backend = "uv_build"
27
+
28
+ [tool.pytest.ini_options]
29
+ testpaths = ["tests"]
30
+ markers = [
31
+ "local_integration: test requires local model artifacts or repo-specific runtime assets",
32
+ ]
33
+
34
+ [tool.ruff]
35
+ target-version = "py313"
36
+ exclude = [".references", ".venv"]
@@ -0,0 +1,5 @@
1
+ """Top-level package for mlx-speech."""
2
+
3
+ __all__ = ["__version__"]
4
+
5
+ __version__ = "0.1.0"
@@ -0,0 +1,21 @@
1
+ """Audio utilities for mlx-speech."""
2
+
3
+ from .io import (
4
+ load_audio,
5
+ loudness_normalize,
6
+ mix_down_mono,
7
+ normalize_peak,
8
+ resample_audio,
9
+ trim_leading_silence,
10
+ write_wav,
11
+ )
12
+
13
+ __all__ = [
14
+ "load_audio",
15
+ "loudness_normalize",
16
+ "mix_down_mono",
17
+ "normalize_peak",
18
+ "resample_audio",
19
+ "trim_leading_silence",
20
+ "write_wav",
21
+ ]
@@ -0,0 +1,179 @@
1
+ """Small audio I/O helpers for local v0 validation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import wave
6
+ from pathlib import Path
7
+
8
+ import numpy as np
9
+ import mlx.core as mx
10
+
11
+ try:
12
+ import soundfile as sf
13
+ except ModuleNotFoundError: # pragma: no cover - exercised only in lean envs
14
+ sf = None
15
+
16
+
17
def mix_down_mono(samples: mx.array) -> mx.array:
    """Collapse a waveform to a single channel.

    Input shaped ``(samples,)`` is passed through (converted to float32);
    input shaped ``(samples, channels)`` is averaged across the channel axis.

    Raises:
        ValueError: If the input is neither 1-D nor 2-D.
    """

    pcm = np.asarray(samples, dtype=np.float32)
    rank = pcm.ndim
    if rank not in (1, 2):
        raise ValueError(
            f"Expected waveform with shape (samples,) or (samples, channels), got {pcm.shape}."
        )
    if rank == 2:
        # Average over axis 1, i.e. across channels for each sample position.
        pcm = np.mean(pcm, axis=1, dtype=np.float32)
    return mx.array(pcm, dtype=mx.float32)
28
+
29
+
30
def resample_audio(
    samples: mx.array,
    *,
    orig_sample_rate: int,
    target_sample_rate: int,
) -> mx.array:
    """Resample mono audio with linear interpolation.

    Args:
        samples: Mono waveform shaped ``(samples,)``.
        orig_sample_rate: Sample rate of ``samples`` in Hz; must be positive.
        target_sample_rate: Desired sample rate in Hz; must be positive.

    Returns:
        A float32 ``mx.array``. Empty input, or input whose rate already
        matches the target, is returned without interpolation (still
        converted to float32). Otherwise the output holds
        ``round(len(samples) / orig_sample_rate * target_sample_rate)``
        samples, with a minimum of one.

    Raises:
        ValueError: If either sample rate is not positive, or the waveform
            is not 1-D.
    """

    if orig_sample_rate <= 0 or target_sample_rate <= 0:
        raise ValueError("Sample rates must be positive.")
    waveform = np.asarray(samples, dtype=np.float32)
    if waveform.ndim != 1:
        raise ValueError(f"Expected mono waveform with shape (samples,), got {waveform.shape}.")
    if waveform.size == 0 or orig_sample_rate == target_sample_rate:
        return mx.array(waveform, dtype=mx.float32)

    source_count = waveform.shape[0]
    duration = source_count / float(orig_sample_rate)
    target_count = max(1, int(round(duration * target_sample_rate)))

    # Build the normalized time grids in float64. A float32 grid cannot resolve
    # the 1/N spacing of long clips: positions get quantized (and can even
    # collide once N approaches 2**24), which distorts the interpolation.
    source_positions = np.linspace(
        0.0, 1.0, num=source_count, endpoint=False, dtype=np.float64
    )
    target_positions = np.linspace(
        0.0, 1.0, num=target_count, endpoint=False, dtype=np.float64
    )
    # np.interp holds the last source sample for targets past the final
    # source position, so no explicit tail handling is needed.
    resampled = np.interp(target_positions, source_positions, waveform).astype(np.float32)
    return mx.array(resampled, dtype=mx.float32)
52
+
53
+
54
def loudness_normalize(
    samples: mx.array,
    *,
    target_dbfs: float = -20.0,
    gain_range: tuple[float, float] = (-3.0, 3.0),
) -> mx.array:
    """Nudge the waveform's loudness toward ``target_dbfs``.

    The level is estimated from the mean power of the signal. The correction
    (in dB) is clamped to ``gain_range`` before being applied, so only a
    bounded adjustment is ever made. Empty input is returned as float32
    without modification.
    """

    signal = samples.astype(mx.float32)
    if signal.size == 0:
        return signal

    mean_square = float(mx.mean(signal * signal).item())
    # The small epsilon keeps log10 finite for an all-zero signal.
    measured_dbfs = 10.0 * np.log10(mean_square + 1e-9)
    wanted_gain_db = target_dbfs - measured_dbfs
    gain_db = max(gain_range[0], min(wanted_gain_db, gain_range[1]))
    # Convert the dB gain to a linear amplitude multiplier.
    linear_gain = 10.0 ** (gain_db / 20.0)
    return signal * linear_gain
71
+
72
+
73
def load_audio(
    path: str | Path,
    *,
    sample_rate: int | None = None,
    mono: bool = True,
) -> tuple[mx.array, int]:
    """Load local audio from disk.

    Decoding uses ``soundfile`` when it is importable; otherwise it falls back
    to the standard-library ``wave`` reader, which here supports 16-bit PCM
    WAV only.

    Args:
        path: Location of the audio file.
        sample_rate: If given and different from the file's rate, the audio is
            resampled to this rate with ``resample_audio``.
        mono: When true, the audio is mixed down with ``mix_down_mono``.

    Returns:
        ``(samples, sample_rate)`` where ``samples`` is a float32 ``mx.array``
        and ``sample_rate`` is the rate of the returned samples.

    Raises:
        RuntimeError: In the ``wave`` fallback, if the file is not 16-bit PCM.
    """

    if sf is not None:
        waveform, loaded_sample_rate = sf.read(str(path), always_2d=False, dtype="float32")
    else:
        with wave.open(str(path), "rb") as wav_file:
            loaded_sample_rate = wav_file.getframerate()
            channels = wav_file.getnchannels()
            sample_width = wav_file.getsampwidth()
            frames = wav_file.readframes(wav_file.getnframes())
        if sample_width != 2:
            raise RuntimeError("WAV fallback supports 16-bit PCM only.")
        waveform = np.frombuffer(frames, dtype=np.int16).astype(np.float32) / 32767.0
        if channels > 1:
            # Interleaved frames -> (frames, channels).
            waveform = waveform.reshape(-1, channels)

    samples = mx.array(waveform, dtype=mx.float32)
    if mono:
        samples = mix_down_mono(samples)

    loaded_sample_rate = int(loaded_sample_rate)
    if sample_rate is not None and loaded_sample_rate != int(sample_rate):
        target_rate = int(sample_rate)
        if samples.ndim == 2:
            # resample_audio only accepts 1-D input, so multi-channel audio
            # (mono=False) is resampled one channel at a time and re-stacked
            # along the channel axis instead of failing.
            samples = mx.stack(
                [
                    resample_audio(
                        samples[:, channel],
                        orig_sample_rate=loaded_sample_rate,
                        target_sample_rate=target_rate,
                    )
                    for channel in range(samples.shape[1])
                ],
                axis=1,
            )
        else:
            samples = resample_audio(
                samples,
                orig_sample_rate=loaded_sample_rate,
                target_sample_rate=target_rate,
            )
        loaded_sample_rate = target_rate
    return samples, loaded_sample_rate
106
+
107
+
108
def trim_leading_silence(
    samples: mx.array,
    *,
    sample_rate: int,
    threshold: float = 0.003,
    frame_ms: float = 20.0,
    keep_ms: float = 80.0,
) -> mx.array:
    """Trim leading low-energy audio using a small RMS window.

    The waveform is scanned in consecutive, non-overlapping frames of
    ``frame_ms`` milliseconds. The first frame whose RMS reaches ``threshold``
    marks the onset; the result starts ``keep_ms`` milliseconds before that
    frame (clamped to the beginning of the signal).

    Args:
        samples: Mono waveform shaped ``(samples,)``.
        sample_rate: Sample rate in Hz, used to convert milliseconds to samples.
        threshold: RMS level at or above which a frame counts as non-silent.
        frame_ms: Analysis frame length in milliseconds (at least one sample).
        keep_ms: Amount of audio to retain before the detected onset.

    Returns:
        The trimmed waveform with the dtype of ``samples``. Empty input, and
        input in which no frame reaches ``threshold``, is returned unchanged.

    Raises:
        ValueError: If the waveform is not 1-D.
    """

    waveform = np.asarray(samples, dtype=np.float32)
    if waveform.ndim != 1:
        raise ValueError(f"Expected mono waveform with shape (samples,), got {waveform.shape}.")
    if waveform.size == 0:
        return samples

    frame_size = max(1, int(sample_rate * frame_ms / 1000.0))
    keep_samples = max(0, int(sample_rate * keep_ms / 1000.0))

    # range() never yields a start index at or past the end, so every frame
    # holds at least one sample and no empty-frame guard is required.
    for frame_start in range(0, waveform.size, frame_size):
        frame = waveform[frame_start : frame_start + frame_size]
        rms = float(np.sqrt(np.mean(frame * frame)))
        if rms >= threshold:
            onset = max(0, frame_start - keep_samples)
            return mx.array(waveform[onset:], dtype=samples.dtype)

    # Nothing crossed the threshold: leave the input untouched.
    return samples
140
+
141
+
142
def normalize_peak(
    samples: mx.array,
    *,
    target_peak: float = 0.95,
    max_gain: float = 4.0,
) -> mx.array:
    """Boost a waveform so its peak amplitude approaches ``target_peak``.

    Only amplification is applied: when the computed gain is at most 1.0 the
    float32 waveform is returned as-is. The gain never exceeds ``max_gain``.
    Empty or all-zero input is returned unchanged (as float32).
    """

    signal = samples.astype(mx.float32)
    if signal.size > 0:
        peak = float(mx.max(mx.abs(signal)).item())
    else:
        peak = 0.0
    if peak <= 0.0:
        return signal

    gain = min(max_gain, target_peak / peak)
    if gain <= 1.0:
        # Already at or above the target peak; never attenuate.
        return signal
    return signal * gain
159
+
160
+
161
def write_wav(path: str | Path, samples: mx.array, *, sample_rate: int) -> Path:
    """Write a mono float waveform to 16-bit PCM WAV.

    Args:
        path: Destination file. Missing parent directories are created.
        samples: Mono waveform shaped ``(samples,)``. Values are clipped to
            ``[-1.0, 1.0]`` before conversion to 16-bit integers.
        sample_rate: Frame rate written to the WAV header.

    Returns:
        The destination as a ``Path``.

    Raises:
        ValueError: If the waveform is not 1-D. Raised before anything is
            created on disk.
    """

    # Validate and encode first so that rejected input leaves no stray
    # directories or partially written files behind.
    waveform = np.asarray(samples, dtype=np.float32)
    if waveform.ndim != 1:
        raise ValueError(f"Expected mono waveform with shape (samples,), got {waveform.shape}.")
    clipped = np.clip(waveform, -1.0, 1.0)
    # WAV PCM data is little-endian; request that byte order explicitly rather
    # than relying on the host's native int16 layout.
    pcm16 = (clipped * 32767.0).astype("<i2")

    output_path = Path(path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with wave.open(str(output_path), "wb") as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(pcm16.tobytes())

    return output_path
@@ -0,0 +1,21 @@
1
+ """Checkpoint loading and remapping helpers for mlx-speech."""
2
+
3
+ from .layout import ModelArtifactLayout, OpenMossV0Layouts, get_openmoss_v0_layouts
4
+ from .sharded import (
5
+ INDEX_FILENAME,
6
+ LoadedStateDict,
7
+ ShardedCheckpointIndex,
8
+ load_state_dict,
9
+ summarize_prefixes,
10
+ )
11
+
12
+ __all__ = [
13
+ "INDEX_FILENAME",
14
+ "LoadedStateDict",
15
+ "ModelArtifactLayout",
16
+ "OpenMossV0Layouts",
17
+ "ShardedCheckpointIndex",
18
+ "get_openmoss_v0_layouts",
19
+ "load_state_dict",
20
+ "summarize_prefixes",
21
+ ]
@@ -0,0 +1,85 @@
1
+ """Local checkpoint layout helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+
8
+
9
+ REPO_ROOT = Path(__file__).resolve().parents[3]
10
+ MODELS_ROOT = REPO_ROOT / "models"
11
+
12
+
13
@dataclass(frozen=True)
class ModelArtifactLayout:
    """Describe where one model's artifacts live on disk."""

    # Grouping directory name under the models root (e.g. "openmoss").
    family: str
    # Directory name of the individual model inside the family.
    model_name: str
    # Upstream repository identifier — presumably a model-hub repo id; verify
    # against the download/conversion scripts.
    repo_id: str
    # Root directory of this model's artifacts.
    root_dir: Path
    # Directory intended for the original (unconverted) checkpoint files.
    original_dir: Path
    # Directory intended for the converted "mlx-int8" checkpoint files.
    mlx_int8_dir: Path

    def ensure(self) -> "ModelArtifactLayout":
        """Create the artifact directories if missing and return ``self``."""
        for directory in (self.original_dir, self.mlx_int8_dir):
            directory.mkdir(parents=True, exist_ok=True)
        return self
28
+
29
+
30
@dataclass(frozen=True)
class OpenMossV0Layouts:
    """Bundle of the per-model layouts that make up the v0 OpenMOSS assets."""

    # Layout for the MOSS TTS local model.
    moss_tts_local: ModelArtifactLayout
    # Layout for the MOSS audio tokenizer.
    audio_tokenizer: ModelArtifactLayout
    # Layout for the MOSS sound-effect model.
    moss_sound_effect: ModelArtifactLayout

    def ensure(self) -> "OpenMossV0Layouts":
        """Call ``ensure()`` on every contained layout and return ``self``."""
        members = (self.moss_tts_local, self.audio_tokenizer, self.moss_sound_effect)
        for member in members:
            member.ensure()
        return self
43
+
44
+
45
def _build_model_layout(
    models_root: Path,
    family: str,
    model_name: str,
    repo_id: str,
) -> ModelArtifactLayout:
    """Assemble the layout for one model under ``models_root/family/model_name``.

    The ``original`` and ``mlx-int8`` sub-directories are derived from that
    root. Nothing is created on disk here.
    """
    model_root = models_root.joinpath(family, model_name)
    return ModelArtifactLayout(
        family=family,
        model_name=model_name,
        repo_id=repo_id,
        root_dir=model_root,
        original_dir=model_root.joinpath("original"),
        mlx_int8_dir=model_root.joinpath("mlx-int8"),
    )
60
+
61
+
62
def get_openmoss_v0_layouts(models_root: Path | None = None) -> OpenMossV0Layouts:
    """Return the local model layout used by the current v0 plan.

    Args:
        models_root: Directory that holds the model families. When omitted,
            the module-level ``MODELS_ROOT`` (derived from this file's
            location) is used.

    Returns:
        The grouped layouts for the three OpenMOSS v0 models. No directories
        are created; call ``ensure()`` on the result for that.
    """

    base_dir = Path(models_root) if models_root is not None else MODELS_ROOT

    def openmoss_layout(model_name: str, repo_id: str) -> ModelArtifactLayout:
        # All v0 assets share the "openmoss" family and the same root.
        return _build_model_layout(
            models_root=base_dir,
            family="openmoss",
            model_name=model_name,
            repo_id=repo_id,
        )

    return OpenMossV0Layouts(
        moss_tts_local=openmoss_layout(
            "moss_tts_local",
            "OpenMOSS-Team/MOSS-TTS-Local-Transformer",
        ),
        audio_tokenizer=openmoss_layout(
            "moss_audio_tokenizer",
            "OpenMOSS-Team/MOSS-Audio-Tokenizer",
        ),
        moss_sound_effect=openmoss_layout(
            "moss_sound_effect",
            "OpenMOSS-Team/MOSS-SoundEffect",
        ),
    )