mlx-audio 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. mlx_audio-0.0.1/LICENSE +21 -0
  2. mlx_audio-0.0.1/MANIFEST.in +2 -0
  3. mlx_audio-0.0.1/PKG-INFO +131 -0
  4. mlx_audio-0.0.1/README.md +92 -0
  5. mlx_audio-0.0.1/mlx_audio/__init__.py +0 -0
  6. mlx_audio-0.0.1/mlx_audio/sts/__init__.py +0 -0
  7. mlx_audio-0.0.1/mlx_audio/tts/__init__.py +1 -0
  8. mlx_audio-0.0.1/mlx_audio/tts/generate.py +99 -0
  9. mlx_audio-0.0.1/mlx_audio/tts/models/__init__.py +0 -0
  10. mlx_audio-0.0.1/mlx_audio/tts/models/base.py +48 -0
  11. mlx_audio-0.0.1/mlx_audio/tts/models/interpolate.py +108 -0
  12. mlx_audio-0.0.1/mlx_audio/tts/models/kokoro/__init__.py +4 -0
  13. mlx_audio-0.0.1/mlx_audio/tts/models/kokoro/istftnet.py +937 -0
  14. mlx_audio-0.0.1/mlx_audio/tts/models/kokoro/kokoro.py +316 -0
  15. mlx_audio-0.0.1/mlx_audio/tts/models/kokoro/modules.py +659 -0
  16. mlx_audio-0.0.1/mlx_audio/tts/models/kokoro/pipeline.py +459 -0
  17. mlx_audio-0.0.1/mlx_audio/tts/tests/__init__.py +0 -0
  18. mlx_audio-0.0.1/mlx_audio/tts/tests/test_base.py +66 -0
  19. mlx_audio-0.0.1/mlx_audio/tts/tests/test_interpolate.py +88 -0
  20. mlx_audio-0.0.1/mlx_audio/tts/tests/test_models.py +338 -0
  21. mlx_audio-0.0.1/mlx_audio/tts/utils.py +162 -0
  22. mlx_audio-0.0.1/mlx_audio/version.py +1 -0
  23. mlx_audio-0.0.1/mlx_audio.egg-info/PKG-INFO +131 -0
  24. mlx_audio-0.0.1/mlx_audio.egg-info/SOURCES.txt +29 -0
  25. mlx_audio-0.0.1/mlx_audio.egg-info/dependency_links.txt +1 -0
  26. mlx_audio-0.0.1/mlx_audio.egg-info/entry_points.txt +2 -0
  27. mlx_audio-0.0.1/mlx_audio.egg-info/requires.txt +14 -0
  28. mlx_audio-0.0.1/mlx_audio.egg-info/top_level.txt +1 -0
  29. mlx_audio-0.0.1/requirements.txt +14 -0
  30. mlx_audio-0.0.1/setup.cfg +4 -0
  31. mlx_audio-0.0.1/setup.py +45 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Prince Canuma
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,2 @@
1
+ include ./requirements.txt
2
+ recursive-include mlx_audio/ *.py
@@ -0,0 +1,131 @@
1
+ Metadata-Version: 2.2
2
+ Name: mlx-audio
3
+ Version: 0.0.1
4
+ Summary: MLX-Audio is a package for inference of text-to-speech (TTS) and speech-to-speech (STS) models locally on your Mac using MLX
5
+ Home-page: https://github.com/Blaizzy/mlx-audio
6
+ Author: Prince Canuma
7
+ Author-email: prince.gdt@gmail.com
8
+ License: MIT
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Requires-Python: >=3.8
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+ Requires-Dist: misaki[en]>=0.8.2
16
+ Requires-Dist: loguru>=0.7.3
17
+ Requires-Dist: num2words>=0.5.14
18
+ Requires-Dist: spacy>=3.8.4
19
+ Requires-Dist: phonemizer>=3.3.0
20
+ Requires-Dist: espeakng-loader>=0.2.4
21
+ Requires-Dist: mlx>=0.22.0
22
+ Requires-Dist: mlx-vlm>=0.1.14
23
+ Requires-Dist: mlx-lm>=0.21.5
24
+ Requires-Dist: numpy>=1.26.4
25
+ Requires-Dist: torch>=2.5.1
26
+ Requires-Dist: transformers>=4.49.0
27
+ Requires-Dist: sentencepiece>=0.2.0
28
+ Requires-Dist: huggingface_hub>=0.27.0
29
+ Dynamic: author
30
+ Dynamic: author-email
31
+ Dynamic: classifier
32
+ Dynamic: description
33
+ Dynamic: description-content-type
34
+ Dynamic: home-page
35
+ Dynamic: license
36
+ Dynamic: requires-dist
37
+ Dynamic: requires-python
38
+ Dynamic: summary
39
+
40
+ # MLX-Audio
41
+
42
+ A text-to-speech (TTS) and Speech-to-Speech (STS) library built on Apple's MLX framework, providing efficient speech synthesis on Apple Silicon.
43
+
44
+ ## Features
45
+
46
+ - Fast inference on Apple Silicon (M series chips)
47
+ - Multiple language support
48
+ - Voice customization options
49
+ - Quantization support for optimized performance
50
+
51
+ ## Installation
52
+
53
+ ```bash
54
+ pip install mlx-audio
55
+ ```
56
+
57
+ ## Models
58
+
59
+ ### Kokoro
60
+
61
+ Kokoro is a multilingual TTS model that supports various languages and voice styles.
62
+
63
+ #### Example Usage
64
+
65
+ ```python
66
+ from mlx_audio.tts.models.kokoro import KokoroModel, KokoroPipeline
67
+ from IPython.display import Audio
68
+ import soundfile as sf
69
+
70
+ # Initialize the model
71
+ model = KokoroModel(repo_id='prince-canuma/Kokoro-82M')
72
+
73
+ # Create a pipeline with American English
74
+ pipeline = KokoroPipeline(lang_code='a', model=model)
75
+
76
+ # Generate audio
77
+ text = "The MLX King lives. Let him cook!"
78
+ for _, _, audio in pipeline(text, voice='af_heart', speed=1, split_pattern=r'\n+'):
79
+ # Display audio in notebook (if applicable)
80
+ display(Audio(data=audio, rate=24000, autoplay=0))
81
+
82
+ # Save audio to file
83
+ sf.write('audio.wav', audio[0], 24000)
84
+ ```
85
+
86
+ #### Language Options
87
+
88
+ - 🇺🇸 `'a'` - American English
89
+ - 🇬🇧 `'b'` - British English
90
+ - 🇯🇵 `'j'` - Japanese (requires `pip install misaki[ja]`)
91
+ - 🇨🇳 `'z'` - Mandarin Chinese (requires `pip install misaki[zh]`)
92
+
93
+ ## Advanced Features
94
+
95
+ ### Quantization
96
+
97
+ You can quantize models for improved performance:
98
+
99
+ ```python
100
+ from mlx_audio.tts.models.kokoro import KokoroModel
101
+ from mlx_audio.tts.utils import quantize_model
102
+ import json
103
+ import mlx.core as mx
104
+
105
+ model = KokoroModel(repo_id='prince-canuma/Kokoro-82M')
106
+ config = model.config
107
+
108
+ # Quantize to 8-bit
109
+ weights, config = quantize_model(model, config, 64, 8)
110
+
111
+ # Save quantized model
112
+ with open('./8bit/config.json', 'w') as f:
113
+ json.dump(config, f)
114
+
115
+ mx.save_safetensors("./8bit/kokoro-v1_0.safetensors", weights, metadata={"format": "mlx"})
116
+ ```
117
+
118
+ ## Requirements
119
+
120
+ - MLX
121
+ - Python 3.8+
122
+ - Apple Silicon Mac (for optimal performance)
123
+
124
+ ## License
125
+
126
+ [MIT License](LICENSE)
127
+
128
+ ## Acknowledgements
129
+
130
+ - Thanks to the Apple MLX team for providing a great framework for building TTS and STS models.
131
+ - This project uses the Kokoro model architecture for text-to-speech synthesis.
@@ -0,0 +1,92 @@
1
+ # MLX-Audio
2
+
3
+ A text-to-speech (TTS) and Speech-to-Speech (STS) library built on Apple's MLX framework, providing efficient speech synthesis on Apple Silicon.
4
+
5
+ ## Features
6
+
7
+ - Fast inference on Apple Silicon (M series chips)
8
+ - Multiple language support
9
+ - Voice customization options
10
+ - Quantization support for optimized performance
11
+
12
+ ## Installation
13
+
14
+ ```bash
15
+ pip install mlx-audio
16
+ ```
17
+
18
+ ## Models
19
+
20
+ ### Kokoro
21
+
22
+ Kokoro is a multilingual TTS model that supports various languages and voice styles.
23
+
24
+ #### Example Usage
25
+
26
+ ```python
27
+ from mlx_audio.tts.models.kokoro import KokoroModel, KokoroPipeline
28
+ from IPython.display import Audio
29
+ import soundfile as sf
30
+
31
+ # Initialize the model
32
+ model = KokoroModel(repo_id='prince-canuma/Kokoro-82M')
33
+
34
+ # Create a pipeline with American English
35
+ pipeline = KokoroPipeline(lang_code='a', model=model)
36
+
37
+ # Generate audio
38
+ text = "The MLX King lives. Let him cook!"
39
+ for _, _, audio in pipeline(text, voice='af_heart', speed=1, split_pattern=r'\n+'):
40
+ # Display audio in notebook (if applicable)
41
+ display(Audio(data=audio, rate=24000, autoplay=0))
42
+
43
+ # Save audio to file
44
+ sf.write('audio.wav', audio[0], 24000)
45
+ ```
46
+
47
+ #### Language Options
48
+
49
+ - 🇺🇸 `'a'` - American English
50
+ - 🇬🇧 `'b'` - British English
51
+ - 🇯🇵 `'j'` - Japanese (requires `pip install misaki[ja]`)
52
+ - 🇨🇳 `'z'` - Mandarin Chinese (requires `pip install misaki[zh]`)
53
+
54
+ ## Advanced Features
55
+
56
+ ### Quantization
57
+
58
+ You can quantize models for improved performance:
59
+
60
+ ```python
61
+ from mlx_audio.tts.models.kokoro import KokoroModel
62
+ from mlx_audio.tts.utils import quantize_model
63
+ import json
64
+ import mlx.core as mx
65
+
66
+ model = KokoroModel(repo_id='prince-canuma/Kokoro-82M')
67
+ config = model.config
68
+
69
+ # Quantize to 8-bit
70
+ weights, config = quantize_model(model, config, 64, 8)
71
+
72
+ # Save quantized model
73
+ with open('./8bit/config.json', 'w') as f:
74
+ json.dump(config, f)
75
+
76
+ mx.save_safetensors("./8bit/kokoro-v1_0.safetensors", weights, metadata={"format": "mlx"})
77
+ ```
78
+
79
+ ## Requirements
80
+
81
+ - MLX
82
+ - Python 3.8+
83
+ - Apple Silicon Mac (for optimal performance)
84
+
85
+ ## License
86
+
87
+ [MIT License](LICENSE)
88
+
89
+ ## Acknowledgements
90
+
91
+ - Thanks to the Apple MLX team for providing a great framework for building TTS and STS models.
92
+ - This project uses the Kokoro model architecture for text-to-speech synthesis.
File without changes
File without changes
@@ -0,0 +1 @@
1
+
@@ -0,0 +1,99 @@
1
+ import argparse
2
+ import json
3
+ import os
4
+ import sys
5
+
6
+ import mlx.core as mx
7
+ import soundfile as sf
8
+
9
+ from .utils import load_model
10
+
11
+
12
def parse_args():
    """Build and parse the command-line arguments for TTS generation.

    Returns:
        argparse.Namespace with model, text, voice, speed, lang_code,
        file_prefix, verbose, and join_audio attributes.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model",
        type=str,
        default="prince-canuma/Kokoro-82M",
        help="Path or repo id of the model",
    )
    parser.add_argument(
        "--text", type=str, default="The sky above the port", help="Text to generate"
    )
    parser.add_argument("--voice", type=str, default="af_heart", help="Voice name")
    parser.add_argument("--speed", type=float, default=1.0, help="Speed of the audio")
    parser.add_argument("--lang_code", type=str, default="a", help="Language code")
    parser.add_argument(
        "--file_prefix", type=str, default="audio", help="Output file name prefix"
    )
    # BUG FIX: was action="store_false", which made verbose default to True and
    # turned it OFF when the user passed --verbose — the opposite of the help
    # text. store_true makes the flag enable verbose output.
    parser.add_argument("--verbose", action="store_true", help="Print verbose output")
    parser.add_argument(
        "--join_audio", action="store_true", help="Join all audio files into one"
    )
    return parser.parse_args()
34
+
35
+
36
def main():
    """CLI entry point: load a TTS model, synthesize text, and save WAV files.

    Writes one numbered WAV per generated segment, or a single joined WAV
    when --join_audio is set. Errors are reported to stdout rather than
    raised, so the CLI always exits cleanly.
    """
    args = parse_args()
    try:
        model = load_model(model_path=args.model)
        print(
            f"\n\033[94mModel:\033[0m {args.model}\n"
            f"\033[94mText:\033[0m {args.text}\n"
            f"\033[94mVoice:\033[0m {args.voice}\n"
            f"\033[94mSpeed:\033[0m {args.speed}x\n"
            f"\033[94mLanguage:\033[0m {args.lang_code}"
        )
        print("==========")
        # BUG FIX: verbose was hard-coded to True here; honor the --verbose flag.
        results = model.generate(
            text=args.text,
            voice=args.voice,
            speed=args.speed,
            lang_code=args.lang_code,
            verbose=args.verbose,
        )
        print(
            f"\033[92mAudio generated successfully, saving to\033[0m {args.file_prefix}!"
        )

        audio_list = []
        for i, result in enumerate(results):
            if args.join_audio:
                # Collect segments and write a single file after the loop.
                audio_list.append(result.audio)
            else:
                sf.write(f"{args.file_prefix}_{i:03d}.wav", result.audio, 24000)

            if args.verbose:
                print("==========")
                print(f"Duration: {result.audio_duration}")
                print(
                    f"Samples/sec: {result.audio_samples['samples-per-sec']:.1f}"
                )
                print(
                    f"Prompt: {result.token_count} tokens, {result.prompt['tokens-per-sec']:.1f} tokens-per-sec"
                )
                print(
                    f"Audio: {result.audio_samples['samples']} samples, {result.audio_samples['samples-per-sec']:.1f} samples-per-sec"
                )
                print(f"Real-time factor: {result.real_time_factor:.2f}x")
                print(f"Processing time: {result.processing_time_seconds:.2f}s")
                print(f"Peak memory usage: {result.peak_memory_usage:.2f}GB")

        # BUG FIX: guard against an empty result set — mx.concatenate on an
        # empty list would raise and be misreported as "Error loading model".
        if args.join_audio and audio_list:
            print(f"Joining {len(audio_list)} audio files")
            audio = mx.concatenate(audio_list, axis=0)
            sf.write(f"{args.file_prefix}.wav", audio, 24000)
    except ImportError as e:
        print(f"Import error: {e}")
        print(
            "This might be due to incorrect Python path. Check your project structure."
        )
    except Exception as e:
        print(f"Error loading model: {e}")
        import traceback

        traceback.print_exc()


if __name__ == "__main__":
    main()
File without changes
@@ -0,0 +1,48 @@
1
+ import inspect
2
+ from dataclasses import dataclass
3
+
4
+ import mlx.core as mx
5
+
6
+
7
@dataclass
class BaseModelArgs:
    """Base class for model configuration dataclasses."""

    @classmethod
    def from_dict(cls, params):
        """Build an instance from *params*, silently dropping unknown keys.

        Only keys matching the subclass's constructor parameters are kept,
        so configs loaded from JSON with extra fields still deserialize.
        """
        accepted = inspect.signature(cls).parameters
        filtered = {key: value for key, value in params.items() if key in accepted}
        return cls(**filtered)
18
+
19
+
20
def check_array_shape(arr):
    """Check whether *arr* looks like a conv weight laid out (out_channels, kH, kW).

    Returns True only for 3-D shapes where out_channels is the largest
    dimension and the kernel is square (kH == kW).

    Args:
        arr: Any object with a ``.shape`` tuple (e.g. an mx.array).

    Returns:
        bool: True if the shape matches the expected layout.
    """
    shape = arr.shape

    # Only 3-D arrays can match the expected layout.
    # (Original comment said "4 dimensions", but the code checks for 3.)
    if len(shape) != 3:
        return False

    out_channels, kernel_h, kernel_w = shape

    # out_channels must dominate both kernel dims, and the kernel must be square.
    return out_channels >= kernel_h and out_channels >= kernel_w and kernel_h == kernel_w
34
+
35
+
36
@dataclass
class GenerationResult:
    """Container for the output of a single TTS generation segment."""

    audio: mx.array  # generated waveform samples
    samples: int  # number of samples in this segment
    segment_idx: int  # index of this segment within the full generation
    token_count: int  # number of prompt tokens consumed
    # BUG FIX: `audio_samples` was annotated twice (first as int, then as
    # dict); the second annotation silently overrode the first in the
    # dataclass. Keep the dict form, which is what callers read
    # (e.g. result.audio_samples["samples-per-sec"]).
    audio_samples: dict  # e.g. {"samples": ..., "samples-per-sec": ...}
    audio_duration: str  # human-readable duration of the audio
    real_time_factor: float  # audio duration / processing time
    prompt: dict  # e.g. {"tokens-per-sec": ...}
    processing_time_seconds: float
    peak_memory_usage: float  # in GB
@@ -0,0 +1,108 @@
1
+ from typing import List, Optional, Tuple, Union
2
+
3
+ import mlx.core as mx
4
+
5
+
6
def interpolate(
    input: mx.array,
    size: Optional[Union[int, Tuple[int, ...], List[int]]] = None,
    scale_factor: Optional[Union[float, List[float], Tuple[float, ...]]] = None,
    mode: str = "nearest",
    align_corners: Optional[bool] = None,
) -> mx.array:
    """Resize the spatial dimensions of *input*.

    Args:
        input (mx.array): Array shaped [N, C, ...] where ... are spatial dims.
        size (int or tuple): Target spatial size.
        scale_factor (float or tuple): Multiplier per spatial dimension
            (mutually exclusive with size).
        mode (str): 'nearest' or 'linear'.
        align_corners (bool): If True, align corner samples of input/output.

    Raises:
        ValueError: On rank < 3, conflicting/missing size arguments, or more
            than one spatial dimension (only 1-D is implemented).
    """
    if input.ndim < 3:
        raise ValueError(f"Expected at least 3D input (N, C, D1), got {input.ndim}D")

    spatial_dims = input.ndim - 2

    # Exactly one of size / scale_factor must be provided.
    if size is not None and scale_factor is not None:
        raise ValueError("Only one of size or scale_factor should be defined")
    elif size is None and scale_factor is None:
        raise ValueError("One of size or scale_factor must be defined")

    # Normalize scalars to one value per spatial dimension.
    if size is not None and not isinstance(size, (list, tuple)):
        size = [size] * spatial_dims
    if scale_factor is not None and not isinstance(scale_factor, (list, tuple)):
        scale_factor = [scale_factor] * spatial_dims

    # Derive the output size from the scale factors when no size was given.
    # Ceiling (not floor) matches PyTorch's sizing behavior.
    if size is None:
        size = [
            max(1, int(mx.ceil(input.shape[dim + 2] * scale_factor[dim])))
            for dim in range(spatial_dims)
        ]

    # Only the 1-D case (N, C, W) is implemented so far.
    if spatial_dims == 1:
        return interpolate1d(input, size[0], mode, align_corners)
    raise ValueError(
        f"Only 1D interpolation currently supported, got {spatial_dims}D"
    )
+
56
+
57
+ def interpolate1d(
58
+ input: mx.array,
59
+ size: int,
60
+ mode: str = "linear",
61
+ align_corners: Optional[bool] = None,
62
+ ) -> mx.array:
63
+ """1D interpolation implementation."""
64
+ batch_size, channels, in_width = input.shape
65
+
66
+ # Handle edge cases
67
+ if size < 1:
68
+ size = 1
69
+ if in_width < 1:
70
+ in_width = 1
71
+
72
+ if mode == "nearest":
73
+ if size == 1:
74
+ indices = mx.array([0])
75
+ else:
76
+ scale = in_width / size
77
+ indices = mx.floor(mx.arange(size) * scale).astype(mx.int32)
78
+ indices = mx.clip(indices, 0, in_width - 1)
79
+ return input[:, :, indices]
80
+
81
+ # Linear interpolation
82
+ if align_corners and size > 1:
83
+ x = mx.arange(size) * ((in_width - 1) / (size - 1))
84
+ else:
85
+ if size == 1:
86
+ x = mx.array([0.0])
87
+ else:
88
+ x = mx.arange(size) * (in_width / size)
89
+ if not align_corners:
90
+ x = x + 0.5 * (in_width / size) - 0.5
91
+
92
+ # Handle the case where input width is 1
93
+ if in_width == 1:
94
+ output = mx.broadcast_to(input, (batch_size, channels, size))
95
+ return output
96
+
97
+ x_low = mx.floor(x).astype(mx.int32)
98
+ x_high = mx.minimum(x_low + 1, in_width - 1)
99
+ x_frac = x - x_low
100
+
101
+ # Pre-compute indices to avoid repeated computation
102
+ y_low = input[:, :, x_low]
103
+ y_high = input[:, :, x_high]
104
+
105
+ # Vectorized interpolation
106
+ output = y_low * (1 - x_frac)[None, None, :] + y_high * x_frac[None, None, :]
107
+
108
+ return output
@@ -0,0 +1,4 @@
1
+ from .kokoro import Model
2
+ from .pipeline import KokoroPipeline
3
+
4
+ __all__ = ["KokoroPipeline", "Model"]