voxmlx 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,210 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # vim
7
+ *.swp
8
+
9
+ # C extensions
10
+ *.so
11
+
12
+ # Distribution / packaging
13
+ .Python
14
+ build/
15
+ develop-eggs/
16
+ dist/
17
+ downloads/
18
+ eggs/
19
+ .eggs/
20
+ lib/
21
+ lib64/
22
+ parts/
23
+ sdist/
24
+ var/
25
+ wheels/
26
+ share/python-wheels/
27
+ *.egg-info/
28
+ .installed.cfg
29
+ *.egg
30
+ MANIFEST
31
+
32
+ # PyInstaller
33
+ # Usually these files are written by a python script from a template
34
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
35
+ *.manifest
36
+ *.spec
37
+
38
+ # Installer logs
39
+ pip-log.txt
40
+ pip-delete-this-directory.txt
41
+
42
+ # Unit test / coverage reports
43
+ htmlcov/
44
+ .tox/
45
+ .nox/
46
+ .coverage
47
+ .coverage.*
48
+ .cache
49
+ nosetests.xml
50
+ coverage.xml
51
+ *.cover
52
+ *.py.cover
53
+ .hypothesis/
54
+ .pytest_cache/
55
+ cover/
56
+
57
+ # Translations
58
+ *.mo
59
+ *.pot
60
+
61
+ # Django stuff:
62
+ *.log
63
+ local_settings.py
64
+ db.sqlite3
65
+ db.sqlite3-journal
66
+
67
+ # Flask stuff:
68
+ instance/
69
+ .webassets-cache
70
+
71
+ # Scrapy stuff:
72
+ .scrapy
73
+
74
+ # Sphinx documentation
75
+ docs/_build/
76
+
77
+ # PyBuilder
78
+ .pybuilder/
79
+ target/
80
+
81
+ # Jupyter Notebook
82
+ .ipynb_checkpoints
83
+
84
+ # IPython
85
+ profile_default/
86
+ ipython_config.py
87
+
88
+ # pyenv
89
+ # For a library or package, you might want to ignore these files since the code is
90
+ # intended to run in multiple environments; otherwise, check them in:
91
+ # .python-version
92
+
93
+ # pipenv
94
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
95
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
96
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
97
+ # install all needed dependencies.
98
+ #Pipfile.lock
99
+
100
+ # UV
101
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
102
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
103
+ # commonly ignored for libraries.
104
+ #uv.lock
105
+
106
+ # poetry
107
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
108
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
109
+ # commonly ignored for libraries.
110
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
111
+ #poetry.lock
112
+ #poetry.toml
113
+
114
+ # pdm
115
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
116
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
117
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
118
+ #pdm.lock
119
+ #pdm.toml
120
+ .pdm-python
121
+ .pdm-build/
122
+
123
+ # pixi
124
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
125
+ #pixi.lock
126
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
127
+ # in the .venv directory. It is recommended not to include this directory in version control.
128
+ .pixi
129
+
130
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
131
+ __pypackages__/
132
+
133
+ # Celery stuff
134
+ celerybeat-schedule
135
+ celerybeat.pid
136
+
137
+ # SageMath parsed files
138
+ *.sage.py
139
+
140
+ # Environments
141
+ .env
142
+ .envrc
143
+ .venv
144
+ env/
145
+ venv/
146
+ ENV/
147
+ env.bak/
148
+ venv.bak/
149
+
150
+ # Spyder project settings
151
+ .spyderproject
152
+ .spyproject
153
+
154
+ # Rope project settings
155
+ .ropeproject
156
+
157
+ # mkdocs documentation
158
+ /site
159
+
160
+ # mypy
161
+ .mypy_cache/
162
+ .dmypy.json
163
+ dmypy.json
164
+
165
+ # Pyre type checker
166
+ .pyre/
167
+
168
+ # pytype static type analyzer
169
+ .pytype/
170
+
171
+ # Cython debug symbols
172
+ cython_debug/
173
+
174
+ # PyCharm
175
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
176
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
177
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
178
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
179
+ #.idea/
180
+
181
+ # Abstra
182
+ # Abstra is an AI-powered process automation framework.
183
+ # Ignore directories containing user credentials, local state, and settings.
184
+ # Learn more at https://abstra.io/docs
185
+ .abstra/
186
+
187
+ # Visual Studio Code
188
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
189
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
190
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
191
+ # you could uncomment the following to ignore the entire vscode folder
192
+ # .vscode/
193
+
194
+ # Ruff stuff:
195
+ .ruff_cache/
196
+
197
+ # PyPI configuration file
198
+ .pypirc
199
+
200
+ # Cursor
201
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
202
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
203
+ # refer to https://docs.cursor.com/context/ignore-files
204
+ .cursorignore
205
+ .cursorindexingignore
206
+
207
+ # Marimo
208
+ marimo/_static/
209
+ marimo/_lsp/
210
+ __marimo__/
voxmlx-0.0.1/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Awni Hannun
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
voxmlx-0.0.1/PKG-INFO ADDED
@@ -0,0 +1,96 @@
1
+ Metadata-Version: 2.4
2
+ Name: voxmlx
3
+ Version: 0.0.1
4
+ Summary: Voxtral Mini Realtime speech-to-text in MLX
5
+ License: MIT
6
+ Requires-Python: >=3.10
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ Requires-Dist: mlx>=0.20.0
10
+ Requires-Dist: numpy
11
+ Requires-Dist: soundfile
12
+ Requires-Dist: sounddevice
13
+ Requires-Dist: huggingface-hub
14
+ Requires-Dist: sentencepiece
15
+ Dynamic: license-file
16
+
17
+ # voxmlx
18
+
19
+ Realtime speech-to-text with
20
+ [Voxtral Mini Realtime](https://huggingface.co/mistralai/Voxtral-Mini-4B-Realtime-2602)
21
+ in [MLX](https://github.com/ml-explore/mlx).
22
+
23
+ ## Install
24
+
25
+ ```bash
26
+ pip install voxmlx
27
+ ```
28
+
29
+ ## Usage
30
+
31
+ ### `voxmlx`
32
+
33
+ Transcribe audio from a file or stream from the microphone in real-time.
34
+
35
+ **Stream from microphone:**
36
+
37
+ ```bash
38
+ voxmlx
39
+ ```
40
+
41
+ **Transcribe a file:**
42
+
43
+ ```bash
44
+ voxmlx --audio audio.flac
45
+ ```
46
+
47
+ **Options:**
48
+
49
+ | Flag | Description | Default |
50
+ |------|-------------|---------|
51
+ | `--audio` | Path to audio file (omit to stream from mic) | None |
52
+ | `--model` | Model path or HuggingFace model ID | `mlx-community/Voxtral-Mini-4B-Realtime-6bit` |
53
+ | `--temp` | Sampling temperature (`0` = greedy) | `0.0` |
54
+
55
+ ### `voxmlx-convert`
56
+
57
+ Convert Voxtral weights to voxmlx/MLX format with optional quantization.
58
+
59
+ **Basic conversion:**
60
+
61
+ ```bash
62
+ voxmlx-convert --mlx-path voxtral-mlx
63
+ ```
64
+
65
+ **4-bit quantized conversion:**
66
+
67
+ ```bash
68
+ voxmlx-convert -q --mlx-path voxtral-mlx-4bit
69
+ ```
70
+
71
+ **Convert and upload to HuggingFace:**
72
+
73
+ ```bash
74
+ voxmlx-convert -q --mlx-path voxtral-mlx-4bit --upload-repo username/voxtral-mlx-4bit
75
+ ```
76
+
77
+ **Options:**
78
+
79
+ | Flag | Description | Default |
80
+ |------|-------------|---------|
81
+ | `--hf-path` | HuggingFace model ID or local path | `mistralai/Voxtral-Mini-4B-Realtime-2602` |
82
+ | `--mlx-path` | Output directory | `mlx_model` |
83
+ | `-q`, `--quantize` | Quantize the model | Off |
84
+ | `--group-size` | Quantization group size | `64` |
85
+ | `--bits` | Bits per weight | `4` |
86
+ | `--dtype` | Cast weights (`float16`, `bfloat16`, `float32`) | None |
87
+ | `--upload-repo` | HuggingFace repo to upload converted model | None |
88
+
89
+ ### Python API
90
+
91
+ ```python
92
+ from voxmlx import transcribe
93
+
94
+ text = transcribe("audio.flac")
95
+ print(text)
96
+ ```
voxmlx-0.0.1/README.md ADDED
@@ -0,0 +1,80 @@
1
+ # voxmlx
2
+
3
+ Realtime speech-to-text with
4
+ [Voxtral Mini Realtime](https://huggingface.co/mistralai/Voxtral-Mini-4B-Realtime-2602)
5
+ in [MLX](https://github.com/ml-explore/mlx).
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ pip install voxmlx
11
+ ```
12
+
13
+ ## Usage
14
+
15
+ ### `voxmlx`
16
+
17
+ Transcribe audio from a file or stream from the microphone in real-time.
18
+
19
+ **Stream from microphone:**
20
+
21
+ ```bash
22
+ voxmlx
23
+ ```
24
+
25
+ **Transcribe a file:**
26
+
27
+ ```bash
28
+ voxmlx --audio audio.flac
29
+ ```
30
+
31
+ **Options:**
32
+
33
+ | Flag | Description | Default |
34
+ |------|-------------|---------|
35
+ | `--audio` | Path to audio file (omit to stream from mic) | None |
36
+ | `--model` | Model path or HuggingFace model ID | `mlx-community/Voxtral-Mini-4B-Realtime-6bit` |
37
+ | `--temp` | Sampling temperature (`0` = greedy) | `0.0` |
38
+
39
+ ### `voxmlx-convert`
40
+
41
+ Convert Voxtral weights to voxmlx/MLX format with optional quantization.
42
+
43
+ **Basic conversion:**
44
+
45
+ ```bash
46
+ voxmlx-convert --mlx-path voxtral-mlx
47
+ ```
48
+
49
+ **4-bit quantized conversion:**
50
+
51
+ ```bash
52
+ voxmlx-convert -q --mlx-path voxtral-mlx-4bit
53
+ ```
54
+
55
+ **Convert and upload to HuggingFace:**
56
+
57
+ ```bash
58
+ voxmlx-convert -q --mlx-path voxtral-mlx-4bit --upload-repo username/voxtral-mlx-4bit
59
+ ```
60
+
61
+ **Options:**
62
+
63
+ | Flag | Description | Default |
64
+ |------|-------------|---------|
65
+ | `--hf-path` | HuggingFace model ID or local path | `mistralai/Voxtral-Mini-4B-Realtime-2602` |
66
+ | `--mlx-path` | Output directory | `mlx_model` |
67
+ | `-q`, `--quantize` | Quantize the model | Off |
68
+ | `--group-size` | Quantization group size | `64` |
69
+ | `--bits` | Bits per weight | `4` |
70
+ | `--dtype` | Cast weights (`float16`, `bfloat16`, `float32`) | None |
71
+ | `--upload-repo` | HuggingFace repo to upload converted model | None |
72
+
73
+ ### Python API
74
+
75
+ ```python
76
+ from voxmlx import transcribe
77
+
78
+ text = transcribe("audio.flac")
79
+ print(text)
80
+ ```
@@ -0,0 +1,26 @@
1
+ [build-system]
2
+ requires = ["setuptools", "setuptools-scm"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "voxmlx"
7
+ dynamic = ["version"]
8
+ description = "Voxtral Mini Realtime speech-to-text in MLX"
9
+ readme = "README.md"
10
+ license = {text = "MIT"}
11
+ requires-python = ">=3.10"
12
+ dependencies = [
13
+ "mlx>=0.20.0",
14
+ "numpy",
15
+ "soundfile",
16
+ "sounddevice",
17
+ "huggingface-hub",
18
+ "sentencepiece",
19
+ ]
20
+
21
+ [tool.setuptools.dynamic]
22
+ version = {attr = "voxmlx.__version__"}
23
+
24
+ [project.scripts]
25
+ voxmlx = "voxmlx:main"
26
+ voxmlx-convert = "voxmlx.convert:main"
voxmlx-0.0.1/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,81 @@
1
+ __version__ = "0.0.1"
2
+
3
+ import argparse
4
+ from pathlib import Path
5
+
6
+ from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy
7
+ from mistral_common.tokens.tokenizers.tekken import Tekkenizer
8
+
9
+ from .generate import generate
10
+ from .weights import download_model, load_model as _load_weights
11
+
12
+
13
def _load_tokenizer(model_path: Path) -> Tekkenizer:
    """Load the Tekken tokenizer shipped alongside the model weights."""
    # The checkpoint directory is expected to contain a `tekken.json` file.
    return Tekkenizer.from_file(str(model_path / "tekken.json"))
16
+
17
+
18
def _build_prompt_tokens(
    sp: Tekkenizer,
    n_left_pad_tokens: int = 32,
    num_delay_tokens: int = 6,
) -> tuple[list[int], int]:
    """Build the initial decoder prompt.

    Returns the prompt token list (BOS followed by left-pad plus delay
    STREAMING_PAD tokens) together with the number of delay tokens.
    """
    pad_id = sp.get_special_token("[STREAMING_PAD]")
    # BOS, then 32 + 6 = 38 STREAMING_PAD tokens with the defaults.
    prompt = [sp.bos_id]
    prompt.extend([pad_id] * (n_left_pad_tokens + num_delay_tokens))
    return prompt, num_delay_tokens
27
+
28
+
29
def load_model(model_path: str = "mlx-community/Voxtral-Mini-4B-Realtime-6bit"):
    """Load the model, tokenizer, and config.

    ``model_path`` may be a local checkpoint directory or a HuggingFace
    model ID; IDs that do not exist on disk are downloaded first.
    """
    local_path = Path(model_path)
    if not local_path.exists():
        # Not a local directory: treat it as a HF repo ID and fetch it.
        local_path = download_model(model_path)

    model, config = _load_weights(local_path)
    tokenizer = _load_tokenizer(local_path)
    return model, tokenizer, config
38
+
39
+
40
def transcribe(
    audio_path: str,
    model_path: str = "mlx-community/Voxtral-Mini-4B-Realtime-6bit",
    temperature: float = 0.0,
) -> str:
    """Transcribe an audio file and return the decoded text.

    Args:
        audio_path: Path to the audio file to transcribe.
        model_path: Local checkpoint path or HuggingFace model ID.
        temperature: Sampling temperature; 0 means greedy decoding.
    """
    model, tokenizer, _config = load_model(model_path)
    prompt, n_delay = _build_prompt_tokens(tokenizer)

    tokens = generate(
        model,
        audio_path,
        prompt,
        n_delay_tokens=n_delay,
        temperature=temperature,
        eos_token_id=tokenizer.eos_id,
    )

    # Strip special tokens (BOS/EOS/streaming pads) from the decoded text.
    return tokenizer.decode(tokens, special_token_policy=SpecialTokenPolicy.IGNORE)
59
+
60
+
61
def main():
    """CLI entry point: transcribe a file, or stream from the microphone."""
    parser = argparse.ArgumentParser(description="Voxtral Mini Realtime speech-to-text")
    parser.add_argument("--audio", default=None, help="Path to audio file (omit to stream from mic)")
    parser.add_argument("--model", default="mlx-community/Voxtral-Mini-4B-Realtime-6bit", help="Model path or HF model ID")
    parser.add_argument("--temp", type=float, default=0.0, help="Sampling temperature (0 = greedy)")
    args = parser.parse_args()

    if args.audio is None:
        # No file given: fall back to live microphone streaming.
        # Imported lazily so file transcription works without audio-capture deps.
        from .stream import stream_transcribe

        stream_transcribe(model_path=args.model, temperature=args.temp)
    else:
        print(transcribe(args.audio, model_path=args.model, temperature=args.temp))