mlx-kquant 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlx_kquant-0.1.0/CHANGELOG.md +23 -0
- mlx_kquant-0.1.0/CMakeLists.txt +209 -0
- mlx_kquant-0.1.0/LICENSE +21 -0
- mlx_kquant-0.1.0/MANIFEST.in +23 -0
- mlx_kquant-0.1.0/PKG-INFO +377 -0
- mlx_kquant-0.1.0/README.md +339 -0
- mlx_kquant-0.1.0/bindings.cpp +219 -0
- mlx_kquant-0.1.0/cmake/patch-gguflib.cmake +79 -0
- mlx_kquant-0.1.0/metal/kq_quantized.metal +334 -0
- mlx_kquant-0.1.0/metal/kq_quantized_encode.metal +86 -0
- mlx_kquant-0.1.0/metal/kq_quantized_nax.metal +106 -0
- mlx_kquant-0.1.0/metal/mlx/backend/metal/kernels/kq_quantized.h +6351 -0
- mlx_kquant-0.1.0/metal/mlx/backend/metal/kernels/kq_quantized_encode.h +1239 -0
- mlx_kquant-0.1.0/metal/mlx/backend/metal/kernels/kq_quantized_legacy.h +2004 -0
- mlx_kquant-0.1.0/metal/mlx/backend/metal/kernels/kq_quantized_nax.h +2568 -0
- mlx_kquant-0.1.0/metal/mlx/backend/metal/kernels/quantized_utils.h +113 -0
- mlx_kquant-0.1.0/mlx_kquant/__init__.py +46 -0
- mlx_kquant-0.1.0/mlx_kquant/__main__.py +5 -0
- mlx_kquant-0.1.0/mlx_kquant/_deps.py +23 -0
- mlx_kquant-0.1.0/mlx_kquant/_install.py +76 -0
- mlx_kquant-0.1.0/mlx_kquant/_version.py +1 -0
- mlx_kquant-0.1.0/mlx_kquant/cli/__init__.py +74 -0
- mlx_kquant-0.1.0/mlx_kquant/cli/calibrate.py +178 -0
- mlx_kquant-0.1.0/mlx_kquant/cli/chat.py +443 -0
- mlx_kquant-0.1.0/mlx_kquant/cli/fuse.py +218 -0
- mlx_kquant-0.1.0/mlx_kquant/cli/inspect.py +121 -0
- mlx_kquant-0.1.0/mlx_kquant/cli/lora.py +63 -0
- mlx_kquant-0.1.0/mlx_kquant/cli/quantize.py +126 -0
- mlx_kquant-0.1.0/mlx_kquant/cli/run.py +154 -0
- mlx_kquant-0.1.0/mlx_kquant/cli/verify.py +59 -0
- mlx_kquant-0.1.0/mlx_kquant/codec_geometry.py +72 -0
- mlx_kquant-0.1.0/mlx_kquant/convert.py +241 -0
- mlx_kquant-0.1.0/mlx_kquant/imatrix.py +138 -0
- mlx_kquant-0.1.0/mlx_kquant/licenses/gguf-tools-LICENSE +26 -0
- mlx_kquant-0.1.0/mlx_kquant/licenses/llama.cpp-LICENSE +29 -0
- mlx_kquant-0.1.0/mlx_kquant/licenses/mlx-LICENSE +29 -0
- mlx_kquant-0.1.0/mlx_kquant/loader.py +211 -0
- mlx_kquant-0.1.0/mlx_kquant/mlx_lm_patch.py +358 -0
- mlx_kquant-0.1.0/mlx_kquant/nn.py +238 -0
- mlx_kquant-0.1.0/mlx_kquant/py.typed +0 -0
- mlx_kquant-0.1.0/mlx_kquant/recipes.py +418 -0
- mlx_kquant-0.1.0/mlx_kquant.egg-info/PKG-INFO +377 -0
- mlx_kquant-0.1.0/mlx_kquant.egg-info/SOURCES.txt +96 -0
- mlx_kquant-0.1.0/mlx_kquant.egg-info/dependency_links.txt +1 -0
- mlx_kquant-0.1.0/mlx_kquant.egg-info/entry_points.txt +3 -0
- mlx_kquant-0.1.0/mlx_kquant.egg-info/requires.txt +13 -0
- mlx_kquant-0.1.0/mlx_kquant.egg-info/top_level.txt +1 -0
- mlx_kquant-0.1.0/pyproject.toml +120 -0
- mlx_kquant-0.1.0/scripts/check-codecs.py +124 -0
- mlx_kquant-0.1.0/setup.cfg +4 -0
- mlx_kquant-0.1.0/setup.py +28 -0
- mlx_kquant-0.1.0/src/kquant.h +271 -0
- mlx_kquant-0.1.0/src/kquant_codec.cpp +41 -0
- mlx_kquant-0.1.0/src/kquant_codec.h +25 -0
- mlx_kquant-0.1.0/src/kquant_cpu_decode.cpp +1205 -0
- mlx_kquant-0.1.0/src/kquant_cpu_decode.h +98 -0
- mlx_kquant-0.1.0/src/kquant_cpu_encode.cpp +990 -0
- mlx_kquant-0.1.0/src/kquant_cpu_encode.h +33 -0
- mlx_kquant-0.1.0/src/kquant_cpu_neon.cpp +883 -0
- mlx_kquant-0.1.0/src/kquant_cpu_neon.h +52 -0
- mlx_kquant-0.1.0/src/kquant_dequantize.cpp +143 -0
- mlx_kquant-0.1.0/src/kquant_encode.cpp +181 -0
- mlx_kquant-0.1.0/src/kquant_gather.cpp +821 -0
- mlx_kquant-0.1.0/src/kquant_gguf.cpp +567 -0
- mlx_kquant-0.1.0/src/kquant_gguf.h +65 -0
- mlx_kquant-0.1.0/src/kquant_internal.h +23 -0
- mlx_kquant-0.1.0/src/kquant_matmul.cpp +580 -0
- mlx_kquant-0.1.0/src/kquant_metal_internal.h +271 -0
- mlx_kquant-0.1.0/src/kquant_ops.cpp +422 -0
- mlx_kquant-0.1.0/tests/conftest.py +21 -0
- mlx_kquant-0.1.0/tests/fixtures/SHA256SUMS +10 -0
- mlx_kquant-0.1.0/tests/fixtures/q2_k.npz +0 -0
- mlx_kquant-0.1.0/tests/fixtures/q2_k_moe.npz +0 -0
- mlx_kquant-0.1.0/tests/fixtures/q3_k.npz +0 -0
- mlx_kquant-0.1.0/tests/fixtures/q3_k_moe.npz +0 -0
- mlx_kquant-0.1.0/tests/fixtures/q4_k.npz +0 -0
- mlx_kquant-0.1.0/tests/fixtures/q4_k_moe.npz +0 -0
- mlx_kquant-0.1.0/tests/fixtures/q5_k.npz +0 -0
- mlx_kquant-0.1.0/tests/fixtures/q5_k_moe.npz +0 -0
- mlx_kquant-0.1.0/tests/fixtures/q6_k.npz +0 -0
- mlx_kquant-0.1.0/tests/fixtures/q6_k_moe.npz +0 -0
- mlx_kquant-0.1.0/tests/gen_fixtures.py +92 -0
- mlx_kquant-0.1.0/tests/test_cli.py +523 -0
- mlx_kquant-0.1.0/tests/test_codecs.py +177 -0
- mlx_kquant-0.1.0/tests/test_cpu_decode.py +222 -0
- mlx_kquant-0.1.0/tests/test_cpu_neon.py +175 -0
- mlx_kquant-0.1.0/tests/test_dequant.py +155 -0
- mlx_kquant-0.1.0/tests/test_encode.py +252 -0
- mlx_kquant-0.1.0/tests/test_fuse_cli.py +351 -0
- mlx_kquant-0.1.0/tests/test_gather.py +285 -0
- mlx_kquant-0.1.0/tests/test_gguf.py +94 -0
- mlx_kquant-0.1.0/tests/test_loader.py +275 -0
- mlx_kquant-0.1.0/tests/test_lora_patch.py +225 -0
- mlx_kquant-0.1.0/tests/test_matmul.py +162 -0
- mlx_kquant-0.1.0/tests/test_matmul_cold.py +136 -0
- mlx_kquant-0.1.0/tests/test_nn.py +207 -0
- mlx_kquant-0.1.0/tests/test_recipes.py +242 -0
- mlx_kquant-0.1.0/tests/test_vjp.py +195 -0
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project are documented here. The format follows
|
|
4
|
+
[Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and the project aims to
|
|
5
|
+
adhere to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
6
|
+
|
|
7
|
+
## [0.1.0]
|
|
8
|
+
|
|
9
|
+
First public release. A C++/Metal extension for a stock `mlx==0.31.2` wheel that
|
|
10
|
+
adds the K-quant superblock and per-block integer codecs as native MLX ops
|
|
11
|
+
(`kq.dequantize` / `quantized_matmul` / `gather_qmm` / `quantize`), with Metal and
|
|
12
|
+
portable CPU paths for all ten codecs. On top of the ops, the `mlx-kquant` CLI
|
|
13
|
+
quantizes an HF / mlx-lm model into a K-quant MLX safetensors checkpoint and runs,
|
|
14
|
+
chats with, LoRA-fine-tunes, and fuses it, with importance-matrix calibration and
|
|
15
|
+
per-tensor recipe inspection. A loader runs those checkpoints on stock mlx-lm.
|
|
16
|
+
|
|
17
|
+
### Notes
|
|
18
|
+
- `requires-python >= 3.10` (mlx 0.31.2 ships no cp39 wheel).
|
|
19
|
+
- The GPU path is macOS 26.2 (Tahoe) or later on Apple Silicon (Metal); the NAX
|
|
20
|
+
matmul kernel needs the Metal 4 SDK (`MetalPerformancePrimitives`). Linux is
|
|
21
|
+
supported CPU-only - build against `mlx[cpu]==0.31.2`; model forwards there also
|
|
22
|
+
need `MLX_DISABLE_COMPILE=1` (an upstream MLX CPU-JIT limitation under GCC, not
|
|
23
|
+
mlx-kquant).
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
cmake_minimum_required(VERSION 3.27)
|
|
2
|
+
|
|
3
|
+
project(mlx_kquant LANGUAGES CXX C)
|
|
4
|
+
|
|
5
|
+
# ----------------------------- Setup -----------------------------
|
|
6
|
+
# C++20: MLX 0.31.2's public headers (device.h, stream.h) use defaulted
|
|
7
|
+
# comparison operators, a C++20 feature. Apple Clang accepts them as an
|
|
8
|
+
# extension under -std=c++17, but GCC (the Linux toolchain) rejects them, so a
|
|
9
|
+
# C++17 standard breaks the Metal-free Linux build. 20 builds cleanly on both.
|
|
10
|
+
set(CMAKE_CXX_STANDARD 20)
|
|
11
|
+
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
|
12
|
+
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
|
|
13
|
+
|
|
14
|
+
option(BUILD_SHARED_LIBS "Build extensions as a shared library" ON)
|
|
15
|
+
|
|
16
|
+
# ----------------------------- Dependencies -----------------------------
|
|
17
|
+
# Pick the right interpreter. mlx.extension.CMakeBuild invokes cmake WITHOUT
|
|
18
|
+
# forwarding the building interpreter, so CMake's FindPython can otherwise latch
|
|
19
|
+
# onto an unrelated system/uv Python that lacks mlx & nanobind (then
|
|
20
|
+
# `python -m mlx --cmake-dir` returns empty and find_package(MLX) fails).
|
|
21
|
+
# Precedence: an explicit -DPython_EXECUTABLE (e.g. via CMAKE_ARGS) wins; else an
|
|
22
|
+
# active virtualenv ($VIRTUAL_ENV); else CMake's normal search.
|
|
23
|
+
if(NOT Python_EXECUTABLE AND DEFINED ENV{VIRTUAL_ENV})
|
|
24
|
+
set(Python_EXECUTABLE "$ENV{VIRTUAL_ENV}/bin/python")
|
|
25
|
+
endif()
|
|
26
|
+
set(Python_FIND_VIRTUALENV FIRST)
|
|
27
|
+
|
|
28
|
+
# Interpreter floor matches the package's `requires-python >= 3.10`: mlx 0.31.2
# ships no cp39 wheel, so a 3.9 interpreter can never provide mlx or nanobind.
find_package(
|
|
29
|
+
Python 3.10
|
|
30
|
+
COMPONENTS Interpreter Development.Module
|
|
31
|
+
REQUIRED)
|
|
32
|
+
|
|
33
|
+
execute_process(
|
|
34
|
+
COMMAND "${Python_EXECUTABLE}" -m nanobind --cmake_dir
|
|
35
|
+
OUTPUT_STRIP_TRAILING_WHITESPACE
|
|
36
|
+
OUTPUT_VARIABLE nanobind_ROOT)
|
|
37
|
+
find_package(nanobind CONFIG REQUIRED)
|
|
38
|
+
|
|
39
|
+
execute_process(
|
|
40
|
+
COMMAND "${Python_EXECUTABLE}" -m mlx --cmake-dir
|
|
41
|
+
OUTPUT_STRIP_TRAILING_WHITESPACE
|
|
42
|
+
OUTPUT_VARIABLE MLX_ROOT)
|
|
43
|
+
find_package(MLX CONFIG REQUIRED)
|
|
44
|
+
|
|
45
|
+
# NOTE: do NOT splice ${MLX_CXX_FLAGS} into CMAKE_CXX_FLAGS - it is a CMake list
|
|
46
|
+
# (-DACCELERATE_NEW_LAPACK;-D_METAL_), and embedding the ';' into the flags
|
|
47
|
+
# *string* corrupts every compile command (`/bin/sh: -D_METAL_: command not
|
|
48
|
+
# found`). MLXConfig already attaches these as INTERFACE_COMPILE_OPTIONS on the
|
|
49
|
+
# imported `mlx` target, so -D_METAL_ propagates correctly to anything that
|
|
50
|
+
# links `mlx` (below).
|
|
51
|
+
|
|
52
|
+
# ----------------------------- gguflib (GGUF parser) -----------------------------
|
|
53
|
+
# antirez/gguf-tools single-file GGUF parser, for the C++ kq.load_gguf loader.
|
|
54
|
+
# PIN: a ref new enough to define GGUF_TYPE_Q5_0/Q5_1/Q2_K..Q6_K + correct
|
|
55
|
+
# K-quant block sizes (older refs only know Q4_0/Q4_1/Q8_0).
|
|
56
|
+
include(FetchContent)
|
|
57
|
+
FetchContent_Declare(
|
|
58
|
+
gguflib
|
|
59
|
+
GIT_REPOSITORY https://github.com/antirez/gguf-tools/
|
|
60
|
+
GIT_TAG fdfafbed766db0a1e9019b07994cd88f133d1aab
|
|
61
|
+
# The pinned ref only knows GGML types up to BF16 (30); a tensor of any newer
|
|
62
|
+
# codec trips gguf_get_tensor's `>= GGUF_TYPE_COUNT` guard and truncates the
|
|
63
|
+
# tensor list. Teach the parser the post-BF16 type block-geometry (incl. the
|
|
64
|
+
# MXFP4/NVFP4 float codecs MLX has native kernels for). Idempotent; see script.
|
|
65
|
+
PATCH_COMMAND ${CMAKE_COMMAND} -DGGUFLIB_DIR=<SOURCE_DIR>
|
|
66
|
+
-P ${CMAKE_CURRENT_LIST_DIR}/cmake/patch-gguflib.cmake)
|
|
67
|
+
FetchContent_MakeAvailable(gguflib)
|
|
68
|
+
add_library(gguflib STATIC ${gguflib_SOURCE_DIR}/fp16.c
|
|
69
|
+
${gguflib_SOURCE_DIR}/gguflib.c)
|
|
70
|
+
# Hidden visibility so the gguf_* symbols stay LOCAL to whichever image links
|
|
71
|
+
# them whole-archive (below) and never interpose / collide at runtime with the
|
|
72
|
+
# identically-named gguf_* symbols that libmlx also exports.
|
|
73
|
+
set_target_properties(gguflib PROPERTIES POSITION_INDEPENDENT_CODE ON
|
|
74
|
+
C_VISIBILITY_PRESET hidden)
|
|
75
|
+
# gguflib uses assert() to reject malformed tensor headers (e.g. ndim > 8).
|
|
76
|
+
# -DNDEBUG (release builds) would compile those out, leaving out-of-bounds
|
|
77
|
+
# reads/writes unguarded when loading untrusted GGUF files. Force NDEBUG off.
|
|
78
|
+
target_compile_options(gguflib PRIVATE -UNDEBUG)
|
|
79
|
+
|
|
80
|
+
# Whole-archive flag for the patched gguflib (used on both linking images below):
|
|
81
|
+
# pull all its (hidden) gguf_* objects into the image so they win over libmlx's
|
|
82
|
+
# exported copy. Apple ld spells this -force_load; GNU ld / lld use the
|
|
83
|
+
# --whole-archive / --no-whole-archive bracket.
|
|
84
|
+
if(APPLE)
|
|
85
|
+
set(KQ_WHOLE_ARCHIVE_GGUFLIB "-Wl,-force_load,$<TARGET_FILE:gguflib>")
|
|
86
|
+
else()
|
|
87
|
+
set(KQ_WHOLE_ARCHIVE_GGUFLIB
|
|
88
|
+
"-Wl,--whole-archive,$<TARGET_FILE:gguflib>,--no-whole-archive")
|
|
89
|
+
endif()
|
|
90
|
+
|
|
91
|
+
# ----------------------------- C++ library -----------------------------
|
|
92
|
+
add_library(mlx_kquant_ext)
|
|
93
|
+
|
|
94
|
+
target_sources(
|
|
95
|
+
mlx_kquant_ext
|
|
96
|
+
PUBLIC ${CMAKE_CURRENT_LIST_DIR}/src/kquant_codec.cpp
|
|
97
|
+
${CMAKE_CURRENT_LIST_DIR}/src/kquant_cpu_decode.cpp
|
|
98
|
+
${CMAKE_CURRENT_LIST_DIR}/src/kquant_cpu_encode.cpp
|
|
99
|
+
${CMAKE_CURRENT_LIST_DIR}/src/kquant_ops.cpp
|
|
100
|
+
${CMAKE_CURRENT_LIST_DIR}/src/kquant_dequantize.cpp
|
|
101
|
+
${CMAKE_CURRENT_LIST_DIR}/src/kquant_matmul.cpp
|
|
102
|
+
${CMAKE_CURRENT_LIST_DIR}/src/kquant_gather.cpp
|
|
103
|
+
${CMAKE_CURRENT_LIST_DIR}/src/kquant_encode.cpp
|
|
104
|
+
${CMAKE_CURRENT_LIST_DIR}/src/kquant_gguf.cpp)
|
|
105
|
+
|
|
106
|
+
target_include_directories(mlx_kquant_ext PUBLIC ${CMAKE_CURRENT_LIST_DIR}/src)
|
|
107
|
+
# gguflib include/link are PUBLIC: target_sources above lists the .cpp as PUBLIC,
|
|
108
|
+
# so it is also compiled into the `_ext` nanobind module (consumer), which then
|
|
109
|
+
# needs gguflib.h on its include path and the gguf_* symbols at link time.
|
|
110
|
+
target_include_directories(mlx_kquant_ext PUBLIC ${gguflib_SOURCE_DIR})
|
|
111
|
+
target_link_libraries(mlx_kquant_ext PUBLIC mlx)
|
|
112
|
+
# Apple: route the CPU matmul's large-M GEMM through Accelerate (engages the
|
|
113
|
+
# AMX/SME matrix units). PUBLIC so the nanobind module, which re-compiles the
|
|
114
|
+
# PUBLIC sources above, sees the same define and link. Other platforms keep
|
|
115
|
+
# the portable threaded-scalar fallback.
|
|
116
|
+
if(APPLE)
|
|
117
|
+
target_compile_definitions(mlx_kquant_ext PUBLIC KQ_USE_ACCELERATE)
|
|
118
|
+
target_link_libraries(mlx_kquant_ext PUBLIC "-framework Accelerate")
|
|
119
|
+
endif()
|
|
120
|
+
# arm64: NEON-dotprod int8 GEMV kernels for the fused small-M CPU matmul.
|
|
121
|
+
# The TU is compiled only on arm64/aarch64 hosts; execution is further gated
|
|
122
|
+
# at runtime (KQ_CPU_NEON=0 kill switch; Linux aarch64 hwcap dotprod check).
|
|
123
|
+
# Every other target keeps the portable scalar path (the header stubs
|
|
124
|
+
# kq_neon_kernel to nullptr when KQ_CPU_NEON_TU is undefined).
|
|
125
|
+
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm64|aarch64)$")
|
|
126
|
+
target_sources(mlx_kquant_ext
|
|
127
|
+
PUBLIC ${CMAKE_CURRENT_LIST_DIR}/src/kquant_cpu_neon.cpp)
|
|
128
|
+
target_compile_definitions(mlx_kquant_ext PUBLIC KQ_CPU_NEON_TU)
|
|
129
|
+
if(NOT APPLE)
|
|
130
|
+
# Linux toolchains may default to an armv8.0 baseline without dotprod;
|
|
131
|
+
# enable it for this one TU - dotprod instructions only execute after the
|
|
132
|
+
# runtime hwcap check passes.
|
|
133
|
+
set_source_files_properties(
|
|
134
|
+
${CMAKE_CURRENT_LIST_DIR}/src/kquant_cpu_neon.cpp
|
|
135
|
+
PROPERTIES COMPILE_OPTIONS "-march=armv8.2-a+dotprod")
|
|
136
|
+
endif()
|
|
137
|
+
endif()
|
|
138
|
+
# libmlx bundles its OWN (older) gguflib and exports its gguf_* symbols. A plain
|
|
139
|
+
# link can bind our gguf_get_tensor()/gguf_open() calls to MLX's copy, whose
|
|
140
|
+
# GGUF_TYPE_COUNT predates the MXFP4/NVFP4 float codecs - so a tensor of type
|
|
141
|
+
# >= 31 trips its `>= GGUF_TYPE_COUNT` guard and the tensor list silently
|
|
142
|
+
# truncates at the first such tensor. Linking our patched gguflib whole-archive
|
|
143
|
+
# pulls its objects into this image as hidden-visibility definitions, so every
|
|
144
|
+
# gguf_* call resolves in-image to the patched parser instead of to libmlx.
|
|
145
|
+
target_link_libraries(mlx_kquant_ext PUBLIC gguflib)
|
|
146
|
+
target_link_options(mlx_kquant_ext PRIVATE ${KQ_WHOLE_ARCHIVE_GGUFLIB})
|
|
147
|
+
|
|
148
|
+
# ----------------------------- Metal library -----------------------------
|
|
149
|
+
# The kq_* kernels compiled into a single mlx_kquant.metallib.
|
|
150
|
+
# The repo's metal/ dir is searched BEFORE the stock-wheel include tree so the
|
|
151
|
+
# repo's kq_*.h + quantized_utils.h (which adds load_vector) win;
|
|
152
|
+
# steel/gemm/*, utils.h, etc. fall through to the stock wheel headers.
|
|
153
|
+
if(MLX_BUILD_METAL)
|
|
154
|
+
mlx_build_metallib(
|
|
155
|
+
TARGET
|
|
156
|
+
mlx_kquant_metallib
|
|
157
|
+
TITLE
|
|
158
|
+
mlx_kquant
|
|
159
|
+
SOURCES
|
|
160
|
+
${CMAKE_CURRENT_LIST_DIR}/metal/kq_quantized.metal
|
|
161
|
+
${CMAKE_CURRENT_LIST_DIR}/metal/kq_quantized_nax.metal
|
|
162
|
+
${CMAKE_CURRENT_LIST_DIR}/metal/kq_quantized_encode.metal
|
|
163
|
+
INCLUDE_DIRS
|
|
164
|
+
${CMAKE_CURRENT_LIST_DIR}/metal
|
|
165
|
+
${MLX_INCLUDE_DIRS}
|
|
166
|
+
OUTPUT_DIRECTORY
|
|
167
|
+
${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
|
|
168
|
+
|
|
169
|
+
add_dependencies(mlx_kquant_ext mlx_kquant_metallib)
|
|
170
|
+
endif()
|
|
171
|
+
|
|
172
|
+
# ----------------------------- Python bindings -----------------------------
|
|
173
|
+
nanobind_add_module(
|
|
174
|
+
_ext
|
|
175
|
+
NB_STATIC
|
|
176
|
+
STABLE_ABI
|
|
177
|
+
LTO
|
|
178
|
+
NOMINSIZE
|
|
179
|
+
NB_DOMAIN
|
|
180
|
+
mlx
|
|
181
|
+
${CMAKE_CURRENT_LIST_DIR}/bindings.cpp)
|
|
182
|
+
target_link_libraries(_ext PRIVATE mlx_kquant_ext)
|
|
183
|
+
# kquant_gguf.cpp is a PUBLIC source (above), so it is also compiled into this
|
|
184
|
+
# module and the LTO link keeps its own gguf_* references - force_load the
|
|
185
|
+
# patched gguflib here too so they don't fall back to libmlx's exported copy
|
|
186
|
+
# (see the mlx_kquant_ext force_load note for the full rationale).
|
|
187
|
+
target_link_libraries(_ext PRIVATE gguflib)
|
|
188
|
+
target_link_options(_ext PRIVATE ${KQ_WHOLE_ARCHIVE_GGUFLIB})
|
|
189
|
+
|
|
190
|
+
if(BUILD_SHARED_LIBS)
|
|
191
|
+
if(APPLE)
|
|
192
|
+
# @loader_path finds the co-located libmlx_kquant_ext.dylib (same package
|
|
193
|
+
# dir); the second rpath resolves @rpath/libmlx.dylib against the *user's*
|
|
194
|
+
# installed mlx wheel at runtime. _ext.so lives in site-packages/mlx_kquant/,
|
|
195
|
+
# libmlx in site-packages/mlx/lib/, so ../mlx/lib reaches it. (The absolute
|
|
196
|
+
# build-tree rpath MLXConfig adds works only on the build machine; this is
|
|
197
|
+
# what makes a redistributed wheel find the ABI-pinned mlx==0.31.2. delocate
|
|
198
|
+
# must therefore *exclude* libmlx from vendoring - see [tool.cibuildwheel].)
|
|
199
|
+
target_link_options(_ext PRIVATE -Wl,-rpath,@loader_path)
|
|
200
|
+
target_link_options(_ext PRIVATE -Wl,-rpath,@loader_path/../mlx/lib)
|
|
201
|
+
else()
|
|
202
|
+
# ELF equivalent of @loader_path: $ORIGIN is the dir of _ext.so. The
|
|
203
|
+
# co-located libmlx_kquant_ext.so sits beside it; ../mlx/lib reaches the
|
|
204
|
+
# user's installed mlx wheel's libmlx.so (auditwheel must *exclude* libmlx.so
|
|
205
|
+
# from vendoring, mirroring the macOS delocate exclude).
|
|
206
|
+
target_link_options(_ext PRIVATE "-Wl,-rpath,\$ORIGIN")
|
|
207
|
+
target_link_options(_ext PRIVATE "-Wl,-rpath,\$ORIGIN/../mlx/lib")
|
|
208
|
+
endif()
|
|
209
|
+
endif()
|
mlx_kquant-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright © 2026 Asher Feldman
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# Build inputs the sdist needs to compile the extension from source.
|
|
2
|
+
include CMakeLists.txt
|
|
3
|
+
include setup.py
|
|
4
|
+
include bindings.cpp
|
|
5
|
+
recursive-include src *.cpp *.h
|
|
6
|
+
recursive-include metal *.metal *.h
|
|
7
|
+
recursive-include cmake *.cmake
|
|
8
|
+
|
|
9
|
+
# Project metadata and tooling.
|
|
10
|
+
include LICENSE
|
|
11
|
+
include README.md
|
|
12
|
+
include CHANGELOG.md
|
|
13
|
+
recursive-include scripts *.py
|
|
14
|
+
|
|
15
|
+
# Tests and fixtures, so an sdist build can run its own checks.
|
|
16
|
+
recursive-include tests *.py *.npz
|
|
17
|
+
include tests/fixtures/SHA256SUMS
|
|
18
|
+
|
|
19
|
+
# Never ship machine-specific build artifacts in the sdist (the wheel carries
|
|
20
|
+
# these via package-data; the sdist must not, or it would build them in stale).
|
|
21
|
+
global-exclude *.so *.dylib *.metallib *.air *.pyc
|
|
22
|
+
prune tests/fixtures/__pycache__
|
|
23
|
+
global-exclude .DS_Store
|
|
@@ -0,0 +1,377 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: mlx-kquant
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: GGUF K-quant dequantize / quantized-matmul / gather-qmm / quantize ops for MLX, via custom Metal kernels.
|
|
5
|
+
Author: Asher Feldman
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/asher/mlx-kquant
|
|
8
|
+
Project-URL: Repository, https://github.com/asher/mlx-kquant
|
|
9
|
+
Project-URL: Issues, https://github.com/asher/mlx-kquant/issues
|
|
10
|
+
Project-URL: Changelog, https://github.com/asher/mlx-kquant/blob/main/CHANGELOG.md
|
|
11
|
+
Keywords: mlx,quantization,gguf,k-quant,metal,apple-silicon,llm
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Operating System :: MacOS
|
|
20
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
21
|
+
Classifier: Environment :: GPU
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
23
|
+
Requires-Python: >=3.10
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
License-File: LICENSE
|
|
26
|
+
Requires-Dist: mlx==0.31.2
|
|
27
|
+
Provides-Extra: tools
|
|
28
|
+
Requires-Dist: mlx-lm>=0.27; extra == "tools"
|
|
29
|
+
Requires-Dist: transformers; extra == "tools"
|
|
30
|
+
Requires-Dist: gguf; extra == "tools"
|
|
31
|
+
Requires-Dist: numpy; extra == "tools"
|
|
32
|
+
Provides-Extra: dev
|
|
33
|
+
Requires-Dist: pytest; extra == "dev"
|
|
34
|
+
Requires-Dist: ruff; extra == "dev"
|
|
35
|
+
Requires-Dist: numpy; extra == "dev"
|
|
36
|
+
Requires-Dist: gguf; extra == "dev"
|
|
37
|
+
Dynamic: license-file
|
|
38
|
+
|
|
39
|
+
# mlx-kquant
|
|
40
|
+
|
|
41
|
+
[![CI](https://github.com/asher/mlx-kquant/actions/workflows/ci.yml/badge.svg)](https://github.com/asher/mlx-kquant/actions/workflows/ci.yml)
|
|
42
|
+
|
|
43
|
+
Bring **K-quant precision to [MLX](https://github.com/ml-explore/mlx)** on Apple Silicon: a C++/Metal
|
|
44
|
+
**extension** for a stock `mlx` wheel that adds the K-quant superblock and per-block integer codecs
|
|
45
|
+
as native MLX ops, plus a toolchain that quantizes a model into a **K-quant MLX safetensors
|
|
46
|
+
checkpoint** and runs, LoRA-trains, and fuses it.
|
|
47
|
+
|
|
48
|
+
Two layers:
|
|
49
|
+
|
|
50
|
+
- **Ops** (C++/Metal) - a `kq.*` namespace (`dequantize`, `quantized_matmul`, `gather_qmm`,
|
|
51
|
+
`quantize`) backed by Metal kernels compiled to a `.metallib` at build time (no runtime JIT). All
|
|
52
|
+
ten codecs: `q2_k, q3_k, q4_k, q5_k, q6_k` and `q4_0, q4_1, q5_0, q5_1, q8_0`.
|
|
53
|
+
- **Tooling** (Python) - `mlx-kquant quantize / run / chat / lora / fuse` (plus `verify`, `inspect`,
|
|
54
|
+
`calibrate-imatrix`) and a `loader` that create and run K-quant checkpoints in **MLX-native
|
|
55
|
+
safetensors** format.
|
|
56
|
+
|
|
57
|
+
## Why
|
|
58
|
+
|
|
59
|
+
K-quant is the precision recipe behind the strongest small-footprint community quants; this makes it
|
|
60
|
+
first-class on MLX. Quantize an HF / `mlx-lm` model to a uniform- or mixed-precision K-quant
|
|
61
|
+
checkpoint, then load, generate, LoRA-train, and fuse it - all on a stock `mlx` wheel, all in MLX
|
|
62
|
+
safetensors. The kernels are tuned for real models (matrix-contiguity handling for fused MoE experts,
|
|
63
|
+
single-pass NAX matmul); see [Performance](#performance).
|
|
64
|
+
|
|
65
|
+
## Install
|
|
66
|
+
|
|
67
|
+
**macOS 26.2 (Tahoe) or later on Apple Silicon** (the GPU path), with the Metal toolchain
|
|
68
|
+
(`xcrun metal`) and the exact pinned MLX wheel.
|
|
69
|
+
|
|
70
|
+
```sh
|
|
71
|
+
pip install "mlx==0.31.2" # pinned, ABI-matched stock wheel (pulls the Metal backend)
|
|
72
|
+
pip install -e . # builds _ext + mlx_kquant.metallib
|
|
73
|
+
pip install -e ".[tools]" # + mlx-lm, for the CLI subcommands (quantize / run / chat / ...)
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
**Linux (CPU-only)** also builds, with no Metal toolchain. The ops run on their portable `eval_cpu`
|
|
77
|
+
paths and no metallib is produced. (The tuned matmul/gather are Apple-Silicon-targeted: arm64 Linux
|
|
78
|
+
picks up the NEON int8 GEMV when the CPU has dot-product, but the Accelerate GEMM is Apple-only, and
|
|
79
|
+
x86_64 stays on the scalar/threaded path.) The base `mlx` wheel ships no backend on Linux, so
|
|
80
|
+
install the CPU one explicitly:
|
|
81
|
+
|
|
82
|
+
```sh
|
|
83
|
+
pip install "mlx[cpu]==0.31.2" # base frontend + libmlx CPU backend
|
|
84
|
+
pip install -e . --no-build-isolation
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
CPU is for portability and CI, not throughput. Running a full model forward on Linux also needs
|
|
88
|
+
`MLX_DISABLE_COMPILE=1`; see [Limitations](#limitations).
|
|
89
|
+
|
|
90
|
+
Smoke-test the toolchain:
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
import mlx_kquant as kq
|
|
94
|
+
kq.codecs() # -> ['q2_k', 'q3_k', ..., 'q8_0']
|
|
95
|
+
kq.metallib_loads() # -> True (the bundled metallib opened on the Metal device)
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
> The extension links `libmlx` and its kernels `#include` MLX's steel-GEMM headers, so it is bound to
|
|
99
|
+
> an exact MLX ABI **and** header API. The pin is intentionally `==`, never `>=`; moving to a newer
|
|
100
|
+
> `mlx` may require updating the bundled headers and recompiling. See [Version pinning](#version-pinning).
|
|
101
|
+
|
|
102
|
+
## Quickstart
|
|
103
|
+
|
|
104
|
+
Quantize a checkpoint and run it, load it through mlx-lm, fine-tune it with LoRA, or build directly on
|
|
105
|
+
the `kq.*` ops.
|
|
106
|
+
|
|
107
|
+
### Create and run a checkpoint
|
|
108
|
+
|
|
109
|
+
The CLI (the `[tools]` extra adds `mlx-lm`) quantizes an HF / `mlx-lm` model into a K-quant **MLX
|
|
110
|
+
safetensors** checkpoint and runs it:
|
|
111
|
+
|
|
112
|
+
```sh
|
|
113
|
+
pip install "mlx-kquant[tools]"
|
|
114
|
+
mlx-kquant quantize --model Qwen/Qwen3-0.6B --preset q4_k_m --mlx-path qwen3-q4
|
|
115
|
+
mlx-kquant run --model qwen3-q4 --prompt "Explain entropy in one sentence."
|
|
116
|
+
mlx-kquant chat --model qwen3-q4 --temp 0.7 # interactive REPL (mlx-lm chat)
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
`run` takes the usual sampling knobs (`--temp`, `--top-p`, `--top-k`, `--min-p`, `--seed`,
|
|
120
|
+
`--repetition-penalty`, `--presence-penalty`, `--frequency-penalty`) and chat-template controls
|
|
121
|
+
(`--system-prompt`, `--no-chat-template`, `--chat-template-config` for template kwargs such as
|
|
122
|
+
`'{"enable_thinking": false}'`). The `chat` REPL has a line-editable prompt with persistent
|
|
123
|
+
history (`--no-history` or in-chat `/history off|on|clear` to control it) and in-chat sampling
|
|
124
|
+
control (`/temp`, `/top-p`, `/top-k`, `/min-p`, `/max-tokens`, and the three penalties;
|
|
125
|
+
`/sampling` shows current values); `/load <file>` prefills the next prompt from a text file for
|
|
126
|
+
editing; `/clear` resets the conversation and wipes the screen; Tab completes `/commands` and
|
|
127
|
+
paths; Ctrl-C cancels the in-flight reply (at an idle
|
|
128
|
+
prompt it exits, as does Ctrl-D). `--max-kv-size` bounds the KV cache for long sessions (a rotating
|
|
129
|
+
window, set at start).
|
|
130
|
+
|
|
131
|
+
The result is a standard MLX checkpoint (`config.json` + sharded safetensors, weights as K-quant wire
|
|
132
|
+
bytes). Load it in code with the bundled loader:
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
import mlx.core as mx
|
|
136
|
+
from mlx_kquant.loader import load
|
|
137
|
+
|
|
138
|
+
model, config = load("qwen3-q4") # KQuant* layers swapped in, on a stock mlx-lm model
|
|
139
|
+
mx.eval(model(mx.array([[1, 2, 3]])))
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
`mlx-kquant lora` (train an adapter) and `mlx-kquant fuse` (merge it back) round out the toolchain -
|
|
143
|
+
see [LoRA fine-tuning](#lora-fine-tuning). Run `mlx-kquant --help` for every subcommand.
|
|
144
|
+
|
|
145
|
+
### Using with mlx-lm
|
|
146
|
+
|
|
147
|
+
In-process, a kquant checkpoint also loads through **stock mlx-lm**: one idempotent call installs the
|
|
148
|
+
load shim, and from then on `mlx_lm.load` / `mlx_lm.generate` (and anything built on
|
|
149
|
+
`mlx_lm.utils.load_model`, e.g. an eval harness or your own serving loop) open a kquant checkpoint
|
|
150
|
+
transparently:
|
|
151
|
+
|
|
152
|
+
```python
|
|
153
|
+
from mlx_kquant.mlx_lm_patch import patch_mlx_lm_load
|
|
154
|
+
patch_mlx_lm_load() # process-wide, idempotent; call once before mlx_lm.load
|
|
155
|
+
|
|
156
|
+
from mlx_lm import load, generate
|
|
157
|
+
model, tokenizer = load("qwen3-q4")
|
|
158
|
+
print(generate(model, tokenizer, "Explain entropy.", max_tokens=64))
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
This is the load-only shim for inference / eval / serving; `patch_mlx_lm_lora()`
|
|
162
|
+
([below](#lora-fine-tuning)) adds the train/merge shims on top. The bundled `mlx_kquant.loader.load`
|
|
163
|
+
(above) is the standalone path when you don't need the rest of mlx-lm.
|
|
164
|
+
|
|
165
|
+
### LoRA fine-tuning
|
|
166
|
+
|
|
167
|
+
A kquant checkpoint is a frozen base you can adapt with LoRA. Attach an adapter for inference, train
|
|
168
|
+
one (the matmul/gather ops define a gradient-through-the-base `vjp`, so the adapter is differentiable
|
|
169
|
+
while the quantized weights stay frozen), and merge it back with `mlx-kquant fuse` (re-encode to
|
|
170
|
+
kquant, or `--dequantize` to float). One call wires it into stock mlx-lm:
|
|
171
|
+
|
|
172
|
+
```python
|
|
173
|
+
from mlx_kquant.mlx_lm_patch import patch_mlx_lm_lora
|
|
174
|
+
patch_mlx_lm_lora() # before building LoRA layers / loading adapters; idempotent
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
See **[docs/lora.md](https://github.com/asher/mlx-kquant/blob/main/docs/lora.md)** for attach / train / merge workflows. (DoRA on a kquant base is
|
|
178
|
+
not supported - use LoRA.)
|
|
179
|
+
|
|
180
|
+
### Using K-quant ops directly
|
|
181
|
+
|
|
182
|
+
Under the toolchain, the four `kq.*` ops operate on raw K-quant wire bytes. K-quant scales live
|
|
183
|
+
*inside* the packed bytes, so the `scales` argument is a vestigial placeholder (the API keeps it for
|
|
184
|
+
shape symmetry with MLX's affine quant); `kq.quantize` returns one for you.
|
|
185
|
+
|
|
186
|
+
```python
|
|
187
|
+
import mlx.core as mx
|
|
188
|
+
import mlx_kquant as kq
|
|
189
|
+
|
|
190
|
+
N, K = 256, 512 # q4_k: K must be a multiple of 256
|
|
191
|
+
w = mx.random.normal((N, K))
|
|
192
|
+
|
|
193
|
+
# encode float -> K-quant wire bytes (CPU or Metal); optional imatrix steers the encoder
|
|
194
|
+
wq, scales = kq.quantize(w, "q4_k") # wq: uint8 [N, bytes_per_row]
|
|
195
|
+
|
|
196
|
+
# dequantize back to float
|
|
197
|
+
deq = kq.dequantize(wq, scales, "q4_k") # float16 [N, K]
|
|
198
|
+
|
|
199
|
+
# quantized matmul: x @ dequant(w).T (transpose=True => w is [N, K])
|
|
200
|
+
x = mx.random.normal((8, K))
|
|
201
|
+
y = kq.quantized_matmul(x, wq, scales, "q4_k", transpose=True) # [8, N]
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
Mixture-of-experts (gathered) matmul:
|
|
205
|
+
|
|
206
|
+
```python
|
|
207
|
+
E, N, K = 128, 704, 2816
|
|
208
|
+
we = mx.random.normal((E, N, K))
|
|
209
|
+
weq, sc = kq.quantize(we, "q4_k") # per-expert wire bytes
|
|
210
|
+
x = mx.random.normal((1, 8, K)) # (tokens, top_k, K)
|
|
211
|
+
idx = mx.array([[0, 5, 9, 17, 33, 41, 88, 120]], dtype=mx.uint32)
|
|
212
|
+
out = kq.gather_qmm(x, weq, sc, "q4_k", rhs_indices=idx, transpose=True)
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
Ready-made modules that store the wire bytes and dispatch the matching `kq.*` op ship in
|
|
216
|
+
`mlx_kquant.nn`:
|
|
217
|
+
|
|
218
|
+
```python
|
|
219
|
+
from mlx_kquant.nn import KQuantLinear, KQuantEmbedding, KQuantSwitchLinear
|
|
220
|
+
|
|
221
|
+
x = mx.random.normal((8, 512)) # a (tokens, in_dims) activation batch
|
|
222
|
+
lin = KQuantLinear(in_dims=512, out_dims=256, bias=False, codec="q4_k")
|
|
223
|
+
lin.weight = wq # the uint8 wire bytes from kq.quantize, above
|
|
224
|
+
lin.scales = scales # [1] placeholder (scales live in the bytes)
|
|
225
|
+
y = lin(x) # kq.quantized_matmul under the hood
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
`KQuantEmbedding` (with a tied-`as_linear`), the `gather_qmm`-backed `KQuantSwitchLinear` for MoE
|
|
229
|
+
experts, and `KQuantMultiLinear` (absorbed-MLA) are exported alongside it. To swap the quantizable
|
|
230
|
+
leaves of a whole constructed mlx-lm model in one call, use
|
|
231
|
+
`mlx_kquant.nn.install_kquant_modules(model, {"<path>.weight": "q4_k", ...})`.
|
|
232
|
+
|
|
233
|
+
The `[tools]` layer is itself a worked reference for wiring `kq.*` into the MLX ecosystem: the loader,
|
|
234
|
+
encoder, layer modules, and the mlx-lm monkeypatch are all small and self-contained. See
|
|
235
|
+
**[docs/integration.md](https://github.com/asher/mlx-kquant/blob/main/docs/integration.md)** if you're building on the ops.
|
|
236
|
+
|
|
237
|
+
## Performance
|
|
238
|
+
|
|
239
|
+
The Metal kernels use a single-pass NAX matmul and matrix-contiguity handling for fused MoE expert
|
|
240
|
+
weights. Measured on an M5 Max (128 GB):
|
|
241
|
+
|
|
242
|
+
| Model | Codec | Decode (tok/s) | Prefill pp512 (tok/s) |
|
|
243
|
+
|-------|-------|---------------:|----------------------:|
|
|
244
|
+
| gemma-4-26B-A4B-it (MoE) | q4_k_xl | ~111 | ~2330 |
|
|
245
|
+
| Qwen3.5-9B (dense) | q5_k_xl | ~83 | ~2396 |
|
|
246
|
+
|
|
247
|
+
Transposed matmuls with a small row count (the speculative-decode verify regime) automatically route
|
|
248
|
+
through a weight-read-amortizing `verify_qmv` kernel; `KQ_DISABLE_VERIFY_QMV=1` forces the plain
|
|
249
|
+
per-row `qmv` path (see [Environment variables](#environment-variables)).
|
|
250
|
+
|
|
251
|
+
## How it works
|
|
252
|
+
|
|
253
|
+
- **Own ops.** Four `Primitive` subclasses (`KQuantDequantize`, `KQuantMatmul`, `KQuantGatherQMM`,
|
|
254
|
+
`KQuantQuantize`) and their op functions live entirely in the extension.
|
|
255
|
+
- **Precompiled metallib on stock headers.** The `kq_*` kernels are compiled against the stock
|
|
256
|
+
wheel's steel-GEMM headers into `mlx_kquant.metallib` at build time; host dispatch resolves them
|
|
257
|
+
through MLX's exported `Device::get_kernel`. No JIT, no steel host structs.
|
|
258
|
+
- **Codec registry** derives `group_size`/`bits` from the codec name, so callers pass only
|
|
259
|
+
`kquant_type`.
|
|
260
|
+
- **CPU and GPU execution.** Every op - the decode ops (`dequantize` / `quantized_matmul` /
|
|
261
|
+
`gather_qmm`) and `quantize` (encode) - runs on either stream, covering all 10 codecs, so the full
|
|
262
|
+
quantize/decode pipeline (and the op tests) runs in CI without a GPU. The per-block `dequantize` is
|
|
263
|
+
a scalar decoder that is bit-exact, per codec, against the `gguf.quants` reference quantizer. The CPU **matmul**
|
|
264
|
+
and **gather** are tuned for Apple Silicon: a shared worker pool over output rows, NEON int8
|
|
265
|
+
dot-product GEMV for the small-M (decode) shape, and an Accelerate (AMX/SME) GEMM for the large-M
|
|
266
|
+
(prefill) shape. The NEON path quantizes activations to int8 (lossy, as ggml does), so its matmul
|
|
267
|
+
matches within tolerance, not bit-exactly; `KQ_CPU_NEON=0` forces the scalar path for exact parity.
|
|
268
|
+
|
|
269
|
+
## Environment variables
|
|
270
|
+
|
|
271
|
+
All optional; the defaults are right for normal use.
|
|
272
|
+
|
|
273
|
+
- `KQ_CPU_THREADS` - worker-pool size for the CPU ops (default: hardware concurrency; `1` runs them
|
|
274
|
+
inline). `KQ_CPU_SPIN_US` sets a spin-before-park window for the pool (default `0` = park).
|
|
275
|
+
- `KQ_CPU_NEON=0` - disable the arm64 NEON int8 GEMV kernels and run the scalar decode-then-dot
|
|
276
|
+
matmul, which is bit-exact (the NEON path is tolerance-level; see [How it works](#how-it-works)).
|
|
277
|
+
- `KQ_DISABLE_VERIFY_QMV=1` - on Metal, force the plain per-row `qmv` path instead of the
|
|
278
|
+
weight-read-amortizing `verify_qmv` kernel. An A/B debugging lever, not a tuning knob.
|
|
279
|
+
|
|
280
|
+
## Quant recipes
|
|
281
|
+
|
|
282
|
+
A **preset** is a named mixed-precision recipe. It classifies each tensor by *role* (attention
|
|
283
|
+
q/k/v/o, embeddings, `lm_head`, MoE routed vs shared experts, the FFN down-projection) and maps each
|
|
284
|
+
role to a codec - spending bits where they move the output most and staying frugal on the bulk
|
|
285
|
+
feed-forward weights, to beat a uniform quant at the same byte budget.
|
|
286
|
+
|
|
287
|
+
```sh
|
|
288
|
+
mlx-kquant quantize --model <src> --preset q4_k_m --mlx-path out # a mixed recipe
|
|
289
|
+
mlx-kquant quantize --model <src> --kquant-type q6_k --mlx-path out # one codec, every tensor
|
|
290
|
+
```
|
|
291
|
+
|
|
292
|
+
Naming follows the ggml convention: the family (`q4_k`, `q5_k`, ...) sets the baseline codec and the
|
|
293
|
+
suffix sets how much extra precision the recipe spends:
|
|
294
|
+
|
|
295
|
+
- `_s` / `_m` / `_xl` - small / medium / extra-large: increasing bumps on the sensitive tensors (the value
|
|
296
|
+
and output projections, the down-projection on a subset of layers, the linear-attention
|
|
297
|
+
projections).
|
|
298
|
+
- `_moe` - expert-aware: routed experts at the baseline, shared experts a step above.
|
|
299
|
+
- bare `q6_k` / `q8` - uniform (every tensor at one codec), equivalent to passing `--kquant-type`.
|
|
300
|
+
|
|
301
|
+
`mlx-kquant quantize --list-presets` prints the full, authoritative mapping for every preset; it is
|
|
302
|
+
generated from the recipe tables, so it never drifts from what the encoder actually does. The recipes
|
|
303
|
+
are informed by our analysis of the mixed-precision quants that [Unsloth][unsloth] and
|
|
304
|
+
[bartowski][bartowski] publish on Hugging Face, together with llama.cpp's own per-layer
|
|
305
|
+
"use more bits" schedule.
|
|
306
|
+
|
|
307
|
+
[unsloth]: https://huggingface.co/unsloth
|
|
308
|
+
[bartowski]: https://huggingface.co/bartowski
|
|
309
|
+
|
|
310
|
+
## Codec reference
|
|
311
|
+
|
|
312
|
+
| Codec | Block | Bits | Bytes/block | Notes |
|
|
313
|
+
|-------|------:|-----:|------------:|-------|
|
|
314
|
+
| q2_k | 256 | 2 | 84 | K-quant superblock |
|
|
315
|
+
| q3_k | 256 | 3 | 110 | K-quant superblock |
|
|
316
|
+
| q4_k | 256 | 4 | 144 | K-quant superblock |
|
|
317
|
+
| q5_k | 256 | 5 | 176 | K-quant superblock |
|
|
318
|
+
| q6_k | 256 | 6 | 210 | K-quant superblock |
|
|
319
|
+
| q4_0 | 32 | 4 | 18 | block scale |
|
|
320
|
+
| q4_1 | 32 | 4 | 20 | block scale + min |
|
|
321
|
+
| q5_0 | 32 | 5 | 22 | block scale |
|
|
322
|
+
| q5_1 | 32 | 5 | 24 | block scale + min |
|
|
323
|
+
| q8_0 | 32 | 8 | 34 | block scale |
|
|
324
|
+
|
|
325
|
+
## Version pinning
|
|
326
|
+
|
|
327
|
+
Pinned to `mlx==0.31.2`. The kernels include MLX's steel headers and the extension links `libmlx`,
|
|
328
|
+
binding it to that release's ABI and header API. To move to a newer MLX: update the bundled headers
|
|
329
|
+
under `metal/mlx/backend/metal/kernels/` for that wheel, rebuild, and re-run the test suite.
|
|
330
|
+
|
|
331
|
+
## Tests
|
|
332
|
+
|
|
333
|
+
```sh
|
|
334
|
+
python -m pytest tests/
|
|
335
|
+
```
|
|
336
|
+
|
|
337
|
+
## Requirements
|
|
338
|
+
|
|
339
|
+
- **macOS 26.2 (Tahoe) or later on Apple Silicon** (M-series) with a working Metal toolchain
|
|
340
|
+
(`xcrun metal`) for the GPU build-from-source install.
|
|
341
|
+
- **Linux** (x86_64 or aarch64) is supported CPU-only: build against `mlx[cpu]==0.31.2`, no Metal
|
|
342
|
+
toolchain required. See [Install](#install) and [Limitations](#limitations).
|
|
343
|
+
- **Python >= 3.10** (the pinned `mlx==0.31.2` ships no cp39 wheel).
|
|
344
|
+
- **`mlx==0.31.2`** exactly - the kernels include MLX's steel headers and the extension links
|
|
345
|
+
`libmlx`, so the ABI is version-locked (see [Version pinning](#version-pinning)).
|
|
346
|
+
|
|
347
|
+
## Limitations
|
|
348
|
+
|
|
349
|
+
- **GPU path is Apple-Silicon Metal only.** No ROCm or CUDA support. Every op also has a CPU path
|
|
350
|
+
(`stream=mx.cpu`) covering all 10 codecs, so the extension still builds and runs without Metal (see
|
|
351
|
+
[How it works](#how-it-works) and [Install](#install)).
|
|
352
|
+
- **Linux model forward passes need `MLX_DISABLE_COMPILE=1`.** Stock MLX's CPU compile JIT generates C++
|
|
353
|
+
that redeclares GCC's built-in `_Float32`/`_Float64`/`_Float128` types, which `g++` rejects, so any
|
|
354
|
+
model forward through MLX's compile path fails on Linux+GCC. Disabling the JIT runs those graphs
|
|
355
|
+
eagerly with identical numerics. This is an upstream MLX-on-Linux limitation independent of
|
|
356
|
+
mlx-kquant - the `kq.*` ops have their own `eval_cpu` and never touch the JIT.
|
|
357
|
+
- **LoRA, not DoRA.** LoRA adapters train, attach, and fuse on a kquant base (see
|
|
358
|
+
[docs/lora.md](https://github.com/asher/mlx-kquant/blob/main/docs/lora.md)); DoRA is not yet supported. `fuse` re-encodes to kquant or, with
|
|
359
|
+
`--dequantize`, to float; both modes run on CPU or Metal.
|
|
360
|
+
|
|
361
|
+
## License
|
|
362
|
+
|
|
363
|
+
MIT - see [LICENSE](https://github.com/asher/mlx-kquant/blob/main/LICENSE).
|
|
364
|
+
|
|
365
|
+
### Acknowledgements
|
|
366
|
+
|
|
367
|
+
mlx-kquant builds on three MIT-licensed projects; their license texts ship in the wheel under
|
|
368
|
+
[`mlx_kquant/licenses/`](https://github.com/asher/mlx-kquant/tree/main/mlx_kquant/licenses):
|
|
369
|
+
|
|
370
|
+
- **[llama.cpp / ggml](https://github.com/ggml-org/llama.cpp)** - the K-quant and block codec formats
|
|
371
|
+
and the quantization / dequantization algorithms that encode and decode them are derived from
|
|
372
|
+
ggml's reference implementation.
|
|
373
|
+
- **[gguf-tools](https://github.com/antirez/gguf-tools)** - used to implement a zero-copy GGUF loader
|
|
374
|
+
for downstream projects; it is statically linked into built wheels.
|
|
375
|
+
- **[MLX](https://github.com/ml-explore/mlx)** - the extension links `libmlx`, the kernels compile
|
|
376
|
+
against MLX's bundled headers, and parts of the Metal kernels are adapted from MLX's quantized and
|
|
377
|
+
steel-GEMM kernels.
|