mlx-kquant 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. mlx_kquant-0.1.0/CHANGELOG.md +23 -0
  2. mlx_kquant-0.1.0/CMakeLists.txt +209 -0
  3. mlx_kquant-0.1.0/LICENSE +21 -0
  4. mlx_kquant-0.1.0/MANIFEST.in +23 -0
  5. mlx_kquant-0.1.0/PKG-INFO +377 -0
  6. mlx_kquant-0.1.0/README.md +339 -0
  7. mlx_kquant-0.1.0/bindings.cpp +219 -0
  8. mlx_kquant-0.1.0/cmake/patch-gguflib.cmake +79 -0
  9. mlx_kquant-0.1.0/metal/kq_quantized.metal +334 -0
  10. mlx_kquant-0.1.0/metal/kq_quantized_encode.metal +86 -0
  11. mlx_kquant-0.1.0/metal/kq_quantized_nax.metal +106 -0
  12. mlx_kquant-0.1.0/metal/mlx/backend/metal/kernels/kq_quantized.h +6351 -0
  13. mlx_kquant-0.1.0/metal/mlx/backend/metal/kernels/kq_quantized_encode.h +1239 -0
  14. mlx_kquant-0.1.0/metal/mlx/backend/metal/kernels/kq_quantized_legacy.h +2004 -0
  15. mlx_kquant-0.1.0/metal/mlx/backend/metal/kernels/kq_quantized_nax.h +2568 -0
  16. mlx_kquant-0.1.0/metal/mlx/backend/metal/kernels/quantized_utils.h +113 -0
  17. mlx_kquant-0.1.0/mlx_kquant/__init__.py +46 -0
  18. mlx_kquant-0.1.0/mlx_kquant/__main__.py +5 -0
  19. mlx_kquant-0.1.0/mlx_kquant/_deps.py +23 -0
  20. mlx_kquant-0.1.0/mlx_kquant/_install.py +76 -0
  21. mlx_kquant-0.1.0/mlx_kquant/_version.py +1 -0
  22. mlx_kquant-0.1.0/mlx_kquant/cli/__init__.py +74 -0
  23. mlx_kquant-0.1.0/mlx_kquant/cli/calibrate.py +178 -0
  24. mlx_kquant-0.1.0/mlx_kquant/cli/chat.py +443 -0
  25. mlx_kquant-0.1.0/mlx_kquant/cli/fuse.py +218 -0
  26. mlx_kquant-0.1.0/mlx_kquant/cli/inspect.py +121 -0
  27. mlx_kquant-0.1.0/mlx_kquant/cli/lora.py +63 -0
  28. mlx_kquant-0.1.0/mlx_kquant/cli/quantize.py +126 -0
  29. mlx_kquant-0.1.0/mlx_kquant/cli/run.py +154 -0
  30. mlx_kquant-0.1.0/mlx_kquant/cli/verify.py +59 -0
  31. mlx_kquant-0.1.0/mlx_kquant/codec_geometry.py +72 -0
  32. mlx_kquant-0.1.0/mlx_kquant/convert.py +241 -0
  33. mlx_kquant-0.1.0/mlx_kquant/imatrix.py +138 -0
  34. mlx_kquant-0.1.0/mlx_kquant/licenses/gguf-tools-LICENSE +26 -0
  35. mlx_kquant-0.1.0/mlx_kquant/licenses/llama.cpp-LICENSE +29 -0
  36. mlx_kquant-0.1.0/mlx_kquant/licenses/mlx-LICENSE +29 -0
  37. mlx_kquant-0.1.0/mlx_kquant/loader.py +211 -0
  38. mlx_kquant-0.1.0/mlx_kquant/mlx_lm_patch.py +358 -0
  39. mlx_kquant-0.1.0/mlx_kquant/nn.py +238 -0
  40. mlx_kquant-0.1.0/mlx_kquant/py.typed +0 -0
  41. mlx_kquant-0.1.0/mlx_kquant/recipes.py +418 -0
  42. mlx_kquant-0.1.0/mlx_kquant.egg-info/PKG-INFO +377 -0
  43. mlx_kquant-0.1.0/mlx_kquant.egg-info/SOURCES.txt +96 -0
  44. mlx_kquant-0.1.0/mlx_kquant.egg-info/dependency_links.txt +1 -0
  45. mlx_kquant-0.1.0/mlx_kquant.egg-info/entry_points.txt +3 -0
  46. mlx_kquant-0.1.0/mlx_kquant.egg-info/requires.txt +13 -0
  47. mlx_kquant-0.1.0/mlx_kquant.egg-info/top_level.txt +1 -0
  48. mlx_kquant-0.1.0/pyproject.toml +120 -0
  49. mlx_kquant-0.1.0/scripts/check-codecs.py +124 -0
  50. mlx_kquant-0.1.0/setup.cfg +4 -0
  51. mlx_kquant-0.1.0/setup.py +28 -0
  52. mlx_kquant-0.1.0/src/kquant.h +271 -0
  53. mlx_kquant-0.1.0/src/kquant_codec.cpp +41 -0
  54. mlx_kquant-0.1.0/src/kquant_codec.h +25 -0
  55. mlx_kquant-0.1.0/src/kquant_cpu_decode.cpp +1205 -0
  56. mlx_kquant-0.1.0/src/kquant_cpu_decode.h +98 -0
  57. mlx_kquant-0.1.0/src/kquant_cpu_encode.cpp +990 -0
  58. mlx_kquant-0.1.0/src/kquant_cpu_encode.h +33 -0
  59. mlx_kquant-0.1.0/src/kquant_cpu_neon.cpp +883 -0
  60. mlx_kquant-0.1.0/src/kquant_cpu_neon.h +52 -0
  61. mlx_kquant-0.1.0/src/kquant_dequantize.cpp +143 -0
  62. mlx_kquant-0.1.0/src/kquant_encode.cpp +181 -0
  63. mlx_kquant-0.1.0/src/kquant_gather.cpp +821 -0
  64. mlx_kquant-0.1.0/src/kquant_gguf.cpp +567 -0
  65. mlx_kquant-0.1.0/src/kquant_gguf.h +65 -0
  66. mlx_kquant-0.1.0/src/kquant_internal.h +23 -0
  67. mlx_kquant-0.1.0/src/kquant_matmul.cpp +580 -0
  68. mlx_kquant-0.1.0/src/kquant_metal_internal.h +271 -0
  69. mlx_kquant-0.1.0/src/kquant_ops.cpp +422 -0
  70. mlx_kquant-0.1.0/tests/conftest.py +21 -0
  71. mlx_kquant-0.1.0/tests/fixtures/SHA256SUMS +10 -0
  72. mlx_kquant-0.1.0/tests/fixtures/q2_k.npz +0 -0
  73. mlx_kquant-0.1.0/tests/fixtures/q2_k_moe.npz +0 -0
  74. mlx_kquant-0.1.0/tests/fixtures/q3_k.npz +0 -0
  75. mlx_kquant-0.1.0/tests/fixtures/q3_k_moe.npz +0 -0
  76. mlx_kquant-0.1.0/tests/fixtures/q4_k.npz +0 -0
  77. mlx_kquant-0.1.0/tests/fixtures/q4_k_moe.npz +0 -0
  78. mlx_kquant-0.1.0/tests/fixtures/q5_k.npz +0 -0
  79. mlx_kquant-0.1.0/tests/fixtures/q5_k_moe.npz +0 -0
  80. mlx_kquant-0.1.0/tests/fixtures/q6_k.npz +0 -0
  81. mlx_kquant-0.1.0/tests/fixtures/q6_k_moe.npz +0 -0
  82. mlx_kquant-0.1.0/tests/gen_fixtures.py +92 -0
  83. mlx_kquant-0.1.0/tests/test_cli.py +523 -0
  84. mlx_kquant-0.1.0/tests/test_codecs.py +177 -0
  85. mlx_kquant-0.1.0/tests/test_cpu_decode.py +222 -0
  86. mlx_kquant-0.1.0/tests/test_cpu_neon.py +175 -0
  87. mlx_kquant-0.1.0/tests/test_dequant.py +155 -0
  88. mlx_kquant-0.1.0/tests/test_encode.py +252 -0
  89. mlx_kquant-0.1.0/tests/test_fuse_cli.py +351 -0
  90. mlx_kquant-0.1.0/tests/test_gather.py +285 -0
  91. mlx_kquant-0.1.0/tests/test_gguf.py +94 -0
  92. mlx_kquant-0.1.0/tests/test_loader.py +275 -0
  93. mlx_kquant-0.1.0/tests/test_lora_patch.py +225 -0
  94. mlx_kquant-0.1.0/tests/test_matmul.py +162 -0
  95. mlx_kquant-0.1.0/tests/test_matmul_cold.py +136 -0
  96. mlx_kquant-0.1.0/tests/test_nn.py +207 -0
  97. mlx_kquant-0.1.0/tests/test_recipes.py +242 -0
  98. mlx_kquant-0.1.0/tests/test_vjp.py +195 -0
@@ -0,0 +1,23 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project are documented here. The format follows
4
+ [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and the project aims to
5
+ adhere to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
6
+
7
+ ## [0.1.0]
8
+
9
+ First public release. A C++/Metal extension for a stock `mlx==0.31.2` wheel that
10
+ adds the K-quant superblock and per-block integer codecs as native MLX ops
11
+ (`kq.dequantize` / `quantized_matmul` / `gather_qmm` / `quantize`), with Metal and
12
+ portable CPU paths for all ten codecs. On top of the ops, the `mlx-kquant` CLI
13
+ quantizes an HF / mlx-lm model into a K-quant MLX safetensors checkpoint and runs,
14
+ chats with, LoRA-fine-tunes, and fuses it, with importance-matrix calibration and
15
+ per-tensor recipe inspection. A loader runs those checkpoints on stock mlx-lm.
16
+
17
+ ### Notes
18
+ - `requires-python >= 3.10` (mlx 0.31.2 ships no cp39 wheel).
19
+ - The GPU path is macOS 26 (Tahoe) or later on Apple Silicon (Metal); the NAX
20
+ matmul kernel needs the Metal 4 SDK (`MetalPerformancePrimitives`). Linux is
21
+ supported CPU-only - build against `mlx[cpu]==0.31.2`; model forwards there also
22
+ need `MLX_DISABLE_COMPILE=1` (an upstream MLX CPU-JIT limitation under GCC, not
23
+ mlx-kquant).
@@ -0,0 +1,209 @@
1
+ cmake_minimum_required(VERSION 3.27)
2
+
3
+ project(mlx_kquant LANGUAGES CXX C)
4
+
5
+ # ----------------------------- Setup -----------------------------
6
+ # C++20: MLX 0.31.2's public headers (device.h, stream.h) use defaulted
7
+ # comparison operators, a C++20 feature. Apple Clang accepts them as an
8
+ # extension under -std=c++17, but GCC (the Linux toolchain) rejects them, so a
9
+ # C++17 standard breaks the Metal-free Linux build. 20 builds cleanly on both.
10
+ set(CMAKE_CXX_STANDARD 20)
11
+ set(CMAKE_CXX_STANDARD_REQUIRED ON)
12
+ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
13
+
14
+ option(BUILD_SHARED_LIBS "Build extensions as a shared library" ON)
15
+
16
+ # ----------------------------- Dependencies -----------------------------
17
+ # Pick the right interpreter. mlx.extension.CMakeBuild invokes cmake WITHOUT
18
+ # forwarding the building interpreter, so CMake's FindPython can otherwise latch
19
+ # onto an unrelated system/uv Python that lacks mlx & nanobind (then
20
+ # `python -m mlx --cmake-dir` returns empty and find_package(MLX) fails).
21
+ # Precedence: an explicit -DPython_EXECUTABLE (e.g. via CMAKE_ARGS) wins; else an
22
+ # active virtualenv ($VIRTUAL_ENV); else CMake's normal search.
23
+ if(NOT Python_EXECUTABLE AND DEFINED ENV{VIRTUAL_ENV})
24
+ set(Python_EXECUTABLE "$ENV{VIRTUAL_ENV}/bin/python")
25
+ endif()
26
+ set(Python_FIND_VIRTUALENV FIRST)
27
+
28
+ find_package(
29
+ Python 3.10 # floor matches requires-python >= 3.10 (mlx 0.31.2 ships no cp39 wheel)
30
+ COMPONENTS Interpreter Development.Module
31
+ REQUIRED)
32
+
33
+ execute_process(
34
+ COMMAND "${Python_EXECUTABLE}" -m nanobind --cmake_dir
35
+ OUTPUT_STRIP_TRAILING_WHITESPACE
36
+ OUTPUT_VARIABLE nanobind_ROOT)
37
+ find_package(nanobind CONFIG REQUIRED)
38
+
39
+ execute_process(
40
+ COMMAND "${Python_EXECUTABLE}" -m mlx --cmake-dir
41
+ OUTPUT_STRIP_TRAILING_WHITESPACE
42
+ OUTPUT_VARIABLE MLX_ROOT)
43
+ find_package(MLX CONFIG REQUIRED)
44
+
45
+ # NOTE: do NOT splice ${MLX_CXX_FLAGS} into CMAKE_CXX_FLAGS - it is a CMake list
46
+ # (-DACCELERATE_NEW_LAPACK;-D_METAL_), and embedding the ';' into the flags
47
+ # *string* corrupts every compile command (`/bin/sh: -D_METAL_: command not
48
+ # found`). MLXConfig already attaches these as INTERFACE_COMPILE_OPTIONS on the
49
+ # imported `mlx` target, so -D_METAL_ propagates correctly to anything that
50
+ # links `mlx` (below).
51
+
52
+ # ----------------------------- gguflib (GGUF parser) -----------------------------
53
+ # antirez/gguf-tools single-file GGUF parser, for the C++ kq.load_gguf loader.
54
+ # PIN: a ref new enough to define GGUF_TYPE_Q5_0/Q5_1/Q2_K..Q6_K + correct
55
+ # K-quant block sizes (older refs only know Q4_0/Q4_1/Q8_0).
56
+ include(FetchContent)
57
+ FetchContent_Declare(
58
+ gguflib
59
+ GIT_REPOSITORY https://github.com/antirez/gguf-tools/
60
+ GIT_TAG fdfafbed766db0a1e9019b07994cd88f133d1aab
61
+ # The pinned ref only knows GGML types up to BF16 (30); a tensor of any newer
62
+ # codec trips gguf_get_tensor's `>= GGUF_TYPE_COUNT` guard and truncates the
63
+ # tensor list. Teach the parser the post-BF16 type block-geometry (incl. the
64
+ # MXFP4/NVFP4 float codecs MLX has native kernels for). Idempotent; see script.
65
+ PATCH_COMMAND ${CMAKE_COMMAND} -DGGUFLIB_DIR=<SOURCE_DIR>
66
+ -P ${CMAKE_CURRENT_LIST_DIR}/cmake/patch-gguflib.cmake)
67
+ FetchContent_MakeAvailable(gguflib)
68
+ add_library(gguflib STATIC ${gguflib_SOURCE_DIR}/fp16.c
69
+ ${gguflib_SOURCE_DIR}/gguflib.c)
70
+ # Hidden visibility so the gguf_* symbols stay LOCAL to whichever image links
71
+ # them whole-archive (below) and never interpose / collide at runtime with the
72
+ # identically-named gguf_* symbols that libmlx also exports.
73
+ set_target_properties(gguflib PROPERTIES POSITION_INDEPENDENT_CODE ON
74
+ C_VISIBILITY_PRESET hidden)
75
+ # gguflib uses assert() to reject malformed tensor headers (e.g. ndim > 8).
76
+ # -DNDEBUG (release builds) would compile those out, leaving out-of-bounds
77
+ # reads/writes unguarded when loading untrusted GGUF files. Force NDEBUG off.
78
+ target_compile_options(gguflib PRIVATE -UNDEBUG)
79
+
80
+ # Whole-archive flag for the patched gguflib (used on both linking images below):
81
+ # pull all its (hidden) gguf_* objects into the image so they win over libmlx's
82
+ # exported copy. Apple ld spells this -force_load; GNU ld / lld use the
83
+ # --whole-archive / --no-whole-archive bracket.
84
+ if(APPLE)
85
+ set(KQ_WHOLE_ARCHIVE_GGUFLIB "-Wl,-force_load,$<TARGET_FILE:gguflib>")
86
+ else()
87
+ set(KQ_WHOLE_ARCHIVE_GGUFLIB
88
+ "-Wl,--whole-archive,$<TARGET_FILE:gguflib>,--no-whole-archive")
89
+ endif()
90
+
91
+ # ----------------------------- C++ library -----------------------------
92
+ add_library(mlx_kquant_ext)
93
+
94
+ target_sources(
95
+ mlx_kquant_ext
96
+ PUBLIC ${CMAKE_CURRENT_LIST_DIR}/src/kquant_codec.cpp
97
+ ${CMAKE_CURRENT_LIST_DIR}/src/kquant_cpu_decode.cpp
98
+ ${CMAKE_CURRENT_LIST_DIR}/src/kquant_cpu_encode.cpp
99
+ ${CMAKE_CURRENT_LIST_DIR}/src/kquant_ops.cpp
100
+ ${CMAKE_CURRENT_LIST_DIR}/src/kquant_dequantize.cpp
101
+ ${CMAKE_CURRENT_LIST_DIR}/src/kquant_matmul.cpp
102
+ ${CMAKE_CURRENT_LIST_DIR}/src/kquant_gather.cpp
103
+ ${CMAKE_CURRENT_LIST_DIR}/src/kquant_encode.cpp
104
+ ${CMAKE_CURRENT_LIST_DIR}/src/kquant_gguf.cpp)
105
+
106
+ target_include_directories(mlx_kquant_ext PUBLIC ${CMAKE_CURRENT_LIST_DIR}/src)
107
+ # gguflib include/link are PUBLIC: target_sources above lists the .cpp as PUBLIC,
108
+ # so it is also compiled into the `_ext` nanobind module (consumer), which then
109
+ # needs gguflib.h on its include path and the gguf_* symbols at link time.
110
+ target_include_directories(mlx_kquant_ext PUBLIC ${gguflib_SOURCE_DIR})
111
+ target_link_libraries(mlx_kquant_ext PUBLIC mlx)
112
+ # Apple: route the CPU matmul's large-M GEMM through Accelerate (engages the
113
+ # AMX/SME matrix units). PUBLIC so the nanobind module, which re-compiles the
114
+ # PUBLIC sources above, sees the same define and link. Other platforms keep
115
+ # the portable threaded-scalar fallback.
116
+ if(APPLE)
117
+ target_compile_definitions(mlx_kquant_ext PUBLIC KQ_USE_ACCELERATE)
118
+ target_link_libraries(mlx_kquant_ext PUBLIC "-framework Accelerate")
119
+ endif()
120
+ # arm64: NEON-dotprod int8 GEMV kernels for the fused small-M CPU matmul.
121
+ # The TU is compiled only on arm64/aarch64 hosts; execution is further gated
122
+ # at runtime (KQ_CPU_NEON=0 kill switch; Linux aarch64 hwcap dotprod check).
123
+ # Every other target keeps the portable scalar path (the header stubs
124
+ # kq_neon_kernel to nullptr when KQ_CPU_NEON_TU is undefined).
125
+ if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm64|aarch64)$")
126
+ target_sources(mlx_kquant_ext
127
+ PUBLIC ${CMAKE_CURRENT_LIST_DIR}/src/kquant_cpu_neon.cpp)
128
+ target_compile_definitions(mlx_kquant_ext PUBLIC KQ_CPU_NEON_TU)
129
+ if(NOT APPLE)
130
+ # Linux toolchains may default to an armv8.0 baseline without dotprod;
131
+ # enable it for this one TU - dotprod instructions only execute after the
132
+ # runtime hwcap check passes.
133
+ set_source_files_properties(
134
+ ${CMAKE_CURRENT_LIST_DIR}/src/kquant_cpu_neon.cpp
135
+ PROPERTIES COMPILE_OPTIONS "-march=armv8.2-a+dotprod")
136
+ endif()
137
+ endif()
138
+ # libmlx bundles its OWN (older) gguflib and exports its gguf_* symbols. A plain
139
+ # link can bind our gguf_get_tensor()/gguf_open() calls to MLX's copy, whose
140
+ # GGUF_TYPE_COUNT predates the MXFP4/NVFP4 float codecs - so a tensor of type
141
+ # >= 31 trips its `>= GGUF_TYPE_COUNT` guard and the tensor list silently
142
+ # truncates at the first such tensor. Linking our patched gguflib whole-archive
143
+ # pulls its objects into this image as hidden-visibility definitions, so every
144
+ # gguf_* call resolves in-image to the patched parser instead of to libmlx.
145
+ target_link_libraries(mlx_kquant_ext PUBLIC gguflib)
146
+ target_link_options(mlx_kquant_ext PRIVATE ${KQ_WHOLE_ARCHIVE_GGUFLIB})
147
+
148
+ # ----------------------------- Metal library -----------------------------
149
+ # The kq_* kernels are compiled into a single mlx_kquant.metallib.
150
+ # The repo's metal/ dir is searched BEFORE the stock-wheel include tree so the
151
+ # repo's kq_*.h + quantized_utils.h (which adds load_vector) win;
152
+ # steel/gemm/*, utils.h, etc. fall through to the stock wheel headers.
153
+ if(MLX_BUILD_METAL)
154
+ mlx_build_metallib(
155
+ TARGET
156
+ mlx_kquant_metallib
157
+ TITLE
158
+ mlx_kquant
159
+ SOURCES
160
+ ${CMAKE_CURRENT_LIST_DIR}/metal/kq_quantized.metal
161
+ ${CMAKE_CURRENT_LIST_DIR}/metal/kq_quantized_nax.metal
162
+ ${CMAKE_CURRENT_LIST_DIR}/metal/kq_quantized_encode.metal
163
+ INCLUDE_DIRS
164
+ ${CMAKE_CURRENT_LIST_DIR}/metal
165
+ ${MLX_INCLUDE_DIRS}
166
+ OUTPUT_DIRECTORY
167
+ ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
168
+
169
+ add_dependencies(mlx_kquant_ext mlx_kquant_metallib)
170
+ endif()
171
+
172
+ # ----------------------------- Python bindings -----------------------------
173
+ nanobind_add_module(
174
+ _ext
175
+ NB_STATIC
176
+ STABLE_ABI
177
+ LTO
178
+ NOMINSIZE
179
+ NB_DOMAIN
180
+ mlx
181
+ ${CMAKE_CURRENT_LIST_DIR}/bindings.cpp)
182
+ target_link_libraries(_ext PRIVATE mlx_kquant_ext)
183
+ # kquant_gguf.cpp is a PUBLIC source (above), so it is also compiled into this
184
+ # module and the LTO link keeps its own gguf_* references - force_load the
185
+ # patched gguflib here too so they don't fall back to libmlx's exported copy
186
+ # (see the mlx_kquant_ext force_load note for the full rationale).
187
+ target_link_libraries(_ext PRIVATE gguflib)
188
+ target_link_options(_ext PRIVATE ${KQ_WHOLE_ARCHIVE_GGUFLIB})
189
+
190
+ if(BUILD_SHARED_LIBS)
191
+ if(APPLE)
192
+ # @loader_path finds the co-located libmlx_kquant_ext.dylib (same package
193
+ # dir); the second rpath resolves @rpath/libmlx.dylib against the *user's*
194
+ # installed mlx wheel at runtime. _ext.so lives in site-packages/mlx_kquant/,
195
+ # libmlx in site-packages/mlx/lib/, so ../mlx/lib reaches it. (The absolute
196
+ # build-tree rpath MLXConfig adds works only on the build machine; this is
197
+ # what makes a redistributed wheel find the ABI-pinned mlx==0.31.2. delocate
198
+ # must therefore *exclude* libmlx from vendoring - see [tool.cibuildwheel].)
199
+ target_link_options(_ext PRIVATE -Wl,-rpath,@loader_path)
200
+ target_link_options(_ext PRIVATE -Wl,-rpath,@loader_path/../mlx/lib)
201
+ else()
202
+ # ELF equivalent of @loader_path: $ORIGIN is the dir of _ext.so. The
203
+ # co-located libmlx_kquant_ext.so sits beside it; ../mlx/lib reaches the
204
+ # user's installed mlx wheel's libmlx.so (auditwheel must *exclude* libmlx.so
205
+ # from vendoring, mirroring the macOS delocate exclude).
206
+ target_link_options(_ext PRIVATE "-Wl,-rpath,\$ORIGIN")
207
+ target_link_options(_ext PRIVATE "-Wl,-rpath,\$ORIGIN/../mlx/lib")
208
+ endif()
209
+ endif()
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright © 2026 Asher Feldman
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,23 @@
1
+ # Build inputs the sdist needs to compile the extension from source.
2
+ include CMakeLists.txt
3
+ include setup.py
4
+ include bindings.cpp
5
+ recursive-include src *.cpp *.h
6
+ recursive-include metal *.metal *.h
7
+ recursive-include cmake *.cmake
8
+
9
+ # Project metadata and tooling.
10
+ include LICENSE
11
+ include README.md
12
+ include CHANGELOG.md
13
+ recursive-include scripts *.py
14
+
15
+ # Tests and fixtures, so an sdist build can run its own checks.
16
+ recursive-include tests *.py *.npz
17
+ include tests/fixtures/SHA256SUMS
18
+
19
+ # Never ship machine-specific build artifacts in the sdist (the wheel carries
20
+ # these via package-data; the sdist must not, or stale binaries would leak into builds made from it).
21
+ global-exclude *.so *.dylib *.metallib *.air *.pyc
22
+ prune tests/fixtures/__pycache__
23
+ global-exclude .DS_Store
@@ -0,0 +1,377 @@
1
+ Metadata-Version: 2.4
2
+ Name: mlx-kquant
3
+ Version: 0.1.0
4
+ Summary: GGUF K-quant dequantize / quantized-matmul / gather-qmm / quantize ops for MLX, via custom Metal kernels.
5
+ Author: Asher Feldman
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/asher/mlx-kquant
8
+ Project-URL: Repository, https://github.com/asher/mlx-kquant
9
+ Project-URL: Issues, https://github.com/asher/mlx-kquant/issues
10
+ Project-URL: Changelog, https://github.com/asher/mlx-kquant/blob/main/CHANGELOG.md
11
+ Keywords: mlx,quantization,gguf,k-quant,metal,apple-silicon,llm
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Operating System :: MacOS
20
+ Classifier: Operating System :: POSIX :: Linux
21
+ Classifier: Environment :: GPU
22
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
+ Requires-Python: >=3.10
24
+ Description-Content-Type: text/markdown
25
+ License-File: LICENSE
26
+ Requires-Dist: mlx==0.31.2
27
+ Provides-Extra: tools
28
+ Requires-Dist: mlx-lm>=0.27; extra == "tools"
29
+ Requires-Dist: transformers; extra == "tools"
30
+ Requires-Dist: gguf; extra == "tools"
31
+ Requires-Dist: numpy; extra == "tools"
32
+ Provides-Extra: dev
33
+ Requires-Dist: pytest; extra == "dev"
34
+ Requires-Dist: ruff; extra == "dev"
35
+ Requires-Dist: numpy; extra == "dev"
36
+ Requires-Dist: gguf; extra == "dev"
37
+ Dynamic: license-file
38
+
39
+ # mlx-kquant
40
+
41
+ [![ci](https://github.com/asher/mlx-kquant/actions/workflows/ci.yml/badge.svg)](https://github.com/asher/mlx-kquant/actions/workflows/ci.yml)
42
+
43
+ Bring **K-quant precision to [MLX](https://github.com/ml-explore/mlx)** on Apple Silicon: a C++/Metal
44
+ **extension** for a stock `mlx` wheel that adds the K-quant superblock and per-block integer codecs
45
+ as native MLX ops, plus a toolchain that quantizes a model into a **K-quant MLX safetensors
46
+ checkpoint** and runs, LoRA-trains, and fuses it.
47
+
48
+ Two layers:
49
+
50
+ - **Ops** (C++/Metal) - a `kq.*` namespace (`dequantize`, `quantized_matmul`, `gather_qmm`,
51
+ `quantize`) backed by Metal kernels compiled to a `.metallib` at build time (no runtime JIT). All
52
+ ten codecs: `q2_k, q3_k, q4_k, q5_k, q6_k` and `q4_0, q4_1, q5_0, q5_1, q8_0`.
53
+ - **Tooling** (Python) - `mlx-kquant quantize / run / chat / lora / fuse` (plus `verify`, `inspect`,
54
+ `calibrate-imatrix`) and a `loader` that create and run K-quant checkpoints in **MLX-native
55
+ safetensors** format.
56
+
57
+ ## Why
58
+
59
+ K-quant is the precision recipe behind the strongest small-footprint community quants; this makes it
60
+ first-class on MLX. Quantize an HF / `mlx-lm` model to a uniform- or mixed-precision K-quant
61
+ checkpoint, then load, generate, LoRA-train, and fuse it - all on a stock `mlx` wheel, all in MLX
62
+ safetensors. The kernels are tuned for real models (matrix-contiguity handling for fused MoE experts,
63
+ single-pass NAX matmul); see [Performance](#performance).
64
+
65
+ ## Install
66
+
67
+ **macOS 26.2 (Tahoe) or later on Apple Silicon** (the GPU path), with the Metal toolchain
68
+ (`xcrun metal`) and the exact pinned MLX wheel.
69
+
70
+ ```sh
71
+ pip install "mlx==0.31.2" # pinned, ABI-matched stock wheel (pulls the Metal backend)
72
+ pip install -e . # builds _ext + mlx_kquant.metallib
73
+ pip install -e ".[tools]" # + mlx-lm, for the CLI subcommands (quantize / run / chat / ...)
74
+ ```
75
+
76
+ **Linux (CPU-only)** also builds, with no Metal toolchain. The ops run on their portable `eval_cpu`
77
+ paths and no metallib is produced. (The tuned matmul/gather are Apple-Silicon-targeted: arm64 Linux
78
+ picks up the NEON int8 GEMV when the CPU has dot-product, but the Accelerate GEMM is Apple-only, and
79
+ x86_64 stays on the scalar/threaded path.) The base `mlx` wheel ships no backend on Linux, so
80
+ install the CPU one explicitly:
81
+
82
+ ```sh
83
+ pip install "mlx[cpu]==0.31.2" # base frontend + libmlx CPU backend
84
+ pip install -e . --no-build-isolation
85
+ ```
86
+
87
+ CPU is for portability and CI, not throughput. Running a full model forward on Linux also needs
88
+ `MLX_DISABLE_COMPILE=1`; see [Limitations](#limitations).
89
+
90
+ Smoke-test the toolchain:
91
+
92
+ ```python
93
+ import mlx_kquant as kq
94
+ kq.codecs() # -> ['q2_k', 'q3_k', ..., 'q8_0']
95
+ kq.metallib_loads() # -> True (the bundled metallib opened on the Metal device)
96
+ ```
97
+
98
+ > The extension links `libmlx` and its kernels `#include` MLX's steel-GEMM headers, so it is bound to
99
+ > an exact MLX ABI **and** header API. The pin is intentionally `==`, never `>=`; moving to a newer
100
+ > `mlx` may require updating the bundled headers and recompiling. See [Version pinning](#version-pinning).
101
+
102
+ ## Quickstart
103
+
104
+ Quantize a checkpoint and run it, load it through mlx-lm, fine-tune it with LoRA, or build directly on
105
+ the `kq.*` ops.
106
+
107
+ ### Create and run a checkpoint
108
+
109
+ The CLI (the `[tools]` extra adds `mlx-lm`) quantizes an HF / `mlx-lm` model into a K-quant **MLX
110
+ safetensors** checkpoint and runs it:
111
+
112
+ ```sh
113
+ pip install "mlx-kquant[tools]"
114
+ mlx-kquant quantize --model Qwen/Qwen3-0.6B --preset q4_k_m --mlx-path qwen3-q4
115
+ mlx-kquant run --model qwen3-q4 --prompt "Explain entropy in one sentence."
116
+ mlx-kquant chat --model qwen3-q4 --temp 0.7 # interactive REPL (mlx-lm chat)
117
+ ```
118
+
119
+ `run` takes the usual sampling knobs (`--temp`, `--top-p`, `--top-k`, `--min-p`, `--seed`,
120
+ `--repetition-penalty`, `--presence-penalty`, `--frequency-penalty`) and chat-template controls
121
+ (`--system-prompt`, `--no-chat-template`, `--chat-template-config` for template kwargs such as
122
+ `'{"enable_thinking": false}'`). The `chat` REPL has a line-editable prompt with persistent
123
+ history (`--no-history` or in-chat `/history off|on|clear` to control it) and in-chat sampling
124
+ control (`/temp`, `/top-p`, `/top-k`, `/min-p`, `/max-tokens`, and the three penalties;
125
+ `/sampling` shows current values); `/load <file>` prefills the next prompt from a text file for
126
+ editing; `/clear` resets the conversation and wipes the screen; Tab completes `/commands` and
127
+ paths; Ctrl-C cancels the in-flight reply (at an idle
128
+ prompt it exits, as does Ctrl-D). `--max-kv-size` bounds the KV cache for long sessions (a rotating
129
+ window, set at start).
130
+
131
+ The result is a standard MLX checkpoint (`config.json` + sharded safetensors, weights as K-quant wire
132
+ bytes). Load it in code with the bundled loader:
133
+
134
+ ```python
135
+ import mlx.core as mx
136
+ from mlx_kquant.loader import load
137
+
138
+ model, config = load("qwen3-q4") # KQuant* layers swapped in, on a stock mlx-lm model
139
+ mx.eval(model(mx.array([[1, 2, 3]])))
140
+ ```
141
+
142
+ `mlx-kquant lora` (train an adapter) and `mlx-kquant fuse` (merge it back) round out the toolchain -
143
+ see [LoRA fine-tuning](#lora-fine-tuning). Run `mlx-kquant --help` for every subcommand.
144
+
145
+ ### Using with mlx-lm
146
+
147
+ In-process, a kquant checkpoint also loads through **stock mlx-lm**: one idempotent call installs the
148
+ load shim, and from then on `mlx_lm.load` / `mlx_lm.generate` (and anything built on
149
+ `mlx_lm.utils.load_model`, e.g. an eval harness or your own serving loop) open a kquant checkpoint
150
+ transparently:
151
+
152
+ ```python
153
+ from mlx_kquant.mlx_lm_patch import patch_mlx_lm_load
154
+ patch_mlx_lm_load() # process-wide, idempotent; call once before mlx_lm.load
155
+
156
+ from mlx_lm import load, generate
157
+ model, tokenizer = load("qwen3-q4")
158
+ print(generate(model, tokenizer, "Explain entropy.", max_tokens=64))
159
+ ```
160
+
161
+ This is the load-only shim for inference / eval / serving; `patch_mlx_lm_lora()`
162
+ ([below](#lora-fine-tuning)) adds the train/merge shims on top. The bundled `mlx_kquant.loader.load`
163
+ (above) is the standalone path when you don't need the rest of mlx-lm.
164
+
165
+ ### LoRA fine-tuning
166
+
167
+ A kquant checkpoint is a frozen base you can adapt with LoRA. Attach an adapter for inference, train
168
+ one (the matmul/gather ops define a gradient-through-the-base `vjp`, so the adapter is differentiable
169
+ while the quantized weights stay frozen), and merge it back with `mlx-kquant fuse` (re-encode to
170
+ kquant, or `--dequantize` to float). One call wires it into stock mlx-lm:
171
+
172
+ ```python
173
+ from mlx_kquant.mlx_lm_patch import patch_mlx_lm_lora
174
+ patch_mlx_lm_lora() # before building LoRA layers / loading adapters; idempotent
175
+ ```
176
+
177
+ See **[docs/lora.md](https://github.com/asher/mlx-kquant/blob/main/docs/lora.md)** for attach / train / merge workflows. (DoRA on a kquant base is
178
+ not supported - use LoRA.)
179
+
180
+ ### Using K-quant ops directly
181
+
182
+ Under the toolchain, the four `kq.*` ops operate on raw K-quant wire bytes. K-quant scales live
183
+ *inside* the packed bytes, so the `scales` argument is a vestigial placeholder (the API keeps it for
184
+ shape symmetry with MLX's affine quant); `kq.quantize` returns one for you.
185
+
186
+ ```python
187
+ import mlx.core as mx
188
+ import mlx_kquant as kq
189
+
190
+ N, K = 256, 512 # q4_k: K must be a multiple of 256
191
+ w = mx.random.normal((N, K))
192
+
193
+ # encode float -> K-quant wire bytes (CPU or Metal); optional imatrix steers the encoder
194
+ wq, scales = kq.quantize(w, "q4_k") # wq: uint8 [N, bytes_per_row]
195
+
196
+ # dequantize back to float
197
+ deq = kq.dequantize(wq, scales, "q4_k") # float16 [N, K]
198
+
199
+ # quantized matmul: x @ dequant(w).T (transpose=True => w is [N, K])
200
+ x = mx.random.normal((8, K))
201
+ y = kq.quantized_matmul(x, wq, scales, "q4_k", transpose=True) # [8, N]
202
+ ```
203
+
204
+ Mixture-of-experts (gathered) matmul:
205
+
206
+ ```python
207
+ E, N, K = 128, 704, 2816
208
+ we = mx.random.normal((E, N, K))
209
+ weq, sc = kq.quantize(we, "q4_k") # per-expert wire bytes
210
+ x = mx.random.normal((1, 8, K)) # (tokens, top_k, K)
211
+ idx = mx.array([[0, 5, 9, 17, 33, 41, 88, 120]], dtype=mx.uint32)
212
+ out = kq.gather_qmm(x, weq, sc, "q4_k", rhs_indices=idx, transpose=True)
213
+ ```
214
+
215
+ Ready-made modules that store the wire bytes and dispatch the matching `kq.*` op ship in
216
+ `mlx_kquant.nn`:
217
+
218
+ ```python
219
+ from mlx_kquant.nn import KQuantLinear, KQuantEmbedding, KQuantSwitchLinear
220
+
221
+ x = mx.random.normal((8, 512)) # a (tokens, in_dims) activation batch
222
+ lin = KQuantLinear(in_dims=512, out_dims=256, bias=False, codec="q4_k")
223
+ lin.weight = wq # the uint8 wire bytes from kq.quantize, above
224
+ lin.scales = scales # [1] placeholder (scales live in the bytes)
225
+ y = lin(x) # kq.quantized_matmul under the hood
226
+ ```
227
+
228
+ `KQuantEmbedding` (with a tied-`as_linear`), the `gather_qmm`-backed `KQuantSwitchLinear` for MoE
229
+ experts, and `KQuantMultiLinear` (absorbed-MLA) are exported alongside it. To swap the quantizable
230
+ leaves of a whole constructed mlx-lm model in one call, use
231
+ `mlx_kquant.nn.install_kquant_modules(model, {"<path>.weight": "q4_k", ...})`.
232
+
233
+ The `[tools]` layer is itself a worked reference for wiring `kq.*` into the MLX ecosystem: the loader,
234
+ encoder, layer modules, and the mlx-lm monkeypatch are all small and self-contained. See
235
+ **[docs/integration.md](https://github.com/asher/mlx-kquant/blob/main/docs/integration.md)** if you're building on the ops.
236
+
237
+ ## Performance
238
+
239
+ The Metal kernels use a single-pass NAX matmul and matrix-contiguity handling for fused MoE expert
240
+ weights. Measured on an M5 Max (128 GB):
241
+
242
+ | Model | Codec | Decode (tok/s) | Prefill pp512 (tok/s) |
243
+ |-------|-------|---------------:|----------------------:|
244
+ | gemma-4-26B-A4B-it (MoE) | q4_k_xl | ~111 | ~2330 |
245
+ | Qwen3.5-9B (dense) | q5_k_xl | ~83 | ~2396 |
246
+
247
+ Transposed matmuls with a small row count (the speculative-decode verify regime) automatically route
248
+ through a weight-read-amortizing `verify_qmv` kernel; `KQ_DISABLE_VERIFY_QMV=1` forces the plain
249
+ per-row `qmv` path (see [Environment variables](#environment-variables)).
250
+
251
+ ## How it works
252
+
253
+ - **Own ops.** Four `Primitive` subclasses (`KQuantDequantize`, `KQuantMatmul`, `KQuantGatherQMM`,
254
+ `KQuantQuantize`) and their op functions live entirely in the extension.
255
+ - **Precompiled metallib on stock headers.** The `kq_*` kernels are compiled against the stock
256
+ wheel's steel-GEMM headers into `mlx_kquant.metallib` at build time; host dispatch resolves them
257
+ through MLX's exported `Device::get_kernel`. No JIT, no steel host structs.
258
+ - **Codec registry** derives `group_size`/`bits` from the codec name, so callers pass only
259
+ `kquant_type`.
260
+ - **CPU and GPU execution.** Every op - the decode ops (`dequantize` / `quantized_matmul` /
261
+ `gather_qmm`) and `quantize` (encode) - runs on either stream, covering all 10 codecs, so the full
262
+ quantize/decode pipeline (and the op tests) runs in CI without a GPU. The per-block `dequantize` is
263
+ a scalar decoder that is bit-exact, per codec, against the `gguf.quants` reference quantizer. The CPU **matmul**
264
+ and **gather** are tuned for Apple Silicon: a shared worker pool over output rows, NEON int8
265
+ dot-product GEMV for the small-M (decode) shape, and an Accelerate (AMX/SME) GEMM for the large-M
266
+ (prefill) shape. The NEON path quantizes activations to int8 (lossy, as ggml does), so its matmul
267
+ matches within tolerance, not bit-exactly; `KQ_CPU_NEON=0` forces the scalar path for exact parity.
268
+
269
+ ## Environment variables
270
+
271
+ All optional; the defaults are right for normal use.
272
+
273
+ - `KQ_CPU_THREADS` - worker-pool size for the CPU ops (default: hardware concurrency; `1` runs them
274
+ inline). `KQ_CPU_SPIN_US` sets a spin-before-park window for the pool (default `0` = park).
275
+ - `KQ_CPU_NEON=0` - disable the arm64 NEON int8 GEMV kernels and run the scalar decode-then-dot
276
+ matmul, which is bit-exact (the NEON path is tolerance-level; see [How it works](#how-it-works)).
277
+ - `KQ_DISABLE_VERIFY_QMV=1` - on Metal, force the plain per-row `qmv` path instead of the
278
+ weight-read-amortizing `verify_qmv` kernel. An A/B debugging lever, not a tuning knob.
279
+
280
+ ## Quant recipes
281
+
282
+ A **preset** is a named mixed-precision recipe. It classifies each tensor by *role* (attention
283
+ q/k/v/o, embeddings, `lm_head`, MoE routed vs shared experts, the FFN down-projection) and maps each
284
+ role to a codec - spending bits where they move the output most and staying frugal on the bulk
285
+ feed-forward weights, to beat a uniform quant at the same byte budget.
286
+
287
+ ```sh
288
+ mlx-kquant quantize --model <src> --preset q4_k_m --mlx-path out # a mixed recipe
289
+ mlx-kquant quantize --model <src> --kquant-type q6_k --mlx-path out # one codec, every tensor
290
+ ```
291
+
292
+ Naming follows the ggml convention: the family (`q4_k`, `q5_k`, ...) sets the baseline codec and the
293
+ suffix sets how much extra precision the recipe spends:
294
+
295
+ - `_s` / `_m` / `_xl` - small / medium / extra-large: progressively larger precision bumps on the sensitive tensors (the value
296
+ and output projections, the down-projection on a subset of layers, the linear-attention
297
+ projections).
298
+ - `_moe` - expert-aware: routed experts at the baseline, shared experts a step above.
299
+ - bare `q6_k` / `q8` - uniform (every tensor at one codec), equivalent to passing `--kquant-type`.
300
+
301
+ `mlx-kquant quantize --list-presets` prints the full, authoritative mapping for every preset; it is
302
+ generated from the recipe tables, so it never drifts from what the encoder actually does. The recipes
303
+ are informed by our analysis of the mixed-precision quants that [Unsloth][unsloth] and
304
+ [bartowski][bartowski] publish on Hugging Face, together with llama.cpp's own per-layer
305
+ "use more bits" schedule.
306
+
307
+ [unsloth]: https://huggingface.co/unsloth
308
+ [bartowski]: https://huggingface.co/bartowski
309
+
310
+ ## Codec reference
311
+
312
+ | Codec | Block | Bits | Bytes/block | Notes |
313
+ |-------|------:|-----:|------------:|-------|
314
+ | q2_k | 256 | 2 | 84 | K-quant superblock |
315
+ | q3_k | 256 | 3 | 110 | K-quant superblock |
316
+ | q4_k | 256 | 4 | 144 | K-quant superblock |
317
+ | q5_k | 256 | 5 | 176 | K-quant superblock |
318
+ | q6_k | 256 | 6 | 210 | K-quant superblock |
319
+ | q4_0 | 32 | 4 | 18 | block scale |
320
+ | q4_1 | 32 | 4 | 20 | block scale + min |
321
+ | q5_0 | 32 | 5 | 22 | block scale |
322
+ | q5_1 | 32 | 5 | 24 | block scale + min |
323
+ | q8_0 | 32 | 8 | 34 | block scale |
324
+
325
+ ## Version pinning
326
+
327
+ Pinned to `mlx==0.31.2`. The kernels include MLX's steel headers and the extension links `libmlx`,
328
+ binding it to that release's ABI and header API. To move to a newer MLX: update the bundled headers
329
+ under `metal/mlx/backend/metal/kernels/` for that wheel, rebuild, and re-run the test suite.
330
+
331
+ ## Tests
332
+
333
+ ```sh
334
+ python -m pytest tests/
335
+ ```
336
+
337
+ ## Requirements
338
+
339
+ - **macOS 26.2 (Tahoe) or later on Apple Silicon** (M-series) with a working Metal toolchain
340
+ (`xcrun metal`) for the GPU build-from-source install.
341
+ - **Linux** (x86_64 or aarch64) is supported CPU-only: build against `mlx[cpu]==0.31.2`, no Metal
342
+ toolchain required. See [Install](#install) and [Limitations](#limitations).
343
+ - **Python >= 3.10** (the pinned `mlx==0.31.2` ships no cp39 wheel).
344
+ - **`mlx==0.31.2`** exactly - the kernels include MLX's steel headers and the extension links
345
+ `libmlx`, so the ABI is version-locked (see [Version pinning](#version-pinning)).
346
+
347
+ ## Limitations
348
+
349
+ - **GPU path is Apple-Silicon Metal only.** No ROCm or CUDA support. Every op also has a CPU path
350
+ (`stream=mx.cpu`) covering all 10 codecs, so the extension still builds and runs without Metal (see
351
+ [How it works](#how-it-works) and [Install](#install)).
352
+ - **Linux model forward passes need `MLX_DISABLE_COMPILE=1`.** Stock MLX's CPU compile JIT generates C++
353
+ that redeclares GCC's built-in `_Float32`/`_Float64`/`_Float128` types, which `g++` rejects, so any
354
+ model forward through MLX's compile path fails on Linux+GCC. Disabling the JIT runs those graphs
355
+ eagerly with identical numerics. This is an upstream MLX-on-Linux limitation independent of
356
+ mlx-kquant - the `kq.*` ops have their own `eval_cpu` and never touch the JIT.
357
+ - **LoRA, not DoRA.** LoRA adapters train, attach, and fuse on a kquant base (see
358
+ [docs/lora.md](https://github.com/asher/mlx-kquant/blob/main/docs/lora.md)); DoRA is not yet supported. `fuse` re-encodes to kquant or, with
359
+ `--dequantize`, to float; both modes run on CPU or Metal.
360
+
361
+ ## License
362
+
363
+ MIT - see [LICENSE](https://github.com/asher/mlx-kquant/blob/main/LICENSE).
364
+
365
+ ### Acknowledgements
366
+
367
+ mlx-kquant builds on three MIT-licensed projects; their license texts ship in the wheel under
368
+ [`mlx_kquant/licenses/`](https://github.com/asher/mlx-kquant/tree/main/mlx_kquant/licenses):
369
+
370
+ - **[llama.cpp / ggml](https://github.com/ggml-org/llama.cpp)** - the K-quant and block codec formats
371
+ and the quantization / dequantization algorithms that encode and decode them are derived from
372
+ ggml's reference implementation.
373
+ - **[gguf-tools](https://github.com/antirez/gguf-tools)** - used to implement a zero-copy GGUF loader
374
+ for downstream projects, and statically linked into the built wheels.
375
+ - **[MLX](https://github.com/ml-explore/mlx)** - the extension links `libmlx`, the kernels compile
376
+ against MLX's bundled headers, and parts of the Metal kernels are adapted from MLX's quantized and
377
+ steel-GEMM kernels.