nexusquant-kv 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. nexusquant_kv-0.4.0/LICENSE +98 -0
  2. nexusquant_kv-0.4.0/PKG-INFO +142 -0
  3. nexusquant_kv-0.4.0/README.md +110 -0
  4. nexusquant_kv-0.4.0/nexusquant/__init__.py +32 -0
  5. nexusquant_kv-0.4.0/nexusquant/core/__init__.py +40 -0
  6. nexusquant_kv-0.4.0/nexusquant/core/compression_accounting.py +372 -0
  7. nexusquant_kv-0.4.0/nexusquant/core/dp_allocator.py +161 -0
  8. nexusquant_kv-0.4.0/nexusquant/core/e8_lattice.py +124 -0
  9. nexusquant_kv-0.4.0/nexusquant/core/entropy_coder.py +615 -0
  10. nexusquant_kv-0.4.0/nexusquant/core/hadamard.py +87 -0
  11. nexusquant_kv-0.4.0/nexusquant/core/mp_threshold.py +79 -0
  12. nexusquant_kv-0.4.0/nexusquant/core/nsn.py +233 -0
  13. nexusquant_kv-0.4.0/nexusquant/core/optimal_shrinkage.py +258 -0
  14. nexusquant_kv-0.4.0/nexusquant/core/rope_utils.py +109 -0
  15. nexusquant_kv-0.4.0/nexusquant/core/tcc.py +470 -0
  16. nexusquant_kv-0.4.0/nexusquant/core/temporal_codec.py +157 -0
  17. nexusquant_kv-0.4.0/nexusquant/core/token_merger.py +123 -0
  18. nexusquant_kv-0.4.0/nexusquant/integrations/__init__.py +1 -0
  19. nexusquant_kv-0.4.0/nexusquant/integrations/huggingface.py +617 -0
  20. nexusquant_kv-0.4.0/nexusquant/pipeline.py +651 -0
  21. nexusquant_kv-0.4.0/nexusquant_kv.egg-info/PKG-INFO +142 -0
  22. nexusquant_kv-0.4.0/nexusquant_kv.egg-info/SOURCES.txt +26 -0
  23. nexusquant_kv-0.4.0/nexusquant_kv.egg-info/dependency_links.txt +1 -0
  24. nexusquant_kv-0.4.0/nexusquant_kv.egg-info/requires.txt +10 -0
  25. nexusquant_kv-0.4.0/nexusquant_kv.egg-info/top_level.txt +1 -0
  26. nexusquant_kv-0.4.0/pyproject.toml +45 -0
  27. nexusquant_kv-0.4.0/setup.cfg +4 -0
  28. nexusquant_kv-0.4.0/tests/test_core.py +174 -0
@@ -0,0 +1,98 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity.
18
+
19
+ "You" (or "Your") shall mean an individual or Legal Entity
20
+ exercising permissions granted by this License.
21
+
22
+ "Source" form shall mean the preferred form for making modifications,
23
+ including but not limited to software source code, documentation
24
+ source, and configuration files.
25
+
26
+ "Object" form shall mean any form resulting from mechanical
27
+ transformation or translation of a Source form, including but
28
+ not limited to compiled object code, generated documentation,
29
+ and conversions to other media types.
30
+
31
+ "Work" shall mean the work of authorship, whether in Source or
32
+ Object form, made available under the License.
33
+
34
+ "Contribution" shall mean any work of authorship submitted to the
35
+ Licensor for inclusion in the Work.
36
+
37
+ "Contributor" shall mean Licensor and any Legal Entity on behalf
38
+ of whom a Contribution has been received by the Licensor.
39
+
40
+ 2. Grant of Copyright License. Subject to the terms and conditions of
41
+ this License, each Contributor hereby grants to You a perpetual,
42
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
43
+ copyright license to reproduce, prepare Derivative Works of,
44
+ publicly display, publicly perform, sublicense, and distribute the
45
+ Work and such Derivative Works in Source or Object form.
46
+
47
+ 3. Grant of Patent License. Subject to the terms and conditions of
48
+ this License, each Contributor hereby grants to You a perpetual,
49
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
50
+ patent license to make, have made, use, offer to sell, sell,
51
+ import, and otherwise transfer the Work.
52
+
53
+ 4. Redistribution. You may reproduce and distribute copies of the
54
+ Work or Derivative Works thereof in any medium, with or without
55
+ modifications, and in Source or Object form, provided that You
56
+ meet the following conditions:
57
+
58
+ (a) You must give any other recipients of the Work or
59
+ Derivative Works a copy of this License; and
60
+
61
+ (b) You must cause any modified files to carry prominent notices
62
+ stating that You changed the files; and
63
+
64
+ (c) You must retain, in the Source form of any Derivative Works
65
+ that You distribute, all copyright, patent, trademark, and
66
+ attribution notices from the Source form of the Work; and
67
+
68
+ (d) If the Work includes a "NOTICE" text file, You must include
69
+ a readable copy of the attribution notices contained within
70
+ such NOTICE file.
71
+
72
+ 5. Submission of Contributions.
73
+
74
+ 6. Trademarks. This License does not grant permission to use the trade
75
+ names, trademarks, service marks, or product names of the Licensor.
76
+
77
+ 7. Disclaimer of Warranty. Unless required by applicable law or
78
+ agreed to in writing, Licensor provides the Work on an "AS IS" BASIS,
79
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
80
+
81
+ 8. Limitation of Liability. In no event shall any Contributor be
82
+ liable for any damages arising from the Work.
83
+
84
+ 9. Accepting Warranty or Additional Liability.
85
+
86
+ Copyright 2026 NexusQuant Team
87
+
88
+ Licensed under the Apache License, Version 2.0 (the "License");
89
+ you may not use this file except in compliance with the License.
90
+ You may obtain a copy of the License at
91
+
92
+ http://www.apache.org/licenses/LICENSE-2.0
93
+
94
+ Unless required by applicable law or agreed to in writing, software
95
+ distributed under the License is distributed on an "AS IS" BASIS,
96
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
97
+ See the License for the specific language governing permissions and
98
+ limitations under the License.
@@ -0,0 +1,142 @@
1
+ Metadata-Version: 2.4
2
+ Name: nexusquant-kv
3
+ Version: 0.4.0
4
+ Summary: Training-free KV cache compression via E8 lattice quantization and attention-aware token eviction
5
+ Author: Joao Marques
6
+ License: Apache-2.0
7
+ Project-URL: Repository, https://github.com/jagmarques/nexusquant
8
+ Keywords: kv-cache,compression,quantization,llm,inference,e8-lattice,token-eviction,transformers,long-context,memory-efficient,attention,vector-quantization
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: Apache Software License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Classifier: Operating System :: OS Independent
20
+ Requires-Python: >=3.9
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: torch>=2.0
24
+ Requires-Dist: numpy>=1.24
25
+ Requires-Dist: zstandard>=0.22
26
+ Provides-Extra: hf
27
+ Requires-Dist: transformers>=4.40; extra == "hf"
28
+ Provides-Extra: all
29
+ Requires-Dist: transformers>=4.40; extra == "all"
30
+ Requires-Dist: accelerate>=0.26; extra == "all"
31
+ Dynamic: license-file
32
+
33
+ <p align="center">
34
+ <strong>NexusQuant</strong>
35
+ </p>
36
+ <p align="center">
37
+ Compress your LLM's KV cache 10-33x. Training-free. One line of code.
38
+ </p>
39
+ <p align="center">
40
+ <a href="https://github.com/jagmarques/nexusquant/blob/main/LICENSE"><img src="https://img.shields.io/badge/License-Apache_2.0-blue.svg?style=flat-square" alt="License"></a>
41
+ <a href="https://www.python.org/downloads/"><img src="https://img.shields.io/badge/python-3.9+-blue?style=flat-square&logo=python&logoColor=white" alt="Python"></a>
42
+ <a href="https://github.com/jagmarques/nexusquant"><img src="https://img.shields.io/github/stars/jagmarques/nexusquant?style=social" alt="Stars"></a>
43
+ </p>
44
+
45
+ ---
46
+
47
+ Token eviction + E8 lattice quantization, applied once after prefill. No training, no calibration data, no model modifications.
48
+
49
+ ## Install
50
+
51
+ ```bash
52
+ pip install nexusquant-kv
53
+ pip install "nexusquant-kv[hf]" # with HuggingFace transformers
54
+ ```
55
+
56
+ ## Quickstart
57
+
58
+ ```python
59
+ from nexusquant import nexusquant_evict
60
+
61
+ with nexusquant_evict(model, quality="balanced"):
62
+ output = model.generate(input_ids, max_new_tokens=512)
63
+ ```
64
+
65
+ ## Why
66
+
67
+ | Without NexusQuant | With NexusQuant |
68
+ |---|---|
69
+ | 128K context → 80 GB KV cache | 128K context → 5 GB KV cache (17x) |
70
+ | OOM at 32K on a single A100 | 500K+ tokens on one A100 |
71
+ | Needs 8× A100 cluster for long context | Single GPU, single machine |
72
+ | Deploy a fine-tuned retrieval model | One `with` block, no model changes |
73
+
74
+ ## Quality presets
75
+
76
+ Measured on Mistral-7B, A100, FP16. Compression ratios include all overhead (scales, indices, metadata).
77
+
78
+ | Preset | Compression | PPL degradation | Context on 80 GB |
79
+ |---|---|---|---|
80
+ | `high` | 10x | +0.4% | ~1.3M tokens |
81
+ | `balanced` | 17x | +1.3% | ~2.2M tokens |
82
+ | `max` | 33x | +2.6% | ~4.2M tokens |
83
+
84
+ Validated on Mistral-7B, TinyLlama-1.1B, Llama-3-8B across academic, technical, and creative text.
85
+
86
+ ## How it works
87
+
88
+ 1. **Importance scoring** — rank tokens by cross-head attention weight (key-key dot product)
89
+ 2. **Token eviction** — drop lowest-scoring tokens; always keep BOS and a recent sliding window
90
+ 3. **RoPE removal** — undo rotary embeddings on keys so they share a common subspace, reducing quantization error by ~0.7 percentage points
91
+ 4. **Hadamard rotation** — spread energy uniformly across dimensions so no outlier dominates the quantization scale
92
+ 5. **E8 lattice quantization** — quantize 8-float groups onto the E8 root lattice (densest sphere packing in 8D), 2 bits/dim
93
+ 6. **Delta coding + zstd** — consecutive tokens produce similar lattice indices; storing deltas then compressing with zstd yields another 2-3x on the index stream
94
+
95
+ Token eviction reduces *count* (2.5x at 60% eviction). E8 quantization reduces *precision* (~7x after entropy coding). Combined: 17x.
96
+
97
+ ## Compared to
98
+
99
+ | Method | Compression | PPL degradation | Training required |
100
+ |---|---|---|---|
101
+ | **NexusQuant** | **10-33x** | **+0.4-2.6%** | **No** |
102
+ | TurboQuant (Google) | ~5-6x | ~0% | No |
103
+ | KVTC (NVIDIA) | up to 20x | <1% | Yes (calibration, ~10 min) |
104
+ | CommVQ (Apple) | ~8x | ~0% | Yes (full retraining) |
105
+ | Palu | 11x | ~25% rel | Yes (calibration) |
106
+
107
+ NexusQuant is the highest-compression training-free method. KVTC achieves comparable ratios with better quality but requires calibration data. Competitor numbers are from their published papers, not reproduced on our hardware.
108
+
109
+ ## Supported models
110
+
111
+ Any HuggingFace causal LM using split-half RoPE (the standard since Llama-2):
112
+
113
+ - Llama family (Llama-2, Llama-3, Llama-3.1)
114
+ - Mistral / Mixtral
115
+ - Qwen
116
+ - Phi
117
+ - Gemma
118
+
119
+ Not yet supported: models with interleaved RoPE (GPT-NeoX, GPT-J).
120
+
121
+ ## Limitations
122
+
123
+ - **Quality is text-dependent.** Creative/narrative text degrades more than structured/technical text at the same compression ratio. Test on your actual workload before deploying.
124
+ - **Short prefixes hurt.** Prefixes under 500 tokens see more degradation than the numbers above, which were measured at 1600-3500 tokens. The importance scorer needs enough tokens to distinguish signal from noise.
125
+ - **E8 quantization is CPU-bound.** A production deployment needs Triton/CUDA kernels for the quantization step. The current implementation writes dequantized values back to the cache for compatibility — actual GPU memory savings require native compact storage.
126
+ - **Eviction is permanent.** Evicted tokens are gone. If your task requires precise recall of a specific token, measure eviction sensitivity on that task first.
127
+
128
+ ## Citation
129
+
130
+ ```bibtex
131
+ @software{nexusquant2025,
132
+ author = {Marques, Joao},
133
+ title = {{NexusQuant}: Training-Free {KV} Cache Compression via {E8} Lattice Quantization},
134
+ year = {2025},
135
+ url = {https://github.com/jagmarques/nexusquant},
136
+ license = {Apache-2.0},
137
+ }
138
+ ```
139
+
140
+ ## License
141
+
142
+ Apache 2.0. See [LICENSE](LICENSE).
@@ -0,0 +1,110 @@
1
+ <p align="center">
2
+ <strong>NexusQuant</strong>
3
+ </p>
4
+ <p align="center">
5
+ Compress your LLM's KV cache 10-33x. Training-free. One line of code.
6
+ </p>
7
+ <p align="center">
8
+ <a href="https://github.com/jagmarques/nexusquant/blob/main/LICENSE"><img src="https://img.shields.io/badge/License-Apache_2.0-blue.svg?style=flat-square" alt="License"></a>
9
+ <a href="https://www.python.org/downloads/"><img src="https://img.shields.io/badge/python-3.9+-blue?style=flat-square&logo=python&logoColor=white" alt="Python"></a>
10
+ <a href="https://github.com/jagmarques/nexusquant"><img src="https://img.shields.io/github/stars/jagmarques/nexusquant?style=social" alt="Stars"></a>
11
+ </p>
12
+
13
+ ---
14
+
15
+ Token eviction + E8 lattice quantization, applied once after prefill. No training, no calibration data, no model modifications.
16
+
17
+ ## Install
18
+
19
+ ```bash
20
+ pip install nexusquant-kv
21
+ pip install "nexusquant-kv[hf]" # with HuggingFace transformers
22
+ ```
23
+
24
+ ## Quickstart
25
+
26
+ ```python
27
+ from nexusquant import nexusquant_evict
28
+
29
+ with nexusquant_evict(model, quality="balanced"):
30
+ output = model.generate(input_ids, max_new_tokens=512)
31
+ ```
32
+
33
+ ## Why
34
+
35
+ | Without NexusQuant | With NexusQuant |
36
+ |---|---|
37
+ | 128K context → 80 GB KV cache | 128K context → 5 GB KV cache (17x) |
38
+ | OOM at 32K on a single A100 | 500K+ tokens on one A100 |
39
+ | Needs 8× A100 cluster for long context | Single GPU, single machine |
40
+ | Deploy a fine-tuned retrieval model | One `with` block, no model changes |
41
+
42
+ ## Quality presets
43
+
44
+ Measured on Mistral-7B, A100, FP16. Compression ratios include all overhead (scales, indices, metadata).
45
+
46
+ | Preset | Compression | PPL degradation | Context on 80 GB |
47
+ |---|---|---|---|
48
+ | `high` | 10x | +0.4% | ~1.3M tokens |
49
+ | `balanced` | 17x | +1.3% | ~2.2M tokens |
50
+ | `max` | 33x | +2.6% | ~4.2M tokens |
51
+
52
+ Validated on Mistral-7B, TinyLlama-1.1B, Llama-3-8B across academic, technical, and creative text.
53
+
54
+ ## How it works
55
+
56
+ 1. **Importance scoring** — rank tokens by cross-head attention weight (key-key dot product)
57
+ 2. **Token eviction** — drop lowest-scoring tokens; always keep BOS and a recent sliding window
58
+ 3. **RoPE removal** — undo rotary embeddings on keys so they share a common subspace, reducing quantization error by ~0.7 percentage points
59
+ 4. **Hadamard rotation** — spread energy uniformly across dimensions so no outlier dominates the quantization scale
60
+ 5. **E8 lattice quantization** — quantize 8-float groups onto the E8 root lattice (densest sphere packing in 8D), 2 bits/dim
61
+ 6. **Delta coding + zstd** — consecutive tokens produce similar lattice indices; storing deltas then compressing with zstd yields another 2-3x on the index stream
62
+
63
+ Token eviction reduces *count* (2.5x at 60% eviction). E8 quantization reduces *precision* (~7x after entropy coding). Combined: 17x.
64
+
65
+ ## Compared to
66
+
67
+ | Method | Compression | PPL degradation | Training required |
68
+ |---|---|---|---|
69
+ | **NexusQuant** | **10-33x** | **+0.4-2.6%** | **No** |
70
+ | TurboQuant (Google) | ~5-6x | ~0% | No |
71
+ | KVTC (NVIDIA) | up to 20x | <1% | Yes (calibration, ~10 min) |
72
+ | CommVQ (Apple) | ~8x | ~0% | Yes (full retraining) |
73
+ | Palu | 11x | ~25% rel | Yes (calibration) |
74
+
75
+ NexusQuant is the highest-compression training-free method. KVTC achieves comparable ratios with better quality but requires calibration data. Competitor numbers are from their published papers, not reproduced on our hardware.
76
+
77
+ ## Supported models
78
+
79
+ Any HuggingFace causal LM using split-half RoPE (the standard since Llama-2):
80
+
81
+ - Llama family (Llama-2, Llama-3, Llama-3.1)
82
+ - Mistral / Mixtral
83
+ - Qwen
84
+ - Phi
85
+ - Gemma
86
+
87
+ Not yet supported: models with interleaved RoPE (GPT-NeoX, GPT-J).
88
+
89
+ ## Limitations
90
+
91
+ - **Quality is text-dependent.** Creative/narrative text degrades more than structured/technical text at the same compression ratio. Test on your actual workload before deploying.
92
+ - **Short prefixes hurt.** Prefixes under 500 tokens see more degradation than the numbers above, which were measured at 1600-3500 tokens. The importance scorer needs enough tokens to distinguish signal from noise.
93
+ - **E8 quantization is CPU-bound.** A production deployment needs Triton/CUDA kernels for the quantization step. The current implementation writes dequantized values back to the cache for compatibility — actual GPU memory savings require native compact storage.
94
+ - **Eviction is permanent.** Evicted tokens are gone. If your task requires precise recall of a specific token, measure eviction sensitivity on that task first.
95
+
96
+ ## Citation
97
+
98
+ ```bibtex
99
+ @software{nexusquant2025,
100
+ author = {Marques, Joao},
101
+ title = {{NexusQuant}: Training-Free {KV} Cache Compression via {E8} Lattice Quantization},
102
+ year = {2025},
103
+ url = {https://github.com/jagmarques/nexusquant},
104
+ license = {Apache-2.0},
105
+ }
106
+ ```
107
+
108
+ ## License
109
+
110
+ Apache 2.0. See [LICENSE](LICENSE).
@@ -0,0 +1,32 @@
1
+ """NexusQuant: Training-Free KV Cache Compression via E8 Lattice VQ + Token Eviction.
2
+
3
+ GPU-validated numbers (Mistral-7B, A100, all overhead included):
4
+ 10.4x at +0.43% PPL (quality="high", 35% eviction)
5
+ 16.8x at +1.34% PPL (quality="balanced", 60% eviction)
6
+ 33.3x at +2.64% PPL (quality="max", 80% eviction)
7
+
8
+ Training-free. Zero calibration. One line of code. Apache 2.0.
9
+
10
+ Quick start:
11
+ from nexusquant import nexusquant_evict
12
+
13
+ with nexusquant_evict(model, quality="balanced"):
14
+ output = model.generate(input_ids, max_new_tokens=100)
15
+ """
16
+
17
+ __version__ = "0.4.0"
18
+
19
+ from nexusquant.pipeline import (
20
+ NexusQuantFast,
21
+ NexusQuantSimple,
22
+ NexusQuantMax,
23
+ NexusQuantEvict,
24
+ compress_kv_cache,
25
+ )
26
+
27
+ from nexusquant.integrations.huggingface import (
28
+ nexusquant,
29
+ nexusquant_simple,
30
+ nexusquant_max,
31
+ nexusquant_evict,
32
+ )
@@ -0,0 +1,40 @@
1
+ from nexusquant.core.e8_lattice import E8Lattice
2
+ from nexusquant.core.hadamard import hadamard_matrix, fht, ifht
3
+ from nexusquant.core.dp_allocator import (
4
+ dp_bit_allocation,
5
+ calibrate_distortion_factors,
6
+ DISTORTION_THEORETICAL,
7
+ DISTORTION_EMPIRICAL,
8
+ )
9
+ from nexusquant.core.rope_utils import inverse_rope, forward_rope
10
+ from nexusquant.core.token_merger import merge_tokens, merge_and_drop
11
+ from nexusquant.core.entropy_coder import (
12
+ encode_e8,
13
+ decode_e8,
14
+ measure_entropy,
15
+ measure_e8_entropy,
16
+ e8_quantize_with_entropy,
17
+ )
18
+ from nexusquant.core.temporal_codec import (
19
+ compress_indices,
20
+ decompress_indices,
21
+ temporal_delta_encode,
22
+ temporal_delta_decode,
23
+ measure_compression,
24
+ )
25
+ from nexusquant.core.nsn import forward_nsn, inverse_nsn, NSNStats
26
+ from nexusquant.core.tcc import (
27
+ forward_tcc,
28
+ inverse_tcc,
29
+ compute_compression_stats,
30
+ TCCCompressed,
31
+ )
32
+ from nexusquant.core.optimal_shrinkage import (
33
+ estimate_noise_sigma,
34
+ optimal_shrinkage_frobenius,
35
+ optimal_hard_threshold,
36
+ effective_dims,
37
+ variance_retained,
38
+ reconstruct_with_shrinkage,
39
+ mp_median,
40
+ )