nexusquant-kv 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nexusquant_kv-0.4.0/LICENSE +98 -0
- nexusquant_kv-0.4.0/PKG-INFO +142 -0
- nexusquant_kv-0.4.0/README.md +110 -0
- nexusquant_kv-0.4.0/nexusquant/__init__.py +32 -0
- nexusquant_kv-0.4.0/nexusquant/core/__init__.py +40 -0
- nexusquant_kv-0.4.0/nexusquant/core/compression_accounting.py +372 -0
- nexusquant_kv-0.4.0/nexusquant/core/dp_allocator.py +161 -0
- nexusquant_kv-0.4.0/nexusquant/core/e8_lattice.py +124 -0
- nexusquant_kv-0.4.0/nexusquant/core/entropy_coder.py +615 -0
- nexusquant_kv-0.4.0/nexusquant/core/hadamard.py +87 -0
- nexusquant_kv-0.4.0/nexusquant/core/mp_threshold.py +79 -0
- nexusquant_kv-0.4.0/nexusquant/core/nsn.py +233 -0
- nexusquant_kv-0.4.0/nexusquant/core/optimal_shrinkage.py +258 -0
- nexusquant_kv-0.4.0/nexusquant/core/rope_utils.py +109 -0
- nexusquant_kv-0.4.0/nexusquant/core/tcc.py +470 -0
- nexusquant_kv-0.4.0/nexusquant/core/temporal_codec.py +157 -0
- nexusquant_kv-0.4.0/nexusquant/core/token_merger.py +123 -0
- nexusquant_kv-0.4.0/nexusquant/integrations/__init__.py +1 -0
- nexusquant_kv-0.4.0/nexusquant/integrations/huggingface.py +617 -0
- nexusquant_kv-0.4.0/nexusquant/pipeline.py +651 -0
- nexusquant_kv-0.4.0/nexusquant_kv.egg-info/PKG-INFO +142 -0
- nexusquant_kv-0.4.0/nexusquant_kv.egg-info/SOURCES.txt +26 -0
- nexusquant_kv-0.4.0/nexusquant_kv.egg-info/dependency_links.txt +1 -0
- nexusquant_kv-0.4.0/nexusquant_kv.egg-info/requires.txt +10 -0
- nexusquant_kv-0.4.0/nexusquant_kv.egg-info/top_level.txt +1 -0
- nexusquant_kv-0.4.0/pyproject.toml +45 -0
- nexusquant_kv-0.4.0/setup.cfg +4 -0
- nexusquant_kv-0.4.0/tests/test_core.py +174 -0
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
Apache License
|
|
2
|
+
Version 2.0, January 2004
|
|
3
|
+
http://www.apache.org/licenses/
|
|
4
|
+
|
|
5
|
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
|
6
|
+
|
|
7
|
+
1. Definitions.
|
|
8
|
+
|
|
9
|
+
"License" shall mean the terms and conditions for use, reproduction,
|
|
10
|
+
and distribution as defined by Sections 1 through 9 of this document.
|
|
11
|
+
|
|
12
|
+
"Licensor" shall mean the copyright owner or entity authorized by
|
|
13
|
+
the copyright owner that is granting the License.
|
|
14
|
+
|
|
15
|
+
"Legal Entity" shall mean the union of the acting entity and all
|
|
16
|
+
other entities that control, are controlled by, or are under common
|
|
17
|
+
control with that entity.
|
|
18
|
+
|
|
19
|
+
"You" (or "Your") shall mean an individual or Legal Entity
|
|
20
|
+
exercising permissions granted by this License.
|
|
21
|
+
|
|
22
|
+
"Source" form shall mean the preferred form for making modifications,
|
|
23
|
+
including but not limited to software source code, documentation
|
|
24
|
+
source, and configuration files.
|
|
25
|
+
|
|
26
|
+
"Object" form shall mean any form resulting from mechanical
|
|
27
|
+
transformation or translation of a Source form, including but
|
|
28
|
+
not limited to compiled object code, generated documentation,
|
|
29
|
+
and conversions to other media types.
|
|
30
|
+
|
|
31
|
+
"Work" shall mean the work of authorship, whether in Source or
|
|
32
|
+
Object form, made available under the License.
|
|
33
|
+
|
|
34
|
+
"Contribution" shall mean any work of authorship submitted to the
|
|
35
|
+
Licensor for inclusion in the Work.
|
|
36
|
+
|
|
37
|
+
"Contributor" shall mean Licensor and any Legal Entity on behalf
|
|
38
|
+
of whom a Contribution has been received by the Licensor.
|
|
39
|
+
|
|
40
|
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
|
41
|
+
this License, each Contributor hereby grants to You a perpetual,
|
|
42
|
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
|
43
|
+
copyright license to reproduce, prepare Derivative Works of,
|
|
44
|
+
publicly display, publicly perform, sublicense, and distribute the
|
|
45
|
+
Work and such Derivative Works in Source or Object form.
|
|
46
|
+
|
|
47
|
+
3. Grant of Patent License. Subject to the terms and conditions of
|
|
48
|
+
this License, each Contributor hereby grants to You a perpetual,
|
|
49
|
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
|
50
|
+
patent license to make, have made, use, offer to sell, sell,
|
|
51
|
+
import, and otherwise transfer the Work.
|
|
52
|
+
|
|
53
|
+
4. Redistribution. You may reproduce and distribute copies of the
|
|
54
|
+
Work or Derivative Works thereof in any medium, with or without
|
|
55
|
+
modifications, and in Source or Object form, provided that You
|
|
56
|
+
meet the following conditions:
|
|
57
|
+
|
|
58
|
+
(a) You must give any other recipients of the Work or
|
|
59
|
+
Derivative Works a copy of this License; and
|
|
60
|
+
|
|
61
|
+
(b) You must cause any modified files to carry prominent notices
|
|
62
|
+
stating that You changed the files; and
|
|
63
|
+
|
|
64
|
+
(c) You must retain, in the Source form of any Derivative Works
|
|
65
|
+
that You distribute, all copyright, patent, trademark, and
|
|
66
|
+
attribution notices from the Source form of the Work; and
|
|
67
|
+
|
|
68
|
+
(d) If the Work includes a "NOTICE" text file, You must include
|
|
69
|
+
a readable copy of the attribution notices contained within
|
|
70
|
+
such NOTICE file.
|
|
71
|
+
|
|
72
|
+
5. Submission of Contributions.
|
|
73
|
+
|
|
74
|
+
6. Trademarks. This License does not grant permission to use the trade
|
|
75
|
+
names, trademarks, service marks, or product names of the Licensor.
|
|
76
|
+
|
|
77
|
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
|
78
|
+
agreed to in writing, Licensor provides the Work on an "AS IS" BASIS,
|
|
79
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
|
|
80
|
+
|
|
81
|
+
8. Limitation of Liability. In no event shall any Contributor be
|
|
82
|
+
liable for any damages arising from the Work.
|
|
83
|
+
|
|
84
|
+
9. Accepting Warranty or Additional Liability.
|
|
85
|
+
|
|
86
|
+
Copyright 2026 NexusQuant Team
|
|
87
|
+
|
|
88
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
89
|
+
you may not use this file except in compliance with the License.
|
|
90
|
+
You may obtain a copy of the License at
|
|
91
|
+
|
|
92
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
93
|
+
|
|
94
|
+
Unless required by applicable law or agreed to in writing, software
|
|
95
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
96
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
97
|
+
See the License for the specific language governing permissions and
|
|
98
|
+
limitations under the License.
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: nexusquant-kv
|
|
3
|
+
Version: 0.4.0
|
|
4
|
+
Summary: Training-free KV cache compression via E8 lattice quantization and attention-aware token eviction
|
|
5
|
+
Author: Joao Marques
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Project-URL: Repository, https://github.com/jagmarques/nexusquant
|
|
8
|
+
Keywords: kv-cache,compression,quantization,llm,inference,e8-lattice,token-eviction,transformers,long-context,memory-efficient,attention,vector-quantization
|
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
|
10
|
+
Classifier: Intended Audience :: Science/Research
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Classifier: Operating System :: OS Independent
|
|
20
|
+
Requires-Python: >=3.9
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: torch>=2.0
|
|
24
|
+
Requires-Dist: numpy>=1.24
|
|
25
|
+
Requires-Dist: zstandard>=0.22
|
|
26
|
+
Provides-Extra: hf
|
|
27
|
+
Requires-Dist: transformers>=4.40; extra == "hf"
|
|
28
|
+
Provides-Extra: all
|
|
29
|
+
Requires-Dist: transformers>=4.40; extra == "all"
|
|
30
|
+
Requires-Dist: accelerate>=0.26; extra == "all"
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
|
|
33
|
+
<p align="center">
|
|
34
|
+
<strong>NexusQuant</strong>
|
|
35
|
+
</p>
|
|
36
|
+
<p align="center">
|
|
37
|
+
Compress your LLM's KV cache 10-33x. Training-free. One line of code.
|
|
38
|
+
</p>
|
|
39
|
+
<p align="center">
|
|
40
|
+
<a href="https://github.com/jagmarques/nexusquant/blob/main/LICENSE"><img src="https://img.shields.io/badge/License-Apache_2.0-blue.svg?style=flat-square" alt="License"></a>
|
|
41
|
+
<a href="https://www.python.org/downloads/"><img src="https://img.shields.io/badge/python-3.9+-blue?style=flat-square&logo=python&logoColor=white" alt="Python"></a>
|
|
42
|
+
<a href="https://github.com/jagmarques/nexusquant"><img src="https://img.shields.io/github/stars/jagmarques/nexusquant?style=social" alt="Stars"></a>
|
|
43
|
+
</p>
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
Token eviction + E8 lattice quantization, applied once after prefill. No training, no calibration data, no model modifications.
|
|
48
|
+
|
|
49
|
+
## Install
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
pip install nexusquant-kv
|
|
53
|
+
pip install "nexusquant-kv[hf]" # with HuggingFace transformers
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Quickstart
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
from nexusquant import nexusquant_evict
|
|
60
|
+
|
|
61
|
+
with nexusquant_evict(model, quality="balanced"):
|
|
62
|
+
output = model.generate(input_ids, max_new_tokens=512)
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Why
|
|
66
|
+
|
|
67
|
+
| Without NexusQuant | With NexusQuant |
|
|
68
|
+
|---|---|
|
|
69
|
+
| 128K context → 80 GB KV cache | 128K context → 5 GB KV cache (17x) |
|
|
70
|
+
| OOM at 32K on a single A100 | 500K+ tokens on one A100 |
|
|
71
|
+
| Needs 8× A100 cluster for long context | Single GPU, single machine |
|
|
72
|
+
| Deploy a fine-tuned retrieval model | One `with` block, no model changes |
|
|
73
|
+
|
|
74
|
+
## Quality presets
|
|
75
|
+
|
|
76
|
+
Measured on Mistral-7B, A100, FP16. Compression ratios include all overhead (scales, indices, metadata).
|
|
77
|
+
|
|
78
|
+
| Preset | Compression | PPL degradation | Context on 80 GB |
|
|
79
|
+
|---|---|---|---|
|
|
80
|
+
| `high` | 10x | +0.4% | ~1.3M tokens |
|
|
81
|
+
| `balanced` | 17x | +1.3% | ~2.2M tokens |
|
|
82
|
+
| `max` | 33x | +2.6% | ~4.2M tokens |
|
|
83
|
+
|
|
84
|
+
Validated on Mistral-7B, TinyLlama-1.1B, Llama-3-8B across academic, technical, and creative text.
|
|
85
|
+
|
|
86
|
+
## How it works
|
|
87
|
+
|
|
88
|
+
1. **Importance scoring** — rank tokens by cross-head attention weight (key-key dot product)
|
|
89
|
+
2. **Token eviction** — drop lowest-scoring tokens; always keep BOS and a recent sliding window
|
|
90
|
+
3. **RoPE removal** — undo rotary embeddings on keys so they share a common subspace, reducing quantization error ~0.7 pp
|
|
91
|
+
4. **Hadamard rotation** — spread energy uniformly across dimensions so no outlier dominates the quantization scale
|
|
92
|
+
5. **E8 lattice quantization** — quantize 8-float groups onto the E8 root lattice (densest sphere packing in 8D), 2 bits/dim
|
|
93
|
+
6. **Delta coding + zstd** — consecutive tokens produce similar lattice indices; storing deltas then compressing with zstd yields another 2-3x on the index stream
|
|
94
|
+
|
|
95
|
+
Token eviction reduces *count* (2.5x at 60% eviction). E8 quantization reduces *precision* (~7x after entropy coding). Combined: 17x.
|
|
96
|
+
|
|
97
|
+
## Compared to
|
|
98
|
+
|
|
99
|
+
| Method | Compression | PPL degradation | Training required |
|
|
100
|
+
|---|---|---|---|
|
|
101
|
+
| **NexusQuant** | **10-33x** | **+0.4-2.6%** | **No** |
|
|
102
|
+
| TurboQuant (Google) | ~5-6x | ~0% | No |
|
|
103
|
+
| KVTC (NVIDIA) | up to 20x | <1% | Yes (calibration, ~10 min) |
|
|
104
|
+
| CommVQ (Apple) | ~8x | ~0% | Yes (full retraining) |
|
|
105
|
+
| Palu | 11x | ~25% rel | Yes (calibration) |
|
|
106
|
+
|
|
107
|
+
NexusQuant is the highest-compression training-free method. KVTC achieves comparable ratios with better quality but requires calibration data. Competitor numbers are from their published papers, not reproduced on our hardware.
|
|
108
|
+
|
|
109
|
+
## Supported models
|
|
110
|
+
|
|
111
|
+
Any HuggingFace causal LM using split-half RoPE (the standard since Llama-2):
|
|
112
|
+
|
|
113
|
+
- Llama family (Llama-2, Llama-3, Llama-3.1)
|
|
114
|
+
- Mistral / Mixtral
|
|
115
|
+
- Qwen
|
|
116
|
+
- Phi
|
|
117
|
+
- Gemma
|
|
118
|
+
|
|
119
|
+
Not yet supported: models with interleaved RoPE (GPT-NeoX, GPT-J).
|
|
120
|
+
|
|
121
|
+
## Limitations
|
|
122
|
+
|
|
123
|
+
- **Quality is text-dependent.** Creative/narrative text degrades more than structured/technical text at the same compression ratio. Test on your actual workload before deploying.
|
|
124
|
+
- **Short prefixes hurt.** Prefixes under 500 tokens see more degradation than the numbers above, which were measured at 1600-3500 tokens. The importance scorer needs enough tokens to distinguish signal from noise.
|
|
125
|
+
- **E8 quantization is CPU-bound.** A production deployment needs Triton/CUDA kernels for the quantization step. The current implementation writes dequantized values back to the cache for compatibility — actual GPU memory savings require native compact storage.
|
|
126
|
+
- **Eviction is permanent.** Evicted tokens are gone. If your task requires precise recall of a specific token, measure eviction sensitivity on that task first.
|
|
127
|
+
|
|
128
|
+
## Citation
|
|
129
|
+
|
|
130
|
+
```bibtex
|
|
131
|
+
@software{nexusquant2025,
|
|
132
|
+
author = {Marques, Joao},
|
|
133
|
+
title = {{NexusQuant}: Training-Free {KV} Cache Compression via {E8} Lattice Quantization},
|
|
134
|
+
year = {2025},
|
|
135
|
+
url = {https://github.com/jagmarques/nexusquant},
|
|
136
|
+
license = {Apache-2.0},
|
|
137
|
+
}
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## License
|
|
141
|
+
|
|
142
|
+
Apache 2.0. See [LICENSE](LICENSE).
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<strong>NexusQuant</strong>
|
|
3
|
+
</p>
|
|
4
|
+
<p align="center">
|
|
5
|
+
Compress your LLM's KV cache 10-33x. Training-free. One line of code.
|
|
6
|
+
</p>
|
|
7
|
+
<p align="center">
|
|
8
|
+
<a href="https://github.com/jagmarques/nexusquant/blob/main/LICENSE"><img src="https://img.shields.io/badge/License-Apache_2.0-blue.svg?style=flat-square" alt="License"></a>
|
|
9
|
+
<a href="https://www.python.org/downloads/"><img src="https://img.shields.io/badge/python-3.9+-blue?style=flat-square&logo=python&logoColor=white" alt="Python"></a>
|
|
10
|
+
<a href="https://github.com/jagmarques/nexusquant"><img src="https://img.shields.io/github/stars/jagmarques/nexusquant?style=social" alt="Stars"></a>
|
|
11
|
+
</p>
|
|
12
|
+
|
|
13
|
+
---
|
|
14
|
+
|
|
15
|
+
Token eviction + E8 lattice quantization, applied once after prefill. No training, no calibration data, no model modifications.
|
|
16
|
+
|
|
17
|
+
## Install
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
pip install nexusquant-kv
|
|
21
|
+
pip install "nexusquant-kv[hf]" # with HuggingFace transformers
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Quickstart
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
from nexusquant import nexusquant_evict
|
|
28
|
+
|
|
29
|
+
with nexusquant_evict(model, quality="balanced"):
|
|
30
|
+
output = model.generate(input_ids, max_new_tokens=512)
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Why
|
|
34
|
+
|
|
35
|
+
| Without NexusQuant | With NexusQuant |
|
|
36
|
+
|---|---|
|
|
37
|
+
| 128K context → 80 GB KV cache | 128K context → 5 GB KV cache (17x) |
|
|
38
|
+
| OOM at 32K on a single A100 | 500K+ tokens on one A100 |
|
|
39
|
+
| Needs 8× A100 cluster for long context | Single GPU, single machine |
|
|
40
|
+
| Deploy a fine-tuned retrieval model | One `with` block, no model changes |
|
|
41
|
+
|
|
42
|
+
## Quality presets
|
|
43
|
+
|
|
44
|
+
Measured on Mistral-7B, A100, FP16. Compression ratios include all overhead (scales, indices, metadata).
|
|
45
|
+
|
|
46
|
+
| Preset | Compression | PPL degradation | Context on 80 GB |
|
|
47
|
+
|---|---|---|---|
|
|
48
|
+
| `high` | 10x | +0.4% | ~1.3M tokens |
|
|
49
|
+
| `balanced` | 17x | +1.3% | ~2.2M tokens |
|
|
50
|
+
| `max` | 33x | +2.6% | ~4.2M tokens |
|
|
51
|
+
|
|
52
|
+
Validated on Mistral-7B, TinyLlama-1.1B, Llama-3-8B across academic, technical, and creative text.
|
|
53
|
+
|
|
54
|
+
## How it works
|
|
55
|
+
|
|
56
|
+
1. **Importance scoring** — rank tokens by cross-head attention weight (key-key dot product)
|
|
57
|
+
2. **Token eviction** — drop lowest-scoring tokens; always keep BOS and a recent sliding window
|
|
58
|
+
3. **RoPE removal** — undo rotary embeddings on keys so they share a common subspace, reducing quantization error ~0.7 pp
|
|
59
|
+
4. **Hadamard rotation** — spread energy uniformly across dimensions so no outlier dominates the quantization scale
|
|
60
|
+
5. **E8 lattice quantization** — quantize 8-float groups onto the E8 root lattice (densest sphere packing in 8D), 2 bits/dim
|
|
61
|
+
6. **Delta coding + zstd** — consecutive tokens produce similar lattice indices; storing deltas then compressing with zstd yields another 2-3x on the index stream
|
|
62
|
+
|
|
63
|
+
Token eviction reduces *count* (2.5x at 60% eviction). E8 quantization reduces *precision* (~7x after entropy coding). Combined: 17x.
|
|
64
|
+
|
|
65
|
+
## Compared to
|
|
66
|
+
|
|
67
|
+
| Method | Compression | PPL degradation | Training required |
|
|
68
|
+
|---|---|---|---|
|
|
69
|
+
| **NexusQuant** | **10-33x** | **+0.4-2.6%** | **No** |
|
|
70
|
+
| TurboQuant (Google) | ~5-6x | ~0% | No |
|
|
71
|
+
| KVTC (NVIDIA) | up to 20x | <1% | Yes (calibration, ~10 min) |
|
|
72
|
+
| CommVQ (Apple) | ~8x | ~0% | Yes (full retraining) |
|
|
73
|
+
| Palu | 11x | ~25% rel | Yes (calibration) |
|
|
74
|
+
|
|
75
|
+
NexusQuant is the highest-compression training-free method. KVTC achieves comparable ratios with better quality but requires calibration data. Competitor numbers are from their published papers, not reproduced on our hardware.
|
|
76
|
+
|
|
77
|
+
## Supported models
|
|
78
|
+
|
|
79
|
+
Any HuggingFace causal LM using split-half RoPE (the standard since Llama-2):
|
|
80
|
+
|
|
81
|
+
- Llama family (Llama-2, Llama-3, Llama-3.1)
|
|
82
|
+
- Mistral / Mixtral
|
|
83
|
+
- Qwen
|
|
84
|
+
- Phi
|
|
85
|
+
- Gemma
|
|
86
|
+
|
|
87
|
+
Not yet supported: models with interleaved RoPE (GPT-NeoX, GPT-J).
|
|
88
|
+
|
|
89
|
+
## Limitations
|
|
90
|
+
|
|
91
|
+
- **Quality is text-dependent.** Creative/narrative text degrades more than structured/technical text at the same compression ratio. Test on your actual workload before deploying.
|
|
92
|
+
- **Short prefixes hurt.** Prefixes under 500 tokens see more degradation than the numbers above, which were measured at 1600-3500 tokens. The importance scorer needs enough tokens to distinguish signal from noise.
|
|
93
|
+
- **E8 quantization is CPU-bound.** A production deployment needs Triton/CUDA kernels for the quantization step. The current implementation writes dequantized values back to the cache for compatibility — actual GPU memory savings require native compact storage.
|
|
94
|
+
- **Eviction is permanent.** Evicted tokens are gone. If your task requires precise recall of a specific token, measure eviction sensitivity on that task first.
|
|
95
|
+
|
|
96
|
+
## Citation
|
|
97
|
+
|
|
98
|
+
```bibtex
|
|
99
|
+
@software{nexusquant2025,
|
|
100
|
+
author = {Marques, Joao},
|
|
101
|
+
title = {{NexusQuant}: Training-Free {KV} Cache Compression via {E8} Lattice Quantization},
|
|
102
|
+
year = {2025},
|
|
103
|
+
url = {https://github.com/jagmarques/nexusquant},
|
|
104
|
+
license = {Apache-2.0},
|
|
105
|
+
}
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## License
|
|
109
|
+
|
|
110
|
+
Apache 2.0. See [LICENSE](LICENSE).
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""NexusQuant: Training-Free KV Cache Compression via E8 Lattice VQ + Token Eviction.
|
|
2
|
+
|
|
3
|
+
GPU-validated numbers (Mistral-7B, A100, all overhead included):
|
|
4
|
+
10.4x at +0.43% PPL (quality="high", 35% eviction)
|
|
5
|
+
16.8x at +1.34% PPL (quality="balanced", 60% eviction)
|
|
6
|
+
33.3x at +2.64% PPL (quality="max", 80% eviction)
|
|
7
|
+
|
|
8
|
+
Training-free. Zero calibration. One line of code. Apache 2.0.
|
|
9
|
+
|
|
10
|
+
Quick start:
|
|
11
|
+
from nexusquant import nexusquant_evict
|
|
12
|
+
|
|
13
|
+
with nexusquant_evict(model, quality="balanced"):
|
|
14
|
+
output = model.generate(input_ids, max_new_tokens=100)
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
__version__ = "0.4.0"
|
|
18
|
+
|
|
19
|
+
from nexusquant.pipeline import (
|
|
20
|
+
NexusQuantFast,
|
|
21
|
+
NexusQuantSimple,
|
|
22
|
+
NexusQuantMax,
|
|
23
|
+
NexusQuantEvict,
|
|
24
|
+
compress_kv_cache,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
from nexusquant.integrations.huggingface import (
|
|
28
|
+
nexusquant,
|
|
29
|
+
nexusquant_simple,
|
|
30
|
+
nexusquant_max,
|
|
31
|
+
nexusquant_evict,
|
|
32
|
+
)
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from nexusquant.core.e8_lattice import E8Lattice
|
|
2
|
+
from nexusquant.core.hadamard import hadamard_matrix, fht, ifht
|
|
3
|
+
from nexusquant.core.dp_allocator import (
|
|
4
|
+
dp_bit_allocation,
|
|
5
|
+
calibrate_distortion_factors,
|
|
6
|
+
DISTORTION_THEORETICAL,
|
|
7
|
+
DISTORTION_EMPIRICAL,
|
|
8
|
+
)
|
|
9
|
+
from nexusquant.core.rope_utils import inverse_rope, forward_rope
|
|
10
|
+
from nexusquant.core.token_merger import merge_tokens, merge_and_drop
|
|
11
|
+
from nexusquant.core.entropy_coder import (
|
|
12
|
+
encode_e8,
|
|
13
|
+
decode_e8,
|
|
14
|
+
measure_entropy,
|
|
15
|
+
measure_e8_entropy,
|
|
16
|
+
e8_quantize_with_entropy,
|
|
17
|
+
)
|
|
18
|
+
from nexusquant.core.temporal_codec import (
|
|
19
|
+
compress_indices,
|
|
20
|
+
decompress_indices,
|
|
21
|
+
temporal_delta_encode,
|
|
22
|
+
temporal_delta_decode,
|
|
23
|
+
measure_compression,
|
|
24
|
+
)
|
|
25
|
+
from nexusquant.core.nsn import forward_nsn, inverse_nsn, NSNStats
|
|
26
|
+
from nexusquant.core.tcc import (
|
|
27
|
+
forward_tcc,
|
|
28
|
+
inverse_tcc,
|
|
29
|
+
compute_compression_stats,
|
|
30
|
+
TCCCompressed,
|
|
31
|
+
)
|
|
32
|
+
from nexusquant.core.optimal_shrinkage import (
|
|
33
|
+
estimate_noise_sigma,
|
|
34
|
+
optimal_shrinkage_frobenius,
|
|
35
|
+
optimal_hard_threshold,
|
|
36
|
+
effective_dims,
|
|
37
|
+
variance_retained,
|
|
38
|
+
reconstruct_with_shrinkage,
|
|
39
|
+
mp_median,
|
|
40
|
+
)
|