onecomp 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- onecomp-1.0.0/LICENSE +21 -0
- onecomp-1.0.0/PKG-INFO +303 -0
- onecomp-1.0.0/README.md +218 -0
- onecomp-1.0.0/benchmark/llama3-8b-gptq/quant_benchmark.py +95 -0
- onecomp-1.0.0/benchmark/llama3-8b-jointq/quant_benchmark.py +106 -0
- onecomp-1.0.0/benchmark/llama3-8b-qep-gptq/quant_benchmark.py +101 -0
- onecomp-1.0.0/benchmark/llama3-8b-various/quant_benchmark.py +70 -0
- onecomp-1.0.0/benchmark/qwen3-14b-gptq/quant_benchmark.py +88 -0
- onecomp-1.0.0/benchmark/qwen3-14b-jointq/quant_benchmark.py +106 -0
- onecomp-1.0.0/benchmark/qwen3-8b-gptq/quant_benchmark.py +88 -0
- onecomp-1.0.0/benchmark/qwen3-8b-jointq/quant_benchmark.py +106 -0
- onecomp-1.0.0/example/example_auto_run.py +23 -0
- onecomp-1.0.0/example/example_autobit.py +47 -0
- onecomp-1.0.0/example/example_gptq.py +45 -0
- onecomp-1.0.0/example/example_jointq.py +45 -0
- onecomp-1.0.0/example/example_qep_gptq.py +45 -0
- onecomp-1.0.0/example/example_save_load.py +60 -0
- onecomp-1.0.0/example/post_process/example_lora_sft.py +143 -0
- onecomp-1.0.0/example/post_process/example_lora_sft_knowledge.py +144 -0
- onecomp-1.0.0/example/pre_process/example_llama_preprocess_rtn.py +51 -0
- onecomp-1.0.0/example/pre_process/example_preprocess_save_load.py +122 -0
- onecomp-1.0.0/example/vllm_inference/example_autobit_vllm_inference.py +63 -0
- onecomp-1.0.0/example/vllm_inference/example_gptq_vllm_inference.py +74 -0
- onecomp-1.0.0/onecomp/__init__.py +23 -0
- onecomp-1.0.0/onecomp/__main__.py +8 -0
- onecomp-1.0.0/onecomp/__version__.py +9 -0
- onecomp-1.0.0/onecomp/analyzer/__init__.py +23 -0
- onecomp-1.0.0/onecomp/analyzer/cumulative_error.py +237 -0
- onecomp-1.0.0/onecomp/analyzer/weight_outlier.py +746 -0
- onecomp-1.0.0/onecomp/cli.py +89 -0
- onecomp-1.0.0/onecomp/log.py +32 -0
- onecomp-1.0.0/onecomp/model_config.py +105 -0
- onecomp-1.0.0/onecomp/post_process/__init__.py +24 -0
- onecomp-1.0.0/onecomp/post_process/_base.py +78 -0
- onecomp-1.0.0/onecomp/post_process/blockwise_ptq.py +76 -0
- onecomp-1.0.0/onecomp/post_process/post_process_lora_sft.py +1260 -0
- onecomp-1.0.0/onecomp/pre_process/__init__.py +11 -0
- onecomp-1.0.0/onecomp/pre_process/hadamard_utils.py +120444 -0
- onecomp-1.0.0/onecomp/pre_process/modeling_llama.py +94 -0
- onecomp-1.0.0/onecomp/pre_process/modeling_qwen3.py +93 -0
- onecomp-1.0.0/onecomp/pre_process/optimizer.py +226 -0
- onecomp-1.0.0/onecomp/pre_process/prepare_rotated_model.py +251 -0
- onecomp-1.0.0/onecomp/pre_process/preprocess_args.py +76 -0
- onecomp-1.0.0/onecomp/pre_process/quant_models.py +950 -0
- onecomp-1.0.0/onecomp/pre_process/rotation_utils.py +636 -0
- onecomp-1.0.0/onecomp/pre_process/train_rotation.py +575 -0
- onecomp-1.0.0/onecomp/qep/__init__.py +11 -0
- onecomp-1.0.0/onecomp/qep/_qep_config.py +56 -0
- onecomp-1.0.0/onecomp/qep/_quantize_with_qep.py +133 -0
- onecomp-1.0.0/onecomp/qep/_quantize_with_qep_arch.py +360 -0
- onecomp-1.0.0/onecomp/quantized_model_loader.py +374 -0
- onecomp-1.0.0/onecomp/quantizer/__init__.py +20 -0
- onecomp-1.0.0/onecomp/quantizer/_quantizer.py +915 -0
- onecomp-1.0.0/onecomp/quantizer/arb/__init__.py +7 -0
- onecomp-1.0.0/onecomp/quantizer/arb/_arb.py +140 -0
- onecomp-1.0.0/onecomp/quantizer/arb/arb_impl.py +333 -0
- onecomp-1.0.0/onecomp/quantizer/autobit/__init__.py +13 -0
- onecomp-1.0.0/onecomp/quantizer/autobit/_autobit.py +607 -0
- onecomp-1.0.0/onecomp/quantizer/autobit/activation_stats.py +275 -0
- onecomp-1.0.0/onecomp/quantizer/autobit/dbf_fallback.py +73 -0
- onecomp-1.0.0/onecomp/quantizer/autobit/ilp.py +336 -0
- onecomp-1.0.0/onecomp/quantizer/autobit/manual.py +30 -0
- onecomp-1.0.0/onecomp/quantizer/autobit/visualize.py +395 -0
- onecomp-1.0.0/onecomp/quantizer/cq/__init__.py +7 -0
- onecomp-1.0.0/onecomp/quantizer/cq/_cq.py +113 -0
- onecomp-1.0.0/onecomp/quantizer/cq/cq_impl.py +211 -0
- onecomp-1.0.0/onecomp/quantizer/dbf/__init__.py +14 -0
- onecomp-1.0.0/onecomp/quantizer/dbf/_dbf.py +400 -0
- onecomp-1.0.0/onecomp/quantizer/dbf/admm_extended.py +815 -0
- onecomp-1.0.0/onecomp/quantizer/dbf/balance.py +232 -0
- onecomp-1.0.0/onecomp/quantizer/dbf/config.py +63 -0
- onecomp-1.0.0/onecomp/quantizer/dbf/dbf_impl.py +190 -0
- onecomp-1.0.0/onecomp/quantizer/dbf/dbf_layer.py +263 -0
- onecomp-1.0.0/onecomp/quantizer/dbf/dbf_original.py +788 -0
- onecomp-1.0.0/onecomp/quantizer/dbf/fine_tune.py +310 -0
- onecomp-1.0.0/onecomp/quantizer/dbf/middle.py +1153 -0
- onecomp-1.0.0/onecomp/quantizer/gemlite.py +141 -0
- onecomp-1.0.0/onecomp/quantizer/gptq/__init__.py +9 -0
- onecomp-1.0.0/onecomp/quantizer/gptq/_gptq.py +823 -0
- onecomp-1.0.0/onecomp/quantizer/gptq/config.py +101 -0
- onecomp-1.0.0/onecomp/quantizer/gptq/gptq_layer.py +509 -0
- onecomp-1.0.0/onecomp/quantizer/jointq/__init__.py +9 -0
- onecomp-1.0.0/onecomp/quantizer/jointq/_jointq.py +368 -0
- onecomp-1.0.0/onecomp/quantizer/jointq/core/__init__.py +14 -0
- onecomp-1.0.0/onecomp/quantizer/jointq/core/__version__.py +9 -0
- onecomp-1.0.0/onecomp/quantizer/jointq/core/clip.py +212 -0
- onecomp-1.0.0/onecomp/quantizer/jointq/core/error_propagation/__init__.py +9 -0
- onecomp-1.0.0/onecomp/quantizer/jointq/core/error_propagation/local_search_advanced.py +135 -0
- onecomp-1.0.0/onecomp/quantizer/jointq/core/error_propagation/quantize_advanced.py +597 -0
- onecomp-1.0.0/onecomp/quantizer/jointq/core/error_propagation/quantizer_advanced.py +535 -0
- onecomp-1.0.0/onecomp/quantizer/jointq/core/gptq.py +351 -0
- onecomp-1.0.0/onecomp/quantizer/jointq/core/local_search.py +330 -0
- onecomp-1.0.0/onecomp/quantizer/jointq/core/quantize.py +1098 -0
- onecomp-1.0.0/onecomp/quantizer/jointq/core/quantize_multi_gpu.py +310 -0
- onecomp-1.0.0/onecomp/quantizer/jointq/core/quantizer.py +880 -0
- onecomp-1.0.0/onecomp/quantizer/jointq/core/solution.py +421 -0
- onecomp-1.0.0/onecomp/quantizer/onebit/__init__.py +9 -0
- onecomp-1.0.0/onecomp/quantizer/onebit/_onebit.py +143 -0
- onecomp-1.0.0/onecomp/quantizer/onebit/onebit_impl.py +287 -0
- onecomp-1.0.0/onecomp/quantizer/onebit/onebit_layer.py +326 -0
- onecomp-1.0.0/onecomp/quantizer/qbb/__init__.py +7 -0
- onecomp-1.0.0/onecomp/quantizer/qbb/_qbb.py +175 -0
- onecomp-1.0.0/onecomp/quantizer/qbb/qbb_impl.py +305 -0
- onecomp-1.0.0/onecomp/quantizer/quip/__init__.py +7 -0
- onecomp-1.0.0/onecomp/quantizer/quip/_quip.py +153 -0
- onecomp-1.0.0/onecomp/quantizer/quip/quant_quip.py +156 -0
- onecomp-1.0.0/onecomp/quantizer/quip/quip_impl.py +180 -0
- onecomp-1.0.0/onecomp/quantizer/quip/utils.py +65 -0
- onecomp-1.0.0/onecomp/quantizer/quip/utils_had.py +85 -0
- onecomp-1.0.0/onecomp/quantizer/quip/vector_balance.py +530 -0
- onecomp-1.0.0/onecomp/quantizer/rtn/__init__.py +9 -0
- onecomp-1.0.0/onecomp/quantizer/rtn/_rtn.py +152 -0
- onecomp-1.0.0/onecomp/quantizer/rtn/quantizer.py +139 -0
- onecomp-1.0.0/onecomp/quantizer/rtn/rtn_impl.py +90 -0
- onecomp-1.0.0/onecomp/rotated_model_config.py +107 -0
- onecomp-1.0.0/onecomp/runner.py +1846 -0
- onecomp-1.0.0/onecomp/runner_methods/__init__.py +10 -0
- onecomp-1.0.0/onecomp/runner_methods/chunked_quantization.py +388 -0
- onecomp-1.0.0/onecomp/runner_methods/jointq_error_propagation.py +687 -0
- onecomp-1.0.0/onecomp/runner_methods/multi_gpu_quantization.py +417 -0
- onecomp-1.0.0/onecomp/utils/__init__.py +40 -0
- onecomp-1.0.0/onecomp/utils/accuracy.py +134 -0
- onecomp-1.0.0/onecomp/utils/activation_capture.py +103 -0
- onecomp-1.0.0/onecomp/utils/activation_check.py +74 -0
- onecomp-1.0.0/onecomp/utils/blockwise.py +273 -0
- onecomp-1.0.0/onecomp/utils/calibration.py +569 -0
- onecomp-1.0.0/onecomp/utils/perplexity.py +157 -0
- onecomp-1.0.0/onecomp/utils/quant_config.py +28 -0
- onecomp-1.0.0/onecomp/utils/vram_estimator.py +324 -0
- onecomp-1.0.0/onecomp.egg-info/PKG-INFO +303 -0
- onecomp-1.0.0/onecomp.egg-info/SOURCES.txt +145 -0
- onecomp-1.0.0/onecomp.egg-info/dependency_links.txt +1 -0
- onecomp-1.0.0/onecomp.egg-info/entry_points.txt +6 -0
- onecomp-1.0.0/onecomp.egg-info/requires.txt +49 -0
- onecomp-1.0.0/onecomp.egg-info/top_level.txt +6 -0
- onecomp-1.0.0/pyproject.toml +141 -0
- onecomp-1.0.0/setup.cfg +4 -0
- onecomp-1.0.0/vllm_plugins/__init__.py +1 -0
- onecomp-1.0.0/vllm_plugins/dbf/__init__.py +11 -0
- onecomp-1.0.0/vllm_plugins/dbf/modules/__init__.py +1 -0
- onecomp-1.0.0/vllm_plugins/dbf/modules/gemlite_linear.py +262 -0
- onecomp-1.0.0/vllm_plugins/dbf/modules/naive.py +285 -0
- onecomp-1.0.0/vllm_plugins/dbf/vllm_plugin.py +459 -0
- onecomp-1.0.0/vllm_plugins/gptq/__init__.py +1 -0
- onecomp-1.0.0/vllm_plugins/gptq/vllm_plugin.py +236 -0
- onecomp-1.0.0/vllm_plugins/utils/__init__.py +1 -0
- onecomp-1.0.0/vllm_plugins/utils/module.py +87 -0
onecomp-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright 2025-2026 Fujitsu Ltd.
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
onecomp-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,303 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: onecomp
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Python package for LLM compression
|
|
5
|
+
Author: Keiji Kimura
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright 2025-2026 Fujitsu Ltd.
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Project-URL: Homepage, https://github.com/FujitsuResearch/OneCompression
|
|
29
|
+
Project-URL: Documentation, https://FujitsuResearch.github.io/OneCompression/
|
|
30
|
+
Project-URL: Repository, https://github.com/FujitsuResearch/OneCompression
|
|
31
|
+
Project-URL: Bug Tracker, https://github.com/FujitsuResearch/OneCompression/issues
|
|
32
|
+
Project-URL: Changelog, https://github.com/FujitsuResearch/OneCompression/blob/main/CHANGELOG.md
|
|
33
|
+
Keywords: llm,quantization,compression,post-training-quantization,gptq
|
|
34
|
+
Classifier: Development Status :: 4 - Beta
|
|
35
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
36
|
+
Classifier: Programming Language :: Python :: 3
|
|
37
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
38
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
39
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
40
|
+
Classifier: Intended Audience :: Science/Research
|
|
41
|
+
Requires-Python: <3.14,>=3.12
|
|
42
|
+
Description-Content-Type: text/markdown
|
|
43
|
+
License-File: LICENSE
|
|
44
|
+
Requires-Dist: transformers>=5.3.0
|
|
45
|
+
Requires-Dist: accelerate
|
|
46
|
+
Requires-Dist: datasets
|
|
47
|
+
Requires-Dist: lm-eval
|
|
48
|
+
Requires-Dist: primefac
|
|
49
|
+
Requires-Dist: scipy
|
|
50
|
+
Requires-Dist: gemlite>=0.5.1
|
|
51
|
+
Requires-Dist: hqq>=0.2.8
|
|
52
|
+
Requires-Dist: safetensors
|
|
53
|
+
Requires-Dist: ortools>=9.15.6755
|
|
54
|
+
Provides-Extra: cpu
|
|
55
|
+
Requires-Dist: torch; extra == "cpu"
|
|
56
|
+
Requires-Dist: torchvision; extra == "cpu"
|
|
57
|
+
Provides-Extra: cu118
|
|
58
|
+
Requires-Dist: torch; extra == "cu118"
|
|
59
|
+
Requires-Dist: torchvision; extra == "cu118"
|
|
60
|
+
Provides-Extra: cu121
|
|
61
|
+
Requires-Dist: torch; extra == "cu121"
|
|
62
|
+
Requires-Dist: torchvision; extra == "cu121"
|
|
63
|
+
Provides-Extra: cu124
|
|
64
|
+
Requires-Dist: torch; extra == "cu124"
|
|
65
|
+
Requires-Dist: torchvision; extra == "cu124"
|
|
66
|
+
Provides-Extra: cu126
|
|
67
|
+
Requires-Dist: torch; extra == "cu126"
|
|
68
|
+
Requires-Dist: torchvision; extra == "cu126"
|
|
69
|
+
Provides-Extra: cu128
|
|
70
|
+
Requires-Dist: torch; extra == "cu128"
|
|
71
|
+
Requires-Dist: torchvision; extra == "cu128"
|
|
72
|
+
Provides-Extra: dev
|
|
73
|
+
Requires-Dist: black; extra == "dev"
|
|
74
|
+
Requires-Dist: matplotlib>=3.10.8; extra == "dev"
|
|
75
|
+
Requires-Dist: pylint; extra == "dev"
|
|
76
|
+
Requires-Dist: pytest; extra == "dev"
|
|
77
|
+
Provides-Extra: vllm
|
|
78
|
+
Requires-Dist: vllm; extra == "vllm"
|
|
79
|
+
Provides-Extra: docs
|
|
80
|
+
Requires-Dist: mkdocs-material; extra == "docs"
|
|
81
|
+
Requires-Dist: mkdocstrings[python]; extra == "docs"
|
|
82
|
+
Requires-Dist: mkdocs-gen-files; extra == "docs"
|
|
83
|
+
Requires-Dist: mkdocs-literate-nav; extra == "docs"
|
|
84
|
+
Dynamic: license-file
|
|
85
|
+
|
|
86
|
+
# Fujitsu One Compression
|
|
87
|
+
|
|
88
|
+
Fujitsu One Compression (OneComp) is a Python package for LLM compression.
|
|
89
|
+
|
|
90
|
+
## 📖 Documentation
|
|
91
|
+
|
|
92
|
+
Full documentation is available at **[https://FujitsuResearch.github.io/OneCompression/](https://FujitsuResearch.github.io/OneCompression/)**.
|
|
93
|
+
|
|
94
|
+
## 📦 Features
|
|
95
|
+
|
|
96
|
+
- **Quantization Error Propagation (QEP)**: A post-training quantization method that corrects quantization errors by propagating them to subsequent layers, improving the accuracy of quantized LLMs. See [Arai & Ichikawa, NeurIPS 2025](https://openreview.net/forum?id=a3l3K9khbL) for details. The original reference implementation is available at [FujitsuResearch/qep](https://github.com/FujitsuResearch/qep).
|
|
97
|
+
- **vLLM Plugin Integration**: Serve OneComp-quantized models with [vLLM](https://docs.vllm.ai/) via built-in plugins for DBF and Mixed-GPTQ quantization methods.
|
|
98
|
+
- **AutoBit**: Mixed-precision quantization with ILP-based bitwidth assignment. Automatically estimates the target bitwidth from available VRAM and assigns per-layer bitwidths to minimize quantization error under the memory budget.
|
|
99
|
+
- **JointQ**: Joint quantization method that optimizes weight assignments and scale parameters simultaneously for improved quantization accuracy. Supports group-wise quantization (e.g., 4-bit, groupsize=128).
|
|
100
|
+
- **LoRA SFT Post-Process**: Fine-tune quantized models with LoRA adapters for accuracy recovery or domain-specific knowledge injection. Supports SFT loss, teacher distillation, and intermediate block alignment.
|
|
101
|
+
- **Rotation Preprocessing**: SpinQuant/OstQuant-based rotation preprocessing that reduces quantization error by learning optimal rotation matrices before quantization. Rotation/scaling matrices are absorbed into model weights, with online Hadamard hooks automatically registered at load time. Supports Llama and Qwen3 architectures.
|
|
102
|
+
- (TBD)
|
|
103
|
+
|
|
104
|
+
## 🤖 Supported Models
|
|
105
|
+
|
|
106
|
+
OneComp has been verified with the following model architectures.
|
|
107
|
+
Other Hugging Face-compatible models may work but are currently untested.
|
|
108
|
+
|
|
109
|
+
| # | Architecture | Verified Models | Status |
|
|
110
|
+
|---|-------------|-----------------|--------|
|
|
111
|
+
| 1 | Llama | TinyLlama, Llama-2, Llama-3 | ✅ Verified |
|
|
112
|
+
| 2 | Qwen3 | Qwen3-0.6B ~ 32B | ✅ Verified |
|
|
113
|
+
|
|
114
|
+
> **Note:** Support for additional architectures is planned. Contributions and test reports are welcome.
|
|
115
|
+
|
|
116
|
+
## 🔧 Installation
|
|
117
|
+
|
|
118
|
+
### for users (pip)
|
|
119
|
+
|
|
120
|
+
#### 1. Install PyTorch
|
|
121
|
+
|
|
122
|
+
Please install the appropriate version of PyTorch.
|
|
123
|
+
|
|
124
|
+
#### ✅ CPU-only
|
|
125
|
+
```bash
|
|
126
|
+
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
#### ✅ CUDA-enabled
|
|
130
|
+
|
|
131
|
+
Choose the appropriate CUDA version for your system:
|
|
132
|
+
|
|
133
|
+
| CUDA Version | Installation Command |
|
|
134
|
+
|--------------|------------------------|
|
|
135
|
+
| CUDA 11.8 | `pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118` |
|
|
136
|
+
| CUDA 12.1 | `pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121` |
|
|
137
|
+
| CUDA 12.4 | `pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124` |
|
|
138
|
+
| CUDA 12.6 | `pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126` |
|
|
139
|
+
| CUDA 12.8 | `pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128` |
|
|
140
|
+
|
|
141
|
+
Check your CUDA version:
|
|
142
|
+
```bash
|
|
143
|
+
nvcc --version
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
or
|
|
147
|
+
```bash
|
|
148
|
+
nvidia-smi
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
Verify PyTorch GPU support:
|
|
152
|
+
```python
|
|
153
|
+
import torch
|
|
154
|
+
print(torch.cuda.is_available())
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
#### 2. Install `onecomp`
|
|
158
|
+
|
|
159
|
+
Once PyTorch is installed, you can install `onecomp`:
|
|
160
|
+
|
|
161
|
+
```bash
|
|
162
|
+
pip install onecomp
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
### for developers (uv: recommended)
|
|
166
|
+
|
|
167
|
+
#### Install `uv`
|
|
168
|
+
|
|
169
|
+
[`uv`](https://docs.astral.sh/uv/getting-started/installation/) is a fast Python package and project manager written in Rust.
|
|
170
|
+
It offers a drop-in replacement for pip and pip-tools while also managing virtual environments and Python installations.
|
|
171
|
+
With its Rust-based dependency resolver and the `uv.lock` lockfile, uv provides deterministic and reproducible environments across development machines and CI pipelines.
|
|
172
|
+
|
|
173
|
+
```bash
|
|
174
|
+
# install uv (for macOS or Linux)
|
|
175
|
+
curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
176
|
+
|
|
177
|
+
git clone <git repository URL>
|
|
178
|
+
cd OneCompression
|
|
179
|
+
uv sync --extra cu128 --extra dev
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
The `uv sync` command creates a Python virtual environment and installs all dependent libraries.
|
|
183
|
+
|
|
184
|
+
The `--extra cu128` option installs the CUDA-enabled version of PyTorch (along with `torchvision` from the same CUDA index).
|
|
185
|
+
Replace `cu128` with the appropriate variant for your environment: `cpu`, `cu118`, `cu121`, `cu124`, `cu126`, or `cu128`.
|
|
186
|
+
PyTorch will be automatically downloaded by `uv`, so you do not need to install it beforehand.
|
|
187
|
+
|
|
188
|
+
Adding `--extra dev` installs additional packages for development.
|
|
189
|
+
|
|
190
|
+
To use vLLM for serving quantized models, add `--extra vllm`:
|
|
191
|
+
|
|
192
|
+
```bash
|
|
193
|
+
uv sync --extra cu128 --extra dev --extra vllm
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
> **Note:** `--extra vllm` may take a long time on the first run if a pre-built `xformers` wheel is not available for your Python/CUDA combination (e.g. Python 3.13). Using Python 3.12 typically avoids this.
|
|
197
|
+
|
|
198
|
+
#### Running commands (uv environment)
|
|
199
|
+
|
|
200
|
+
In the environment created by `uv sync`, you can run commands in two ways:
|
|
201
|
+
|
|
202
|
+
##### Option 1: Use `uv run` (no activation needed)
|
|
203
|
+
|
|
204
|
+
```bash
|
|
205
|
+
uv run pytest tests/ -v
|
|
206
|
+
uv run python example/example1.py
|
|
207
|
+
uv run black --check onecomp/
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
##### Option 2: Activate the virtual environment (traditional approach)
|
|
211
|
+
|
|
212
|
+
```bash
|
|
213
|
+
source .venv/bin/activate
|
|
214
|
+
pytest tests/ -v
|
|
215
|
+
python example/example1.py
|
|
216
|
+
black --check onecomp/
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
### for developers (pip)
|
|
220
|
+
|
|
221
|
+
```bash
|
|
222
|
+
git clone <git repository URL>
|
|
223
|
+
cd OneCompression
|
|
224
|
+
|
|
225
|
+
# First, install PyTorch with CUDA support for your environment
|
|
226
|
+
pip install torch --index-url https://download.pytorch.org/whl/cu128
|
|
227
|
+
# Then install onecomp with development dependencies
|
|
228
|
+
pip install -e ".[dev]"
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
Replace `cu128` with the appropriate variant for your environment: `cpu`, `cu118`, `cu121`, `cu124`, `cu126`, or `cu128`.
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
### Building Documentation Locally
|
|
235
|
+
|
|
236
|
+
```bash
|
|
237
|
+
uv sync --extra cu128 --extra dev --extra docs
|
|
238
|
+
uv run mkdocs serve
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
Then open [http://127.0.0.1:8000](http://127.0.0.1:8000) in your browser.
|
|
242
|
+
|
|
243
|
+
## 🚀 Examples
|
|
244
|
+
|
|
245
|
+
| Category | Script | Description |
|
|
246
|
+
|----------|--------|-------------|
|
|
247
|
+
| Quantization | [example_gptq.py](./example/example_gptq.py) | GPTQ quantization |
|
|
248
|
+
| | [example_qep_gptq.py](./example/example_qep_gptq.py) | GPTQ + QEP (error propagation) |
|
|
249
|
+
| | [example_jointq.py](./example/example_jointq.py) | JointQ quantization |
|
|
250
|
+
| | [example_autobit.py](./example/example_autobit.py) | AutoBit mixed-precision quantization |
|
|
251
|
+
| | [example_auto_run.py](./example/example_auto_run.py) | AutoBit with automatic VRAM estimation |
|
|
252
|
+
| Save / Load | [example_save_load.py](./example/example_save_load.py) | Save and load quantized models |
|
|
253
|
+
| Rotation Preprocessing | [example_llama_preprocess_rtn.py](./example/pre_process/example_llama_preprocess_rtn.py) | Rotation preprocessing + RTN (TinyLlama) |
|
|
254
|
+
| | [example_preprocess_save_load.py](./example/pre_process/example_preprocess_save_load.py) | Save and load rotation-preprocessed quantized models |
|
|
255
|
+
| Post-Process | [example_lora_sft.py](./example/post_process/example_lora_sft.py) | LoRA SFT post-quantization fine-tuning |
|
|
256
|
+
| | [example_lora_sft_knowledge.py](./example/post_process/example_lora_sft_knowledge.py) | LoRA SFT knowledge injection |
|
|
257
|
+
| vLLM | [example_gptq_vllm_inference.py](./example/vllm_inference/example_gptq_vllm_inference.py) | GPTQ + QEP quantization and vLLM inference |
|
|
258
|
+
| | [example_autobit_vllm_inference.py](./example/vllm_inference/example_autobit_vllm_inference.py) | AutoBit quantization and vLLM inference |
|
|
259
|
+
|
|
260
|
+
## 🔌 vLLM Inference
|
|
261
|
+
|
|
262
|
+
OneComp-quantized models can be served with [vLLM](https://docs.vllm.ai/) via built-in plugins (DBF, Mixed-GPTQ).
|
|
263
|
+
|
|
264
|
+
```bash
|
|
265
|
+
# uv users
|
|
266
|
+
uv sync --extra cu128 --extra vllm
|
|
267
|
+
|
|
268
|
+
# pip users
|
|
269
|
+
pip install vllm
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
See the [vLLM Inference guide](https://FujitsuResearch.github.io/OneCompression/user-guide/vllm-inference/) for details.
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
## 📄 License
|
|
276
|
+
|
|
277
|
+
See [LICENSE](./LICENSE) for more details.
|
|
278
|
+
|
|
279
|
+
## Citation
|
|
280
|
+
|
|
281
|
+
OneComp technical report (coming soon on arXiv):
|
|
282
|
+
|
|
283
|
+
```
|
|
284
|
+
@misc{onecomp2026,
|
|
285
|
+
title={TBD},
|
|
286
|
+
author={TBD},
|
|
287
|
+
year={2026},
|
|
288
|
+
note={arXiv preprint coming soon}
|
|
289
|
+
}
|
|
290
|
+
```
|
|
291
|
+
|
|
292
|
+
QEP (Quantization Error Propagation):
|
|
293
|
+
|
|
294
|
+
```
|
|
295
|
+
@inproceedings{
|
|
296
|
+
arai2025quantization,
|
|
297
|
+
title={Quantization Error Propagation: Revisiting Layer-Wise Post-Training Quantization},
|
|
298
|
+
author={Yamato Arai and Yuma Ichikawa},
|
|
299
|
+
booktitle={The Thirty-ninth Annual Conference on Neural Information Processing Systems},
|
|
300
|
+
year={2025},
|
|
301
|
+
url={https://openreview.net/forum?id=a3l3K9khbL}
|
|
302
|
+
}
|
|
303
|
+
```
|
onecomp-1.0.0/README.md
ADDED
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
# Fujitsu One Compression
|
|
2
|
+
|
|
3
|
+
Fujitsu One Compression (OneComp) is a Python package for LLM compression.
|
|
4
|
+
|
|
5
|
+
## 📖 Documentation
|
|
6
|
+
|
|
7
|
+
Full documentation is available at **[https://FujitsuResearch.github.io/OneCompression/](https://FujitsuResearch.github.io/OneCompression/)**.
|
|
8
|
+
|
|
9
|
+
## 📦 Features
|
|
10
|
+
|
|
11
|
+
- **Quantization Error Propagation (QEP)**: A post-training quantization method that corrects quantization errors by propagating them to subsequent layers, improving the accuracy of quantized LLMs. See [Arai & Ichikawa, NeurIPS 2025](https://openreview.net/forum?id=a3l3K9khbL) for details. The original reference implementation is available at [FujitsuResearch/qep](https://github.com/FujitsuResearch/qep).
|
|
12
|
+
- **vLLM Plugin Integration**: Serve OneComp-quantized models with [vLLM](https://docs.vllm.ai/) via built-in plugins for DBF and Mixed-GPTQ quantization methods.
|
|
13
|
+
- **AutoBit**: Mixed-precision quantization with ILP-based bitwidth assignment. Automatically estimates the target bitwidth from available VRAM and assigns per-layer bitwidths to minimize quantization error under the memory budget.
|
|
14
|
+
- **JointQ**: Joint quantization method that optimizes weight assignments and scale parameters simultaneously for improved quantization accuracy. Supports group-wise quantization (e.g., 4-bit, groupsize=128).
|
|
15
|
+
- **LoRA SFT Post-Process**: Fine-tune quantized models with LoRA adapters for accuracy recovery or domain-specific knowledge injection. Supports SFT loss, teacher distillation, and intermediate block alignment.
|
|
16
|
+
- **Rotation Preprocessing**: SpinQuant/OstQuant-based rotation preprocessing that reduces quantization error by learning optimal rotation matrices before quantization. Rotation/scaling matrices are absorbed into model weights, with online Hadamard hooks automatically registered at load time. Supports Llama and Qwen3 architectures.
|
|
17
|
+
- (TBD)
|
|
18
|
+
|
|
19
|
+
## 🤖 Supported Models
|
|
20
|
+
|
|
21
|
+
OneComp has been verified with the following model architectures.
|
|
22
|
+
Other Hugging Face-compatible models may work but are currently untested.
|
|
23
|
+
|
|
24
|
+
| # | Architecture | Verified Models | Status |
|
|
25
|
+
|---|-------------|-----------------|--------|
|
|
26
|
+
| 1 | Llama | TinyLlama, Llama-2, Llama-3 | ✅ Verified |
|
|
27
|
+
| 2 | Qwen3 | Qwen3-0.6B ~ 32B | ✅ Verified |
|
|
28
|
+
|
|
29
|
+
> **Note:** Support for additional architectures is planned. Contributions and test reports are welcome.
|
|
30
|
+
|
|
31
|
+
## 🔧 Installation
|
|
32
|
+
|
|
33
|
+
### for users (pip)
|
|
34
|
+
|
|
35
|
+
#### 1. Install PyTorch
|
|
36
|
+
|
|
37
|
+
Please install the appropriate version of PyTorch.
|
|
38
|
+
|
|
39
|
+
#### ✅ CPU-only
|
|
40
|
+
```bash
|
|
41
|
+
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
#### ✅ CUDA-enabled
|
|
45
|
+
|
|
46
|
+
Choose the appropriate CUDA version for your system:
|
|
47
|
+
|
|
48
|
+
| CUDA Version | Installation Command |
|
|
49
|
+
|--------------|------------------------|
|
|
50
|
+
| CUDA 11.8 | `pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118` |
|
|
51
|
+
| CUDA 12.1 | `pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121` |
|
|
52
|
+
| CUDA 12.4 | `pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124` |
|
|
53
|
+
| CUDA 12.6 | `pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126` |
|
|
54
|
+
| CUDA 12.8 | `pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128` |
|
|
55
|
+
|
|
56
|
+
Check your CUDA version:
|
|
57
|
+
```bash
|
|
58
|
+
nvcc --version
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
or
|
|
62
|
+
```bash
|
|
63
|
+
nvidia-smi
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
Verify PyTorch GPU support:
|
|
67
|
+
```python
|
|
68
|
+
import torch
|
|
69
|
+
print(torch.cuda.is_available())
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
#### 2. Install `onecomp`
|
|
73
|
+
|
|
74
|
+
Once PyTorch is installed, you can install `onecomp`:
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
pip install onecomp
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### For developers (uv: recommended)
|
|
81
|
+
|
|
82
|
+
#### Install `uv`
|
|
83
|
+
|
|
84
|
+
[`uv`](https://docs.astral.sh/uv/getting-started/installation/) is a fast Python package and project manager written in Rust.
|
|
85
|
+
It offers a drop-in replacement for pip and pip-tools while also managing virtual environments and Python installations.
|
|
86
|
+
With its Rust-based dependency resolver and the `uv.lock` lockfile, uv provides deterministic and reproducible environments across development machines and CI pipelines.
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
# install uv (for macOS or Linux)
|
|
90
|
+
curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
91
|
+
|
|
92
|
+
git clone <git repository URL>
|
|
93
|
+
cd OneCompression
|
|
94
|
+
uv sync --extra cu128 --extra dev
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
The `uv sync` command creates a Python virtual environment and installs all dependent libraries.
|
|
98
|
+
|
|
99
|
+
The `--extra cu128` option installs the CUDA-enabled version of PyTorch (along with `torchvision` from the same CUDA index).
|
|
100
|
+
Replace `cu128` with the appropriate variant for your environment: `cpu`, `cu118`, `cu121`, `cu124`, `cu126`, or `cu128`.
|
|
101
|
+
PyTorch will be automatically downloaded by `uv`, so you do not need to install it beforehand.
|
|
102
|
+
|
|
103
|
+
Adding `--extra dev` installs additional packages for development.
|
|
104
|
+
|
|
105
|
+
To use vLLM for serving quantized models, add `--extra vllm`:
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
uv sync --extra cu128 --extra dev --extra vllm
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
> **Note:** `--extra vllm` may take a long time on the first run if a pre-built `xformers` wheel is not available for your Python/CUDA combination (e.g. Python 3.13). Using Python 3.12 typically avoids this.
|
|
112
|
+
|
|
113
|
+
#### Running commands (uv environment)
|
|
114
|
+
|
|
115
|
+
In the environment created by `uv sync`, you can run commands in two ways:
|
|
116
|
+
|
|
117
|
+
##### Option 1: Use `uv run` (no activation needed)
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
uv run pytest tests/ -v
|
|
121
|
+
uv run python example/example1.py
|
|
122
|
+
uv run black --check onecomp/
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
##### Option 2: Activate the virtual environment (traditional approach)
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
source .venv/bin/activate
|
|
129
|
+
pytest tests/ -v
|
|
130
|
+
python example/example1.py
|
|
131
|
+
black --check onecomp/
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
### For developers (pip)
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
git clone <git repository URL>
|
|
138
|
+
cd OneCompression
|
|
139
|
+
|
|
140
|
+
# First, install PyTorch with CUDA support for your environment
|
|
141
|
+
pip install torch --index-url https://download.pytorch.org/whl/cu128
|
|
142
|
+
# Then install onecomp with development dependencies
|
|
143
|
+
pip install -e ".[dev]"
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
Replace `cu128` with the appropriate variant for your environment: `cpu`, `cu118`, `cu121`, `cu124`, `cu126`, or `cu128`.
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
### Building Documentation Locally
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
uv sync --extra cu128 --extra dev --extra docs
|
|
153
|
+
uv run mkdocs serve
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
Then open [http://127.0.0.1:8000](http://127.0.0.1:8000) in your browser.
|
|
157
|
+
|
|
158
|
+
## 🚀 Examples
|
|
159
|
+
|
|
160
|
+
| Category | Script | Description |
|
|
161
|
+
|----------|--------|-------------|
|
|
162
|
+
| Quantization | [example_gptq.py](./example/example_gptq.py) | GPTQ quantization |
|
|
163
|
+
| | [example_qep_gptq.py](./example/example_qep_gptq.py) | GPTQ + QEP (error propagation) |
|
|
164
|
+
| | [example_jointq.py](./example/example_jointq.py) | JointQ quantization |
|
|
165
|
+
| | [example_autobit.py](./example/example_autobit.py) | AutoBit mixed-precision quantization |
|
|
166
|
+
| | [example_auto_run.py](./example/example_auto_run.py) | AutoBit with automatic VRAM estimation |
|
|
167
|
+
| Save / Load | [example_save_load.py](./example/example_save_load.py) | Save and load quantized models |
|
|
168
|
+
| Rotation Preprocessing | [example_llama_preprocess_rtn.py](./example/pre_process/example_llama_preprocess_rtn.py) | Rotation preprocessing + RTN (TinyLlama) |
|
|
169
|
+
| | [example_preprocess_save_load.py](./example/pre_process/example_preprocess_save_load.py) | Save and load rotation-preprocessed quantized models |
|
|
170
|
+
| Post-Process | [example_lora_sft.py](./example/post_process/example_lora_sft.py) | LoRA SFT post-quantization fine-tuning |
|
|
171
|
+
| | [example_lora_sft_knowledge.py](./example/post_process/example_lora_sft_knowledge.py) | LoRA SFT knowledge injection |
|
|
172
|
+
| vLLM | [example_gptq_vllm_inference.py](./example/vllm_inference/example_gptq_vllm_inference.py) | GPTQ + QEP quantization and vLLM inference |
|
|
173
|
+
| | [example_autobit_vllm_inference.py](./example/vllm_inference/example_autobit_vllm_inference.py) | AutoBit quantization and vLLM inference |
|
|
174
|
+
|
|
175
|
+
## 🔌 vLLM Inference
|
|
176
|
+
|
|
177
|
+
OneComp-quantized models can be served with [vLLM](https://docs.vllm.ai/) via built-in plugins (DBF, Mixed-GPTQ).
|
|
178
|
+
|
|
179
|
+
```bash
|
|
180
|
+
# uv users
|
|
181
|
+
uv sync --extra cu128 --extra vllm
|
|
182
|
+
|
|
183
|
+
# pip users
|
|
184
|
+
pip install vllm
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
See the [vLLM Inference guide](https://FujitsuResearch.github.io/OneCompression/user-guide/vllm-inference/) for details.
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
## 📄 License
|
|
191
|
+
|
|
192
|
+
See [LICENSE](./LICENSE) for more details.
|
|
193
|
+
|
|
194
|
+
## Citation
|
|
195
|
+
|
|
196
|
+
OneComp technical report (coming soon on ArXiv):
|
|
197
|
+
|
|
198
|
+
```
|
|
199
|
+
@misc{onecomp2026,
|
|
200
|
+
title={TBD},
|
|
201
|
+
author={TBD},
|
|
202
|
+
year={2026},
|
|
203
|
+
note={arXiv preprint coming soon}
|
|
204
|
+
}
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
QEP (Quantization Error Propagation):
|
|
208
|
+
|
|
209
|
+
```
|
|
210
|
+
@inproceedings{
|
|
211
|
+
arai2025quantization,
|
|
212
|
+
title={Quantization Error Propagation: Revisiting Layer-Wise Post-Training Quantization},
|
|
213
|
+
author={Yamato Arai and Yuma Ichikawa},
|
|
214
|
+
booktitle={The Thirty-ninth Annual Conference on Neural Information Processing Systems},
|
|
215
|
+
year={2025},
|
|
216
|
+
url={https://openreview.net/forum?id=a3l3K9khbL}
|
|
217
|
+
}
|
|
218
|
+
```
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""GPTQ Benchmark
|
|
2
|
+
|
|
3
|
+
Run GPTQ for all combinations of bits × group_size in a single pass.
|
|
4
|
+
Shares calibration data accumulation across quantizers for efficiency.
|
|
5
|
+
Results are saved under output_dir.
|
|
6
|
+
|
|
7
|
+
Copyright 2025-2026 Fujitsu Ltd.
|
|
8
|
+
|
|
9
|
+
Usage:
|
|
10
|
+
python quant_benchmark.py
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import itertools
|
|
14
|
+
|
|
15
|
+
import hydra
|
|
16
|
+
from omegaconf import DictConfig, OmegaConf
|
|
17
|
+
|
|
18
|
+
from onecomp import GPTQ, ModelConfig, Runner
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def create_quantizers(cfg: DictConfig):
|
|
22
|
+
"""Create a list of GPTQ quantizers for all combinations of bits × group_size."""
|
|
23
|
+
quantizers = []
|
|
24
|
+
sym = cfg.gptq.symmetric
|
|
25
|
+
sym_label = "sym" if sym else "asym"
|
|
26
|
+
|
|
27
|
+
for bits, gs in itertools.product(cfg.gptq.bits, cfg.gptq.group_size):
|
|
28
|
+
# Label strings
|
|
29
|
+
gs_label = "pc" if gs is None else f"gs{gs}"
|
|
30
|
+
|
|
31
|
+
# GPTQ: groupsize=-1 means per-channel
|
|
32
|
+
gptq_groupsize = -1 if gs is None else gs
|
|
33
|
+
quantizers.append(
|
|
34
|
+
GPTQ(
|
|
35
|
+
num_layers=cfg.gptq.num_layers,
|
|
36
|
+
wbits=bits,
|
|
37
|
+
sym=sym,
|
|
38
|
+
groupsize=gptq_groupsize,
|
|
39
|
+
blocksize=cfg.gptq.blocksize,
|
|
40
|
+
percdamp=cfg.gptq.percdamp,
|
|
41
|
+
actorder=cfg.gptq.actorder,
|
|
42
|
+
mse=cfg.gptq.mse,
|
|
43
|
+
q_grid=cfg.gptq.q_grid,
|
|
44
|
+
q_norm=cfg.gptq.q_norm,
|
|
45
|
+
calc_quant_error=True,
|
|
46
|
+
name=f"GPTQ_{bits}bit_{gs_label}_{sym_label}",
|
|
47
|
+
)
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
return quantizers
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@hydra.main(version_base=None, config_path="conf", config_name="benchmark_llama3-8b")
def main(cfg: DictConfig):
    """Run the GPTQ benchmark over every configured bits × group_size combo.

    Hydra loads ``conf/benchmark_llama3-8b.yaml`` (plus CLI overrides) and
    passes the merged config in as ``cfg``. Quantization statistics are saved
    per quantizer; perplexity/accuracy evaluation runs only when enabled in
    the config.
    """
    # Echo the resolved configuration so each run's log is self-describing.
    print(OmegaConf.to_yaml(cfg))

    model_config = ModelConfig(path=cfg.model_path, device=cfg.model_device)

    quantizers = create_quantizers(cfg)

    print(f"Number of quantizers: {len(quantizers)}")
    for q in quantizers:
        print(f"  - {q.name}")

    # Build Runner — a single Runner drives all quantizers so calibration-data
    # accumulation is shared across them (see module docstring).
    runner = Runner(
        model_config=model_config,
        quantizers=quantizers,
        max_length=cfg.max_length,
        num_calibration_samples=cfg.num_calibration_samples,
        calibration_strategy=cfg.calibration_strategy,
        calibration_seed=cfg.calibration_seed,
        calibration_batch_size=cfg.calibration_batch_size,
    )

    # Run quantization
    runner.run()

    # Save results: one statistics JSON per quantizer, keyed by its name.
    for q in quantizers:
        runner.save_quantization_statistics(
            f"quantization_statistics_{q.name}.json", quantizer=q
        )

    # Perplexity evaluation (optionally also on the unquantized model).
    if cfg.calc_ppl:
        runner.benchmark_perplexity(original_model=cfg.calc_original_ppl)

    # Accuracy evaluation (optionally also on the unquantized model).
    if cfg.calc_acc:
        runner.benchmark_accuracy(original_model=cfg.calc_original_acc)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
if __name__ == "__main__":
    # Hydra entry point: parses CLI overrides, composes the config, calls main.
    main()
|