onecomp 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. onecomp-1.0.0/LICENSE +21 -0
  2. onecomp-1.0.0/PKG-INFO +303 -0
  3. onecomp-1.0.0/README.md +218 -0
  4. onecomp-1.0.0/benchmark/llama3-8b-gptq/quant_benchmark.py +95 -0
  5. onecomp-1.0.0/benchmark/llama3-8b-jointq/quant_benchmark.py +106 -0
  6. onecomp-1.0.0/benchmark/llama3-8b-qep-gptq/quant_benchmark.py +101 -0
  7. onecomp-1.0.0/benchmark/llama3-8b-various/quant_benchmark.py +70 -0
  8. onecomp-1.0.0/benchmark/qwen3-14b-gptq/quant_benchmark.py +88 -0
  9. onecomp-1.0.0/benchmark/qwen3-14b-jointq/quant_benchmark.py +106 -0
  10. onecomp-1.0.0/benchmark/qwen3-8b-gptq/quant_benchmark.py +88 -0
  11. onecomp-1.0.0/benchmark/qwen3-8b-jointq/quant_benchmark.py +106 -0
  12. onecomp-1.0.0/example/example_auto_run.py +23 -0
  13. onecomp-1.0.0/example/example_autobit.py +47 -0
  14. onecomp-1.0.0/example/example_gptq.py +45 -0
  15. onecomp-1.0.0/example/example_jointq.py +45 -0
  16. onecomp-1.0.0/example/example_qep_gptq.py +45 -0
  17. onecomp-1.0.0/example/example_save_load.py +60 -0
  18. onecomp-1.0.0/example/post_process/example_lora_sft.py +143 -0
  19. onecomp-1.0.0/example/post_process/example_lora_sft_knowledge.py +144 -0
  20. onecomp-1.0.0/example/pre_process/example_llama_preprocess_rtn.py +51 -0
  21. onecomp-1.0.0/example/pre_process/example_preprocess_save_load.py +122 -0
  22. onecomp-1.0.0/example/vllm_inference/example_autobit_vllm_inference.py +63 -0
  23. onecomp-1.0.0/example/vllm_inference/example_gptq_vllm_inference.py +74 -0
  24. onecomp-1.0.0/onecomp/__init__.py +23 -0
  25. onecomp-1.0.0/onecomp/__main__.py +8 -0
  26. onecomp-1.0.0/onecomp/__version__.py +9 -0
  27. onecomp-1.0.0/onecomp/analyzer/__init__.py +23 -0
  28. onecomp-1.0.0/onecomp/analyzer/cumulative_error.py +237 -0
  29. onecomp-1.0.0/onecomp/analyzer/weight_outlier.py +746 -0
  30. onecomp-1.0.0/onecomp/cli.py +89 -0
  31. onecomp-1.0.0/onecomp/log.py +32 -0
  32. onecomp-1.0.0/onecomp/model_config.py +105 -0
  33. onecomp-1.0.0/onecomp/post_process/__init__.py +24 -0
  34. onecomp-1.0.0/onecomp/post_process/_base.py +78 -0
  35. onecomp-1.0.0/onecomp/post_process/blockwise_ptq.py +76 -0
  36. onecomp-1.0.0/onecomp/post_process/post_process_lora_sft.py +1260 -0
  37. onecomp-1.0.0/onecomp/pre_process/__init__.py +11 -0
  38. onecomp-1.0.0/onecomp/pre_process/hadamard_utils.py +120444 -0
  39. onecomp-1.0.0/onecomp/pre_process/modeling_llama.py +94 -0
  40. onecomp-1.0.0/onecomp/pre_process/modeling_qwen3.py +93 -0
  41. onecomp-1.0.0/onecomp/pre_process/optimizer.py +226 -0
  42. onecomp-1.0.0/onecomp/pre_process/prepare_rotated_model.py +251 -0
  43. onecomp-1.0.0/onecomp/pre_process/preprocess_args.py +76 -0
  44. onecomp-1.0.0/onecomp/pre_process/quant_models.py +950 -0
  45. onecomp-1.0.0/onecomp/pre_process/rotation_utils.py +636 -0
  46. onecomp-1.0.0/onecomp/pre_process/train_rotation.py +575 -0
  47. onecomp-1.0.0/onecomp/qep/__init__.py +11 -0
  48. onecomp-1.0.0/onecomp/qep/_qep_config.py +56 -0
  49. onecomp-1.0.0/onecomp/qep/_quantize_with_qep.py +133 -0
  50. onecomp-1.0.0/onecomp/qep/_quantize_with_qep_arch.py +360 -0
  51. onecomp-1.0.0/onecomp/quantized_model_loader.py +374 -0
  52. onecomp-1.0.0/onecomp/quantizer/__init__.py +20 -0
  53. onecomp-1.0.0/onecomp/quantizer/_quantizer.py +915 -0
  54. onecomp-1.0.0/onecomp/quantizer/arb/__init__.py +7 -0
  55. onecomp-1.0.0/onecomp/quantizer/arb/_arb.py +140 -0
  56. onecomp-1.0.0/onecomp/quantizer/arb/arb_impl.py +333 -0
  57. onecomp-1.0.0/onecomp/quantizer/autobit/__init__.py +13 -0
  58. onecomp-1.0.0/onecomp/quantizer/autobit/_autobit.py +607 -0
  59. onecomp-1.0.0/onecomp/quantizer/autobit/activation_stats.py +275 -0
  60. onecomp-1.0.0/onecomp/quantizer/autobit/dbf_fallback.py +73 -0
  61. onecomp-1.0.0/onecomp/quantizer/autobit/ilp.py +336 -0
  62. onecomp-1.0.0/onecomp/quantizer/autobit/manual.py +30 -0
  63. onecomp-1.0.0/onecomp/quantizer/autobit/visualize.py +395 -0
  64. onecomp-1.0.0/onecomp/quantizer/cq/__init__.py +7 -0
  65. onecomp-1.0.0/onecomp/quantizer/cq/_cq.py +113 -0
  66. onecomp-1.0.0/onecomp/quantizer/cq/cq_impl.py +211 -0
  67. onecomp-1.0.0/onecomp/quantizer/dbf/__init__.py +14 -0
  68. onecomp-1.0.0/onecomp/quantizer/dbf/_dbf.py +400 -0
  69. onecomp-1.0.0/onecomp/quantizer/dbf/admm_extended.py +815 -0
  70. onecomp-1.0.0/onecomp/quantizer/dbf/balance.py +232 -0
  71. onecomp-1.0.0/onecomp/quantizer/dbf/config.py +63 -0
  72. onecomp-1.0.0/onecomp/quantizer/dbf/dbf_impl.py +190 -0
  73. onecomp-1.0.0/onecomp/quantizer/dbf/dbf_layer.py +263 -0
  74. onecomp-1.0.0/onecomp/quantizer/dbf/dbf_original.py +788 -0
  75. onecomp-1.0.0/onecomp/quantizer/dbf/fine_tune.py +310 -0
  76. onecomp-1.0.0/onecomp/quantizer/dbf/middle.py +1153 -0
  77. onecomp-1.0.0/onecomp/quantizer/gemlite.py +141 -0
  78. onecomp-1.0.0/onecomp/quantizer/gptq/__init__.py +9 -0
  79. onecomp-1.0.0/onecomp/quantizer/gptq/_gptq.py +823 -0
  80. onecomp-1.0.0/onecomp/quantizer/gptq/config.py +101 -0
  81. onecomp-1.0.0/onecomp/quantizer/gptq/gptq_layer.py +509 -0
  82. onecomp-1.0.0/onecomp/quantizer/jointq/__init__.py +9 -0
  83. onecomp-1.0.0/onecomp/quantizer/jointq/_jointq.py +368 -0
  84. onecomp-1.0.0/onecomp/quantizer/jointq/core/__init__.py +14 -0
  85. onecomp-1.0.0/onecomp/quantizer/jointq/core/__version__.py +9 -0
  86. onecomp-1.0.0/onecomp/quantizer/jointq/core/clip.py +212 -0
  87. onecomp-1.0.0/onecomp/quantizer/jointq/core/error_propagation/__init__.py +9 -0
  88. onecomp-1.0.0/onecomp/quantizer/jointq/core/error_propagation/local_search_advanced.py +135 -0
  89. onecomp-1.0.0/onecomp/quantizer/jointq/core/error_propagation/quantize_advanced.py +597 -0
  90. onecomp-1.0.0/onecomp/quantizer/jointq/core/error_propagation/quantizer_advanced.py +535 -0
  91. onecomp-1.0.0/onecomp/quantizer/jointq/core/gptq.py +351 -0
  92. onecomp-1.0.0/onecomp/quantizer/jointq/core/local_search.py +330 -0
  93. onecomp-1.0.0/onecomp/quantizer/jointq/core/quantize.py +1098 -0
  94. onecomp-1.0.0/onecomp/quantizer/jointq/core/quantize_multi_gpu.py +310 -0
  95. onecomp-1.0.0/onecomp/quantizer/jointq/core/quantizer.py +880 -0
  96. onecomp-1.0.0/onecomp/quantizer/jointq/core/solution.py +421 -0
  97. onecomp-1.0.0/onecomp/quantizer/onebit/__init__.py +9 -0
  98. onecomp-1.0.0/onecomp/quantizer/onebit/_onebit.py +143 -0
  99. onecomp-1.0.0/onecomp/quantizer/onebit/onebit_impl.py +287 -0
  100. onecomp-1.0.0/onecomp/quantizer/onebit/onebit_layer.py +326 -0
  101. onecomp-1.0.0/onecomp/quantizer/qbb/__init__.py +7 -0
  102. onecomp-1.0.0/onecomp/quantizer/qbb/_qbb.py +175 -0
  103. onecomp-1.0.0/onecomp/quantizer/qbb/qbb_impl.py +305 -0
  104. onecomp-1.0.0/onecomp/quantizer/quip/__init__.py +7 -0
  105. onecomp-1.0.0/onecomp/quantizer/quip/_quip.py +153 -0
  106. onecomp-1.0.0/onecomp/quantizer/quip/quant_quip.py +156 -0
  107. onecomp-1.0.0/onecomp/quantizer/quip/quip_impl.py +180 -0
  108. onecomp-1.0.0/onecomp/quantizer/quip/utils.py +65 -0
  109. onecomp-1.0.0/onecomp/quantizer/quip/utils_had.py +85 -0
  110. onecomp-1.0.0/onecomp/quantizer/quip/vector_balance.py +530 -0
  111. onecomp-1.0.0/onecomp/quantizer/rtn/__init__.py +9 -0
  112. onecomp-1.0.0/onecomp/quantizer/rtn/_rtn.py +152 -0
  113. onecomp-1.0.0/onecomp/quantizer/rtn/quantizer.py +139 -0
  114. onecomp-1.0.0/onecomp/quantizer/rtn/rtn_impl.py +90 -0
  115. onecomp-1.0.0/onecomp/rotated_model_config.py +107 -0
  116. onecomp-1.0.0/onecomp/runner.py +1846 -0
  117. onecomp-1.0.0/onecomp/runner_methods/__init__.py +10 -0
  118. onecomp-1.0.0/onecomp/runner_methods/chunked_quantization.py +388 -0
  119. onecomp-1.0.0/onecomp/runner_methods/jointq_error_propagation.py +687 -0
  120. onecomp-1.0.0/onecomp/runner_methods/multi_gpu_quantization.py +417 -0
  121. onecomp-1.0.0/onecomp/utils/__init__.py +40 -0
  122. onecomp-1.0.0/onecomp/utils/accuracy.py +134 -0
  123. onecomp-1.0.0/onecomp/utils/activation_capture.py +103 -0
  124. onecomp-1.0.0/onecomp/utils/activation_check.py +74 -0
  125. onecomp-1.0.0/onecomp/utils/blockwise.py +273 -0
  126. onecomp-1.0.0/onecomp/utils/calibration.py +569 -0
  127. onecomp-1.0.0/onecomp/utils/perplexity.py +157 -0
  128. onecomp-1.0.0/onecomp/utils/quant_config.py +28 -0
  129. onecomp-1.0.0/onecomp/utils/vram_estimator.py +324 -0
  130. onecomp-1.0.0/onecomp.egg-info/PKG-INFO +303 -0
  131. onecomp-1.0.0/onecomp.egg-info/SOURCES.txt +145 -0
  132. onecomp-1.0.0/onecomp.egg-info/dependency_links.txt +1 -0
  133. onecomp-1.0.0/onecomp.egg-info/entry_points.txt +6 -0
  134. onecomp-1.0.0/onecomp.egg-info/requires.txt +49 -0
  135. onecomp-1.0.0/onecomp.egg-info/top_level.txt +6 -0
  136. onecomp-1.0.0/pyproject.toml +141 -0
  137. onecomp-1.0.0/setup.cfg +4 -0
  138. onecomp-1.0.0/vllm_plugins/__init__.py +1 -0
  139. onecomp-1.0.0/vllm_plugins/dbf/__init__.py +11 -0
  140. onecomp-1.0.0/vllm_plugins/dbf/modules/__init__.py +1 -0
  141. onecomp-1.0.0/vllm_plugins/dbf/modules/gemlite_linear.py +262 -0
  142. onecomp-1.0.0/vllm_plugins/dbf/modules/naive.py +285 -0
  143. onecomp-1.0.0/vllm_plugins/dbf/vllm_plugin.py +459 -0
  144. onecomp-1.0.0/vllm_plugins/gptq/__init__.py +1 -0
  145. onecomp-1.0.0/vllm_plugins/gptq/vllm_plugin.py +236 -0
  146. onecomp-1.0.0/vllm_plugins/utils/__init__.py +1 -0
  147. onecomp-1.0.0/vllm_plugins/utils/module.py +87 -0
onecomp-1.0.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright 2025-2026 Fujitsu Ltd.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
onecomp-1.0.0/PKG-INFO ADDED
@@ -0,0 +1,303 @@
1
+ Metadata-Version: 2.4
2
+ Name: onecomp
3
+ Version: 1.0.0
4
+ Summary: Python package for LLM compression
5
+ Author: Keiji Kimura
6
+ License: MIT License
7
+
8
+ Copyright 2025-2026 Fujitsu Ltd.
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/FujitsuResearch/OneCompression
29
+ Project-URL: Documentation, https://FujitsuResearch.github.io/OneCompression/
30
+ Project-URL: Repository, https://github.com/FujitsuResearch/OneCompression
31
+ Project-URL: Bug Tracker, https://github.com/FujitsuResearch/OneCompression/issues
32
+ Project-URL: Changelog, https://github.com/FujitsuResearch/OneCompression/blob/main/CHANGELOG.md
33
+ Keywords: llm,quantization,compression,post-training-quantization,gptq
34
+ Classifier: Development Status :: 4 - Beta
35
+ Classifier: License :: OSI Approved :: MIT License
36
+ Classifier: Programming Language :: Python :: 3
37
+ Classifier: Programming Language :: Python :: 3.12
38
+ Classifier: Programming Language :: Python :: 3.13
39
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
40
+ Classifier: Intended Audience :: Science/Research
41
+ Requires-Python: <3.14,>=3.12
42
+ Description-Content-Type: text/markdown
43
+ License-File: LICENSE
44
+ Requires-Dist: transformers>=5.3.0
45
+ Requires-Dist: accelerate
46
+ Requires-Dist: datasets
47
+ Requires-Dist: lm-eval
48
+ Requires-Dist: primefac
49
+ Requires-Dist: scipy
50
+ Requires-Dist: gemlite>=0.5.1
51
+ Requires-Dist: hqq>=0.2.8
52
+ Requires-Dist: safetensors
53
+ Requires-Dist: ortools>=9.15.6755
54
+ Provides-Extra: cpu
55
+ Requires-Dist: torch; extra == "cpu"
56
+ Requires-Dist: torchvision; extra == "cpu"
57
+ Provides-Extra: cu118
58
+ Requires-Dist: torch; extra == "cu118"
59
+ Requires-Dist: torchvision; extra == "cu118"
60
+ Provides-Extra: cu121
61
+ Requires-Dist: torch; extra == "cu121"
62
+ Requires-Dist: torchvision; extra == "cu121"
63
+ Provides-Extra: cu124
64
+ Requires-Dist: torch; extra == "cu124"
65
+ Requires-Dist: torchvision; extra == "cu124"
66
+ Provides-Extra: cu126
67
+ Requires-Dist: torch; extra == "cu126"
68
+ Requires-Dist: torchvision; extra == "cu126"
69
+ Provides-Extra: cu128
70
+ Requires-Dist: torch; extra == "cu128"
71
+ Requires-Dist: torchvision; extra == "cu128"
72
+ Provides-Extra: dev
73
+ Requires-Dist: black; extra == "dev"
74
+ Requires-Dist: matplotlib>=3.10.8; extra == "dev"
75
+ Requires-Dist: pylint; extra == "dev"
76
+ Requires-Dist: pytest; extra == "dev"
77
+ Provides-Extra: vllm
78
+ Requires-Dist: vllm; extra == "vllm"
79
+ Provides-Extra: docs
80
+ Requires-Dist: mkdocs-material; extra == "docs"
81
+ Requires-Dist: mkdocstrings[python]; extra == "docs"
82
+ Requires-Dist: mkdocs-gen-files; extra == "docs"
83
+ Requires-Dist: mkdocs-literate-nav; extra == "docs"
84
+ Dynamic: license-file
85
+
86
+ # Fujitsu One Compression
87
+
88
+ Fujitsu One Compression (OneComp) is a Python package for LLM compression.
89
+
90
+ ## 📖 Documentation
91
+
92
+ Full documentation is available at **[https://FujitsuResearch.github.io/OneCompression/](https://FujitsuResearch.github.io/OneCompression/)**.
93
+
94
+ ## 📦 Features
95
+
96
+ - **Quantization Error Propagation (QEP)**: A post-training quantization method that corrects quantization errors by propagating them to subsequent layers, improving the accuracy of quantized LLMs. See [Arai & Ichikawa, NeurIPS 2025](https://openreview.net/forum?id=a3l3K9khbL) for details. The original reference implementation is available at [FujitsuResearch/qep](https://github.com/FujitsuResearch/qep).
97
+ - **vLLM Plugin Integration**: Serve OneComp-quantized models with [vLLM](https://docs.vllm.ai/) via built-in plugins for DBF and Mixed-GPTQ quantization methods.
98
+ - **AutoBit**: Mixed-precision quantization with ILP-based bitwidth assignment. Automatically estimates the target bitwidth from available VRAM and assigns per-layer bitwidths to minimize quantization error under the memory budget.
99
+ - **JointQ**: Joint quantization method that optimizes weight assignments and scale parameters simultaneously for improved quantization accuracy. Supports group-wise quantization (e.g., 4-bit, groupsize=128).
100
+ - **LoRA SFT Post-Process**: Fine-tune quantized models with LoRA adapters for accuracy recovery or domain-specific knowledge injection. Supports SFT loss, teacher distillation, and intermediate block alignment.
101
+ - **Rotation Preprocessing**: SpinQuant/OstQuant-based rotation preprocessing that reduces quantization error by learning optimal rotation matrices before quantization. Rotation/scaling matrices are absorbed into model weights, with online Hadamard hooks automatically registered at load time. Supports Llama and Qwen3 architectures.
102
+ - (TBD)
103
+
104
+ ## 🤖 Supported Models
105
+
106
+ OneComp has been verified with the following model architectures.
107
+ Other Hugging Face-compatible models may work but are currently untested.
108
+
109
+ | # | Architecture | Verified Models | Status |
110
+ |---|-------------|-----------------|--------|
111
+ | 1 | Llama | TinyLlama, Llama-2, Llama-3 | ✅ Verified |
112
+ | 2 | Qwen3 | Qwen3-0.6B ~ 32B | ✅ Verified |
113
+
114
+ > **Note:** Support for additional architectures is planned. Contributions and test reports are welcome.
115
+
116
+ ## 🔧 Installation
117
+
118
+ ### for users (pip)
119
+
120
+ #### 1. Install PyTorch
121
+
122
+ Please install the appropriate version of PyTorch.
123
+
124
+ #### ✅ CPU-only
125
+ ```bash
126
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
127
+ ```
128
+
129
+ #### ✅ CUDA-enabled
130
+
131
+ Choose the appropriate CUDA version for your system:
132
+
133
+ | CUDA Version | Installation Command |
134
+ |--------------|------------------------|
135
+ | CUDA 11.8 | `pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118` |
136
+ | CUDA 12.1 | `pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121` |
137
+ | CUDA 12.4 | `pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124` |
138
+ | CUDA 12.6 | `pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126` |
139
+ | CUDA 12.8 | `pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128` |
140
+
141
+ Check your CUDA version:
142
+ ```bash
143
+ nvcc --version
144
+ ```
145
+
146
+ or
147
+ ```bash
148
+ nvidia-smi
149
+ ```
150
+
151
+ Verify PyTorch GPU support:
152
+ ```python
153
+ import torch
154
+ print(torch.cuda.is_available())
155
+ ```
156
+
157
+ #### 2. Install `onecomp`
158
+
159
+ Once PyTorch is installed, you can install `onecomp`:
160
+
161
+ ```bash
162
+ pip install onecomp
163
+ ```
164
+
165
+ ### for developers (uv: recommended)
166
+
167
+ #### Install `uv`
168
+
169
+ [`uv`](https://docs.astral.sh/uv/getting-started/installation/) is a fast Python package and project manager written in Rust.
170
+ It offers a drop-in replacement for pip and pip-tools while also managing virtual environments and Python installations.
171
+ With its Rust-based dependency resolver and the `uv.lock` lockfile, uv provides deterministic and reproducible environments across development machines and CI pipelines.
172
+
173
+ ```bash
174
+ # install uv (for macOS or Linux)
175
+ curl -LsSf https://astral.sh/uv/install.sh | sh
176
+
177
+ git clone <git repository URL>
178
+ cd OneCompression
179
+ uv sync --extra cu128 --extra dev
180
+ ```
181
+
182
+ The `uv sync` command creates a Python virtual environment and installs all dependent libraries.
183
+
184
+ The `--extra cu128` option installs the CUDA-enabled version of PyTorch (along with `torchvision` from the same CUDA index).
185
+ Replace `cu128` with the appropriate variant for your environment: `cpu`, `cu118`, `cu121`, `cu124`, `cu126`, or `cu128`.
186
+ PyTorch will be automatically downloaded by `uv`, so you do not need to install it beforehand.
187
+
188
+ Adding `--extra dev` installs additional packages for development.
189
+
190
+ To use vLLM for serving quantized models, add `--extra vllm`:
191
+
192
+ ```bash
193
+ uv sync --extra cu128 --extra dev --extra vllm
194
+ ```
195
+
196
+ > **Note:** `--extra vllm` may take a long time on the first run if a pre-built `xformers` wheel is not available for your Python/CUDA combination (e.g. Python 3.13). Using Python 3.12 typically avoids this.
197
+
198
+ #### Running commands (uv environment)
199
+
200
+ In the environment created by `uv sync`, you can run commands in two ways:
201
+
202
+ ##### Option 1: Use `uv run` (no activation needed)
203
+
204
+ ```bash
205
+ uv run pytest tests/ -v
206
+ uv run python example/example1.py
207
+ uv run black --check onecomp/
208
+ ```
209
+
210
+ ##### Option 2: Activate the virtual environment (traditional approach)
211
+
212
+ ```bash
213
+ source .venv/bin/activate
214
+ pytest tests/ -v
215
+ python example/example1.py
216
+ black --check onecomp/
217
+ ```
218
+
219
+ ### for developers (pip)
220
+
221
+ ```bash
222
+ git clone <git repository URL>
223
+ cd OneCompression
224
+
225
+ # First, install PyTorch with CUDA support for your environment
226
+ pip install torch --index-url https://download.pytorch.org/whl/cu128
227
+ # Then install onecomp with development dependencies
228
+ pip install -e ".[dev]"
229
+ ```
230
+
231
+ Replace `cu128` with the appropriate variant for your environment: `cpu`, `cu118`, `cu121`, `cu124`, `cu126`, or `cu128`.
232
+
233
+
234
+ ### Building Documentation Locally
235
+
236
+ ```bash
237
+ uv sync --extra cu128 --extra dev --extra docs
238
+ uv run mkdocs serve
239
+ ```
240
+
241
+ Then open [http://127.0.0.1:8000](http://127.0.0.1:8000) in your browser.
242
+
243
+ ## 🚀 Examples
244
+
245
+ | Category | Script | Description |
246
+ |----------|--------|-------------|
247
+ | Quantization | [example_gptq.py](./example/example_gptq.py) | GPTQ quantization |
248
+ | | [example_qep_gptq.py](./example/example_qep_gptq.py) | GPTQ + QEP (error propagation) |
249
+ | | [example_jointq.py](./example/example_jointq.py) | JointQ quantization |
250
+ | | [example_autobit.py](./example/example_autobit.py) | AutoBit mixed-precision quantization |
251
+ | | [example_auto_run.py](./example/example_auto_run.py) | AutoBit with automatic VRAM estimation |
252
+ | Save / Load | [example_save_load.py](./example/example_save_load.py) | Save and load quantized models |
253
+ | Rotation Preprocessing | [example_llama_preprocess_rtn.py](./example/pre_process/example_llama_preprocess_rtn.py) | Rotation preprocessing + RTN (TinyLlama) |
254
+ | | [example_preprocess_save_load.py](./example/pre_process/example_preprocess_save_load.py) | Save and load rotation-preprocessed quantized models |
255
+ | Post-Process | [example_lora_sft.py](./example/post_process/example_lora_sft.py) | LoRA SFT post-quantization fine-tuning |
256
+ | | [example_lora_sft_knowledge.py](./example/post_process/example_lora_sft_knowledge.py) | LoRA SFT knowledge injection |
257
+ | vLLM | [example_gptq_vllm_inference.py](./example/vllm_inference/example_gptq_vllm_inference.py) | GPTQ + QEP quantization and vLLM inference |
258
+ | | [example_autobit_vllm_inference.py](./example/vllm_inference/example_autobit_vllm_inference.py) | AutoBit quantization and vLLM inference |
259
+
260
+ ## 🔌 vLLM Inference
261
+
262
+ OneComp-quantized models can be served with [vLLM](https://docs.vllm.ai/) via built-in plugins (DBF, Mixed-GPTQ).
263
+
264
+ ```bash
265
+ # uv users
266
+ uv sync --extra cu128 --extra vllm
267
+
268
+ # pip users
269
+ pip install vllm
270
+ ```
271
+
272
+ See the [vLLM Inference guide](https://FujitsuResearch.github.io/OneCompression/user-guide/vllm-inference/) for details.
273
+
274
+
275
+ ## 📄 License
276
+
277
+ See [LICENSE](./LICENSE) for more details.
278
+
279
+ ## Citation
280
+
281
+ OneComp technical report (coming soon on ArXiv):
282
+
283
+ ```
284
+ @misc{onecomp2026,
285
+ title={TBD},
286
+ author={TBD},
287
+ year={2026},
288
+ note={arXiv preprint coming soon}
289
+ }
290
+ ```
291
+
292
+ QEP (Quantization Error Propagation):
293
+
294
+ ```
295
+ @inproceedings{
296
+ arai2025quantization,
297
+ title={Quantization Error Propagation: Revisiting Layer-Wise Post-Training Quantization},
298
+ author={Yamato Arai and Yuma Ichikawa},
299
+ booktitle={The Thirty-ninth Annual Conference on Neural Information Processing Systems},
300
+ year={2025},
301
+ url={https://openreview.net/forum?id=a3l3K9khbL}
302
+ }
303
+ ```
@@ -0,0 +1,218 @@
1
+ # Fujitsu One Compression
2
+
3
+ Fujitsu One Compression (OneComp) is a Python package for LLM compression.
4
+
5
+ ## 📖 Documentation
6
+
7
+ Full documentation is available at **[https://FujitsuResearch.github.io/OneCompression/](https://FujitsuResearch.github.io/OneCompression/)**.
8
+
9
+ ## 📦 Features
10
+
11
+ - **Quantization Error Propagation (QEP)**: A post-training quantization method that corrects quantization errors by propagating them to subsequent layers, improving the accuracy of quantized LLMs. See [Arai & Ichikawa, NeurIPS 2025](https://openreview.net/forum?id=a3l3K9khbL) for details. The original reference implementation is available at [FujitsuResearch/qep](https://github.com/FujitsuResearch/qep).
12
+ - **vLLM Plugin Integration**: Serve OneComp-quantized models with [vLLM](https://docs.vllm.ai/) via built-in plugins for DBF and Mixed-GPTQ quantization methods.
13
+ - **AutoBit**: Mixed-precision quantization with ILP-based bitwidth assignment. Automatically estimates the target bitwidth from available VRAM and assigns per-layer bitwidths to minimize quantization error under the memory budget.
14
+ - **JointQ**: Joint quantization method that optimizes weight assignments and scale parameters simultaneously for improved quantization accuracy. Supports group-wise quantization (e.g., 4-bit, groupsize=128).
15
+ - **LoRA SFT Post-Process**: Fine-tune quantized models with LoRA adapters for accuracy recovery or domain-specific knowledge injection. Supports SFT loss, teacher distillation, and intermediate block alignment.
16
+ - **Rotation Preprocessing**: SpinQuant/OstQuant-based rotation preprocessing that reduces quantization error by learning optimal rotation matrices before quantization. Rotation/scaling matrices are absorbed into model weights, with online Hadamard hooks automatically registered at load time. Supports Llama and Qwen3 architectures.
17
+ - (TBD)
18
+
19
+ ## 🤖 Supported Models
20
+
21
+ OneComp has been verified with the following model architectures.
22
+ Other Hugging Face-compatible models may work but are currently untested.
23
+
24
+ | # | Architecture | Verified Models | Status |
25
+ |---|-------------|-----------------|--------|
26
+ | 1 | Llama | TinyLlama, Llama-2, Llama-3 | ✅ Verified |
27
+ | 2 | Qwen3 | Qwen3-0.6B ~ 32B | ✅ Verified |
28
+
29
+ > **Note:** Support for additional architectures is planned. Contributions and test reports are welcome.
30
+
31
+ ## 🔧 Installation
32
+
33
+ ### for users (pip)
34
+
35
+ #### 1. Install PyTorch
36
+
37
+ Please install the appropriate version of PyTorch.
38
+
39
+ #### ✅ CPU-only
40
+ ```bash
41
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
42
+ ```
43
+
44
+ #### ✅ CUDA-enabled
45
+
46
+ Choose the appropriate CUDA version for your system:
47
+
48
+ | CUDA Version | Installation Command |
49
+ |--------------|------------------------|
50
+ | CUDA 11.8 | `pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118` |
51
+ | CUDA 12.1 | `pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121` |
52
+ | CUDA 12.4 | `pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124` |
53
+ | CUDA 12.6 | `pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126` |
54
+ | CUDA 12.8 | `pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128` |
55
+
56
+ Check your CUDA version:
57
+ ```bash
58
+ nvcc --version
59
+ ```
60
+
61
+ or
62
+ ```bash
63
+ nvidia-smi
64
+ ```
65
+
66
+ Verify PyTorch GPU support:
67
+ ```python
68
+ import torch
69
+ print(torch.cuda.is_available())
70
+ ```
71
+
72
+ #### 2. Install `onecomp`
73
+
74
+ Once PyTorch is installed, you can install `onecomp`:
75
+
76
+ ```bash
77
+ pip install onecomp
78
+ ```
79
+
80
+ ### for developers (uv : recommended)
81
+
82
+ #### Install `uv`
83
+
84
+ [`uv`](https://docs.astral.sh/uv/getting-started/installation/) is a fast Python package and project manager written in Rust.
85
+ It offers a drop-in replacement for pip and pip-tools while also managing virtual environments and Python installations.
86
+ With its Rust-based dependency resolver and the `uv.lock` lockfile, uv provides deterministic and reproducible environments across development machines and CI pipelines.
87
+
88
+ ```bash
89
+ # install uv (for macOS or Linux)
90
+ curl -LsSf https://astral.sh/uv/install.sh | sh
91
+
92
+ git clone <git repository URL>
93
+ cd OneCompression
94
+ uv sync --extra cu128 --extra dev
95
+ ```
96
+
97
+ The `uv sync` command creates a Python virtual environment and installs all dependent libraries.
98
+
99
+ The `--extra cu128` option installs the CUDA-enabled version of PyTorch (along with `torchvision` from the same CUDA index).
100
+ Replace `cu128` with the appropriate variant for your environment: `cpu`, `cu118`, `cu121`, `cu124`, `cu126`, or `cu128`.
101
+ PyTorch will be automatically downloaded by `uv`, so you do not need to install it beforehand.
102
+
103
+ Adding `--extra dev` installs additional packages for development.
104
+
105
+ To use vLLM for serving quantized models, add `--extra vllm`:
106
+
107
+ ```bash
108
+ uv sync --extra cu128 --extra dev --extra vllm
109
+ ```
110
+
111
+ > **Note:** `--extra vllm` may take a long time on the first run if a pre-built `xformers` wheel is not available for your Python/CUDA combination (e.g. Python 3.13). Using Python 3.12 typically avoids this.
112
+
113
+ #### Running commands (uv environment)
114
+
115
+ In the environment created by `uv sync`, you can run commands in two ways:
116
+
117
+ ##### Option 1: Use `uv run` (no activation needed)
118
+
119
+ ```bash
120
+ uv run pytest tests/ -v
121
+ uv run python example/example1.py
122
+ uv run black --check onecomp/
123
+ ```
124
+
125
+ ##### Option 2: Activate the virtual environment (traditional approach)
126
+
127
+ ```bash
128
+ source .venv/bin/activate
129
+ pytest tests/ -v
130
+ python example/example1.py
131
+ black --check onecomp/
132
+ ```
133
+
134
+ ### for developers (pip)
135
+
136
+ ```bash
137
+ git clone <git repository URL>
138
+ cd OneCompression
139
+
140
+ # First, install PyTorch with CUDA support for your environment
141
+ pip install torch --index-url https://download.pytorch.org/whl/cu128
142
+ # Then install onecomp with development dependencies
143
+ pip install -e ".[dev]"
144
+ ```
145
+
146
+ Replace `cu128` with the appropriate variant for your environment: `cpu`, `cu118`, `cu121`, `cu124`, `cu126`, or `cu128`.
147
+
148
+
149
+ ### Building Documentation Locally
150
+
151
+ ```bash
152
+ uv sync --extra cu128 --extra dev --extra docs
153
+ uv run mkdocs serve
154
+ ```
155
+
156
+ Then open [http://127.0.0.1:8000](http://127.0.0.1:8000) in your browser.
157
+
158
+ ## 🚀 Examples
159
+
160
+ | Category | Script | Description |
161
+ |----------|--------|-------------|
162
+ | Quantization | [example_gptq.py](./example/example_gptq.py) | GPTQ quantization |
163
+ | | [example_qep_gptq.py](./example/example_qep_gptq.py) | GPTQ + QEP (error propagation) |
164
+ | | [example_jointq.py](./example/example_jointq.py) | JointQ quantization |
165
+ | | [example_autobit.py](./example/example_autobit.py) | AutoBit mixed-precision quantization |
166
+ | | [example_auto_run.py](./example/example_auto_run.py) | AutoBit with automatic VRAM estimation |
167
+ | Save / Load | [example_save_load.py](./example/example_save_load.py) | Save and load quantized models |
168
+ | Rotation Preprocessing | [example_llama_preprocess_rtn.py](./example/pre_process/example_llama_preprocess_rtn.py) | Rotation preprocessing + RTN (TinyLlama) |
169
+ | | [example_preprocess_save_load.py](./example/pre_process/example_preprocess_save_load.py) | Save and load rotation-preprocessed quantized models |
170
+ | Post-Process | [example_lora_sft.py](./example/post_process/example_lora_sft.py) | LoRA SFT post-quantization fine-tuning |
171
+ | | [example_lora_sft_knowledge.py](./example/post_process/example_lora_sft_knowledge.py) | LoRA SFT knowledge injection |
172
+ | vLLM | [example_gptq_vllm_inference.py](./example/vllm_inference/example_gptq_vllm_inference.py) | GPTQ + QEP quantization and vLLM inference |
173
+ | | [example_autobit_vllm_inference.py](./example/vllm_inference/example_autobit_vllm_inference.py) | AutoBit quantization and vLLM inference |
174
+
175
+ ## 🔌 vLLM Inference
176
+
177
+ OneComp-quantized models can be served with [vLLM](https://docs.vllm.ai/) via built-in plugins (DBF, Mixed-GPTQ).
178
+
179
+ ```bash
180
+ # uv users
181
+ uv sync --extra cu128 --extra vllm
182
+
183
+ # pip users
184
+ pip install vllm
185
+ ```
186
+
187
+ See the [vLLM Inference guide](https://FujitsuResearch.github.io/OneCompression/user-guide/vllm-inference/) for details.
188
+
189
+
190
+ ## 📄 License
191
+
192
+ See [LICENSE](./LICENSE) for more details.
193
+
194
+ ## Citation
195
+
196
+ OneComp technical report (coming soon on ArXiv):
197
+
198
+ ```
199
+ @misc{onecomp2026,
200
+ title={TBD},
201
+ author={TBD},
202
+ year={2026},
203
+ note={arXiv preprint coming soon}
204
+ }
205
+ ```
206
+
207
+ QEP (Quantization Error Propagation):
208
+
209
+ ```
210
+ @inproceedings{
211
+ arai2025quantization,
212
+ title={Quantization Error Propagation: Revisiting Layer-Wise Post-Training Quantization},
213
+ author={Yamato Arai and Yuma Ichikawa},
214
+ booktitle={The Thirty-ninth Annual Conference on Neural Information Processing Systems},
215
+ year={2025},
216
+ url={https://openreview.net/forum?id=a3l3K9khbL}
217
+ }
218
+ ```
@@ -0,0 +1,95 @@
1
+ """GPTQ Benchmark
2
+
3
+ Run GPTQ for all combinations of bits × group_size in a single pass.
4
+ Shares calibration data accumulation across quantizers for efficiency.
5
+ Results are saved under output_dir.
6
+
7
+ Copyright 2025-2026 Fujitsu Ltd.
8
+
9
+ Usage:
10
+ python quant_benchmark.py
11
+ """
12
+
13
+ import itertools
14
+
15
+ import hydra
16
+ from omegaconf import DictConfig, OmegaConf
17
+
18
+ from onecomp import GPTQ, ModelConfig, Runner
19
+
20
+
21
def create_quantizers(cfg: DictConfig):
    """Build one GPTQ quantizer for every (bits, group_size) combination.

    The bit widths and group sizes are taken from ``cfg.gptq.bits`` and
    ``cfg.gptq.group_size``; all other GPTQ hyperparameters are shared
    across the produced quantizers. Each quantizer is given a unique name
    of the form ``GPTQ_{bits}bit_{group}_{sym}`` so results can be told
    apart downstream.

    Args:
        cfg: Hydra/OmegaConf config with a ``gptq`` section.

    Returns:
        A list of configured ``GPTQ`` quantizer instances.
    """
    gptq_cfg = cfg.gptq
    symmetric = gptq_cfg.symmetric
    sym_tag = "sym" if symmetric else "asym"

    def _build(bits, group_size):
        # A group_size of None means per-channel quantization, which the
        # GPTQ implementation encodes as groupsize=-1.
        group_tag = "pc" if group_size is None else f"gs{group_size}"
        resolved_groupsize = -1 if group_size is None else group_size
        return GPTQ(
            num_layers=gptq_cfg.num_layers,
            wbits=bits,
            sym=symmetric,
            groupsize=resolved_groupsize,
            blocksize=gptq_cfg.blocksize,
            percdamp=gptq_cfg.percdamp,
            actorder=gptq_cfg.actorder,
            mse=gptq_cfg.mse,
            q_grid=gptq_cfg.q_grid,
            q_norm=gptq_cfg.q_norm,
            calc_quant_error=True,
            name=f"GPTQ_{bits}bit_{group_tag}_{sym_tag}",
        )

    return [
        _build(bits, gs)
        for bits, gs in itertools.product(gptq_cfg.bits, gptq_cfg.group_size)
    ]
51
+
52
+
53
@hydra.main(version_base=None, config_path="conf", config_name="benchmark_llama3-8b")
def main(cfg: DictConfig):
    """Run the full GPTQ benchmark sweep driven by the Hydra config.

    Steps: echo the resolved config, build all quantizers, run them once
    through a shared-calibration Runner, dump per-quantizer statistics,
    and optionally evaluate perplexity and accuracy.
    """
    # Echo the fully resolved config so each run is self-documenting.
    print(OmegaConf.to_yaml(cfg))

    quantizers = create_quantizers(cfg)
    print(f"Number of quantizers: {len(quantizers)}")
    for quantizer in quantizers:
        print(f" - {quantizer.name}")

    # Runner shares calibration-data accumulation across all quantizers.
    model_config = ModelConfig(path=cfg.model_path, device=cfg.model_device)
    runner = Runner(
        model_config=model_config,
        quantizers=quantizers,
        max_length=cfg.max_length,
        num_calibration_samples=cfg.num_calibration_samples,
        calibration_strategy=cfg.calibration_strategy,
        calibration_seed=cfg.calibration_seed,
        calibration_batch_size=cfg.calibration_batch_size,
    )
    runner.run()

    # Persist per-quantizer statistics under the Hydra run directory.
    for quantizer in quantizers:
        runner.save_quantization_statistics(
            f"quantization_statistics_{quantizer.name}.json", quantizer=quantizer
        )

    # Optional evaluations, each gated by its own config flag.
    if cfg.calc_ppl:
        runner.benchmark_perplexity(original_model=cfg.calc_original_ppl)
    if cfg.calc_acc:
        runner.benchmark_accuracy(original_model=cfg.calc_original_acc)
92
+
93
+
94
# Script entry point; Hydra resolves the config named in the @hydra.main
# decorator and passes it to main().
if __name__ == "__main__":
    main()