compressed-tensors 0.5.0.tar.gz → 0.6.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/PKG-INFO +27 -25
  2. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/README.md +24 -13
  3. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/setup.py +5 -2
  4. compressed-tensors-0.6.0/src/compressed_tensors/compressors/base.py +252 -0
  5. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/compressors/model_compressor.py +68 -1
  6. compressed-tensors-0.6.0/src/compressed_tensors/compressors/naive_quantized.py +140 -0
  7. compressed-tensors-0.6.0/src/compressed_tensors/compressors/pack_quantized.py +208 -0
  8. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/config/base.py +6 -1
  9. compressed-tensors-0.6.0/src/compressed_tensors/linear/__init__.py +13 -0
  10. compressed-tensors-0.6.0/src/compressed_tensors/linear/compressed_linear.py +87 -0
  11. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/lifecycle/apply.py +46 -8
  12. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/lifecycle/calibration.py +5 -4
  13. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/lifecycle/compressed.py +3 -1
  14. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/lifecycle/forward.py +76 -43
  15. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/lifecycle/helpers.py +29 -2
  16. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/lifecycle/initialize.py +51 -16
  17. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/observers/__init__.py +1 -0
  18. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/observers/base.py +54 -14
  19. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/observers/min_max.py +8 -0
  20. compressed-tensors-0.6.0/src/compressed_tensors/quantization/observers/mse.py +162 -0
  21. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/quant_args.py +96 -24
  22. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/quant_scheme.py +7 -9
  23. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/utils/helpers.py +1 -1
  24. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/utils/__init__.py +1 -0
  25. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/utils/helpers.py +13 -0
  26. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/utils/offload.py +14 -2
  27. compressed-tensors-0.6.0/src/compressed_tensors/utils/permute.py +70 -0
  28. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/utils/safetensors_load.py +2 -0
  29. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/utils/semi_structured_conversions.py +1 -0
  30. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/version.py +1 -1
  31. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors.egg-info/PKG-INFO +27 -25
  32. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors.egg-info/SOURCES.txt +5 -2
  33. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors.egg-info/requires.txt +3 -1
  34. compressed_tensors-0.5.0/src/compressed_tensors/compressors/base.py +0 -60
  35. compressed_tensors-0.5.0/src/compressed_tensors/compressors/naive_quantized.py +0 -144
  36. compressed_tensors-0.5.0/src/compressed_tensors/compressors/pack_quantized.py +0 -219
  37. compressed_tensors-0.5.0/tests/test_registry.py +0 -53
  38. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/LICENSE +0 -0
  39. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/pyproject.toml +0 -0
  40. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/setup.cfg +0 -0
  41. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/__init__.py +0 -0
  42. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/base.py +0 -0
  43. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/compressors/__init__.py +0 -0
  44. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/compressors/dense.py +0 -0
  45. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/compressors/helpers.py +0 -0
  46. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/compressors/marlin_24.py +0 -0
  47. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/compressors/sparse_bitmask.py +0 -0
  48. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/config/__init__.py +0 -0
  49. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/config/dense.py +0 -0
  50. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/config/sparse_bitmask.py +0 -0
  51. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/__init__.py +0 -0
  52. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/lifecycle/__init__.py +0 -0
  53. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/lifecycle/frozen.py +0 -0
  54. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/observers/helpers.py +0 -0
  55. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/observers/memoryless.py +0 -0
  56. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/quant_config.py +0 -0
  57. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/utils/__init__.py +0 -0
  58. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/registry/__init__.py +0 -0
  59. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/registry/registry.py +0 -0
  60. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors/utils/permutations_24.py +0 -0
  61. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors.egg-info/dependency_links.txt +0 -0
  62. {compressed_tensors-0.5.0 → compressed-tensors-0.6.0}/src/compressed_tensors.egg-info/top_level.txt +0 -0
@@ -1,51 +1,53 @@
  Metadata-Version: 2.1
  Name: compressed-tensors
- Version: 0.5.0
+ Version: 0.6.0
  Summary: Library for utilization of compressed safetensors of neural network models
  Home-page: https://github.com/neuralmagic/compressed-tensors
  Author: Neuralmagic, Inc.
  Author-email: support@neuralmagic.com
  License: Apache 2.0
  Description-Content-Type: text/markdown
- License-File: LICENSE
- Requires-Dist: torch>=1.7.0
- Requires-Dist: transformers
- Requires-Dist: accelerate
- Requires-Dist: pydantic>=2.0
  Provides-Extra: dev
- Requires-Dist: black==22.12.0; extra == "dev"
- Requires-Dist: isort==5.8.0; extra == "dev"
- Requires-Dist: wheel>=0.36.2; extra == "dev"
- Requires-Dist: flake8>=3.8.3; extra == "dev"
- Requires-Dist: pytest>=6.0.0; extra == "dev"
- Requires-Dist: nbconvert>=7.16.3; extra == "dev"
-
- # compressed_tensors
-
- This repository extends a [safetensors](https://github.com/huggingface/safetensors) format to efficiently store sparse and/or quantized tensors on disk. `compressed-tensors` format supports multiple compression types to minimize the disk space and facilitate the tensor manipulation.
+ Provides-Extra: accelerate
+ License-File: LICENSE

- ## Motivation
+ # compressed-tensors

- ### Reduce disk space by saving sparse tensors in a compressed format
+ The `compressed-tensors` library extends the [safetensors](https://github.com/huggingface/safetensors) format, providing a versatile and efficient way to store and manage compressed tensor data. This library supports various quantization and sparsity schemes, making it a unified format for handling different model optimizations like GPTQ, AWQ, SmoothQuant, INT8, FP8, SparseGPT, and more.

- The compressed format stores the data much more efficiently by taking advantage of two properties of tensors:
+ ## Why `compressed-tensors`?

- - Sparse tensors -> due to a large number of entries that are equal to zero.
- - Quantized -> due to their low precision representation.
+ As model compression becomes increasingly important for efficient deployment of LLMs, the landscape of quantization and compression techniques has become increasingly fragmented.
+ Each method often comes with its own storage format and loading procedures, making it challenging to work with multiple techniques or switch between them.
+ `compressed-tensors` addresses this by providing a single, extensible format that can represent a wide variety of compression schemes.

- ### Introduce an elegant interface to save/load compressed tensors
+ * **Unified Checkpoint Format**: Supports various compression schemes in a single, consistent format.
+ * **Wide Compatibility**: Works with popular quantization methods like GPTQ, SmoothQuant, and FP8. See [llm-compressor](https://github.com/vllm-project/llm-compressor)
+ * **Flexible Quantization Support**:
+   * Weight-only quantization (e.g., W4A16, W8A16, WnA16)
+   * Activation quantization (e.g., W8A8)
+   * KV cache quantization
+   * Non-uniform schemes (different layers can be quantized in different ways!)
+ * **Sparsity Support**: Handles both unstructured and semi-structured (e.g., 2:4) sparsity patterns.
+ * **Open-Source Integration**: Designed to work seamlessly with Hugging Face models and PyTorch.

- The library provides the user with the ability to compress/decompress tensors. The properties of tensors are defined by human-readable configs, allowing the users to understand the compression format at a quick glance.
+ This allows developers and researchers to easily experiment with composing different quantization methods, simplify model deployment pipelines, and reduce the overhead of supporting multiple compression formats in inference engines.

  ## Installation

- ### Pip
+ ### From [PyPI](https://pypi.org/project/compressed-tensors)

+ Stable release:
  ```bash
  pip install compressed-tensors
  ```

- ### From source
+ Nightly release:
+ ```bash
+ pip install compressed-tensors-nightly
+ ```
+
+ ### From Source

  ```bash
  git clone https://github.com/neuralmagic/compressed-tensors
@@ -1,29 +1,40 @@
- # compressed_tensors
+ # compressed-tensors

- This repository extends a [safetensors](https://github.com/huggingface/safetensors) format to efficiently store sparse and/or quantized tensors on disk. `compressed-tensors` format supports multiple compression types to minimize the disk space and facilitate the tensor manipulation.
+ The `compressed-tensors` library extends the [safetensors](https://github.com/huggingface/safetensors) format, providing a versatile and efficient way to store and manage compressed tensor data. This library supports various quantization and sparsity schemes, making it a unified format for handling different model optimizations like GPTQ, AWQ, SmoothQuant, INT8, FP8, SparseGPT, and more.

- ## Motivation
+ ## Why `compressed-tensors`?

- ### Reduce disk space by saving sparse tensors in a compressed format
+ As model compression becomes increasingly important for efficient deployment of LLMs, the landscape of quantization and compression techniques has become increasingly fragmented.
+ Each method often comes with its own storage format and loading procedures, making it challenging to work with multiple techniques or switch between them.
+ `compressed-tensors` addresses this by providing a single, extensible format that can represent a wide variety of compression schemes.

- The compressed format stores the data much more efficiently by taking advantage of two properties of tensors:
+ * **Unified Checkpoint Format**: Supports various compression schemes in a single, consistent format.
+ * **Wide Compatibility**: Works with popular quantization methods like GPTQ, SmoothQuant, and FP8. See [llm-compressor](https://github.com/vllm-project/llm-compressor)
+ * **Flexible Quantization Support**:
+   * Weight-only quantization (e.g., W4A16, W8A16, WnA16)
+   * Activation quantization (e.g., W8A8)
+   * KV cache quantization
+   * Non-uniform schemes (different layers can be quantized in different ways!)
+ * **Sparsity Support**: Handles both unstructured and semi-structured (e.g., 2:4) sparsity patterns.
+ * **Open-Source Integration**: Designed to work seamlessly with Hugging Face models and PyTorch.

- - Sparse tensors -> due to a large number of entries that are equal to zero.
- - Quantized -> due to their low precision representation.
-
- ### Introduce an elegant interface to save/load compressed tensors
-
- The library provides the user with the ability to compress/decompress tensors. The properties of tensors are defined by human-readable configs, allowing the users to understand the compression format at a quick glance.
+ This allows developers and researchers to easily experiment with composing different quantization methods, simplify model deployment pipelines, and reduce the overhead of supporting multiple compression formats in inference engines.

  ## Installation

- ### Pip
+ ### From [PyPI](https://pypi.org/project/compressed-tensors)

+ Stable release:
  ```bash
  pip install compressed-tensors
  ```

- ### From source
+ Nightly release:
+ ```bash
+ pip install compressed-tensors-nightly
+ ```
+
+ ### From Source

  ```bash
  git clone https://github.com/neuralmagic/compressed-tensors
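The rewritten README stops short of a usage snippet in this hunk. For orientation, here is a minimal sketch of round-tripping tensors through the library's helpers; it assumes the `save_compressed`/`load_compressed` functions and `BitmaskConfig` exported by `compressed_tensors` (defined in files untouched by this diff) keep their existing signatures:

```python
# Hedged sketch, not part of this diff: compress tensors to disk and lazily
# decompress them, assuming save_compressed/load_compressed and BitmaskConfig
# behave as in the package's existing helpers.
from typing import Dict

import torch
from compressed_tensors import BitmaskConfig, load_compressed, save_compressed

# A mostly-zero tensor is a good fit for the sparse bitmask format
tensors: Dict[str, torch.Tensor] = {"tensor_1": torch.zeros(512, 512)}

save_compressed(
    tensors,
    "model.safetensors",
    compression_format=BitmaskConfig().format,
)

# load_compressed yields (name, tensor) pairs, decompressed back to dense
decompressed = {}
for name, tensor in load_compressed("model.safetensors", compression_config=BitmaskConfig()):
    decompressed[name] = tensor
```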
@@ -46,10 +46,13 @@ def _setup_packages() -> List:
  )

  def _setup_install_requires() -> List:
-     return ["torch>=1.7.0", "transformers", "accelerate", "pydantic>=2.0"]
+     return ["torch>=1.7.0", "transformers", "pydantic>=2.0"]

  def _setup_extras() -> Dict:
-     return {"dev": ["black==22.12.0", "isort==5.8.0", "wheel>=0.36.2", "flake8>=3.8.3", "pytest>=6.0.0", "nbconvert>=7.16.3"]}
+     return {
+         "dev": ["black==22.12.0", "isort==5.8.0", "wheel>=0.36.2", "flake8>=3.8.3", "pytest>=6.0.0", "nbconvert>=7.16.3"],
+         "accelerate": ["accelerate"]
+     }

  setup(
      name=_PACKAGE_NAME,
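Since `accelerate` moves from a core requirement to an optional extra here, environments that rely on it (for example, for offloaded weights) would install it explicitly; a likely invocation:

```bash
pip install compressed-tensors[accelerate]
```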
@@ -0,0 +1,252 @@
+ # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #    http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing,
+ # software distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import logging
+ from typing import Dict, Generator, Optional, Tuple, Union
+
+ import torch
+ from compressed_tensors.config import SparsityCompressionConfig
+ from compressed_tensors.quantization import QuantizationArgs, QuantizationConfig
+ from compressed_tensors.registry import RegistryMixin
+ from compressed_tensors.utils import get_nested_weight_mappings, merge_names
+ from safetensors import safe_open
+ from torch import Tensor
+ from torch.nn.modules import Module
+ from tqdm import tqdm
+
+
+ _LOGGER: logging.Logger = logging.getLogger(__name__)
+
+ __all__ = ["Compressor"]
+
+
+ class Compressor(RegistryMixin):
+     """
+     Base class representing a model compression algorithm. Each child class should
+     implement compression_param_info, compress_weight and decompress_weight.
+
+     Compressors support compressing/decompressing a full module state dict or a single
+     quantized PyTorch leaf module.
+
+     Model Load Lifecycle (run_compressed=False):
+         - ModelCompressor.decompress()
+             - apply_quantization_config()
+             - Compressor.decompress()
+                 - Compressor.decompress_weight()
+
+     Model Save Lifecycle:
+         - ModelCompressor.compress()
+             - Compressor.compress()
+                 - Compressor.compress_weight()
+
+     Module Lifecycle (run_compressed=True):
+         - apply_quantization_config()
+         - compressed_module = CompressedLinear(module)
+             - initialize_module_for_quantization()
+             - Compressor.compression_param_info()
+             - register_parameters()
+         - compressed_module.forward()
+             - compressed_module.decompress()
+
+
+     :param config: config specifying compression parameters
+     """
+
+     def __init__(
+         self, config: Union[SparsityCompressionConfig, QuantizationConfig, None] = None
+     ):
+         self.config = config
+
+     def compression_param_info(
+         self,
+         weight_shape: torch.Size,
+         quantization_args: Optional[QuantizationArgs] = None,
+     ) -> Dict[str, Tuple[torch.Size, torch.dtype]]:
+         """
+         Creates a dictionary of expected shapes and dtypes for each compression
+         parameter used by the compressor
+
+         :param weight_shape: uncompressed weight shape
+         :param quantization_args: quantization parameters for the weight
+         :return: dictionary mapping compressed parameter names to shape and dtype
+         """
+         raise NotImplementedError()
+
+     def compress(
+         self,
+         model_state: Dict[str, Tensor],
+         names_to_scheme: Dict[str, QuantizationArgs],
+         **kwargs,
+     ) -> Dict[str, Tensor]:
+         """
+         Compresses a dense state dict
+
+         :param model_state: state dict of uncompressed model
+         :param names_to_scheme: quantization args for each quantized weight, needed for
+             quantize function to calculate bit depth
+         :return: compressed state dict
+         """
+         compressed_dict = {}
+         weight_suffix = ".weight"
+         _LOGGER.debug(
+             f"Compressing model with {len(model_state)} parameterized layers..."
+         )
+
+         for name, value in tqdm(model_state.items(), desc="Compressing model"):
+             if name.endswith(weight_suffix):
+                 prefix = name[: -(len(weight_suffix))]
+                 scale = model_state.get(merge_names(prefix, "weight_scale"), None)
+                 zp = model_state.get(merge_names(prefix, "weight_zero_point"), None)
+                 g_idx = model_state.get(merge_names(prefix, "weight_g_idx"), None)
+                 if scale is not None:
+                     # weight is quantized, compress it
+                     quant_args = names_to_scheme[prefix]
+                     compressed_data = self.compress_weight(
+                         weight=value,
+                         scale=scale,
+                         zero_point=zp,
+                         g_idx=g_idx,
+                         quantization_args=quant_args,
+                         device="cpu",
+                     )
+                     for key, value in compressed_data.items():
+                         compressed_dict[merge_names(prefix, key)] = value
+                 else:
+                     compressed_dict[name] = value.to("cpu")
+             elif name.endswith("zero_point") and torch.all(value == 0):
+                 continue
+             elif name.endswith("g_idx") and torch.any(value <= -1):
+                 continue
+             else:
+                 compressed_dict[name] = value.to("cpu")
+
+         return compressed_dict
+
+     def decompress(
+         self,
+         path_to_model_or_tensors: str,
+         names_to_scheme: Dict[str, QuantizationArgs],
+         device: str = "cpu",
+     ) -> Generator[Tuple[str, Tensor], None, None]:
+         """
+         Reads a compressed state dict located at path_to_model_or_tensors
+         and returns a generator for sequentially decompressing back to a
+         dense state dict
+
+         :param path_to_model_or_tensors: path to compressed safetensors model (directory
+             with one or more safetensors files) or compressed tensors file
+         :param names_to_scheme: quantization args for each quantized weight
+         :param device: optional device to load intermediate weights into
+         :return: compressed state dict
+         """
+         weight_mappings = get_nested_weight_mappings(
+             path_to_model_or_tensors, self.COMPRESSION_PARAM_NAMES
+         )
+         for weight_name in weight_mappings.keys():
+             weight_data = {}
+             for param_name, safe_path in weight_mappings[weight_name].items():
+                 full_name = merge_names(weight_name, param_name)
+                 with safe_open(safe_path, framework="pt", device=device) as f:
+                     weight_data[param_name] = f.get_tensor(full_name)
+
+             if "weight_scale" in weight_data:
+                 quant_args = names_to_scheme[weight_name]
+                 decompressed = self.decompress_weight(
+                     compressed_data=weight_data, quantization_args=quant_args
+                 )
+                 yield merge_names(weight_name, "weight"), decompressed
+
+     def compress_weight(
+         self,
+         weight: Tensor,
+         scale: Tensor,
+         zero_point: Optional[Tensor] = None,
+         g_idx: Optional[torch.Tensor] = None,
+         quantization_args: Optional[QuantizationArgs] = None,
+     ) -> Dict[str, torch.Tensor]:
+         """
+         Compresses a single uncompressed weight
+
+         :param weight: uncompressed weight tensor
+         :param scale: quantization scale for weight
+         :param zero_point: quantization zero point for weight
+         :param g_idx: optional mapping from column index to group index
+         :param quantization_args: quantization parameters for weight
+         :return: dictionary of compressed weight data
+         """
+         raise NotImplementedError()
+
+     def decompress_weight(
+         self,
+         compressed_data: Dict[str, Tensor],
+         quantization_args: Optional[QuantizationArgs] = None,
+     ) -> torch.Tensor:
+         """
+         Decompresses a single compressed weight
+
+         :param compressed_data: dictionary of data needed for decompression
+         :param quantization_args: quantization parameters for the weight
+         :return: tensor of the decompressed weight
+         """
+         raise NotImplementedError()
+
+     def compress_module(self, module: Module) -> Optional[Dict[str, torch.Tensor]]:
+         """
+         Compresses a single quantized leaf PyTorch module. If the module is not
+         quantized, this function has no effect.
+
+         :param module: PyTorch module to compress
+         :return: dictionary of compressed weight data, or None if module is not
+             quantized
+         """
+         if not hasattr(module, "quantization_scheme"):
+             return None  # module is not quantized
+         quantization_scheme = module.quantization_scheme
+         if not hasattr(quantization_scheme, "weights"):
+             return None  # weights are not quantized
+
+         quantization_args = quantization_scheme.weights
+         weight = getattr(module, "weight", None)
+         weight_scale = getattr(module, "weight_scale", None)
+         weight_zero_point = getattr(module, "weight_zero_point", None)
+
+         return self.compress_weight(
+             weight=weight,
+             scale=weight_scale,
+             zero_point=weight_zero_point,
+             quantization_args=quantization_args,
+         )
+
+     def decompress_module(self, module: Module):
+         """
+         Decompresses a single compressed leaf PyTorch module. If the module is not
+         quantized, this function has no effect.
+
+         :param module: PyTorch module to decompress
+         :return: tensor of the decompressed weight, or None if module is not quantized
+         """
+         if not hasattr(module, "quantization_scheme"):
+             return None  # module is not quantized
+         quantization_scheme = module.quantization_scheme
+         if not hasattr(quantization_scheme, "weights"):
+             return None  # weights are not quantized
+
+         quantization_args = quantization_scheme.weights
+         compressed_data = {}
+         for name, parameter in module.named_parameters():
+             compressed_data[name] = parameter
+
+         return self.decompress_weight(
+             compressed_data=compressed_data, quantization_args=quantization_args
+         )
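To make the abstract interface above concrete, a minimal, hypothetical subclass is sketched below; the registry name `"identity"` and the pass-through behavior are invented purely to show which hooks (`compression_param_info`, `compress_weight`, `decompress_weight`) a real compressor overrides:

```python
# Hypothetical illustration only; "identity" is a made-up registry name and
# this compressor performs no real compression.
from typing import Dict, Optional, Tuple

import torch
from torch import Tensor
from compressed_tensors.compressors import Compressor
from compressed_tensors.quantization import QuantizationArgs


@Compressor.register(name="identity")
class IdentityCompressor(Compressor):
    COMPRESSION_PARAM_NAMES = ["weight"]

    def compression_param_info(
        self,
        weight_shape: torch.Size,
        quantization_args: Optional[QuantizationArgs] = None,
    ) -> Dict[str, Tuple[torch.Size, torch.dtype]]:
        # The "compressed" parameter keeps the original shape; dtype is fixed here
        return {"weight": (weight_shape, torch.float32)}

    def compress_weight(self, weight: Tensor, **kwargs) -> Dict[str, Tensor]:
        # A real compressor would quantize and/or pack the weight here
        return {"weight": weight}

    def decompress_weight(
        self,
        compressed_data: Dict[str, Tensor],
        quantization_args: Optional[QuantizationArgs] = None,
    ) -> Tensor:
        return compressed_data["weight"]
```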
@@ -28,7 +28,7 @@ from compressed_tensors.base import (
      SPARSITY_CONFIG_NAME,
  )
  from compressed_tensors.compressors import Compressor
- from compressed_tensors.config import SparsityCompressionConfig
+ from compressed_tensors.config import CompressionFormat, SparsityCompressionConfig
  from compressed_tensors.quantization import (
      QuantizationConfig,
      QuantizationStatus,
@@ -176,6 +176,9 @@ class ModelCompressor:
          if hasattr(compression_config, SPARSITY_CONFIG_NAME):
              # for loaded HFQuantizer config
              return getattr(compression_config, SPARSITY_CONFIG_NAME)
+         if SPARSITY_CONFIG_NAME in compression_config:
+             # for loaded HFQuantizer config from dict
+             return compression_config[SPARSITY_CONFIG_NAME]

          # SparseAutoModel format
          return compression_config.get(SPARSITY_CONFIG_NAME, None)
@@ -189,6 +192,10 @@ class ModelCompressor:
              # for loaded HFQuantizer config
              return getattr(compression_config, QUANTIZATION_CONFIG_NAME)

+         if QUANTIZATION_CONFIG_NAME in compression_config:
+             # for loaded HFQuantizer config from dict
+             return compression_config[QUANTIZATION_CONFIG_NAME]
+
          # SparseAutoModel format
          quantization_config = deepcopy(compression_config)
          quantization_config.pop(SPARSITY_CONFIG_NAME, None)
@@ -234,12 +241,72 @@ class ModelCompressor:
              compressed_state_dict = self.quantization_compressor.compress(
                  state_dict, names_to_scheme=quantized_modules_to_args
              )
+             if self.quantization_config.format != CompressionFormat.dense.value:
+                 self.quantization_config.quantization_status = (
+                     QuantizationStatus.COMPRESSED
+                 )

          if self.sparsity_compressor is not None:
              compressed_state_dict = self.sparsity_compressor.compress(
                  compressed_state_dict
              )

+         # HACK (mgoin): Post-process step for kv cache scales to take the
+         # k/v_proj module `output_scale` parameters, and store them in the
+         # parent attention module as `k_scale` and `v_scale`
+         #
+         # Example:
+         #     Replace `model.layers.0.self_attn.k_proj.output_scale`
+         #     with `model.layers.0.self_attn.k_scale`
+         if (
+             self.quantization_config is not None
+             and self.quantization_config.kv_cache_scheme is not None
+         ):
+             # HACK (mgoin): We assume the quantized modules in question
+             # will be k_proj and v_proj since those are the default targets.
+             # We check that both of these modules have output activation
+             # quantization, and additionally check that q_proj doesn't.
+             q_proj_has_no_quant_output = 0
+             k_proj_has_quant_output = 0
+             v_proj_has_quant_output = 0
+             for name, module in model.named_modules():
+                 if not hasattr(module, "quantization_scheme"):
+                     # We still want to count non-quantized q_proj
+                     if name.endswith(".q_proj"):
+                         q_proj_has_no_quant_output += 1
+                     continue
+                 out_act = module.quantization_scheme.output_activations
+                 if name.endswith(".q_proj") and out_act is None:
+                     q_proj_has_no_quant_output += 1
+                 elif name.endswith(".k_proj") and out_act is not None:
+                     k_proj_has_quant_output += 1
+                 elif name.endswith(".v_proj") and out_act is not None:
+                     v_proj_has_quant_output += 1
+
+             assert (
+                 q_proj_has_no_quant_output > 0
+                 and k_proj_has_quant_output > 0
+                 and v_proj_has_quant_output > 0
+             )
+             assert (
+                 q_proj_has_no_quant_output
+                 == k_proj_has_quant_output
+                 == v_proj_has_quant_output
+             )
+
+             # Move all .k/v_proj.output_scale parameters to .k/v_scale
+             working_state_dict = {}
+             for key in compressed_state_dict.keys():
+                 if key.endswith(".k_proj.output_scale"):
+                     new_key = key.replace(".k_proj.output_scale", ".k_scale")
+                     working_state_dict[new_key] = compressed_state_dict[key]
+                 elif key.endswith(".v_proj.output_scale"):
+                     new_key = key.replace(".v_proj.output_scale", ".v_scale")
+                     working_state_dict[new_key] = compressed_state_dict[key]
+                 else:
+                     working_state_dict[key] = compressed_state_dict[key]
+             compressed_state_dict = working_state_dict
+
          # HACK: Override the dtype_byte_size function in transformers to
          # support float8 types. Fix is posted upstream
          # https://github.com/huggingface/transformers/pull/30488
@@ -0,0 +1,140 @@
+ # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #    http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing,
+ # software distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import logging
+ from typing import Dict, Optional, Tuple
+
+ import torch
+ from compressed_tensors.compressors import Compressor
+ from compressed_tensors.config import CompressionFormat
+ from compressed_tensors.quantization import QuantizationArgs
+ from compressed_tensors.quantization.lifecycle.forward import dequantize, quantize
+ from compressed_tensors.quantization.utils import can_quantize
+ from torch import Tensor
+
+
+ __all__ = [
+     "QuantizationCompressor",
+     "IntQuantizationCompressor",
+     "FloatQuantizationCompressor",
+ ]
+
+ _LOGGER: logging.Logger = logging.getLogger(__name__)
+
+
+ @Compressor.register(name=CompressionFormat.naive_quantized.value)
+ class QuantizationCompressor(Compressor):
+     """
+     Implements naive compression for quantized models. Weight of each
+     quantized layer is converted from its original float type to the closest Pytorch
+     type to the type specified by the layer's QuantizationArgs.
+     """
+
+     COMPRESSION_PARAM_NAMES = [
+         "weight",
+         "weight_scale",
+         "weight_zero_point",
+         "weight_g_idx",
+     ]
+
+     def compression_param_info(
+         self,
+         weight_shape: torch.Size,
+         quantization_args: Optional[QuantizationArgs] = None,
+     ) -> Dict[str, Tuple[torch.Size, torch.dtype]]:
+         """
+         Creates a dictionary of expected shapes and dtypes for each compression
+         parameter used by the compressor
+
+         :param weight_shape: uncompressed weight shape
+         :param quantization_args: quantization parameters for the weight
+         :return: dictionary mapping compressed parameter names to shape and dtype
+         """
+         dtype = quantization_args.pytorch_dtype()
+         return {"weight": (weight_shape, dtype)}
+
+     def compress_weight(
+         self,
+         weight: Tensor,
+         scale: Tensor,
+         zero_point: Optional[Tensor] = None,
+         g_idx: Optional[torch.Tensor] = None,
+         quantization_args: Optional[QuantizationArgs] = None,
+         device: Optional[torch.device] = None,
+     ) -> Dict[str, torch.Tensor]:
+         """
+         Compresses a single uncompressed weight
+
+         :param weight: uncompressed weight tensor
+         :param scale: quantization scale for weight
+         :param zero_point: quantization zero point for weight
+         :param g_idx: optional mapping from column index to group index
+         :param quantization_args: quantization parameters for weight
+         :param device: optional device to move compressed output to
+         :return: dictionary of compressed weight data
+         """
+         if can_quantize(weight, quantization_args):
+             quantized_weight = quantize(
+                 x=weight,
+                 scale=scale,
+                 zero_point=zero_point,
+                 g_idx=g_idx,
+                 args=quantization_args,
+                 dtype=quantization_args.pytorch_dtype(),
+             )
+
+         if device is not None:
+             quantized_weight = quantized_weight.to(device)
+
+         return {"weight": quantized_weight}
+
+     def decompress_weight(
+         self,
+         compressed_data: Dict[str, Tensor],
+         quantization_args: Optional[QuantizationArgs] = None,
+     ) -> torch.Tensor:
+         """
+         Decompresses a single compressed weight
+
+         :param compressed_data: dictionary of data needed for decompression
+         :param quantization_args: quantization parameters for the weight
+         :return: tensor of the decompressed weight
+         """
+         weight = compressed_data["weight"]
+         scale = compressed_data["weight_scale"]
+         zero_point = compressed_data.get("weight_zero_point", None)
+         g_idx = compressed_data.get("weight_g_idx", None)
+         decompressed_weight = dequantize(
+             x_q=weight, scale=scale, zero_point=zero_point, g_idx=g_idx
+         )
+
+         return decompressed_weight
+
+
+ @Compressor.register(name=CompressionFormat.int_quantized.value)
+ class IntQuantizationCompressor(QuantizationCompressor):
+     """
+     Alias for integer quantized models
+     """
+
+     pass
+
+
+ @Compressor.register(name=CompressionFormat.float_quantized.value)
+ class FloatQuantizationCompressor(QuantizationCompressor):
+     """
+     Alias for fp quantized models
+     """
+
+     pass
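As a quick orientation for how these registered compressors are used, a hedged sketch of compressing a single weight follows. It assumes `RegistryMixin.load_from_registry` constructs the registered class and that `QuantizationArgs` accepts the fields shown; the scale and zero point are invented for illustration rather than produced by an observer/calibration pass:

```python
# Hedged usage sketch, not part of this diff. The scale/zero_point values are
# made up; in practice they come from calibration observers.
import torch
from compressed_tensors.compressors import Compressor
from compressed_tensors.config import CompressionFormat
from compressed_tensors.quantization import QuantizationArgs

compressor = Compressor.load_from_registry(CompressionFormat.naive_quantized.value)
args = QuantizationArgs(num_bits=8, type="int", symmetric=True, strategy="tensor")

weight = torch.randn(128, 256)
scale = (weight.abs().max() / 127.0).reshape(1)   # per-tensor scale (invented)
zero_point = torch.zeros(1, dtype=torch.int8)     # symmetric -> zero

compressed = compressor.compress_weight(
    weight=weight, scale=scale, zero_point=zero_point, quantization_args=args
)
print(compressed["weight"].dtype)  # expected: torch.int8
# decompress_weight({"weight": ..., "weight_scale": ..., ...}) reverses the step.
```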