compressed-tensors 0.4.0__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/PKG-INFO +26 -14
  2. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/README.md +24 -13
  3. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/setup.py +4 -1
  4. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors/base.py +1 -0
  5. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors/compressors/__init__.py +5 -1
  6. compressed-tensors-0.6.0/src/compressed_tensors/compressors/base.py +252 -0
  7. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors/compressors/dense.py +1 -1
  8. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors/compressors/marlin_24.py +11 -10
  9. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors/compressors/model_compressor.py +101 -13
  10. compressed-tensors-0.6.0/src/compressed_tensors/compressors/naive_quantized.py +140 -0
  11. compressed-tensors-0.6.0/src/compressed_tensors/compressors/pack_quantized.py +208 -0
  12. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors/compressors/sparse_bitmask.py +1 -1
  13. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors/config/base.py +8 -1
  14. {compressed-tensors-0.4.0/src/compressed_tensors/utils → compressed-tensors-0.6.0/src/compressed_tensors/linear}/__init__.py +0 -3
  15. compressed-tensors-0.6.0/src/compressed_tensors/linear/compressed_linear.py +87 -0
  16. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/lifecycle/__init__.py +1 -0
  17. compressed-tensors-0.6.0/src/compressed_tensors/quantization/lifecycle/apply.py +389 -0
  18. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/lifecycle/calibration.py +22 -2
  19. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/lifecycle/compressed.py +3 -1
  20. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/lifecycle/forward.py +139 -61
  21. compressed-tensors-0.6.0/src/compressed_tensors/quantization/lifecycle/helpers.py +80 -0
  22. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/lifecycle/initialize.py +77 -13
  23. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/observers/__init__.py +1 -0
  24. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/observers/base.py +93 -14
  25. compressed-tensors-0.6.0/src/compressed_tensors/quantization/observers/helpers.py +111 -0
  26. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/observers/min_max.py +8 -0
  27. compressed-tensors-0.6.0/src/compressed_tensors/quantization/observers/mse.py +162 -0
  28. compressed-tensors-0.6.0/src/compressed_tensors/quantization/quant_args.py +241 -0
  29. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/quant_config.py +35 -2
  30. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/quant_scheme.py +112 -13
  31. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/utils/helpers.py +68 -2
  32. {compressed-tensors-0.4.0/src/compressed_tensors/compressors → compressed-tensors-0.6.0/src/compressed_tensors}/utils/__init__.py +3 -1
  33. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors/utils/helpers.py +44 -2
  34. compressed-tensors-0.6.0/src/compressed_tensors/utils/offload.py +116 -0
  35. compressed-tensors-0.6.0/src/compressed_tensors/utils/permute.py +70 -0
  36. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors/utils/safetensors_load.py +2 -0
  37. {compressed-tensors-0.4.0/src/compressed_tensors/compressors → compressed-tensors-0.6.0/src/compressed_tensors}/utils/semi_structured_conversions.py +1 -0
  38. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors/version.py +1 -1
  39. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors.egg-info/PKG-INFO +26 -14
  40. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors.egg-info/SOURCES.txt +10 -6
  41. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors.egg-info/requires.txt +3 -0
  42. compressed-tensors-0.4.0/src/compressed_tensors/compressors/base.py +0 -60
  43. compressed-tensors-0.4.0/src/compressed_tensors/compressors/int_quantized.py +0 -126
  44. compressed-tensors-0.4.0/src/compressed_tensors/compressors/pack_quantized.py +0 -212
  45. compressed-tensors-0.4.0/src/compressed_tensors/compressors/utils/helpers.py +0 -43
  46. compressed-tensors-0.4.0/src/compressed_tensors/quantization/lifecycle/apply.py +0 -229
  47. compressed-tensors-0.4.0/src/compressed_tensors/quantization/observers/helpers.py +0 -58
  48. compressed-tensors-0.4.0/src/compressed_tensors/quantization/quant_args.py +0 -125
  49. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/LICENSE +0 -0
  50. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/pyproject.toml +0 -0
  51. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/setup.cfg +0 -0
  52. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors/__init__.py +0 -0
  53. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors/compressors/helpers.py +0 -0
  54. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors/config/__init__.py +0 -0
  55. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors/config/dense.py +0 -0
  56. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors/config/sparse_bitmask.py +0 -0
  57. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/__init__.py +0 -0
  58. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/lifecycle/frozen.py +0 -0
  59. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/observers/memoryless.py +0 -0
  60. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors/quantization/utils/__init__.py +0 -0
  61. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors/registry/__init__.py +0 -0
  62. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors/registry/registry.py +0 -0
  63. {compressed-tensors-0.4.0/src/compressed_tensors/compressors → compressed-tensors-0.6.0/src/compressed_tensors}/utils/permutations_24.py +0 -0
  64. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors.egg-info/dependency_links.txt +0 -0
  65. {compressed-tensors-0.4.0 → compressed-tensors-0.6.0}/src/compressed_tensors.egg-info/top_level.txt +0 -0
--- compressed-tensors-0.4.0/PKG-INFO
+++ compressed-tensors-0.6.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: compressed-tensors
-Version: 0.4.0
+Version: 0.6.0
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
@@ -8,34 +8,46 @@ Author-email: support@neuralmagic.com
 License: Apache 2.0
 Description-Content-Type: text/markdown
 Provides-Extra: dev
+Provides-Extra: accelerate
 License-File: LICENSE

-# compressed_tensors
+# compressed-tensors

-This repository extends a [safetensors](https://github.com/huggingface/safetensors) format to efficiently store sparse and/or quantized tensors on disk. `compressed-tensors` format supports multiple compression types to minimize the disk space and facilitate the tensor manipulation.
+The `compressed-tensors` library extends the [safetensors](https://github.com/huggingface/safetensors) format, providing a versatile and efficient way to store and manage compressed tensor data. This library supports various quantization and sparsity schemes, making it a unified format for handling different model optimizations like GPTQ, AWQ, SmoothQuant, INT8, FP8, SparseGPT, and more.

-## Motivation
+## Why `compressed-tensors`?

-### Reduce disk space by saving sparse tensors in a compressed format
+As model compression becomes increasingly important for efficient deployment of LLMs, the landscape of quantization and compression techniques has become increasingly fragmented.
+Each method often comes with its own storage format and loading procedures, making it challenging to work with multiple techniques or switch between them.
+`compressed-tensors` addresses this by providing a single, extensible format that can represent a wide variety of compression schemes.

-The compressed format stores the data much more efficiently by taking advantage of two properties of tensors:
+* **Unified Checkpoint Format**: Supports various compression schemes in a single, consistent format.
+* **Wide Compatibility**: Works with popular quantization methods like GPTQ, SmoothQuant, and FP8. See [llm-compressor](https://github.com/vllm-project/llm-compressor)
+* **Flexible Quantization Support**:
+  * Weight-only quantization (e.g., W4A16, W8A16, WnA16)
+  * Activation quantization (e.g., W8A8)
+  * KV cache quantization
+  * Non-uniform schemes (different layers can be quantized in different ways!)
+* **Sparsity Support**: Handles both unstructured and semi-structured (e.g., 2:4) sparsity patterns.
+* **Open-Source Integration**: Designed to work seamlessly with Hugging Face models and PyTorch.

-- Sparse tensors -> due to a large number of entries that are equal to zero.
-- Quantized -> due to their low precision representation.
-
-### Introduce an elegant interface to save/load compressed tensors
-
-The library provides the user with the ability to compress/decompress tensors. The properties of tensors are defined by human-readable configs, allowing the users to understand the compression format at a quick glance.
+This allows developers and researchers to easily experiment with composing different quantization methods, simplify model deployment pipelines, and reduce the overhead of supporting multiple compression formats in inference engines.

 ## Installation

-### Pip
+### From [PyPI](https://pypi.org/project/compressed-tensors)

+Stable release:
 ```bash
 pip install compressed-tensors
 ```

-### From source
+Nightly release:
+```bash
+pip install compressed-tensors-nightly
+```
+
+### From Source

 ```bash
 git clone https://github.com/neuralmagic/compressed-tensors
--- compressed-tensors-0.4.0/README.md
+++ compressed-tensors-0.6.0/README.md
@@ -1,29 +1,40 @@
-# compressed_tensors
+# compressed-tensors

-This repository extends a [safetensors](https://github.com/huggingface/safetensors) format to efficiently store sparse and/or quantized tensors on disk. `compressed-tensors` format supports multiple compression types to minimize the disk space and facilitate the tensor manipulation.
+The `compressed-tensors` library extends the [safetensors](https://github.com/huggingface/safetensors) format, providing a versatile and efficient way to store and manage compressed tensor data. This library supports various quantization and sparsity schemes, making it a unified format for handling different model optimizations like GPTQ, AWQ, SmoothQuant, INT8, FP8, SparseGPT, and more.

-## Motivation
+## Why `compressed-tensors`?

-### Reduce disk space by saving sparse tensors in a compressed format
+As model compression becomes increasingly important for efficient deployment of LLMs, the landscape of quantization and compression techniques has become increasingly fragmented.
+Each method often comes with its own storage format and loading procedures, making it challenging to work with multiple techniques or switch between them.
+`compressed-tensors` addresses this by providing a single, extensible format that can represent a wide variety of compression schemes.

-The compressed format stores the data much more efficiently by taking advantage of two properties of tensors:
+* **Unified Checkpoint Format**: Supports various compression schemes in a single, consistent format.
+* **Wide Compatibility**: Works with popular quantization methods like GPTQ, SmoothQuant, and FP8. See [llm-compressor](https://github.com/vllm-project/llm-compressor)
+* **Flexible Quantization Support**:
+  * Weight-only quantization (e.g., W4A16, W8A16, WnA16)
+  * Activation quantization (e.g., W8A8)
+  * KV cache quantization
+  * Non-uniform schemes (different layers can be quantized in different ways!)
+* **Sparsity Support**: Handles both unstructured and semi-structured (e.g., 2:4) sparsity patterns.
+* **Open-Source Integration**: Designed to work seamlessly with Hugging Face models and PyTorch.

-- Sparse tensors -> due to a large number of entries that are equal to zero.
-- Quantized -> due to their low precision representation.
-
-### Introduce an elegant interface to save/load compressed tensors
-
-The library provides the user with the ability to compress/decompress tensors. The properties of tensors are defined by human-readable configs, allowing the users to understand the compression format at a quick glance.
+This allows developers and researchers to easily experiment with composing different quantization methods, simplify model deployment pipelines, and reduce the overhead of supporting multiple compression formats in inference engines.

 ## Installation

-### Pip
+### From [PyPI](https://pypi.org/project/compressed-tensors)

+Stable release:
 ```bash
 pip install compressed-tensors
 ```

-### From source
+Nightly release:
+```bash
+pip install compressed-tensors-nightly
+```
+
+### From Source

 ```bash
 git clone https://github.com/neuralmagic/compressed-tensors
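For reference, the save/load helpers the rewritten README builds on are re-exported from `compressed_tensors.compressors` (see the `__init__.py` diff below). A minimal round-trip sketch, assuming the 0.4.0-era `save_compressed`/`load_compressed` signatures and the top-level `BitmaskConfig` export are unchanged in 0.6.0:

```python
# Sketch only: assumes save_compressed/load_compressed keep their 0.4.0 signatures
# and that BitmaskConfig is re-exported from the package root.
from typing import Dict

from torch import Tensor
from compressed_tensors import BitmaskConfig, load_compressed, save_compressed

# bitmask compression suits tensors with many zero entries
compression_config = BitmaskConfig()

tensors: Dict[str, Tensor] = {
    "tensor_1": Tensor([[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]]),
}

# compress the tensors and write them to disk in the bitmask format
save_compressed(tensors, "model.safetensors", compression_format=compression_config.format)

# load_compressed yields (name, tensor) pairs so checkpoints can be decompressed lazily
decompressed = {
    name: tensor
    for name, tensor in load_compressed("model.safetensors", compression_config=compression_config)
}
```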
--- compressed-tensors-0.4.0/setup.py
+++ compressed-tensors-0.6.0/setup.py
@@ -49,7 +49,10 @@ def _setup_install_requires() -> List:
     return ["torch>=1.7.0", "transformers", "pydantic>=2.0"]

 def _setup_extras() -> Dict:
-    return {"dev": ["black==22.12.0", "isort==5.8.0", "wheel>=0.36.2", "flake8>=3.8.3", "pytest>=6.0.0", "nbconvert>=7.16.3"]}
+    return {
+        "dev": ["black==22.12.0", "isort==5.8.0", "wheel>=0.36.2", "flake8>=3.8.3", "pytest>=6.0.0", "nbconvert>=7.16.3"],
+        "accelerate": ["accelerate"]
+    }

 setup(
     name=_PACKAGE_NAME,
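The only functional change here is the new `accelerate` extra, which makes `accelerate` an optional dependency installable via `pip install compressed-tensors[accelerate]`; it presumably backs the new `utils/offload.py` helpers listed as file 34 above.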
--- compressed-tensors-0.4.0/src/compressed_tensors/base.py
+++ compressed-tensors-0.6.0/src/compressed_tensors/base.py
@@ -15,3 +15,4 @@
 SPARSITY_CONFIG_NAME = "sparsity_config"
 QUANTIZATION_CONFIG_NAME = "quantization_config"
 COMPRESSION_CONFIG_NAME = "compression_config"
+KV_CACHE_SCHEME_NAME = "kv_cache_scheme"
--- compressed-tensors-0.4.0/src/compressed_tensors/compressors/__init__.py
+++ compressed-tensors-0.6.0/src/compressed_tensors/compressors/__init__.py
@@ -17,8 +17,12 @@
 from .base import Compressor
 from .dense import DenseCompressor
 from .helpers import load_compressed, save_compressed, save_compressed_model
-from .int_quantized import IntQuantizationCompressor
 from .marlin_24 import Marlin24Compressor
 from .model_compressor import ModelCompressor, map_modules_to_quant_args
+from .naive_quantized import (
+    FloatQuantizationCompressor,
+    IntQuantizationCompressor,
+    QuantizationCompressor,
+)
 from .pack_quantized import PackedQuantizationCompressor
 from .sparse_bitmask import BitmaskCompressor, BitmaskTensor
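The quantized compressors are now split into integer, float, and generic variants, with `FloatQuantizationCompressor` presumably covering the FP8 path advertised in the README. Since `Compressor` is a `RegistryMixin` (see the new `compressors/base.py` below), a concrete compressor is typically resolved by its registered format name; a rough sketch, with the enum member and registry helper assumed rather than taken from this diff:

```python
# Sketch only: assumes CompressionFormat.int_quantized exists and that
# RegistryMixin.load_from_registry(name, **kwargs) constructs the registered class.
from compressed_tensors.compressors import Compressor
from compressed_tensors.config import CompressionFormat

# instantiate the compressor registered under the "int-quantized" format name
compressor = Compressor.load_from_registry(CompressionFormat.int_quantized.value)
```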
--- /dev/null
+++ compressed-tensors-0.6.0/src/compressed_tensors/compressors/base.py
@@ -0,0 +1,252 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from typing import Dict, Generator, Optional, Tuple, Union
+
+import torch
+from compressed_tensors.config import SparsityCompressionConfig
+from compressed_tensors.quantization import QuantizationArgs, QuantizationConfig
+from compressed_tensors.registry import RegistryMixin
+from compressed_tensors.utils import get_nested_weight_mappings, merge_names
+from safetensors import safe_open
+from torch import Tensor
+from torch.nn.modules import Module
+from tqdm import tqdm
+
+
+_LOGGER: logging.Logger = logging.getLogger(__name__)
+
+__all__ = ["Compressor"]
+
+
+class Compressor(RegistryMixin):
+    """
+    Base class representing a model compression algorithm. Each child class should
+    implement compression_param_info, compress_weight and decompress_weight.
+
+    Compressors support compressing/decompressing a full module state dict or a single
+    quantized PyTorch leaf module.
+
+    Model Load Lifecycle (run_compressed=False):
+        - ModelCompressor.decompress()
+            - apply_quantization_config()
+            - Compressor.decompress()
+                - Compressor.decompress_weight()
+
+    Model Save Lifecycle:
+        - ModelCompressor.compress()
+            - Compressor.compress()
+                - Compressor.compress_weight()
+
+    Module Lifecycle (run_compressed=True):
+        - apply_quantization_config()
+        - compressed_module = CompressedLinear(module)
+            - initialize_module_for_quantization()
+            - Compressor.compression_param_info()
+            - register_parameters()
+        - compressed_module.forward()
+            -compressed_module.decompress()
+
+
+    :param config: config specifying compression parameters
+    """
+
+    def __init__(
+        self, config: Union[SparsityCompressionConfig, QuantizationConfig, None] = None
+    ):
+        self.config = config
+
+    def compression_param_info(
+        self,
+        weight_shape: torch.Size,
+        quantization_args: Optional[QuantizationArgs] = None,
+    ) -> Dict[str, Tuple[torch.Size, torch.dtype]]:
+        """
+        Creates a dictionary of expected shapes and dtypes for each compression
+        parameter used by the compressor
+
+        :param weight_shape: uncompressed weight shape
+        :param quantization_args: quantization parameters for the weight
+        :return: dictionary mapping compressed parameter names to shape and dtype
+        """
+        raise NotImplementedError()
+
+    def compress(
+        self,
+        model_state: Dict[str, Tensor],
+        names_to_scheme: Dict[str, QuantizationArgs],
+        **kwargs,
+    ) -> Dict[str, Tensor]:
+        """
+        Compresses a dense state dict
+
+        :param model_state: state dict of uncompressed model
+        :param names_to_scheme: quantization args for each quantized weight, needed for
+            quantize function to calculate bit depth
+        :return: compressed state dict
+        """
+        compressed_dict = {}
+        weight_suffix = ".weight"
+        _LOGGER.debug(
+            f"Compressing model with {len(model_state)} parameterized layers..."
+        )
+
+        for name, value in tqdm(model_state.items(), desc="Compressing model"):
+            if name.endswith(weight_suffix):
+                prefix = name[: -(len(weight_suffix))]
+                scale = model_state.get(merge_names(prefix, "weight_scale"), None)
+                zp = model_state.get(merge_names(prefix, "weight_zero_point"), None)
+                g_idx = model_state.get(merge_names(prefix, "weight_g_idx"), None)
+                if scale is not None:
+                    # weight is quantized, compress it
+                    quant_args = names_to_scheme[prefix]
+                    compressed_data = self.compress_weight(
+                        weight=value,
+                        scale=scale,
+                        zero_point=zp,
+                        g_idx=g_idx,
+                        quantization_args=quant_args,
+                        device="cpu",
+                    )
+                    for key, value in compressed_data.items():
+                        compressed_dict[merge_names(prefix, key)] = value
+                else:
+                    compressed_dict[name] = value.to("cpu")
+            elif name.endswith("zero_point") and torch.all(value == 0):
+                continue
+            elif name.endswith("g_idx") and torch.any(value <= -1):
+                continue
+            else:
+                compressed_dict[name] = value.to("cpu")
+
+        return compressed_dict
+
+    def decompress(
+        self,
+        path_to_model_or_tensors: str,
+        names_to_scheme: Dict[str, QuantizationArgs],
+        device: str = "cpu",
+    ) -> Generator[Tuple[str, Tensor], None, None]:
+        """
+        Reads a compressed state dict located at path_to_model_or_tensors
+        and returns a generator for sequentially decompressing back to a
+        dense state dict
+
+        :param path_to_model_or_tensors: path to compressed safetensors model (directory
+            with one or more safetensors files) or compressed tensors file
+        :param names_to_scheme: quantization args for each quantized weight
+        :param device: optional device to load intermediate weights into
+        :return: compressed state dict
+        """
+        weight_mappings = get_nested_weight_mappings(
+            path_to_model_or_tensors, self.COMPRESSION_PARAM_NAMES
+        )
+        for weight_name in weight_mappings.keys():
+            weight_data = {}
+            for param_name, safe_path in weight_mappings[weight_name].items():
+                full_name = merge_names(weight_name, param_name)
+                with safe_open(safe_path, framework="pt", device=device) as f:
+                    weight_data[param_name] = f.get_tensor(full_name)
+
+            if "weight_scale" in weight_data:
+                quant_args = names_to_scheme[weight_name]
+                decompressed = self.decompress_weight(
+                    compressed_data=weight_data, quantization_args=quant_args
+                )
+                yield merge_names(weight_name, "weight"), decompressed
+
+    def compress_weight(
+        self,
+        weight: Tensor,
+        scale: Tensor,
+        zero_point: Optional[Tensor] = None,
+        g_idx: Optional[torch.Tensor] = None,
+        quantization_args: Optional[QuantizationArgs] = None,
+    ) -> Dict[str, torch.Tensor]:
+        """
+        Compresses a single uncompressed weight
+
+        :param weight: uncompressed weight tensor
+        :param scale: quantization scale for weight
+        :param zero_point: quantization zero point for weight
+        :param g_idx: optional mapping from column index to group index
+        :param quantization_args: quantization parameters for weight
+        :return: dictionary of compressed weight data
+        """
+        raise NotImplementedError()
+
+    def decompress_weight(
+        self,
+        compressed_data: Dict[str, Tensor],
+        quantization_args: Optional[QuantizationArgs] = None,
+    ) -> torch.Tensor:
+        """
+        Decompresses a single compressed weight
+
+        :param compressed_data: dictionary of data needed for decompression
+        :param quantization_args: quantization parameters for the weight
+        :return: tensor of the decompressed weight
+        """
+        raise NotImplementedError()
+
+    def compress_module(self, module: Module) -> Optional[Dict[str, torch.Tensor]]:
+        """
+        Compresses a single quantized leaf PyTorch module. If the module is not
+        quantized, this function has no effect.
+
+        :param module: PyTorch module to compress
+        :return: dictionary of compressed weight data, or None if module is not
+            quantized
+        """
+        if not hasattr(module, "quantization_scheme"):
+            return None  # module is not quantized
+        quantization_scheme = module.quantization_scheme
+        if not hasattr(quantization_scheme, "weights"):
+            return None  # weights are not quantized
+
+        quantization_args = quantization_scheme.weights
+        weight = getattr(module, "weight", None)
+        weight_scale = getattr(module, "weight_scale", None)
+        weight_zero_point = getattr(module, "weight_zero_point", None)
+
+        return self.compress_weight(
+            weight=weight,
+            scale=weight_scale,
+            zero_point=weight_zero_point,
+            quantization_args=quantization_args,
+        )
+
+    def decompress_module(self, module: Module):
+        """
+        Decompresses a single compressed leaf PyTorch module. If the module is not
+        quantized, this function has no effect.
+
+        :param module: PyTorch module to decompress
+        :return: tensor of the decompressed weight, or None if module is not quantized
+        """
+        if not hasattr(module, "quantization_scheme"):
+            return None  # module is not quantized
+        quantization_scheme = module.quantization_scheme
+        if not hasattr(quantization_scheme, "weights"):
+            return None  # weights are not quantized
+
+        quantization_args = quantization_scheme.weights
+        compressed_data = {}
+        for name, parameter in module.named_parameters():
+            compressed_data[name] = parameter
+
+        return self.decompress_weight(
+            compressed_data=compressed_data, quantization_args=quantization_args
+        )
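Taken together, the new base class fixes the contract a format-specific compressor must satisfy: declare the parameter names it writes, and implement the per-weight hooks that `compress()`/`decompress()` drive. A hypothetical subclass, purely for illustration (the `register` decorator mirrors how other compressors in this package appear to register themselves, the `COMPRESSION_PARAM_NAMES` usage is inferred from `decompress()` above, and the format name is made up):

```python
# Hypothetical sketch, not part of the package: a minimal compressor that stores the
# quantized weight and its scale unchanged, just to show where the hooks plug in.
from typing import Dict, Optional, Tuple

import torch
from torch import Tensor
from compressed_tensors.compressors import Compressor
from compressed_tensors.quantization import QuantizationArgs


@Compressor.register(name="identity-quantized")  # illustrative format name
class IdentityCompressor(Compressor):
    # parameter names that Compressor.decompress() above looks up in the safetensors files
    COMPRESSION_PARAM_NAMES = ["weight", "weight_scale", "weight_zero_point"]

    def compression_param_info(
        self,
        weight_shape: torch.Size,
        quantization_args: Optional[QuantizationArgs] = None,
    ) -> Dict[str, Tuple[torch.Size, torch.dtype]]:
        # used (per the lifecycle docstring) to pre-register parameters of the right shape/dtype
        return {"weight": (weight_shape, torch.float16)}

    def compress_weight(self, weight: Tensor, scale: Tensor, **kwargs) -> Dict[str, Tensor]:
        # kwargs absorbs zero_point, g_idx, quantization_args, and device passed by compress()
        return {"weight": weight.to(torch.float16), "weight_scale": scale}

    def decompress_weight(
        self,
        compressed_data: Dict[str, Tensor],
        quantization_args: Optional[QuantizationArgs] = None,
    ) -> Tensor:
        return compressed_data["weight"]
```

Note that `compress()` above passes `zero_point`, `g_idx`, `quantization_args`, and `device` as keywords, which is why the sketch absorbs them via `**kwargs`.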
--- compressed-tensors-0.4.0/src/compressed_tensors/compressors/dense.py
+++ compressed-tensors-0.6.0/src/compressed_tensors/compressors/dense.py
@@ -29,6 +29,6 @@ class DenseCompressor(Compressor):
         return model_state

     def decompress(
-        self, path_to_model_or_tensors: str, device: str = "cpu"
+        self, path_to_model_or_tensors: str, device: str = "cpu", **kwargs
     ) -> Generator[Tuple[str, Tensor], None, None]:
         return iter([])
--- compressed-tensors-0.4.0/src/compressed_tensors/compressors/marlin_24.py
+++ compressed-tensors-0.6.0/src/compressed_tensors/compressors/marlin_24.py
@@ -18,15 +18,16 @@ from typing import Dict, Generator, Tuple
 import numpy as np
 import torch
 from compressed_tensors.compressors import Compressor
-from compressed_tensors.compressors.utils import (
+from compressed_tensors.config import CompressionFormat
+from compressed_tensors.quantization import QuantizationArgs, QuantizationStrategy
+from compressed_tensors.quantization.lifecycle.forward import quantize
+from compressed_tensors.utils import (
     get_permutations_24,
+    is_quantization_param,
+    merge_names,
     sparse_semi_structured_from_dense_cutlass,
     tensor_follows_mask_structure,
 )
-from compressed_tensors.config import CompressionFormat
-from compressed_tensors.quantization import QuantizationArgs, QuantizationStrategy
-from compressed_tensors.quantization.lifecycle.forward import quantize
-from compressed_tensors.utils import is_quantization_param, merge_names

 from torch import Tensor
 from tqdm import tqdm
@@ -107,7 +108,7 @@ class Marlin24Compressor(Compressor):
     def compress(
         self,
         model_state: Dict[str, Tensor],
-        model_quant_args: Dict[str, QuantizationArgs],
+        names_to_scheme: Dict[str, QuantizationArgs],
         **kwargs,
     ) -> Dict[str, Tensor]:
         """
@@ -115,11 +116,11 @@ class Marlin24Compressor(Compressor):
         with the Marlin24 kernel

         :param model_state: state dict of uncompressed model
-        :param model_quant_args: quantization args for each quantized weight, needed for
+        :param names_to_scheme: quantization args for each quantized weight, needed for
            quantize function to calculate bit depth
         :return: compressed state dict
         """
-        self.validate_quant_compatability(model_quant_args)
+        self.validate_quant_compatability(names_to_scheme)

         compressed_dict = {}
         weight_suffix = ".weight"
@@ -139,7 +140,7 @@ class Marlin24Compressor(Compressor):
                 value = value.to(torch.float16)

                 # quantize weight, keeping it as a float16 for now
-                quant_args = model_quant_args[prefix]
+                quant_args = names_to_scheme[prefix]
                 value = quantize(
                     x=value, scale=scale, zero_point=zp, args=quant_args
                 )
@@ -175,7 +176,7 @@ class Marlin24Compressor(Compressor):
         return compressed_dict

     def decompress(
-        self, path_to_model_or_tensors: str, device: str = "cpu"
+        self, path_to_model_or_tensors: str, device: str = "cpu", **kwargs
     ) -> Generator[Tuple[str, Tensor], None, None]:
         raise NotImplementedError(
             "Decompression is not implemented for the Marlin24 Compressor."