compressed-tensors 0.3.3__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. compressed_tensors/base.py +2 -1
  2. compressed_tensors/compressors/__init__.py +5 -1
  3. compressed_tensors/compressors/base.py +11 -54
  4. compressed_tensors/compressors/dense.py +4 -4
  5. compressed_tensors/compressors/helpers.py +12 -12
  6. compressed_tensors/compressors/int_quantized.py +126 -0
  7. compressed_tensors/compressors/marlin_24.py +250 -0
  8. compressed_tensors/compressors/model_compressor.py +315 -0
  9. compressed_tensors/compressors/pack_quantized.py +212 -0
  10. compressed_tensors/compressors/sparse_bitmask.py +3 -3
  11. compressed_tensors/compressors/utils/__init__.py +19 -0
  12. compressed_tensors/compressors/utils/helpers.py +43 -0
  13. compressed_tensors/compressors/utils/permutations_24.py +65 -0
  14. compressed_tensors/compressors/utils/semi_structured_conversions.py +341 -0
  15. compressed_tensors/config/base.py +7 -4
  16. compressed_tensors/config/dense.py +4 -4
  17. compressed_tensors/config/sparse_bitmask.py +3 -3
  18. compressed_tensors/quantization/lifecycle/__init__.py +1 -0
  19. compressed_tensors/quantization/lifecycle/apply.py +62 -11
  20. compressed_tensors/quantization/lifecycle/compressed.py +69 -0
  21. compressed_tensors/quantization/lifecycle/forward.py +161 -54
  22. compressed_tensors/quantization/lifecycle/frozen.py +4 -0
  23. compressed_tensors/quantization/lifecycle/initialize.py +33 -5
  24. compressed_tensors/quantization/observers/base.py +31 -27
  25. compressed_tensors/quantization/observers/helpers.py +6 -1
  26. compressed_tensors/quantization/observers/memoryless.py +17 -9
  27. compressed_tensors/quantization/observers/min_max.py +44 -13
  28. compressed_tensors/quantization/quant_args.py +2 -2
  29. compressed_tensors/quantization/quant_config.py +69 -21
  30. compressed_tensors/quantization/quant_scheme.py +81 -1
  31. compressed_tensors/quantization/utils/helpers.py +76 -8
  32. compressed_tensors/utils/helpers.py +24 -6
  33. compressed_tensors/utils/safetensors_load.py +3 -2
  34. compressed_tensors/version.py +53 -0
  35. {compressed_tensors-0.3.3.dist-info → compressed_tensors-0.4.0.dist-info}/METADATA +46 -8
  36. compressed_tensors-0.4.0.dist-info/RECORD +48 -0
  37. compressed_tensors-0.3.3.dist-info/RECORD +0 -38
  38. {compressed_tensors-0.3.3.dist-info → compressed_tensors-0.4.0.dist-info}/LICENSE +0 -0
  39. {compressed_tensors-0.3.3.dist-info → compressed_tensors-0.4.0.dist-info}/WHEEL +0 -0
  40. {compressed_tensors-0.3.3.dist-info → compressed_tensors-0.4.0.dist-info}/top_level.txt +0 -0
compressed_tensors/quantization/quant_config.py
@@ -13,10 +13,13 @@
 # limitations under the License.

 from enum import Enum
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Union

-from compressed_tensors.base import QUANTIZATION_CONFIG_NAME
-from compressed_tensors.quantization.quant_scheme import QuantizationScheme
+from compressed_tensors.config import CompressionFormat
+from compressed_tensors.quantization.quant_scheme import (
+    QuantizationScheme,
+    preset_name_to_scheme,
+)
 from compressed_tensors.quantization.utils import (
     calculate_compression_ratio,
     is_module_quantized,
@@ -25,13 +28,14 @@ from compressed_tensors.quantization.utils import (
 )
 from pydantic import BaseModel, Field
 from torch.nn import Module
-from transformers import AutoConfig


 __all__ = [
     "QuantizationStatus",
     "QuantizationConfig",
     "LIFECYCLE_ORDER",
+    "DEFAULT_QUANTIZATION_METHOD",
+    "DEFAULT_QUANTIZATION_FORMAT",
 ]


@@ -62,10 +66,33 @@ class QuantizationStatus(str, Enum):
         return

     def __ge__(self, other):
+        if other is None:
+            return True
         if not isinstance(other, self.__class__):
             raise NotImplementedError
         return LIFECYCLE_ORDER.index(self) >= LIFECYCLE_ORDER.index(other)

+    def __gt__(self, other):
+        if other is None:
+            return True
+        if not isinstance(other, self.__class__):
+            raise NotImplementedError
+        return LIFECYCLE_ORDER.index(self) > LIFECYCLE_ORDER.index(other)
+
+    def __lt__(self, other):
+        if other is None:
+            return False
+        if not isinstance(other, self.__class__):
+            raise NotImplementedError
+        return LIFECYCLE_ORDER.index(self) < LIFECYCLE_ORDER.index(other)
+
+    def __le__(self, other):
+        if other is None:
+            return False
+        if not isinstance(other, self.__class__):
+            raise NotImplementedError
+        return LIFECYCLE_ORDER.index(self) <= LIFECYCLE_ORDER.index(other)
+

 LIFECYCLE_ORDER = [
     QuantizationStatus.INITIALIZED,
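The new rich comparisons order statuses by their position in `LIFECYCLE_ORDER`, with `None` treated as preceding every status. A minimal sketch (not part of the diff) of the resulting behavior:

```python
# Sketch of the comparison behavior added above; imports come from the module
# shown in this hunk (compressed_tensors/quantization/quant_config.py).
from compressed_tensors.quantization.quant_config import QuantizationStatus

# Statuses compare by LIFECYCLE_ORDER position.
assert QuantizationStatus.INITIALIZED < QuantizationStatus.COMPRESSED
assert QuantizationStatus.COMPRESSED >= QuantizationStatus.INITIALIZED

# None sorts before every status: __gt__/__ge__ return True, __lt__/__le__ return False.
assert QuantizationStatus.INITIALIZED > None
assert not (QuantizationStatus.INITIALIZED <= None)
```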
@@ -74,6 +101,9 @@ LIFECYCLE_ORDER = [
     QuantizationStatus.COMPRESSED,
 ]

+DEFAULT_QUANTIZATION_METHOD = "compressed-tensors"
+DEFAULT_QUANTIZATION_FORMAT = "fakequant"
+

 class QuantizationConfig(BaseModel):
     """
@@ -81,7 +111,8 @@ class QuantizationConfig(BaseModel):
     mapped to a QuantizationScheme in config_groups.

     :param config_groups: dict of QuantizationSchemes specifying the quantization
-        settings for each quantized layer
+        settings for each quantized layer. A group could also be a reference to
+        a predefined scheme name, mapped to a list of its target layers/classes
     :param quant_method: a constant used to differentiate sparseML quantization from
         other quantization configs
     :param format: specifies how the quantized model is stored on disk
@@ -93,30 +124,34 @@
     are not quantized even if they match up with a target in config_groups
     """

-    config_groups: Dict[str, QuantizationScheme]
-    quant_method: str = "sparseml"
-    format: str = "fakequant"
+    config_groups: Dict[str, Union[QuantizationScheme, List[str]]]
+    quant_method: str = DEFAULT_QUANTIZATION_METHOD
+    format: str = DEFAULT_QUANTIZATION_FORMAT
     quantization_status: QuantizationStatus = QuantizationStatus.INITIALIZED
     global_compression_ratio: Optional[float] = None
     ignore: Optional[List[str]] = Field(default_factory=list)

-    @staticmethod
-    def from_model_config(model_name_or_path) -> "QuantizationConfig":
+    def model_post_init(self, __context):
         """
-        Given a path to a model config, extract a quantization config if it exists
-
-        :param pretrained_model_name_or_path: path to model config on disk or HF hub
-        :return: instantiated QuantizationConfig if config contains a quant config
+        updates any quantization schemes defined as presets to be fully loaded
+        schemes
         """
-        config = AutoConfig.from_pretrained(model_name_or_path)
-        quantization_config = getattr(config, QUANTIZATION_CONFIG_NAME, None)
-        if quantization_config is None:
-            return None
-
-        return QuantizationConfig.parse_obj(quantization_config)
+        for group_name, targets_or_scheme in self.config_groups.items():
+            if isinstance(targets_or_scheme, QuantizationScheme):
+                continue  # scheme already defined
+            self.config_groups[group_name] = preset_name_to_scheme(
+                name=group_name,
+                targets=targets_or_scheme,
+            )
+
+    def to_dict(self):
+        # for compatibility with HFQuantizer
+        return self.dict()

     @staticmethod
-    def from_pretrained(model: Module) -> "QuantizationConfig":
+    def from_pretrained(
+        model: Module, format: Optional[str] = None
+    ) -> Optional["QuantizationConfig"]:
         """
         Converts a model into its associated QuantizationConfig based on the
         QuantizationScheme attached to each quanitzed module
@@ -147,6 +182,9 @@ class QuantizationConfig(BaseModel):
             if not match_found:
                 quant_scheme_to_layers.append(scheme)

+        if len(quant_scheme_to_layers) == 0:  # No quantized layers
+            return None
+
         # clean up ignore list, we can leave out layers types if none of the
         # instances are quantized
         consolidated_ignore = []
@@ -162,10 +200,20 @@ class QuantizationConfig(BaseModel):
             group_name = "group_" + str(idx)
             config_groups[group_name] = scheme

+        # TODO: this is incorrect in compressed mode, since we are overwriting the
+        # original weight we lose the uncompressed bit_depth indo
         compression_ratio = calculate_compression_ratio(model)
+
+        if format is None:
+            if quantization_status == QuantizationStatus.COMPRESSED:
+                format = CompressionFormat.int_quantized.value
+            else:
+                format = CompressionFormat.dense.value
+
         return QuantizationConfig(
             config_groups=config_groups,
             quantization_status=quantization_status,
             global_compression_ratio=compression_ratio,
+            format=format,
             ignore=consolidated_ignore,
         )
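With this change, `config_groups` entries may be either a full `QuantizationScheme` or a preset name mapped to a list of targets, and `model_post_init` (invoked by pydantic, which is now required at >=2.0 per the updated METADATA below) expands the presets. A rough usage sketch, not taken from the diff:

```python
# Sketch: a preset group name is expanded into a full QuantizationScheme.
from compressed_tensors.quantization.quant_config import QuantizationConfig

# "W4A16" is one of the PRESET_SCHEMES defined in quant_scheme.py (next file below).
config = QuantizationConfig(config_groups={"W4A16": ["Linear"]})

scheme = config.config_groups["W4A16"]  # expanded by model_post_init
print(type(scheme).__name__)            # QuantizationScheme
print(scheme.targets)                   # ['Linear']
print(config.quant_method)              # 'compressed-tensors' (new default)
print(config.to_dict()["format"])       # 'fakequant' (new default, via to_dict)
```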
compressed_tensors/quantization/quant_scheme.py
@@ -12,13 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from copy import deepcopy
 from typing import List, Optional

 from compressed_tensors.quantization.quant_args import QuantizationArgs
 from pydantic import BaseModel


-__all__ = ["QuantizationScheme"]
+__all__ = [
+    "QuantizationScheme",
+    "preset_name_to_scheme",
+    "is_preset_scheme",
+]


 class QuantizationScheme(BaseModel):
@@ -37,3 +42,78 @@
     weights: Optional[QuantizationArgs] = None
     input_activations: Optional[QuantizationArgs] = None
     output_activations: Optional[QuantizationArgs] = None
+
+    @classmethod
+    def default_scheme(
+        cls,
+        targets: Optional[List[str]] = None,
+    ):
+
+        if targets is None:
+            # default to quantizing all Linear layers
+            targets = ["Linear"]
+
+        # default to 8 bit integer symmetric quantization
+        # for weights
+        weights = QuantizationArgs(num_bits=8, symmetric=True)
+
+        # default to 8 bit integer asymmetric quantization
+        input_activations = QuantizationArgs(num_bits=8, symmetric=True)
+
+        # Do not quantize the output activations
+        # by default
+        output_activations = None
+
+        return cls(
+            targets=targets,
+            weights=weights,
+            input_activations=input_activations,
+            output_activations=output_activations,
+        )
+
+
+"""
+Pre-Set Quantization Scheme Args
+"""
+
+
+def preset_name_to_scheme(name: str, targets: List[str]) -> QuantizationScheme:
+    """
+    :param name: preset quantization settings name. must exist in upper case in
+        PRESET_SCHEMES
+    :param targets: list of quantization targets to be passed to the Scheme
+    :return: new QuantizationScheme for a given name with the given targets
+    """
+    name = name.upper()
+
+    if name not in PRESET_SCHEMES:
+        raise KeyError(
+            f"Unknown preset scheme name {name}, "
+            f"available names: {list(PRESET_SCHEMES.keys())}"
+        )
+
+    scheme_args = deepcopy(PRESET_SCHEMES[name])  # deepcopy to avoid args references
+    return QuantizationScheme(
+        targets=targets,
+        **scheme_args,
+    )
+
+
+def is_preset_scheme(name: str) -> bool:
+    """
+    :param name: preset quantization settings name
+    :return: True if the name is a preset scheme name
+    """
+    return name.upper() in PRESET_SCHEMES
+
+
+W8A8 = dict(
+    weights=QuantizationArgs(), input_activations=QuantizationArgs(symmetric=True)
+)
+
+W4A16 = dict(weights=QuantizationArgs(num_bits=4, group_size=128))
+
+PRESET_SCHEMES = {
+    "W8A8": W8A8,
+    "W4A16": W4A16,
+}
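The preset helpers above can also be used directly; a small sketch (not from the diff) of their behavior:

```python
# Sketch of the new preset helpers defined in quant_scheme.py.
from compressed_tensors.quantization.quant_scheme import (
    QuantizationScheme,
    is_preset_scheme,
    preset_name_to_scheme,
)

assert is_preset_scheme("w4a16")  # lookup is case-insensitive

scheme = preset_name_to_scheme("W4A16", targets=["Linear"])
print(scheme.weights.num_bits)    # 4, with group_size=128 per the W4A16 preset
print(scheme.input_activations)   # None: W4A16 only defines weight quantization

default = QuantizationScheme.default_scheme()
print(default.targets)            # ['Linear'] when no targets are given
```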
compressed_tensors/quantization/utils/helpers.py
@@ -12,21 +12,43 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Tuple
+import logging
+from typing import Optional, Tuple

 import torch
+from compressed_tensors.quantization.observers.base import Observer
 from torch.nn import Module
 from tqdm import tqdm


 __all__ = [
+    "infer_quantization_status",
     "is_module_quantized",
     "is_model_quantized",
     "iter_named_leaf_modules",
     "module_type",
     "calculate_compression_ratio",
+    "get_torch_bit_depth",
+    "can_quantize",
 ]

+_LOGGER: logging.Logger = logging.getLogger(__name__)
+
+
+def infer_quantization_status(model: Module) -> Optional["QuantizationStatus"]:  # noqa
+    """
+    Checks the quantization status of a model. Assumes all modules in the model have
+    the same status, so only the first quantized model is checked.
+
+    :param model: model to check quantization status for
+    :return: quantization status if the model is quantized, otherwise None
+    """
+    for module in model.modules():
+        status = getattr(module, "quantization_status", None)
+        if status is not None:
+            return status
+    return None
+

 def is_module_quantized(module: Module) -> bool:
     """
@@ -78,11 +100,60 @@ def module_type(module: Module) -> str:


 def iter_named_leaf_modules(model: Module) -> Tuple[str, Module]:
-    # yields modules that do not have any submodules
-    # TODO: potentially expand to add list of allowed submodules such as observers
+    """
+    Yields modules that do not have any submodules except observers. The observers
+    themselves are not yielded
+
+    :param model: model to get leaf modules of
+    :returns: generator tuple of (name, leaf_submodule)
+    """
     for name, submodule in model.named_modules():
-        if len(list(submodule.children())) == 0:
+        children = list(submodule.children())
+        if len(children) == 0 and not isinstance(submodule, Observer):
             yield name, submodule
+        else:
+            has_non_observer_children = False
+            for child in children:
+                if not isinstance(child, Observer):
+                    has_non_observer_children = True
+
+            if not has_non_observer_children:
+                yield name, submodule
+
+
+def get_torch_bit_depth(value: torch.Tensor) -> int:
+    """
+    Determine the number of bits used to represent the dtype of a tensor
+
+    :param value: tensor to check bit depth of
+    :return: bit depth of each element in the value tensor
+    """
+    try:
+        bit_depth = torch.finfo(value.dtype).bits
+    except TypeError:
+        bit_depth = torch.iinfo(value.dtype).bits
+
+    return bit_depth
+
+
+def can_quantize(value: torch.Tensor, quant_args: "QuantizationArgs") -> bool:  # noqa
+    """
+    Checks if value can be quantized by quant_args.
+
+    :param value: tensor to check for quantization
+    :param quant_args: QuantizationArgs to use for quantization
+    :return: False if value is already quantized to quant_args or value is incompatible
+        with quant_args, True if value can be quantized with quant_args
+    """
+    bit_depth = get_torch_bit_depth(value)
+    requested_depth = quant_args.num_bits
+    if bit_depth < quant_args.num_bits:
+        _LOGGER.warn(
+            f"Can't quantize tensor with bit depth {bit_depth} to {requested_depth}."
+            "The QuantizationArgs provided are not compatible with the input tensor."
+        )
+
+    return bit_depth > quant_args.num_bits


 def calculate_compression_ratio(model: Module) -> float:
@@ -101,10 +172,7 @@ def calculate_compression_ratio(model: Module) -> float:
         desc="Calculating quantization compression ratio",
     ):
         for parameter in model.parameters():
-            try:
-                uncompressed_bits = torch.finfo(parameter.dtype).bits
-            except TypeError:
-                uncompressed_bits = torch.iinfo(parameter.dtype).bits
+            uncompressed_bits = get_torch_bit_depth(parameter)
             compressed_bits = uncompressed_bits
             if is_module_quantized(submodule):
                 compressed_bits = submodule.quantization_scheme.weights.num_bits
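A short sketch (not part of the diff) of how the new bit-depth helpers behave:

```python
# Sketch: get_torch_bit_depth falls back from finfo to iinfo, and can_quantize
# compares the tensor's bit depth against the requested num_bits.
import torch

from compressed_tensors.quantization.quant_args import QuantizationArgs
from compressed_tensors.quantization.utils.helpers import (
    can_quantize,
    get_torch_bit_depth,
)

weight = torch.randn(4, 4, dtype=torch.float16)
print(get_torch_bit_depth(weight))                 # 16 (torch.finfo)
print(get_torch_bit_depth(weight.to(torch.int8)))  # 8 (torch.iinfo fallback)

args = QuantizationArgs(num_bits=8)
print(can_quantize(weight, args))                  # True: 16 > 8
print(can_quantize(weight.to(torch.int8), args))   # False: already at 8 bits
```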
compressed_tensors/utils/helpers.py
@@ -15,18 +15,17 @@

 from typing import Optional

-from compressed_tensors.base import SPARSITY_CONFIG_NAME
-from compressed_tensors.compressors import ModelCompressor
-from compressed_tensors.config import CompressionConfig
 from transformers import AutoConfig


-__all__ = ["infer_compressor_from_model_config"]
+__all__ = ["infer_compressor_from_model_config", "fix_fsdp_module_name"]
+
+FSDP_WRAPPER_NAME = "_fsdp_wrapped_module"


 def infer_compressor_from_model_config(
     pretrained_model_name_or_path: str,
-) -> Optional[ModelCompressor]:
+) -> Optional["ModelCompressor"]:  # noqa: F821
     """
     Given a path to a model config, extract a sparsity config if it exists and return
     the associated ModelCompressor
@@ -34,8 +33,11 @@ def infer_compressor_from_model_config(
     :param pretrained_model_name_or_path: path to model config on disk or HF hub
     :return: matching compressor if config contains a sparsity config
     """
+    from compressed_tensors.compressors import ModelCompressor
+    from compressed_tensors.config import CompressionConfig
+
     config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
-    sparsity_config = getattr(config, SPARSITY_CONFIG_NAME, None)
+    sparsity_config = ModelCompressor.parse_sparsity_config(config)
     if sparsity_config is None:
         return None

@@ -43,3 +45,19 @@ def infer_compressor_from_model_config(
     sparsity_config = CompressionConfig.load_from_registry(format, **sparsity_config)
     compressor = ModelCompressor.load_from_registry(format, config=sparsity_config)
     return compressor
+
+
+# TODO: There is already the same function in
+# SparseML, should be moved to a shared location
+# in the future
+def fix_fsdp_module_name(name: str) -> str:
+    """
+    Remove FSDP wrapper prefixes from a module name
+    Accounts for scenario where FSDP_WRAPPER_NAME is
+    at the end of the name, as well as in the middle.
+    :param name: name to strip
+    :return: stripped name
+    """
+    return name.replace(FSDP_WRAPPER_NAME + ".", "").replace(
+        "." + FSDP_WRAPPER_NAME, ""
+    )
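The new `fix_fsdp_module_name` helper strips the FSDP wrapper segment wherever it appears in a dotted module name; a quick sketch (the module names are illustrative, not from the diff):

```python
# Sketch: FSDP wrapper segments are removed whether they lead the name or sit
# in the middle of it.
from compressed_tensors.utils.helpers import fix_fsdp_module_name

print(fix_fsdp_module_name("_fsdp_wrapped_module.model.layers.0.self_attn.q_proj"))
# model.layers.0.self_attn.q_proj
print(fix_fsdp_module_name("model.layers.0._fsdp_wrapped_module.mlp.down_proj"))
# model.layers.0.mlp.down_proj
```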
compressed_tensors/utils/safetensors_load.py
@@ -31,6 +31,7 @@ __all__ = [
     "get_weight_mappings",
     "get_nested_weight_mappings",
     "get_quantization_state_dict",
+    "is_quantization_param",
 ]


@@ -214,7 +215,7 @@ def get_quantization_state_dict(model_path: str) -> Dict[str, Tensor]:
     weight_mappings = get_weight_mappings(model_path)
     state_dict = {}
     for weight_name, safe_path in weight_mappings.items():
-        if not _is_quantization_weight(weight_name):
+        if not is_quantization_param(weight_name):
             continue
         with safe_open(safe_path, framework="pt", device="cpu") as f:
             state_dict[weight_name] = f.get_tensor(weight_name)
@@ -222,7 +223,7 @@ def get_quantization_state_dict(model_path: str) -> Dict[str, Tensor]:
     return state_dict


-def _is_quantization_weight(name: str) -> bool:
+def is_quantization_param(name: str) -> bool:
     """
     Checks is a parameter name is associated with a quantization parameter

compressed_tensors/version.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Functionality for storing and setting the version info for SparseML
+"""
+
+
+version_base = "0.4.0"
+is_release = True  # change to True to set the generated version as a release version
+
+
+def _generate_version(
+    is_release: bool,
+    version_base: str,
+):
+    from datetime import date
+
+    if is_release:
+        return version_base
+    else:
+        return f"{version_base}.{date.today().strftime('%Y%m%d')}"
+
+
+__all__ = [
+    "__version__",
+    "version_base",
+    "is_release",
+    "version",
+    "version_major",
+    "version_minor",
+    "version_patch",
+    "version_build",
+    "version_major_minor",
+]
+__version__ = _generate_version(is_release, version_base)
+
+version = __version__
+version_major, version_minor, version_patch, version_build = version.split(".") + (
+    [None] if len(version.split(".")) < 4 else []
+)  # handle conditional for version being 3 parts or 4 (4 containing build date)
+version_major_minor = f"{version_major}.{version_minor}"
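For this release build (`is_release = True`) the generated version carries no date suffix and splits into three parts; a sketch of what the module resolves to:

```python
# Sketch: resolved values for the 0.4.0 release build of version.py.
from compressed_tensors import version

print(version.__version__)          # 0.4.0  (no date suffix since is_release=True)
print(version.version_major)        # 0
print(version.version_minor)        # 4
print(version.version_patch)        # 0
print(version.version_build)        # None   (only three dot-separated parts)
print(version.version_major_minor)  # 0.4
```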
{compressed_tensors-0.3.3.dist-info → compressed_tensors-0.4.0.dist-info}/METADATA
@@ -1,24 +1,23 @@
 Metadata-Version: 2.1
 Name: compressed-tensors
-Version: 0.3.3
+Version: 0.4.0
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
 Author-email: support@neuralmagic.com
 License: Apache 2.0
-Platform: UNKNOWN
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: pydantic <2.7
 Requires-Dist: torch >=1.7.0
-Requires-Dist: transformers <4.41
+Requires-Dist: transformers
+Requires-Dist: pydantic >=2.0
 Provides-Extra: dev
 Requires-Dist: black ==22.12.0 ; extra == 'dev'
-Requires-Dist: flake8 >=3.8.3 ; extra == 'dev'
 Requires-Dist: isort ==5.8.0 ; extra == 'dev'
-Requires-Dist: nbconvert >=7.16.3 ; extra == 'dev'
-Requires-Dist: pytest >=6.0.0 ; extra == 'dev'
 Requires-Dist: wheel >=0.36.2 ; extra == 'dev'
+Requires-Dist: flake8 >=3.8.3 ; extra == 'dev'
+Requires-Dist: pytest >=6.0.0 ; extra == 'dev'
+Requires-Dist: nbconvert >=7.16.3 ; extra == 'dev'

 # compressed_tensors

@@ -90,7 +89,7 @@ from compressed_tensors import save_compressed_model, load_compressed, BitmaskCo
 from transformers import AutoModelForCausalLM

 model_name = "neuralmagic/llama2.c-stories110M-pruned50"
-model = AutoModelForCausalLM.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")

 original_state_dict = model.state_dict()

@@ -106,3 +105,42 @@ state_dict = dict(load_compressed("compressed_model.safetensors", compression_co
 For more in-depth tutorial on bitmask compression, refer to the [notebook](https://github.com/neuralmagic/compressed-tensors/blob/d707c5b84bc3fef164aebdcd97cb6eaa571982f8/examples/bitmask_compression.ipynb).


+## Saving a Compressed Model with PTQ
+
+We can use compressed-tensors to run basic post training quantization (PTQ) and save the quantized model compressed on disk
+
+```python
+model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
+model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda:0", torch_dtype="auto")
+
+config = QuantizationConfig.parse_file("./examples/bit_packing/int4_config.json")
+config.quantization_status = QuantizationStatus.CALIBRATION
+apply_quantization_config(model, config)
+
+dataset = load_dataset("ptb_text_only")["train"]
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+def tokenize_function(examples):
+    return tokenizer(examples["sentence"], padding=False, truncation=True, max_length=1024)
+
+tokenized_dataset = dataset.map(tokenize_function, batched=True)
+data_loader = DataLoader(tokenized_dataset, batch_size=1, collate_fn=DefaultDataCollator())
+
+with torch.no_grad():
+    for idx, sample in tqdm(enumerate(data_loader), desc="Running calibration"):
+        sample = {key: value.to(device) for key,value in sample.items()}
+        _ = model(**sample)
+
+        if idx >= 512:
+            break
+
+model.apply(freeze_module_quantization)
+model.apply(compress_quantized_weights)
+
+output_dir = "./ex_llama1.1b_w4a16_packed_quantize"
+compressor = ModelCompressor(quantization_config=config)
+compressed_state_dict = compressor.compress(model)
+model.save_pretrained(output_dir, state_dict=compressed_state_dict)
+```
+
+For more in-depth tutorial on quantization compression, refer to the [notebook](./examples/quantize_and_pack_int4.ipynb).
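The README snippet added above ships without its imports and without defining `device`; a plausible preamble, assuming the quantization lifecycle helpers are re-exported from `compressed_tensors.quantization` (which this diff does not confirm), would look like:

```python
# Hypothetical preamble for the PTQ example above; the import paths for the
# lifecycle helpers are assumptions, not taken from the diff.
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, DefaultDataCollator

from compressed_tensors.compressors import ModelCompressor
from compressed_tensors.quantization import (
    QuantizationConfig,
    QuantizationStatus,
    apply_quantization_config,
    compress_quantized_weights,
    freeze_module_quantization,
)

device = "cuda:0"  # the snippet moves calibration samples to this device
```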
compressed_tensors-0.4.0.dist-info/RECORD
@@ -0,0 +1,48 @@
+compressed_tensors/__init__.py,sha256=SV1csvHUVCd8kHXz6UDZim1HZ_fAVG3vfk-j_4Bb6hY,789
+compressed_tensors/base.py,sha256=OA2TOLP1gP3LSH7gp508eqr2ZtDQ-pqRHElCp-aB0vs,755
+compressed_tensors/version.py,sha256=_nj1yS4msz1OXd0H1v1m-z1JkMOuy19M9lFDTWP5xf0,1585
+compressed_tensors/compressors/__init__.py,sha256=rhqPp3YXFxCJRLZs1KRNSHTIxK2rNU--sYwDI8MW47w,1061
+compressed_tensors/compressors/base.py,sha256=LWEgbpgTxzmoqQ7Xhq2OQszUgWoDtFuGCiV1Y8nlBGw,2134
+compressed_tensors/compressors/dense.py,sha256=G_XHbvuENyupIKlXSITOQgvPkNkcMEOLcLWQr70V9EE,1257
+compressed_tensors/compressors/helpers.py,sha256=k9avlkmeYj6vkOAvl-MgcixtP7ib24SCfhzZ-RusXfw,5403
+compressed_tensors/compressors/int_quantized.py,sha256=Ct2vCK0yoPm6vkIFlzDMGQ7m14xT1GyURsSwH9DP770,5242
+compressed_tensors/compressors/marlin_24.py,sha256=X_BjtFB3Mn0hqiLz56UM3jGX2eNmGLnvEIPfbg7di6U,9444
+compressed_tensors/compressors/model_compressor.py,sha256=h3ixQtfzt6HxSNtdnB9OVdpCucTmIo4paDoaM7XYZXE,12559
+compressed_tensors/compressors/pack_quantized.py,sha256=VPiLlgJlDgARrn7YmiQoLqUfxErKBfj54epMYWRsF8k,8451
+compressed_tensors/compressors/sparse_bitmask.py,sha256=H9oZSTYI1oRCzAMbd4zThUnZd1h2rfs8DmA3tPcvuNE,8637
+compressed_tensors/compressors/utils/__init__.py,sha256=-mbGDZh1hd9T6u62Ht_iBIK255UmMg0f5bLkSs1f9Cc,731
+compressed_tensors/compressors/utils/helpers.py,sha256=4fq7KclSIK__jemCG9pwYlgWLrQjsaAMxhIrhjdw0BQ,1506
+compressed_tensors/compressors/utils/permutations_24.py,sha256=kx6fsfDHebx94zsSzhXGyCyuC9sVyah6BUUir_StT28,2530
+compressed_tensors/compressors/utils/semi_structured_conversions.py,sha256=g1EZHzdv-ko7ufPX430dp7wE33o6FWJXuSP4zZydCu0,13488
+compressed_tensors/config/__init__.py,sha256=ZBqWn3r6ku1qfmlHHYp0mQueY0i7Pwhr9rbQk9dDlMc,704
+compressed_tensors/config/base.py,sha256=ZnpuOevCE0pXdA8OJfIJnxj-ccproH7o1EOwRY8_hUU,1482
+compressed_tensors/config/dense.py,sha256=NgSxnFCnckU9-iunxEaqiFwqgdO7YYxlWKR74jNbjks,1317
+compressed_tensors/config/sparse_bitmask.py,sha256=pZUboRNZTu6NajGOQEFExoPknak5ynVAUeiiYpS1Gt8,1308
+compressed_tensors/quantization/__init__.py,sha256=83J5bPB7PavN2TfCoW7_vEDhfYpm4TDrqYO9vdSQ5bk,760
+compressed_tensors/quantization/quant_args.py,sha256=Z9Zu20ooAwEWlliAdUw1f1zwSrheuD6vqm3YXgJ1Lws,4388
+compressed_tensors/quantization/quant_config.py,sha256=hL42sXp1wAZxyrkHarw7tAMRcwSVEr0MT3wmrmL3NhE,8285
+compressed_tensors/quantization/quant_scheme.py,sha256=aX4h8t8RDqrWeUqoqrYMOxc0xkWcu8Ue_CHLoG-fRjQ,3569
+compressed_tensors/quantization/lifecycle/__init__.py,sha256=ggRGWRqhCxCaTTDWRcgTVX3axnS2xV6rc5YvdzK7fSg,798
+compressed_tensors/quantization/lifecycle/apply.py,sha256=aZrglJ5mR3Xaxwj51-1BVVB1JGVkKQEeHxGfBaVmsHI,8881
+compressed_tensors/quantization/lifecycle/calibration.py,sha256=mLns4jlaWmBwOW8Jtlm5bMX-JET1AiZYUBO7qa-XuxI,1776
+compressed_tensors/quantization/lifecycle/compressed.py,sha256=VreB10xPwgSLQQlTu20UCrFpRS--cA7-lx5s7nrPPrg,2247
+compressed_tensors/quantization/lifecycle/forward.py,sha256=0T817yzYqFR1wUjk2XCtOISwr4u7cdkKqAv13jjfu24,11113
+compressed_tensors/quantization/lifecycle/frozen.py,sha256=h1XYt89MouBTf3jTYLG_6OdFxIu5q2N8tPjsy6J4E6Y,1726
+compressed_tensors/quantization/lifecycle/initialize.py,sha256=9xgPzHejQUO_AkZcc_SH5kqFeieG-9uo0fMRYV51i7Y,4577
+compressed_tensors/quantization/observers/__init__.py,sha256=DNH31NQYrIBBcmHsMyFA6whh4pbRsLwuNa6L8AeXaGc,745
+compressed_tensors/quantization/observers/base.py,sha256=z_JC-CRz-PY7WlpSoyOoSQQWz5ekTEd5LbXt0iHQRes,5239
+compressed_tensors/quantization/observers/helpers.py,sha256=FUyYUNd-3LbXt0-8Lwr7EPI2m-LXXBTXW1l5iOajNhA,2272
+compressed_tensors/quantization/observers/memoryless.py,sha256=jH_c6K3gxf4W3VNXQ7tbnP-J_86QTrEfjBn6Kh1C-H8,2165
+compressed_tensors/quantization/observers/min_max.py,sha256=UK7zCMzxv9GGn6BflBxdajV20RiWaCY2RHcvZodCP1w,3669
+compressed_tensors/quantization/utils/__init__.py,sha256=VdtEmP0bvuND_IGQnyqUPc5lnFp-1_yD7StKSX4x80w,656
+compressed_tensors/quantization/utils/helpers.py,sha256=NzAH18Cn_-mTAR87y6IlcQU5gC393XSjgNKC9CRkr78,6017
+compressed_tensors/registry/__init__.py,sha256=FwLSNYqfIrb5JD_6OK_MT4_svvKTN_nEhpgQlQvGbjI,658
+compressed_tensors/registry/registry.py,sha256=fxjOjh2wklCvJhQxwofdy-zV8q7MkQ85SLG77nml2iA,11890
+compressed_tensors/utils/__init__.py,sha256=5DrYjoZbaEvSkJcC-GRSbM_RBHVF4tG9gMd3zsJnjLw,665
+compressed_tensors/utils/helpers.py,sha256=5ull5yFT31M2zVxKeFvpvvlvX5f1Sk1LGuj_wrfZWCY,2267
+compressed_tensors/utils/safetensors_load.py,sha256=0MheXwx1jeY12PeISppiSIZHs6rmN2YddwPpFb9V67I,8527
+compressed_tensors-0.4.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+compressed_tensors-0.4.0.dist-info/METADATA,sha256=NtnK_A9ck3KPmh4syGcGtMBGX-_2FyFa7ntCAdf-KGo,5651
+compressed_tensors-0.4.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+compressed_tensors-0.4.0.dist-info/top_level.txt,sha256=w2i-GyPs2s1UwVxvutSvN_lM22SXC2hQFBmoMcPnV7Y,19
+compressed_tensors-0.4.0.dist-info/RECORD,,