compressed-tensors 0.3.3__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
- compressed_tensors/base.py +2 -1
- compressed_tensors/compressors/__init__.py +5 -1
- compressed_tensors/compressors/base.py +11 -54
- compressed_tensors/compressors/dense.py +4 -4
- compressed_tensors/compressors/helpers.py +12 -12
- compressed_tensors/compressors/int_quantized.py +126 -0
- compressed_tensors/compressors/marlin_24.py +250 -0
- compressed_tensors/compressors/model_compressor.py +315 -0
- compressed_tensors/compressors/pack_quantized.py +212 -0
- compressed_tensors/compressors/sparse_bitmask.py +3 -3
- compressed_tensors/compressors/utils/__init__.py +19 -0
- compressed_tensors/compressors/utils/helpers.py +43 -0
- compressed_tensors/compressors/utils/permutations_24.py +65 -0
- compressed_tensors/compressors/utils/semi_structured_conversions.py +341 -0
- compressed_tensors/config/base.py +7 -4
- compressed_tensors/config/dense.py +4 -4
- compressed_tensors/config/sparse_bitmask.py +3 -3
- compressed_tensors/quantization/lifecycle/__init__.py +1 -0
- compressed_tensors/quantization/lifecycle/apply.py +62 -11
- compressed_tensors/quantization/lifecycle/compressed.py +69 -0
- compressed_tensors/quantization/lifecycle/forward.py +161 -54
- compressed_tensors/quantization/lifecycle/frozen.py +4 -0
- compressed_tensors/quantization/lifecycle/initialize.py +33 -5
- compressed_tensors/quantization/observers/base.py +31 -27
- compressed_tensors/quantization/observers/helpers.py +6 -1
- compressed_tensors/quantization/observers/memoryless.py +17 -9
- compressed_tensors/quantization/observers/min_max.py +44 -13
- compressed_tensors/quantization/quant_args.py +2 -2
- compressed_tensors/quantization/quant_config.py +69 -21
- compressed_tensors/quantization/quant_scheme.py +81 -1
- compressed_tensors/quantization/utils/helpers.py +76 -8
- compressed_tensors/utils/helpers.py +24 -6
- compressed_tensors/utils/safetensors_load.py +3 -2
- compressed_tensors/version.py +53 -0
- {compressed_tensors-0.3.3.dist-info → compressed_tensors-0.4.0.dist-info}/METADATA +46 -8
- compressed_tensors-0.4.0.dist-info/RECORD +48 -0
- compressed_tensors-0.3.3.dist-info/RECORD +0 -38
- {compressed_tensors-0.3.3.dist-info → compressed_tensors-0.4.0.dist-info}/LICENSE +0 -0
- {compressed_tensors-0.3.3.dist-info → compressed_tensors-0.4.0.dist-info}/WHEEL +0 -0
- {compressed_tensors-0.3.3.dist-info → compressed_tensors-0.4.0.dist-info}/top_level.txt +0 -0
The hunks below reproduce the per-file diffs for a subset of the files listed above.

compressed_tensors/quantization/quant_config.py:

```diff
@@ -13,10 +13,13 @@
 # limitations under the License.
 
 from enum import Enum
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Union
 
-from compressed_tensors.base import QUANTIZATION_CONFIG_NAME
-from compressed_tensors.quantization.quant_scheme import QuantizationScheme
+from compressed_tensors.config import CompressionFormat
+from compressed_tensors.quantization.quant_scheme import (
+    QuantizationScheme,
+    preset_name_to_scheme,
+)
 from compressed_tensors.quantization.utils import (
     calculate_compression_ratio,
     is_module_quantized,
@@ -25,13 +28,14 @@ from compressed_tensors.quantization.utils import (
 )
 from pydantic import BaseModel, Field
 from torch.nn import Module
-from transformers import AutoConfig
 
 
 __all__ = [
     "QuantizationStatus",
     "QuantizationConfig",
     "LIFECYCLE_ORDER",
+    "DEFAULT_QUANTIZATION_METHOD",
+    "DEFAULT_QUANTIZATION_FORMAT",
 ]
 
 
@@ -62,10 +66,33 @@ class QuantizationStatus(str, Enum):
         return
 
     def __ge__(self, other):
+        if other is None:
+            return True
         if not isinstance(other, self.__class__):
             raise NotImplementedError
         return LIFECYCLE_ORDER.index(self) >= LIFECYCLE_ORDER.index(other)
 
+    def __gt__(self, other):
+        if other is None:
+            return True
+        if not isinstance(other, self.__class__):
+            raise NotImplementedError
+        return LIFECYCLE_ORDER.index(self) > LIFECYCLE_ORDER.index(other)
+
+    def __lt__(self, other):
+        if other is None:
+            return False
+        if not isinstance(other, self.__class__):
+            raise NotImplementedError
+        return LIFECYCLE_ORDER.index(self) < LIFECYCLE_ORDER.index(other)
+
+    def __le__(self, other):
+        if other is None:
+            return False
+        if not isinstance(other, self.__class__):
+            raise NotImplementedError
+        return LIFECYCLE_ORDER.index(self) <= LIFECYCLE_ORDER.index(other)
+
 
 LIFECYCLE_ORDER = [
     QuantizationStatus.INITIALIZED,
```
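The new comparison operators make lifecycle checks against a possibly-unset status straightforward: `None` compares below every member of `LIFECYCLE_ORDER`. A minimal sketch of what this enables (assuming `QuantizationStatus` is re-exported from `compressed_tensors.quantization`, which is not shown in this diff):

```python
from compressed_tensors.quantization import QuantizationStatus

# members compare by their position in LIFECYCLE_ORDER, not alphabetically
assert QuantizationStatus.COMPRESSED >= QuantizationStatus.INITIALIZED
assert QuantizationStatus.INITIALIZED < QuantizationStatus.COMPRESSED

# None sorts below every status, so "has the model reached X yet?" checks
# no longer need a separate None guard
assert QuantizationStatus.INITIALIZED > None
```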
compressed_tensors/quantization/quant_config.py (continued):

```diff
@@ -74,6 +101,9 @@ LIFECYCLE_ORDER = [
     QuantizationStatus.COMPRESSED,
 ]
 
+DEFAULT_QUANTIZATION_METHOD = "compressed-tensors"
+DEFAULT_QUANTIZATION_FORMAT = "fakequant"
+
 
 class QuantizationConfig(BaseModel):
     """
@@ -81,7 +111,8 @@ class QuantizationConfig(BaseModel):
     mapped to a QuantizationScheme in config_groups.
 
     :param config_groups: dict of QuantizationSchemes specifying the quantization
-        settings for each quantized layer
+        settings for each quantized layer. A group could also be a reference to
+        a predefined scheme name, mapped to a list of its target layers/classes
     :param quant_method: a constant used to differentiate sparseML quantization from
         other quantization configs
     :param format: specifies how the quantized model is stored on disk
@@ -93,30 +124,34 @@ class QuantizationConfig(BaseModel):
     are not quantized even if they match up with a target in config_groups
     """
 
-    config_groups: Dict[str, QuantizationScheme]
-    quant_method: str = "sparseml"
-    format: str = "fakequant"
+    config_groups: Dict[str, Union[QuantizationScheme, List[str]]]
+    quant_method: str = DEFAULT_QUANTIZATION_METHOD
+    format: str = DEFAULT_QUANTIZATION_FORMAT
     quantization_status: QuantizationStatus = QuantizationStatus.INITIALIZED
     global_compression_ratio: Optional[float] = None
     ignore: Optional[List[str]] = Field(default_factory=list)
 
-    @staticmethod
-    def from_model_config(model_name_or_path) -> "QuantizationConfig":
+    def model_post_init(self, __context):
         """
-        Given a path to a model config, extract a quantization config if it exists
-
-        :param pretrained_model_name_or_path: path to model config on disk or HF hub
-        :return: instantiated QuantizationConfig if config contains a quant config
+        updates any quantization schemes defined as presets to be fully loaded
+        schemes
         """
-        config = AutoConfig.from_pretrained(model_name_or_path)
-        quantization_config = getattr(config, QUANTIZATION_CONFIG_NAME, None)
-        if quantization_config is None:
-            return None
-
-        return QuantizationConfig.parse_obj(quantization_config)
+        for group_name, targets_or_scheme in self.config_groups.items():
+            if isinstance(targets_or_scheme, QuantizationScheme):
+                continue  # scheme already defined
+            self.config_groups[group_name] = preset_name_to_scheme(
+                name=group_name,
+                targets=targets_or_scheme,
+            )
+
+    def to_dict(self):
+        # for compatibility with HFQuantizer
+        return self.dict()
 
     @staticmethod
-    def from_pretrained(model: Module) -> "QuantizationConfig":
+    def from_pretrained(
+        model: Module, format: Optional[str] = None
+    ) -> Optional["QuantizationConfig"]:
         """
         Converts a model into its associated QuantizationConfig based on the
         QuantizationScheme attached to each quanitzed module
```
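With `config_groups` widened to `Union[QuantizationScheme, List[str]]`, a group can now be declared as just a preset name mapped to its targets; pydantic's `model_post_init` hook then expands it into a full scheme. A sketch, assuming the top-level `compressed_tensors.quantization` import path (`W4A16` is one of the presets added in quant_scheme.py below):

```python
from compressed_tensors.quantization import QuantizationConfig

# the group key names a preset, the value lists its target layer types
config = QuantizationConfig(config_groups={"W4A16": ["Linear"]})

group = config.config_groups["W4A16"]
print(type(group).__name__)    # QuantizationScheme, expanded by model_post_init
print(group.weights.num_bits)  # 4, per the W4A16 preset
```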
compressed_tensors/quantization/quant_config.py (continued):

```diff
@@ -147,6 +182,9 @@
             if not match_found:
                 quant_scheme_to_layers.append(scheme)
 
+        if len(quant_scheme_to_layers) == 0:  # No quantized layers
+            return None
+
         # clean up ignore list, we can leave out layers types if none of the
         # instances are quantized
         consolidated_ignore = []
@@ -162,10 +200,20 @@
             group_name = "group_" + str(idx)
             config_groups[group_name] = scheme
 
+        # TODO: this is incorrect in compressed mode, since we are overwriting the
+        # original weight we lose the uncompressed bit_depth indo
         compression_ratio = calculate_compression_ratio(model)
+
+        if format is None:
+            if quantization_status == QuantizationStatus.COMPRESSED:
+                format = CompressionFormat.int_quantized.value
+            else:
+                format = CompressionFormat.dense.value
+
         return QuantizationConfig(
             config_groups=config_groups,
             quantization_status=quantization_status,
             global_compression_ratio=compression_ratio,
+            format=format,
             ignore=consolidated_ignore,
         )
```
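One behavioral consequence of the guard added above: serializing a model with no attached quantization now yields `None` rather than an empty config. A quick sketch with a toy model (import path assumed as before):

```python
import torch
from compressed_tensors.quantization import QuantizationConfig

# no module carries a QuantizationScheme, so from_pretrained short-circuits
model = torch.nn.Sequential(torch.nn.Linear(8, 8))
assert QuantizationConfig.from_pretrained(model) is None
```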
compressed_tensors/quantization/quant_scheme.py:

```diff
@@ -12,13 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from copy import deepcopy
 from typing import List, Optional
 
 from compressed_tensors.quantization.quant_args import QuantizationArgs
 from pydantic import BaseModel
 
 
-__all__ = ["QuantizationScheme"]
+__all__ = [
+    "QuantizationScheme",
+    "preset_name_to_scheme",
+    "is_preset_scheme",
+]
 
 
 class QuantizationScheme(BaseModel):
@@ -37,3 +42,78 @@ class QuantizationScheme(BaseModel):
     weights: Optional[QuantizationArgs] = None
     input_activations: Optional[QuantizationArgs] = None
     output_activations: Optional[QuantizationArgs] = None
+
+    @classmethod
+    def default_scheme(
+        cls,
+        targets: Optional[List[str]] = None,
+    ):
+
+        if targets is None:
+            # default to quantizing all Linear layers
+            targets = ["Linear"]
+
+        # default to 8 bit integer symmetric quantization
+        # for weights
+        weights = QuantizationArgs(num_bits=8, symmetric=True)
+
+        # default to 8 bit integer asymmetric quantization
+        input_activations = QuantizationArgs(num_bits=8, symmetric=True)
+
+        # Do not quantize the output activations
+        # by default
+        output_activations = None
+
+        return cls(
+            targets=targets,
+            weights=weights,
+            input_activations=input_activations,
+            output_activations=output_activations,
+        )
+
+
+"""
+Pre-Set Quantization Scheme Args
+"""
+
+
+def preset_name_to_scheme(name: str, targets: List[str]) -> QuantizationScheme:
+    """
+    :param name: preset quantization settings name. must exist in upper case in
+        PRESET_SCHEMES
+    :param targets: list of quantization targets to be passed to the Scheme
+    :return: new QuantizationScheme for a given name with the given targets
+    """
+    name = name.upper()
+
+    if name not in PRESET_SCHEMES:
+        raise KeyError(
+            f"Unknown preset scheme name {name}, "
+            f"available names: {list(PRESET_SCHEMES.keys())}"
+        )
+
+    scheme_args = deepcopy(PRESET_SCHEMES[name])  # deepcopy to avoid args references
+    return QuantizationScheme(
+        targets=targets,
+        **scheme_args,
+    )
+
+
+def is_preset_scheme(name: str) -> bool:
+    """
+    :param name: preset quantization settings name
+    :return: True if the name is a preset scheme name
+    """
+    return name.upper() in PRESET_SCHEMES
+
+
+W8A8 = dict(
+    weights=QuantizationArgs(), input_activations=QuantizationArgs(symmetric=True)
+)
+
+W4A16 = dict(weights=QuantizationArgs(num_bits=4, group_size=128))
+
+PRESET_SCHEMES = {
+    "W8A8": W8A8,
+    "W4A16": W4A16,
+}
```
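The preset helpers shown above are usable directly; lookup is case-insensitive, and unknown names raise a `KeyError` listing the available presets:

```python
from compressed_tensors.quantization.quant_scheme import (
    is_preset_scheme,
    preset_name_to_scheme,
)

assert is_preset_scheme("w4a16")  # names are upper-cased before lookup

scheme = preset_name_to_scheme("W4A16", targets=["Linear"])
print(scheme.weights.num_bits, scheme.weights.group_size)  # 4 128
```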
compressed_tensors/quantization/utils/helpers.py:

```diff
@@ -12,21 +12,43 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Tuple
+import logging
+from typing import Optional, Tuple
 
 import torch
+from compressed_tensors.quantization.observers.base import Observer
 from torch.nn import Module
 from tqdm import tqdm
 
 
 __all__ = [
+    "infer_quantization_status",
     "is_module_quantized",
     "is_model_quantized",
     "iter_named_leaf_modules",
     "module_type",
     "calculate_compression_ratio",
+    "get_torch_bit_depth",
+    "can_quantize",
 ]
 
+_LOGGER: logging.Logger = logging.getLogger(__name__)
+
+
+def infer_quantization_status(model: Module) -> Optional["QuantizationStatus"]:  # noqa
+    """
+    Checks the quantization status of a model. Assumes all modules in the model have
+    the same status, so only the first quantized model is checked.
+
+    :param model: model to check quantization status for
+    :return: quantization status if the model is quantized, otherwise None
+    """
+    for module in model.modules():
+        status = getattr(module, "quantization_status", None)
+        if status is not None:
+            return status
+    return None
+
 
 def is_module_quantized(module: Module) -> bool:
     """
```
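`infer_quantization_status` walks the module tree and returns the first `quantization_status` attribute it finds (per its docstring, all modules are assumed to share one status). A sketch, assuming the helper is re-exported from `compressed_tensors.quantization.utils` as its `__all__` entry suggests:

```python
import torch
from compressed_tensors.quantization.utils import infer_quantization_status

model = torch.nn.Linear(8, 8)
assert infer_quantization_status(model) is None  # no module carries a status yet
```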
compressed_tensors/quantization/utils/helpers.py (continued):

```diff
@@ -78,11 +100,60 @@ def module_type(module: Module) -> str:
 
 
 def iter_named_leaf_modules(model: Module) -> Tuple[str, Module]:
-    # yields modules that do not have any submodules
-    # TODO: potentially expand to add set of allowed submodules
+    """
+    Yields modules that do not have any submodules except observers. The observers
+    themselves are not yielded
+
+    :param model: model to get leaf modules of
+    :returns: generator tuple of (name, leaf_submodule)
+    """
     for name, submodule in model.named_modules():
-        if len(list(submodule.children())) == 0:
+        children = list(submodule.children())
+        if len(children) == 0 and not isinstance(submodule, Observer):
             yield name, submodule
+        else:
+            has_non_observer_children = False
+            for child in children:
+                if not isinstance(child, Observer):
+                    has_non_observer_children = True
+
+            if not has_non_observer_children:
+                yield name, submodule
+
+
+def get_torch_bit_depth(value: torch.Tensor) -> int:
+    """
+    Determine the number of bits used to represent the dtype of a tensor
+
+    :param value: tensor to check bit depth of
+    :return: bit depth of each element in the value tensor
+    """
+    try:
+        bit_depth = torch.finfo(value.dtype).bits
+    except TypeError:
+        bit_depth = torch.iinfo(value.dtype).bits
+
+    return bit_depth
+
+
+def can_quantize(value: torch.Tensor, quant_args: "QuantizationArgs") -> bool:  # noqa
+    """
+    Checks if value can be quantized by quant_args.
+
+    :param value: tensor to check for quantization
+    :param quant_args: QuantizationArgs to use for quantization
+    :return: False if value is already quantized to quant_args or value is incompatible
+        with quant_args, True if value can be quantized with quant_args
+    """
+    bit_depth = get_torch_bit_depth(value)
+    requested_depth = quant_args.num_bits
+    if bit_depth < quant_args.num_bits:
+        _LOGGER.warn(
+            f"Can't quantize tensor with bit depth {bit_depth} to {requested_depth}."
+            "The QuantizationArgs provided are not compatible with the input tensor."
+        )
+
+    return bit_depth > quant_args.num_bits
```
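`get_torch_bit_depth` and `can_quantize` compose as follows; note that `can_quantize` returns `False` both for incompatible inputs (with a warning) and for tensors already at the requested depth:

```python
import torch
from compressed_tensors.quantization.quant_args import QuantizationArgs
from compressed_tensors.quantization.utils import can_quantize, get_torch_bit_depth

weight = torch.zeros(4, 4, dtype=torch.float16)
print(get_torch_bit_depth(weight))  # 16, via the torch.finfo path

int8_weight = torch.zeros(4, 4, dtype=torch.int8)
print(get_torch_bit_depth(int8_weight))  # 8, via the torch.iinfo fallback

args = QuantizationArgs(num_bits=8)
print(can_quantize(weight, args))       # True: 16 > 8
print(can_quantize(int8_weight, args))  # False: already at 8 bits
```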
compressed_tensors/quantization/utils/helpers.py (continued):

```diff
@@ -101,10 +172,7 @@ def calculate_compression_ratio(model: Module) -> float:
         desc="Calculating quantization compression ratio",
     ):
         for parameter in model.parameters():
-            try:
-                uncompressed_bits = torch.finfo(parameter.dtype).bits
-            except TypeError:
-                uncompressed_bits = torch.iinfo(parameter.dtype).bits
+            uncompressed_bits = get_torch_bit_depth(parameter)
             compressed_bits = uncompressed_bits
             if is_module_quantized(submodule):
                 compressed_bits = submodule.quantization_scheme.weights.num_bits
```
compressed_tensors/utils/helpers.py:

```diff
@@ -15,18 +15,17 @@
 
 from typing import Optional
 
-from compressed_tensors.base import SPARSITY_CONFIG_NAME
-from compressed_tensors.compressors import ModelCompressor
-from compressed_tensors.config import CompressionConfig
 from transformers import AutoConfig
 
 
-__all__ = ["infer_compressor_from_model_config"]
+__all__ = ["infer_compressor_from_model_config", "fix_fsdp_module_name"]
+
+FSDP_WRAPPER_NAME = "_fsdp_wrapped_module"
 
 
 def infer_compressor_from_model_config(
     pretrained_model_name_or_path: str,
-) -> Optional[ModelCompressor]:
+) -> Optional["ModelCompressor"]:  # noqa: F821
     """
     Given a path to a model config, extract a sparsity config if it exists and return
     the associated ModelCompressor
@@ -34,8 +33,11 @@ def infer_compressor_from_model_config(
     :param pretrained_model_name_or_path: path to model config on disk or HF hub
     :return: matching compressor if config contains a sparsity config
     """
+    from compressed_tensors.compressors import ModelCompressor
+    from compressed_tensors.config import CompressionConfig
+
     config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
-    sparsity_config = getattr(config, SPARSITY_CONFIG_NAME, None)
+    sparsity_config = ModelCompressor.parse_sparsity_config(config)
     if sparsity_config is None:
         return None
 
@@ -43,3 +45,19 @@ def infer_compressor_from_model_config(
     sparsity_config = CompressionConfig.load_from_registry(format, **sparsity_config)
     compressor = ModelCompressor.load_from_registry(format, config=sparsity_config)
     return compressor
+
+
+# TODO: There is already the same function in
+# SparseML, should be moved to a shared location
+# in the future
+def fix_fsdp_module_name(name: str) -> str:
+    """
+    Remove FSDP wrapper prefixes from a module name
+    Accounts for scenario where FSDP_WRAPPER_NAME is
+    at the end of the name, as well as in the middle.
+    :param name: name to strip
+    :return: stripped name
+    """
+    return name.replace(FSDP_WRAPPER_NAME + ".", "").replace(
+        "." + FSDP_WRAPPER_NAME, ""
+    )
```
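`fix_fsdp_module_name` strips the wrapper segment wherever it appears in a dotted module name:

```python
from compressed_tensors.utils.helpers import fix_fsdp_module_name

# wrapper in the middle of a name
print(fix_fsdp_module_name("model._fsdp_wrapped_module.layers.0"))  # model.layers.0
# wrapper at the end of a name
print(fix_fsdp_module_name("model.layers.0._fsdp_wrapped_module"))  # model.layers.0
```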
compressed_tensors/utils/safetensors_load.py:

```diff
@@ -31,6 +31,7 @@ __all__ = [
     "get_weight_mappings",
     "get_nested_weight_mappings",
     "get_quantization_state_dict",
+    "is_quantization_param",
 ]
 
 
@@ -214,7 +215,7 @@ def get_quantization_state_dict(model_path: str) -> Dict[str, Tensor]:
     weight_mappings = get_weight_mappings(model_path)
     state_dict = {}
     for weight_name, safe_path in weight_mappings.items():
-        if not _is_quantization_weight(weight_name):
+        if not is_quantization_param(weight_name):
             continue
         with safe_open(safe_path, framework="pt", device="cpu") as f:
             state_dict[weight_name] = f.get_tensor(weight_name)
@@ -222,7 +223,7 @@ def get_quantization_state_dict(model_path: str) -> Dict[str, Tensor]:
     return state_dict
 
 
-def _is_quantization_weight(name: str) -> bool:
+def is_quantization_param(name: str) -> bool:
     """
     Checks is a parameter name is associated with a quantization parameter
 
```
compressed_tensors/version.py (new file):

```diff
@@ -0,0 +1,53 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Functionality for storing and setting the version info for SparseML
+"""
+
+
+version_base = "0.4.0"
+is_release = True  # change to True to set the generated version as a release version
+
+
+def _generate_version(
+    is_release: bool,
+    version_base: str,
+):
+    from datetime import date
+
+    if is_release:
+        return version_base
+    else:
+        return f"{version_base}.{date.today().strftime('%Y%m%d')}"
+
+
+__all__ = [
+    "__version__",
+    "version_base",
+    "is_release",
+    "version",
+    "version_major",
+    "version_minor",
+    "version_patch",
+    "version_build",
+    "version_major_minor",
+]
+__version__ = _generate_version(is_release, version_base)
+
+version = __version__
+version_major, version_minor, version_patch, version_build = version.split(".") + (
+    [None] if len(version.split(".")) < 4 else []
+)  # handle conditional for version being 3 parts or 4 (4 containing build date)
+version_major_minor = f"{version_major}.{version_minor}"
```
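The version split at the bottom of the new module pads a three-part release version with `None` so the four-way unpack always succeeds; nightly builds carry a build date as the fourth component. Reproduced standalone:

```python
version = "0.4.0"  # what _generate_version returns when is_release is True
parts = version.split(".") + ([None] if len(version.split(".")) < 4 else [])
version_major, version_minor, version_patch, version_build = parts
print(version_major, version_minor, version_patch, version_build)  # 0 4 0 None

nightly = "0.4.0.20240401"  # hypothetical dated build from the else branch
print(nightly.split("."))   # already four parts, so no padding is added
```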
{compressed_tensors-0.3.3.dist-info → compressed_tensors-0.4.0.dist-info}/METADATA:

````diff
@@ -1,24 +1,23 @@
 Metadata-Version: 2.1
 Name: compressed-tensors
-Version: 0.3.3
+Version: 0.4.0
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
 Author-email: support@neuralmagic.com
 License: Apache 2.0
-Platform: UNKNOWN
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: pydantic <2.7
 Requires-Dist: torch >=1.7.0
-Requires-Dist: transformers
+Requires-Dist: transformers
+Requires-Dist: pydantic >=2.0
 Provides-Extra: dev
 Requires-Dist: black ==22.12.0 ; extra == 'dev'
-Requires-Dist: flake8 >=3.8.3 ; extra == 'dev'
 Requires-Dist: isort ==5.8.0 ; extra == 'dev'
-Requires-Dist: nbconvert >=7.16.3 ; extra == 'dev'
-Requires-Dist: pytest >=6.0.0 ; extra == 'dev'
 Requires-Dist: wheel >=0.36.2 ; extra == 'dev'
+Requires-Dist: flake8 >=3.8.3 ; extra == 'dev'
+Requires-Dist: pytest >=6.0.0 ; extra == 'dev'
+Requires-Dist: nbconvert >=7.16.3 ; extra == 'dev'
 
 # compressed_tensors
 
@@ -90,7 +89,7 @@ from compressed_tensors import save_compressed_model, load_compressed, BitmaskCo
 from transformers import AutoModelForCausalLM
 
 model_name = "neuralmagic/llama2.c-stories110M-pruned50"
-model = AutoModelForCausalLM.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
 
 original_state_dict = model.state_dict()
 
@@ -106,3 +105,42 @@ state_dict = dict(load_compressed("compressed_model.safetensors", compression_co
 For more in-depth tutorial on bitmask compression, refer to the [notebook](https://github.com/neuralmagic/compressed-tensors/blob/d707c5b84bc3fef164aebdcd97cb6eaa571982f8/examples/bitmask_compression.ipynb).
 
 
+## Saving a Compressed Model with PTQ
+
+We can use compressed-tensors to run basic post training quantization (PTQ) and save the quantized model compressed on disk
+
+```python
+model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
+model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda:0", torch_dtype="auto")
+
+config = QuantizationConfig.parse_file("./examples/bit_packing/int4_config.json")
+config.quantization_status = QuantizationStatus.CALIBRATION
+apply_quantization_config(model, config)
+
+dataset = load_dataset("ptb_text_only")["train"]
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+def tokenize_function(examples):
+    return tokenizer(examples["sentence"], padding=False, truncation=True, max_length=1024)
+
+tokenized_dataset = dataset.map(tokenize_function, batched=True)
+data_loader = DataLoader(tokenized_dataset, batch_size=1, collate_fn=DefaultDataCollator())
+
+with torch.no_grad():
+    for idx, sample in tqdm(enumerate(data_loader), desc="Running calibration"):
+        sample = {key: value.to(device) for key,value in sample.items()}
+        _ = model(**sample)
+
+        if idx >= 512:
+            break
+
+model.apply(freeze_module_quantization)
+model.apply(compress_quantized_weights)
+
+output_dir = "./ex_llama1.1b_w4a16_packed_quantize"
+compressor = ModelCompressor(quantization_config=config)
+compressed_state_dict = compressor.compress(model)
+model.save_pretrained(output_dir, state_dict=compressed_state_dict)
+```
+
+For more in-depth tutorial on quantization compression, refer to the [notebook](./examples/quantize_and_pack_int4.ipynb).
````
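The README snippet added above leaves its imports (and the `device` variable) implicit. A plausible preamble, inferred from the modules this release ships rather than copied from the released README:

```python
# hypothetical preamble for the PTQ example; import paths are inferred from the
# files added in this diff, not taken verbatim from the released README
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, DefaultDataCollator

from compressed_tensors.compressors import ModelCompressor
from compressed_tensors.quantization import (
    QuantizationConfig,
    QuantizationStatus,
    apply_quantization_config,
    compress_quantized_weights,
    freeze_module_quantization,
)

device = "cuda:0"  # the snippet references `device` without defining it
```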
compressed_tensors-0.4.0.dist-info/RECORD (new file):

```diff
@@ -0,0 +1,48 @@
+compressed_tensors/__init__.py,sha256=SV1csvHUVCd8kHXz6UDZim1HZ_fAVG3vfk-j_4Bb6hY,789
+compressed_tensors/base.py,sha256=OA2TOLP1gP3LSH7gp508eqr2ZtDQ-pqRHElCp-aB0vs,755
+compressed_tensors/version.py,sha256=_nj1yS4msz1OXd0H1v1m-z1JkMOuy19M9lFDTWP5xf0,1585
+compressed_tensors/compressors/__init__.py,sha256=rhqPp3YXFxCJRLZs1KRNSHTIxK2rNU--sYwDI8MW47w,1061
+compressed_tensors/compressors/base.py,sha256=LWEgbpgTxzmoqQ7Xhq2OQszUgWoDtFuGCiV1Y8nlBGw,2134
+compressed_tensors/compressors/dense.py,sha256=G_XHbvuENyupIKlXSITOQgvPkNkcMEOLcLWQr70V9EE,1257
+compressed_tensors/compressors/helpers.py,sha256=k9avlkmeYj6vkOAvl-MgcixtP7ib24SCfhzZ-RusXfw,5403
+compressed_tensors/compressors/int_quantized.py,sha256=Ct2vCK0yoPm6vkIFlzDMGQ7m14xT1GyURsSwH9DP770,5242
+compressed_tensors/compressors/marlin_24.py,sha256=X_BjtFB3Mn0hqiLz56UM3jGX2eNmGLnvEIPfbg7di6U,9444
+compressed_tensors/compressors/model_compressor.py,sha256=h3ixQtfzt6HxSNtdnB9OVdpCucTmIo4paDoaM7XYZXE,12559
+compressed_tensors/compressors/pack_quantized.py,sha256=VPiLlgJlDgARrn7YmiQoLqUfxErKBfj54epMYWRsF8k,8451
+compressed_tensors/compressors/sparse_bitmask.py,sha256=H9oZSTYI1oRCzAMbd4zThUnZd1h2rfs8DmA3tPcvuNE,8637
+compressed_tensors/compressors/utils/__init__.py,sha256=-mbGDZh1hd9T6u62Ht_iBIK255UmMg0f5bLkSs1f9Cc,731
+compressed_tensors/compressors/utils/helpers.py,sha256=4fq7KclSIK__jemCG9pwYlgWLrQjsaAMxhIrhjdw0BQ,1506
+compressed_tensors/compressors/utils/permutations_24.py,sha256=kx6fsfDHebx94zsSzhXGyCyuC9sVyah6BUUir_StT28,2530
+compressed_tensors/compressors/utils/semi_structured_conversions.py,sha256=g1EZHzdv-ko7ufPX430dp7wE33o6FWJXuSP4zZydCu0,13488
+compressed_tensors/config/__init__.py,sha256=ZBqWn3r6ku1qfmlHHYp0mQueY0i7Pwhr9rbQk9dDlMc,704
+compressed_tensors/config/base.py,sha256=ZnpuOevCE0pXdA8OJfIJnxj-ccproH7o1EOwRY8_hUU,1482
+compressed_tensors/config/dense.py,sha256=NgSxnFCnckU9-iunxEaqiFwqgdO7YYxlWKR74jNbjks,1317
+compressed_tensors/config/sparse_bitmask.py,sha256=pZUboRNZTu6NajGOQEFExoPknak5ynVAUeiiYpS1Gt8,1308
+compressed_tensors/quantization/__init__.py,sha256=83J5bPB7PavN2TfCoW7_vEDhfYpm4TDrqYO9vdSQ5bk,760
+compressed_tensors/quantization/quant_args.py,sha256=Z9Zu20ooAwEWlliAdUw1f1zwSrheuD6vqm3YXgJ1Lws,4388
+compressed_tensors/quantization/quant_config.py,sha256=hL42sXp1wAZxyrkHarw7tAMRcwSVEr0MT3wmrmL3NhE,8285
+compressed_tensors/quantization/quant_scheme.py,sha256=aX4h8t8RDqrWeUqoqrYMOxc0xkWcu8Ue_CHLoG-fRjQ,3569
+compressed_tensors/quantization/lifecycle/__init__.py,sha256=ggRGWRqhCxCaTTDWRcgTVX3axnS2xV6rc5YvdzK7fSg,798
+compressed_tensors/quantization/lifecycle/apply.py,sha256=aZrglJ5mR3Xaxwj51-1BVVB1JGVkKQEeHxGfBaVmsHI,8881
+compressed_tensors/quantization/lifecycle/calibration.py,sha256=mLns4jlaWmBwOW8Jtlm5bMX-JET1AiZYUBO7qa-XuxI,1776
+compressed_tensors/quantization/lifecycle/compressed.py,sha256=VreB10xPwgSLQQlTu20UCrFpRS--cA7-lx5s7nrPPrg,2247
+compressed_tensors/quantization/lifecycle/forward.py,sha256=0T817yzYqFR1wUjk2XCtOISwr4u7cdkKqAv13jjfu24,11113
+compressed_tensors/quantization/lifecycle/frozen.py,sha256=h1XYt89MouBTf3jTYLG_6OdFxIu5q2N8tPjsy6J4E6Y,1726
+compressed_tensors/quantization/lifecycle/initialize.py,sha256=9xgPzHejQUO_AkZcc_SH5kqFeieG-9uo0fMRYV51i7Y,4577
+compressed_tensors/quantization/observers/__init__.py,sha256=DNH31NQYrIBBcmHsMyFA6whh4pbRsLwuNa6L8AeXaGc,745
+compressed_tensors/quantization/observers/base.py,sha256=z_JC-CRz-PY7WlpSoyOoSQQWz5ekTEd5LbXt0iHQRes,5239
+compressed_tensors/quantization/observers/helpers.py,sha256=FUyYUNd-3LbXt0-8Lwr7EPI2m-LXXBTXW1l5iOajNhA,2272
+compressed_tensors/quantization/observers/memoryless.py,sha256=jH_c6K3gxf4W3VNXQ7tbnP-J_86QTrEfjBn6Kh1C-H8,2165
+compressed_tensors/quantization/observers/min_max.py,sha256=UK7zCMzxv9GGn6BflBxdajV20RiWaCY2RHcvZodCP1w,3669
+compressed_tensors/quantization/utils/__init__.py,sha256=VdtEmP0bvuND_IGQnyqUPc5lnFp-1_yD7StKSX4x80w,656
+compressed_tensors/quantization/utils/helpers.py,sha256=NzAH18Cn_-mTAR87y6IlcQU5gC393XSjgNKC9CRkr78,6017
+compressed_tensors/registry/__init__.py,sha256=FwLSNYqfIrb5JD_6OK_MT4_svvKTN_nEhpgQlQvGbjI,658
+compressed_tensors/registry/registry.py,sha256=fxjOjh2wklCvJhQxwofdy-zV8q7MkQ85SLG77nml2iA,11890
+compressed_tensors/utils/__init__.py,sha256=5DrYjoZbaEvSkJcC-GRSbM_RBHVF4tG9gMd3zsJnjLw,665
+compressed_tensors/utils/helpers.py,sha256=5ull5yFT31M2zVxKeFvpvvlvX5f1Sk1LGuj_wrfZWCY,2267
+compressed_tensors/utils/safetensors_load.py,sha256=0MheXwx1jeY12PeISppiSIZHs6rmN2YddwPpFb9V67I,8527
+compressed_tensors-0.4.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+compressed_tensors-0.4.0.dist-info/METADATA,sha256=NtnK_A9ck3KPmh4syGcGtMBGX-_2FyFa7ntCAdf-KGo,5651
+compressed_tensors-0.4.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+compressed_tensors-0.4.0.dist-info/top_level.txt,sha256=w2i-GyPs2s1UwVxvutSvN_lM22SXC2hQFBmoMcPnV7Y,19
+compressed_tensors-0.4.0.dist-info/RECORD,,
```