compressed-tensors-nightly 0.3.3.20240521__py3-none-any.whl → 0.3.3.20240523__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- compressed_tensors/quantization/observers/base.py +14 -7
- compressed_tensors/quantization/observers/memoryless.py +3 -1
- compressed_tensors/quantization/observers/min_max.py +31 -11
- compressed_tensors/quantization/quant_config.py +21 -4
- compressed_tensors/quantization/quant_scheme.py +72 -1
- {compressed_tensors_nightly-0.3.3.20240521.dist-info → compressed_tensors_nightly-0.3.3.20240523.dist-info}/METADATA +1 -1
- {compressed_tensors_nightly-0.3.3.20240521.dist-info → compressed_tensors_nightly-0.3.3.20240523.dist-info}/RECORD +10 -10
- {compressed_tensors_nightly-0.3.3.20240521.dist-info → compressed_tensors_nightly-0.3.3.20240523.dist-info}/LICENSE +0 -0
- {compressed_tensors_nightly-0.3.3.20240521.dist-info → compressed_tensors_nightly-0.3.3.20240523.dist-info}/WHEEL +0 -0
- {compressed_tensors_nightly-0.3.3.20240521.dist-info → compressed_tensors_nightly-0.3.3.20240523.dist-info}/top_level.txt +0 -0
compressed_tensors/quantization/observers/base.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional, Tuple
+from typing import Any, Optional, Tuple
 
 import torch
 from compressed_tensors.quantization.quant_args import (
@@ -93,15 +93,18 @@ class Observer(Module, RegistryMixin):
         elif self.quantization_args.strategy == QuantizationStrategy.GROUP:
             columns = observed.shape[1]
             scales, zero_points = [], []
-            …
+            group_idxs = range(0, columns, self.quantization_args.group_size)
+            for group_id, group_idx in enumerate(group_idxs):
                 scale, zero_point = self.get_qparams_along_dim(
-                    observed[:, …
+                    observed[:, group_idx : (group_idx + group_size)],
                     0,
+                    tensor_id=group_id,
                 )
                 scales.append(scale)
                 zero_points.append(zero_point)
-            …
-            self.…
+
+            self._scale = torch.cat(scales, dim=1, out=self._scale)
+            self._zero_point = torch.cat(zero_points, dim=1, out=self._zero_point)
 
         elif self.quantization_args.strategy == QuantizationStrategy.CHANNEL:
             # assume observed is transposed, because its the output, hence use dim 0
@@ -116,6 +119,10 @@ class Observer(Module, RegistryMixin):
 
         return self._scale, self._zero_point
 
-    def get_qparams_along_dim(…
+    def get_qparams_along_dim(
+        self, observed, dim: int, tensor_id: Optional[Any] = None
+    ):
         reduce_dims = tuple(idx for idx in range(observed.ndim) if idx != dim)
-        return self.calculate_qparams(…
+        return self.calculate_qparams(
+            observed, reduce_dims=reduce_dims, tensor_id=tensor_id
+        )
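
Net effect of the base.py changes: under QuantizationStrategy.GROUP, each slice of group_size columns is now observed with its own tensor_id, so stateful observers can keep separate running statistics per group instead of smearing one running min/max across all groups. A standalone sketch of the per-group computation (illustrative only, not library code; assumes symmetric int8 scales):

    import torch

    # Mirror of the GROUP branch above: one set of qparams per column group.
    observed = torch.randn(64, 512)  # e.g. a Linear weight matrix
    group_size = 128

    scales = []
    for group_id, group_idx in enumerate(range(0, observed.shape[1], group_size)):
        group = observed[:, group_idx : group_idx + group_size]
        # per-row absolute range within this group -> symmetric int8 scale
        max_abs = group.abs().amax(dim=1, keepdim=True)
        scales.append(max_abs / 127.0)

    # shape (64, 4): one scale column per group, matching the
    # torch.cat(scales, dim=1) call in the diff above
    scale = torch.cat(scales, dim=1)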
compressed_tensors/quantization/observers/memoryless.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional, Tuple
+from typing import Any, Optional, Tuple
 
 import torch
 from compressed_tensors.quantization.observers.base import Observer
@@ -33,12 +33,14 @@ class MemorylessObserver(Observer):
     def calculate_qparams(
         self,
         observed: Tensor,
+        tensor_id: Optional[Any] = None,
         reduce_dims: Optional[Tuple[int]] = None,
     ) -> Tuple[FloatTensor, IntTensor]:
         """
         Returns the min and max values of observed tensor
 
         :param observed: observed tensor to calculate quantization parameters for
+        :param tensor_id: optional id for tensor; not used for memoryless
         :param reduce_dims: optional tuple of dimensions to reduce along,
             returned scale and zero point will be shaped (1,) along the
             reduced dimensions
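
The memoryless observer gains tensor_id purely for signature parity with the base class; it keeps no running state, so the id is ignored. A minimal usage sketch (assuming MemorylessObserver inherits the base Observer constructor, which takes quantization_args as shown earlier):

    import torch
    from compressed_tensors.quantization.quant_args import QuantizationArgs
    from compressed_tensors.quantization.observers.memoryless import MemorylessObserver

    observer = MemorylessObserver(quantization_args=QuantizationArgs())
    x = torch.randn(16, 32)
    # tensor_id is accepted but has no effect: qparams come from this tensor alone
    scale, zero_point = observer.calculate_qparams(x, tensor_id="anything")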
compressed_tensors/quantization/observers/min_max.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional, Tuple
+from typing import Any, Optional, Tuple
 
 import torch
 from compressed_tensors.quantization.observers.base import Observer
@@ -36,14 +36,15 @@ class MovingAverageMinMaxObserver(Observer):
     ):
         super().__init__(quantization_args=quantization_args)
 
-        self.min_val = …
-        self.max_val = …
+        self.min_val = {}
+        self.max_val = {}
         self.averaging_constant = averaging_constant
 
     def calculate_qparams(
         self,
         observed: Tensor,
         reduce_dims: Optional[Tuple[int]] = None,
+        tensor_id: Optional[Any] = None,
     ) -> Tuple[FloatTensor, IntTensor]:
         """
         Updates the observed min and max using a moving average smoothed by the
@@ -53,8 +54,11 @@ class MovingAverageMinMaxObserver(Observer):
         :param reduce_dims: optional tuple of dimensions to reduce along,
             returned scale and zero point will be shaped (1,) along the
             reduced dimensions
+        :param tensor_id: Optional id if different ranges of observed tensors are
+            passed, useful for sharding tensors by group_size
         :return: tuple of scale and zero point derived from the observed tensor
         """
+        tensor_id = tensor_id or "default"
 
         if not reduce_dims:
             min_val, max_val = torch.aminmax(observed)
@@ -62,15 +66,31 @@ class MovingAverageMinMaxObserver(Observer):
             min_val = torch.amin(observed, dim=reduce_dims, keepdims=True)
             max_val = torch.amax(observed, dim=reduce_dims, keepdims=True)
 
-        …
-        …
-        …
+        running_min_val = self.min_val.get(tensor_id, None)
+        running_max_val = self.max_val.get(tensor_id, None)
+
+        if running_min_val is None or running_max_val is None:
+            updated_min_val = min_val
+            updated_max_val = max_val
         else:
-            …
-                min_val - …
+            updated_min_val = running_min_val + self.averaging_constant * (
+                min_val - running_min_val
             )
-            …
-                max_val - …
+            updated_max_val = running_max_val + self.averaging_constant * (
+                max_val - running_max_val
             )
 
-        …
+        self.min_val[tensor_id] = updated_min_val
+        self.max_val[tensor_id] = updated_max_val
+
+        return calculate_qparams(
+            updated_min_val, updated_max_val, self.quantization_args
+        )
+
+    def get_qparams_along_dim(
+        self, observed, dim: int, tensor_id: Optional[Any] = None
+    ):
+        reduce_dims = tuple(idx for idx in range(observed.ndim) if idx != dim)
+        return self.calculate_qparams(
+            observed, reduce_dims=reduce_dims, tensor_id=tensor_id
+        )
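
Because min_val/max_val are now dicts keyed by tensor_id, a single MovingAverageMinMaxObserver can maintain an independent moving average per weight group. A hedged usage sketch (the averaging_constant parameter name comes from the hunk above, but its default is not shown, so it is passed explicitly). Note the `tensor_id = tensor_id or "default"` fallback: a falsy id such as 0 collapses onto the "default" key, so string ids are used here:

    import torch
    from compressed_tensors.quantization.quant_args import QuantizationArgs
    from compressed_tensors.quantization.observers.min_max import (
        MovingAverageMinMaxObserver,
    )

    args = QuantizationArgs(num_bits=8, symmetric=True)
    observer = MovingAverageMinMaxObserver(
        quantization_args=args, averaging_constant=0.01
    )

    w = torch.randn(128, 256)
    # each id accumulates its own running min/max across calibration steps
    scale_a, zp_a = observer.calculate_qparams(w[:, :128], tensor_id="g1")
    scale_b, zp_b = observer.calculate_qparams(w[:, 128:], tensor_id="g2")
    assert set(observer.min_val) == {"g1", "g2"}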
compressed_tensors/quantization/quant_config.py
@@ -13,11 +13,14 @@
 # limitations under the License.
 
 from enum import Enum
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Union
 
 from compressed_tensors.base import QUANTIZATION_CONFIG_NAME
 from compressed_tensors.config import CompressionFormat
-from compressed_tensors.quantization.quant_scheme import QuantizationScheme
+from compressed_tensors.quantization.quant_scheme import (
+    QuantizationScheme,
+    preset_name_to_scheme,
+)
 from compressed_tensors.quantization.utils import (
     calculate_compression_ratio,
     is_module_quantized,
@@ -105,7 +108,8 @@ class QuantizationConfig(BaseModel):
         mapped to a QuantizationScheme in config_groups.
 
     :param config_groups: dict of QuantizationSchemes specifying the quantization
-        settings for each quantized layer
+        settings for each quantized layer. A group could also be a reference to
+        a predefined scheme name, mapped to a list of its target layers/classes
     :param quant_method: a constant used to differentiate sparseML quantization from
         other quantization configs
     :param format: specifies how the quantized model is stored on disk
@@ -117,13 +121,26 @@ class QuantizationConfig(BaseModel):
         are not quantized even if they match up with a target in config_groups
     """
 
-    config_groups: Dict[str, QuantizationScheme]
+    config_groups: Dict[str, Union[QuantizationScheme, List[str]]]
     quant_method: str = "sparseml"
     format: str = "fakequant"
     quantization_status: QuantizationStatus = QuantizationStatus.INITIALIZED
     global_compression_ratio: Optional[float] = None
     ignore: Optional[List[str]] = Field(default_factory=list)
 
+    def model_post_init(self, __context):
+        """
+        updates any quantization schemes defined as presets to be fully loaded
+        schemes
+        """
+        for group_name, targets_or_scheme in self.config_groups.items():
+            if isinstance(targets_or_scheme, QuantizationScheme):
+                continue  # scheme already defined
+            self.config_groups[group_name] = preset_name_to_scheme(
+                name=group_name,
+                targets=targets_or_scheme,
+            )
+
     @staticmethod
     def from_model_config(model_name_or_path) -> "QuantizationConfig":
         """
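
Taken together, a config_groups entry can now be just a list of targets keyed by a preset name, and model_post_init (a pydantic v2 hook that runs at construction time) expands it into a full QuantizationScheme. A hedged sketch (assuming QuantizationConfig is importable from the module shown above; the group name must match a PRESET_SCHEMES key, case-insensitively):

    from compressed_tensors.quantization.quant_config import QuantizationConfig
    from compressed_tensors.quantization.quant_scheme import QuantizationScheme

    # "W4A16" names both the group and the preset; the value lists target layers
    config = QuantizationConfig(config_groups={"W4A16": ["Linear"]})

    group = config.config_groups["W4A16"]
    assert isinstance(group, QuantizationScheme)  # expanded by model_post_init
    assert group.weights.num_bits == 4  # from the W4A16 preset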
compressed_tensors/quantization/quant_scheme.py
@@ -12,13 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from copy import deepcopy
 from typing import List, Optional
 
 from compressed_tensors.quantization.quant_args import QuantizationArgs
 from pydantic import BaseModel
 
 
-__all__ = ["QuantizationScheme"]
+__all__ = [
+    "QuantizationScheme",
+    "preset_name_to_scheme",
+]
 
 
 class QuantizationScheme(BaseModel):
@@ -37,3 +41,70 @@ class QuantizationScheme(BaseModel):
     weights: Optional[QuantizationArgs] = None
     input_activations: Optional[QuantizationArgs] = None
     output_activations: Optional[QuantizationArgs] = None
+
+    @classmethod
+    def default_scheme(
+        cls,
+        targets: Optional[List[str]] = None,
+    ):
+
+        if targets is None:
+            # default to quantizing all Linear layers
+            targets = ["Linear"]
+
+        # default to 8 bit integer symmetric quantization
+        # for weights
+        weights = QuantizationArgs(num_bits=8, symmetric=True)
+
+        # default to 8 bit integer asymmetric quantization
+        input_activations = QuantizationArgs(num_bits=8, symmetric=True)
+
+        # Do not quantize the output activations
+        # by default
+        output_activations = None
+
+        return cls(
+            targets=targets,
+            weights=weights,
+            input_activations=input_activations,
+            output_activations=output_activations,
+        )
+
+
+"""
+Pre-Set Quantization Scheme Args
+"""
+
+
+def preset_name_to_scheme(name: str, targets: List[str]) -> QuantizationScheme:
+    """
+    :param name: preset quantization settings name. must exist in upper case in
+        PRESET_SCHEMES
+    :param targets: list of quantization targets to be passed to the Scheme
+    :return: new QuantizationScheme for a given name with the given targets
+    """
+    name = name.upper()
+
+    if name not in PRESET_SCHEMES:
+        raise KeyError(
+            f"Unknown preset scheme name {name}, "
+            f"available names: {list(PRESET_SCHEMES.keys())}"
+        )
+
+    scheme_args = deepcopy(PRESET_SCHEMES[name])  # deepcopy to avoid args references
+    return QuantizationScheme(
+        targets=targets,
+        **scheme_args,
+    )
+
+
+W8A8 = dict(
+    weights=QuantizationArgs(), input_activations=QuantizationArgs(symmetric=False)
+)
+
+W4A16 = dict(weights=QuantizationArgs(num_bits=4, symmetric=False))
+
+PRESET_SCHEMES = {
+    "W8A8": W8A8,
+    "W4A16": W4A16,
+}
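
The deepcopy in preset_name_to_scheme is what keeps schemes built from the same preset independent; without it they would share QuantizationArgs instances. A quick sketch of direct use ("W2A8" is a hypothetical unknown name; names are upper-cased before lookup, so lower-case input is fine):

    from compressed_tensors.quantization.quant_scheme import preset_name_to_scheme

    scheme = preset_name_to_scheme("w4a16", targets=["Linear"])
    assert scheme.weights.num_bits == 4

    other = preset_name_to_scheme("W4A16", targets=["Embedding"])
    assert scheme.weights is not other.weights  # deepcopy -> no shared args

    try:
        preset_name_to_scheme("W2A8", targets=["Linear"])
    except KeyError as err:
        print(err)  # lists the available preset names: ['W8A8', 'W4A16']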
{compressed_tensors_nightly-0.3.3.20240521.dist-info → compressed_tensors_nightly-0.3.3.20240523.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: compressed-tensors-nightly
-Version: 0.3.3.20240521
+Version: 0.3.3.20240523
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
{compressed_tensors_nightly-0.3.3.20240521.dist-info → compressed_tensors_nightly-0.3.3.20240523.dist-info}/RECORD
@@ -15,8 +15,8 @@ compressed_tensors/config/dense.py,sha256=NgSxnFCnckU9-iunxEaqiFwqgdO7YYxlWKR74j
 compressed_tensors/config/sparse_bitmask.py,sha256=pZUboRNZTu6NajGOQEFExoPknak5ynVAUeiiYpS1Gt8,1308
 compressed_tensors/quantization/__init__.py,sha256=83J5bPB7PavN2TfCoW7_vEDhfYpm4TDrqYO9vdSQ5bk,760
 compressed_tensors/quantization/quant_args.py,sha256=A6b2V8lhsM8Ho8RjlPBQdxRUDNWhqq-ie5E3RR2_GNg,4360
-compressed_tensors/quantization/quant_config.py,sha256=…
-compressed_tensors/quantization/quant_scheme.py,sha256=…
+compressed_tensors/quantization/quant_config.py,sha256=3BcbQ8-Ah7LbTDSSkRu29Yiid33xo0C1ki6NVhxLiaY,8727
+compressed_tensors/quantization/quant_scheme.py,sha256=QwZsCo8QR9ISB_d58WhIngk2gsMM8ooX-LcRPR-JDRw,3341
 compressed_tensors/quantization/lifecycle/__init__.py,sha256=ggRGWRqhCxCaTTDWRcgTVX3axnS2xV6rc5YvdzK7fSg,798
 compressed_tensors/quantization/lifecycle/apply.py,sha256=whKfNGC_EZm0BC23AP7qWfjRe5OJVWmcZOpX7lryZZc,7625
 compressed_tensors/quantization/lifecycle/calibration.py,sha256=mLns4jlaWmBwOW8Jtlm5bMX-JET1AiZYUBO7qa-XuxI,1776
@@ -25,10 +25,10 @@ compressed_tensors/quantization/lifecycle/forward.py,sha256=x9JaIX3TK7cb_-0aCOTT
 compressed_tensors/quantization/lifecycle/frozen.py,sha256=h1XYt89MouBTf3jTYLG_6OdFxIu5q2N8tPjsy6J4E6Y,1726
 compressed_tensors/quantization/lifecycle/initialize.py,sha256=U6g9qifSF6pagQZQZEwd-rwWC6uQ_dZXn1wg6nr1Abg,3697
 compressed_tensors/quantization/observers/__init__.py,sha256=DNH31NQYrIBBcmHsMyFA6whh4pbRsLwuNa6L8AeXaGc,745
-compressed_tensors/quantization/observers/base.py,sha256=…
+compressed_tensors/quantization/observers/base.py,sha256=kywLVwycFvGxuZMU2cy8-KYyNrZCHkinN6YzCL7boLE,5121
 compressed_tensors/quantization/observers/helpers.py,sha256=JwALNfBYY9Eyl8Q180t0lGh8szumQj8TygfNl-isErs,2166
-compressed_tensors/quantization/observers/memoryless.py,sha256=…
-compressed_tensors/quantization/observers/min_max.py,sha256=…
+compressed_tensors/quantization/observers/memoryless.py,sha256=jH_c6K3gxf4W3VNXQ7tbnP-J_86QTrEfjBn6Kh1C-H8,2165
+compressed_tensors/quantization/observers/min_max.py,sha256=UK7zCMzxv9GGn6BflBxdajV20RiWaCY2RHcvZodCP1w,3669
 compressed_tensors/quantization/utils/__init__.py,sha256=VdtEmP0bvuND_IGQnyqUPc5lnFp-1_yD7StKSX4x80w,656
 compressed_tensors/quantization/utils/helpers.py,sha256=NzAH18Cn_-mTAR87y6IlcQU5gC393XSjgNKC9CRkr78,6017
 compressed_tensors/registry/__init__.py,sha256=FwLSNYqfIrb5JD_6OK_MT4_svvKTN_nEhpgQlQvGbjI,658
@@ -36,8 +36,8 @@ compressed_tensors/registry/registry.py,sha256=fxjOjh2wklCvJhQxwofdy-zV8q7MkQ85S
 compressed_tensors/utils/__init__.py,sha256=5DrYjoZbaEvSkJcC-GRSbM_RBHVF4tG9gMd3zsJnjLw,665
 compressed_tensors/utils/helpers.py,sha256=h0jfl9drs5FAx40tCHRcVtJqXixB5hT5yq_IG2aY_-w,1735
 compressed_tensors/utils/safetensors_load.py,sha256=wo9UirGrGlenBqZeqotvpCT7D5MEdjCo2J3HeRaIFoU,8502
-compressed_tensors_nightly-0.3.3.20240521.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-compressed_tensors_nightly-0.3.3.20240521.dist-info/METADATA,sha256=…
-compressed_tensors_nightly-0.3.3.20240521.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-compressed_tensors_nightly-0.3.3.20240521.dist-info/top_level.txt,sha256=w2i-GyPs2s1UwVxvutSvN_lM22SXC2hQFBmoMcPnV7Y,19
-compressed_tensors_nightly-0.3.3.20240521.dist-info/RECORD,,
+compressed_tensors_nightly-0.3.3.20240523.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+compressed_tensors_nightly-0.3.3.20240523.dist-info/METADATA,sha256=_c67GXEm0cMZ_AGWhcLqsMZ3hSbFB4KdQ3lL9Dg7M8M,5633
+compressed_tensors_nightly-0.3.3.20240523.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+compressed_tensors_nightly-0.3.3.20240523.dist-info/top_level.txt,sha256=w2i-GyPs2s1UwVxvutSvN_lM22SXC2hQFBmoMcPnV7Y,19
+compressed_tensors_nightly-0.3.3.20240523.dist-info/RECORD,,