compressed-tensors-nightly 0.3.3.20240521__py3-none-any.whl → 0.3.3.20240523__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries, and is provided for informational purposes only; it reflects the changes between the two package versions as they appear in their respective public registries.
--- a/compressed_tensors/quantization/observers/base.py
+++ b/compressed_tensors/quantization/observers/base.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional, Tuple
+from typing import Any, Optional, Tuple
 
 import torch
 from compressed_tensors.quantization.quant_args import (
@@ -93,15 +93,18 @@ class Observer(Module, RegistryMixin):
         elif self.quantization_args.strategy == QuantizationStrategy.GROUP:
             columns = observed.shape[1]
             scales, zero_points = [], []
-            for i in range(0, columns, self.quantization_args.group_size):
+            group_idxs = range(0, columns, self.quantization_args.group_size)
+            for group_id, group_idx in enumerate(group_idxs):
                 scale, zero_point = self.get_qparams_along_dim(
-                    observed[:, i : (i + group_size)],
+                    observed[:, group_idx : (group_idx + group_size)],
                     0,
+                    tensor_id=group_id,
                 )
                 scales.append(scale)
                 zero_points.append(zero_point)
-            self._scale = torch.stack(scales, dim=1, out=self._scale)
-            self._zero_point = torch.stack(zero_points, dim=1, out=self._zero_point)
+
+            self._scale = torch.cat(scales, dim=1, out=self._scale)
+            self._zero_point = torch.cat(zero_points, dim=1, out=self._zero_point)
 
         elif self.quantization_args.strategy == QuantizationStrategy.CHANNEL:
             # assume observed is transposed, because its the output, hence use dim 0
@@ -116,6 +119,10 @@ class Observer(Module, RegistryMixin):
 
         return self._scale, self._zero_point
 
-    def get_qparams_along_dim(self, observed, dim: int):
+    def get_qparams_along_dim(
+        self, observed, dim: int, tensor_id: Optional[Any] = None
+    ):
         reduce_dims = tuple(idx for idx in range(observed.ndim) if idx != dim)
-        return self.calculate_qparams(observed, reduce_dims=reduce_dims)
+        return self.calculate_qparams(
+            observed, reduce_dims=reduce_dims, tensor_id=tensor_id
+        )
--- a/compressed_tensors/quantization/observers/memoryless.py
+++ b/compressed_tensors/quantization/observers/memoryless.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional, Tuple
+from typing import Any, Optional, Tuple
 
 import torch
 from compressed_tensors.quantization.observers.base import Observer
@@ -33,12 +33,14 @@ class MemorylessObserver(Observer):
     def calculate_qparams(
         self,
         observed: Tensor,
+        tensor_id: Optional[Any] = None,
         reduce_dims: Optional[Tuple[int]] = None,
     ) -> Tuple[FloatTensor, IntTensor]:
         """
         Returns the min and max values of observed tensor
 
         :param observed: observed tensor to calculate quantization parameters for
+        :param tensor_id: optional id for tensor; not used for memoryless
         :param reduce_dims: optional tuple of dimensions to reduce along,
             returned scale and zero point will be shaped (1,) along the
             reduced dimensions
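Here tensor_id exists only for signature parity with the base class: a memoryless observer keeps no per-id state, so the id cannot influence the result. A self-contained toy illustration (hypothetical function, not the library API):

    import torch

    def memoryless_qparams(observed, tensor_id=None, num_bits=8):
        # recomputed from the current tensor alone; tensor_id is ignored
        min_val, max_val = torch.aminmax(observed)
        scale = (max_val - min_val) / (2**num_bits - 1)
        zero_point = torch.round(-min_val / scale).to(torch.int32)
        return scale, zero_point

    x = torch.randn(4, 4)
    s0, z0 = memoryless_qparams(x, tensor_id=0)
    s1, z1 = memoryless_qparams(x, tensor_id="anything-else")
    assert torch.equal(s0, s1) and torch.equal(z0, z1)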
--- a/compressed_tensors/quantization/observers/min_max.py
+++ b/compressed_tensors/quantization/observers/min_max.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional, Tuple
+from typing import Any, Optional, Tuple
 
 import torch
 from compressed_tensors.quantization.observers.base import Observer
@@ -36,14 +36,15 @@ class MovingAverageMinMaxObserver(Observer):
     ):
         super().__init__(quantization_args=quantization_args)
 
-        self.min_val = None
-        self.max_val = None
+        self.min_val = {}
+        self.max_val = {}
         self.averaging_constant = averaging_constant
 
     def calculate_qparams(
         self,
         observed: Tensor,
         reduce_dims: Optional[Tuple[int]] = None,
+        tensor_id: Optional[Any] = None,
     ) -> Tuple[FloatTensor, IntTensor]:
         """
         Updates the observed min and max using a moving average smoothed by the
@@ -53,8 +54,11 @@ class MovingAverageMinMaxObserver(Observer):
         :param reduce_dims: optional tuple of dimensions to reduce along,
             returned scale and zero point will be shaped (1,) along the
             reduced dimensions
+        :param tensor_id: Optional id if different ranges of observed tensors are
+            passed, useful for sharding tensors by group_size
         :return: tuple of scale and zero point derived from the observed tensor
         """
+        tensor_id = tensor_id or "default"
 
         if not reduce_dims:
             min_val, max_val = torch.aminmax(observed)
@@ -62,15 +66,31 @@
             min_val = torch.amin(observed, dim=reduce_dims, keepdims=True)
             max_val = torch.amax(observed, dim=reduce_dims, keepdims=True)
 
-        if self.min_val is None and self.max_val is None:
-            self.min_val = min_val
-            self.max_val = max_val
+        running_min_val = self.min_val.get(tensor_id, None)
+        running_max_val = self.max_val.get(tensor_id, None)
+
+        if running_min_val is None or running_max_val is None:
+            updated_min_val = min_val
+            updated_max_val = max_val
         else:
-            self.min_val = self.min_val + self.averaging_constant * (
-                min_val - self.min_val
+            updated_min_val = running_min_val + self.averaging_constant * (
+                min_val - running_min_val
             )
-            self.max_val = self.max_val + self.averaging_constant * (
-                max_val - self.max_val
+            updated_max_val = running_max_val + self.averaging_constant * (
+                max_val - running_max_val
             )
 
-        return calculate_qparams(self.min_val, self.max_val, self.quantization_args)
+        self.min_val[tensor_id] = updated_min_val
+        self.max_val[tensor_id] = updated_max_val
+
+        return calculate_qparams(
+            updated_min_val, updated_max_val, self.quantization_args
+        )
+
+    def get_qparams_along_dim(
+        self, observed, dim: int, tensor_id: Optional[Any] = None
+    ):
+        reduce_dims = tuple(idx for idx in range(observed.ndim) if idx != dim)
+        return self.calculate_qparams(
+            observed, reduce_dims=reduce_dims, tensor_id=tensor_id
+        )
--- a/compressed_tensors/quantization/quant_config.py
+++ b/compressed_tensors/quantization/quant_config.py
@@ -13,11 +13,14 @@
 # limitations under the License.
 
 from enum import Enum
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Union
 
 from compressed_tensors.base import QUANTIZATION_CONFIG_NAME
 from compressed_tensors.config import CompressionFormat
-from compressed_tensors.quantization.quant_scheme import QuantizationScheme
+from compressed_tensors.quantization.quant_scheme import (
+    QuantizationScheme,
+    preset_name_to_scheme,
+)
 from compressed_tensors.quantization.utils import (
     calculate_compression_ratio,
     is_module_quantized,
@@ -105,7 +108,8 @@ class QuantizationConfig(BaseModel):
     mapped to a QuantizationScheme in config_groups.
 
     :param config_groups: dict of QuantizationSchemes specifying the quantization
-        settings for each quantized layer
+        settings for each quantized layer. A group could also be a reference to
+        a predefined scheme name, mapped to a list of its target layers/classes
     :param quant_method: a constant used to differentiate sparseML quantization from
         other quantization configs
     :param format: specifies how the quantized model is stored on disk
@@ -117,13 +121,26 @@
     are not quantized even if they match up with a target in config_groups
     """
 
-    config_groups: Dict[str, QuantizationScheme]
+    config_groups: Dict[str, Union[QuantizationScheme, List[str]]]
     quant_method: str = "sparseml"
     format: str = "fakequant"
     quantization_status: QuantizationStatus = QuantizationStatus.INITIALIZED
     global_compression_ratio: Optional[float] = None
     ignore: Optional[List[str]] = Field(default_factory=list)
 
+    def model_post_init(self, __context):
+        """
+        updates any quantization schemes defined as presets to be fully loaded
+        schemes
+        """
+        for group_name, targets_or_scheme in self.config_groups.items():
+            if isinstance(targets_or_scheme, QuantizationScheme):
+                continue  # scheme already defined
+            self.config_groups[group_name] = preset_name_to_scheme(
+                name=group_name,
+                targets=targets_or_scheme,
+            )
+
     @staticmethod
     def from_model_config(model_name_or_path) -> "QuantizationConfig":
         """
--- a/compressed_tensors/quantization/quant_scheme.py
+++ b/compressed_tensors/quantization/quant_scheme.py
@@ -12,13 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from copy import deepcopy
 from typing import List, Optional
 
 from compressed_tensors.quantization.quant_args import QuantizationArgs
 from pydantic import BaseModel
 
 
-__all__ = ["QuantizationScheme"]
+__all__ = [
+    "QuantizationScheme",
+    "preset_name_to_scheme",
+]
 
 
 class QuantizationScheme(BaseModel):
@@ -37,3 +41,70 @@ class QuantizationScheme(BaseModel):
     weights: Optional[QuantizationArgs] = None
     input_activations: Optional[QuantizationArgs] = None
     output_activations: Optional[QuantizationArgs] = None
+
+    @classmethod
+    def default_scheme(
+        cls,
+        targets: Optional[List[str]] = None,
+    ):
+
+        if targets is None:
+            # default to quantizing all Linear layers
+            targets = ["Linear"]
+
+        # default to 8 bit integer symmetric quantization
+        # for weights
+        weights = QuantizationArgs(num_bits=8, symmetric=True)
+
+        # default to 8 bit integer symmetric quantization for input activations
+        input_activations = QuantizationArgs(num_bits=8, symmetric=True)
+
+        # Do not quantize the output activations
+        # by default
+        output_activations = None
+
+        return cls(
+            targets=targets,
+            weights=weights,
+            input_activations=input_activations,
+            output_activations=output_activations,
+        )
+
+
+"""
+Pre-Set Quantization Scheme Args
+"""
+
+
+def preset_name_to_scheme(name: str, targets: List[str]) -> QuantizationScheme:
+    """
+    :param name: preset quantization settings name. must exist in upper case in
+        PRESET_SCHEMES
+    :param targets: list of quantization targets to be passed to the Scheme
+    :return: new QuantizationScheme for a given name with the given targets
+    """
+    name = name.upper()
+
+    if name not in PRESET_SCHEMES:
+        raise KeyError(
+            f"Unknown preset scheme name {name}, "
+            f"available names: {list(PRESET_SCHEMES.keys())}"
+        )
+
+    scheme_args = deepcopy(PRESET_SCHEMES[name])  # deepcopy to avoid args references
+    return QuantizationScheme(
+        targets=targets,
+        **scheme_args,
+    )
+
+
+W8A8 = dict(
+    weights=QuantizationArgs(), input_activations=QuantizationArgs(symmetric=False)
+)
+
+W4A16 = dict(weights=QuantizationArgs(num_bits=4, symmetric=False))
+
+PRESET_SCHEMES = {
+    "W8A8": W8A8,
+    "W4A16": W4A16,
+}
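Taken together, the preset machinery is case-insensitive on the name and fails loudly on unknown presets; default_scheme supplies an all-Linear int8 fallback. A brief usage sketch (outputs shown as comments):

    from compressed_tensors.quantization.quant_scheme import (
        QuantizationScheme,
        preset_name_to_scheme,
    )

    # names are upper-cased before lookup, so "w8a8" resolves to W8A8
    scheme = preset_name_to_scheme("w8a8", targets=["Linear"])
    print(scheme.input_activations.symmetric)  # False (asymmetric activations)

    # unknown names raise KeyError listing the available presets
    try:
        preset_name_to_scheme("w2a2", targets=["Linear"])
    except KeyError as err:
        print(err)  # lists ['W8A8', 'W4A16']

    # default: symmetric int8 weights and input activations on Linear layers
    default = QuantizationScheme.default_scheme()
    print(default.targets, default.weights.num_bits)  # ['Linear'] 8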
--- a/compressed_tensors_nightly-0.3.3.20240521.dist-info/METADATA
+++ b/compressed_tensors_nightly-0.3.3.20240523.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: compressed-tensors-nightly
-Version: 0.3.3.20240521
+Version: 0.3.3.20240523
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
--- a/compressed_tensors_nightly-0.3.3.20240521.dist-info/RECORD
+++ b/compressed_tensors_nightly-0.3.3.20240523.dist-info/RECORD
@@ -15,8 +15,8 @@ compressed_tensors/config/dense.py,sha256=NgSxnFCnckU9-iunxEaqiFwqgdO7YYxlWKR74j
 compressed_tensors/config/sparse_bitmask.py,sha256=pZUboRNZTu6NajGOQEFExoPknak5ynVAUeiiYpS1Gt8,1308
 compressed_tensors/quantization/__init__.py,sha256=83J5bPB7PavN2TfCoW7_vEDhfYpm4TDrqYO9vdSQ5bk,760
 compressed_tensors/quantization/quant_args.py,sha256=A6b2V8lhsM8Ho8RjlPBQdxRUDNWhqq-ie5E3RR2_GNg,4360
-compressed_tensors/quantization/quant_config.py,sha256=U6oEzheNK1d-0kHARzwepasnmS7HHqU_zGwoDBJ-lxU,8042
-compressed_tensors/quantization/quant_scheme.py,sha256=X3oqmZPiIKtX5tEKKUj-0N6hB68NeiU2b1GcQEQPadQ,1480
+compressed_tensors/quantization/quant_config.py,sha256=3BcbQ8-Ah7LbTDSSkRu29Yiid33xo0C1ki6NVhxLiaY,8727
+compressed_tensors/quantization/quant_scheme.py,sha256=QwZsCo8QR9ISB_d58WhIngk2gsMM8ooX-LcRPR-JDRw,3341
 compressed_tensors/quantization/lifecycle/__init__.py,sha256=ggRGWRqhCxCaTTDWRcgTVX3axnS2xV6rc5YvdzK7fSg,798
 compressed_tensors/quantization/lifecycle/apply.py,sha256=whKfNGC_EZm0BC23AP7qWfjRe5OJVWmcZOpX7lryZZc,7625
 compressed_tensors/quantization/lifecycle/calibration.py,sha256=mLns4jlaWmBwOW8Jtlm5bMX-JET1AiZYUBO7qa-XuxI,1776
@@ -25,10 +25,10 @@ compressed_tensors/quantization/lifecycle/forward.py,sha256=x9JaIX3TK7cb_-0aCOTT
 compressed_tensors/quantization/lifecycle/frozen.py,sha256=h1XYt89MouBTf3jTYLG_6OdFxIu5q2N8tPjsy6J4E6Y,1726
 compressed_tensors/quantization/lifecycle/initialize.py,sha256=U6g9qifSF6pagQZQZEwd-rwWC6uQ_dZXn1wg6nr1Abg,3697
 compressed_tensors/quantization/observers/__init__.py,sha256=DNH31NQYrIBBcmHsMyFA6whh4pbRsLwuNa6L8AeXaGc,745
-compressed_tensors/quantization/observers/base.py,sha256=yIV2bd9PKPZwodgiBTZEco2ARbD3B0rOKDC0MOFluZs,4900
+compressed_tensors/quantization/observers/base.py,sha256=kywLVwycFvGxuZMU2cy8-KYyNrZCHkinN6YzCL7boLE,5121
 compressed_tensors/quantization/observers/helpers.py,sha256=JwALNfBYY9Eyl8Q180t0lGh8szumQj8TygfNl-isErs,2166
-compressed_tensors/quantization/observers/memoryless.py,sha256=Gach22cZLhDms6ueKF56XOiLhyWVIEYIEXRRXP5Nu8I,2045
-compressed_tensors/quantization/observers/min_max.py,sha256=OGrtyn6_sWuTSx5QgUPVKRIiarfWrK9QqXeRXoJQynw,2861
+compressed_tensors/quantization/observers/memoryless.py,sha256=jH_c6K3gxf4W3VNXQ7tbnP-J_86QTrEfjBn6Kh1C-H8,2165
+compressed_tensors/quantization/observers/min_max.py,sha256=UK7zCMzxv9GGn6BflBxdajV20RiWaCY2RHcvZodCP1w,3669
 compressed_tensors/quantization/utils/__init__.py,sha256=VdtEmP0bvuND_IGQnyqUPc5lnFp-1_yD7StKSX4x80w,656
 compressed_tensors/quantization/utils/helpers.py,sha256=NzAH18Cn_-mTAR87y6IlcQU5gC393XSjgNKC9CRkr78,6017
 compressed_tensors/registry/__init__.py,sha256=FwLSNYqfIrb5JD_6OK_MT4_svvKTN_nEhpgQlQvGbjI,658
@@ -36,8 +36,8 @@ compressed_tensors/registry/registry.py,sha256=fxjOjh2wklCvJhQxwofdy-zV8q7MkQ85S
 compressed_tensors/utils/__init__.py,sha256=5DrYjoZbaEvSkJcC-GRSbM_RBHVF4tG9gMd3zsJnjLw,665
 compressed_tensors/utils/helpers.py,sha256=h0jfl9drs5FAx40tCHRcVtJqXixB5hT5yq_IG2aY_-w,1735
 compressed_tensors/utils/safetensors_load.py,sha256=wo9UirGrGlenBqZeqotvpCT7D5MEdjCo2J3HeRaIFoU,8502
-compressed_tensors_nightly-0.3.3.20240521.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-compressed_tensors_nightly-0.3.3.20240521.dist-info/METADATA,sha256=DTxrrkh-4Wr9G5MAOS_2ILUsgrOIT-RDYi2IiVc13xg,5633
-compressed_tensors_nightly-0.3.3.20240521.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-compressed_tensors_nightly-0.3.3.20240521.dist-info/top_level.txt,sha256=w2i-GyPs2s1UwVxvutSvN_lM22SXC2hQFBmoMcPnV7Y,19
-compressed_tensors_nightly-0.3.3.20240521.dist-info/RECORD,,
+compressed_tensors_nightly-0.3.3.20240523.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+compressed_tensors_nightly-0.3.3.20240523.dist-info/METADATA,sha256=_c67GXEm0cMZ_AGWhcLqsMZ3hSbFB4KdQ3lL9Dg7M8M,5633
+compressed_tensors_nightly-0.3.3.20240523.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+compressed_tensors_nightly-0.3.3.20240523.dist-info/top_level.txt,sha256=w2i-GyPs2s1UwVxvutSvN_lM22SXC2hQFBmoMcPnV7Y,19
+compressed_tensors_nightly-0.3.3.20240523.dist-info/RECORD,,