compressed-tensors 0.3.3__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- compressed_tensors/base.py +2 -1
- compressed_tensors/compressors/__init__.py +5 -1
- compressed_tensors/compressors/base.py +11 -54
- compressed_tensors/compressors/dense.py +4 -4
- compressed_tensors/compressors/helpers.py +12 -12
- compressed_tensors/compressors/int_quantized.py +126 -0
- compressed_tensors/compressors/marlin_24.py +250 -0
- compressed_tensors/compressors/model_compressor.py +315 -0
- compressed_tensors/compressors/pack_quantized.py +212 -0
- compressed_tensors/compressors/sparse_bitmask.py +3 -3
- compressed_tensors/compressors/utils/__init__.py +19 -0
- compressed_tensors/compressors/utils/helpers.py +43 -0
- compressed_tensors/compressors/utils/permutations_24.py +65 -0
- compressed_tensors/compressors/utils/semi_structured_conversions.py +341 -0
- compressed_tensors/config/base.py +7 -4
- compressed_tensors/config/dense.py +4 -4
- compressed_tensors/config/sparse_bitmask.py +3 -3
- compressed_tensors/quantization/lifecycle/__init__.py +1 -0
- compressed_tensors/quantization/lifecycle/apply.py +62 -11
- compressed_tensors/quantization/lifecycle/compressed.py +69 -0
- compressed_tensors/quantization/lifecycle/forward.py +161 -54
- compressed_tensors/quantization/lifecycle/frozen.py +4 -0
- compressed_tensors/quantization/lifecycle/initialize.py +33 -5
- compressed_tensors/quantization/observers/base.py +31 -27
- compressed_tensors/quantization/observers/helpers.py +6 -1
- compressed_tensors/quantization/observers/memoryless.py +17 -9
- compressed_tensors/quantization/observers/min_max.py +44 -13
- compressed_tensors/quantization/quant_args.py +2 -2
- compressed_tensors/quantization/quant_config.py +69 -21
- compressed_tensors/quantization/quant_scheme.py +81 -1
- compressed_tensors/quantization/utils/helpers.py +76 -8
- compressed_tensors/utils/helpers.py +24 -6
- compressed_tensors/utils/safetensors_load.py +3 -2
- compressed_tensors/version.py +53 -0
- {compressed_tensors-0.3.3.dist-info → compressed_tensors-0.4.0.dist-info}/METADATA +46 -8
- compressed_tensors-0.4.0.dist-info/RECORD +48 -0
- compressed_tensors-0.3.3.dist-info/RECORD +0 -38
- {compressed_tensors-0.3.3.dist-info → compressed_tensors-0.4.0.dist-info}/LICENSE +0 -0
- {compressed_tensors-0.3.3.dist-info → compressed_tensors-0.4.0.dist-info}/WHEEL +0 -0
- {compressed_tensors-0.3.3.dist-info → compressed_tensors-0.4.0.dist-info}/top_level.txt +0 -0
@@ -14,6 +14,7 @@
 
 from functools import wraps
 from math import ceil
+from typing import Optional
 
 import torch
 from compressed_tensors.quantization.quant_args import (
@@ -25,7 +26,13 @@ from compressed_tensors.quantization.quant_scheme import QuantizationScheme
 from torch.nn import Module
 
 
-__all__ = [
+__all__ = [
+    "quantize",
+    "dequantize",
+    "fake_quantize",
+    "wrap_module_forward_quantized",
+    "maybe_calibrate_or_quantize",
+]
 
 
 @torch.no_grad()
@@ -33,14 +40,39 @@ def quantize(
     x: torch.Tensor,
     scale: torch.Tensor,
     zero_point: torch.Tensor,
-
-
+    args: QuantizationArgs,
+    dtype: Optional[torch.dtype] = None,
 ) -> torch.Tensor:
+    """
+    Quantize the input tensor x using the QuantizationStrategy specified in args.
+    Quantization can be done per tensor, channel, token or group. For group
+    quantization, the group_size must be divisible by the column size. The input scale
+    and zero_points are reshaped to support vectorization (Assumes 1 is the
+    channel dimension)
 
-
-
-
-
+    :param x: Input tensor
+    :param scale: scale tensor
+    :param zero_point: zero point tensor
+    :param args: quantization args dictating how to quantize x
+    :param dtype: optional dtype to cast the quantized output to
+    :return: fake quantized tensor
+    """
+    # ensure all tensors are on the same device
+    # assumes that the target device is the input
+    # tensor's device
+    if x.device != scale.device:
+        scale = scale.to(x.device)
+    if x.device != zero_point.device:
+        zero_point = zero_point.to(x.device)
+
+    return _process_quantization(
+        x=x,
+        scale=scale,
+        zero_point=zero_point,
+        args=args,
+        dtype=dtype,
+        do_quantize=True,
+        do_dequantize=False,
     )
 
 
@@ -49,8 +81,42 @@ def dequantize(
     x_q: torch.Tensor,
     scale: torch.Tensor,
     zero_point: torch.Tensor,
+    args: QuantizationArgs = None,
 ) -> torch.Tensor:
-
+    """
+    Dequantize a quantized input tensor x_q based on the strategy specified in args. If
+    args is not provided, the strategy will be inferred.
+
+    :param x: quantized input tensor
+    :param scale: scale tensor
+    :param zero_point: zero point tensor
+    :param args: quantization args used to quantize x_q
+    :return: dequantized float tensor
+    """
+    if args is None:
+        if scale.ndim == 0 or scale.ndim == 1:
+            args = QuantizationArgs(strategy=QuantizationStrategy.TENSOR)
+        elif scale.ndim == 2:
+            if scale.shape[1] == 1:
+                args = QuantizationArgs(strategy=QuantizationStrategy.CHANNEL)
+            else:
+                group_size = int(x_q.shape[1] / scale.shape[1])
+                args = QuantizationArgs(
+                    strategy=QuantizationStrategy.GROUP, group_size=group_size
+                )
+        else:
+            raise ValueError(
+                f"Could not infer a quantization strategy from scale with {scale.ndim} "
+                "dimmensions. Expected 0-2 dimmensions."
+            )
+    return _process_quantization(
+        x=x_q,
+        scale=scale,
+        zero_point=zero_point,
+        args=args,
+        do_quantize=False,
+        do_dequantize=True,
+    )
 
 
 @torch.no_grad()
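When no args are passed, the new dequantize infers the quantization strategy from the shape of the scale tensor: a 0-d or 1-d scale means per-tensor, a [rows, 1] scale means per-channel, and any wider 2-d scale implies group quantization with group_size = columns / number of scale columns. A minimal standalone sketch of that inference rule in plain PyTorch (illustrative helper, not the package's code):

```python
import torch

def infer_strategy(x_q: torch.Tensor, scale: torch.Tensor) -> str:
    # 0-d or 1-d scale -> a single scale for the whole tensor
    if scale.ndim <= 1:
        return "tensor"
    if scale.ndim == 2:
        # one scale per output channel -> channel strategy
        if scale.shape[1] == 1:
            return "channel"
        # otherwise the number of scale columns implies the group size
        group_size = x_q.shape[1] // scale.shape[1]
        return f"group(group_size={group_size})"
    raise ValueError(f"cannot infer a strategy from a {scale.ndim}-d scale")

# 128 columns split into 4 groups of 32 -> scale has shape [rows, 4]
print(infer_strategy(torch.zeros(8, 128), torch.ones(8, 4)))  # group(group_size=32)
```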
@@ -61,30 +127,51 @@ def fake_quantize(
     args: QuantizationArgs,
 ) -> torch.Tensor:
     """
-    Fake quantize the input tensor x
-
-
-
-
-    the channel dimension)
+    Fake quantize the input tensor x by quantizing then dequantizing with
+    the QuantizationStrategy specified in args. Quantization can be done per tensor,
+    channel, token or group. For group quantization, the group_size must be divisible
+    by the column size. The input scale and zero_points are reshaped to support
+    vectorization (Assumes 1 is the channel dimension)
 
     :param x: Input tensor
     :param scale: scale tensor
     :param zero_point: zero point tensor
-    :param args: quantization args
+    :param args: quantization args dictating how to quantize x
     :return: fake quantized tensor
-
     """
-
-
-
+    return _process_quantization(
+        x=x,
+        scale=scale,
+        zero_point=zero_point,
+        args=args,
+        do_quantize=True,
+        do_dequantize=True,
+    )
+
 
+@torch.no_grad()
+def _process_quantization(
+    x: torch.Tensor,
+    scale: torch.Tensor,
+    zero_point: torch.Tensor,
+    args: QuantizationArgs,
+    dtype: Optional[torch.dtype] = None,
+    do_quantize: bool = True,
+    do_dequantize: bool = True,
+) -> torch.Tensor:
+    bit_range = 2**args.num_bits
+    q_max = torch.tensor(bit_range / 2 - 1, device=x.device)
+    q_min = torch.tensor(-bit_range / 2, device=x.device)
     group_size = args.group_size
 
-    # group
     if args.strategy == QuantizationStrategy.GROUP:
 
-
+        if do_dequantize and not do_quantize:
+            # if dequantizing a quantized type infer the output type from the scale
+            output = torch.zeros_like(x, dtype=scale.dtype)
+        else:
+            output_dtype = dtype if dtype is not None else x.dtype
+            output = torch.zeros_like(x, dtype=output_dtype)
 
         # TODO: vectorize the for loop
         # TODO: fix genetric assumption about the tensor size for computing group
@@ -106,48 +193,38 @@ def fake_quantize
         for i in range(ceil(columns / group_size)):
             # scale.shape should be [nchan, ndim]
             # sc.shape should be [nchan, 1] after unsqueeze
-
-
-            zp = zero_point[:, i].unsqueeze(1)
+            sc = scale[:, i].view(-1, 1)
+            zp = zero_point[:, i].view(-1, 1)
 
             idx = i * group_size
-
-
-
-
-
-
-
-
-
-
-
-            DQ = dequantize(Q, scale, zero_point)
-
-    # per-token
-    elif args.strategy == QuantizationStrategy.TOKEN:
-        # before: scale shape = [num_tokens]
-        # after: scale shape = [num_tokens, 1]
-        # x.shape = 1, num_tokens, 1]
-        # scale gets broadcasted as expected withput having [1, num_tokens, 1] shape
-
-        scale = scale.unsqueeze(1)
-        zero_point = zero_point.unsqueeze(1)
-
-        Q = quantize(x, scale, zero_point, min_q, max_q)
-        DQ = dequantize(Q, scale, zero_point)
+            if do_quantize:
+                output[:, idx : (idx + group_size)] = _quantize(
+                    x[:, idx : (idx + group_size)], sc, zp, q_min, q_max, dtype=dtype
+                )
+            if do_dequantize:
+                input = (
+                    output[:, idx : (idx + group_size)]
+                    if do_quantize
+                    else x[:, idx : (idx + group_size)]
+                )
+                output[:, idx : (idx + group_size)] = _dequantize(input, sc, zp)
 
-    else:
-
-
+    else:  # covers channel, token and tensor strategies
+        if do_quantize:
+            output = _quantize(x, scale, zero_point, q_min, q_max, dtype=dtype)
+        if do_dequantize:
+            output = _dequantize(output if do_quantize else x, scale, zero_point)
 
-    return
+    return output
 
 
 def wrap_module_forward_quantized(module: Module, scheme: QuantizationScheme):
     # expects a module already initialized and injected with the parameters in
     # initialize_module_for_quantization
-
+    if hasattr(module.forward, "__func__"):
+        forward_func_orig = module.forward.__func__
+    else:
+        forward_func_orig = module.forward.func
 
     @wraps(forward_func_orig)  # ensures docstring, names, etc are propagated
     def wrapped_forward(self, *args, **kwargs):
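The rewritten group branch writes into a preallocated output tensor and quantizes and/or dequantizes one column group at a time, picking the matching scale and zero-point column per group. A simplified standalone round-trip sketch of the same idea, assuming a symmetric signed integer range (not the package's _process_quantization):

```python
import torch
from math import ceil

def group_fake_quantize(x, scale, zero_point, group_size, num_bits=8):
    # integer grid limits for a signed num_bits quantizer
    q_min, q_max = -(2 ** (num_bits - 1)), 2 ** (num_bits - 1) - 1
    output = torch.zeros_like(x)
    columns = x.shape[1]
    for i in range(ceil(columns / group_size)):
        sc = scale[:, i].view(-1, 1)       # [rows, 1] scale for this group
        zp = zero_point[:, i].view(-1, 1)  # [rows, 1] zero point for this group
        idx = i * group_size
        block = x[:, idx : idx + group_size]
        q = torch.clamp(torch.round(block / sc + zp), q_min, q_max)
        output[:, idx : idx + group_size] = (q - zp) * sc  # quantize then dequantize
    return output

x = torch.randn(4, 64)
# one scale per 16-column group, clamped away from zero
scale = (x.abs().reshape(4, 4, 16).amax(dim=-1) / 127).clamp(min=1e-12)
zero_point = torch.zeros_like(scale)
print(group_fake_quantize(x, scale, zero_point, group_size=16).shape)  # torch.Size([4, 64])
```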
@@ -219,3 +296,33 @@ def maybe_calibrate_or_quantize(
     scale.data = updated_scale.to(device)
     zero_point.data = updated_zero_point.to(device)
     return fake_quantize(value, scale, zero_point, args)
+
+
+@torch.no_grad()
+def _quantize(
+    x: torch.Tensor,
+    scale: torch.Tensor,
+    zero_point: torch.Tensor,
+    q_min: torch.Tensor,
+    q_max: torch.Tensor,
+    dtype: Optional[torch.dtype] = None,
+) -> torch.Tensor:
+    quantized_value = torch.clamp(
+        torch.round(x / scale + zero_point),
+        q_min,
+        q_max,
+    )
+
+    if dtype is not None:
+        quantized_value = quantized_value.to(dtype)
+
+    return quantized_value
+
+
+@torch.no_grad()
+def _dequantize(
+    x_q: torch.Tensor,
+    scale: torch.Tensor,
+    zero_point: torch.Tensor,
+) -> torch.Tensor:
+    return (x_q - zero_point) * scale
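The new _quantize and _dequantize helpers implement the standard affine mapping, q = clamp(round(x / scale + zero_point), q_min, q_max) and x ≈ (q - zero_point) * scale. A minimal standalone round trip showing the error this introduces (values are illustrative, not taken from the package):

```python
import torch

x = torch.tensor([[-1.5, -0.3, 0.0, 0.7, 2.1]])
scale = torch.tensor(2.1 / 127)   # symmetric 8-bit scale from the max magnitude
zero_point = torch.tensor(0)

q = torch.clamp(torch.round(x / scale + zero_point), -128, 127)  # values on the int8 grid
x_hat = (q - zero_point) * scale                                 # dequantized approximation

print(q)                        # tensor([[-91., -18.,   0.,  42., 127.]])
print((x - x_hat).abs().max())  # worst-case round-trip error, at most about scale / 2
```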
@@ -35,6 +35,10 @@ def freeze_module_quantization(module: Module):
         # no quantization scheme nothing to do
         return
 
+    if module.quantization_status == QuantizationStatus.FROZEN:
+        # nothing to do, already frozen
+        return
+
     # delete observers from module if not dynamic
     if scheme.input_activations and not scheme.input_activations.dynamic:
         delattr(module, "input_observer")
@@ -20,7 +20,10 @@ import torch
 from compressed_tensors.quantization.lifecycle.forward import (
     wrap_module_forward_quantized,
 )
-from compressed_tensors.quantization.quant_args import
+from compressed_tensors.quantization.quant_args import (
+    QuantizationArgs,
+    QuantizationStrategy,
+)
 from compressed_tensors.quantization.quant_config import QuantizationStatus
 from compressed_tensors.quantization.quant_scheme import QuantizationScheme
 from torch.nn import Module, Parameter
@@ -58,7 +61,12 @@ def initialize_module_for_quantization(
         _initialize_scale_zero_point_observer(module, "input", scheme.input_activations)
     if scheme.weights is not None:
         if hasattr(module, "weight"):
-
+            weight_shape = None
+            if isinstance(module, torch.nn.Linear):
+                weight_shape = module.weight.shape
+            _initialize_scale_zero_point_observer(
+                module, "weight", scheme.weights, weight_shape=weight_shape
+            )
         else:
             _LOGGER.warning(
                 f"module type {type(module)} targeted for weight quantization but "
@@ -78,7 +86,10 @@ def initialize_module_for_quantization(
 
 
 def _initialize_scale_zero_point_observer(
-    module: Module,
+    module: Module,
+    base_name: str,
+    quantization_args: QuantizationArgs,
+    weight_shape: Optional[torch.Size] = None,
 ):
     # initialize observer module and attach as submodule
     observer = quantization_args.get_observer()
@@ -89,11 +100,28 @@ def _initialize_scale_zero_point_observer(
 
     device = next(module.parameters()).device
 
+    # infer expected scale/zero point shape
+    expected_shape = 1  # per tensor
+
+    if base_name == "weight" and weight_shape is not None:
+        if quantization_args.strategy == QuantizationStrategy.CHANNEL:
+            # (output_channels, 1)
+            expected_shape = (weight_shape[0], 1)
+        elif quantization_args.strategy == QuantizationStrategy.GROUP:
+            expected_shape = (
+                weight_shape[0],
+                weight_shape[1] // quantization_args.group_size,
+            )
+
     # initializes empty scale and zero point parameters for the module
-    init_scale = Parameter(
+    init_scale = Parameter(
+        torch.empty(expected_shape, dtype=module.weight.dtype, device=device),
+        requires_grad=False,
+    )
     module.register_parameter(f"{base_name}_scale", init_scale)
 
     init_zero_point = Parameter(
-        torch.empty(
+        torch.empty(expected_shape, device=device, dtype=int),
+        requires_grad=False,
     )
     module.register_parameter(f"{base_name}_zero_point", init_zero_point)
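With this change the scale and zero point are pre-allocated in the shape the strategy implies for a Linear weight: a single value per tensor, one value per output channel, or one column per weight group. A standalone sketch of that shape rule (hypothetical helper, not the package function):

```python
from typing import Optional

import torch

def expected_qparam_shape(weight_shape: torch.Size, strategy: str, group_size: Optional[int] = None):
    out_features, in_features = weight_shape
    if strategy == "tensor":
        return (1,)                                       # one scale for the whole weight
    if strategy == "channel":
        return (out_features, 1)                          # one scale per output channel
    if strategy == "group":
        return (out_features, in_features // group_size)  # one scale per group of columns
    raise ValueError(f"unknown strategy {strategy}")

w = torch.nn.Linear(in_features=512, out_features=256).weight
print(expected_qparam_shape(w.shape, "channel"))                # (256, 1)
print(expected_qparam_shape(w.shape, "group", group_size=128))  # (256, 4)
```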
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional, Tuple
+from typing import Any, Iterable, Optional, Tuple, Union
 
 import torch
 from compressed_tensors.quantization.quant_args import (
@@ -40,6 +40,7 @@ class Observer(Module, RegistryMixin):
         self._scale = None
         self._zero_point = None
 
+    @torch.no_grad()
     def forward(self, observed: Tensor) -> Tuple[FloatTensor, IntTensor]:
         """
         maps directly to get_qparams
@@ -49,9 +50,16 @@ class Observer(Module, RegistryMixin):
         """
         return self.get_qparams(observed=observed)
 
-    def calculate_qparams(
+    def calculate_qparams(
+        self,
+        observed: Tensor,
+        reduce_dims: Optional[Tuple[int]] = None,
+    ) -> Tuple[FloatTensor, IntTensor]:
         """
         :param observed: observed tensor to calculate quantization parameters for
+        :param reduce_dims: optional tuple of dimensions to reduce along,
+            returned scale and zero point will be shaped (1,) along the
+            reduced dimensions
         :return: tuple of scale and zero point derived from the observed tensor
         """
         raise NotImplementedError(f"{self.__class__} must implement calculate_qparams")
@@ -69,6 +77,7 @@ class Observer(Module, RegistryMixin):
         Convenience function to wrap overwritten calculate_qparams
         adds support to make observed tensor optional and support for tracking latest
         calculated scale and zero point
+
         :param observed: optional observed tensor to calculate quantization parameters
             from
         :return: tuple of scale and zero point based on last observed value
@@ -84,47 +93,42 @@ class Observer(Module, RegistryMixin):
         elif self.quantization_args.strategy == QuantizationStrategy.GROUP:
             columns = observed.shape[1]
             scales, zero_points = [], []
-
+            group_idxs = range(0, columns, self.quantization_args.group_size)
+            for group_id, group_idx in enumerate(group_idxs):
                 scale, zero_point = self.get_qparams_along_dim(
-                    observed[:,
+                    observed[:, group_idx : (group_idx + group_size)],
                     0,
+                    tensor_id=group_id,
                 )
                 scales.append(scale)
                 zero_points.append(zero_point)
 
-            self._scale = torch.
-            self._zero_point = torch.
+            self._scale = torch.cat(scales, dim=1, out=self._scale)
+            self._zero_point = torch.cat(zero_points, dim=1, out=self._zero_point)
 
         elif self.quantization_args.strategy == QuantizationStrategy.CHANNEL:
             # assume observed is transposed, because its the output, hence use dim 0
             self._scale, self._zero_point = self.get_qparams_along_dim(observed, 0)
 
         elif self.quantization_args.strategy == QuantizationStrategy.TOKEN:
-
             # use dim 1, assume the obsersed.shape = [batch, token, hidden]
             # should be batch, token
-
             self._scale, self._zero_point = self.get_qparams_along_dim(
-                observed,
+                observed,
+                dim={0, 1},
             )
 
         return self._scale, self._zero_point
 
-    def get_qparams_along_dim(
-
-
-
-
-
-
-
-
-
-
-
-    )
-
-        scales.append(scale)
-        zero_points.append(zero_point)
-        # breakpoint()
-        return torch.stack(scales), torch.stack(zero_points)
+    def get_qparams_along_dim(
+        self,
+        observed,
+        dim: Union[int, Iterable[int]],
+        tensor_id: Optional[Any] = None,
+    ):
+        dim = set(dim)
+
+        reduce_dims = tuple(idx for idx in range(observed.ndim) if idx not in dim)
+        return self.calculate_qparams(
+            observed, reduce_dims=reduce_dims, tensor_id=tensor_id
+        )
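get_qparams_along_dim now accepts one or several dims to keep and reduces over all the others, which is how per-channel, per-token and per-group statistics fall out of a single amin/amax call. A standalone sketch of the reduction (plain PyTorch; names are illustrative):

```python
import torch

def min_max_along_dims(observed: torch.Tensor, keep_dims):
    keep = {keep_dims} if isinstance(keep_dims, int) else set(keep_dims)
    # reduce over every dimension that is not kept
    reduce_dims = tuple(i for i in range(observed.ndim) if i not in keep)
    min_val = torch.amin(observed, dim=reduce_dims, keepdim=True)
    max_val = torch.amax(observed, dim=reduce_dims, keepdim=True)
    return min_val, max_val

x = torch.randn(2, 16, 64)                        # [batch, tokens, hidden]
mn, mx = min_max_along_dims(x, keep_dims={0, 1})  # per-token statistics
print(mn.shape, mx.shape)                         # torch.Size([2, 16, 1]) twice
```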
@@ -35,19 +35,24 @@ def calculate_qparams(
     """
     min_vals = torch.min(min_vals, torch.zeros_like(min_vals))
    max_vals = torch.max(max_vals, torch.zeros_like(max_vals))
+    device = min_vals.device
 
     bit_range = 2**quantization_args.num_bits - 1
     bit_min = -(bit_range + 1) / 2
     bit_max = bit_min + bit_range
     if quantization_args.symmetric:
-        zero_points = torch.tensor(0).to(torch.int8)
         max_val_pos = torch.max(-min_vals, max_vals)
         scales = max_val_pos / (float(bit_range) / 2)
         scales = torch.clamp(scales, min=torch.finfo(torch.float32).eps)
+        zero_points = torch.zeros(scales.shape, device=device, dtype=torch.int8)
     else:
         scales = (max_vals - min_vals) / float(bit_range)
         scales = torch.clamp(scales, min=torch.finfo(torch.float32).eps)
         zero_points = bit_min - torch.round(min_vals / scales)
         zero_points = torch.clamp(zero_points, bit_min, bit_max).to(torch.int8)
 
+    if scales.ndim == 0:
+        scales = scales.reshape(1)
+        zero_points = zero_points.reshape(1)
+
     return scales, zero_points
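calculate_qparams maps observed min/max values to an 8-bit scale and zero point, with a symmetric path (zero point pinned at 0, now shaped like the scale) and an asymmetric path. A standalone sketch of both paths that mirrors the formulas in this hunk (illustrative, not the package function):

```python
import torch

def qparams_from_min_max(min_val: torch.Tensor, max_val: torch.Tensor, symmetric: bool):
    # include zero in the range so 0.0 stays exactly representable
    min_val = torch.min(min_val, torch.zeros_like(min_val))
    max_val = torch.max(max_val, torch.zeros_like(max_val))
    bit_range = 2**8 - 1  # 255 usable steps for 8 bits
    q_min, q_max = -128, 127
    if symmetric:
        scale = torch.max(-min_val, max_val) / (bit_range / 2)
        scale = torch.clamp(scale, min=torch.finfo(torch.float32).eps)
        zero_point = torch.zeros_like(scale, dtype=torch.int8)
    else:
        scale = (max_val - min_val) / bit_range
        scale = torch.clamp(scale, min=torch.finfo(torch.float32).eps)
        zero_point = q_min - torch.round(min_val / scale)
        zero_point = torch.clamp(zero_point, q_min, q_max).to(torch.int8)
    return scale, zero_point

s, zp = qparams_from_min_max(torch.tensor(-0.25), torch.tensor(1.0), symmetric=False)
print(s, zp)  # scale ≈ 0.0049, zero point -77
```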
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Tuple
+from typing import Any, Optional, Tuple
 
 import torch
 from compressed_tensors.quantization.observers.base import Observer
@@ -30,19 +30,27 @@ class MemorylessObserver(Observer):
     zero point based on the latest observed value without tracking state
     """
 
-    def calculate_qparams(
+    def calculate_qparams(
+        self,
+        observed: Tensor,
+        tensor_id: Optional[Any] = None,
+        reduce_dims: Optional[Tuple[int]] = None,
+    ) -> Tuple[FloatTensor, IntTensor]:
         """
-        Returns the min and max values of observed
+        Returns the min and max values of observed tensor
 
         :param observed: observed tensor to calculate quantization parameters for
+        :param tensor_id: optional id for tensor; not used for memoryless
+        :param reduce_dims: optional tuple of dimensions to reduce along,
+            returned scale and zero point will be shaped (1,) along the
+            reduced dimensions
         :return: tuple of scale and zero point derived from the observed tensor
         """
-        # TODO: Add support for full range of quantization Args, only supports 8bit
-        # per tensor
-        min_val, max_val = torch.aminmax(observed)
 
-
-
-
+        if not reduce_dims:
+            min_val, max_val = torch.aminmax(observed)
+        else:
+            min_val = torch.amin(observed, dim=reduce_dims, keepdims=True)
+            max_val = torch.amax(observed, dim=reduce_dims, keepdims=True)
 
         return calculate_qparams(min_val, max_val, self.quantization_args)
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Tuple
+from typing import Any, Optional, Tuple
 
 import torch
 from compressed_tensors.quantization.observers.base import Observer
@@ -36,30 +36,61 @@ class MovingAverageMinMaxObserver(Observer):
     ):
         super().__init__(quantization_args=quantization_args)
 
-        self.min_val =
-        self.max_val =
+        self.min_val = {}
+        self.max_val = {}
         self.averaging_constant = averaging_constant
 
-    def calculate_qparams(
+    def calculate_qparams(
+        self,
+        observed: Tensor,
+        reduce_dims: Optional[Tuple[int]] = None,
+        tensor_id: Optional[Any] = None,
+    ) -> Tuple[FloatTensor, IntTensor]:
         """
         Updates the observed min and max using a moving average smoothed by the
         averaging_constant
 
         :param observed: observed tensor to calculate quantization parameters for
+        :param reduce_dims: optional tuple of dimensions to reduce along,
+            returned scale and zero point will be shaped (1,) along the
+            reduced dimensions
+        :param tensor_id: Optional id if different ranges of observed tensors are
+            passed, useful for sharding tensors by group_size
         :return: tuple of scale and zero point derived from the observed tensor
         """
+        tensor_id = tensor_id or "default"
 
-
+        if not reduce_dims:
+            min_val, max_val = torch.aminmax(observed)
+        else:
+            min_val = torch.amin(observed, dim=reduce_dims, keepdims=True)
+            max_val = torch.amax(observed, dim=reduce_dims, keepdims=True)
+
+        running_min_val = self.min_val.get(tensor_id, None)
+        running_max_val = self.max_val.get(tensor_id, None)
 
-        if
-
-
+        if running_min_val is None or running_max_val is None:
+            updated_min_val = min_val
+            updated_max_val = max_val
         else:
-
-                min_val -
+            updated_min_val = running_min_val + self.averaging_constant * (
+                min_val - running_min_val
             )
-
-                max_val -
+            updated_max_val = running_max_val + self.averaging_constant * (
+                max_val - running_max_val
             )
 
-
+        self.min_val[tensor_id] = updated_min_val
+        self.max_val[tensor_id] = updated_max_val
+
+        return calculate_qparams(
+            updated_min_val, updated_max_val, self.quantization_args
+        )
+
+    def get_qparams_along_dim(
+        self, observed, dim: int, tensor_id: Optional[Any] = None
+    ):
+        reduce_dims = tuple(idx for idx in range(observed.ndim) if idx != dim)
+        return self.calculate_qparams(
+            observed, reduce_dims=reduce_dims, tensor_id=tensor_id
+        )
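The observer now keeps one running min/max per tensor_id, so each weight group tracked under the group strategy gets its own moving average. A standalone sketch of the exponential-moving-average update it applies (illustrative class, not the package observer):

```python
import torch

class RunningMinMax:
    """Tracks per-id running min/max with an exponential moving average."""

    def __init__(self, averaging_constant: float = 0.01):
        self.averaging_constant = averaging_constant
        self.min_val, self.max_val = {}, {}

    def update(self, observed: torch.Tensor, tensor_id="default"):
        min_val, max_val = torch.aminmax(observed)
        if tensor_id not in self.min_val:
            # first observation for this id: adopt the values directly
            self.min_val[tensor_id], self.max_val[tensor_id] = min_val, max_val
        else:
            c = self.averaging_constant
            self.min_val[tensor_id] = self.min_val[tensor_id] + c * (min_val - self.min_val[tensor_id])
            self.max_val[tensor_id] = self.max_val[tensor_id] + c * (max_val - self.max_val[tensor_id])
        return self.min_val[tensor_id], self.max_val[tensor_id]

tracker = RunningMinMax(averaging_constant=0.1)
for _ in range(5):
    tracker.update(torch.randn(16, 16), tensor_id=0)  # e.g. one id per weight group
print(tracker.update(torch.randn(16, 16), tensor_id=0))
```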
@@ -42,7 +42,7 @@ class QuantizationStrategy(str, Enum):
     TOKEN = "token"
 
 
-class QuantizationArgs(BaseModel):
+class QuantizationArgs(BaseModel, use_enum_values=True):
     """
     User facing arguments used to define a quantization config for weights or
     activations
@@ -62,7 +62,7 @@ class QuantizationArgs(BaseModel):
     """
 
     num_bits: int = 8
-    type: QuantizationType = QuantizationType.INT
+    type: QuantizationType = QuantizationType.INT.value
     symmetric: bool = True
     group_size: Optional[int] = None
     strategy: Optional[QuantizationStrategy] = None