compressed-tensors 0.3.2__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- compressed_tensors/base.py +2 -1
- compressed_tensors/compressors/__init__.py +5 -1
- compressed_tensors/compressors/base.py +11 -54
- compressed_tensors/compressors/dense.py +4 -4
- compressed_tensors/compressors/helpers.py +12 -12
- compressed_tensors/compressors/int_quantized.py +126 -0
- compressed_tensors/compressors/marlin_24.py +250 -0
- compressed_tensors/compressors/model_compressor.py +315 -0
- compressed_tensors/compressors/pack_quantized.py +212 -0
- compressed_tensors/compressors/sparse_bitmask.py +4 -4
- compressed_tensors/compressors/utils/__init__.py +19 -0
- compressed_tensors/compressors/utils/helpers.py +43 -0
- compressed_tensors/compressors/utils/permutations_24.py +65 -0
- compressed_tensors/compressors/utils/semi_structured_conversions.py +341 -0
- compressed_tensors/config/base.py +7 -4
- compressed_tensors/config/dense.py +4 -4
- compressed_tensors/config/sparse_bitmask.py +3 -3
- compressed_tensors/quantization/lifecycle/__init__.py +1 -0
- compressed_tensors/quantization/lifecycle/apply.py +75 -19
- compressed_tensors/quantization/lifecycle/compressed.py +69 -0
- compressed_tensors/quantization/lifecycle/forward.py +208 -22
- compressed_tensors/quantization/lifecycle/frozen.py +4 -0
- compressed_tensors/quantization/lifecycle/initialize.py +33 -5
- compressed_tensors/quantization/observers/base.py +70 -5
- compressed_tensors/quantization/observers/helpers.py +6 -1
- compressed_tensors/quantization/observers/memoryless.py +17 -9
- compressed_tensors/quantization/observers/min_max.py +44 -13
- compressed_tensors/quantization/quant_args.py +33 -4
- compressed_tensors/quantization/quant_config.py +69 -21
- compressed_tensors/quantization/quant_scheme.py +81 -1
- compressed_tensors/quantization/utils/helpers.py +77 -8
- compressed_tensors/utils/helpers.py +26 -122
- compressed_tensors/utils/safetensors_load.py +3 -2
- compressed_tensors/version.py +53 -0
- {compressed_tensors-0.3.2.dist-info → compressed_tensors-0.4.0.dist-info}/METADATA +46 -9
- compressed_tensors-0.4.0.dist-info/RECORD +48 -0
- compressed_tensors-0.3.2.dist-info/RECORD +0 -38
- {compressed_tensors-0.3.2.dist-info → compressed_tensors-0.4.0.dist-info}/LICENSE +0 -0
- {compressed_tensors-0.3.2.dist-info → compressed_tensors-0.4.0.dist-info}/WHEEL +0 -0
- {compressed_tensors-0.3.2.dist-info → compressed_tensors-0.4.0.dist-info}/top_level.txt +0 -0
compressed_tensors/quantization/lifecycle/forward.py

@@ -13,15 +13,26 @@
 # limitations under the License.

 from functools import wraps
+from math import ceil
+from typing import Optional

 import torch
-from compressed_tensors.quantization.quant_args import
+from compressed_tensors.quantization.quant_args import (
+    QuantizationArgs,
+    QuantizationStrategy,
+)
 from compressed_tensors.quantization.quant_config import QuantizationStatus
 from compressed_tensors.quantization.quant_scheme import QuantizationScheme
 from torch.nn import Module


-__all__ = [
+__all__ = [
+    "quantize",
+    "dequantize",
+    "fake_quantize",
+    "wrap_module_forward_quantized",
+    "maybe_calibrate_or_quantize",
+]


 @torch.no_grad()
@@ -29,15 +40,39 @@ def quantize(
     x: torch.Tensor,
     scale: torch.Tensor,
     zero_point: torch.Tensor,
-
-
+    args: QuantizationArgs,
+    dtype: Optional[torch.dtype] = None,
 ) -> torch.Tensor:
-
-
-
-
-
-
+    """
+    Quantize the input tensor x using the QuantizationStrategy specified in args.
+    Quantization can be done per tensor, channel, token or group. For group
+    quantization, the group_size must be divisible by the column size. The input scale
+    and zero_points are reshaped to support vectorization (Assumes 1 is the
+    channel dimension)
+
+    :param x: Input tensor
+    :param scale: scale tensor
+    :param zero_point: zero point tensor
+    :param args: quantization args dictating how to quantize x
+    :param dtype: optional dtype to cast the quantized output to
+    :return: fake quantized tensor
+    """
+    # ensure all tensors are on the same device
+    # assumes that the target device is the input
+    # tensor's device
+    if x.device != scale.device:
+        scale = scale.to(x.device)
+    if x.device != zero_point.device:
+        zero_point = zero_point.to(x.device)
+
+    return _process_quantization(
+        x=x,
+        scale=scale,
+        zero_point=zero_point,
+        args=args,
+        dtype=dtype,
+        do_quantize=True,
+        do_dequantize=False,
     )

@@ -46,8 +81,42 @@ def dequantize(
     x_q: torch.Tensor,
     scale: torch.Tensor,
     zero_point: torch.Tensor,
+    args: QuantizationArgs = None,
 ) -> torch.Tensor:
-
+    """
+    Dequantize a quantized input tensor x_q based on the strategy specified in args. If
+    args is not provided, the strategy will be inferred.
+
+    :param x: quantized input tensor
+    :param scale: scale tensor
+    :param zero_point: zero point tensor
+    :param args: quantization args used to quantize x_q
+    :return: dequantized float tensor
+    """
+    if args is None:
+        if scale.ndim == 0 or scale.ndim == 1:
+            args = QuantizationArgs(strategy=QuantizationStrategy.TENSOR)
+        elif scale.ndim == 2:
+            if scale.shape[1] == 1:
+                args = QuantizationArgs(strategy=QuantizationStrategy.CHANNEL)
+            else:
+                group_size = int(x_q.shape[1] / scale.shape[1])
+                args = QuantizationArgs(
+                    strategy=QuantizationStrategy.GROUP, group_size=group_size
+                )
+        else:
+            raise ValueError(
+                f"Could not infer a quantization strategy from scale with {scale.ndim} "
+                "dimmensions. Expected 0-2 dimmensions."
+            )
+    return _process_quantization(
+        x=x_q,
+        scale=scale,
+        zero_point=zero_point,
+        args=args,
+        do_quantize=False,
+        do_dequantize=True,
+    )


 @torch.no_grad()
@@ -56,19 +125,106 @@ def fake_quantize(
     scale: torch.Tensor,
     zero_point: torch.Tensor,
     args: QuantizationArgs,
+) -> torch.Tensor:
+    """
+    Fake quantize the input tensor x by quantizing then dequantizing with
+    the QuantizationStrategy specified in args. Quantization can be done per tensor,
+    channel, token or group. For group quantization, the group_size must be divisible
+    by the column size. The input scale and zero_points are reshaped to support
+    vectorization (Assumes 1 is the channel dimension)
+
+    :param x: Input tensor
+    :param scale: scale tensor
+    :param zero_point: zero point tensor
+    :param args: quantization args dictating how to quantize x
+    :return: fake quantized tensor
+    """
+    return _process_quantization(
+        x=x,
+        scale=scale,
+        zero_point=zero_point,
+        args=args,
+        do_quantize=True,
+        do_dequantize=True,
+    )
+
+
+@torch.no_grad()
+def _process_quantization(
+    x: torch.Tensor,
+    scale: torch.Tensor,
+    zero_point: torch.Tensor,
+    args: QuantizationArgs,
+    dtype: Optional[torch.dtype] = None,
+    do_quantize: bool = True,
+    do_dequantize: bool = True,
 ) -> torch.Tensor:
     bit_range = 2**args.num_bits
-
-
-
-
-
+    q_max = torch.tensor(bit_range / 2 - 1, device=x.device)
+    q_min = torch.tensor(-bit_range / 2, device=x.device)
+    group_size = args.group_size
+
+    if args.strategy == QuantizationStrategy.GROUP:
+
+        if do_dequantize and not do_quantize:
+            # if dequantizing a quantized type infer the output type from the scale
+            output = torch.zeros_like(x, dtype=scale.dtype)
+        else:
+            output_dtype = dtype if dtype is not None else x.dtype
+            output = torch.zeros_like(x, dtype=output_dtype)
+
+        # TODO: vectorize the for loop
+        # TODO: fix genetric assumption about the tensor size for computing group
+
+        # TODO: make validation step for inputs
+
+        while scale.ndim < 2:
+            # pad scale and zero point dims for slicing
+            scale = scale.unsqueeze(1)
+            zero_point = zero_point.unsqueeze(1)
+
+        columns = x.shape[1]
+        if columns >= group_size:
+            if columns % group_size != 0:
+                raise ValueError(
+                    "tesnor column shape must be divisble "
+                    f"by the given group_size {group_size}"
+                )
+        for i in range(ceil(columns / group_size)):
+            # scale.shape should be [nchan, ndim]
+            # sc.shape should be [nchan, 1] after unsqueeze
+            sc = scale[:, i].view(-1, 1)
+            zp = zero_point[:, i].view(-1, 1)
+
+            idx = i * group_size
+            if do_quantize:
+                output[:, idx : (idx + group_size)] = _quantize(
+                    x[:, idx : (idx + group_size)], sc, zp, q_min, q_max, dtype=dtype
+                )
+            if do_dequantize:
+                input = (
+                    output[:, idx : (idx + group_size)]
+                    if do_quantize
+                    else x[:, idx : (idx + group_size)]
+                )
+                output[:, idx : (idx + group_size)] = _dequantize(input, sc, zp)
+
+    else:  # covers channel, token and tensor strategies
+        if do_quantize:
+            output = _quantize(x, scale, zero_point, q_min, q_max, dtype=dtype)
+        if do_dequantize:
+            output = _dequantize(output if do_quantize else x, scale, zero_point)
+
+    return output


 def wrap_module_forward_quantized(module: Module, scheme: QuantizationScheme):
     # expects a module already initialized and injected with the parameters in
     # initialize_module_for_quantization
-
+    if hasattr(module.forward, "__func__"):
+        forward_func_orig = module.forward.__func__
+    else:
+        forward_func_orig = module.forward.func

     @wraps(forward_func_orig)  # ensures docstring, names, etc are propagated
     def wrapped_forward(self, *args, **kwargs):
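Note on the new GROUP path in _process_quantization: the tensor is walked in column blocks of group_size and each block is quantized with its own scale/zero-point column. A minimal standalone sketch of that slicing, outside the diff (shapes and values below are made up for illustration):

import torch
from math import ceil

x = torch.randn(4, 8)                 # 4 channels x 8 columns
group_size = 4                        # -> 2 groups of columns
scale = torch.rand(4, 2) + 0.1        # one scale column per group
zero_point = torch.zeros(4, 2)

output = torch.zeros_like(x)
for i in range(ceil(x.shape[1] / group_size)):
    sc = scale[:, i].view(-1, 1)      # [nchan, 1], broadcasts over the block
    zp = zero_point[:, i].view(-1, 1)
    idx = i * group_size
    block = x[:, idx : idx + group_size]
    q = torch.clamp(torch.round(block / sc + zp), -128, 127)  # int8-style range
    output[:, idx : idx + group_size] = (q - zp) * sc         # fake-quantized block
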
@@ -76,14 +232,14 @@ def wrap_module_forward_quantized(module: Module, scheme: QuantizationScheme):

         if scheme.input_activations is not None:
             # calibrate and (fake) quantize input activations when applicable
-            input_ =
+            input_ = maybe_calibrate_or_quantize(
                 module, input_, "input", scheme.input_activations
             )

         if scheme.weights is not None:
             # calibrate and (fake) quantize weights when applicable
             unquantized_weight = self.weight.data.clone()
-            self.weight.data =
+            self.weight.data = maybe_calibrate_or_quantize(
                 module, self.weight, "weight", scheme.weights
             )

@@ -94,7 +250,7 @@ def wrap_module_forward_quantized(module: Module, scheme: QuantizationScheme):

         if scheme.output_activations is not None:
             # calibrate and (fake) quantize output activations when applicable
-            output =
+            output = maybe_calibrate_or_quantize(
                 module, output, "output", scheme.output_activations
             )

@@ -110,7 +266,7 @@ def wrap_module_forward_quantized(module: Module, scheme: QuantizationScheme):
     setattr(module, "forward", bound_wrapped_forward)


-def
+def maybe_calibrate_or_quantize(
     module: Module, value: torch.Tensor, base_name: str, args: "QuantizationArgs"
 ) -> torch.Tensor:
     # only run quantized for the included stages
@@ -132,11 +288,41 @@ def _maybe_calibrate_or_quantize(
     if module.quantization_status == QuantizationStatus.CALIBRATION:
         # calibration mode - get new quant params from observer
         observer = getattr(module, f"{base_name}_observer")
+
         updated_scale, updated_zero_point = observer(value)

         # update scale and zero point
         device = next(module.parameters()).device
         scale.data = updated_scale.to(device)
         zero_point.data = updated_zero_point.to(device)
-
     return fake_quantize(value, scale, zero_point, args)
+
+
+@torch.no_grad()
+def _quantize(
+    x: torch.Tensor,
+    scale: torch.Tensor,
+    zero_point: torch.Tensor,
+    q_min: torch.Tensor,
+    q_max: torch.Tensor,
+    dtype: Optional[torch.dtype] = None,
+) -> torch.Tensor:
+    quantized_value = torch.clamp(
+        torch.round(x / scale + zero_point),
+        q_min,
+        q_max,
+    )
+
+    if dtype is not None:
+        quantized_value = quantized_value.to(dtype)
+
+    return quantized_value
+
+
+@torch.no_grad()
+def _dequantize(
+    x_q: torch.Tensor,
+    scale: torch.Tensor,
+    zero_point: torch.Tensor,
+) -> torch.Tensor:
+    return (x_q - zero_point) * scale
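Taken together, forward.py now exposes quantize, dequantize and fake_quantize built on the affine mapping q = clamp(round(x / scale + zero_point), q_min, q_max) and x ≈ (q - zero_point) * scale, and dequantize can infer the strategy from the scale's shape when args is omitted. A rough usage sketch against the new API (illustrative only; the constructor arguments for QuantizationArgs are an assumption and may differ):

import torch
from compressed_tensors.quantization.quant_args import QuantizationArgs, QuantizationStrategy
from compressed_tensors.quantization.lifecycle.forward import dequantize, fake_quantize, quantize

weight = torch.randn(16, 64)
args = QuantizationArgs(num_bits=8, strategy=QuantizationStrategy.CHANNEL)

# per-channel scale/zero point, shaped [out_channels, 1]
scale = weight.abs().amax(dim=1, keepdim=True) / 127.0
zero_point = torch.zeros_like(scale, dtype=torch.int8)

w_q = quantize(weight, scale, zero_point, args, dtype=torch.int8)
w_dq = dequantize(w_q, scale, zero_point)   # strategy inferred as CHANNEL from scale shape
w_fake = fake_quantize(weight, scale, zero_point, args)
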
compressed_tensors/quantization/lifecycle/frozen.py

@@ -35,6 +35,10 @@ def freeze_module_quantization(module: Module):
         # no quantization scheme nothing to do
         return

+    if module.quantization_status == QuantizationStatus.FROZEN:
+        # nothing to do, already frozen
+        return
+
     # delete observers from module if not dynamic
     if scheme.input_activations and not scheme.input_activations.dynamic:
         delattr(module, "input_observer")
compressed_tensors/quantization/lifecycle/initialize.py

@@ -20,7 +20,10 @@ import torch
 from compressed_tensors.quantization.lifecycle.forward import (
     wrap_module_forward_quantized,
 )
-from compressed_tensors.quantization.quant_args import
+from compressed_tensors.quantization.quant_args import (
+    QuantizationArgs,
+    QuantizationStrategy,
+)
 from compressed_tensors.quantization.quant_config import QuantizationStatus
 from compressed_tensors.quantization.quant_scheme import QuantizationScheme
 from torch.nn import Module, Parameter
@@ -58,7 +61,12 @@ def initialize_module_for_quantization(
         _initialize_scale_zero_point_observer(module, "input", scheme.input_activations)
     if scheme.weights is not None:
         if hasattr(module, "weight"):
-
+            weight_shape = None
+            if isinstance(module, torch.nn.Linear):
+                weight_shape = module.weight.shape
+            _initialize_scale_zero_point_observer(
+                module, "weight", scheme.weights, weight_shape=weight_shape
+            )
         else:
             _LOGGER.warning(
                 f"module type {type(module)} targeted for weight quantization but "
@@ -78,7 +86,10 @@ def initialize_module_for_quantization(


 def _initialize_scale_zero_point_observer(
-    module: Module,
+    module: Module,
+    base_name: str,
+    quantization_args: QuantizationArgs,
+    weight_shape: Optional[torch.Size] = None,
 ):
     # initialize observer module and attach as submodule
     observer = quantization_args.get_observer()
@@ -89,11 +100,28 @@ def _initialize_scale_zero_point_observer(

     device = next(module.parameters()).device

+    # infer expected scale/zero point shape
+    expected_shape = 1  # per tensor
+
+    if base_name == "weight" and weight_shape is not None:
+        if quantization_args.strategy == QuantizationStrategy.CHANNEL:
+            # (output_channels, 1)
+            expected_shape = (weight_shape[0], 1)
+        elif quantization_args.strategy == QuantizationStrategy.GROUP:
+            expected_shape = (
+                weight_shape[0],
+                weight_shape[1] // quantization_args.group_size,
+            )
+
     # initializes empty scale and zero point parameters for the module
-    init_scale = Parameter(
+    init_scale = Parameter(
+        torch.empty(expected_shape, dtype=module.weight.dtype, device=device),
+        requires_grad=False,
+    )
     module.register_parameter(f"{base_name}_scale", init_scale)

     init_zero_point = Parameter(
-        torch.empty(
+        torch.empty(expected_shape, device=device, dtype=int),
+        requires_grad=False,
     )
     module.register_parameter(f"{base_name}_zero_point", init_zero_point)
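With the shape logic above, a Linear weight of shape [out_features, in_features] gets scale/zero-point parameters shaped (out_features, 1) for the CHANNEL strategy and (out_features, in_features // group_size) for GROUP; everything else stays per-tensor. A tiny sketch of just that shape computation (the helper name is hypothetical, not part of the package):

import torch

def expected_qparam_shape(weight_shape: torch.Size, strategy: str, group_size: int = None):
    # mirrors the branching added to _initialize_scale_zero_point_observer
    if strategy == "channel":
        return (weight_shape[0], 1)                               # one row per output channel
    if strategy == "group":
        return (weight_shape[0], weight_shape[1] // group_size)   # one column per group
    return (1,)                                                   # per-tensor default

print(expected_qparam_shape(torch.Size([768, 3072]), "group", group_size=128))  # (768, 24)
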
compressed_tensors/quantization/observers/base.py

@@ -12,9 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Optional, Tuple
+from typing import Any, Iterable, Optional, Tuple, Union

-
+import torch
+from compressed_tensors.quantization.quant_args import (
+    QuantizationArgs,
+    QuantizationStrategy,
+)
 from compressed_tensors.registry.registry import RegistryMixin
 from torch import FloatTensor, IntTensor, Tensor
 from torch.nn import Module
@@ -36,6 +40,7 @@ class Observer(Module, RegistryMixin):
         self._scale = None
         self._zero_point = None

+    @torch.no_grad()
     def forward(self, observed: Tensor) -> Tuple[FloatTensor, IntTensor]:
         """
         maps directly to get_qparams
@@ -45,13 +50,26 @@ class Observer(Module, RegistryMixin):
         """
         return self.get_qparams(observed=observed)

-    def calculate_qparams(
+    def calculate_qparams(
+        self,
+        observed: Tensor,
+        reduce_dims: Optional[Tuple[int]] = None,
+    ) -> Tuple[FloatTensor, IntTensor]:
         """
         :param observed: observed tensor to calculate quantization parameters for
+        :param reduce_dims: optional tuple of dimensions to reduce along,
+            returned scale and zero point will be shaped (1,) along the
+            reduced dimensions
         :return: tuple of scale and zero point derived from the observed tensor
         """
         raise NotImplementedError(f"{self.__class__} must implement calculate_qparams")

+    def post_calculate_qparams(self) -> None:
+        """
+        Run any logic specific to its observers after running calculate_qparams
+        """
+        ...
+
     def get_qparams(
         self, observed: Optional[Tensor] = None
     ) -> Tuple[FloatTensor, IntTensor]:
@@ -59,11 +77,58 @@ class Observer(Module, RegistryMixin):
         Convenience function to wrap overwritten calculate_qparams
         adds support to make observed tensor optional and support for tracking latest
         calculated scale and zero point
+
         :param observed: optional observed tensor to calculate quantization parameters
             from
         :return: tuple of scale and zero point based on last observed value
         """
         if observed is not None:
-
-
+            group_size = self.quantization_args.group_size
+
+            if self.quantization_args.strategy == QuantizationStrategy.TENSOR:
+
+                # re-calculate scale and zero point, update the stored value
+                self._scale, self._zero_point = self.calculate_qparams(observed)
+
+            elif self.quantization_args.strategy == QuantizationStrategy.GROUP:
+                columns = observed.shape[1]
+                scales, zero_points = [], []
+                group_idxs = range(0, columns, self.quantization_args.group_size)
+                for group_id, group_idx in enumerate(group_idxs):
+                    scale, zero_point = self.get_qparams_along_dim(
+                        observed[:, group_idx : (group_idx + group_size)],
+                        0,
+                        tensor_id=group_id,
+                    )
+                    scales.append(scale)
+                    zero_points.append(zero_point)
+
+                self._scale = torch.cat(scales, dim=1, out=self._scale)
+                self._zero_point = torch.cat(zero_points, dim=1, out=self._zero_point)
+
+            elif self.quantization_args.strategy == QuantizationStrategy.CHANNEL:
+                # assume observed is transposed, because its the output, hence use dim 0
+                self._scale, self._zero_point = self.get_qparams_along_dim(observed, 0)
+
+            elif self.quantization_args.strategy == QuantizationStrategy.TOKEN:
+                # use dim 1, assume the obsersed.shape = [batch, token, hidden]
+                # should be batch, token
+                self._scale, self._zero_point = self.get_qparams_along_dim(
+                    observed,
+                    dim={0, 1},
+                )
+
         return self._scale, self._zero_point
+
+    def get_qparams_along_dim(
+        self,
+        observed,
+        dim: Union[int, Iterable[int]],
+        tensor_id: Optional[Any] = None,
+    ):
+        dim = set(dim)
+
+        reduce_dims = tuple(idx for idx in range(observed.ndim) if idx not in dim)
+        return self.calculate_qparams(
+            observed, reduce_dims=reduce_dims, tensor_id=tensor_id
+        )
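With the GROUP strategy, get_qparams now calls get_qparams_along_dim once per column block and concatenates the per-group results along dim 1, so the stored scale ends up shaped [rows, num_groups]. A standalone sketch of that accumulation (illustrative; a plain abs-max stands in for a real observer):

import torch

observed = torch.randn(16, 64)        # e.g. a weight seen during calibration
group_size = 32
scales, zero_points = [], []
for group_idx in range(0, observed.shape[1], group_size):
    block = observed[:, group_idx : group_idx + group_size]
    max_abs = block.abs().amax(dim=1, keepdim=True)               # reduce all dims except dim 0
    scales.append(torch.clamp(max_abs / 127.0, min=1e-8))
    zero_points.append(torch.zeros_like(scales[-1], dtype=torch.int8))

scale = torch.cat(scales, dim=1)       # [16, 2]: one column per group
zero_point = torch.cat(zero_points, dim=1)
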
compressed_tensors/quantization/observers/helpers.py

@@ -35,19 +35,24 @@ def calculate_qparams(
     """
     min_vals = torch.min(min_vals, torch.zeros_like(min_vals))
     max_vals = torch.max(max_vals, torch.zeros_like(max_vals))
+    device = min_vals.device

     bit_range = 2**quantization_args.num_bits - 1
     bit_min = -(bit_range + 1) / 2
     bit_max = bit_min + bit_range
     if quantization_args.symmetric:
-        zero_points = torch.tensor(0).to(torch.int8)
         max_val_pos = torch.max(-min_vals, max_vals)
         scales = max_val_pos / (float(bit_range) / 2)
         scales = torch.clamp(scales, min=torch.finfo(torch.float32).eps)
+        zero_points = torch.zeros(scales.shape, device=device, dtype=torch.int8)
     else:
         scales = (max_vals - min_vals) / float(bit_range)
         scales = torch.clamp(scales, min=torch.finfo(torch.float32).eps)
         zero_points = bit_min - torch.round(min_vals / scales)
         zero_points = torch.clamp(zero_points, bit_min, bit_max).to(torch.int8)

+    if scales.ndim == 0:
+        scales = scales.reshape(1)
+        zero_points = zero_points.reshape(1)
+
     return scales, zero_points
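For reference, the symmetric branch above yields scale = max(|min|, max) / ((2**num_bits - 1) / 2) with a per-element zero point of 0, while the asymmetric branch yields scale = (max - min) / (2**num_bits - 1) and zero_point = bit_min - round(min / scale). A compact numeric sketch of both (input values made up):

import torch

min_vals = torch.tensor([-2.0, -0.5])
max_vals = torch.tensor([3.0, 1.0])
num_bits = 8
bit_range = 2**num_bits - 1           # 255
bit_min = -(bit_range + 1) / 2        # -128.0

# symmetric: zero point stays 0, scale covers the larger-magnitude side
sym_scales = torch.max(-min_vals, max_vals) / (bit_range / 2)
sym_zero_points = torch.zeros_like(sym_scales, dtype=torch.int8)

# asymmetric: scale spans the full observed range, zero point shifts it
asym_scales = (max_vals - min_vals) / bit_range
asym_zero_points = (bit_min - torch.round(min_vals / asym_scales)).to(torch.int8)
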
compressed_tensors/quantization/observers/memoryless.py

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Tuple
+from typing import Any, Optional, Tuple

 import torch
 from compressed_tensors.quantization.observers.base import Observer
@@ -30,19 +30,27 @@ class MemorylessObserver(Observer):
     zero point based on the latest observed value without tracking state
     """

-    def calculate_qparams(
+    def calculate_qparams(
+        self,
+        observed: Tensor,
+        tensor_id: Optional[Any] = None,
+        reduce_dims: Optional[Tuple[int]] = None,
+    ) -> Tuple[FloatTensor, IntTensor]:
         """
-        Returns the min and max values of observed
+        Returns the min and max values of observed tensor

         :param observed: observed tensor to calculate quantization parameters for
+        :param tensor_id: optional id for tensor; not used for memoryless
+        :param reduce_dims: optional tuple of dimensions to reduce along,
+            returned scale and zero point will be shaped (1,) along the
+            reduced dimensions
         :return: tuple of scale and zero point derived from the observed tensor
         """
-        # TODO: Add support for full range of quantization Args, only supports 8bit
-        # per tensor
-        min_val, max_val = torch.aminmax(observed)

-
-
-
+        if not reduce_dims:
+            min_val, max_val = torch.aminmax(observed)
+        else:
+            min_val = torch.amin(observed, dim=reduce_dims, keepdims=True)
+            max_val = torch.amax(observed, dim=reduce_dims, keepdims=True)

         return calculate_qparams(min_val, max_val, self.quantization_args)
compressed_tensors/quantization/observers/min_max.py

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Tuple
+from typing import Any, Optional, Tuple

 import torch
 from compressed_tensors.quantization.observers.base import Observer
@@ -36,30 +36,61 @@ class MovingAverageMinMaxObserver(Observer):
     ):
         super().__init__(quantization_args=quantization_args)

-        self.min_val =
-        self.max_val =
+        self.min_val = {}
+        self.max_val = {}
         self.averaging_constant = averaging_constant

-    def calculate_qparams(
+    def calculate_qparams(
+        self,
+        observed: Tensor,
+        reduce_dims: Optional[Tuple[int]] = None,
+        tensor_id: Optional[Any] = None,
+    ) -> Tuple[FloatTensor, IntTensor]:
         """
         Updates the observed min and max using a moving average smoothed by the
         averaging_constant

         :param observed: observed tensor to calculate quantization parameters for
+        :param reduce_dims: optional tuple of dimensions to reduce along,
+            returned scale and zero point will be shaped (1,) along the
+            reduced dimensions
+        :param tensor_id: Optional id if different ranges of observed tensors are
+            passed, useful for sharding tensors by group_size
         :return: tuple of scale and zero point derived from the observed tensor
         """
+        tensor_id = tensor_id or "default"

-
+        if not reduce_dims:
+            min_val, max_val = torch.aminmax(observed)
+        else:
+            min_val = torch.amin(observed, dim=reduce_dims, keepdims=True)
+            max_val = torch.amax(observed, dim=reduce_dims, keepdims=True)
+
+        running_min_val = self.min_val.get(tensor_id, None)
+        running_max_val = self.max_val.get(tensor_id, None)

-        if
-
-
+        if running_min_val is None or running_max_val is None:
+            updated_min_val = min_val
+            updated_max_val = max_val
         else:
-
-            min_val -
+            updated_min_val = running_min_val + self.averaging_constant * (
+                min_val - running_min_val
             )
-
-            max_val -
+            updated_max_val = running_max_val + self.averaging_constant * (
+                max_val - running_max_val
             )

-
+        self.min_val[tensor_id] = updated_min_val
+        self.max_val[tensor_id] = updated_max_val
+
+        return calculate_qparams(
+            updated_min_val, updated_max_val, self.quantization_args
+        )
+
+    def get_qparams_along_dim(
+        self, observed, dim: int, tensor_id: Optional[Any] = None
+    ):
+        reduce_dims = tuple(idx for idx in range(observed.ndim) if idx != dim)
+        return self.calculate_qparams(
+            observed, reduce_dims=reduce_dims, tensor_id=tensor_id
+        )
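The moving-average update above is a standard exponential smoothing of the running min/max, now keyed by tensor_id so each group slice tracks its own range: updated = running + averaging_constant * (observed - running). A few lines showing the effect of a small averaging constant (values made up):

import torch

averaging_constant = 0.01
running_max = torch.tensor(1.0)
for observed_max in (1.5, 2.0, 0.8):
    running_max = running_max + averaging_constant * (torch.tensor(observed_max) - running_max)

print(running_max)  # ~1.0128: the running max drifts only slightly toward new observations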