compressed-tensors 0.4.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only and reflects the changes between the package versions exactly as they appear in that registry.
- compressed_tensors/base.py +1 -0
- compressed_tensors/compressors/__init__.py +5 -1
- compressed_tensors/compressors/base.py +200 -8
- compressed_tensors/compressors/dense.py +1 -1
- compressed_tensors/compressors/marlin_24.py +11 -10
- compressed_tensors/compressors/model_compressor.py +101 -13
- compressed_tensors/compressors/naive_quantized.py +140 -0
- compressed_tensors/compressors/pack_quantized.py +128 -132
- compressed_tensors/compressors/sparse_bitmask.py +1 -1
- compressed_tensors/config/base.py +8 -1
- compressed_tensors/{compressors/utils → linear}/__init__.py +0 -6
- compressed_tensors/linear/compressed_linear.py +87 -0
- compressed_tensors/quantization/lifecycle/__init__.py +1 -0
- compressed_tensors/quantization/lifecycle/apply.py +204 -44
- compressed_tensors/quantization/lifecycle/calibration.py +22 -2
- compressed_tensors/quantization/lifecycle/compressed.py +3 -1
- compressed_tensors/quantization/lifecycle/forward.py +139 -61
- compressed_tensors/quantization/lifecycle/helpers.py +80 -0
- compressed_tensors/quantization/lifecycle/initialize.py +77 -13
- compressed_tensors/quantization/observers/__init__.py +1 -0
- compressed_tensors/quantization/observers/base.py +93 -14
- compressed_tensors/quantization/observers/helpers.py +64 -11
- compressed_tensors/quantization/observers/min_max.py +8 -0
- compressed_tensors/quantization/observers/mse.py +162 -0
- compressed_tensors/quantization/quant_args.py +139 -23
- compressed_tensors/quantization/quant_config.py +35 -2
- compressed_tensors/quantization/quant_scheme.py +112 -13
- compressed_tensors/quantization/utils/helpers.py +68 -2
- compressed_tensors/utils/__init__.py +5 -0
- compressed_tensors/utils/helpers.py +44 -2
- compressed_tensors/utils/offload.py +116 -0
- compressed_tensors/utils/permute.py +70 -0
- compressed_tensors/utils/safetensors_load.py +2 -0
- compressed_tensors/{compressors/utils → utils}/semi_structured_conversions.py +1 -0
- compressed_tensors/version.py +1 -1
- {compressed_tensors-0.4.0.dist-info → compressed_tensors-0.6.0.dist-info}/METADATA +35 -22
- compressed_tensors-0.6.0.dist-info/RECORD +52 -0
- {compressed_tensors-0.4.0.dist-info → compressed_tensors-0.6.0.dist-info}/WHEEL +1 -1
- compressed_tensors/compressors/int_quantized.py +0 -126
- compressed_tensors/compressors/utils/helpers.py +0 -43
- compressed_tensors-0.4.0.dist-info/RECORD +0 -48
- /compressed_tensors/{compressors/utils → utils}/permutations_24.py +0 -0
- {compressed_tensors-0.4.0.dist-info → compressed_tensors-0.6.0.dist-info}/LICENSE +0 -0
- {compressed_tensors-0.4.0.dist-info → compressed_tensors-0.6.0.dist-info}/top_level.txt +0 -0
compressed_tensors/quantization/observers/base.py

@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import logging
+from math import ceil
 from typing import Any, Iterable, Optional, Tuple, Union
 
 import torch
@@ -20,10 +22,14 @@ from compressed_tensors.quantization.quant_args import (
     QuantizationStrategy,
 )
 from compressed_tensors.registry.registry import RegistryMixin
+from compressed_tensors.utils import safe_permute
 from torch import FloatTensor, IntTensor, Tensor
 from torch.nn import Module
 
 
+_LOGGER = logging.getLogger(__name__)
+
+
 __all__ = ["Observer"]
 
 
@@ -39,16 +45,21 @@ class Observer(Module, RegistryMixin):
         super().__init__()
         self._scale = None
         self._zero_point = None
+        self._num_observed_tokens = None
 
     @torch.no_grad()
-    def forward(self, observed: Tensor) -> Tuple[FloatTensor, IntTensor]:
+    def forward(
+        self, observed: Tensor, g_idx: Optional[Tensor] = None
+    ) -> Tuple[FloatTensor, IntTensor]:
         """
         maps directly to get_qparams
-        :param observed: optional observed tensor to calculate
-            quantization parameters from
+        :param observed: optional observed tensor from which to calculate
+            quantization parameters
+        :param g_idx: optional mapping from column index to group index
         :return: tuple of scale and zero point based on last observed value
         """
-        return self.get_qparams(observed=observed)
+        self.record_observed_tokens(observed)
+        return self.get_qparams(observed=observed, g_idx=g_idx)
 
     def calculate_qparams(
         self,
@@ -71,7 +82,9 @@ class Observer(Module, RegistryMixin):
         ...
 
     def get_qparams(
-        self, observed: Optional[Tensor] = None
+        self,
+        observed: Optional[Tensor] = None,
+        g_idx: Optional[Tensor] = None,
     ) -> Tuple[FloatTensor, IntTensor]:
         """
         Convenience function to wrap overwritten calculate_qparams
@@ -80,6 +93,7 @@ class Observer(Module, RegistryMixin):
 
         :param observed: optional observed tensor to calculate quantization parameters
             from
+        :param g_idx: optional mapping from column index to group index
         :return: tuple of scale and zero point based on last observed value
         """
         if observed is not None:
@@ -91,20 +105,42 @@ class Observer(Module, RegistryMixin):
                 self._scale, self._zero_point = self.calculate_qparams(observed)
 
             elif self.quantization_args.strategy == QuantizationStrategy.GROUP:
+                rows = observed.shape[0]
                 columns = observed.shape[1]
-                scales, zero_points = [], []
-                group_idxs = range(0, columns, group_size)
-                for group_id, group_idx in enumerate(group_idxs):
+                num_groups = int(ceil(columns / group_size))
+                self._scale = torch.empty(
+                    (rows, num_groups), dtype=observed.dtype, device=observed.device
+                )
+                zp_dtype = self.quantization_args.pytorch_dtype()
+                self._zero_point = torch.empty(
+                    (rows, num_groups), dtype=zp_dtype, device=observed.device
+                )
+
+                # support column-order (default) quantization as well as other orderings
+                # such as activation ordering. Below checks if g_idx has initialized
+                is_column_order = g_idx is None or -1 in g_idx
+                if is_column_order:
+                    group_sizes = torch.full((num_groups,), group_size, dtype=torch.int)
+                else:
+                    group_indices, group_sizes = torch.unique(g_idx, return_counts=True)
+                    group_sizes = group_sizes[torch.argsort(group_indices)]
+
+                    perm = torch.argsort(g_idx)
+                    observed = safe_permute(observed, perm, dim=1)
+
+                # TODO: experiment with vectorizing for loop for performance
+                end = 0
+                for group_index, group_count in enumerate(group_sizes):
+                    start = end
+                    end = start + group_count
                     scale, zero_point = self.get_qparams_along_dim(
-                        observed[:, group_idx : (group_idx + group_size)],
+                        observed[:, start:end],
                         0,
-                        tensor_id=group_id,
+                        tensor_id=group_index,
                     )
-                    scales.append(scale)
-                    zero_points.append(zero_point)
 
-                self._scale = torch.cat(scales, dim=1)
-                self._zero_point = torch.cat(zero_points, dim=1)
+                    self._scale[:, group_index] = scale.squeeze(1)
+                    self._zero_point[:, group_index] = zero_point.squeeze(1)
 
             elif self.quantization_args.strategy == QuantizationStrategy.CHANNEL:
                 # assume observed is transposed, because its the output, hence use dim 0
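
The ordering logic in the group branch above is easiest to see on a toy tensor. A minimal sketch with a synthetic g_idx (illustrative values, not part of the diff):

import torch

g_idx = torch.tensor([1, 0, 0, 1, 2, 2])  # hypothetical column -> group mapping
group_indices, group_sizes = torch.unique(g_idx, return_counts=True)
group_sizes = group_sizes[torch.argsort(group_indices)]  # counts ordered by group id
perm = torch.argsort(g_idx)  # permutation making each group's columns contiguous

print(group_sizes.tolist())  # [2, 2, 2]
print(perm.tolist())         # e.g. [1, 2, 0, 3, 4, 5]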
@@ -126,9 +162,52 @@ class Observer(Module, RegistryMixin):
         dim: Union[int, Iterable[int]],
         tensor_id: Optional[Any] = None,
     ):
+        if isinstance(dim, int):
+            dim = [dim]
         dim = set(dim)
 
         reduce_dims = tuple(idx for idx in range(observed.ndim) if idx not in dim)
         return self.calculate_qparams(
             observed, reduce_dims=reduce_dims, tensor_id=tensor_id
         )
+
+    def record_observed_tokens(self, batch_tensor: Tensor):
+        """
+        Counts the number of tokens observed during the
+        forward passes. The count is aggregated in the
+        _num_observed_tokens attribute of the class.
+
+        Note: The batch_tensor is expected to have two dimensions
+        (batch_size * sequence_length, num_features). This is the
+        general shape expected by the forward pass of the expert
+        layers in a MOE model. If the input tensor does not have
+        two dimensions, the _num_observed_tokens attribute will be set
+        to None.
+        """
+        if not isinstance(batch_tensor, Tensor):
+            raise ValueError(f"Expected value to be a tensor, got {type(batch_tensor)}")
+
+        if batch_tensor.ndim != 2:
+            _LOGGER.debug(
+                "The input tensor is expected to have two dimensions "
+                "(batch_size * sequence_length, num_features). "
+                f"The input tensor has {batch_tensor.ndim} dimensions."
+            )
+            return
+
+        if self._num_observed_tokens is None:
+            # initialize the count
+            self._num_observed_tokens = 0
+
+        # batch_tensor (batch_size * sequence_length, num_features)
+        # observed_tokens (batch_size * sequence_length)
+        observed_tokens, _ = batch_tensor.shape
+        self._num_observed_tokens += observed_tokens
+
+    def reset(self):
+        """
+        Reset the state of the observer
+        """
+        self._num_observed_tokens = None
+        self._scale = None
+        self._zero_point = None
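
Taken together, these hunks give Observer a two-argument forward plus running token statistics. A hedged usage sketch against the 0.6.0 API (shapes are illustrative; get_observer comes from QuantizationArgs, shown later in this diff):

import torch
from compressed_tensors.quantization.quant_args import QuantizationArgs

args = QuantizationArgs(num_bits=4, strategy="group", group_size=128)
observer = args.get_observer()  # resolves the default "minmax" observer

weight = torch.randn(512, 1024)
scale, zero_point = observer(weight)  # g_idx defaults to None (column order)
print(scale.shape)                    # (512, 8): one qparam pair per group
print(observer._num_observed_tokens)  # 512: rows of the 2d input are counted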
compressed_tensors/quantization/observers/helpers.py

@@ -12,23 +12,45 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from collections import Counter
 from typing import Tuple
 
 import torch
-from compressed_tensors.quantization.quant_args import QuantizationArgs
+from compressed_tensors.quantization.quant_args import (
+    FP8_DTYPE,
+    QuantizationArgs,
+    QuantizationType,
+)
 from torch import FloatTensor, IntTensor, Tensor
 
 
-__all__ = ["calculate_qparams"]
+__all__ = ["calculate_qparams", "get_observer_token_count", "calculate_range"]
+
+
+def get_observer_token_count(module: torch.nn.Module) -> Counter:
+    """
+    Parse the module and return the number of tokens observed by
+    each module's observer.
+
+    :param module: module to parse
+    :return: counter with the number of tokens observed by each observer
+    """
+    token_counts = Counter()
+    for name, module in module.named_modules():
+        if name.endswith(".input_observer"):
+            token_counts[
+                name.replace(".input_observer", "")
+            ] = module._num_observed_tokens
+    return token_counts
 
 
 def calculate_qparams(
     min_vals: Tensor, max_vals: Tensor, quantization_args: QuantizationArgs
 ) -> Tuple[FloatTensor, IntTensor]:
     """
     :param min_vals: tensor of min value(s) to calculate scale(s) and zero point(s)
         from
     :param max_vals: tensor of max value(s) to calculate scale(s) and zero point(s)
         from
     :param quantization_args: settings to quantization
     :return: tuple of the calculated scale(s) and zero point(s)
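
A sketch of how the new helper might be called after calibration; `model` is a hypothetical torch.nn.Module whose quantized layers carry `.input_observer` submodules, which is the suffix the helper matches on:

from compressed_tensors.quantization.observers.helpers import get_observer_token_count

counts = get_observer_token_count(model)  # model: calibrated nn.Module (assumption)
print(counts.most_common(3))  # e.g. [('model.layers.0.self_attn.q_proj', 131072), ...]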
@@ -37,22 +59,53 @@ def calculate_qparams(
     max_vals = torch.max(max_vals, torch.zeros_like(max_vals))
     device = min_vals.device
 
-    bit_range = 2**quantization_args.num_bits - 1
-    bit_min = -(bit_range + 1) / 2
-    bit_max = bit_min + bit_range
+    bit_min, bit_max = calculate_range(quantization_args, device)
+    bit_range = bit_max - bit_min
+    zp_dtype = quantization_args.pytorch_dtype()
+
     if quantization_args.symmetric:
-        max_val_pos = torch.max(-min_vals, max_vals)
+        max_val_pos = torch.max(torch.abs(min_vals), torch.abs(max_vals))
         scales = max_val_pos / (float(bit_range) / 2)
         scales = torch.clamp(scales, min=torch.finfo(torch.float32).eps)
-        zero_points = torch.zeros(scales.shape, device=device, dtype=torch.int8)
+        zero_points = torch.zeros(scales.shape, device=device, dtype=min_vals.dtype)
     else:
         scales = (max_vals - min_vals) / float(bit_range)
         scales = torch.clamp(scales, min=torch.finfo(torch.float32).eps)
-        zero_points = bit_min - torch.round(min_vals / scales)
-        zero_points = torch.clamp(zero_points, bit_min, bit_max).to(torch.int8)
+        zero_points = bit_min - (min_vals / scales)
+        zero_points = torch.clamp(zero_points, bit_min, bit_max)
+
+    # match zero-points to quantized type
+    zero_points = zero_points.to(zp_dtype)
 
     if scales.ndim == 0:
         scales = scales.reshape(1)
         zero_points = zero_points.reshape(1)
 
     return scales, zero_points
+
+
+def calculate_range(quantization_args: QuantizationArgs, device: str) -> Tuple:
+    """
+    Calculated the effective quantization range for the given Quantization Args
+
+    :param quantization_args: quantization args to get range of
+    :param device: device to store the range to
+    :return: tuple endpoints for the given quantization range
+    """
+    if quantization_args.type == QuantizationType.INT:
+        bit_range = 2**quantization_args.num_bits
+        q_max = torch.tensor(bit_range / 2 - 1, device=device)
+        q_min = torch.tensor(-bit_range / 2, device=device)
+    elif quantization_args.type == QuantizationType.FLOAT:
+        if quantization_args.num_bits != 8:
+            raise ValueError(
+                "Floating point quantization is only supported for 8 bits,"
+                f"got {quantization_args.num_bits}"
+            )
+        fp_range_info = torch.finfo(FP8_DTYPE)
+        q_max = torch.tensor(fp_range_info.max, device=device)
+        q_min = torch.tensor(fp_range_info.min, device=device)
+    else:
+        raise ValueError(f"Invalid quantization type {quantization_args.type}")
+
+    return q_min, q_max
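
A quick numeric check of both branches of calculate_range (a sketch, not part of the diff; the fp8 endpoints come from torch.finfo on FP8_DTYPE):

import torch
from compressed_tensors.quantization.observers.helpers import calculate_range
from compressed_tensors.quantization.quant_args import QuantizationArgs

q_min, q_max = calculate_range(QuantizationArgs(num_bits=8), device="cpu")
print(q_min.item(), q_max.item())  # -128.0 127.0, since 2**8 / 2 = 128

q_min, q_max = calculate_range(QuantizationArgs(num_bits=8, type="float"), device="cpu")
print(q_min.item(), q_max.item())  # -448.0 448.0 for float8_e4m3fn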
compressed_tensors/quantization/observers/min_max.py

@@ -94,3 +94,11 @@ class MovingAverageMinMaxObserver(Observer):
         return self.calculate_qparams(
             observed, reduce_dims=reduce_dims, tensor_id=tensor_id
         )
+
+    def reset(self):
+        """
+        Reset the state of the observer, including min and maximum values
+        """
+        super().reset()
+        self.min_val = {}
+        self.max_val = {}
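
The reset hooks allow one observer to be reused across independent calibration passes; a short sketch under that assumption:

import torch
from compressed_tensors.quantization.quant_args import QuantizationArgs

observer = QuantizationArgs(num_bits=8).get_observer()  # default "minmax"
scale_a, _ = observer(torch.randn(64, 128))

observer.reset()  # clears _scale, _zero_point, _num_observed_tokens, min/max dicts
scale_b, _ = observer(torch.randn(64, 128))  # fresh statistics, no moving-average carryover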
compressed_tensors/quantization/observers/mse.py (new file)

@@ -0,0 +1,162 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Optional, Tuple
+
+import torch
+from compressed_tensors.quantization.observers.base import Observer
+from compressed_tensors.quantization.observers.helpers import calculate_qparams
+from compressed_tensors.quantization.quant_args import QuantizationArgs
+from torch import FloatTensor, IntTensor, Tensor
+
+
+__all__ = ["MovingAverageMSEObserver"]
+
+
+@Observer.register("mse")
+class MovingAverageMSEObserver(Observer):
+    """
+    Implements a dynamic quantization observer that sets the scale and
+    zero point based on a moving average of the mse-clipped min and max observed values
+    """
+
+    def __init__(
+        self,
+        quantization_args: QuantizationArgs,
+        averaging_constant: float = 0.01,
+        grid: float = 100.0,
+        maxshrink: float = 0.80,
+        norm: float = 2.4,
+    ):
+        super().__init__(quantization_args=quantization_args)
+
+        self.min_val = {}
+        self.max_val = {}
+        self.averaging_constant = averaging_constant
+        self.grid = grid
+        self.maxshrink = maxshrink
+        self.norm = norm
+
+    def calculate_mse_min_max(
+        self,
+        observed: Tensor,
+        reduce_dims: Optional[Tuple[int]] = None,
+    ):
+        """
+        Computes the mse-clipped min and max values of the observed tensor by
+        optimizing for quantization error
+
+        :param observed: observed tensor to calculate quantization parameters for
+        :param reduce_dims: optional tuple of dimensions to reduce along,
+            returned values will be shaped (1,) along the reduced dimensions
+        :return: tuple of min and max values derived from the observed tensor
+        """
+        from compressed_tensors.quantization.lifecycle import fake_quantize
+
+        if not reduce_dims:
+            absolute_min_val, absolute_max_val = torch.aminmax(observed)
+        else:
+            absolute_min_val = torch.amin(observed, dim=reduce_dims, keepdims=True)
+            absolute_max_val = torch.amax(observed, dim=reduce_dims, keepdims=True)
+
+        best = torch.full(absolute_min_val.shape, float("inf"))
+        min_val = torch.ones(absolute_min_val.shape)
+        max_val = torch.zeros(absolute_max_val.shape)
+        for i in range(int(self.maxshrink * self.grid)):
+            p = 1 - i / self.grid
+            shrinked_min_val = p * absolute_min_val
+            shrinked_max_val = p * absolute_max_val
+
+            candidate_scales, candidate_zero_points = calculate_qparams(
+                shrinked_min_val, shrinked_max_val, self.quantization_args
+            )
+            q = fake_quantize(
+                observed,
+                candidate_scales,
+                candidate_zero_points,
+                self.quantization_args,
+            )
+
+            q -= observed
+            q.abs_()
+            q.pow_(self.norm)
+            if not reduce_dims:
+                err = torch.sum(q)
+            else:
+                err = torch.sum(q, reduce_dims, keepdims=True)
+
+            tmp = err < best
+            if torch.any(tmp):
+                best[tmp] = err[tmp]
+                min_val[tmp] = shrinked_min_val[tmp]
+                max_val[tmp] = shrinked_max_val[tmp]
+        return min_val, max_val
+
+    def calculate_qparams(
+        self,
+        observed: Tensor,
+        reduce_dims: Optional[Tuple[int]] = None,
+        tensor_id: Optional[Any] = None,
+    ) -> Tuple[FloatTensor, IntTensor]:
+        """
+        Updates the mse-clipped min and max values of the observed tensor using
+        a moving average smoothed by the averaging_constant
+
+        :param observed: observed tensor to calculate quantization parameters for
+        :param reduce_dims: optional tuple of dimensions to reduce along,
+            returned scale and zero point will be shaped (1,) along the
+            reduced dimensions
+        :param tensor_id: Optional id if different ranges of observed tensors are
+            passed, useful for sharding tensors by group_size
+        :return: tuple of scale and zero point derived from the observed tensor
+        """
+        min_val, max_val = self.calculate_mse_min_max(observed, reduce_dims)
+
+        running_min_val = self.min_val.get(tensor_id, None)
+        running_max_val = self.max_val.get(tensor_id, None)
+
+        if running_min_val is None or running_max_val is None:
+            updated_min_val = min_val
+            updated_max_val = max_val
+        else:
+            updated_min_val = running_min_val + self.averaging_constant * (
+                min_val - running_min_val
+            )
+            updated_max_val = running_max_val + self.averaging_constant * (
+                max_val - running_max_val
+            )
+
+        tensor_id = tensor_id or "default"
+        self.min_val[tensor_id] = updated_min_val
+        self.max_val[tensor_id] = updated_max_val
+
+        return calculate_qparams(
+            updated_min_val, updated_max_val, self.quantization_args
+        )
+
+    def get_qparams_along_dim(
+        self, observed, dim: int, tensor_id: Optional[Any] = None
+    ):
+        reduce_dims = tuple(idx for idx in range(observed.ndim) if idx != dim)
+        return self.calculate_qparams(
+            observed, reduce_dims=reduce_dims, tensor_id=tensor_id
+        )
+
+    def reset(self):
+        """
+        Reset the state of the observer, including min and maximum values
+        """
+        super().reset()
+        self.min_val = {}
+        self.max_val = {}
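
A hedged sketch of opting into the new observer through the "mse" registry key registered above:

import torch
from compressed_tensors.quantization.quant_args import QuantizationArgs

args = QuantizationArgs(num_bits=8, observer="mse")
observer = args.get_observer()  # MovingAverageMSEObserver via @Observer.register("mse")

x = torch.randn(512, 768)
scale, zero_point = observer(x)  # grid-searches shrink factors p = 1 - i / grid
scale, zero_point = observer(x)  # second pass smooths via averaging_constant=0.01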
compressed_tensors/quantization/quant_args.py

@@ -13,12 +13,22 @@
 # limitations under the License.
 
 from enum import Enum
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional, Union
 
-from pydantic import BaseModel, Field, validator
+import torch
+from pydantic import BaseModel, Field, field_validator, model_validator
 
 
-__all__ = ["QuantizationType", "QuantizationStrategy", "QuantizationArgs"]
+__all__ = [
+    "FP8_DTYPE",
+    "QuantizationType",
+    "QuantizationStrategy",
+    "QuantizationArgs",
+    "round_to_quantized_type",
+    "ActivationOrdering",
+]
+
+FP8_DTYPE = torch.float8_e4m3fn
 
 
 class QuantizationType(str, Enum):
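
FP8_DTYPE pins the float8 variant used throughout the package; a one-line sanity check (assumes a torch build with float8 support, 2.1 or later):

import torch

info = torch.finfo(torch.float8_e4m3fn)
print(info.min, info.max)  # -448.0 448.0: the endpoints calculate_range returns for fp8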
@@ -42,6 +52,19 @@ class QuantizationStrategy(str, Enum):
     TOKEN = "token"
 
 
+class ActivationOrdering(str, Enum):
+    """
+    Enum storing strategies for activation ordering
+
+    Group: reorder groups and weight\n
+    Weight: only reorder weight, not groups. Slightly lower latency and
+        accuracy compared to group actorder\n
+    """
+
+    GROUP = "group"
+    WEIGHT = "weight"
+
+
 class QuantizationArgs(BaseModel, use_enum_values=True):
     """
     User facing arguments used to define a quantization config for weights or
@@ -59,15 +82,18 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         ranges will be observed with every sample. Defaults to False for static
         quantization. Note that enabling dynamic quantization will change the default
         observer to a memoryless one
+    :param actorder: whether to apply group quantization in decreasing order of
+        activation. Defaults to None for arbitrary ordering
     """
 
     num_bits: int = 8
     type: QuantizationType = QuantizationType.INT
     symmetric: bool = True
     group_size: Optional[int] = None
     strategy: Optional[QuantizationStrategy] = None
     block_structure: Optional[str] = None
     dynamic: bool = False
+    actorder: Union[ActivationOrdering, bool, None] = None
     observer: str = Field(
         default="minmax",
         description=(
@@ -89,37 +115,127 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         """
         from compressed_tensors.quantization.observers.base import Observer
 
-        if self.observer == "minmax" and self.dynamic:
+        if self.dynamic:
             # override defualt observer for dynamic, you never want minmax which
             # keeps state across samples for dynamic
             self.observer = "memoryless"
 
         return Observer.load_from_registry(self.observer, quantization_args=self)
 
-    @validator("strategy", pre=True, always=True)
-    def validate_strategy(cls, value, values):
-        group_size = values.get("group_size")
+    @field_validator("type", mode="before")
+    def validate_type(cls, value) -> QuantizationType:
+        if isinstance(value, str):
+            return QuantizationType(value.lower())
 
-        # use group_size to determine strategy if not given explicitly
-        if group_size is not None and value is None:
-            if group_size > 0:
-                return QuantizationStrategy.GROUP
+        return value
 
-            elif group_size == -1:
-                return QuantizationStrategy.CHANNEL
+    @field_validator("group_size", mode="before")
+    def validate_group(cls, value) -> Union[int, None]:
+        if value is None:
+            return value
+
+        if value < -1:
+            raise ValueError(
+                f"Invalid group size {value}. Use group_size > 0 for "
+                "strategy='group' and group_size = -1 for 'channel'"
+            )
+
+        return value
+
+    @field_validator("strategy", mode="before")
+    def validate_strategy(cls, value) -> Union[QuantizationStrategy, None]:
+        if isinstance(value, str):
+            return QuantizationStrategy(value.lower())
+
+        return value
 
+    @field_validator("actorder", mode="before")
+    def validate_actorder(cls, value) -> Optional[ActivationOrdering]:
+        if isinstance(value, bool):
+            return ActivationOrdering.GROUP if value else None
+
+        if isinstance(value, str):
+            return ActivationOrdering(value.lower())
+
+        return value
+
+    @model_validator(mode="after")
+    def validate_model_after(model: "QuantizationArgs") -> Dict[str, Any]:
+        # extract user-passed values from dictionary
+        strategy = model.strategy
+        group_size = model.group_size
+        actorder = model.actorder
+
+        # infer strategy
+        if strategy is None:
+            if group_size is None:
+                strategy = QuantizationStrategy.TENSOR
+            elif group_size > 0:
+                strategy = QuantizationStrategy.GROUP
+            elif group_size == -1:
+                strategy = QuantizationStrategy.CHANNEL
             else:
                 raise ValueError(
-                    f"group_size={group_size} with strategy {value} is invalid. "
-                    "group_size > 0 for strategy='group' and "
-                    "group_size = -1 for 'channel'"
+                    f"Invalid group size {group_size}. Use group_size > 0 for "
+                    "strategy='group' and group_size = -1 for 'channel'"
                 )
 
-        if value == QuantizationStrategy.GROUP:
-            if group_size is None:
-                raise ValueError(f"strategy {value} requires group_size to be set.")
+        # validate strategy and group
+        if strategy == QuantizationStrategy.GROUP:
+            if group_size is None or group_size <= 0:
+                raise ValueError(
+                    f"strategy {strategy} requires group_size to be "
+                    "set to a positive value"
+                )
+        if (
+            group_size is not None
+            and group_size > 0
+            and strategy != QuantizationStrategy.GROUP
+        ):
+            raise ValueError("group_size requires strategy to be set to 'group'")
+
+        # validate activation ordering and strategy
+        if actorder is not None and strategy != QuantizationStrategy.GROUP:
+            raise ValueError(
+                "Must use group quantization strategy in order to apply "
+                "activation ordering"
+            )
+
+        # write back modified values
+        model.strategy = strategy
+        return model
+
+    def pytorch_dtype(self) -> torch.dtype:
+        if self.type == QuantizationType.FLOAT:
+            return FP8_DTYPE
+        elif self.type == QuantizationType.INT:
+            if self.num_bits <= 8:
+                return torch.int8
+            elif self.num_bits <= 16:
+                return torch.int16
+            else:
+                return torch.int32
+        else:
+            raise ValueError(f"Invalid quantization type {self.type}")
 
-        if value is None:
-            return QuantizationStrategy.TENSOR
 
-        return value
+def round_to_quantized_type(
+    tensor: torch.Tensor, args: QuantizationArgs
+) -> torch.Tensor:
+    """
+    Rounds each element of the input tensor to the nearest quantized representation,
+    keeping to original dtype
+
+    :param tensor: tensor to round
+    :param args: QuantizationArgs to pull appropriate dtype from
+    :return: rounded tensor
+    """
+    original_dtype = tensor.dtype
+    if args.type == QuantizationType.FLOAT:
+        rounded = tensor.to(FP8_DTYPE)
+    elif args.type == QuantizationType.INT:
+        rounded = torch.round(tensor)
+    else:
+        raise ValueError(f"Invalid quantization type {args.type}")
+
+    return rounded.to(original_dtype)