compressed-tensors-nightly 0.3.3.20240521__py3-none-any.whl → 0.3.3.20240523__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- compressed_tensors/quantization/observers/base.py +14 -7
- compressed_tensors/quantization/observers/memoryless.py +3 -1
- compressed_tensors/quantization/observers/min_max.py +31 -11
- compressed_tensors/quantization/quant_config.py +21 -4
- compressed_tensors/quantization/quant_scheme.py +72 -1
- {compressed_tensors_nightly-0.3.3.20240521.dist-info → compressed_tensors_nightly-0.3.3.20240523.dist-info}/METADATA +1 -1
- {compressed_tensors_nightly-0.3.3.20240521.dist-info → compressed_tensors_nightly-0.3.3.20240523.dist-info}/RECORD +10 -10
- {compressed_tensors_nightly-0.3.3.20240521.dist-info → compressed_tensors_nightly-0.3.3.20240523.dist-info}/LICENSE +0 -0
- {compressed_tensors_nightly-0.3.3.20240521.dist-info → compressed_tensors_nightly-0.3.3.20240523.dist-info}/WHEEL +0 -0
- {compressed_tensors_nightly-0.3.3.20240521.dist-info → compressed_tensors_nightly-0.3.3.20240523.dist-info}/top_level.txt +0 -0
compressed_tensors/quantization/observers/base.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional, Tuple
+from typing import Any, Optional, Tuple
 
 import torch
 from compressed_tensors.quantization.quant_args import (
@@ -93,15 +93,18 @@ class Observer(Module, RegistryMixin):
         elif self.quantization_args.strategy == QuantizationStrategy.GROUP:
             columns = observed.shape[1]
             scales, zero_points = [], []
-            …
+            group_idxs = range(0, columns, self.quantization_args.group_size)
+            for group_id, group_idx in enumerate(group_idxs):
                 scale, zero_point = self.get_qparams_along_dim(
-                    observed[:, …
+                    observed[:, group_idx : (group_idx + group_size)],
                     0,
+                    tensor_id=group_id,
                 )
                 scales.append(scale)
                 zero_points.append(zero_point)
-            …
-            self.…
+
+            self._scale = torch.cat(scales, dim=1, out=self._scale)
+            self._zero_point = torch.cat(zero_points, dim=1, out=self._zero_point)
 
         elif self.quantization_args.strategy == QuantizationStrategy.CHANNEL:
             # assume observed is transposed, because its the output, hence use dim 0
@@ -116,6 +119,10 @@ class Observer(Module, RegistryMixin):
 
         return self._scale, self._zero_point
 
-    def get_qparams_along_dim(…
+    def get_qparams_along_dim(
+        self, observed, dim: int, tensor_id: Optional[Any] = None
+    ):
         reduce_dims = tuple(idx for idx in range(observed.ndim) if idx != dim)
-        return self.calculate_qparams(…
+        return self.calculate_qparams(
+            observed, reduce_dims=reduce_dims, tensor_id=tensor_id
+        )
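
Net effect of the base.py changes: under QuantizationStrategy.GROUP, each slice of group_size columns is now observed with its own tensor_id, so stateful observers can keep separate running statistics per group instead of smearing one running min/max across all groups. A standalone sketch of the per-group computation (illustrative only, not library code; assumes symmetric int8 scales):

    import torch

    # Mirror of the GROUP branch above: one set of qparams per column group.
    observed = torch.randn(64, 512)  # e.g. a Linear weight matrix
    group_size = 128

    scales = []
    for group_id, group_idx in enumerate(range(0, observed.shape[1], group_size)):
        group = observed[:, group_idx : group_idx + group_size]
        # per-row absolute range within this group -> symmetric int8 scale
        max_abs = group.abs().amax(dim=1, keepdim=True)
        scales.append(max_abs / 127.0)

    # shape (64, 4): one scale column per group, matching the
    # torch.cat(scales, dim=1) call in the diff above
    scale = torch.cat(scales, dim=1)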
compressed_tensors/quantization/observers/memoryless.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional, Tuple
+from typing import Any, Optional, Tuple
 
 import torch
 from compressed_tensors.quantization.observers.base import Observer
@@ -33,12 +33,14 @@ class MemorylessObserver(Observer):
     def calculate_qparams(
         self,
         observed: Tensor,
+        tensor_id: Optional[Any] = None,
         reduce_dims: Optional[Tuple[int]] = None,
     ) -> Tuple[FloatTensor, IntTensor]:
         """
         Returns the min and max values of observed tensor
 
         :param observed: observed tensor to calculate quantization parameters for
+        :param tensor_id: optional id for tensor; not used for memoryless
         :param reduce_dims: optional tuple of dimensions to reduce along,
             returned scale and zero point will be shaped (1,) along the
             reduced dimensions
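
The memoryless observer gains tensor_id purely for signature parity with the base class; it keeps no running state, so the id is ignored. A minimal usage sketch (assuming MemorylessObserver inherits the base Observer constructor, which takes quantization_args as shown earlier):

    import torch
    from compressed_tensors.quantization.quant_args import QuantizationArgs
    from compressed_tensors.quantization.observers.memoryless import MemorylessObserver

    observer = MemorylessObserver(quantization_args=QuantizationArgs())
    x = torch.randn(16, 32)
    # tensor_id is accepted but has no effect: qparams come from this tensor alone
    scale, zero_point = observer.calculate_qparams(x, tensor_id="anything")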
compressed_tensors/quantization/observers/min_max.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional, Tuple
+from typing import Any, Optional, Tuple
 
 import torch
 from compressed_tensors.quantization.observers.base import Observer
@@ -36,14 +36,15 @@ class MovingAverageMinMaxObserver(Observer):
     ):
         super().__init__(quantization_args=quantization_args)
 
-        self.min_val = …
-        self.max_val = …
+        self.min_val = {}
+        self.max_val = {}
         self.averaging_constant = averaging_constant
 
     def calculate_qparams(
         self,
         observed: Tensor,
         reduce_dims: Optional[Tuple[int]] = None,
+        tensor_id: Optional[Any] = None,
     ) -> Tuple[FloatTensor, IntTensor]:
         """
         Updates the observed min and max using a moving average smoothed by the
@@ -53,8 +54,11 @@ class MovingAverageMinMaxObserver(Observer):
         :param reduce_dims: optional tuple of dimensions to reduce along,
             returned scale and zero point will be shaped (1,) along the
             reduced dimensions
+        :param tensor_id: Optional id if different ranges of observed tensors are
+            passed, useful for sharding tensors by group_size
         :return: tuple of scale and zero point derived from the observed tensor
         """
+        tensor_id = tensor_id or "default"
 
         if not reduce_dims:
             min_val, max_val = torch.aminmax(observed)
@@ -62,15 +66,31 @@ class MovingAverageMinMaxObserver(Observer):
             min_val = torch.amin(observed, dim=reduce_dims, keepdims=True)
             max_val = torch.amax(observed, dim=reduce_dims, keepdims=True)
 
-        …
-        …
-        …
+        running_min_val = self.min_val.get(tensor_id, None)
+        running_max_val = self.max_val.get(tensor_id, None)
+
+        if running_min_val is None or running_max_val is None:
+            updated_min_val = min_val
+            updated_max_val = max_val
         else:
-            …
-                min_val - …
+            updated_min_val = running_min_val + self.averaging_constant * (
+                min_val - running_min_val
             )
-            …
-                max_val - …
+            updated_max_val = running_max_val + self.averaging_constant * (
+                max_val - running_max_val
             )
 
-        …
+        self.min_val[tensor_id] = updated_min_val
+        self.max_val[tensor_id] = updated_max_val
+
+        return calculate_qparams(
+            updated_min_val, updated_max_val, self.quantization_args
+        )
+
+    def get_qparams_along_dim(
+        self, observed, dim: int, tensor_id: Optional[Any] = None
+    ):
+        reduce_dims = tuple(idx for idx in range(observed.ndim) if idx != dim)
+        return self.calculate_qparams(
+            observed, reduce_dims=reduce_dims, tensor_id=tensor_id
+        )
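
Because min_val/max_val are now dicts keyed by tensor_id, a single MovingAverageMinMaxObserver can maintain an independent moving average per weight group. A hedged usage sketch (the averaging_constant parameter name comes from the hunk above, but its default is not shown, so it is passed explicitly). Note the `tensor_id = tensor_id or "default"` fallback: a falsy id such as 0 collapses onto the "default" key, so string ids are used here:

    import torch
    from compressed_tensors.quantization.quant_args import QuantizationArgs
    from compressed_tensors.quantization.observers.min_max import (
        MovingAverageMinMaxObserver,
    )

    args = QuantizationArgs(num_bits=8, symmetric=True)
    observer = MovingAverageMinMaxObserver(
        quantization_args=args, averaging_constant=0.01
    )

    w = torch.randn(128, 256)
    # each id accumulates its own running min/max across calibration steps
    scale_a, zp_a = observer.calculate_qparams(w[:, :128], tensor_id="g1")
    scale_b, zp_b = observer.calculate_qparams(w[:, 128:], tensor_id="g2")
    assert set(observer.min_val) == {"g1", "g2"}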
compressed_tensors/quantization/quant_config.py
@@ -13,11 +13,14 @@
 # limitations under the License.
 
 from enum import Enum
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Union
 
 from compressed_tensors.base import QUANTIZATION_CONFIG_NAME
 from compressed_tensors.config import CompressionFormat
-from compressed_tensors.quantization.quant_scheme import QuantizationScheme
+from compressed_tensors.quantization.quant_scheme import (
+    QuantizationScheme,
+    preset_name_to_scheme,
+)
 from compressed_tensors.quantization.utils import (
     calculate_compression_ratio,
     is_module_quantized,
@@ -105,7 +108,8 @@ class QuantizationConfig(BaseModel):
         mapped to a QuantizationScheme in config_groups.
 
     :param config_groups: dict of QuantizationSchemes specifying the quantization
-        settings for each quantized layer
+        settings for each quantized layer. A group could also be a reference to
+        a predefined scheme name, mapped to a list of its target layers/classes
     :param quant_method: a constant used to differentiate sparseML quantization from
         other quantization configs
     :param format: specifies how the quantized model is stored on disk
@@ -117,13 +121,26 @@ class QuantizationConfig(BaseModel):
         are not quantized even if they match up with a target in config_groups
     """
 
-    config_groups: Dict[str, QuantizationScheme]
+    config_groups: Dict[str, Union[QuantizationScheme, List[str]]]
     quant_method: str = "sparseml"
     format: str = "fakequant"
     quantization_status: QuantizationStatus = QuantizationStatus.INITIALIZED
     global_compression_ratio: Optional[float] = None
     ignore: Optional[List[str]] = Field(default_factory=list)
 
+    def model_post_init(self, __context):
+        """
+        updates any quantization schemes defined as presets to be fully loaded
+        schemes
+        """
+        for group_name, targets_or_scheme in self.config_groups.items():
+            if isinstance(targets_or_scheme, QuantizationScheme):
+                continue  # scheme already defined
+            self.config_groups[group_name] = preset_name_to_scheme(
+                name=group_name,
+                targets=targets_or_scheme,
+            )
+
     @staticmethod
     def from_model_config(model_name_or_path) -> "QuantizationConfig":
         """
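
Taken together, a config_groups entry can now be just a list of targets keyed by a preset name, and model_post_init (a pydantic v2 hook that runs at construction time) expands it into a full QuantizationScheme. A hedged sketch (assuming QuantizationConfig is importable from the module shown above; the group name must match a PRESET_SCHEMES key, case-insensitively):

    from compressed_tensors.quantization.quant_config import QuantizationConfig
    from compressed_tensors.quantization.quant_scheme import QuantizationScheme

    # "W4A16" names both the group and the preset; the value lists target layers
    config = QuantizationConfig(config_groups={"W4A16": ["Linear"]})

    group = config.config_groups["W4A16"]
    assert isinstance(group, QuantizationScheme)  # expanded by model_post_init
    assert group.weights.num_bits == 4  # from the W4A16 preset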
compressed_tensors/quantization/quant_scheme.py
@@ -12,13 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from copy import deepcopy
 from typing import List, Optional
 
 from compressed_tensors.quantization.quant_args import QuantizationArgs
 from pydantic import BaseModel
 
 
-__all__ = ["QuantizationScheme"]
+__all__ = [
+    "QuantizationScheme",
+    "preset_name_to_scheme",
+]
 
 
 class QuantizationScheme(BaseModel):
@@ -37,3 +41,70 @@ class QuantizationScheme(BaseModel):
     weights: Optional[QuantizationArgs] = None
     input_activations: Optional[QuantizationArgs] = None
     output_activations: Optional[QuantizationArgs] = None
+
+    @classmethod
+    def default_scheme(
+        cls,
+        targets: Optional[List[str]] = None,
+    ):
+
+        if targets is None:
+            # default to quantizing all Linear layers
+            targets = ["Linear"]
+
+        # default to 8 bit integer symmetric quantization
+        # for weights
+        weights = QuantizationArgs(num_bits=8, symmetric=True)
+
+        # default to 8 bit integer asymmetric quantization
+        input_activations = QuantizationArgs(num_bits=8, symmetric=True)
+
+        # Do not quantize the output activations
+        # by default
+        output_activations = None
+
+        return cls(
+            targets=targets,
+            weights=weights,
+            input_activations=input_activations,
+            output_activations=output_activations,
+        )
+
+
+"""
+Pre-Set Quantization Scheme Args
+"""
+
+
+def preset_name_to_scheme(name: str, targets: List[str]) -> QuantizationScheme:
+    """
+    :param name: preset quantization settings name. must exist in upper case in
+        PRESET_SCHEMES
+    :param targets: list of quantization targets to be passed to the Scheme
+    :return: new QuantizationScheme for a given name with the given targets
+    """
+    name = name.upper()
+
+    if name not in PRESET_SCHEMES:
+        raise KeyError(
+            f"Unknown preset scheme name {name}, "
+            f"available names: {list(PRESET_SCHEMES.keys())}"
+        )
+
+    scheme_args = deepcopy(PRESET_SCHEMES[name])  # deepcopy to avoid args references
+    return QuantizationScheme(
+        targets=targets,
+        **scheme_args,
+    )
+
+
+W8A8 = dict(
+    weights=QuantizationArgs(), input_activations=QuantizationArgs(symmetric=False)
+)
+
+W4A16 = dict(weights=QuantizationArgs(num_bits=4, symmetric=False))
+
+PRESET_SCHEMES = {
+    "W8A8": W8A8,
+    "W4A16": W4A16,
+}
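
The deepcopy in preset_name_to_scheme is what keeps schemes built from the same preset independent; without it they would share QuantizationArgs instances. A quick sketch of direct use ("W2A8" is a hypothetical unknown name; names are upper-cased before lookup, so lower-case input is fine):

    from compressed_tensors.quantization.quant_scheme import preset_name_to_scheme

    scheme = preset_name_to_scheme("w4a16", targets=["Linear"])
    assert scheme.weights.num_bits == 4

    other = preset_name_to_scheme("W4A16", targets=["Embedding"])
    assert scheme.weights is not other.weights  # deepcopy -> no shared args

    try:
        preset_name_to_scheme("W2A8", targets=["Linear"])
    except KeyError as err:
        print(err)  # lists the available preset names: ['W8A8', 'W4A16']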
{compressed_tensors_nightly-0.3.3.20240521.dist-info → compressed_tensors_nightly-0.3.3.20240523.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: compressed-tensors-nightly
-Version: 0.3.3.20240521
+Version: 0.3.3.20240523
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
{compressed_tensors_nightly-0.3.3.20240521.dist-info → compressed_tensors_nightly-0.3.3.20240523.dist-info}/RECORD
@@ -15,8 +15,8 @@ compressed_tensors/config/dense.py,sha256=NgSxnFCnckU9-iunxEaqiFwqgdO7YYxlWKR74j
 compressed_tensors/config/sparse_bitmask.py,sha256=pZUboRNZTu6NajGOQEFExoPknak5ynVAUeiiYpS1Gt8,1308
 compressed_tensors/quantization/__init__.py,sha256=83J5bPB7PavN2TfCoW7_vEDhfYpm4TDrqYO9vdSQ5bk,760
 compressed_tensors/quantization/quant_args.py,sha256=A6b2V8lhsM8Ho8RjlPBQdxRUDNWhqq-ie5E3RR2_GNg,4360
-compressed_tensors/quantization/quant_config.py,sha256=…
-compressed_tensors/quantization/quant_scheme.py,sha256=…
+compressed_tensors/quantization/quant_config.py,sha256=3BcbQ8-Ah7LbTDSSkRu29Yiid33xo0C1ki6NVhxLiaY,8727
+compressed_tensors/quantization/quant_scheme.py,sha256=QwZsCo8QR9ISB_d58WhIngk2gsMM8ooX-LcRPR-JDRw,3341
 compressed_tensors/quantization/lifecycle/__init__.py,sha256=ggRGWRqhCxCaTTDWRcgTVX3axnS2xV6rc5YvdzK7fSg,798
 compressed_tensors/quantization/lifecycle/apply.py,sha256=whKfNGC_EZm0BC23AP7qWfjRe5OJVWmcZOpX7lryZZc,7625
 compressed_tensors/quantization/lifecycle/calibration.py,sha256=mLns4jlaWmBwOW8Jtlm5bMX-JET1AiZYUBO7qa-XuxI,1776
@@ -25,10 +25,10 @@ compressed_tensors/quantization/lifecycle/forward.py,sha256=x9JaIX3TK7cb_-0aCOTT
 compressed_tensors/quantization/lifecycle/frozen.py,sha256=h1XYt89MouBTf3jTYLG_6OdFxIu5q2N8tPjsy6J4E6Y,1726
 compressed_tensors/quantization/lifecycle/initialize.py,sha256=U6g9qifSF6pagQZQZEwd-rwWC6uQ_dZXn1wg6nr1Abg,3697
 compressed_tensors/quantization/observers/__init__.py,sha256=DNH31NQYrIBBcmHsMyFA6whh4pbRsLwuNa6L8AeXaGc,745
-compressed_tensors/quantization/observers/base.py,sha256=…
+compressed_tensors/quantization/observers/base.py,sha256=kywLVwycFvGxuZMU2cy8-KYyNrZCHkinN6YzCL7boLE,5121
 compressed_tensors/quantization/observers/helpers.py,sha256=JwALNfBYY9Eyl8Q180t0lGh8szumQj8TygfNl-isErs,2166
-compressed_tensors/quantization/observers/memoryless.py,sha256=…
-compressed_tensors/quantization/observers/min_max.py,sha256=…
+compressed_tensors/quantization/observers/memoryless.py,sha256=jH_c6K3gxf4W3VNXQ7tbnP-J_86QTrEfjBn6Kh1C-H8,2165
+compressed_tensors/quantization/observers/min_max.py,sha256=UK7zCMzxv9GGn6BflBxdajV20RiWaCY2RHcvZodCP1w,3669
 compressed_tensors/quantization/utils/__init__.py,sha256=VdtEmP0bvuND_IGQnyqUPc5lnFp-1_yD7StKSX4x80w,656
 compressed_tensors/quantization/utils/helpers.py,sha256=NzAH18Cn_-mTAR87y6IlcQU5gC393XSjgNKC9CRkr78,6017
 compressed_tensors/registry/__init__.py,sha256=FwLSNYqfIrb5JD_6OK_MT4_svvKTN_nEhpgQlQvGbjI,658
@@ -36,8 +36,8 @@ compressed_tensors/registry/registry.py,sha256=fxjOjh2wklCvJhQxwofdy-zV8q7MkQ85S
 compressed_tensors/utils/__init__.py,sha256=5DrYjoZbaEvSkJcC-GRSbM_RBHVF4tG9gMd3zsJnjLw,665
 compressed_tensors/utils/helpers.py,sha256=h0jfl9drs5FAx40tCHRcVtJqXixB5hT5yq_IG2aY_-w,1735
 compressed_tensors/utils/safetensors_load.py,sha256=wo9UirGrGlenBqZeqotvpCT7D5MEdjCo2J3HeRaIFoU,8502
-compressed_tensors_nightly-0.3.3.20240521.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-compressed_tensors_nightly-0.3.3.20240521.dist-info/METADATA,sha256=…
-compressed_tensors_nightly-0.3.3.20240521.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-compressed_tensors_nightly-0.3.3.20240521.dist-info/top_level.txt,sha256=w2i-GyPs2s1UwVxvutSvN_lM22SXC2hQFBmoMcPnV7Y,19
-compressed_tensors_nightly-0.3.3.20240521.dist-info/RECORD,,
+compressed_tensors_nightly-0.3.3.20240523.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+compressed_tensors_nightly-0.3.3.20240523.dist-info/METADATA,sha256=_c67GXEm0cMZ_AGWhcLqsMZ3hSbFB4KdQ3lL9Dg7M8M,5633
+compressed_tensors_nightly-0.3.3.20240523.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+compressed_tensors_nightly-0.3.3.20240523.dist-info/top_level.txt,sha256=w2i-GyPs2s1UwVxvutSvN_lM22SXC2hQFBmoMcPnV7Y,19
+compressed_tensors_nightly-0.3.3.20240523.dist-info/RECORD,,