compressed-tensors 0.4.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. compressed_tensors/base.py +1 -0
  2. compressed_tensors/compressors/__init__.py +5 -1
  3. compressed_tensors/compressors/base.py +200 -8
  4. compressed_tensors/compressors/dense.py +1 -1
  5. compressed_tensors/compressors/marlin_24.py +11 -10
  6. compressed_tensors/compressors/model_compressor.py +101 -13
  7. compressed_tensors/compressors/naive_quantized.py +140 -0
  8. compressed_tensors/compressors/pack_quantized.py +128 -132
  9. compressed_tensors/compressors/sparse_bitmask.py +1 -1
  10. compressed_tensors/config/base.py +8 -1
  11. compressed_tensors/{compressors/utils → linear}/__init__.py +0 -6
  12. compressed_tensors/linear/compressed_linear.py +87 -0
  13. compressed_tensors/quantization/lifecycle/__init__.py +1 -0
  14. compressed_tensors/quantization/lifecycle/apply.py +204 -44
  15. compressed_tensors/quantization/lifecycle/calibration.py +22 -2
  16. compressed_tensors/quantization/lifecycle/compressed.py +3 -1
  17. compressed_tensors/quantization/lifecycle/forward.py +139 -61
  18. compressed_tensors/quantization/lifecycle/helpers.py +80 -0
  19. compressed_tensors/quantization/lifecycle/initialize.py +77 -13
  20. compressed_tensors/quantization/observers/__init__.py +1 -0
  21. compressed_tensors/quantization/observers/base.py +93 -14
  22. compressed_tensors/quantization/observers/helpers.py +64 -11
  23. compressed_tensors/quantization/observers/min_max.py +8 -0
  24. compressed_tensors/quantization/observers/mse.py +162 -0
  25. compressed_tensors/quantization/quant_args.py +139 -23
  26. compressed_tensors/quantization/quant_config.py +35 -2
  27. compressed_tensors/quantization/quant_scheme.py +112 -13
  28. compressed_tensors/quantization/utils/helpers.py +68 -2
  29. compressed_tensors/utils/__init__.py +5 -0
  30. compressed_tensors/utils/helpers.py +44 -2
  31. compressed_tensors/utils/offload.py +116 -0
  32. compressed_tensors/utils/permute.py +70 -0
  33. compressed_tensors/utils/safetensors_load.py +2 -0
  34. compressed_tensors/{compressors/utils → utils}/semi_structured_conversions.py +1 -0
  35. compressed_tensors/version.py +1 -1
  36. {compressed_tensors-0.4.0.dist-info → compressed_tensors-0.6.0.dist-info}/METADATA +35 -22
  37. compressed_tensors-0.6.0.dist-info/RECORD +52 -0
  38. {compressed_tensors-0.4.0.dist-info → compressed_tensors-0.6.0.dist-info}/WHEEL +1 -1
  39. compressed_tensors/compressors/int_quantized.py +0 -126
  40. compressed_tensors/compressors/utils/helpers.py +0 -43
  41. compressed_tensors-0.4.0.dist-info/RECORD +0 -48
  42. /compressed_tensors/{compressors/utils → utils}/permutations_24.py +0 -0
  43. {compressed_tensors-0.4.0.dist-info → compressed_tensors-0.6.0.dist-info}/LICENSE +0 -0
  44. {compressed_tensors-0.4.0.dist-info → compressed_tensors-0.6.0.dist-info}/top_level.txt +0 -0
compressed_tensors/compressors/pack_quantized.py
@@ -11,10 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-import logging
 import math
-from typing import Dict, Generator, Tuple
+from typing import Dict, Optional, Tuple
 
 import numpy as np
 import torch
@@ -23,15 +21,10 @@ from compressed_tensors.config import CompressionFormat
 from compressed_tensors.quantization import QuantizationArgs
 from compressed_tensors.quantization.lifecycle.forward import dequantize, quantize
 from compressed_tensors.quantization.utils import can_quantize
-from compressed_tensors.utils import get_nested_weight_mappings, merge_names
-from safetensors import safe_open
 from torch import Tensor
-from tqdm import tqdm
-
 
-__all__ = ["PackedQuantizationCompressor", "pack_4bit_ints", "unpack_4bit_ints"]
 
-_LOGGER: logging.Logger = logging.getLogger(__name__)
+__all__ = ["PackedQuantizationCompressor", "pack_to_int32", "unpack_from_int32"]
 
 
 @Compressor.register(name=CompressionFormat.pack_quantized.value)
@@ -44,142 +37,142 @@ class PackedQuantizationCompressor(Compressor):
         "weight_packed",
         "weight_scale",
         "weight_zero_point",
+        "weight_g_idx",
         "weight_shape",
     ]
 
-    def compress(
+    def compression_param_info(
         self,
-        model_state: Dict[str, Tensor],
-        model_quant_args: Dict[str, QuantizationArgs],
-        **kwargs,
-    ) -> Dict[str, Tensor]:
+        weight_shape: torch.Size,
+        quantization_args: Optional[QuantizationArgs] = None,
+    ) -> Dict[str, Tuple[torch.Size, torch.dtype]]:
         """
-        Compresses a dense state dict
+        Creates a dictionary of expected shapes and dtypes for each compression
+        parameter used by the compressor
 
-        :param model_state: state dict of uncompressed model
-        :param model_quant_args: quantization args for each quantized weight, needed for
-            quantize function to calculate bit depth
-        :return: compressed state dict
+        :param weight_shape: uncompressed weight shape
+        :param quantization_args: quantization parameters for the weight
+        :return: dictionary mapping compressed parameter names to shape and dtype
+        """
+        pack_factor = 32 // quantization_args.num_bits
+        packed_size = math.ceil(weight_shape[1] / pack_factor)
+        return {
+            "weight_packed": (torch.Size((weight_shape[0], packed_size)), torch.int32),
+            "weight_shape": (torch.Size((2,)), torch.int32),
+        }
+
+    def compress_weight(
+        self,
+        weight: Tensor,
+        scale: Tensor,
+        zero_point: Optional[Tensor] = None,
+        g_idx: Optional[torch.Tensor] = None,
+        quantization_args: Optional[QuantizationArgs] = None,
+        device: Optional[torch.device] = None,
+    ) -> Dict[str, torch.Tensor]:
+        """
+        Compresses a single uncompressed weight
+
+        :param weight: uncompressed weight tensor
+        :param scale: quantization scale for weight
+        :param zero_point: quantization zero point for weight
+        :param g_idx: optional mapping from column index to group index
+        :param quantization_args: quantization parameters for weight
+        :param device: optional device to move compressed output to
+        :return: dictionary of compressed weight data
         """
         compressed_dict = {}
-        weight_suffix = ".weight"
-        _LOGGER.debug(
-            f"Compressing model with {len(model_state)} parameterized layers..."
-        )
-
-        for name, value in tqdm(model_state.items(), desc="Compressing model"):
-            if name.endswith(weight_suffix):
-                prefix = name[: -(len(weight_suffix))]
-                scale = model_state.get(merge_names(prefix, "weight_scale"), None)
-                zp = model_state.get(merge_names(prefix, "weight_zero_point"), None)
-                shape = torch.tensor(value.shape)
-                if scale is not None and zp is not None:
-                    # weight is quantized, compress it
-                    quant_args = model_quant_args[prefix]
-                    if can_quantize(value, quant_args):
-                        # convert weight to an int if not already compressed
-                        value = quantize(
-                            x=value,
-                            scale=scale,
-                            zero_point=zp,
-                            args=quant_args,
-                            dtype=torch.int8,
-                        )
-                    value = pack_4bit_ints(value.cpu())
-                    compressed_dict[merge_names(prefix, "weight_shape")] = shape
-                    compressed_dict[merge_names(prefix, "weight_packed")] = value
-                    continue
-
-            elif name.endswith("zero_point"):
-                if torch.all(value == 0):
-                    # all zero_points are 0, no need to include in
-                    # compressed state_dict
-                    continue
-
-            compressed_dict[name] = value.to("cpu")
+        if can_quantize(weight, quantization_args):
+            quantized_weight = quantize(
+                x=weight,
+                scale=scale,
+                zero_point=zero_point,
+                g_idx=g_idx,
+                args=quantization_args,
+                dtype=torch.int8,
+            )
+
+        packed_weight = pack_to_int32(quantized_weight, quantization_args.num_bits)
+        weight_shape = torch.tensor(weight.shape)
+        if device is not None:
+            packed_weight = packed_weight.to(device)
+            weight_shape = weight_shape.to(device)
+
+        compressed_dict["weight_shape"] = weight_shape
+        compressed_dict["weight_packed"] = packed_weight
 
         return compressed_dict
 
-    def decompress(
-        self, path_to_model_or_tensors: str, device: str = "cpu"
-    ) -> Generator[Tuple[str, Tensor], None, None]:
+    def decompress_weight(
+        self,
+        compressed_data: Dict[str, Tensor],
+        quantization_args: Optional[QuantizationArgs] = None,
+    ) -> torch.Tensor:
         """
-        Reads a compressed state dict located at path_to_model_or_tensors
-        and returns a generator for sequentially decompressing back to a
-        dense state dict
-
-        :param model_path: path to compressed safetensors model (directory with
-            one or more safetensors files) or compressed tensors file
-        :param device: optional device to load intermediate weights into
-        :return: compressed state dict
+        Decompresses a single compressed weight
+
+        :param compressed_data: dictionary of data needed for decompression
+        :param quantization_args: quantization parameters for the weight
+        :return: tensor of the decompressed weight
         """
-        weight_mappings = get_nested_weight_mappings(
-            path_to_model_or_tensors, self.COMPRESSION_PARAM_NAMES
+        weight = compressed_data["weight_packed"]
+        scale = compressed_data["weight_scale"]
+        zero_point = compressed_data.get("weight_zero_point", None)
+        g_idx = compressed_data.get("weight_g_idx", None)
+        original_shape = torch.Size(compressed_data["weight_shape"])
+        num_bits = quantization_args.num_bits
+        unpacked = unpack_from_int32(weight, num_bits, original_shape)
+        decompressed_weight = dequantize(
+            x_q=unpacked, scale=scale, zero_point=zero_point, g_idx=g_idx
         )
-        for weight_name in weight_mappings.keys():
-            weight_data = {}
-            for param_name, safe_path in weight_mappings[weight_name].items():
-                full_name = merge_names(weight_name, param_name)
-                with safe_open(safe_path, framework="pt", device=device) as f:
-                    weight_data[param_name] = f.get_tensor(full_name)
-
-            if "weight_scale" in weight_data:
-                zero_point = weight_data.get("weight_zero_point", None)
-                scale = weight_data["weight_scale"]
-                if zero_point is None:
-                    # zero_point assumed to be 0 if not included in state_dict
-                    zero_point = torch.zeros_like(scale)
-
-                weight = weight_data["weight_packed"]
-                original_shape = torch.Size(weight_data["weight_shape"])
-                unpacked = unpack_4bit_ints(weight, original_shape)
-                decompressed = dequantize(
-                    x_q=unpacked,
-                    scale=scale,
-                    zero_point=zero_point,
-                )
-                yield merge_names(weight_name, "weight"), decompressed
-
-
-def pack_4bit_ints(value: torch.Tensor) -> torch.Tensor:
+
+        return decompressed_weight
+
+
+def pack_to_int32(value: torch.Tensor, num_bits: int) -> torch.Tensor:
     """
-    Packs a tensor of int4 weights stored in int8 into int32s with padding
+    Packs a tensor of quantized weights stored in int8 into int32s with padding
 
     :param value: tensor to pack
+    :param num_bits: number of bits used to store underlying data
     :returns: packed int32 tensor
     """
     if value.dtype is not torch.int8:
         raise ValueError("Tensor must be quantized to torch.int8 before packing")
 
-    # need to convert to unsigned 8bit to use numpy's pack/unpack
-    temp = (value - 8).to(torch.uint8)
-    bits = np.unpackbits(temp.numpy(), axis=-1, bitorder="little")
-    ranges = np.array([range(x, x + 4) for x in range(0, bits.shape[1], 8)]).flatten()
-    only_4_bits = bits[:, ranges]  # top 4 bits are 0 because we're really uint4
+    if num_bits > 8:
+        raise ValueError("Packing is only supported for less than 8 bits")
 
-    # pad each row to fill a full 32bit int
-    pack_depth = 32
-    padding = (
-        math.ceil(only_4_bits.shape[1] / pack_depth) * pack_depth - only_4_bits.shape[1]
-    )
-    padded_bits = np.pad(
-        only_4_bits, pad_width=[(0, 0), (0, padding)], constant_values=0
-    )
+    # convert to unsigned for packing
+    offset = pow(2, num_bits) // 2
+    value = (value + offset).to(torch.uint8)
+    value = value.cpu().numpy().astype(np.uint32)
+    pack_factor = 32 // num_bits
+
+    # pad input tensor and initialize packed output
+    packed_size = math.ceil(value.shape[1] / pack_factor)
+    packed = np.zeros((value.shape[0], packed_size), dtype=np.uint32)
+    padding = packed.shape[1] * pack_factor - value.shape[1]
+    value = np.pad(value, pad_width=[(0, 0), (0, padding)], constant_values=0)
 
-    # after packbits each uint8 is two packed uint4s
-    # then we keep the bit pattern the same but convert to int32
-    compressed = np.packbits(padded_bits, axis=-1, bitorder="little")
-    compressed = np.ascontiguousarray(compressed).view(np.int32)
+    # pack values
+    for i in range(pack_factor):
+        packed |= value[:, i::pack_factor] << num_bits * i
 
-    return torch.from_numpy(compressed)
+    # convert back to signed and torch
+    packed = np.ascontiguousarray(packed).view(np.int32)
+    return torch.from_numpy(packed)
 
 
-def unpack_4bit_ints(value: torch.Tensor, shape: torch.Size) -> torch.Tensor:
+def unpack_from_int32(
+    value: torch.Tensor, num_bits: int, shape: torch.Size
+) -> torch.Tensor:
     """
-    Unpacks a tensor packed int4 weights into individual int8s, maintaining the
-    original their int4 range
+    Unpacks a tensor of packed int32 weights into individual int8s, maintaining the
+    original their bit range
 
     :param value: tensor to upack
+    :param num_bits: number of bits to unpack each data point into
     :param shape: shape to unpack into, used to remove padding
     :returns: unpacked int8 tensor
     """
@@ -188,25 +181,28 @@ def unpack_4bit_ints(value: torch.Tensor, shape: torch.Size) -> torch.Tensor:
             f"Expected {torch.int32} but got {value.dtype}, Aborting unpack."
         )
 
-    # unpack bits and undo padding to nearest int32 bits
-    individual_depth = 4
-    as_uint8 = value.numpy().view(np.uint8)
-    bits = np.unpackbits(as_uint8, axis=-1, bitorder="little")
-    original_row_size = int(shape[1] * individual_depth)
-    bits = bits[:, :original_row_size]
+    if num_bits > 8:
+        raise ValueError("Unpacking is only supported for less than 8 bits")
 
-    # reformat each packed uint4 to a uint8 by filling to top 4 bits with zeros
-    # (uint8 format is required by np.packbits)
-    shape_8bit = (bits.shape[0], bits.shape[1] * 2)
-    bits_as_8bit = np.zeros(shape_8bit, dtype=np.uint8)
-    ranges = np.array([range(x, x + 4) for x in range(0, shape_8bit[1], 8)]).flatten()
-    bits_as_8bit[:, ranges] = bits
+    pack_factor = 32 // num_bits
+
+    # unpack
+    mask = pow(2, num_bits) - 1
+    unpacked = torch.zeros(
+        (value.shape[0], value.shape[1] * pack_factor),
+        device=value.device,
+        dtype=torch.int32,
+    )
+    for i in range(pack_factor):
+        unpacked[:, i::pack_factor] = (value >> (num_bits * i)) & mask
 
-    # repack the bits to uint8
-    repacked = np.packbits(bits_as_8bit, axis=-1, bitorder="little")
+    # remove padding
+    original_row_size = int(shape[1])
+    unpacked = unpacked[:, :original_row_size]
 
     # bits are packed in unsigned format, reformat to signed
-    # update the value range from uint4 to int4
-    final = repacked.astype(np.int8) - 8
+    # update the value range from unsigned to signed
+    offset = pow(2, num_bits) // 2
+    unpacked = (unpacked - offset).to(torch.int8)
 
-    return torch.from_numpy(final)
+    return unpacked
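
Note on the new helpers: pack_to_int32 generalizes the old int4-only packing to any width up to 8 bits via a shift-and-or over a pack factor of 32 // num_bits, so with num_bits = 4 a row of 11 values is padded to 16 and stored in ceil(11 / 8) = 2 int32 words (the same arithmetic compression_param_info uses to size "weight_packed"). A minimal round-trip sketch, not part of the diff, assuming the helpers are importable from the module above:

import torch
from compressed_tensors.compressors.pack_quantized import pack_to_int32, unpack_from_int32

# fake 4-bit quantized weights stored as int8, values in [-8, 7]
weight_q = torch.randint(-8, 8, (4, 11), dtype=torch.int8)
packed = pack_to_int32(weight_q, num_bits=4)             # shape (4, 2), dtype torch.int32
restored = unpack_from_int32(packed, 4, weight_q.shape)  # padding stripped back to (4, 11)
assert torch.equal(restored, weight_q)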
compressed_tensors/compressors/sparse_bitmask.py
@@ -72,7 +72,7 @@ class BitmaskCompressor(Compressor):
         return compressed_dict
 
     def decompress(
-        self, path_to_model_or_tensors: str, device: str = "cpu"
+        self, path_to_model_or_tensors: str, device: str = "cpu", **kwargs
     ) -> Generator[Tuple[str, Tensor], None, None]:
         """
         Reads a bitmask compressed state dict located
compressed_tensors/config/base.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from enum import Enum
-from typing import Optional
+from typing import List, Optional
 
 from compressed_tensors.registry import RegistryMixin
 from pydantic import BaseModel
@@ -26,6 +26,8 @@ class CompressionFormat(Enum):
     dense = "dense"
     sparse_bitmask = "sparse-bitmask"
     int_quantized = "int-quantized"
+    float_quantized = "float-quantized"
+    naive_quantized = "naive-quantized"
     pack_quantized = "pack-quantized"
     marlin_24 = "marlin-24"
 
@@ -35,11 +37,16 @@ class SparsityCompressionConfig(RegistryMixin, BaseModel):
     Base data class for storing sparsity compression parameters
 
     :param format: name of compression format
+    :param targets: List of layer names or layer types that aren't sparse and should
+        be ignored during compression. By default, assume all layers are targeted
+    :param ignore: List of layer names (unique) to ignore from targets. Defaults to None
     :param global_sparsity: average sparsity of the entire model
     :param sparsity_structure: structure of the sparsity, such as
         "unstructured", "2:4", "8:16" etc
     """
 
     format: str
+    targets: Optional[List[str]] = None
+    ignore: Optional[List[str]] = None
     global_sparsity: Optional[float] = 0.0
     sparsity_structure: Optional[str] = "unstructured"
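
The two new optional fields let a sparsity config carry targets/ignore lists alongside the existing sparsity metadata. A purely illustrative instantiation (the field values below are made up, not taken from the diff):

from compressed_tensors.config.base import SparsityCompressionConfig

config = SparsityCompressionConfig(
    format="sparse-bitmask",
    targets=["Linear"],
    ignore=["lm_head"],
    global_sparsity=0.5,
    sparsity_structure="2:4",
)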
compressed_tensors/{compressors/utils → linear}/__init__.py
@@ -11,9 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-# flake8: noqa
-
-from .helpers import *
-from .permutations_24 import *
-from .semi_structured_conversions import *
compressed_tensors/linear/compressed_linear.py (new file)
@@ -0,0 +1,87 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from compressed_tensors.compressors.base import Compressor
+from compressed_tensors.quantization import (
+    QuantizationScheme,
+    QuantizationStatus,
+    initialize_module_for_quantization,
+)
+from torch import Tensor
+from torch.nn import Parameter
+from torch.nn.functional import linear
+from torch.nn.modules import Linear
+
+
+class CompressedLinear(Linear):
+    """
+    Wrapper module for running a compressed forward pass of a quantized Linear module.
+    The wrapped layer will decompressed on each forward call.
+
+    :param module: dense linear module to replace
+    :param quantization_scheme: quantization config for the module to wrap
+    :param quantization_format: compression format module is stored as
+    """
+
+    @classmethod
+    @torch.no_grad()
+    def from_linear(
+        cls,
+        module: Linear,
+        quantization_scheme: QuantizationScheme,
+        quantization_format: str,
+    ):
+        module.__class__ = CompressedLinear
+        module.compressor = Compressor.load_from_registry(quantization_format)
+        device = next(module.parameters()).device
+
+        # this will initialize all the scales and zero points
+        initialize_module_for_quantization(
+            module, quantization_scheme, force_zero_point=False
+        )
+
+        # get the shape and dtype of compressed parameters
+        compression_params = module.compressor.compression_param_info(
+            module.weight.shape, quantization_scheme.weights
+        )
+
+        # no need for this once quantization is initialized, will be replaced
+        # with the compressed parameter
+        delattr(module, "weight")
+
+        # populate compressed weights and quantization parameters
+        for name, (shape, dtype) in compression_params.items():
+            param = Parameter(
+                torch.empty(shape, device=device, dtype=dtype), requires_grad=False
+            )
+            module.register_parameter(name, param)
+
+        # mark module as compressed
+        module.quantization_status = QuantizationStatus.COMPRESSED
+
+        # handles case where forward is wrapped in new_forward by accelerate hooks
+        if hasattr(module, "_old_forward"):
+            module._old_forward = CompressedLinear.forward.__get__(
+                module, CompressedLinear
+            )
+
+        return module
+
+    def forward(self, input: Tensor) -> Tensor:
+        """
+        Decompresses the weight, then runs the wrapped forward pass
+        """
+        uncompressed_weight = self.compressor.decompress_module(self)
+        return linear(input, uncompressed_weight, self.bias)
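
Because from_linear reassigns __class__ on the existing module, it can be applied in place while walking an already-loaded model. A hedged sketch under assumptions (the per-layer scheme lookup is a hypothetical helper, and "pack-quantized" is just one of the registered format names from the config changes above):

import torch
from compressed_tensors.linear.compressed_linear import CompressedLinear

for name, submodule in model.named_modules():
    if type(submodule) is torch.nn.Linear:
        CompressedLinear.from_linear(
            submodule,
            quantization_scheme=scheme_for(name),  # hypothetical: scheme taken from the model's quantization config
            quantization_format="pack-quantized",
        )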
compressed_tensors/quantization/lifecycle/__init__.py
@@ -21,3 +21,4 @@ from .frozen import *
 from .initialize import *
 from .compressed import *
 from .apply import *
+from .helpers import *