compressed-tensors-nightly 0.5.0.20240814__py3-none-any.whl → 0.5.0.20240830__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. compressed_tensors/compressors/base.py +198 -8
  2. compressed_tensors/compressors/model_compressor.py +65 -1
  3. compressed_tensors/compressors/naive_quantized.py +71 -75
  4. compressed_tensors/compressors/pack_quantized.py +83 -94
  5. compressed_tensors/linear/__init__.py +13 -0
  6. compressed_tensors/linear/compressed_linear.py +87 -0
  7. compressed_tensors/quantization/lifecycle/apply.py +36 -4
  8. compressed_tensors/quantization/lifecycle/calibration.py +3 -2
  9. compressed_tensors/quantization/lifecycle/compressed.py +1 -1
  10. compressed_tensors/quantization/lifecycle/forward.py +67 -43
  11. compressed_tensors/quantization/lifecycle/helpers.py +29 -2
  12. compressed_tensors/quantization/lifecycle/initialize.py +50 -16
  13. compressed_tensors/quantization/observers/__init__.py +1 -0
  14. compressed_tensors/quantization/observers/base.py +54 -14
  15. compressed_tensors/quantization/observers/min_max.py +8 -0
  16. compressed_tensors/quantization/observers/mse.py +162 -0
  17. compressed_tensors/quantization/quant_args.py +48 -20
  18. compressed_tensors/utils/__init__.py +1 -0
  19. compressed_tensors/utils/helpers.py +13 -0
  20. compressed_tensors/utils/offload.py +7 -1
  21. compressed_tensors/utils/permute.py +70 -0
  22. compressed_tensors/utils/safetensors_load.py +2 -0
  23. compressed_tensors/utils/semi_structured_conversions.py +1 -0
  24. {compressed_tensors_nightly-0.5.0.20240814.dist-info → compressed_tensors_nightly-0.5.0.20240830.dist-info}/METADATA +3 -2
  25. compressed_tensors_nightly-0.5.0.20240830.dist-info/RECORD +52 -0
  26. compressed_tensors_nightly-0.5.0.20240814.dist-info/RECORD +0 -48
  27. {compressed_tensors_nightly-0.5.0.20240814.dist-info → compressed_tensors_nightly-0.5.0.20240830.dist-info}/LICENSE +0 -0
  28. {compressed_tensors_nightly-0.5.0.20240814.dist-info → compressed_tensors_nightly-0.5.0.20240830.dist-info}/WHEEL +0 -0
  29. {compressed_tensors_nightly-0.5.0.20240814.dist-info → compressed_tensors_nightly-0.5.0.20240830.dist-info}/top_level.txt +0 -0
compressed_tensors/compressors/pack_quantized.py
@@ -11,10 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-import logging
 import math
-from typing import Dict, Generator, Tuple
+from typing import Dict, Optional, Tuple
 
 import numpy as np
 import torch
@@ -23,16 +21,11 @@ from compressed_tensors.config import CompressionFormat
 from compressed_tensors.quantization import QuantizationArgs
 from compressed_tensors.quantization.lifecycle.forward import dequantize, quantize
 from compressed_tensors.quantization.utils import can_quantize
-from compressed_tensors.utils import get_nested_weight_mappings, merge_names
-from safetensors import safe_open
 from torch import Tensor
-from tqdm import tqdm
 
 
 __all__ = ["PackedQuantizationCompressor", "pack_to_int32", "unpack_from_int32"]
 
-_LOGGER: logging.Logger = logging.getLogger(__name__)
-
 
 @Compressor.register(name=CompressionFormat.pack_quantized.value)
 class PackedQuantizationCompressor(Compressor):
@@ -44,102 +37,96 @@ class PackedQuantizationCompressor(Compressor):
         "weight_packed",
         "weight_scale",
         "weight_zero_point",
+        "weight_g_idx",
         "weight_shape",
     ]
 
-    def compress(
+    def compression_param_info(
         self,
-        model_state: Dict[str, Tensor],
-        names_to_scheme: Dict[str, QuantizationArgs],
-        **kwargs,
-    ) -> Dict[str, Tensor]:
+        weight_shape: torch.Size,
+        quantization_args: Optional[QuantizationArgs] = None,
+    ) -> Dict[str, Tuple[torch.Size, torch.dtype]]:
         """
-        Compresses a dense state dict
+        Creates a dictionary of expected shapes and dtypes for each compression
+        parameter used by the compressor
 
-        :param model_state: state dict of uncompressed model
-        :param names_to_scheme: quantization args for each quantized weight, needed for
-            quantize function to calculate bit depth
-        :return: compressed state dict
+        :param weight_shape: uncompressed weight shape
+        :param quantization_args: quantization parameters for the weight
+        :return: dictionary mapping compressed parameter names to shape and dtype
+        """
+        pack_factor = 32 // quantization_args.num_bits
+        packed_size = math.ceil(weight_shape[1] / pack_factor)
+        return {
+            "weight_packed": (torch.Size((weight_shape[0], packed_size)), torch.int32),
+            "weight_shape": (torch.Size((2,)), torch.int32),
+        }
+
+    def compress_weight(
+        self,
+        weight: Tensor,
+        scale: Tensor,
+        zero_point: Optional[Tensor] = None,
+        g_idx: Optional[torch.Tensor] = None,
+        quantization_args: Optional[QuantizationArgs] = None,
+        device: Optional[torch.device] = None,
+    ) -> Dict[str, torch.Tensor]:
+        """
+        Compresses a single uncompressed weight
+
+        :param weight: uncompressed weight tensor
+        :param scale: quantization scale for weight
+        :param zero_point: quantization zero point for weight
+        :param g_idx: optional mapping from column index to group index
+        :param quantization_args: quantization parameters for weight
+        :param device: optional device to move compressed output to
+        :return: dictionary of compressed weight data
         """
         compressed_dict = {}
-        weight_suffix = ".weight"
-        _LOGGER.debug(
-            f"Compressing model with {len(model_state)} parameterized layers..."
-        )
-
-        for name, value in tqdm(model_state.items(), desc="Compressing model"):
-            if name.endswith(weight_suffix):
-                prefix = name[: -(len(weight_suffix))]
-                scale = model_state.get(merge_names(prefix, "weight_scale"), None)
-                zp = model_state.get(merge_names(prefix, "weight_zero_point"), None)
-                shape = torch.tensor(value.shape)
-                if scale is not None and zp is not None:
-                    # weight is quantized, compress it
-                    quant_args = names_to_scheme[prefix]
-                    if can_quantize(value, quant_args):
-                        # convert weight to an int if not already compressed
-                        value = quantize(
-                            x=value,
-                            scale=scale,
-                            zero_point=zp,
-                            args=quant_args,
-                            dtype=torch.int8,
-                        )
-                    value = pack_to_int32(value.cpu(), quant_args.num_bits)
-                    compressed_dict[merge_names(prefix, "weight_shape")] = shape
-                    compressed_dict[merge_names(prefix, "weight_packed")] = value
-                    continue
-
-            elif name.endswith("zero_point"):
-                if torch.all(value == 0):
-                    # all zero_points are 0, no need to include in
-                    # compressed state_dict
-                    continue
-
-            compressed_dict[name] = value.to("cpu")
+        if can_quantize(weight, quantization_args):
+            quantized_weight = quantize(
+                x=weight,
+                scale=scale,
+                zero_point=zero_point,
+                g_idx=g_idx,
+                args=quantization_args,
+                dtype=torch.int8,
+            )
+
+        packed_weight = pack_to_int32(quantized_weight, quantization_args.num_bits)
+        weight_shape = torch.tensor(weight.shape)
+        if device is not None:
+            packed_weight = packed_weight.to(device)
+            weight_shape = weight_shape.to(device)
+
+        compressed_dict["weight_shape"] = weight_shape
+        compressed_dict["weight_packed"] = packed_weight
 
         return compressed_dict
 
-    def decompress(
+    def decompress_weight(
         self,
-        path_to_model_or_tensors: str,
-        names_to_scheme: Dict[str, QuantizationArgs],
-        device: str = "cpu",
-    ) -> Generator[Tuple[str, Tensor], None, None]:
+        compressed_data: Dict[str, Tensor],
+        quantization_args: Optional[QuantizationArgs] = None,
+    ) -> torch.Tensor:
         """
-        Reads a compressed state dict located at path_to_model_or_tensors
-        and returns a generator for sequentially decompressing back to a
-        dense state dict
-
-        :param model_path: path to compressed safetensors model (directory with
-            one or more safetensors files) or compressed tensors file
-        :param device: optional device to load intermediate weights into
-        :return: compressed state dict
+        Decompresses a single compressed weight
+
+        :param compressed_data: dictionary of data needed for decompression
+        :param quantization_args: quantization parameters for the weight
+        :return: tensor of the decompressed weight
         """
-        weight_mappings = get_nested_weight_mappings(
-            path_to_model_or_tensors, self.COMPRESSION_PARAM_NAMES
+        weight = compressed_data["weight_packed"]
+        scale = compressed_data["weight_scale"]
+        zero_point = compressed_data.get("weight_zero_point", None)
+        g_idx = compressed_data.get("weight_g_idx", None)
+        original_shape = torch.Size(compressed_data["weight_shape"])
+        num_bits = quantization_args.num_bits
+        unpacked = unpack_from_int32(weight, num_bits, original_shape)
+        decompressed_weight = dequantize(
+            x_q=unpacked, scale=scale, zero_point=zero_point, g_idx=g_idx
         )
-        for weight_name in weight_mappings.keys():
-            weight_data = {}
-            for param_name, safe_path in weight_mappings[weight_name].items():
-                weight_data["num_bits"] = names_to_scheme.get(weight_name).num_bits
-                full_name = merge_names(weight_name, param_name)
-                with safe_open(safe_path, framework="pt", device=device) as f:
-                    weight_data[param_name] = f.get_tensor(full_name)
-
-            if "weight_scale" in weight_data:
-                zero_point = weight_data.get("weight_zero_point", None)
-                scale = weight_data["weight_scale"]
-                weight = weight_data["weight_packed"]
-                num_bits = weight_data["num_bits"]
-                original_shape = torch.Size(weight_data["weight_shape"])
-                unpacked = unpack_from_int32(weight, num_bits, original_shape)
-                decompressed = dequantize(
-                    x_q=unpacked,
-                    scale=scale,
-                    zero_point=zero_point,
-                )
-                yield merge_names(weight_name, "weight"), decompressed
+
+        return decompressed_weight
 
 
 def pack_to_int32(value: torch.Tensor, num_bits: int) -> torch.Tensor:
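As a concrete reading of compression_param_info (numbers are illustrative): with num_bits=4 the pack factor is 32 // 4 = 8, so a (4096, 11008) weight maps to a weight_packed tensor of shape (4096, ceil(11008 / 8)) = (4096, 1376) with dtype torch.int32, plus a two-element int32 weight_shape tensor recording the original dimensions.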
@@ -197,13 +184,15 @@ def unpack_from_int32(
     if num_bits > 8:
         raise ValueError("Unpacking is only supported for less than 8 bits")
 
-    # convert packed input to unsigned numpy
-    value = value.numpy().view(np.uint32)
     pack_factor = 32 // num_bits
 
     # unpack
     mask = pow(2, num_bits) - 1
-    unpacked = np.zeros((value.shape[0], value.shape[1] * pack_factor))
+    unpacked = torch.zeros(
+        (value.shape[0], value.shape[1] * pack_factor),
+        device=value.device,
+        dtype=torch.int32,
+    )
     for i in range(pack_factor):
         unpacked[:, i::pack_factor] = (value >> (num_bits * i)) & mask
 
@@ -214,6 +203,6 @@ def unpack_from_int32(
     # bits are packed in unsigned format, reformat to signed
     # update the value range from unsigned to signed
     offset = pow(2, num_bits) // 2
-    unpacked = (unpacked.astype(np.int16) - offset).astype(np.int8)
+    unpacked = (unpacked - offset).to(torch.int8)
 
-    return torch.from_numpy(unpacked)
+    return unpacked
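Note: pack_to_int32 and unpack_from_int32 store 32 // num_bits quantized values per 32-bit word by offsetting the signed values into unsigned range and writing them to strided bit positions; the rewritten unpack path keeps the loop in torch so the data can stay on the tensor's device instead of round-tripping through numpy. A minimal, self-contained round trip of the same bit layout in NumPy (the helper names pack_rows/unpack_rows and the 4-bit toy data are illustrative only, not part of the package API):

import numpy as np

def pack_rows(values: np.ndarray, num_bits: int = 4) -> np.ndarray:
    # values: signed num_bits integers (int8) of shape (rows, cols)
    pack_factor = 32 // num_bits                 # e.g. 8 nibbles per 32-bit word
    offset = 1 << (num_bits - 1)                 # shift signed range to unsigned
    unsigned = (values.astype(np.int32) + offset).astype(np.uint32)
    rows, cols = unsigned.shape
    assert cols % pack_factor == 0, "pad columns to a multiple of pack_factor first"
    packed = np.zeros((rows, cols // pack_factor), dtype=np.uint32)
    for i in range(pack_factor):
        packed |= unsigned[:, i::pack_factor] << np.uint32(num_bits * i)
    return packed

def unpack_rows(packed: np.ndarray, num_bits: int, original_cols: int) -> np.ndarray:
    # inverse of pack_rows: mask out each bit field, then shift back to signed
    pack_factor = 32 // num_bits
    mask = (1 << num_bits) - 1
    offset = 1 << (num_bits - 1)
    unpacked = np.zeros((packed.shape[0], packed.shape[1] * pack_factor), dtype=np.int32)
    for i in range(pack_factor):
        unpacked[:, i::pack_factor] = (packed >> np.uint32(num_bits * i)) & mask
    return (unpacked[:, :original_cols] - offset).astype(np.int8)

q = np.random.randint(-8, 8, size=(2, 16), dtype=np.int8)  # fake 4-bit quantized weight
assert np.array_equal(unpack_rows(pack_rows(q, 4), 4, q.shape[1]), q)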
compressed_tensors/linear/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
compressed_tensors/linear/compressed_linear.py
@@ -0,0 +1,87 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from compressed_tensors.compressors.base import Compressor
+from compressed_tensors.quantization import (
+    QuantizationScheme,
+    QuantizationStatus,
+    initialize_module_for_quantization,
+)
+from torch import Tensor
+from torch.nn import Parameter
+from torch.nn.functional import linear
+from torch.nn.modules import Linear
+
+
+class CompressedLinear(Linear):
+    """
+    Wrapper module for running a compressed forward pass of a quantized Linear module.
+    The wrapped layer will be decompressed on each forward call.
+
+    :param module: dense linear module to replace
+    :param quantization_scheme: quantization config for the module to wrap
+    :param quantization_format: compression format module is stored as
+    """
+
+    @classmethod
+    @torch.no_grad()
+    def from_linear(
+        cls,
+        module: Linear,
+        quantization_scheme: QuantizationScheme,
+        quantization_format: str,
+    ):
+        module.__class__ = CompressedLinear
+        module.compressor = Compressor.load_from_registry(quantization_format)
+        device = next(module.parameters()).device
+
+        # this will initialize all the scales and zero points
+        initialize_module_for_quantization(
+            module, quantization_scheme, force_zero_point=False
+        )
+
+        # get the shape and dtype of compressed parameters
+        compression_params = module.compressor.compression_param_info(
+            module.weight.shape, quantization_scheme.weights
+        )
+
+        # no need for this once quantization is initialized, will be replaced
+        # with the compressed parameter
+        delattr(module, "weight")
+
+        # populate compressed weights and quantization parameters
+        for name, (shape, dtype) in compression_params.items():
+            param = Parameter(
+                torch.empty(shape, device=device, dtype=dtype), requires_grad=False
+            )
+            module.register_parameter(name, param)
+
+        # mark module as compressed
+        module.quantization_status = QuantizationStatus.COMPRESSED
+
+        # handles case where forward is wrapped in new_forward by accelerate hooks
+        if hasattr(module, "_old_forward"):
+            module._old_forward = CompressedLinear.forward.__get__(
+                module, CompressedLinear
+            )
+
+        return module
+
+    def forward(self, input: Tensor) -> Tensor:
+        """
+        Decompresses the weight, then runs the wrapped forward pass
+        """
+        uncompressed_weight = self.compressor.decompress_module(self)
+        return linear(input, uncompressed_weight, self.bias)
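For orientation, a rough usage sketch of the new CompressedLinear.from_linear conversion. The hand-built 4-bit scheme, the layer sizes, and the literal "pack-quantized" format string are illustrative assumptions; in practice the scheme and format come from the checkpoint's QuantizationConfig, and the conversion is driven by apply_quantization_config (see the apply.py changes below) rather than called by hand:

import torch
from compressed_tensors.linear.compressed_linear import CompressedLinear
from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme

# illustrative 4-bit weight-only scheme targeting Linear layers
scheme = QuantizationScheme(
    targets=["Linear"],
    weights=QuantizationArgs(num_bits=4, symmetric=True),
)

layer = torch.nn.Linear(512, 256)
compressed = CompressedLinear.from_linear(
    layer,
    quantization_scheme=scheme,
    quantization_format="pack-quantized",
)

# the module now exposes packed/quantization parameters in place of `weight`
# and decompresses them on every forward call
print([name for name, _ in compressed.named_parameters()])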
compressed_tensors/quantization/lifecycle/apply.py
@@ -21,6 +21,7 @@ from typing import OrderedDict as OrderedDictType
 from typing import Union
 
 import torch
+from compressed_tensors.config import CompressionFormat
 from compressed_tensors.quantization.lifecycle.calibration import (
     set_module_for_calibration,
 )
@@ -43,7 +44,7 @@ from compressed_tensors.quantization.utils import (
     is_kv_cache_quant_scheme,
     iter_named_leaf_modules,
 )
-from compressed_tensors.utils.helpers import fix_fsdp_module_name
+from compressed_tensors.utils.helpers import fix_fsdp_module_name, replace_module
 from compressed_tensors.utils.offload import update_parameter_data
 from compressed_tensors.utils.safetensors_load import get_safetensors_folder
 from torch.nn import Module
@@ -104,12 +105,16 @@ def load_pretrained_quantization(model: Module, model_name_or_path: str):
         )
 
 
-def apply_quantization_config(model: Module, config: QuantizationConfig) -> Dict:
+def apply_quantization_config(
+    model: Module, config: QuantizationConfig, run_compressed: bool = False
+) -> Dict:
     """
     Initializes the model for quantization in-place based on the given config
 
     :param model: model to apply quantization config to
     :param config: quantization config
+    :param run_compressed: Whether the model will be run in compressed mode or
+        decompressed fully on load
     """
     # remove reference to the original `config`
     # argument. This function can mutate it, and we'd
@@ -124,6 +129,9 @@ def apply_quantization_config(model: Module, config: QuantizationConfig) -> Dict
         for target in scheme.targets:
             target_to_scheme[target] = scheme
 
+    if run_compressed:
+        from compressed_tensors.linear.compressed_linear import CompressedLinear
+
     # list of submodules to ignore
     ignored_submodules = defaultdict(list)
     # mark appropriate layers for quantization by setting their quantization schemes
@@ -136,10 +144,24 @@ def apply_quantization_config(model: Module, config: QuantizationConfig) -> Dict
             continue  # layer matches ignore list, continue
         targets = find_name_or_class_matches(name, submodule, target_to_scheme)
         if targets:
+            scheme = _scheme_from_targets(target_to_scheme, targets, name)
+            if run_compressed:
+                format = config.format
+                if format != CompressionFormat.dense.value:
+                    if isinstance(submodule, torch.nn.Linear):
+                        # TODO: expand to more module types
+                        compressed_linear = CompressedLinear.from_linear(
+                            submodule,
+                            quantization_scheme=scheme,
+                            quantization_format=format,
+                        )
+                        replace_module(model, name, compressed_linear)
+
             # target matched - add layer and scheme to target list
             submodule.quantization_scheme = _scheme_from_targets(
                 target_to_scheme, targets, name
             )
+
            names_to_scheme[name] = submodule.quantization_scheme.weights
 
     if config.ignore is not None and ignored_submodules is not None:
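Taken with the CompressedLinear module above, the new run_compressed flag keeps matching Linear layers packed in memory instead of decompressing the whole model at load time. A minimal sketch, assuming a toy two-layer model and a hand-written 4-bit weight-only config (a real config is read from the checkpoint's quantization metadata rather than built from literals like these):

import torch
from compressed_tensors.quantization import (
    QuantizationArgs,
    QuantizationConfig,
    QuantizationScheme,
    apply_quantization_config,
)

model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.Linear(64, 16))

config = QuantizationConfig(
    config_groups={
        "group_0": QuantizationScheme(
            targets=["Linear"],
            weights=QuantizationArgs(num_bits=4, symmetric=True),
        )
    },
    format="pack-quantized",
    quantization_status="compressed",
)

# run_compressed=True swaps each matching torch.nn.Linear for CompressedLinear;
# run_compressed=False (the default) initializes ordinary quantization parameters
# for a fully decompressed load
names_to_scheme = apply_quantization_config(model, config, run_compressed=True)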
@@ -149,8 +171,8 @@ def apply_quantization_config(model: Module, config: QuantizationConfig) -> Dict
                 "not found in the model: "
                 f"{set(config.ignore) - set(ignored_submodules)}"
             )
-    # apply current quantization status across all targeted layers
 
+    # apply current quantization status across all targeted layers
     apply_quantization_status(model, config.quantization_status)
     return names_to_scheme
 
@@ -198,7 +220,12 @@ def apply_quantization_status(model: Module, status: QuantizationStatus):
     current_status = infer_quantization_status(model)
 
     if status >= QuantizationStatus.INITIALIZED > current_status:
-        model.apply(initialize_module_for_quantization)
+        force_zero_point_init = status != QuantizationStatus.COMPRESSED
+        model.apply(
+            lambda module: initialize_module_for_quantization(
+                module, force_zero_point=force_zero_point_init
+            )
+        )
 
     if current_status < status >= QuantizationStatus.CALIBRATION > current_status:
         # only quantize weights up front when our end goal state is calibration,
@@ -279,9 +306,11 @@ def _load_quant_args_from_state_dict(
     """
     scale_name = f"{base_name}_scale"
     zp_name = f"{base_name}_zero_point"
+    g_idx_name = f"{base_name}_g_idx"
 
     state_dict_scale = state_dict.get(f"{module_name}.{scale_name}", None)
     state_dict_zp = state_dict.get(f"{module_name}.{zp_name}", None)
+    state_dict_g_idx = state_dict.get(f"{module_name}.{g_idx_name}", None)
 
     if state_dict_scale is not None:
         # module is quantized
@@ -291,6 +320,9 @@ def _load_quant_args_from_state_dict(
             state_dict_zp = torch.zeros_like(state_dict_scale, device="cpu")
         update_parameter_data(module, state_dict_zp, zp_name)
 
+    if state_dict_g_idx is not None:
+        update_parameter_data(module, state_dict_g_idx, g_idx_name)
+
 
 def _scheme_from_targets(
compressed_tensors/quantization/lifecycle/calibration.py
@@ -44,7 +44,7 @@ def set_module_for_calibration(module: Module, quantize_weights_upfront: bool =
         return
     status = getattr(module, "quantization_status", None)
     if not status or status != QuantizationStatus.INITIALIZED:
-        raise _LOGGER.warning(
+        _LOGGER.warning(
            f"Attempting set module with status {status} to calibration mode. "
            f"but status is not {QuantizationStatus.INITIALIZED} - you may "
            "be calibrating an uninitialized module which may fail or attempting "
@@ -54,13 +54,14 @@ def set_module_for_calibration(module: Module, quantize_weights_upfront: bool =
     if quantize_weights_upfront and module.quantization_scheme.weights is not None:
         # set weight scale and zero_point up front, calibration data doesn't affect it
         observer = module.weight_observer
+        g_idx = getattr(module, "weight_g_idx", None)
 
         offloaded = False
         if is_module_offloaded(module):
             module._hf_hook.pre_forward(module)
             offloaded = True
 
-        scale, zero_point = observer(module.weight)
+        scale, zero_point = observer(module.weight, g_idx=g_idx)
         update_parameter_data(module, scale, "weight_scale")
         update_parameter_data(module, zero_point, "weight_zero_point")
 
compressed_tensors/quantization/lifecycle/compressed.py
@@ -50,7 +50,7 @@ def compress_quantized_weights(module: Module):
     scale = getattr(module, "weight_scale", None)
     zero_point = getattr(module, "weight_zero_point", None)
 
-    if weight is None or scale is None or zero_point is None:
+    if weight is None or scale is None:
         # no weight, scale, or ZP, nothing to do
 
         # mark as compressed here to maintain consistent status throughout the model