compressed-tensors 0.3.3__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff reflects the changes between two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- compressed_tensors/base.py +3 -1
- compressed_tensors/compressors/__init__.py +9 -1
- compressed_tensors/compressors/base.py +12 -55
- compressed_tensors/compressors/dense.py +5 -5
- compressed_tensors/compressors/helpers.py +12 -12
- compressed_tensors/compressors/marlin_24.py +251 -0
- compressed_tensors/compressors/model_compressor.py +336 -0
- compressed_tensors/compressors/naive_quantized.py +144 -0
- compressed_tensors/compressors/pack_quantized.py +219 -0
- compressed_tensors/compressors/sparse_bitmask.py +4 -4
- compressed_tensors/config/base.py +9 -4
- compressed_tensors/config/dense.py +4 -4
- compressed_tensors/config/sparse_bitmask.py +3 -3
- compressed_tensors/quantization/lifecycle/__init__.py +2 -0
- compressed_tensors/quantization/lifecycle/apply.py +204 -31
- compressed_tensors/quantization/lifecycle/calibration.py +20 -1
- compressed_tensors/quantization/lifecycle/compressed.py +69 -0
- compressed_tensors/quantization/lifecycle/forward.py +214 -62
- compressed_tensors/quantization/lifecycle/frozen.py +4 -0
- compressed_tensors/quantization/lifecycle/helpers.py +53 -0
- compressed_tensors/quantization/lifecycle/initialize.py +62 -5
- compressed_tensors/quantization/observers/base.py +66 -23
- compressed_tensors/quantization/observers/helpers.py +69 -11
- compressed_tensors/quantization/observers/memoryless.py +17 -9
- compressed_tensors/quantization/observers/min_max.py +44 -13
- compressed_tensors/quantization/quant_args.py +47 -3
- compressed_tensors/quantization/quant_config.py +104 -23
- compressed_tensors/quantization/quant_scheme.py +183 -2
- compressed_tensors/quantization/utils/helpers.py +142 -8
- compressed_tensors/utils/__init__.py +4 -0
- compressed_tensors/utils/helpers.py +54 -7
- compressed_tensors/utils/offload.py +104 -0
- compressed_tensors/utils/permutations_24.py +65 -0
- compressed_tensors/utils/safetensors_load.py +3 -2
- compressed_tensors/utils/semi_structured_conversions.py +341 -0
- compressed_tensors/version.py +53 -0
- {compressed_tensors-0.3.3.dist-info → compressed_tensors-0.5.0.dist-info}/METADATA +47 -8
- compressed_tensors-0.5.0.dist-info/RECORD +48 -0
- {compressed_tensors-0.3.3.dist-info → compressed_tensors-0.5.0.dist-info}/WHEEL +1 -1
- compressed_tensors-0.3.3.dist-info/RECORD +0 -38
- {compressed_tensors-0.3.3.dist-info → compressed_tensors-0.5.0.dist-info}/LICENSE +0 -0
- {compressed_tensors-0.3.3.dist-info → compressed_tensors-0.5.0.dist-info}/top_level.txt +0 -0
compressed_tensors/quantization/lifecycle/forward.py:

```diff
@@ -14,18 +14,28 @@
 
 from functools import wraps
 from math import ceil
+from typing import Optional
 
 import torch
+from compressed_tensors.quantization.observers.helpers import calculate_range
 from compressed_tensors.quantization.quant_args import (
     QuantizationArgs,
     QuantizationStrategy,
+    round_to_quantized_type,
 )
 from compressed_tensors.quantization.quant_config import QuantizationStatus
 from compressed_tensors.quantization.quant_scheme import QuantizationScheme
+from compressed_tensors.utils import update_parameter_data
 from torch.nn import Module
 
 
-__all__ = [
+__all__ = [
+    "quantize",
+    "dequantize",
+    "fake_quantize",
+    "wrap_module_forward_quantized",
+    "maybe_calibrate_or_quantize",
+]
 
 
 @torch.no_grad()
@@ -33,14 +43,39 @@ def quantize(
     x: torch.Tensor,
     scale: torch.Tensor,
     zero_point: torch.Tensor,
-… (old lines 36-37 not captured)
+    args: QuantizationArgs,
+    dtype: Optional[torch.dtype] = None,
 ) -> torch.Tensor:
+    """
+    Quantize the input tensor x using the QuantizationStrategy specified in args.
+    Quantization can be done per tensor, channel, token or group. For group
+    quantization, the group_size must be divisible by the column size. The input scale
+    and zero_points are reshaped to support vectorization (Assumes 1 is the
+    channel dimension)
 
-… (old lines 40-43 not captured)
+    :param x: Input tensor
+    :param scale: scale tensor
+    :param zero_point: zero point tensor
+    :param args: quantization args dictating how to quantize x
+    :param dtype: optional dtype to cast the quantized output to
+    :return: fake quantized tensor
+    """
+    # ensure all tensors are on the same device
+    # assumes that the target device is the input
+    # tensor's device
+    if x.device != scale.device:
+        scale = scale.to(x.device)
+    if x.device != zero_point.device:
+        zero_point = zero_point.to(x.device)
+
+    return _process_quantization(
+        x=x,
+        scale=scale,
+        zero_point=zero_point,
+        args=args,
+        dtype=dtype,
+        do_quantize=True,
+        do_dequantize=False,
     )
 
 
```
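For orientation, a minimal sketch (not part of the diff) of how the new `quantize` signature might be called once 0.5.0 is installed. The tensor values, scale, and zero point below are invented, and `QuantizationArgs` is left at its defaults apart from the strategy:

```python
import torch

from compressed_tensors.quantization.lifecycle.forward import quantize
from compressed_tensors.quantization.quant_args import (
    QuantizationArgs,
    QuantizationStrategy,
)

# hand-picked example values; any 2D float tensor works the same way
x = torch.tensor([[0.10, -0.25, 0.40, 0.05]])
scale = torch.tensor(0.02)
zero_point = torch.tensor(0)

# per-tensor quantization; the other QuantizationArgs fields keep their defaults
args = QuantizationArgs(strategy=QuantizationStrategy.TENSOR)

# routes through _process_quantization with do_quantize=True, do_dequantize=False
x_q = quantize(x, scale, zero_point, args, dtype=torch.int8)
```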
compressed_tensors/quantization/lifecycle/forward.py (continued):

```diff
@@ -48,9 +83,50 @@ def quantize(
 def dequantize(
     x_q: torch.Tensor,
     scale: torch.Tensor,
-    zero_point: torch.Tensor,
+    zero_point: torch.Tensor = None,
+    args: QuantizationArgs = None,
+    dtype: Optional[torch.dtype] = None,
 ) -> torch.Tensor:
-… (old line 53 not captured)
+    """
+    Dequantize a quantized input tensor x_q based on the strategy specified in args. If
+    args is not provided, the strategy will be inferred.
+
+    :param x: quantized input tensor
+    :param scale: scale tensor
+    :param zero_point: zero point tensor
+    :param args: quantization args used to quantize x_q
+    :param dtype: optional dtype to cast the dequantized output to
+    :return: dequantized float tensor
+    """
+    if args is None:
+        if scale.ndim == 0 or scale.ndim == 1:
+            args = QuantizationArgs(strategy=QuantizationStrategy.TENSOR)
+        elif scale.ndim == 2:
+            if scale.shape[1] == 1:
+                args = QuantizationArgs(strategy=QuantizationStrategy.CHANNEL)
+            else:
+                group_size = int(x_q.shape[1] / scale.shape[1])
+                args = QuantizationArgs(
+                    strategy=QuantizationStrategy.GROUP, group_size=group_size
+                )
+        else:
+            raise ValueError(
+                f"Could not infer a quantization strategy from scale with {scale.ndim} "
+                "dimmensions. Expected 0 or 2 dimmensions."
+            )
+
+    if dtype is None:
+        dtype = scale.dtype
+
+    return _process_quantization(
+        x=x_q,
+        scale=scale,
+        zero_point=zero_point,
+        args=args,
+        do_quantize=False,
+        do_dequantize=True,
+        dtype=dtype,
+    )
 
 
 @torch.no_grad()
```
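The strategy inference in `dequantize` keys entirely off the shape of `scale`. A rough illustration of the three recognized cases, with shapes chosen arbitrarily for a quantized tensor `x_q` of shape `(512, 4096)`:

```python
import torch

scale_tensor = torch.tensor(0.02)    # 0-dim    -> QuantizationStrategy.TENSOR
scale_channel = torch.rand(512, 1)   # (512, 1)  -> QuantizationStrategy.CHANNEL
scale_group = torch.rand(512, 32)    # (512, 32) -> QuantizationStrategy.GROUP,
                                     # group_size inferred as 4096 // 32 == 128
```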
compressed_tensors/quantization/lifecycle/forward.py (continued):

```diff
@@ -61,30 +137,45 @@ def fake_quantize(
     args: QuantizationArgs,
 ) -> torch.Tensor:
     """
-    Fake quantize the input tensor x
-… (old lines 65-68 not captured)
-    the channel dimension)
+    Fake quantize the input tensor x by quantizing then dequantizing with
+    the QuantizationStrategy specified in args. Quantization can be done per tensor,
+    channel, token or group. For group quantization, the group_size must be divisible
+    by the column size. The input scale and zero_points are reshaped to support
+    vectorization (Assumes 1 is the channel dimension)
 
     :param x: Input tensor
     :param scale: scale tensor
     :param zero_point: zero point tensor
-    :param args: quantization args
+    :param args: quantization args dictating how to quantize x
     :return: fake quantized tensor
-… (old line 76 not captured)
     """
-… (old lines 78-80 not captured)
+    return _process_quantization(
+        x=x,
+        scale=scale,
+        zero_point=zero_point,
+        args=args,
+        do_quantize=True,
+        do_dequantize=True,
+    )
+
+
+@torch.no_grad()
+def _process_quantization(
+    x: torch.Tensor,
+    scale: torch.Tensor,
+    zero_point: torch.Tensor,
+    args: QuantizationArgs,
+    dtype: Optional[torch.dtype] = None,
+    do_quantize: bool = True,
+    do_dequantize: bool = True,
+) -> torch.Tensor:
 
+    q_min, q_max = calculate_range(args, x.device)
     group_size = args.group_size
 
-    # group
     if args.strategy == QuantizationStrategy.GROUP:
-… (old lines 86-87 not captured)
+        output_dtype = dtype if dtype is not None else x.dtype
+        output = torch.zeros_like(x).to(output_dtype)
 
         # TODO: vectorize the for loop
         # TODO: fix genetric assumption about the tensor size for computing group
@@ -94,7 +185,7 @@ def fake_quantize(
         while scale.ndim < 2:
             # pad scale and zero point dims for slicing
             scale = scale.unsqueeze(1)
-            zero_point = zero_point.unsqueeze(1)
+            zero_point = zero_point.unsqueeze(1) if zero_point is not None else None
 
         columns = x.shape[1]
         if columns >= group_size:
@@ -106,51 +197,60 @@ def fake_quantize(
         for i in range(ceil(columns / group_size)):
             # scale.shape should be [nchan, ndim]
             # sc.shape should be [nchan, 1] after unsqueeze
-… (old lines 109-110 not captured)
-            zp = zero_point[:, i].unsqueeze(1)
+            sc = scale[:, i].view(-1, 1)
+            zp = zero_point[:, i].view(-1, 1) if zero_point is not None else None
 
             idx = i * group_size
-… (old lines 114-142 not captured)
+            if do_quantize:
+                output[:, idx : (idx + group_size)] = _quantize(
+                    x[:, idx : (idx + group_size)],
+                    sc,
+                    zp,
+                    q_min,
+                    q_max,
+                    args,
+                    dtype=dtype,
+                )
+            if do_dequantize:
+                input = (
+                    output[:, idx : (idx + group_size)]
+                    if do_quantize
+                    else x[:, idx : (idx + group_size)]
+                )
+                output[:, idx : (idx + group_size)] = _dequantize(input, sc, zp)
+
+    else:  # covers channel, token and tensor strategies
+        if do_quantize:
+            output = _quantize(
+                x,
+                scale,
+                zero_point,
+                q_min,
+                q_max,
+                args,
+                dtype=dtype,
+            )
+        if do_dequantize:
+            output = _dequantize(output if do_quantize else x, scale, zero_point)
 
-    return
+    return output
 
 
 def wrap_module_forward_quantized(module: Module, scheme: QuantizationScheme):
     # expects a module already initialized and injected with the parameters in
     # initialize_module_for_quantization
-… (old line 150 not captured)
+    if hasattr(module.forward, "__func__"):
+        forward_func_orig = module.forward.__func__
+    else:
+        forward_func_orig = module.forward.func
 
     @wraps(forward_func_orig)  # ensures docstring, names, etc are propagated
     def wrapped_forward(self, *args, **kwargs):
+        if not getattr(module, "quantization_enabled", True):
+            # quantization is disabled on forward passes, return baseline
+            # forward call
+            return forward_func_orig.__get__(module, module.__class__)(*args, **kwargs)
+
         input_ = args[0]
 
         if scheme.input_activations is not None:
```
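To make the group bookkeeping in the loop above concrete, a small standalone sketch (values invented) of how each column block maps onto one column of the scale tensor:

```python
from math import ceil

columns, group_size = 8, 4

for i in range(ceil(columns / group_size)):  # two groups: i = 0 and i = 1
    idx = i * group_size
    # group i covers x[:, idx : idx + group_size] and is scaled by scale[:, i]
    print(i, (idx, idx + group_size))        # prints "0 (0, 4)" then "1 (4, 8)"
```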
compressed_tensors/quantization/lifecycle/forward.py (continued):

```diff
@@ -199,6 +299,11 @@ def maybe_calibrate_or_quantize(
     }:
         return value
 
+    if value.numel() == 0:
+        # if the tensor is empty,
+        # skip quantization
+        return value
+
     if args.dynamic:
         # dynamic quantization - get scale and zero point directly from observer
         observer = getattr(module, f"{base_name}_observer")
@@ -208,14 +313,61 @@ def maybe_calibrate_or_quantize(
     scale = getattr(module, f"{base_name}_scale")
     zero_point = getattr(module, f"{base_name}_zero_point")
 
-    if
+    if (
+        module.quantization_status == QuantizationStatus.CALIBRATION
+        and base_name != "weight"
+    ):
         # calibration mode - get new quant params from observer
         observer = getattr(module, f"{base_name}_observer")
 
         updated_scale, updated_zero_point = observer(value)
 
         # update scale and zero point
-… (old lines 218-220 not captured)
+        update_parameter_data(module, updated_scale, f"{base_name}_scale")
+        update_parameter_data(module, updated_zero_point, f"{base_name}_zero_point")
+
     return fake_quantize(value, scale, zero_point, args)
+
+
+@torch.no_grad()
+def _quantize(
+    x: torch.Tensor,
+    scale: torch.Tensor,
+    zero_point: torch.Tensor,
+    q_min: torch.Tensor,
+    q_max: torch.Tensor,
+    args: QuantizationArgs,
+    dtype: Optional[torch.dtype] = None,
+) -> torch.Tensor:
+
+    scaled = x / scale + zero_point.to(x.dtype)
+    # clamp first because cast isn't guaranteed to be saturated (ie for fp8)
+    clamped_value = torch.clamp(
+        scaled,
+        q_min,
+        q_max,
+    )
+    quantized_value = round_to_quantized_type(clamped_value, args)
+    if dtype is not None:
+        quantized_value = quantized_value.to(dtype)
+
+    return quantized_value
+
+
+@torch.no_grad()
+def _dequantize(
+    x_q: torch.Tensor,
+    scale: torch.Tensor,
+    zero_point: torch.Tensor = None,
+    dtype: Optional[torch.dtype] = None,
+) -> torch.Tensor:
+
+    dequant_value = x_q
+    if zero_point is not None:
+        dequant_value = dequant_value - zero_point.to(scale.dtype)
+    dequant_value = dequant_value.to(scale.dtype) * scale
+
+    if dtype is not None:
+        dequant_value = dequant_value.to(dtype)
+
+    return dequant_value
```
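`_quantize` and `_dequantize` implement the usual affine round trip. A plain-torch sketch of the same arithmetic for an 8-bit integer range, with made-up values and `round_to_quantized_type` approximated by `torch.round`:

```python
import torch

x = torch.tensor([0.30, -0.12, 0.049])
scale, zero_point = torch.tensor(0.02), torch.tensor(0)
q_min, q_max = -128, 127

# _quantize: scale and shift, clamp to the representable range, then round
x_q = torch.round(torch.clamp(x / scale + zero_point, q_min, q_max))
# -> tensor([15., -6.,  2.])

# _dequantize: undo the shift and rescale
x_hat = (x_q - zero_point) * scale
# -> tensor([ 0.3000, -0.1200,  0.0400]); 0.049 is not exactly representable
```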
compressed_tensors/quantization/lifecycle/frozen.py:

```diff
@@ -35,6 +35,10 @@ def freeze_module_quantization(module: Module):
         # no quantization scheme nothing to do
         return
 
+    if module.quantization_status == QuantizationStatus.FROZEN:
+        # nothing to do, already frozen
+        return
+
     # delete observers from module if not dynamic
     if scheme.input_activations and not scheme.input_activations.dynamic:
         delattr(module, "input_observer")
```
New file compressed_tensors/quantization/lifecycle/helpers.py:

```diff
@@ -0,0 +1,53 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Miscelaneous helpers for the quantization lifecycle
+"""
+
+
+from torch.nn import Module
+
+
+__all__ = [
+    "update_layer_weight_quant_params",
+    "enable_quantization",
+    "disable_quantization",
+]
+
+
+def update_layer_weight_quant_params(layer: Module):
+    weight = getattr(layer, "weight", None)
+    scale = getattr(layer, "weight_scale", None)
+    zero_point = getattr(layer, "weight_zero_point", None)
+    observer = getattr(layer, "weight_observer", None)
+
+    if weight is None or observer is None or scale is None or zero_point is None:
+        # scale, zp, or observer not calibratable or weight not available
+        return
+
+    updated_scale, updated_zero_point = observer(weight)
+
+    # update scale and zero point
+    device = next(layer.parameters()).device
+    scale.data = updated_scale.to(device)
+    zero_point.data = updated_zero_point.to(device)
+
+
+def enable_quantization(module: Module):
+    module.quantization_enabled = True
+
+
+def disable_quantization(module: Module):
+    module.quantization_enabled = False
```
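The `enable_quantization`/`disable_quantization` toggles pair with the `quantization_enabled` check added to `wrapped_forward` in forward.py above. A small sketch of the attribute they flip (the `Linear` here is just a stand-in for a quantization-wrapped module):

```python
import torch

from compressed_tensors.quantization.lifecycle.helpers import (
    disable_quantization,
    enable_quantization,
)

layer = torch.nn.Linear(8, 8)  # stand-in for a module wrapped for quantization

disable_quantization(layer)    # wrapped_forward would now return the baseline call
assert layer.quantization_enabled is False

enable_quantization(layer)     # quantized forward path is active again
assert layer.quantization_enabled is True
```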
compressed_tensors/quantization/lifecycle/initialize.py:

```diff
@@ -17,12 +17,18 @@ import logging
 from typing import Optional
 
 import torch
+from accelerate.hooks import add_hook_to_module, remove_hook_from_module
+from accelerate.utils import PrefixedDataset
 from compressed_tensors.quantization.lifecycle.forward import (
     wrap_module_forward_quantized,
 )
-from compressed_tensors.quantization.quant_args import
+from compressed_tensors.quantization.quant_args import (
+    QuantizationArgs,
+    QuantizationStrategy,
+)
 from compressed_tensors.quantization.quant_config import QuantizationStatus
 from compressed_tensors.quantization.quant_scheme import QuantizationScheme
+from compressed_tensors.utils import get_execution_device, is_module_offloaded
 from torch.nn import Module, Parameter
 
 
@@ -58,7 +64,12 @@ def initialize_module_for_quantization(
         _initialize_scale_zero_point_observer(module, "input", scheme.input_activations)
     if scheme.weights is not None:
         if hasattr(module, "weight"):
-… (old line 61 not captured)
+            weight_shape = None
+            if isinstance(module, torch.nn.Linear):
+                weight_shape = module.weight.shape
+            _initialize_scale_zero_point_observer(
+                module, "weight", scheme.weights, weight_shape=weight_shape
+            )
         else:
             _LOGGER.warning(
                 f"module type {type(module)} targeted for weight quantization but "
@@ -73,12 +84,38 @@ def initialize_module_for_quantization(
     module.quantization_scheme = scheme
     module.quantization_status = QuantizationStatus.INITIALIZED
 
+    offloaded = False
+    if is_module_offloaded(module):
+        offloaded = True
+        hook = module._hf_hook
+        prefix_dict = module._hf_hook.weights_map
+        new_prefix = {}
+
+        # recreate the prefix dict (since it is immutable)
+        # and add quantization parameters
+        for key, data in module.named_parameters():
+            if key not in prefix_dict:
+                new_prefix[f"{prefix_dict.prefix}{key}"] = data
+            else:
+                new_prefix[f"{prefix_dict.prefix}{key}"] = prefix_dict[key]
+        new_prefix_dict = PrefixedDataset(new_prefix, prefix_dict.prefix)
+        remove_hook_from_module(module)
+
     # wrap forward call of module to perform quantized actions based on calltime status
     wrap_module_forward_quantized(module, scheme)
 
+    if offloaded:
+        # we need to re-add the hook for offloading now that we've wrapped forward
+        add_hook_to_module(module, hook)
+        if prefix_dict is not None:
+            module._hf_hook.weights_map = new_prefix_dict
+
 
 def _initialize_scale_zero_point_observer(
-    module: Module,
+    module: Module,
+    base_name: str,
+    quantization_args: QuantizationArgs,
+    weight_shape: Optional[torch.Size] = None,
 ):
     # initialize observer module and attach as submodule
     observer = quantization_args.get_observer()
@@ -88,12 +125,32 @@ def _initialize_scale_zero_point_observer(
         return  # no need to register a scale and zero point for a dynamic observer
 
     device = next(module.parameters()).device
+    if is_module_offloaded(module):
+        device = get_execution_device(module)
+
+    # infer expected scale/zero point shape
+    expected_shape = 1  # per tensor
+
+    if base_name == "weight" and weight_shape is not None:
+        if quantization_args.strategy == QuantizationStrategy.CHANNEL:
+            # (output_channels, 1)
+            expected_shape = (weight_shape[0], 1)
+        elif quantization_args.strategy == QuantizationStrategy.GROUP:
+            expected_shape = (
+                weight_shape[0],
+                weight_shape[1] // quantization_args.group_size,
+            )
 
     # initializes empty scale and zero point parameters for the module
-    init_scale = Parameter(
+    init_scale = Parameter(
+        torch.empty(expected_shape, dtype=module.weight.dtype, device=device),
+        requires_grad=False,
+    )
     module.register_parameter(f"{base_name}_scale", init_scale)
 
+    zp_dtype = quantization_args.pytorch_dtype()
     init_zero_point = Parameter(
-        torch.empty(
+        torch.empty(expected_shape, device=device, dtype=zp_dtype),
+        requires_grad=False,
     )
     module.register_parameter(f"{base_name}_zero_point", init_zero_point)
```
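A quick check of the shapes this logic yields for a hypothetical `torch.nn.Linear` weight of shape `(512, 4096)` with `group_size=128`:

```python
weight_shape = (512, 4096)  # hypothetical (out_features, in_features)
group_size = 128

per_tensor = 1                                                 # TENSOR strategy
per_channel = (weight_shape[0], 1)                             # CHANNEL -> (512, 1)
per_group = (weight_shape[0], weight_shape[1] // group_size)   # GROUP   -> (512, 32)
```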