compressed-tensors-nightly 0.3.3.20240514 (compressed_tensors_nightly-0.3.3.20240514-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- compressed_tensors/__init__.py +21 -0
- compressed_tensors/base.py +17 -0
- compressed_tensors/compressors/__init__.py +22 -0
- compressed_tensors/compressors/base.py +59 -0
- compressed_tensors/compressors/dense.py +34 -0
- compressed_tensors/compressors/helpers.py +137 -0
- compressed_tensors/compressors/int_quantized.py +95 -0
- compressed_tensors/compressors/model_compressor.py +264 -0
- compressed_tensors/compressors/sparse_bitmask.py +239 -0
- compressed_tensors/config/__init__.py +18 -0
- compressed_tensors/config/base.py +43 -0
- compressed_tensors/config/dense.py +36 -0
- compressed_tensors/config/sparse_bitmask.py +36 -0
- compressed_tensors/quantization/__init__.py +21 -0
- compressed_tensors/quantization/lifecycle/__init__.py +23 -0
- compressed_tensors/quantization/lifecycle/apply.py +196 -0
- compressed_tensors/quantization/lifecycle/calibration.py +51 -0
- compressed_tensors/quantization/lifecycle/compressed.py +69 -0
- compressed_tensors/quantization/lifecycle/forward.py +333 -0
- compressed_tensors/quantization/lifecycle/frozen.py +50 -0
- compressed_tensors/quantization/lifecycle/initialize.py +99 -0
- compressed_tensors/quantization/observers/__init__.py +21 -0
- compressed_tensors/quantization/observers/base.py +130 -0
- compressed_tensors/quantization/observers/helpers.py +54 -0
- compressed_tensors/quantization/observers/memoryless.py +48 -0
- compressed_tensors/quantization/observers/min_max.py +80 -0
- compressed_tensors/quantization/quant_args.py +125 -0
- compressed_tensors/quantization/quant_config.py +210 -0
- compressed_tensors/quantization/quant_scheme.py +39 -0
- compressed_tensors/quantization/utils/__init__.py +16 -0
- compressed_tensors/quantization/utils/helpers.py +131 -0
- compressed_tensors/registry/__init__.py +17 -0
- compressed_tensors/registry/registry.py +360 -0
- compressed_tensors/utils/__init__.py +16 -0
- compressed_tensors/utils/helpers.py +45 -0
- compressed_tensors/utils/safetensors_load.py +237 -0
- compressed_tensors/version.py +50 -0
- compressed_tensors_nightly-0.3.3.20240514.dist-info/LICENSE +201 -0
- compressed_tensors_nightly-0.3.3.20240514.dist-info/METADATA +105 -0
- compressed_tensors_nightly-0.3.3.20240514.dist-info/RECORD +42 -0
- compressed_tensors_nightly-0.3.3.20240514.dist-info/WHEEL +5 -0
- compressed_tensors_nightly-0.3.3.20240514.dist-info/top_level.txt +1 -0
compressed_tensors/quantization/lifecycle/compressed.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import logging
+
+import torch
+from compressed_tensors.quantization.lifecycle.forward import quantize
+from compressed_tensors.quantization.quant_config import QuantizationStatus
+from torch.nn import Module
+
+
+__all__ = [
+    "compress_quantized_weights",
+]
+
+
+_LOGGER = logging.getLogger(__name__)
+
+
+def compress_quantized_weights(module: Module):
+    """
+    Quantizes the module weight representation to use fewer bits in memory
+
+    apply to full model with `model.apply(compress_quantized_weights)`
+
+    :param module: module to compress to quantized representation
+    """
+    scheme = getattr(module, "quantization_scheme", None)
+    if not scheme or not scheme.weights:
+        # no quantization scheme or weights not quantized, nothing to do
+        return
+
+    if module.quantization_status is QuantizationStatus.COMPRESSED:
+        # module is already compressed, nothing to do
+        return
+
+    weight = getattr(module, "weight", None)
+    scale = getattr(module, "weight_scale", None)
+    zero_point = getattr(module, "weight_zero_point", None)
+
+    if weight is None or scale is None or zero_point is None:
+        # no weight, scale, or ZP, nothing to do
+
+        # mark as compressed here to maintain consistent status throughout the model
+        module.quantization_status = QuantizationStatus.COMPRESSED
+        return
+
+    module.weight.requires_grad = False  # cannot use autograd after compression
+    module.weight.data = quantize(
+        x=weight,
+        scale=scale,
+        zero_point=zero_point,
+        args=scheme.weights,
+        dtype=torch.int8,
+    )
+
+    module.quantization_status = QuantizationStatus.COMPRESSED
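For context before the next hunk: `compress_quantized_weights` is the final lifecycle step, overwriting the fp weight in place with its int8 representation using the scale and zero point produced during calibration. A minimal standalone sketch of that affine mapping, with illustrative values rather than anything taken from the package:

import torch

# toy weight plus already-calibrated quantization parameters (illustrative values)
w = torch.tensor([[0.40, -0.12], [0.03, -0.31]])
scale, zero_point = torch.tensor(0.0031), torch.tensor(0)

# the same mapping quantize(..., dtype=torch.int8) applies above:
# round(w / scale + zero_point), clamped to the signed 8-bit range
w_q = torch.clamp(torch.round(w / scale + zero_point), -128, 127).to(torch.int8)

# dequantized view a consumer would reconstruct downstream
w_hat = (w_q.float() - zero_point) * scale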
compressed_tensors/quantization/lifecycle/forward.py
@@ -0,0 +1,333 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import wraps
+from math import ceil
+from typing import Optional
+
+import torch
+from compressed_tensors.quantization.quant_args import (
+    QuantizationArgs,
+    QuantizationStrategy,
+)
+from compressed_tensors.quantization.quant_config import QuantizationStatus
+from compressed_tensors.quantization.quant_scheme import QuantizationScheme
+from torch.nn import Module
+
+
+__all__ = [
+    "quantize",
+    "dequantize",
+    "fake_quantize",
+    "wrap_module_forward_quantized",
+    "maybe_calibrate_or_quantize",
+]
+
+
+@torch.no_grad()
+def quantize(
+    x: torch.Tensor,
+    scale: torch.Tensor,
+    zero_point: torch.Tensor,
+    args: QuantizationArgs,
+    dtype: Optional[torch.dtype] = None,
+) -> torch.Tensor:
+    """
+    Quantize the input tensor x using the QuantizationStrategy specified in args.
+    Quantization can be done per tensor, channel, token or group. For group
+    quantization, the column size must be divisible by the given group_size. The
+    input scale and zero_points are reshaped to support vectorization (assumes 1 is
+    the channel dimension)
+
+    :param x: input tensor
+    :param scale: scale tensor
+    :param zero_point: zero point tensor
+    :param args: quantization args dictating how to quantize x
+    :param dtype: optional dtype to cast the quantized output to
+    :return: quantized tensor
+    """
+    return _process_quantization(
+        x=x,
+        scale=scale,
+        zero_point=zero_point,
+        args=args,
+        dtype=dtype,
+        do_quantize=True,
+        do_dequantize=False,
+    )
+
+
+@torch.no_grad()
+def dequantize(
+    x_q: torch.Tensor,
+    scale: torch.Tensor,
+    zero_point: torch.Tensor,
+    args: Optional[QuantizationArgs] = None,
+) -> torch.Tensor:
+    """
+    Dequantize a quantized input tensor x_q based on the strategy specified in args.
+    If args is not provided, the strategy will be inferred.
+
+    :param x_q: quantized input tensor
+    :param scale: scale tensor
+    :param zero_point: zero point tensor
+    :param args: quantization args used to quantize x_q
+    :return: dequantized float tensor
+    """
+    if args is None:
+        if scale.ndim == 0:
+            args = QuantizationArgs(strategy=QuantizationStrategy.TENSOR)
+        elif scale.ndim == 2:
+            args = QuantizationArgs(strategy=QuantizationStrategy.CHANNEL)
+        elif scale.ndim == 3:
+            group_size = int(x_q.shape[1] / scale.shape[1])
+            args = QuantizationArgs(
+                strategy=QuantizationStrategy.GROUP, group_size=group_size
+            )
+    return _process_quantization(
+        x=x_q,
+        scale=scale,
+        zero_point=zero_point,
+        args=args,
+        do_quantize=False,
+        do_dequantize=True,
+    )
+
+
+@torch.no_grad()
+def fake_quantize(
+    x: torch.Tensor,
+    scale: torch.Tensor,
+    zero_point: torch.Tensor,
+    args: QuantizationArgs,
+) -> torch.Tensor:
+    """
+    Fake quantize the input tensor x by quantizing then dequantizing with
+    the QuantizationStrategy specified in args. Quantization can be done per tensor,
+    channel, token or group. For group quantization, the column size must be
+    divisible by the given group_size. The input scale and zero_points are reshaped
+    to support vectorization (assumes 1 is the channel dimension)
+
+    :param x: input tensor
+    :param scale: scale tensor
+    :param zero_point: zero point tensor
+    :param args: quantization args dictating how to quantize x
+    :return: fake quantized tensor
+    """
+    return _process_quantization(
+        x=x,
+        scale=scale,
+        zero_point=zero_point,
+        args=args,
+        do_quantize=True,
+        do_dequantize=True,
+    )
+
+
+@torch.no_grad()
+def _process_quantization(
+    x: torch.Tensor,
+    scale: torch.Tensor,
+    zero_point: torch.Tensor,
+    args: QuantizationArgs,
+    dtype: Optional[torch.dtype] = None,
+    do_quantize: bool = True,
+    do_dequantize: bool = True,
+) -> torch.Tensor:
+    bit_range = 2**args.num_bits
+    q_max = torch.tensor(bit_range / 2 - 1, device=x.device)
+    q_min = torch.tensor(-bit_range / 2, device=x.device)
+    group_size = args.group_size
+
+    # group
+    if args.strategy == QuantizationStrategy.GROUP:
+
+        if do_dequantize:  # if dequantizing, the output should be a fp type
+            output = torch.zeros_like(x, dtype=scale.dtype)
+        else:
+            output_dtype = dtype if dtype is not None else x.dtype
+            output = torch.zeros_like(x, dtype=output_dtype)
+
+        # TODO: vectorize the for loop
+        # TODO: fix generic assumption about the tensor size for computing group
+
+        # TODO: make validation step for inputs
+
+        while scale.ndim < 2:
+            # pad scale and zero point dims for slicing
+            scale = scale.unsqueeze(1)
+            zero_point = zero_point.unsqueeze(1)
+
+        columns = x.shape[1]
+        if columns >= group_size:
+            if columns % group_size != 0:
+                raise ValueError(
+                    "tensor column shape must be divisible "
+                    f"by the given group_size {group_size}"
+                )
+        for i in range(ceil(columns / group_size)):
+            # scale.shape should be [nchan, ndim]
+            # sc.shape should be [nchan, 1] after slicing
+            sc = scale[:, i].view(-1, 1)
+            zp = zero_point[:, i].view(-1, 1)
+
+            idx = i * group_size
+            if do_quantize:
+                output[:, idx : (idx + group_size)] = _quantize(
+                    x[:, idx : (idx + group_size)], sc, zp, q_min, q_max, dtype=dtype
+                )
+            if do_dequantize:
+                input = (
+                    output[:, idx : (idx + group_size)]
+                    if do_quantize
+                    else x[:, idx : (idx + group_size)]
+                )
+                output[:, idx : (idx + group_size)] = _dequantize(input, sc, zp)
+
+    # channel-wise
+    elif args.strategy == QuantizationStrategy.CHANNEL:  # group_size == -1
+        if do_quantize:
+            output = _quantize(x, scale, zero_point, q_min, q_max, dtype=dtype)
+        if do_dequantize:
+            output = _dequantize(output if do_quantize else x, scale, zero_point)
+
+    # per-token
+    elif args.strategy == QuantizationStrategy.TOKEN:
+        # before: scale shape = [num_tokens]
+        # after: scale shape = [num_tokens, 1]
+        # x.shape = [1, num_tokens, 1]
+        # scale gets broadcast as expected without having [1, num_tokens, 1] shape
+
+        scale = scale.unsqueeze(1)
+        zero_point = zero_point.unsqueeze(1)
+
+        if do_quantize:
+            output = _quantize(x, scale, zero_point, q_min, q_max, dtype=dtype)
+        if do_dequantize:
+            output = _dequantize(output if do_quantize else x, scale, zero_point)
+
+    else:
+        if do_quantize:
+            output = _quantize(x, scale, zero_point, q_min, q_max, dtype=dtype)
+        if do_dequantize:
+            output = _dequantize(output if do_quantize else x, scale, zero_point)
+
+    return output
+
+
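Before the diff continues with the forward-wrapping helpers, a hedged round-trip sketch of the GROUP branch of `_process_quantization` above: per-(row, group) scales of shape [rows, num_groups] fed through the public `fake_quantize` entry point. The symmetric max-based scale here is illustrative, not the package's observer logic:

import torch
from compressed_tensors.quantization.lifecycle.forward import fake_quantize
from compressed_tensors.quantization.quant_args import (
    QuantizationArgs,
    QuantizationStrategy,
)

x = torch.randn(4, 8)  # 4 rows, 8 columns -> two groups of 4 columns each
args = QuantizationArgs(num_bits=8, strategy=QuantizationStrategy.GROUP, group_size=4)

# one scale / zero point per (row, group); shape [4, 2] matches the scale[:, i] slicing
scale = x.reshape(4, 2, 4).abs().amax(dim=-1) / 127.0
zero_point = torch.zeros_like(scale, dtype=torch.int32)

x_fq = fake_quantize(x, scale, zero_point, args)
assert (x - x_fq).abs().max() <= scale.max()  # round-trip error is at most one step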
+def wrap_module_forward_quantized(module: Module, scheme: QuantizationScheme):
+    # expects a module already initialized and injected with the parameters in
+    # initialize_module_for_quantization
+    forward_func_orig = module.forward.__func__
+
+    @wraps(forward_func_orig)  # ensures docstring, names, etc are propagated
+    def wrapped_forward(self, *args, **kwargs):
+        input_ = args[0]
+
+        if scheme.input_activations is not None:
+            # calibrate and (fake) quantize input activations when applicable
+            input_ = maybe_calibrate_or_quantize(
+                module, input_, "input", scheme.input_activations
+            )
+
+        if scheme.weights is not None:
+            # calibrate and (fake) quantize weights when applicable
+            unquantized_weight = self.weight.data.clone()
+            self.weight.data = maybe_calibrate_or_quantize(
+                module, self.weight, "weight", scheme.weights
+            )
+
+        # perform wrapped forward call
+        output = forward_func_orig.__get__(module, module.__class__)(
+            input_, *args[1:], **kwargs
+        )
+
+        if scheme.output_activations is not None:
+            # calibrate and (fake) quantize output activations when applicable
+            output = maybe_calibrate_or_quantize(
+                module, output, "output", scheme.output_activations
+            )
+
+        # restore back to the unquantized weight
+        if scheme.weights is not None:
+            self.weight.data = unquantized_weight
+
+        return output
+
+    # bind wrapped forward to module class so reference to `self` is correct
+    bound_wrapped_forward = wrapped_forward.__get__(module, module.__class__)
+    # set forward to wrapped forward
+    setattr(module, "forward", bound_wrapped_forward)
+
+
+def maybe_calibrate_or_quantize(
+    module: Module, value: torch.Tensor, base_name: str, args: "QuantizationArgs"
+) -> torch.Tensor:
+    # only run quantization for the included statuses
+    if module.quantization_status not in {
+        QuantizationStatus.CALIBRATION,
+        QuantizationStatus.FROZEN,
+    }:
+        return value
+
+    if args.dynamic:
+        # dynamic quantization - get scale and zero point directly from observer
+        observer = getattr(module, f"{base_name}_observer")
+        scale, zero_point = observer(value)
+    else:
+        # static quantization - get previous scale and zero point from layer
+        scale = getattr(module, f"{base_name}_scale")
+        zero_point = getattr(module, f"{base_name}_zero_point")
+
+        if module.quantization_status == QuantizationStatus.CALIBRATION:
+            # calibration mode - get new quant params from observer
+            observer = getattr(module, f"{base_name}_observer")
+
+            updated_scale, updated_zero_point = observer(value)
+
+            # update scale and zero point
+            device = next(module.parameters()).device
+            scale.data = updated_scale.to(device)
+            zero_point.data = updated_zero_point.to(device)
+
+    return fake_quantize(value, scale, zero_point, args)
+
+
+@torch.no_grad()
+def _quantize(
+    x: torch.Tensor,
+    scale: torch.Tensor,
+    zero_point: torch.Tensor,
+    q_min: torch.Tensor,
+    q_max: torch.Tensor,
+    dtype: Optional[torch.dtype] = None,
+) -> torch.Tensor:
+    quantized_value = torch.clamp(
+        torch.round(x / scale + zero_point),
+        q_min,
+        q_max,
+    )
+
+    if dtype is not None:
+        quantized_value = quantized_value.to(dtype)
+
+    return quantized_value
+
+
+@torch.no_grad()
+def _dequantize(
+    x_q: torch.Tensor,
+    scale: torch.Tensor,
+    zero_point: torch.Tensor,
+) -> torch.Tensor:
+    return (x_q - zero_point) * scale
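One detail of the hunk above worth calling out: when `dequantize` gets no args, it infers the strategy from `scale.ndim` (0 = tensor, 2 = channel, 3 = group). A short sketch of the inferred-group path, with illustrative values:

import torch
from compressed_tensors.quantization.lifecycle.forward import dequantize

x_q = torch.randint(-128, 128, (4, 8), dtype=torch.int8)
scale = torch.rand(4, 2, 1) + 0.01  # ndim == 3 -> GROUP inferred, group_size = 8 // 2
zero_point = torch.zeros(4, 2, 1, dtype=torch.int32)

x_hat = dequantize(x_q, scale, zero_point)  # float result, shape [4, 8]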
compressed_tensors/quantization/lifecycle/frozen.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from compressed_tensors.quantization.quant_config import QuantizationStatus
+from torch.nn import Module
+
+
+__all__ = [
+    "freeze_module_quantization",
+]
+
+
+def freeze_module_quantization(module: Module):
+    """
+    Deletes observers so that static quantization is completed.
+
+    apply to full model with `model.apply(freeze_module_quantization)`
+
+    :param module: module to freeze quantization for
+    """
+    scheme = getattr(module, "quantization_scheme", None)
+    if not scheme:
+        # no quantization scheme, nothing to do
+        return
+
+    if module.quantization_status == QuantizationStatus.FROZEN:
+        # nothing to do, already frozen
+        return
+
+    # delete observers from module if not dynamic
+    if scheme.input_activations and not scheme.input_activations.dynamic:
+        delattr(module, "input_observer")
+    if scheme.weights and not scheme.weights.dynamic:
+        delattr(module, "weight_observer")
+    if scheme.output_activations and not scheme.output_activations.dynamic:
+        delattr(module, "output_observer")
+
+    module.quantization_status = QuantizationStatus.FROZEN
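The freeze step above closes out the calibration flow implemented in forward.py. A hedged end-to-end sketch of that flow on a single layer; the `targets` field name on QuantizationScheme is an assumption here, and the manual status flip stands in for the package's own calibration helper in lifecycle/calibration.py (listed in the manifest above, hunk not shown):

import torch
from torch.nn import Linear
from compressed_tensors.quantization.lifecycle.frozen import freeze_module_quantization
from compressed_tensors.quantization.lifecycle.initialize import (
    initialize_module_for_quantization,
)
from compressed_tensors.quantization.quant_args import QuantizationArgs
from compressed_tensors.quantization.quant_config import QuantizationStatus
from compressed_tensors.quantization.quant_scheme import QuantizationScheme

layer = Linear(8, 8)
scheme = QuantizationScheme(
    targets=["Linear"],  # assumed field name
    weights=QuantizationArgs(num_bits=8),
    input_activations=QuantizationArgs(num_bits=8),
)
initialize_module_for_quantization(layer, scheme)

# normally set by the calibration lifecycle step; flipped by hand for this sketch
layer.quantization_status = QuantizationStatus.CALIBRATION
layer(torch.randn(2, 8))  # observers record ranges, forward fake-quantizes

freeze_module_quantization(layer)
assert layer.quantization_status == QuantizationStatus.FROZEN
assert not hasattr(layer, "input_observer")  # static observers were deleted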
compressed_tensors/quantization/lifecycle/initialize.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import logging
+from typing import Optional
+
+import torch
+from compressed_tensors.quantization.lifecycle.forward import (
+    wrap_module_forward_quantized,
+)
+from compressed_tensors.quantization.quant_args import QuantizationArgs
+from compressed_tensors.quantization.quant_config import QuantizationStatus
+from compressed_tensors.quantization.quant_scheme import QuantizationScheme
+from torch.nn import Module, Parameter
+
+
+__all__ = [
+    "initialize_module_for_quantization",
+]
+
+
+_LOGGER = logging.getLogger(__name__)
+
+
+def initialize_module_for_quantization(
+    module: Module,
+    scheme: Optional[QuantizationScheme] = None,
+):
+    """
+    Attaches appropriate scales, zero points, and observers to a layer
+    given its target quantization scheme.
+
+    apply to full model with `model.apply(initialize_module_for_quantization)`
+
+    :param module: module to set up for calibration
+    :param scheme: scheme to use for quantization. If None is provided, the scheme
+        stored in the module under `quantization_scheme` is used; if that is also
+        missing, the layer is skipped
+    """
+    scheme = scheme or getattr(module, "quantization_scheme", None)
+    if scheme is None:
+        # no scheme passed and layer not targeted for quantization - skip
+        return
+
+    if scheme.input_activations is not None:
+        _initialize_scale_zero_point_observer(module, "input", scheme.input_activations)
+    if scheme.weights is not None:
+        if hasattr(module, "weight"):
+            _initialize_scale_zero_point_observer(module, "weight", scheme.weights)
+        else:
+            _LOGGER.warning(
+                f"module type {type(module)} targeted for weight quantization but "
+                "has no attribute weight, skipping weight quantization "
+                f"for {type(module)}"
+            )
+    if scheme.output_activations is not None:
+        _initialize_scale_zero_point_observer(
+            module, "output", scheme.output_activations
+        )
+
+    module.quantization_scheme = scheme
+    module.quantization_status = QuantizationStatus.INITIALIZED
+
+    # wrap forward call of module to perform quantized actions based on calltime status
+    wrap_module_forward_quantized(module, scheme)
+
+
+def _initialize_scale_zero_point_observer(
+    module: Module, base_name: str, quantization_args: QuantizationArgs
+):
+    # initialize observer module and attach as submodule
+    observer = quantization_args.get_observer()
+    module.register_module(f"{base_name}_observer", observer)
+
+    if quantization_args.dynamic:
+        return  # no need to register a scale and zero point for a dynamic observer
+
+    device = next(module.parameters()).device
+
+    # initialize empty scale and zero point parameters for the module
+    init_scale = Parameter(torch.empty(0, device=device), requires_grad=False)
+    module.register_parameter(f"{base_name}_scale", init_scale)
+
+    init_zero_point = Parameter(
+        torch.empty(0, device=device, dtype=int), requires_grad=False
+    )
+    module.register_parameter(f"{base_name}_zero_point", init_zero_point)
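As a quick check of what `_initialize_scale_zero_point_observer` attaches, a short sketch (the `targets` field name on QuantizationScheme is again assumed):

from torch.nn import Linear
from compressed_tensors.quantization.lifecycle.initialize import (
    initialize_module_for_quantization,
)
from compressed_tensors.quantization.quant_args import QuantizationArgs
from compressed_tensors.quantization.quant_scheme import QuantizationScheme

layer = Linear(8, 8)
initialize_module_for_quantization(
    layer,
    QuantizationScheme(targets=["Linear"], weights=QuantizationArgs(num_bits=8)),
)

# an observer submodule plus empty scale / zero-point parameters now exist;
# they are filled in on the first calibration forward pass
assert hasattr(layer, "weight_observer")
assert layer.weight_scale.numel() == 0 and layer.weight_zero_point.numel() == 0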
compressed_tensors/quantization/observers/__init__.py
@@ -0,0 +1,21 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# flake8: noqa
+# isort: skip_file
+
+from .helpers import *
+from .base import *
+from .memoryless import *
+from .min_max import *