compressed-tensors 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- compressed_tensors/__init__.py +21 -0
- compressed_tensors/base.py +16 -0
- compressed_tensors/compressors/__init__.py +25 -0
- compressed_tensors/compressors/base.py +79 -0
- compressed_tensors/compressors/dense.py +34 -0
- compressed_tensors/compressors/helpers.py +161 -0
- compressed_tensors/compressors/sparse_bitmask.py +238 -0
- compressed_tensors/config/__init__.py +18 -0
- compressed_tensors/config/base.py +42 -0
- compressed_tensors/config/dense.py +36 -0
- compressed_tensors/config/sparse_bitmask.py +36 -0
- compressed_tensors/quantization/__init__.py +21 -0
- compressed_tensors/quantization/lifecycle/__init__.py +22 -0
- compressed_tensors/quantization/lifecycle/apply.py +173 -0
- compressed_tensors/quantization/lifecycle/calibration.py +51 -0
- compressed_tensors/quantization/lifecycle/forward.py +136 -0
- compressed_tensors/quantization/lifecycle/frozen.py +46 -0
- compressed_tensors/quantization/lifecycle/initialize.py +96 -0
- compressed_tensors/quantization/observers/__init__.py +21 -0
- compressed_tensors/quantization/observers/base.py +69 -0
- compressed_tensors/quantization/observers/helpers.py +53 -0
- compressed_tensors/quantization/observers/memoryless.py +48 -0
- compressed_tensors/quantization/observers/min_max.py +65 -0
- compressed_tensors/quantization/quant_args.py +85 -0
- compressed_tensors/quantization/quant_config.py +171 -0
- compressed_tensors/quantization/quant_scheme.py +39 -0
- compressed_tensors/quantization/utils/__init__.py +16 -0
- compressed_tensors/quantization/utils/helpers.py +115 -0
- compressed_tensors/registry/__init__.py +17 -0
- compressed_tensors/registry/registry.py +360 -0
- compressed_tensors/utils/__init__.py +16 -0
- compressed_tensors/utils/helpers.py +151 -0
- compressed_tensors/utils/safetensors_load.py +237 -0
- compressed_tensors-0.3.0.dist-info/METADATA +22 -0
- compressed_tensors-0.3.0.dist-info/RECORD +37 -0
- compressed_tensors-0.3.0.dist-info/WHEEL +5 -0
- compressed_tensors-0.3.0.dist-info/top_level.txt +1 -0
compressed_tensors/quantization/observers/helpers.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Tuple
+
+import torch
+from compressed_tensors.quantization.quant_args import QuantizationArgs
+from torch import FloatTensor, IntTensor, Tensor
+
+
+__all__ = ["calculate_qparams"]
+
+
+def calculate_qparams(
+    min_vals: Tensor, max_vals: Tensor, quantization_args: QuantizationArgs
+) -> Tuple[FloatTensor, IntTensor]:
+    """
+    :param min_vals: tensor of min value(s) to calculate scale(s) and zero point(s)
+        from
+    :param max_vals: tensor of max value(s) to calculate scale(s) and zero point(s)
+        from
+    :param quantization_args: settings for quantization
+    :return: tuple of the calculated scale(s) and zero point(s)
+    """
+    min_vals = torch.min(min_vals, torch.zeros_like(min_vals))
+    max_vals = torch.max(max_vals, torch.zeros_like(max_vals))
+
+    bit_range = 2**quantization_args.num_bits - 1
+    bit_min = -(bit_range + 1) / 2
+    bit_max = bit_min + bit_range
+    if quantization_args.symmetric:
+        zero_points = torch.tensor(0).to(torch.int8)
+        max_val_pos = torch.max(-min_vals, max_vals)
+        scales = max_val_pos / (float(bit_range) / 2)
+        scales = torch.clamp(scales, min=torch.finfo(torch.float32).eps)
+    else:
+        scales = (max_vals - min_vals) / float(bit_range)
+        scales = torch.clamp(scales, min=torch.finfo(torch.float32).eps)
+        zero_points = bit_min - torch.round(min_vals / scales)
+        zero_points = torch.clamp(zero_points, bit_min, bit_max).to(torch.int8)
+
+    return scales, zero_points
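For intuition, `calculate_qparams` maps an observed value range onto the integer grid implied by `num_bits`. Below is a minimal round-trip sketch (assuming this wheel is installed; the quantize/dequantize lines illustrate the math and are not this package's forward implementation):

```python
import torch

from compressed_tensors.quantization.observers.helpers import calculate_qparams
from compressed_tensors.quantization.quant_args import QuantizationArgs

# asymmetric 8-bit: bit_range = 255, so bit_min = -128 and bit_max = 127
args = QuantizationArgs(num_bits=8, symmetric=False)
x = torch.randn(64) * 3
scale, zero_point = calculate_qparams(x.min(), x.max(), args)

# round trip using the returned parameters; max error is on the order of scale / 2
q = torch.clamp(torch.round(x / scale) + zero_point, -128, 127)
x_hat = (q - zero_point) * scale
print(float(scale), int(zero_point), float((x - x_hat).abs().max()))
```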
compressed_tensors/quantization/observers/memoryless.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Tuple
+
+import torch
+from compressed_tensors.quantization.observers.base import Observer
+from compressed_tensors.quantization.observers.helpers import calculate_qparams
+from torch import FloatTensor, IntTensor, Tensor
+
+
+__all__ = ["MemorylessObserver"]
+
+
+@Observer.register("memoryless")
+class MemorylessObserver(Observer):
+    """
+    Implements a dynamic quantization observer that sets the scale and
+    zero point based on the latest observed value without tracking state
+    """
+
+    def calculate_qparams(self, observed: Tensor) -> Tuple[FloatTensor, IntTensor]:
+        """
+        Calculates quantization parameters from the min and max values of observed
+
+        :param observed: observed tensor to calculate quantization parameters for
+        :return: tuple of scale and zero point derived from the observed tensor
+        """
+        # TODO: Add support for full range of quantization Args, only supports 8bit
+        #  per tensor
+        min_val, max_val = torch.aminmax(observed)
+
+        # ensure zero is in the range
+        min_val = torch.min(min_val, torch.zeros_like(min_val))
+        max_val = torch.max(max_val, torch.zeros_like(max_val))
+
+        return calculate_qparams(min_val, max_val, self.quantization_args)
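In practice an observer is resolved through the registry rather than constructed directly; a short usage sketch using only names defined in this diff:

```python
import torch

from compressed_tensors.quantization.quant_args import QuantizationArgs

args = QuantizationArgs(observer="memoryless", symmetric=True)
observer = args.get_observer()  # Observer.load_from_registry("memoryless", ...)
scale, zero_point = observer.calculate_qparams(torch.randn(16, 16))
```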
compressed_tensors/quantization/observers/min_max.py
@@ -0,0 +1,65 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Tuple
+
+import torch
+from compressed_tensors.quantization.observers.base import Observer
+from compressed_tensors.quantization.observers.helpers import calculate_qparams
+from compressed_tensors.quantization.quant_args import QuantizationArgs
+from torch import FloatTensor, IntTensor, Tensor
+
+
+__all__ = ["MovingAverageMinMaxObserver"]
+
+
+@Observer.register("minmax")
+class MovingAverageMinMaxObserver(Observer):
+    """
+    Implements a dynamic quantization observer that sets the scale and
+    zero point based on a moving average of the overall min and max observed values
+    """
+
+    def __init__(
+        self, quantization_args: QuantizationArgs, averaging_constant: float = 0.01
+    ):
+        super().__init__(quantization_args=quantization_args)
+
+        self.min_val = float("inf")
+        self.max_val = -float("inf")
+        self.averaging_constant = averaging_constant
+
+    def calculate_qparams(self, observed: Tensor) -> Tuple[FloatTensor, IntTensor]:
+        """
+        Updates the observed min and max using a moving average smoothed by the
+        averaging_constant
+
+        :param observed: observed tensor to calculate quantization parameters for
+        :return: tuple of scale and zero point derived from the observed tensor
+        """
+
+        min_val, max_val = torch.aminmax(observed)
+
+        if self.min_val == float("inf") and self.max_val == float("-inf"):
+            self.min_val = min_val
+            self.max_val = max_val
+        else:
+            self.min_val = self.min_val + self.averaging_constant * (
+                min_val - self.min_val
+            )
+            self.max_val = self.max_val + self.averaging_constant * (
+                max_val - self.max_val
+            )
+
+        return calculate_qparams(self.min_val, self.max_val, self.quantization_args)
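The update above is an exponential moving average: `running <- running + c * (observed - running)`. A standalone numeric sketch of the smoothing behaviour (plain Python, independent of the package):

```python
averaging_constant = 0.01

def ema(running: float, observed: float, c: float = averaging_constant) -> float:
    # each step moves the running value a fraction c toward the new observation
    return running + c * (observed - running)

running_max = 10.0
for observed_max in (12.0, 8.0, 11.0):
    running_max = ema(running_max, observed_max)
print(running_max)  # ~10.0098: single outliers barely move the estimate
```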
compressed_tensors/quantization/quant_args.py
@@ -0,0 +1,85 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from enum import Enum
+from typing import Any, Dict, Optional
+
+from pydantic import BaseModel, Field
+
+
+__all__ = ["QuantizationType", "QuantizationStrategy", "QuantizationArgs"]
+
+
+class QuantizationType(str, Enum):
+    """
+    Enum storing quantization type options
+    """
+
+    INT = "int"
+    FLOAT = "float"
+
+
+class QuantizationStrategy(str, Enum):
+    """
+    Enum storing quantization strategy options
+    """
+
+    TENSOR = "tensor"
+    CHANNEL = "channel"
+    GROUP = "group"
+    BLOCK = "block"
+
+
+class QuantizationArgs(BaseModel):
+    """
+    User-facing arguments used to define a quantization config for weights or
+    activations
+
+    :param num_bits: quantization bit depth
+    :param type: dtype to quantize to, either int or float
+    :param symmetric: whether or not the quantization scale is symmetric about the
+        zero point
+    :param strategy: string id determining the scope of scale/zero-point to apply
+    :param group_size: group length to use for the group strategy
+    :param block_structure: 2d block structure to use for the block strategy, must be
+        of the format "2x4", "8x16", etc.
+    """
+
+    num_bits: int = 8
+    type: QuantizationType = QuantizationType.INT
+    symmetric: bool = True
+    strategy: QuantizationStrategy = QuantizationStrategy.TENSOR
+    group_size: Optional[int] = None
+    block_structure: Optional[str] = None
+    observer: str = Field(
+        default="minmax",
+        description=(
+            "The class to use to compute the quantization params - "
+            "scale and zero-point"
+        ),
+    )
+    observer_kwargs: Dict[str, Any] = Field(
+        default_factory=dict,
+        description=(
+            "optional dict of kwargs to be passed directly to the observer's "
+            "constructor, excluding quantization range or symmetry"
+        ),
+    )
+
+    def get_observer(self):
+        """
+        :return: Observer built from the registry based on these QuantizationArgs
+        """
+        from compressed_tensors.quantization.observers.base import Observer
+
+        return Observer.load_from_registry(self.observer, quantization_args=self)
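Since `QuantizationArgs` is a pydantic model, configs round-trip through JSON; a small sketch (assuming the pydantic v1 API that the `parse_obj` usage elsewhere in this wheel implies):

```python
import json

from compressed_tensors.quantization.quant_args import (
    QuantizationArgs,
    QuantizationStrategy,
)

# 4-bit asymmetric group quantization, 128 elements per group
args = QuantizationArgs(
    num_bits=4,
    symmetric=False,
    strategy=QuantizationStrategy.GROUP,
    group_size=128,
)
serialized = args.json()
print(json.loads(serialized)["strategy"])  # -> "group"
assert QuantizationArgs.parse_raw(serialized) == args
```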
compressed_tensors/quantization/quant_config.py
@@ -0,0 +1,171 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from enum import Enum
+from typing import Dict, List, Optional
+
+from compressed_tensors.base import QUANTIZATION_CONFIG_NAME
+from compressed_tensors.quantization.quant_scheme import QuantizationScheme
+from compressed_tensors.quantization.utils import (
+    calculate_compression_ratio,
+    is_module_quantized,
+    iter_named_leaf_modules,
+    module_type,
+)
+from pydantic import BaseModel, Field
+from torch.nn import Module
+from transformers import AutoConfig
+
+
+__all__ = [
+    "QuantizationStatus",
+    "QuantizationConfig",
+    "LIFECYCLE_ORDER",
+]
+
+
+class QuantizationStatus(str, Enum):
+    """
+    Enum storing the different states a quantized layer can be in
+
+    Initialized: scale, zero points and observers have been attached to the layer but
+        are set to dummy values (not yet calibrated)
+    Calibration: scale and zero points have been calibrated through OBCQ or a similar
+        algorithm, observers are still attached
+    Frozen: scale and zero points are finalized, observers have been deleted, weights
+        are still in their original precision
+    Compressed: weights have been converted to their target type or compressed to
+        their closest approximation
+    """
+
+    INITIALIZED = "initialized"
+    CALIBRATION = "calibration"
+    FROZEN = "frozen"
+    COMPRESSED = "compressed"
+
+    @classmethod
+    def lifecycle_order(cls) -> List["QuantizationStatus"]:
+        """
+        :return: list of the quantization statuses in correct lifecycle order
+        """
+        return LIFECYCLE_ORDER
+
+    def __ge__(self, other):
+        if not isinstance(other, self.__class__):
+            raise NotImplementedError
+        return LIFECYCLE_ORDER.index(self) >= LIFECYCLE_ORDER.index(other)
+
+
+LIFECYCLE_ORDER = [
+    QuantizationStatus.INITIALIZED,
+    QuantizationStatus.CALIBRATION,
+    QuantizationStatus.FROZEN,
+    QuantizationStatus.COMPRESSED,
+]
+
+
+class QuantizationConfig(BaseModel):
+    """
+    Full configuration specifying how a model is quantized. Each quantized layer is
+    mapped to a QuantizationScheme in config_groups.
+
+    :param config_groups: dict of QuantizationSchemes specifying the quantization
+        settings for each quantized layer
+    :param quant_method: a constant used to differentiate SparseML quantization from
+        other quantization configs
+    :param format: specifies how the quantized model is stored on disk
+    :param quantization_status: specifies the current status of all quantized layers.
+        It is assumed all layers are in the same state.
+    :param global_compression_ratio: optional informational config to report the model
+        compression ratio achieved by the quantization config
+    :param ignore: optional list of layers to ignore from config_groups. Layers in
+        this list are not quantized even if they match up with a target in
+        config_groups
+    """
+
+    config_groups: Dict[str, QuantizationScheme]
+    quant_method: str = "sparseml"
+    format: str = "fakequant"
+    quantization_status: QuantizationStatus = QuantizationStatus.INITIALIZED
+    global_compression_ratio: Optional[float] = None
+    ignore: Optional[List[str]] = Field(default_factory=list)
+
+    @staticmethod
+    def from_model_config(model_name_or_path) -> Optional["QuantizationConfig"]:
+        """
+        Given a path to a model config, extract a quantization config if it exists
+
+        :param model_name_or_path: path to model config on disk or HF hub
+        :return: instantiated QuantizationConfig if the config contains a quant
+            config, otherwise None
+        """
+        config = AutoConfig.from_pretrained(model_name_or_path)
+        quantization_config = getattr(config, QUANTIZATION_CONFIG_NAME, None)
+        if quantization_config is None:
+            return None
+
+        return QuantizationConfig.parse_obj(quantization_config)
+
+    @staticmethod
+    def from_pretrained(model: Module) -> "QuantizationConfig":
+        """
+        Converts a model into its associated QuantizationConfig based on the
+        QuantizationScheme attached to each quantized module
+
+        :param model: model to calculate quantization scheme of
+        :return: filled out QuantizationConfig for the input model
+        """
+        quant_scheme_to_layers = []
+        quantization_status = None
+        ignore = {}
+        quantization_type_names = set()
+        for name, submodule in iter_named_leaf_modules(model):
+            layer_type = module_type(submodule)
+            if not is_module_quantized(submodule):
+                if layer_type not in ignore:
+                    ignore[layer_type] = []
+                ignore[layer_type].append(name)
+            else:
+                quantization_status = submodule.quantization_status
+                scheme = submodule.quantization_scheme
+                quantization_type_names.add(layer_type)
+
+                match_found = False
+                for existing_scheme in quant_scheme_to_layers:
+                    if scheme == existing_scheme:
+                        match_found = True
+                        break
+                if not match_found:
+                    quant_scheme_to_layers.append(scheme)
+
+        # clean up the ignore list; we can leave out layer types if none of their
+        # instances are quantized
+        consolidated_ignore = []
+        for layer_type, ignore_names in ignore.items():
+            if layer_type in quantization_type_names:
+                # only specific layers of a quantized type are ignored
+                consolidated_ignore += ignore_names
+            # else we leave it off the ignore list; the type doesn't fall under any
+            # of the existing quantization schemes, so it won't be quantized anyway
+
+        config_groups = {}
+        for idx, scheme in enumerate(quant_scheme_to_layers):
+            group_name = "group_" + str(idx)
+            config_groups[group_name] = scheme
+
+        compression_ratio = calculate_compression_ratio(model)
+        return QuantizationConfig(
+            config_groups=config_groups,
+            quantization_status=quantization_status,
+            global_compression_ratio=compression_ratio,
+            ignore=consolidated_ignore,
+        )
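For reference, a hand-written config in this schema might look as follows; the target and layer names are illustrative, not taken from the package:

```python
from compressed_tensors.quantization.quant_config import QuantizationConfig

config = QuantizationConfig.parse_obj(
    {
        "config_groups": {
            "group_0": {
                "targets": ["Linear"],
                "weights": {"num_bits": 8, "symmetric": True},
            }
        },
        "quant_method": "sparseml",
        "format": "fakequant",
        "quantization_status": "calibration",
        "ignore": ["lm_head"],
    }
)
print(config.config_groups["group_0"].weights.num_bits)  # 8
```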
compressed_tensors/quantization/quant_scheme.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, Optional
+
+from compressed_tensors.quantization.quant_args import QuantizationArgs
+from pydantic import BaseModel
+
+
+__all__ = ["QuantizationScheme"]
+
+
+class QuantizationScheme(BaseModel):
+    """
+    Set of QuantizationArgs defining how the weights, inputs and outputs of a target
+    list of modules should be quantized
+
+    :param targets: list of modules to apply the QuantizationArgs to; can be layer
+        names, layer types or a regular expression
+    :param weights: quantization config for layer weights
+    :param input_activations: quantization config for layer inputs
+    :param output_activations: quantization config for layer outputs
+    """
+
+    targets: List[str]
+    weights: Optional[QuantizationArgs] = None
+    input_activations: Optional[QuantizationArgs] = None
+    output_activations: Optional[QuantizationArgs] = None
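A scheme pairing 8-bit symmetric weights with 8-bit asymmetric input activations for all Linear layers could be built like this (a sketch using only the fields defined above):

```python
from compressed_tensors.quantization.quant_args import QuantizationArgs
from compressed_tensors.quantization.quant_scheme import QuantizationScheme

scheme = QuantizationScheme(
    targets=["Linear"],
    weights=QuantizationArgs(num_bits=8, symmetric=True),
    input_activations=QuantizationArgs(num_bits=8, symmetric=False),
    # output_activations left as None -> outputs stay in full precision
)
```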
compressed_tensors/quantization/utils/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# flake8: noqa
+from .helpers import *
compressed_tensors/quantization/utils/helpers.py
@@ -0,0 +1,115 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Generator, Tuple
+
+import torch
+from torch.nn import Module
+from tqdm import tqdm
+
+
+__all__ = [
+    "is_module_quantized",
+    "is_model_quantized",
+    "iter_named_leaf_modules",
+    "module_type",
+    "calculate_compression_ratio",
+]
+
+
+def is_module_quantized(module: Module) -> bool:
+    """
+    Check if a module is quantized, based on the existence of a non-empty quantization
+    scheme
+
+    :param module: pytorch module to check
+    :return: True if module is quantized, False otherwise
+    """
+    if not hasattr(module, "quantization_scheme"):
+        return False
+
+    if module.quantization_scheme.weights is not None:
+        return True
+
+    if module.quantization_scheme.input_activations is not None:
+        return True
+
+    if module.quantization_scheme.output_activations is not None:
+        return True
+
+    return False
+
+
+def is_model_quantized(model: Module) -> bool:
+    """
+    Check if any modules in a model are quantized, based on the existence of a
+    non-empty quantization scheme in at least one module
+
+    :param model: pytorch model
+    :return: True if model is quantized, False otherwise
+    """
+
+    for _, submodule in iter_named_leaf_modules(model):
+        if is_module_quantized(submodule):
+            return True
+
+    return False
+
+
+def module_type(module: Module) -> str:
+    """
+    Gets a string representation of a module type
+
+    :param module: pytorch module to get type of
+    :return: module type as a string
+    """
+    return type(module).__name__
+
+
+def iter_named_leaf_modules(
+    model: Module,
+) -> Generator[Tuple[str, Module], None, None]:
+    # yields modules that do not have any submodules
+    # TODO: potentially expand to add list of allowed submodules such as observers
+    for name, submodule in model.named_modules():
+        if len(list(submodule.children())) == 0:
+            yield name, submodule
+
+
+def calculate_compression_ratio(model: Module) -> float:
+    """
+    Calculates the quantization compression ratio of a pytorch model, based on the
+    number of bits needed to represent the total weights in compressed form. Does not
+    take into account activation quantizations.
+
+    :param model: pytorch module to calculate compression ratio for
+    :return: compression ratio of the whole model
+    """
+    total_compressed = 0.0
+    total_uncompressed = 0.0
+    for name, submodule in tqdm(
+        iter_named_leaf_modules(model),
+        desc="Calculating quantization compression ratio",
+    ):
+        # count each leaf module's own parameters, not the whole model's
+        for parameter in submodule.parameters():
+            try:
+                uncompressed_bits = torch.finfo(parameter.dtype).bits
+            except TypeError:
+                uncompressed_bits = torch.iinfo(parameter.dtype).bits
+            compressed_bits = uncompressed_bits
+            if is_module_quantized(submodule):
+                compressed_bits = submodule.quantization_scheme.weights.num_bits
+            num_weights = parameter.numel()
+            total_compressed += compressed_bits * num_weights
+            total_uncompressed += uncompressed_bits * num_weights
+
+    return total_uncompressed / total_compressed
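As a sanity check of the arithmetic: fp32 weights (32 bits each) fully quantized to 8 bits should report 32 / 8 = 4.0. A toy sketch, where attaching `quantization_scheme` directly stands in for the package's initialization lifecycle:

```python
from torch.nn import Linear, Sequential

from compressed_tensors.quantization.quant_args import QuantizationArgs
from compressed_tensors.quantization.quant_scheme import QuantizationScheme
from compressed_tensors.quantization.utils import calculate_compression_ratio

model = Sequential(Linear(16, 16), Linear(16, 4))
scheme = QuantizationScheme(targets=["Linear"], weights=QuantizationArgs(num_bits=8))
for layer in model:
    layer.quantization_scheme = scheme  # what is_module_quantized() checks for

print(calculate_compression_ratio(model))  # 32-bit fp weights / 8-bit -> 4.0
```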
compressed_tensors/registry/__init__.py
@@ -0,0 +1,17 @@
+# flake8: noqa
+
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .registry import *