compressed-tensors 0.3.3__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to their public registry. It is provided for informational purposes only.
Files changed (42)
  1. compressed_tensors/base.py +3 -1
  2. compressed_tensors/compressors/__init__.py +9 -1
  3. compressed_tensors/compressors/base.py +12 -55
  4. compressed_tensors/compressors/dense.py +5 -5
  5. compressed_tensors/compressors/helpers.py +12 -12
  6. compressed_tensors/compressors/marlin_24.py +251 -0
  7. compressed_tensors/compressors/model_compressor.py +336 -0
  8. compressed_tensors/compressors/naive_quantized.py +144 -0
  9. compressed_tensors/compressors/pack_quantized.py +219 -0
  10. compressed_tensors/compressors/sparse_bitmask.py +4 -4
  11. compressed_tensors/config/base.py +9 -4
  12. compressed_tensors/config/dense.py +4 -4
  13. compressed_tensors/config/sparse_bitmask.py +3 -3
  14. compressed_tensors/quantization/lifecycle/__init__.py +2 -0
  15. compressed_tensors/quantization/lifecycle/apply.py +204 -31
  16. compressed_tensors/quantization/lifecycle/calibration.py +20 -1
  17. compressed_tensors/quantization/lifecycle/compressed.py +69 -0
  18. compressed_tensors/quantization/lifecycle/forward.py +214 -62
  19. compressed_tensors/quantization/lifecycle/frozen.py +4 -0
  20. compressed_tensors/quantization/lifecycle/helpers.py +53 -0
  21. compressed_tensors/quantization/lifecycle/initialize.py +62 -5
  22. compressed_tensors/quantization/observers/base.py +66 -23
  23. compressed_tensors/quantization/observers/helpers.py +69 -11
  24. compressed_tensors/quantization/observers/memoryless.py +17 -9
  25. compressed_tensors/quantization/observers/min_max.py +44 -13
  26. compressed_tensors/quantization/quant_args.py +47 -3
  27. compressed_tensors/quantization/quant_config.py +104 -23
  28. compressed_tensors/quantization/quant_scheme.py +183 -2
  29. compressed_tensors/quantization/utils/helpers.py +142 -8
  30. compressed_tensors/utils/__init__.py +4 -0
  31. compressed_tensors/utils/helpers.py +54 -7
  32. compressed_tensors/utils/offload.py +104 -0
  33. compressed_tensors/utils/permutations_24.py +65 -0
  34. compressed_tensors/utils/safetensors_load.py +3 -2
  35. compressed_tensors/utils/semi_structured_conversions.py +341 -0
  36. compressed_tensors/version.py +53 -0
  37. {compressed_tensors-0.3.3.dist-info → compressed_tensors-0.5.0.dist-info}/METADATA +47 -8
  38. compressed_tensors-0.5.0.dist-info/RECORD +48 -0
  39. {compressed_tensors-0.3.3.dist-info → compressed_tensors-0.5.0.dist-info}/WHEEL +1 -1
  40. compressed_tensors-0.3.3.dist-info/RECORD +0 -38
  41. {compressed_tensors-0.3.3.dist-info → compressed_tensors-0.5.0.dist-info}/LICENSE +0 -0
  42. {compressed_tensors-0.3.3.dist-info → compressed_tensors-0.5.0.dist-info}/top_level.txt +0 -0

compressed_tensors/compressors/sparse_bitmask.py
@@ -17,7 +17,7 @@ from typing import Dict, Generator, List, Tuple, Union
 
 import numpy
 import torch
-from compressed_tensors.compressors import ModelCompressor
+from compressed_tensors.compressors import Compressor
 from compressed_tensors.config import CompressionFormat
 from compressed_tensors.utils import get_nested_weight_mappings, merge_names
 from safetensors import safe_open
@@ -37,8 +37,8 @@ __all__ = [
 _LOGGER: logging.Logger = logging.getLogger(__name__)
 
 
-@ModelCompressor.register(name=CompressionFormat.sparse_bitmask.value)
-class BitmaskCompressor(ModelCompressor):
+@Compressor.register(name=CompressionFormat.sparse_bitmask.value)
+class BitmaskCompressor(Compressor):
     """
     Compression for sparse models using bitmasks. Non-zero weights are stored in a 1d
     values tensor, with their locations stored in a 2d bitmask
@@ -72,7 +72,7 @@ class BitmaskCompressor(ModelCompressor):
         return compressed_dict
 
     def decompress(
-        self, path_to_model_or_tensors: str, device: str = "cpu"
+        self, path_to_model_or_tensors: str, device: str = "cpu", **kwargs
    ) -> Generator[Tuple[str, Tensor], None, None]:
         """
         Reads a bitmask compressed state dict located
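
For context, a minimal usage sketch of the updated decompress() signature (it assumes BitmaskCompressor is exported from compressed_tensors.compressors and can be constructed without arguments, and that "path/to/model" points at a sparse-bitmask checkpoint; none of that is shown in this diff):

    from compressed_tensors.compressors import BitmaskCompressor

    compressor = BitmaskCompressor()  # assumption: default construction is allowed
    state_dict = {}
    # decompress() yields (parameter name, densified tensor) pairs
    for name, tensor in compressor.decompress("path/to/model", device="cpu"):
        state_dict[name] = tensor
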

compressed_tensors/config/base.py
@@ -19,17 +19,22 @@ from compressed_tensors.registry import RegistryMixin
 from pydantic import BaseModel
 
 
-__all__ = ["CompressionConfig", "CompressionFormat"]
+__all__ = ["SparsityCompressionConfig", "CompressionFormat"]
 
 
 class CompressionFormat(Enum):
-    dense_sparsity = "dense-sparsity"
+    dense = "dense"
     sparse_bitmask = "sparse-bitmask"
+    int_quantized = "int-quantized"
+    float_quantized = "float-quantized"
+    naive_quantized = "naive-quantized"
+    pack_quantized = "pack-quantized"
+    marlin_24 = "marlin-24"
 
 
-class CompressionConfig(RegistryMixin, BaseModel):
+class SparsityCompressionConfig(RegistryMixin, BaseModel):
     """
-    Base data class for storing compression parameters
+    Base data class for storing sparsity compression parameters
 
     :param format: name of compression format
     :param global_sparsity: average sparsity of the entire model
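
The enum above now carries identifiers for the quantized formats as well; a quick sketch of reading the new string values (taken directly from the hunk):

    from compressed_tensors.config import CompressionFormat

    print(CompressionFormat.pack_quantized.value)  # "pack-quantized"
    print(CompressionFormat.marlin_24.value)       # "marlin-24"
    print(CompressionFormat.dense.value)           # "dense" (was "dense-sparsity")
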

compressed_tensors/config/dense.py
@@ -14,14 +14,14 @@
 
 from typing import Optional
 
-from compressed_tensors.config import CompressionConfig, CompressionFormat
+from compressed_tensors.config import CompressionFormat, SparsityCompressionConfig
 
 
 __all__ = ["DenseSparsityConfig"]
 
 
-@CompressionConfig.register(name=CompressionFormat.dense_sparsity.value)
-class DenseSparsityConfig(CompressionConfig):
+@SparsityCompressionConfig.register(name=CompressionFormat.dense.value)
+class DenseSparsityConfig(SparsityCompressionConfig):
     """
     Identity configuration for storing a sparse model in
     an uncompressed dense format
@@ -31,6 +31,6 @@ class DenseSparsityConfig(CompressionConfig):
         "unstructured", "2:4", "8:16" etc
     """
 
-    format: str = CompressionFormat.dense_sparsity.value
+    format: str = CompressionFormat.dense.value
     global_sparsity: Optional[float] = 0.0
     sparsity_structure: Optional[str] = "unstructured"
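
A short sketch of constructing the renamed config with the fields shown above (the field values are illustrative only):

    from compressed_tensors.config import CompressionFormat, DenseSparsityConfig

    config = DenseSparsityConfig(global_sparsity=0.5, sparsity_structure="2:4")
    assert config.format == CompressionFormat.dense.value  # defaults to the new "dense" name
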

compressed_tensors/config/sparse_bitmask.py
@@ -14,14 +14,14 @@
 
 from typing import Optional
 
-from compressed_tensors.config import CompressionConfig, CompressionFormat
+from compressed_tensors.config import CompressionFormat, SparsityCompressionConfig
 
 
 __all__ = ["BitmaskConfig"]
 
 
-@CompressionConfig.register(name=CompressionFormat.sparse_bitmask.value)
-class BitmaskConfig(CompressionConfig):
+@SparsityCompressionConfig.register(name=CompressionFormat.sparse_bitmask.value)
+class BitmaskConfig(SparsityCompressionConfig):
     """
     Configuration for storing a sparse model using
     bitmask compression
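
Since the registration decorator now hangs off SparsityCompressionConfig, a registry lookup might look like the sketch below (it assumes RegistryMixin exposes a load_from_registry classmethod, which is not shown in this diff):

    from compressed_tensors.config import CompressionFormat, SparsityCompressionConfig

    # assumption: RegistryMixin provides load_from_registry(name, **constructor_kwargs)
    config = SparsityCompressionConfig.load_from_registry(
        CompressionFormat.sparse_bitmask.value, global_sparsity=0.5
    )
    print(type(config).__name__)  # expected: BitmaskConfig
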

compressed_tensors/quantization/lifecycle/__init__.py
@@ -19,4 +19,6 @@ from .calibration import *
 from .forward import *
 from .frozen import *
 from .initialize import *
+from .compressed import *
 from .apply import *
+from .helpers import *

compressed_tensors/quantization/lifecycle/apply.py
@@ -12,22 +12,38 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import logging
 import re
 from collections import OrderedDict
-from typing import Dict, Iterable, Optional
+from typing import Dict, Iterable, List, Optional
+from typing import OrderedDict as OrderedDictType
+from typing import Union
 
+import torch
 from compressed_tensors.quantization.lifecycle.calibration import (
     set_module_for_calibration,
 )
+from compressed_tensors.quantization.lifecycle.compressed import (
+    compress_quantized_weights,
+)
 from compressed_tensors.quantization.lifecycle.frozen import freeze_module_quantization
 from compressed_tensors.quantization.lifecycle.initialize import (
     initialize_module_for_quantization,
 )
+from compressed_tensors.quantization.quant_args import QuantizationArgs
 from compressed_tensors.quantization.quant_config import (
     QuantizationConfig,
     QuantizationStatus,
 )
-from compressed_tensors.quantization.utils import iter_named_leaf_modules
+from compressed_tensors.quantization.quant_scheme import QuantizationScheme
+from compressed_tensors.quantization.utils import (
+    KV_CACHE_TARGETS,
+    infer_quantization_status,
+    is_kv_cache_quant_scheme,
+    iter_named_leaf_modules,
+)
+from compressed_tensors.utils.helpers import fix_fsdp_module_name
+from compressed_tensors.utils.offload import update_parameter_data
 from compressed_tensors.utils.safetensors_load import get_safetensors_folder
 from torch.nn import Module
 
@@ -36,13 +52,16 @@ __all__ = [
     "load_pretrained_quantization",
     "apply_quantization_config",
     "apply_quantization_status",
-    "find_first_name_or_class_match",
+    "find_name_or_class_matches",
 ]
 
 from compressed_tensors.quantization.utils.helpers import is_module_quantized
 from compressed_tensors.utils.safetensors_load import get_quantization_state_dict
 
 
+_LOGGER = logging.getLogger(__name__)
+
+
 def load_pretrained_quantization(model: Module, model_name_or_path: str):
     """
     Loads the quantization parameters (scale and zero point) from model_name_or_path to
@@ -84,7 +103,7 @@ def load_pretrained_quantization(model: Module, model_name_or_path: str):
            )
 
 
-def apply_quantization_config(model: Module, config: QuantizationConfig):
+def apply_quantization_config(model: Module, config: QuantizationConfig) -> Dict:
     """
     Initializes the model for quantization in-place based on the given config
 
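
A hedged, self-contained sketch of the new return value (the config below, 8-bit weights targeting Linear layers, is an illustrative assumption and not taken from this diff):

    import torch.nn as nn
    from compressed_tensors.quantization import (
        QuantizationArgs,
        QuantizationConfig,
        QuantizationScheme,
        apply_quantization_config,
    )

    # hypothetical minimal config: 8-bit weight quantization for every Linear layer
    config = QuantizationConfig(
        config_groups={
            "group_0": QuantizationScheme(
                targets=["Linear"], weights=QuantizationArgs(num_bits=8)
            )
        },
    )
    model = nn.Sequential(nn.Linear(16, 16), nn.ReLU(), nn.Linear(16, 4))

    # apply_quantization_config now returns a mapping of quantized submodule
    # names to the QuantizationArgs used for their weights
    names_to_scheme = apply_quantization_config(model, config)
    print(list(names_to_scheme.keys()))
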
@@ -94,21 +113,73 @@ def apply_quantization_config(model: Module, config: QuantizationConfig):
     # build mapping of targets to schemes for easier matching
     # use ordered dict to preserve target ordering in config
     target_to_scheme = OrderedDict()
+    config = process_quantization_config(config)
+    names_to_scheme = OrderedDict()
     for scheme in config.config_groups.values():
         for target in scheme.targets:
             target_to_scheme[target] = scheme
 
+    # list of submodules to ignore
+    ignored_submodules = []
     # mark appropriate layers for quantization by setting their quantization schemes
     for name, submodule in iter_named_leaf_modules(model):
-        if find_first_name_or_class_match(name, submodule, config.ignore):
+        # potentially fix module name to remove FSDP wrapper prefix
+        name = fix_fsdp_module_name(name)
+        if find_name_or_class_matches(name, submodule, config.ignore):
+            ignored_submodules.append(name)
             continue  # layer matches ignore list, continue
-        target = find_first_name_or_class_match(name, submodule, target_to_scheme)
-        if target is not None:
+        targets = find_name_or_class_matches(name, submodule, target_to_scheme)
+        if targets:
             # target matched - add layer and scheme to target list
-            submodule.quantization_scheme = target_to_scheme[target]
+            submodule.quantization_scheme = _scheme_from_targets(
+                target_to_scheme, targets, name
+            )
+            names_to_scheme[name] = submodule.quantization_scheme.weights
 
+    if config.ignore is not None and ignored_submodules is not None:
+        if set(config.ignore) - set(ignored_submodules):
+            _LOGGER.warning(
+                "Some layers that were to be ignored were "
+                "not found in the model: "
+                f"{set(config.ignore) - set(ignored_submodules)}"
+            )
     # apply current quantization status across all targeted layers
+
     apply_quantization_status(model, config.quantization_status)
+    return names_to_scheme
+
+
+def process_quantization_config(config: QuantizationConfig) -> QuantizationConfig:
+    """
+    Preprocess the raw QuantizationConfig
+
+    :param config: the raw QuantizationConfig
+    :return: the processed QuantizationConfig
+    """
+    if config.kv_cache_scheme is not None:
+        config = process_kv_cache_config(config)
+
+    return config
+
+
+def process_kv_cache_config(
+    config: QuantizationConfig, targets: Union[List[str], str] = KV_CACHE_TARGETS
+) -> QuantizationConfig:
+    """
+    Reformulate the `config.kv_cache` as a `config_group`
+    and add it to the set of existing `config.groups`
+
+    :param config: the QuantizationConfig
+    :return: the QuantizationConfig with additional "kv_cache" group
+    """
+    kv_cache_dict = config.kv_cache_scheme.model_dump()
+    kv_cache_scheme = QuantizationScheme(
+        output_activations=QuantizationArgs(**kv_cache_dict),
+        targets=targets,
+    )
+    kv_cache_group = dict(kv_cache=kv_cache_scheme)
+    config.config_groups.update(kv_cache_group)
+    return config
 
 
 def apply_quantization_status(model: Module, status: QuantizationStatus):
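
A hedged sketch of what the kv-cache handling above does to a config (the constructor arguments are assumptions; the attribute names come straight from the hunk):

    from compressed_tensors.quantization import QuantizationArgs, QuantizationConfig
    from compressed_tensors.quantization.lifecycle.apply import process_quantization_config

    # hypothetical config whose kv_cache_scheme is set
    config = QuantizationConfig(
        config_groups={},
        kv_cache_scheme=QuantizationArgs(num_bits=8),
    )
    processed = process_quantization_config(config)

    # the kv cache scheme is folded into config_groups under the "kv_cache" key,
    # carried as output_activations with targets defaulting to KV_CACHE_TARGETS
    kv_scheme = processed.config_groups["kv_cache"]
    print(kv_scheme.targets, kv_scheme.output_activations)
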
@@ -118,41 +189,73 @@ def apply_quantization_status(model: Module, status: QuantizationStatus):
     :param model: model to apply quantization to
     :param status: status to update the module to
     """
-    if status >= QuantizationStatus.INITIALIZED:
+    current_status = infer_quantization_status(model)
+
+    if status >= QuantizationStatus.INITIALIZED > current_status:
         model.apply(initialize_module_for_quantization)
-    if status >= QuantizationStatus.CALIBRATION:
-        model.apply(set_module_for_calibration)
-    if status >= QuantizationStatus.FROZEN:
+
+    if current_status < status >= QuantizationStatus.CALIBRATION > current_status:
+        # only quantize weights up front when our end goal state is calibration,
+        # weight quantization parameters are already loaded for frozen/compressed
+        quantize_weights_upfront = status == QuantizationStatus.CALIBRATION
+        model.apply(
+            lambda module: set_module_for_calibration(
+                module, quantize_weights_upfront=quantize_weights_upfront
+            )
+        )
+    if current_status < status >= QuantizationStatus.FROZEN > current_status:
         model.apply(freeze_module_quantization)
 
+    if current_status < status >= QuantizationStatus.COMPRESSED > current_status:
+        model.apply(compress_quantized_weights)
 
-def find_first_name_or_class_match(
+
+def find_name_or_class_matches(
     name: str, module: Module, targets: Iterable[str], check_contains: bool = False
-) -> Optional[str]:
-    # first element of targets that matches the given name
-    # if no name matches returns first target that matches the class name
-    # returns None otherwise
-    return _find_first_match(name, targets) or _find_first_match(
-        module.__class__.__name__, targets, check_contains
-    )
+) -> List[str]:
+    """
+    Returns all targets that match the given name or the class name.
+    Returns empty list otherwise.
+    The order of the output `matches` list matters.
+    The entries are sorted in the following order:
+    1. matches on exact strings
+    2. matches on regex patterns
+    3. matches on module names
+    """
+    targets = sorted(targets, key=lambda x: ("re:" in x, x))
+    if isinstance(targets, Iterable):
+        matches = _find_matches(name, targets) + _find_matches(
+            module.__class__.__name__, targets, check_contains
+        )
+        matches = [match for match in matches if match is not None]
+        return matches
 
 
-def _find_first_match(
+def _find_matches(
     value: str, targets: Iterable[str], check_contains: bool = False
-) -> Optional[str]:
-    # returns first element of target that matches value either
+) -> List[str]:
+    # returns all the targets that match value either
     # exactly or as a regex after 're:'. if check_contains is set to True,
     # additionally checks if the target string is contained with value.
+    matches = []
     for target in targets:
         if target.startswith("re:"):
             pattern = target[3:]
             if re.match(pattern, value):
-                return target
+                matches.append(target)
         elif check_contains:
             if target.lower() in value.lower():
-                return target
+                matches.append(target)
         elif target == value:
-            return target
+            matches.append(target)
+    return matches
+
+
+def _infer_status(model: Module) -> Optional[QuantizationStatus]:
+    for module in model.modules():
+        status = getattr(module, "quantization_status", None)
+        if status is not None:
+            return status
     return None
 
 
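
A self-contained sketch of the new matching behaviour (the target strings are illustrative; the ordering follows the docstring above):

    import torch.nn as nn
    from compressed_tensors.quantization.lifecycle.apply import find_name_or_class_matches

    layer = nn.Linear(8, 8)
    targets = ["Linear", "re:model.layers.*", "model.layers.0.self_attn.q_proj"]
    matches = find_name_or_class_matches(
        "model.layers.0.self_attn.q_proj", layer, targets
    )
    # expected: the exact-name match first, then the regex target, then the
    # class-name ("Linear") match
    print(matches)
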
@@ -170,9 +273,79 @@ def _load_quant_args_from_state_dict(
     """
     scale_name = f"{base_name}_scale"
     zp_name = f"{base_name}_zero_point"
-    device = next(module.parameters()).device
 
-    scale = getattr(module, scale_name)
-    zp = getattr(module, zp_name)
-    scale.data = state_dict[f"{module_name}.{scale_name}"].to(device)
-    zp.data = state_dict[f"{module_name}.{zp_name}"].to(device)
+    state_dict_scale = state_dict.get(f"{module_name}.{scale_name}", None)
+    state_dict_zp = state_dict.get(f"{module_name}.{zp_name}", None)
+
+    if state_dict_scale is not None:
+        # module is quantized
+        update_parameter_data(module, state_dict_scale, scale_name)
+        if state_dict_zp is None:
+            # fill in zero point for symmetric quantization
+            state_dict_zp = torch.zeros_like(state_dict_scale, device="cpu")
+        update_parameter_data(module, state_dict_zp, zp_name)
+
+
+def _scheme_from_targets(
+    target_to_scheme: OrderedDictType[str, QuantizationScheme],
+    targets: List[str],
+    name: str,
+) -> QuantizationScheme:
+    if len(targets) == 1:
+        # if `targets` iterable contains a single element
+        # use it as the key
+        return target_to_scheme[targets[0]]
+
+    # otherwise, we need to merge QuantizationSchemes corresponding
+    # to multiple targets. This is most likely because `name` module
+    # is being target both as an ordinary quantization target, as well
+    # as kv cache quantization target
+    schemes_to_merge = [target_to_scheme[target] for target in targets]
+    return _merge_schemes(schemes_to_merge, name)
+
+
+def _merge_schemes(
+    schemes_to_merge: List[QuantizationScheme], name: str
+) -> QuantizationScheme:
+
+    kv_cache_quantization_scheme = [
+        scheme for scheme in schemes_to_merge if is_kv_cache_quant_scheme(scheme)
+    ]
+    if not kv_cache_quantization_scheme:
+        # if the schemes_to_merge do not contain any
+        # kv cache QuantizationScheme
+        # return the first scheme (the prioritized one,
+        # since the order of schemes_to_merge matters)
+        return schemes_to_merge[0]
+    else:
+        # fetch the kv cache QuantizationScheme and the highest
+        # priority non-kv cache QuantizationScheme and merge them
+        kv_cache_quantization_scheme = kv_cache_quantization_scheme[0]
+        quantization_scheme = [
+            scheme
+            for scheme in schemes_to_merge
+            if not is_kv_cache_quant_scheme(scheme)
+        ][0]
+        schemes_to_merge = [kv_cache_quantization_scheme, quantization_scheme]
+    merged_scheme = {}
+    for scheme in schemes_to_merge:
+        scheme_dict = {
+            k: v for k, v in scheme.model_dump().items() if v is not None
+        }
+        # when merging multiple schemes, the final target will be
+        # the `name` argument - hence erase the original targets
+        del scheme_dict["targets"]
+        # make sure that schemes do not "clash" with each other
+        overlapping_keys = set(merged_scheme.keys()) & set(scheme_dict.keys())
+        if overlapping_keys:
+            raise ValueError(
+                f"The module: {name} is being modified by two clashing "
+                f"quantization schemes, that jointly try to override "
+                f"properties: {overlapping_keys}. Fix the quantization config "
+                "so that it is not ambiguous."
+            )
+        merged_scheme.update(scheme_dict)
+
+    merged_scheme.update(targets=[name])
+
+    return QuantizationScheme(**merged_scheme)
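
The clash check in _merge_schemes boils down to plain set arithmetic over the non-None scheme fields; a self-contained illustration (the dictionary contents are made up):

    # two schemes may only be merged when they configure disjoint properties
    merged_scheme = {"output_activations": {"num_bits": 8}}
    incoming = {"weights": {"num_bits": 4}}

    overlapping_keys = set(merged_scheme.keys()) & set(incoming.keys())
    if overlapping_keys:
        raise ValueError(f"clashing properties: {overlapping_keys}")
    merged_scheme.update(incoming)  # no overlap, so the merge is unambiguous
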

compressed_tensors/quantization/lifecycle/calibration.py
@@ -16,6 +16,7 @@
 import logging
 
 from compressed_tensors.quantization.quant_config import QuantizationStatus
+from compressed_tensors.utils import is_module_offloaded, update_parameter_data
 from torch.nn import Module
 
 
@@ -27,7 +28,7 @@ __all__ = [
 _LOGGER = logging.getLogger(__name__)
 
 
-def set_module_for_calibration(module: Module):
+def set_module_for_calibration(module: Module, quantize_weights_upfront: bool = True):
     """
     marks a layer as ready for calibration which activates observers
     to update scales and zero points on each forward pass
@@ -35,6 +36,8 @@ def set_module_for_calibration(module: Module):
     apply to full model with `model.apply(set_module_for_calibration)`
 
     :param module: module to set for calibration
+    :param quantize_weights_upfront: whether to automatically run weight quantization at the
+        start of calibration
     """
     if not getattr(module, "quantization_scheme", None):
         # no quantization scheme nothing to do
@@ -48,4 +51,20 @@ def set_module_for_calibration(module: Module):
             "to re-calibrate a frozen module"
         )
 
+    if quantize_weights_upfront and module.quantization_scheme.weights is not None:
+        # set weight scale and zero_point up front, calibration data doesn't affect it
+        observer = module.weight_observer
+
+        offloaded = False
+        if is_module_offloaded(module):
+            module._hf_hook.pre_forward(module)
+            offloaded = True
+
+        scale, zero_point = observer(module.weight)
+        update_parameter_data(module, scale, "weight_scale")
+        update_parameter_data(module, zero_point, "weight_zero_point")
+
+        if offloaded:
+            module._hf_hook.post_forward(module, None)
+
     module.quantization_status = QuantizationStatus.CALIBRATION
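
A hedged sketch of driving the new keyword across a model, mirroring how apply_quantization_status invokes it in apply.py (plain torch modules without a quantization_scheme simply hit the early return above):

    import torch.nn as nn
    from compressed_tensors.quantization.lifecycle.calibration import (
        set_module_for_calibration,
    )

    model = nn.Sequential(nn.Linear(16, 16), nn.ReLU())
    model.apply(
        lambda module: set_module_for_calibration(module, quantize_weights_upfront=True)
    )
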

compressed_tensors/quantization/lifecycle/compressed.py (new file)
@@ -0,0 +1,69 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import logging
+
+import torch
+from compressed_tensors.quantization.lifecycle.forward import quantize
+from compressed_tensors.quantization.quant_config import QuantizationStatus
+from torch.nn import Module
+
+
+__all__ = [
+    "compress_quantized_weights",
+]
+
+
+_LOGGER = logging.getLogger(__name__)
+
+
+def compress_quantized_weights(module: Module):
+    """
+    Quantizes the module weight representation to use fewer bits in memory
+
+    apply to full model with `model.apply(compress_quantized_weights)`
+
+    :param module: module to compress to quantized representation
+    """
+    scheme = getattr(module, "quantization_scheme", None)
+    if not scheme or not scheme.weights:
+        # no quantization scheme or weights not quantized, nothing to do
+        return
+
+    if scheme is QuantizationStatus.COMPRESSED:
+        # module is already compressed, nothing to do
+        return
+
+    weight = getattr(module, "weight", None)
+    scale = getattr(module, "weight_scale", None)
+    zero_point = getattr(module, "weight_zero_point", None)
+
+    if weight is None or scale is None or zero_point is None:
+        # no weight, scale, or ZP, nothing to do
+
+        # mark as compressed here to maintain consistent status throughout the model
+        module.quantization_status = QuantizationStatus.COMPRESSED
+        return
+
+    module.weight.requires_grad = False  # cannot use auto grad after compression
+    module.weight.data = quantize(
+        x=weight,
+        scale=scale,
+        zero_point=zero_point,
+        args=scheme.weights,
+        dtype=torch.int8,
+    )
+
+    module.quantization_status = QuantizationStatus.COMPRESSED
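
The docstring above names `model.apply` as the intended entry point; a hedged, self-contained sketch (a plain Linear has no quantization_scheme, so it simply hits the early return):

    import torch.nn as nn
    from compressed_tensors.quantization.lifecycle.compressed import (
        compress_quantized_weights,
    )

    model = nn.Sequential(nn.Linear(16, 16))
    # on quantized modules this casts the weight data to int8 in place and
    # disables gradients; unquantized modules are left untouched
    model.apply(compress_quantized_weights)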