compressed-tensors 0.4.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff shows the contents of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- compressed_tensors/base.py +1 -0
- compressed_tensors/compressors/__init__.py +5 -1
- compressed_tensors/compressors/base.py +200 -8
- compressed_tensors/compressors/dense.py +1 -1
- compressed_tensors/compressors/marlin_24.py +11 -10
- compressed_tensors/compressors/model_compressor.py +101 -13
- compressed_tensors/compressors/naive_quantized.py +140 -0
- compressed_tensors/compressors/pack_quantized.py +128 -132
- compressed_tensors/compressors/sparse_bitmask.py +1 -1
- compressed_tensors/config/base.py +8 -1
- compressed_tensors/{compressors/utils → linear}/__init__.py +0 -6
- compressed_tensors/linear/compressed_linear.py +87 -0
- compressed_tensors/quantization/lifecycle/__init__.py +1 -0
- compressed_tensors/quantization/lifecycle/apply.py +204 -44
- compressed_tensors/quantization/lifecycle/calibration.py +22 -2
- compressed_tensors/quantization/lifecycle/compressed.py +3 -1
- compressed_tensors/quantization/lifecycle/forward.py +139 -61
- compressed_tensors/quantization/lifecycle/helpers.py +80 -0
- compressed_tensors/quantization/lifecycle/initialize.py +77 -13
- compressed_tensors/quantization/observers/__init__.py +1 -0
- compressed_tensors/quantization/observers/base.py +93 -14
- compressed_tensors/quantization/observers/helpers.py +64 -11
- compressed_tensors/quantization/observers/min_max.py +8 -0
- compressed_tensors/quantization/observers/mse.py +162 -0
- compressed_tensors/quantization/quant_args.py +139 -23
- compressed_tensors/quantization/quant_config.py +35 -2
- compressed_tensors/quantization/quant_scheme.py +112 -13
- compressed_tensors/quantization/utils/helpers.py +68 -2
- compressed_tensors/utils/__init__.py +5 -0
- compressed_tensors/utils/helpers.py +44 -2
- compressed_tensors/utils/offload.py +116 -0
- compressed_tensors/utils/permute.py +70 -0
- compressed_tensors/utils/safetensors_load.py +2 -0
- compressed_tensors/{compressors/utils → utils}/semi_structured_conversions.py +1 -0
- compressed_tensors/version.py +1 -1
- {compressed_tensors-0.4.0.dist-info → compressed_tensors-0.6.0.dist-info}/METADATA +35 -22
- compressed_tensors-0.6.0.dist-info/RECORD +52 -0
- {compressed_tensors-0.4.0.dist-info → compressed_tensors-0.6.0.dist-info}/WHEEL +1 -1
- compressed_tensors/compressors/int_quantized.py +0 -126
- compressed_tensors/compressors/utils/helpers.py +0 -43
- compressed_tensors-0.4.0.dist-info/RECORD +0 -48
- /compressed_tensors/{compressors/utils → utils}/permutations_24.py +0 -0
- {compressed_tensors-0.4.0.dist-info → compressed_tensors-0.6.0.dist-info}/LICENSE +0 -0
- {compressed_tensors-0.4.0.dist-info → compressed_tensors-0.6.0.dist-info}/top_level.txt +0 -0
@@ -14,10 +14,14 @@
 
 import logging
 import re
-from collections import OrderedDict
-from
+from collections import OrderedDict, defaultdict
+from copy import deepcopy
+from typing import Dict, Iterable, List, Optional
+from typing import OrderedDict as OrderedDictType
+from typing import Union
 
 import torch
+from compressed_tensors.config import CompressionFormat
 from compressed_tensors.quantization.lifecycle.calibration import (
     set_module_for_calibration,
 )
@@ -28,15 +32,20 @@ from compressed_tensors.quantization.lifecycle.frozen import freeze_module_quant
 from compressed_tensors.quantization.lifecycle.initialize import (
     initialize_module_for_quantization,
 )
+from compressed_tensors.quantization.quant_args import QuantizationArgs
 from compressed_tensors.quantization.quant_config import (
     QuantizationConfig,
     QuantizationStatus,
 )
+from compressed_tensors.quantization.quant_scheme import QuantizationScheme
 from compressed_tensors.quantization.utils import (
+    KV_CACHE_TARGETS,
     infer_quantization_status,
+    is_kv_cache_quant_scheme,
     iter_named_leaf_modules,
 )
-from compressed_tensors.utils.helpers import fix_fsdp_module_name
+from compressed_tensors.utils.helpers import fix_fsdp_module_name, replace_module
+from compressed_tensors.utils.offload import update_parameter_data
 from compressed_tensors.utils.safetensors_load import get_safetensors_folder
 from torch.nn import Module
 
@@ -45,7 +54,7 @@ __all__ = [
     "load_pretrained_quantization",
     "apply_quantization_config",
     "apply_quantization_status",
-    "
+    "find_name_or_class_matches",
 ]
 
 from compressed_tensors.quantization.utils.helpers import is_module_quantized
@@ -96,33 +105,64 @@ def load_pretrained_quantization(model: Module, model_name_or_path: str):
         )
 
 
-def apply_quantization_config(model: Module, config: QuantizationConfig):
+def apply_quantization_config(
+    model: Module, config: QuantizationConfig, run_compressed: bool = False
+) -> Dict:
     """
     Initializes the model for quantization in-place based on the given config
 
     :param model: model to apply quantization config to
     :param config: quantization config
+    :param run_compressed: Whether the model will be run in compressed mode or
+        decompressed fully on load
     """
+    # remove reference to the original `config`
+    # argument. This function can mutate it, and we'd
+    # like to keep the original `config` as it is.
+    config = deepcopy(config)
     # build mapping of targets to schemes for easier matching
     # use ordered dict to preserve target ordering in config
     target_to_scheme = OrderedDict()
+    config = process_quantization_config(config)
+    names_to_scheme = OrderedDict()
     for scheme in config.config_groups.values():
         for target in scheme.targets:
             target_to_scheme[target] = scheme
 
+    if run_compressed:
+        from compressed_tensors.linear.compressed_linear import CompressedLinear
+
     # list of submodules to ignore
-    ignored_submodules =
+    ignored_submodules = defaultdict(list)
     # mark appropriate layers for quantization by setting their quantization schemes
     for name, submodule in iter_named_leaf_modules(model):
         # potentially fix module name to remove FSDP wrapper prefix
         name = fix_fsdp_module_name(name)
-        if
-
+        if matches := find_name_or_class_matches(name, submodule, config.ignore):
+            for match in matches:
+                ignored_submodules[match].append(name)
             continue # layer matches ignore list, continue
-
-        if
+        targets = find_name_or_class_matches(name, submodule, target_to_scheme)
+        if targets:
+            scheme = _scheme_from_targets(target_to_scheme, targets, name)
+            if run_compressed:
+                format = config.format
+                if format != CompressionFormat.dense.value:
+                    if isinstance(submodule, torch.nn.Linear):
+                        # TODO: expand to more module types
+                        compressed_linear = CompressedLinear.from_linear(
+                            submodule,
+                            quantization_scheme=scheme,
+                            quantization_format=format,
+                        )
+                        replace_module(model, name, compressed_linear)
+
             # target matched - add layer and scheme to target list
-            submodule.quantization_scheme =
+            submodule.quantization_scheme = _scheme_from_targets(
+                target_to_scheme, targets, name
+            )
+
+            names_to_scheme[name] = submodule.quantization_scheme.weights
 
     if config.ignore is not None and ignored_submodules is not None:
         if set(config.ignore) - set(ignored_submodules):
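The new `run_compressed` path above swaps each matching `torch.nn.Linear` for a `CompressedLinear` built via `CompressedLinear.from_linear` whenever the config format is not dense. A minimal usage sketch follows; it is a hedged illustration rather than documented API usage, assuming `apply_quantization_config` is importable from the module shown in the file list and that the caller already has a loaded model and a parsed config:

```python
from typing import Dict

from torch.nn import Module

from compressed_tensors.quantization.lifecycle.apply import apply_quantization_config
from compressed_tensors.quantization.quant_config import QuantizationConfig


def load_for_compressed_inference(model: Module, config: QuantizationConfig) -> Dict:
    # run_compressed=True keeps quantized Linear layers in their compressed form;
    # the returned dict maps module names to their weight quantization args
    return apply_quantization_config(model, config, run_compressed=True)
```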
@@ -131,8 +171,43 @@ def apply_quantization_config(model: Module, config: QuantizationConfig):
                 "not found in the model: "
                 f"{set(config.ignore) - set(ignored_submodules)}"
             )
+
     # apply current quantization status across all targeted layers
     apply_quantization_status(model, config.quantization_status)
+    return names_to_scheme
+
+
+def process_quantization_config(config: QuantizationConfig) -> QuantizationConfig:
+    """
+    Preprocess the raw QuantizationConfig
+
+    :param config: the raw QuantizationConfig
+    :return: the processed QuantizationConfig
+    """
+    if config.kv_cache_scheme is not None:
+        config = process_kv_cache_config(config)
+
+    return config
+
+
+def process_kv_cache_config(
+    config: QuantizationConfig, targets: Union[List[str], str] = KV_CACHE_TARGETS
+) -> QuantizationConfig:
+    """
+    Reformulate the `config.kv_cache` as a `config_group`
+    and add it to the set of existing `config.groups`
+
+    :param config: the QuantizationConfig
+    :return: the QuantizationConfig with additional "kv_cache" group
+    """
+    kv_cache_dict = config.kv_cache_scheme.model_dump()
+    kv_cache_scheme = QuantizationScheme(
+        output_activations=QuantizationArgs(**kv_cache_dict),
+        targets=targets,
+    )
+    kv_cache_group = dict(kv_cache=kv_cache_scheme)
+    config.config_groups.update(kv_cache_group)
+    return config
 
 
 def apply_quantization_status(model: Module, status: QuantizationStatus):
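`process_kv_cache_config` above rewrites `config.kv_cache_scheme` as an extra "kv_cache" entry in `config_groups`, targeting `KV_CACHE_TARGETS` with output-activation quantization. A standalone sketch of that reshaping with plain dicts (the target pattern and field values below are hypothetical, chosen only to illustrate the shape of the result):

```python
# stand-in for config.kv_cache_scheme.model_dump(); values are hypothetical
kv_cache_scheme = {"num_bits": 8, "type": "int", "symmetric": True}

# stand-in for the real KV_CACHE_TARGETS constant; the actual patterns may differ
KV_CACHE_TARGETS = ["re:.*self_attn$"]

# existing config groups (hypothetical weight-only group)
config_groups = {"group_0": {"targets": ["Linear"], "weights": {"num_bits": 4}}}

# mirror of process_kv_cache_config: the kv cache scheme becomes an
# output_activations scheme under a new "kv_cache" group
config_groups["kv_cache"] = {
    "targets": KV_CACHE_TARGETS,
    "output_activations": dict(kv_cache_scheme),
}
print(config_groups["kv_cache"])
```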
@@ -145,10 +220,22 @@ def apply_quantization_status(model: Module, status: QuantizationStatus):
     current_status = infer_quantization_status(model)
 
     if status >= QuantizationStatus.INITIALIZED > current_status:
-
+        force_zero_point_init = status != QuantizationStatus.COMPRESSED
+        model.apply(
+            lambda module: initialize_module_for_quantization(
+                module, force_zero_point=force_zero_point_init
+            )
+        )
 
     if current_status < status >= QuantizationStatus.CALIBRATION > current_status:
-
+        # only quantize weights up front when our end goal state is calibration,
+        # weight quantization parameters are already loaded for frozen/compressed
+        quantize_weights_upfront = status == QuantizationStatus.CALIBRATION
+        model.apply(
+            lambda module: set_module_for_calibration(
+                module, quantize_weights_upfront=quantize_weights_upfront
+            )
+        )
     if current_status < status >= QuantizationStatus.FROZEN > current_status:
         model.apply(freeze_module_quantization)
 
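`apply_quantization_status` gates each transition with chained comparisons such as `current_status < status >= QuantizationStatus.CALIBRATION > current_status`. A standalone sketch of how that chain evaluates, using an `IntEnum` stand-in for the status ordering (the real `QuantizationStatus` defines its own comparison operators):

```python
from enum import IntEnum


class Status(IntEnum):
    # hypothetical stand-in mirroring the lifecycle order implied above
    INITIALIZED = 1
    CALIBRATION = 2
    FROZEN = 3
    COMPRESSED = 4


current_status = Status.INITIALIZED   # where the model currently is
status = Status.CALIBRATION           # the status being applied

# True only when the requested status is at least CALIBRATION
# and the model has not yet reached CALIBRATION
print(current_status < status >= Status.CALIBRATION > current_status)  # True

# already calibrated: the same gate is False, so the step is skipped
current_status = Status.CALIBRATION
print(current_status < status >= Status.CALIBRATION > current_status)  # False
```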
@@ -156,36 +243,45 @@ def apply_quantization_status(model: Module, status: QuantizationStatus):
         model.apply(compress_quantized_weights)
 
 
-def
+def find_name_or_class_matches(
     name: str, module: Module, targets: Iterable[str], check_contains: bool = False
-) ->
-
-
-
+) -> List[str]:
+    """
+    Returns all targets that match the given name or the class name.
+    Returns empty list otherwise.
+    The order of the output `matches` list matters.
+    The entries are sorted in the following order:
+        1. matches on exact strings
+        2. matches on regex patterns
+        3. matches on module names
+    """
+    targets = sorted(targets, key=lambda x: ("re:" in x, x))
     if isinstance(targets, Iterable):
-
+        matches = _find_matches(name, targets) + _find_matches(
             module.__class__.__name__, targets, check_contains
         )
+        matches = [match for match in matches if match is not None]
+        return matches
 
 
-def
+def _find_matches(
     value: str, targets: Iterable[str], check_contains: bool = False
-) ->
-    # returns
+) -> List[str]:
+    # returns all the targets that match value either
     # exactly or as a regex after 're:'. if check_contains is set to True,
     # additionally checks if the target string is contained with value.
-
+    matches = []
     for target in targets:
         if target.startswith("re:"):
             pattern = target[3:]
             if re.match(pattern, value):
-
+                matches.append(target)
         elif check_contains:
             if target.lower() in value.lower():
-
+                matches.append(target)
         elif target == value:
-
-    return
+            matches.append(target)
+    return matches
 
 
 def _infer_status(model: Module) -> Optional[QuantizationStatus]:
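`find_name_or_class_matches` collects every target that hits either the module's name or its class name, with exact-string targets sorted ahead of "re:" regex targets. A standalone sketch mirroring `_find_matches` from the hunk above (not importing the package), with a small worked example:

```python
import re
from typing import Iterable, List


def _find_matches(value: str, targets: Iterable[str], check_contains: bool = False) -> List[str]:
    # mirror of the helper above: exact match, "re:" regex match, or (optionally) substring match
    matches = []
    for target in targets:
        if target.startswith("re:"):
            if re.match(target[3:], value):
                matches.append(target)
        elif check_contains:
            if target.lower() in value.lower():
                matches.append(target)
        elif target == value:
            matches.append(target)
    return matches


targets = ["re:.*q_proj", "Linear", "model.layers.0.self_attn.q_proj"]
# exact-string targets sort ahead of "re:" targets, matching the priority described above
targets = sorted(targets, key=lambda x: ("re:" in x, x))

name_matches = _find_matches("model.layers.0.self_attn.q_proj", targets)
class_matches = _find_matches("Linear", targets)  # class-name match for an nn.Linear submodule
print(name_matches + class_matches)
# ['model.layers.0.self_attn.q_proj', 're:.*q_proj', 'Linear']
```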
@@ -210,20 +306,84 @@ def _load_quant_args_from_state_dict(
     """
     scale_name = f"{base_name}_scale"
     zp_name = f"{base_name}_zero_point"
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    g_idx_name = f"{base_name}_g_idx"
+
+    state_dict_scale = state_dict.get(f"{module_name}.{scale_name}", None)
+    state_dict_zp = state_dict.get(f"{module_name}.{zp_name}", None)
+    state_dict_g_idx = state_dict.get(f"{module_name}.{g_idx_name}", None)
+
+    if state_dict_scale is not None:
+        # module is quantized
+        update_parameter_data(module, state_dict_scale, scale_name)
+        if state_dict_zp is None:
+            # fill in zero point for symmetric quantization
+            state_dict_zp = torch.zeros_like(state_dict_scale, device="cpu")
+        update_parameter_data(module, state_dict_zp, zp_name)
+
+    if state_dict_g_idx is not None:
+        update_parameter_data(module, state_dict_g_idx, g_idx_name)
+
+
+def _scheme_from_targets(
+    target_to_scheme: OrderedDictType[str, QuantizationScheme],
+    targets: List[str],
+    name: str,
+) -> QuantizationScheme:
+    if len(targets) == 1:
+        # if `targets` iterable contains a single element
+        # use it as the key
+        return target_to_scheme[targets[0]]
+
+    # otherwise, we need to merge QuantizationSchemes corresponding
+    # to multiple targets. This is most likely because `name` module
+    # is being target both as an ordinary quantization target, as well
+    # as kv cache quantization target
+    schemes_to_merge = [target_to_scheme[target] for target in targets]
+    return _merge_schemes(schemes_to_merge, name)
+
+
+def _merge_schemes(
+    schemes_to_merge: List[QuantizationScheme], name: str
+) -> QuantizationScheme:
+
+    kv_cache_quantization_scheme = [
+        scheme for scheme in schemes_to_merge if is_kv_cache_quant_scheme(scheme)
+    ]
+    if not kv_cache_quantization_scheme:
+        # if the schemes_to_merge do not contain any
+        # kv cache QuantizationScheme
+        # return the first scheme (the prioritized one,
+        # since the order of schemes_to_merge matters)
+        return schemes_to_merge[0]
+    else:
+        # fetch the kv cache QuantizationScheme and the highest
+        # priority non-kv cache QuantizationScheme and merge them
+        kv_cache_quantization_scheme = kv_cache_quantization_scheme[0]
+        quantization_scheme = [
+            scheme
+            for scheme in schemes_to_merge
+            if not is_kv_cache_quant_scheme(scheme)
+        ][0]
+        schemes_to_merge = [kv_cache_quantization_scheme, quantization_scheme]
+    merged_scheme = {}
+    for scheme in schemes_to_merge:
+        scheme_dict = {
+            k: v for k, v in scheme.model_dump().items() if v is not None
+        }
+        # when merging multiple schemes, the final target will be
+        # the `name` argument - hence erase the original targets
+        del scheme_dict["targets"]
+        # make sure that schemes do not "clash" with each other
+        overlapping_keys = set(merged_scheme.keys()) & set(scheme_dict.keys())
+        if overlapping_keys:
+            raise ValueError(
+                f"The module: {name} is being modified by two clashing "
+                f"quantization schemes, that jointly try to override "
+                f"properties: {overlapping_keys}. Fix the quantization config "
+                "so that it is not ambiguous."
+            )
+        merged_scheme.update(scheme_dict)
+
+    merged_scheme.update(targets=[name])
+
+    return QuantizationScheme(**merged_scheme)
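`_merge_schemes` combines a kv-cache scheme with the regular scheme that also targets the same module, refusing to merge when both set the same field. A standalone sketch of that merge and clash check using plain dicts in place of `QuantizationScheme` objects (field names and the module name below are illustrative):

```python
schemes_to_merge = [
    {"targets": ["re:.*self_attn$"], "output_activations": {"num_bits": 8}},  # kv cache scheme
    {"targets": ["Linear"], "weights": {"num_bits": 4}},                      # weight scheme
]

merged_scheme = {}
for scheme in schemes_to_merge:
    scheme_dict = {k: v for k, v in scheme.items() if v is not None}
    del scheme_dict["targets"]  # the merged scheme is re-targeted at the module name
    overlapping_keys = set(merged_scheme) & set(scheme_dict)
    if overlapping_keys:
        raise ValueError(f"clashing properties: {overlapping_keys}")
    merged_scheme.update(scheme_dict)

merged_scheme["targets"] = ["model.layers.0.self_attn"]  # hypothetical module name
print(merged_scheme)
# {'output_activations': {'num_bits': 8}, 'weights': {'num_bits': 4},
#  'targets': ['model.layers.0.self_attn']}
```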
@@ -16,6 +16,7 @@
 import logging
 
 from compressed_tensors.quantization.quant_config import QuantizationStatus
+from compressed_tensors.utils import is_module_offloaded, update_parameter_data
 from torch.nn import Module
 
 
@@ -27,7 +28,7 @@ __all__ = [
 _LOGGER = logging.getLogger(__name__)
 
 
-def set_module_for_calibration(module: Module):
+def set_module_for_calibration(module: Module, quantize_weights_upfront: bool = True):
     """
     marks a layer as ready for calibration which activates observers
     to update scales and zero points on each forward pass
@@ -35,17 +36,36 @@ def set_module_for_calibration(module: Module):
     apply to full model with `model.apply(set_module_for_calibration)`
 
     :param module: module to set for calibration
+    :param quantize_weights_upfront: whether to automatically
+        run weight quantization at the start of calibration
     """
     if not getattr(module, "quantization_scheme", None):
         # no quantization scheme nothing to do
         return
     status = getattr(module, "quantization_status", None)
     if not status or status != QuantizationStatus.INITIALIZED:
-
+        _LOGGER.warning(
             f"Attempting set module with status {status} to calibration mode. "
             f"but status is not {QuantizationStatus.INITIALIZED} - you may "
             "be calibrating an uninitialized module which may fail or attempting "
             "to re-calibrate a frozen module"
         )
 
+    if quantize_weights_upfront and module.quantization_scheme.weights is not None:
+        # set weight scale and zero_point up front, calibration data doesn't affect it
+        observer = module.weight_observer
+        g_idx = getattr(module, "weight_g_idx", None)
+
+        offloaded = False
+        if is_module_offloaded(module):
+            module._hf_hook.pre_forward(module)
+            offloaded = True
+
+        scale, zero_point = observer(module.weight, g_idx=g_idx)
+        update_parameter_data(module, scale, "weight_scale")
+        update_parameter_data(module, zero_point, "weight_zero_point")
+
+        if offloaded:
+            module._hf_hook.post_forward(module, None)
+
     module.quantization_status = QuantizationStatus.CALIBRATION
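With the new `quantize_weights_upfront` flag, weight scales and zero points are computed directly from the weights when calibration begins, so calibration data only drives the activation observers. A hedged usage sketch, mirroring the `model.apply(...)` call that `apply_quantization_status` issues above (import path taken from the file list):

```python
from torch.nn import Module

from compressed_tensors.quantization.lifecycle.calibration import (
    set_module_for_calibration,
)


def start_calibration(model: Module) -> None:
    # quantize_weights_upfront=True computes weight scale/zero_point immediately;
    # modules without a quantization_scheme are skipped by the helper itself
    model.apply(
        lambda module: set_module_for_calibration(module, quantize_weights_upfront=True)
    )
```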
@@ -49,8 +49,9 @@ def compress_quantized_weights(module: Module):
     weight = getattr(module, "weight", None)
     scale = getattr(module, "weight_scale", None)
     zero_point = getattr(module, "weight_zero_point", None)
+    g_idx = getattr(module, "weight_g_idx", None)
 
-    if weight is None or scale is None
+    if weight is None or scale is None:
         # no weight, scale, or ZP, nothing to do
 
         # mark as compressed here to maintain consistent status throughout the model
@@ -62,6 +63,7 @@ def compress_quantized_weights(module: Module):
         x=weight,
         scale=scale,
         zero_point=zero_point,
+        g_idx=g_idx,
         args=scheme.weights,
         dtype=torch.int8,
     )
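After calibration, the same lifecycle helpers freeze observers and compress the quantized weights; `compress_quantized_weights` now also passes `g_idx` through to the quantize call, as shown above. A hedged sketch of that final step (import paths inferred from the file list; `freeze_module_quantization` and `compress_quantized_weights` are the helpers referenced in the hunks above):

```python
from torch.nn import Module

from compressed_tensors.quantization.lifecycle.compressed import (
    compress_quantized_weights,
)
from compressed_tensors.quantization.lifecycle.frozen import freeze_module_quantization


def freeze_and_compress(model: Module) -> None:
    # freeze observers first, then convert weights to their quantized int8 representation
    model.apply(freeze_module_quantization)
    model.apply(compress_quantized_weights)
```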