compressed-tensors 0.5.0__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- compressed_tensors/__init__.py +1 -0
- compressed_tensors/base.py +2 -0
- compressed_tensors/compressors/__init__.py +6 -12
- compressed_tensors/compressors/base.py +137 -9
- compressed_tensors/compressors/helpers.py +6 -6
- compressed_tensors/compressors/model_compressors/__init__.py +17 -0
- compressed_tensors/compressors/{model_compressor.py → model_compressors/model_compressor.py} +99 -43
- compressed_tensors/compressors/quantized_compressors/__init__.py +18 -0
- compressed_tensors/compressors/{naive_quantized.py → quantized_compressors/base.py} +64 -62
- compressed_tensors/compressors/quantized_compressors/naive_quantized.py +140 -0
- compressed_tensors/compressors/quantized_compressors/pack_quantized.py +211 -0
- compressed_tensors/compressors/sparse_compressors/__init__.py +18 -0
- compressed_tensors/compressors/sparse_compressors/base.py +110 -0
- compressed_tensors/compressors/{dense.py → sparse_compressors/dense.py} +3 -3
- compressed_tensors/compressors/{sparse_bitmask.py → sparse_compressors/sparse_bitmask.py} +14 -59
- compressed_tensors/compressors/sparse_quantized_compressors/__init__.py +16 -0
- compressed_tensors/compressors/{marlin_24.py → sparse_quantized_compressors/marlin_24.py} +3 -3
- compressed_tensors/config/base.py +6 -1
- compressed_tensors/linear/__init__.py +13 -0
- compressed_tensors/linear/compressed_linear.py +87 -0
- compressed_tensors/quantization/__init__.py +1 -0
- compressed_tensors/quantization/cache.py +201 -0
- compressed_tensors/quantization/lifecycle/apply.py +63 -9
- compressed_tensors/quantization/lifecycle/calibration.py +7 -7
- compressed_tensors/quantization/lifecycle/compressed.py +3 -1
- compressed_tensors/quantization/lifecycle/forward.py +126 -44
- compressed_tensors/quantization/lifecycle/frozen.py +6 -1
- compressed_tensors/quantization/lifecycle/helpers.py +0 -20
- compressed_tensors/quantization/lifecycle/initialize.py +138 -55
- compressed_tensors/quantization/observers/__init__.py +1 -0
- compressed_tensors/quantization/observers/base.py +54 -14
- compressed_tensors/quantization/observers/min_max.py +8 -0
- compressed_tensors/quantization/observers/mse.py +162 -0
- compressed_tensors/quantization/quant_args.py +102 -24
- compressed_tensors/quantization/quant_config.py +14 -2
- compressed_tensors/quantization/quant_scheme.py +12 -13
- compressed_tensors/quantization/utils/helpers.py +44 -19
- compressed_tensors/utils/__init__.py +1 -0
- compressed_tensors/utils/helpers.py +30 -1
- compressed_tensors/utils/offload.py +14 -2
- compressed_tensors/utils/permute.py +70 -0
- compressed_tensors/utils/safetensors_load.py +2 -0
- compressed_tensors/utils/semi_structured_conversions.py +1 -0
- compressed_tensors/version.py +1 -1
- {compressed_tensors-0.5.0.dist-info → compressed_tensors-0.7.0.dist-info}/METADATA +35 -23
- compressed_tensors-0.7.0.dist-info/RECORD +59 -0
- {compressed_tensors-0.5.0.dist-info → compressed_tensors-0.7.0.dist-info}/WHEEL +1 -1
- compressed_tensors/compressors/pack_quantized.py +0 -219
- compressed_tensors-0.5.0.dist-info/RECORD +0 -48
- {compressed_tensors-0.5.0.dist-info → compressed_tensors-0.7.0.dist-info}/LICENSE +0 -0
- {compressed_tensors-0.5.0.dist-info → compressed_tensors-0.7.0.dist-info}/top_level.txt +0 -0
compressed_tensors/quantization/quant_args.py CHANGED

@@ -13,10 +13,10 @@
 # limitations under the License.
 
 from enum import Enum
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional, Union
 
 import torch
-from pydantic import BaseModel, Field,
+from pydantic import BaseModel, Field, field_validator, model_validator
 
 
 __all__ = [
@@ -25,6 +25,7 @@ __all__ = [
     "QuantizationStrategy",
     "QuantizationArgs",
     "round_to_quantized_type",
+    "ActivationOrdering",
 ]
 
 FP8_DTYPE = torch.float8_e4m3fn
@@ -51,6 +52,19 @@ class QuantizationStrategy(str, Enum):
     TOKEN = "token"
 
 
+class ActivationOrdering(str, Enum):
+    """
+    Enum storing strategies for activation ordering
+
+    Group: reorder groups and weight\n
+    Weight: only reorder weight, not groups. Slightly lower latency and
+    accuracy compared to group actorder\n
+    """
+
+    GROUP = "group"
+    WEIGHT = "weight"
+
+
 class QuantizationArgs(BaseModel, use_enum_values=True):
     """
     User facing arguments used to define a quantization config for weights or
@@ -68,15 +82,18 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         ranges will be observed with every sample. Defaults to False for static
         quantization. Note that enabling dynamic quantization will change the default
         observer to a memoryless one
+    :param actorder: whether to apply group quantization in decreasing order of
+        activation. Defaults to None for arbitrary ordering
     """
 
     num_bits: int = 8
-    type: QuantizationType = QuantizationType.INT
+    type: QuantizationType = QuantizationType.INT
     symmetric: bool = True
     group_size: Optional[int] = None
     strategy: Optional[QuantizationStrategy] = None
     block_structure: Optional[str] = None
     dynamic: bool = False
+    actorder: Union[ActivationOrdering, bool, None] = None
     observer: str = Field(
         default="minmax",
         description=(
@@ -98,41 +115,102 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         """
         from compressed_tensors.quantization.observers.base import Observer
 
-        if self.
+        if self.dynamic:
             # override defualt observer for dynamic, you never want minmax which
             # keeps state across samples for dynamic
             self.observer = "memoryless"
 
         return Observer.load_from_registry(self.observer, quantization_args=self)
 
-
-
-
-
-        # use group_size to determinine strategy if not given explicity
-        if group_size is not None and value is None:
-            if group_size > 0:
-                return QuantizationStrategy.GROUP
+    def get_kv_cache(self):
+        """Get the singleton KV Cache"""
+        from compressed_tensors.quantization.cache import QuantizedKVParameterCache
 
-
-                return QuantizationStrategy.CHANNEL
+        return QuantizedKVParameterCache(self)
 
-
-
-
-
-                    "group_size = -1 for 'channel'"
-                )
+    @field_validator("type", mode="before")
+    def validate_type(cls, value) -> QuantizationType:
+        if isinstance(value, str):
+            return QuantizationType(value.lower())
 
-
-            if group_size is None:
-                raise ValueError(f"strategy {value} requires group_size to be set.")
+        return value
 
+    @field_validator("group_size", mode="before")
+    def validate_group(cls, value) -> Union[int, None]:
         if value is None:
-            return
+            return value
+
+        if value < -1:
+            raise ValueError(
+                f"Invalid group size {value}. Use group_size > 0 for "
+                "strategy='group' and group_size = -1 for 'channel'"
+            )
+
+        return value
+
+    @field_validator("strategy", mode="before")
+    def validate_strategy(cls, value) -> Union[QuantizationStrategy, None]:
+        if isinstance(value, str):
+            return QuantizationStrategy(value.lower())
+
+        return value
+
+    @field_validator("actorder", mode="before")
+    def validate_actorder(cls, value) -> Optional[ActivationOrdering]:
+        if isinstance(value, bool):
+            return ActivationOrdering.GROUP if value else None
+
+        if isinstance(value, str):
+            return ActivationOrdering(value.lower())
 
         return value
 
+    @model_validator(mode="after")
+    def validate_model_after(model: "QuantizationArgs") -> Dict[str, Any]:
+        # extract user-passed values from dictionary
+        strategy = model.strategy
+        group_size = model.group_size
+        actorder = model.actorder
+
+        # infer strategy
+        if strategy is None:
+            if group_size is None:
+                strategy = QuantizationStrategy.TENSOR
+            elif group_size > 0:
+                strategy = QuantizationStrategy.GROUP
+            elif group_size == -1:
+                strategy = QuantizationStrategy.CHANNEL
+            else:
+                raise ValueError(
+                    f"Invalid group size {group_size}. Use group_size > 0 for "
+                    "strategy='group' and group_size = -1 for 'channel'"
+                )
+
+        # validate strategy and group
+        if strategy == QuantizationStrategy.GROUP:
+            if group_size is None or group_size <= 0:
+                raise ValueError(
+                    f"strategy {strategy} requires group_size to be "
+                    "set to a positive value"
+                )
+        if (
+            group_size is not None
+            and group_size > 0
+            and strategy != QuantizationStrategy.GROUP
+        ):
+            raise ValueError("group_size requires strategy to be set to 'group'")
+
+        # validate activation ordering and strategy
+        if actorder is not None and strategy != QuantizationStrategy.GROUP:
+            raise ValueError(
+                "Must use group quantization strategy in order to apply "
+                "activation ordering"
+            )
+
+        # write back modified values
+        model.strategy = strategy
+        return model
+
     def pytorch_dtype(self) -> torch.dtype:
         if self.type == QuantizationType.FLOAT:
             return FP8_DTYPE

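For orientation only: a minimal sketch of how the reworked QuantizationArgs validation behaves, assuming nothing beyond the hunks above (values and printed expectations are illustrative, not captured output).

from compressed_tensors.quantization.quant_args import (
    ActivationOrdering,
    QuantizationArgs,
    QuantizationStrategy,
)

# validate_model_after infers the strategy from group_size:
# group_size > 0 -> GROUP, group_size == -1 -> CHANNEL, unset -> TENSOR
args = QuantizationArgs(num_bits=4, group_size=128, actorder=True)
print(args.strategy == QuantizationStrategy.GROUP)  # True
print(args.actorder == ActivationOrdering.GROUP)    # True, bool True normalizes to GROUP

# activation ordering without group quantization is rejected by the model validator
try:
    QuantizationArgs(num_bits=8, group_size=-1, actorder="weight")
except Exception as err:
    print(type(err).__name__)  # pydantic validation error wrapping the ValueError above
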
compressed_tensors/quantization/quant_config.py CHANGED

@@ -24,7 +24,7 @@ from compressed_tensors.quantization.quant_scheme import (
 from compressed_tensors.quantization.utils import (
     calculate_compression_ratio,
     is_module_quantized,
-
+    iter_named_quantizable_modules,
     module_type,
     parse_out_kv_cache_args,
 )
@@ -177,7 +177,9 @@ class QuantizationConfig(BaseModel):
         quantization_status = None
         ignore = {}
         quantization_type_names = set()
-        for name, submodule in
+        for name, submodule in iter_named_quantizable_modules(
+            model, include_children=True, include_attn=True
+        ):
             layer_type = module_type(submodule)
             if not is_module_quantized(submodule):
                 if layer_type not in ignore:
@@ -199,6 +201,13 @@ class QuantizationConfig(BaseModel):
         if len(quant_scheme_to_layers) == 0:  # No quantized layers
             return None
 
+        # kv-cache only, no weight/activation quantization
+        if (
+            len(quantization_type_names) == 1
+            and "attention" in list(quantization_type_names)[0].lower()
+        ):
+            quantization_type_names.add("Linear")
+
         # clean up ignore list, we can leave out layers types if none of the
         # instances are quantized
         consolidated_ignore = []
@@ -241,6 +250,9 @@ class QuantizationConfig(BaseModel):
         )
 
     def requires_calibration_data(self):
+        if self.kv_cache_scheme is not None:
+            return True
+
         for _, scheme in self.config_groups.items():
             if scheme.input_activations is not None:
                 if not scheme.input_activations.dynamic:

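A hedged sketch of the new kv-cache handling in requires_calibration_data, assuming QuantizationConfig accepts an empty config_groups mapping alongside the kv_cache_scheme field referenced in the hunk above:

from compressed_tensors.quantization.quant_args import QuantizationArgs
from compressed_tensors.quantization.quant_config import QuantizationConfig

config = QuantizationConfig(
    config_groups={},
    kv_cache_scheme=QuantizationArgs(num_bits=8),
)

# kv-cache quantization needs calibration data, so this now short-circuits to True
print(config.requires_calibration_data())
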
compressed_tensors/quantization/quant_scheme.py CHANGED

@@ -57,15 +57,9 @@ class QuantizationScheme(BaseModel):
             # default to quantizing all Linear layers
             targets = ["Linear"]
 
-        # default
-
-
-
-        # default to 8 bit integer asymmetric quantization
-        input_activations = QuantizationArgs(num_bits=8, symmetric=True)
-
-        # Do not quantize the output activations
-        # by default
+        # by default, activations and weights are left unquantized
+        weights = None
+        input_activations = None
         output_activations = None
 
         return cls(
@@ -111,8 +105,10 @@ def is_preset_scheme(name: str) -> bool:
     return name.upper() in PRESET_SCHEMES
 
 
+UNQUANTIZED = dict()
+
 # 8 bit integer weights and 8 bit activations quantization
-
+INT8_W8A8 = dict(
     weights=QuantizationArgs(
         num_bits=8,
         type=QuantizationType.INT,
@@ -153,7 +149,7 @@ W4A16 = dict(
 )
 
 # 4 bit integer weights and 8 bit activations quantization
-
+INT8_W4A8 = dict(
     weights=QuantizationArgs(
         num_bits=4,
         type=QuantizationType.INT,
@@ -208,12 +204,15 @@ FP8_DYNAMIC = dict(
 )
 
 PRESET_SCHEMES = {
+    # Unquantized (no-op)
+    "UNQUANTIZED": UNQUANTIZED,
     # Integer weight only schemes
     "W8A16": W8A16,
     "W4A16": W4A16,
     # Integer weight and activation schemes
-    "W8A8":
-    "
+    "W8A8": INT8_W8A8,
+    "INT8": INT8_W8A8,  # alias for W8A8
+    "W4A8": INT8_W4A8,
     # Float weight and activation schemes
     "FP8": FP8,
     "FP8_DYNAMIC": FP8_DYNAMIC,

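For reference, a small sketch of the preset changes (the UNQUANTIZED preset and the INT8 alias), assuming PRESET_SCHEMES and is_preset_scheme are imported from the module shown above:

from compressed_tensors.quantization.quant_scheme import (
    PRESET_SCHEMES,
    is_preset_scheme,
)

print(is_preset_scheme("int8"))                          # True, lookup is case-insensitive
print(PRESET_SCHEMES["W8A8"] is PRESET_SCHEMES["INT8"])  # True, "INT8" aliases W8A8
print(PRESET_SCHEMES["UNQUANTIZED"])                     # {} -> nothing gets quantized
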
compressed_tensors/quantization/utils/helpers.py CHANGED

@@ -13,8 +13,7 @@
 # limitations under the License.
 
 import logging
-import
-from typing import List, Optional, Tuple
+from typing import Generator, List, Optional, Tuple
 
 import torch
 from compressed_tensors.quantization.observers.base import Observer
@@ -28,7 +27,6 @@ __all__ = [
     "infer_quantization_status",
     "is_module_quantized",
     "is_model_quantized",
-    "iter_named_leaf_modules",
     "module_type",
     "calculate_compression_ratio",
     "get_torch_bit_depth",
@@ -36,9 +34,14 @@ __all__ = [
     "parse_out_kv_cache_args",
     "KV_CACHE_TARGETS",
     "is_kv_cache_quant_scheme",
+    "iter_named_leaf_modules",
+    "iter_named_quantizable_modules",
 ]
 
-
+# target the self_attn layer
+# QuantizedKVParameterCache is responsible for obtaining the k_scale and v_scale
+KV_CACHE_TARGETS = ["re:.*self_attn$"]
+
 _LOGGER: logging.Logger = logging.getLogger(__name__)
 
 
@@ -106,11 +109,10 @@ def module_type(module: Module) -> str:
     return type(module).__name__
 
 
-def iter_named_leaf_modules(model: Module) -> Tuple[str, Module]:
+def iter_named_leaf_modules(model: Module) -> Generator[Tuple[str, Module], None, None]:
     """
     Yields modules that do not have any submodules except observers. The observers
     themselves are not yielded
-
     :param model: model to get leaf modules of
     :returns: generator tuple of (name, leaf_submodule)
     """
@@ -128,6 +130,37 @@ def iter_named_leaf_modules(model: Module) -> Tuple[str, Module]:
                 yield name, submodule
 
 
+def iter_named_quantizable_modules(
+    model: Module, include_children: bool = True, include_attn: bool = False
+) -> Generator[Tuple[str, Module], None, None]:
+    """
+    Yield name and submodule of
+    - leaf modules, set by include_children
+    - attention modyles, set by include_attn
+
+    :param model: model to get leaf modules of
+    :param include_children: flag to get the leaf modules
+    :param inlcude_attn: flag to get the attention modules
+    :returns: generator tuple of (name, submodule)
+    """
+    for name, submodule in model.named_modules():
+        if include_children:
+            children = list(submodule.children())
+            if len(children) == 0 and not isinstance(submodule, Observer):
+                yield name, submodule
+            else:
+                has_non_observer_children = False
+                for child in children:
+                    if not isinstance(child, Observer):
+                        has_non_observer_children = True
+
+                if not has_non_observer_children:
+                    yield name, submodule
+        if include_attn:
+            if name.endswith("self_attn"):
+                yield name, submodule
+
+
 def get_torch_bit_depth(value: torch.Tensor) -> int:
     """
     Determine the number of bits used to represent the dtype of a tensor
@@ -181,7 +214,7 @@ def calculate_compression_ratio(model: Module) -> float:
         for parameter in model.parameters():
             uncompressed_bits = get_torch_bit_depth(parameter)
             compressed_bits = uncompressed_bits
-            if is_module_quantized(submodule):
+            if is_module_quantized(submodule) and submodule.quantization_scheme.weights:
                 compressed_bits = submodule.quantization_scheme.weights.num_bits
 
             num_weights = parameter.numel()
@@ -204,19 +237,11 @@ def is_kv_cache_quant_scheme(scheme: QuantizationScheme) -> bool:
     :param scheme: The QuantizationScheme to investigate
     :return: boolean flag
     """
-
-
-
-        is_match_targets = any(
-            [re.match(pattern[3:], scheme.targets[0]) for pattern in KV_CACHE_TARGETS]
-        )
-    else:
-        # match on the exact KV_CACHE_TARGETS
-        # if there are multiple targets
-        is_match_targets = set(KV_CACHE_TARGETS) == set(scheme.targets)
+    for target in scheme.targets:
+        if target in KV_CACHE_TARGETS:
+            return True
 
-
-    return is_match_targets and is_match_output_activations
+    return False
 
 
 def parse_out_kv_cache_args(

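A minimal usage sketch of the new iterator; the import path mirrors the quant_config.py hunk above, and the toy model is purely illustrative:

import torch
from compressed_tensors.quantization.utils import iter_named_quantizable_modules

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU())

# include_children yields leaf modules (observers excluded); include_attn would also
# yield any submodule whose name ends with "self_attn"
for name, submodule in iter_named_quantizable_modules(
    model, include_children=True, include_attn=False
):
    print(name, type(submodule).__name__)  # e.g. "0 Linear", "1 ReLU"
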
compressed_tensors/utils/helpers.py CHANGED

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional
+from typing import Any, Optional
 
 import torch
 from transformers import AutoConfig
@@ -22,6 +22,8 @@ __all__ = [
     "infer_compressor_from_model_config",
     "fix_fsdp_module_name",
     "tensor_follows_mask_structure",
+    "replace_module",
+    "is_compressed_tensors_config",
 ]
 
 FSDP_WRAPPER_NAME = "_fsdp_wrapped_module"
@@ -90,3 +92,30 @@ def tensor_follows_mask_structure(tensor, mask: str = "2:4") -> bool:
             raise ValueError()
 
     return True
+
+
+def replace_module(model: torch.nn.Module, name: str, new_module: torch.nn.Module):
+    if "." in name:
+        parent_name = name.rsplit(".", 1)[0]
+        child_name = name[len(parent_name) + 1 :]
+        parent = model.get_submodule(parent_name)
+    else:
+        parent_name = ""
+        parent = model
+        child_name = name
+    setattr(parent, child_name, new_module)
+
+
+def is_compressed_tensors_config(compression_config: Any) -> bool:
+    """
+    Returns True if CompressedTensorsConfig is available from transformers and
+    compression_config is an instance of CompressedTensorsConfig
+
+    See: https://github.com/huggingface/transformers/pull/31704
+    """
+    try:
+        from transformers.utils.quantization_config import CompressedTensorsConfig
+
+        return isinstance(compression_config, CompressedTensorsConfig)
+    except ImportError:
+        return False

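A short sketch of replace_module on a toy nested module (names and shapes are illustrative; the helper lives in the module shown above):

import torch
from compressed_tensors.utils.helpers import replace_module

model = torch.nn.Sequential(
    torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.ReLU()),
    torch.nn.Linear(16, 4),
)

# dotted names are resolved through get_submodule before the child is swapped in place
replace_module(model, "0.0", torch.nn.Linear(16, 16, bias=False))
print(model[0][0])  # Linear(in_features=16, out_features=16, bias=False)
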
compressed_tensors/utils/offload.py CHANGED

@@ -40,7 +40,13 @@ def get_execution_device(module: Module) -> torch.device:
     """
     if is_module_offloaded(module):
         return module._hf_hook.execution_device
-
+    device = next(module.parameters()).device
+
+    # offload only gets set for leaf modules, fallback to checking for device type
+    if device.type == "meta":
+        return module._hf_hook.execution_device
+
+    return device
 
 
 def get_offloaded_device(module: Module) -> torch.device:
@@ -83,8 +89,11 @@ update_parameter_data(
 
     :param module: layer containing the parameter to update
     :param new_param_data: tensor to update parameter with
-    :param param_name:
+    :param param_name: name of layer parameter to update
     """
+    if not hasattr(module, param_name):
+        return
+
     device = next(module.parameters()).device
 
     offloaded = False
@@ -93,6 +102,9 @@
         offloaded = True
 
     parameter = getattr(module, param_name, None)
+    if parameter is None:
+        raise ValueError("Attempted to update uninitialized parameter")
+
     dtype = parameter.dtype
     parameter.data = new_param_data.to(device).to(dtype)
 

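A hedged sketch of the relaxed update_parameter_data behaviour for a plain, non-offloaded module, using the signature documented in the hunk above:

import torch
from compressed_tensors.utils.offload import update_parameter_data

layer = torch.nn.Linear(4, 4)

# copies the tensor into layer.weight, cast to the parameter's device and dtype
update_parameter_data(layer, torch.ones(4, 4), "weight")

# a parameter name the module does not have is now silently ignored
update_parameter_data(layer, torch.ones(4), "weight_scale")
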
compressed_tensors/utils/permute.py ADDED

@@ -0,0 +1,70 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Set, Tuple
+
+import torch
+
+
+__all__ = ["safe_permute"]
+
+
+# these datatypes are missing implementations required for standard permutation
+_EXPERIMENTAL_DTYPES: Set[Tuple[torch.dtype, torch.device]] = set()
+
+
+def safe_permute(value: torch.Tensor, perm: torch.Tensor, dim: int = 0) -> torch.Tensor:
+    """
+    Perform out-of-place permutation without using torch.Tensor.index_put_,
+    whose implementation is missing for datatypes such as `torch.float8_e4m3fn`
+
+    :param value: tensor to permute
+    :param perm: permutation map
+    :param dim: dimension along which to apply permutation
+    :return: permuted value
+    """
+    dtype_tuple = (value.dtype, value.device)
+
+    if dtype_tuple in _EXPERIMENTAL_DTYPES:
+        return _fallback_permute(value, perm, dim)
+
+    try:
+        return value[tuple([slice(None)] * dim + [perm])]
+    except RuntimeError:
+        # Mark dtype as experimental if advanced indexing fails
+        _EXPERIMENTAL_DTYPES.add(dtype_tuple)
+        return _fallback_permute(value, perm, dim)
+
+
+def _fallback_permute(
+    value: torch.Tensor, perm: torch.Tensor, dim: int
+) -> torch.Tensor:
+    """
+    Fallback permutation method for experimental dtypes.
+
+    :param value: tensor to permute
+    :param perm: permutation map
+    :param dim: dimension along which to apply permutation
+    :return: permuted value
+    """
+    value_ret = value.clone()  # cannot use zeros_like b/c of missing impl.
+    orig_slices = [slice(None)] * (dim + 1)
+    perm_slices = [slice(None)] * (dim + 1)
+
+    for index, perm_index in enumerate(perm):
+        orig_slices[dim] = index
+        perm_slices[dim] = perm_index
+        value_ret[tuple(orig_slices)] = value[tuple(perm_slices)]
+
+    return value_ret

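A minimal usage sketch of safe_permute; the fallback path only triggers for dtypes such as torch.float8_e4m3fn where advanced indexing raises:

import torch
from compressed_tensors.utils.permute import safe_permute

weight = torch.arange(12, dtype=torch.float32).reshape(4, 3)
perm = torch.tensor([2, 0, 1, 3])

# rows are reordered out-of-place according to perm
print(safe_permute(weight, perm, dim=0))
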
compressed_tensors/version.py CHANGED