compressed-tensors 0.6.0__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
- compressed_tensors/__init__.py +1 -0
- compressed_tensors/base.py +2 -0
- compressed_tensors/compressors/__init__.py +6 -12
- compressed_tensors/compressors/base.py +38 -102
- compressed_tensors/compressors/helpers.py +6 -6
- compressed_tensors/compressors/model_compressors/__init__.py +17 -0
- compressed_tensors/compressors/{model_compressor.py → model_compressors/model_compressor.py} +95 -106
- compressed_tensors/compressors/quantized_compressors/__init__.py +18 -0
- compressed_tensors/compressors/quantized_compressors/base.py +146 -0
- compressed_tensors/compressors/{naive_quantized.py → quantized_compressors/naive_quantized.py} +11 -11
- compressed_tensors/compressors/{pack_quantized.py → quantized_compressors/pack_quantized.py} +6 -3
- compressed_tensors/compressors/sparse_compressors/__init__.py +18 -0
- compressed_tensors/compressors/sparse_compressors/base.py +110 -0
- compressed_tensors/compressors/{dense.py → sparse_compressors/dense.py} +3 -3
- compressed_tensors/compressors/{sparse_bitmask.py → sparse_compressors/sparse_bitmask.py} +14 -59
- compressed_tensors/compressors/sparse_quantized_compressors/__init__.py +16 -0
- compressed_tensors/compressors/{marlin_24.py → sparse_quantized_compressors/marlin_24.py} +3 -3
- compressed_tensors/linear/compressed_linear.py +2 -2
- compressed_tensors/quantization/__init__.py +1 -0
- compressed_tensors/quantization/cache.py +201 -0
- compressed_tensors/quantization/lifecycle/apply.py +19 -3
- compressed_tensors/quantization/lifecycle/calibration.py +2 -3
- compressed_tensors/quantization/lifecycle/forward.py +58 -7
- compressed_tensors/quantization/lifecycle/frozen.py +6 -1
- compressed_tensors/quantization/lifecycle/helpers.py +0 -47
- compressed_tensors/quantization/lifecycle/initialize.py +116 -67
- compressed_tensors/quantization/observers/__init__.py +0 -1
- compressed_tensors/quantization/observers/helpers.py +40 -2
- compressed_tensors/quantization/quant_args.py +34 -4
- compressed_tensors/quantization/quant_config.py +14 -2
- compressed_tensors/quantization/quant_scheme.py +8 -4
- compressed_tensors/quantization/utils/helpers.py +43 -18
- compressed_tensors/utils/helpers.py +17 -1
- compressed_tensors/version.py +1 -1
- {compressed_tensors-0.6.0.dist-info → compressed_tensors-0.7.1.dist-info}/METADATA +1 -1
- compressed_tensors-0.7.1.dist-info/RECORD +58 -0
- compressed_tensors/quantization/observers/memoryless.py +0 -56
- compressed_tensors-0.6.0.dist-info/RECORD +0 -52
- {compressed_tensors-0.6.0.dist-info → compressed_tensors-0.7.1.dist-info}/LICENSE +0 -0
- {compressed_tensors-0.6.0.dist-info → compressed_tensors-0.7.1.dist-info}/WHEEL +0 -0
- {compressed_tensors-0.6.0.dist-info → compressed_tensors-0.7.1.dist-info}/top_level.txt +0 -0
compressed_tensors/quantization/lifecycle/initialize.py
CHANGED
@@ -17,8 +17,10 @@ import logging
 from typing import Optional
 
 import torch
+from compressed_tensors.quantization.cache import KVCacheScaleType
 from compressed_tensors.quantization.lifecycle.forward import (
     wrap_module_forward_quantized,
+    wrap_module_forward_quantized_attn,
 )
 from compressed_tensors.quantization.quant_args import (
     ActivationOrdering,
@@ -27,6 +29,7 @@ from compressed_tensors.quantization.quant_args import (
 )
 from compressed_tensors.quantization.quant_config import QuantizationStatus
 from compressed_tensors.quantization.quant_scheme import QuantizationScheme
+from compressed_tensors.quantization.utils import is_kv_cache_quant_scheme
 from compressed_tensors.utils import get_execution_device, is_module_offloaded
 from torch.nn import Module, Parameter
 
@@ -62,72 +65,85 @@ def initialize_module_for_quantization(
         # no scheme passed and layer not targeted for quantization - skip
         return
 
+    if is_attention_module(module):
+        # wrap forward call of module to perform
+        # quantized actions based on calltime status
+        wrap_module_forward_quantized_attn(module, scheme)
+        _initialize_attn_scales(module)
+
+    else:
+
+        if scheme.input_activations is not None:
             _initialize_scale_zero_point_observer(
                 module,
-                "weight",
-                scheme.weights,
-                weight_shape=weight_shape,
+                "input",
+                scheme.input_activations,
                 force_zero_point=force_zero_point,
             )
-
-    module.quantization_scheme = scheme
-    module.quantization_status = QuantizationStatus.INITIALIZED
-
-    offloaded = False
-    if is_module_offloaded(module):
-        try:
-            from accelerate.hooks import add_hook_to_module, remove_hook_from_module
-            from accelerate.utils import PrefixedDataset
-        except ModuleNotFoundError:
-            raise ModuleNotFoundError(
-                "Offloaded model detected. To use CPU offloading with "
-                "compressed-tensors the `accelerate` package must be installed, "
-                "run `pip install compressed-tensors[accelerate]`"
-            )
-
-        offloaded = True
-        hook = module._hf_hook
-        prefix_dict = module._hf_hook.weights_map
-        new_prefix = {}
-
-        # recreate the prefix dict (since it is immutable)
-        # and add quantization parameters
-        for key, data in module.named_parameters():
-            if key not in prefix_dict:
-                new_prefix[f"{prefix_dict.prefix}{key}"] = data
+        if scheme.weights is not None:
+            if hasattr(module, "weight"):
+                weight_shape = None
+                if isinstance(module, torch.nn.Linear):
+                    weight_shape = module.weight.shape
+                _initialize_scale_zero_point_observer(
+                    module,
+                    "weight",
+                    scheme.weights,
+                    weight_shape=weight_shape,
+                    force_zero_point=force_zero_point,
+                )
             else:
+                _LOGGER.warning(
+                    f"module type {type(module)} targeted for weight quantization but "
+                    "has no attribute weight, skipping weight quantization "
+                    f"for {type(module)}"
+                )
+
+        if scheme.output_activations is not None:
+            if not is_kv_cache_quant_scheme(scheme):
+                _initialize_scale_zero_point_observer(
+                    module, "output", scheme.output_activations
+                )
+
+        module.quantization_scheme = scheme
+        module.quantization_status = QuantizationStatus.INITIALIZED
+
+        offloaded = False
+        if is_module_offloaded(module):
+            try:
+                from accelerate.hooks import add_hook_to_module, remove_hook_from_module
+                from accelerate.utils import PrefixedDataset
+            except ModuleNotFoundError:
+                raise ModuleNotFoundError(
+                    "Offloaded model detected. To use CPU offloading with "
+                    "compressed-tensors the `accelerate` package must be installed, "
+                    "run `pip install compressed-tensors[accelerate]`"
+                )
+
+            offloaded = True
+            hook = module._hf_hook
+            prefix_dict = module._hf_hook.weights_map
+            new_prefix = {}
+
+            # recreate the prefix dict (since it is immutable)
+            # and add quantization parameters
+            for key, data in module.named_parameters():
+                if key not in prefix_dict:
+                    new_prefix[f"{prefix_dict.prefix}{key}"] = data
+                else:
+                    new_prefix[f"{prefix_dict.prefix}{key}"] = prefix_dict[key]
+            new_prefix_dict = PrefixedDataset(new_prefix, prefix_dict.prefix)
+            remove_hook_from_module(module)
+
+        # wrap forward call of module to perform
+        # quantized actions based on calltime status
+        wrap_module_forward_quantized(module, scheme)
+
+        if offloaded:
+            # we need to re-add the hook for offloading now that we've wrapped forward
+            add_hook_to_module(module, hook)
+            if prefix_dict is not None:
+                module._hf_hook.weights_map = new_prefix_dict
 
 
 def _initialize_scale_zero_point_observer(
@@ -137,12 +153,16 @@ def _initialize_scale_zero_point_observer(
     weight_shape: Optional[torch.Size] = None,
     force_zero_point: bool = True,
 ):
+
     # initialize observer module and attach as submodule
     observer = quantization_args.get_observer()
-    module.register_module(f"{base_name}_observer", observer)
+    # no need to register an observer for dynamic quantization
+    if observer:
+        module.register_module(f"{base_name}_observer", observer)
 
+    # no need to register a scale and zero point for a dynamic quantization
     if quantization_args.dynamic:
         return
 
     device = next(module.parameters()).device
     if is_module_offloaded(module):
@@ -156,10 +176,8 @@ def _initialize_scale_zero_point_observer(
         # (output_channels, 1)
         expected_shape = (weight_shape[0], 1)
     elif quantization_args.strategy == QuantizationStrategy.GROUP:
-        expected_shape = (
-            weight_shape[0],
-            weight_shape[1] // quantization_args.group_size,
-        )
+        num_groups = weight_shape[1] // quantization_args.group_size
+        expected_shape = (weight_shape[0], max(num_groups, 1))
 
     scale_dtype = module.weight.dtype
     if scale_dtype not in [torch.float16, torch.bfloat16, torch.float32]:
@@ -189,3 +207,34 @@ def _initialize_scale_zero_point_observer(
         requires_grad=False,
     )
     module.register_parameter(f"{base_name}_g_idx", init_g_idx)
+
+
+def is_attention_module(module: Module):
+    return "attention" in module.__class__.__name__.lower() and (
+        hasattr(module, "k_proj")
+        or hasattr(module, "v_proj")
+        or hasattr(module, "qkv_proj")
+    )
+
+
+def _initialize_attn_scales(module: Module) -> None:
+    """Initlaize k_scale, v_scale for self_attn"""
+
+    expected_shape = 1  # per tensor
+
+    param = next(module.parameters())
+    scale_dtype = param.dtype
+    device = param.device
+
+    init_scale = Parameter(
+        torch.empty(expected_shape, dtype=scale_dtype, device=device),
+        requires_grad=False,
+    )
+
+    module.register_parameter(KVCacheScaleType.KEY.value, init_scale)
+
+    init_scale = Parameter(
+        torch.empty(expected_shape, dtype=scale_dtype, device=device),
+        requires_grad=False,
+    )
+    module.register_parameter(KVCacheScaleType.VALUE.value, init_scale)
compressed_tensors/quantization/observers/helpers.py
CHANGED
@@ -13,18 +13,56 @@
 # limitations under the License.
 
 from collections import Counter
-from typing import Tuple
+from typing import Optional, Tuple
 
 import torch
 from compressed_tensors.quantization.quant_args import (
     FP8_DTYPE,
     QuantizationArgs,
+    QuantizationStrategy,
     QuantizationType,
 )
 from torch import FloatTensor, IntTensor, Tensor
 
 
-__all__ = ["calculate_qparams", "get_observer_token_count", "calculate_range"]
+__all__ = [
+    "calculate_qparams",
+    "get_observer_token_count",
+    "calculate_range",
+    "compute_dynamic_scales_and_zp",
+]
+
+
+def compute_dynamic_scales_and_zp(value: Tensor, args: QuantizationArgs):
+    """
+    Returns the computed scales and zero points for dynamic activation
+    qunatization.
+
+    :param value: tensor to calculate quantization parameters for
+    :param args: quantization args
+    :param reduce_dims: optional tuple of dimensions to reduce along,
+        returned scale and zero point will be shaped (1,) along the
+        reduced dimensions
+    :return: tuple of scale and zero point derived from the observed tensor
+    """
+    if args.strategy == QuantizationStrategy.TOKEN:
+        dim = {1, 2}
+        reduce_dims = tuple(idx for idx in range(value.ndim) if idx not in dim)
+    elif args.strategy == QuantizationStrategy.TENSOR:
+        reduce_dims = None
+    else:
+        raise ValueError(
+            f"One of {QuantizationStrategy.TOKEN} or {QuantizationStrategy.TENSOR} ",
+            "must be used for dynamic quantization",
+        )
+
+    if not reduce_dims:
+        min_val, max_val = torch.aminmax(value)
+    else:
+        min_val = torch.amin(value, dim=reduce_dims, keepdims=True)
+        max_val = torch.amax(value, dim=reduce_dims, keepdims=True)
+
+    return calculate_qparams(min_val, max_val, args)
 
 
 def get_observer_token_count(module: torch.nn.Module) -> Counter:
compressed_tensors/quantization/quant_args.py
CHANGED
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import warnings
 from enum import Enum
 from typing import Any, Dict, Optional, Union
 
@@ -94,7 +95,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
     block_structure: Optional[str] = None
     dynamic: bool = False
     actorder: Union[ActivationOrdering, bool, None] = None
-    observer: str = Field(
+    observer: Optional[str] = Field(
         default="minmax",
         description=(
             "The class to use to compute the quantization param - "
@@ -115,13 +116,19 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         """
         from compressed_tensors.quantization.observers.base import Observer
 
+        # No observer required for the dynamic case
        if self.dynamic:
-            self.observer = "memoryless"
+            self.observer = None
+            return self.observer
 
         return Observer.load_from_registry(self.observer, quantization_args=self)
 
+    def get_kv_cache(self):
+        """Get the singleton KV Cache"""
+        from compressed_tensors.quantization.cache import QuantizedKVParameterCache
+
+        return QuantizedKVParameterCache(self)
+
     @field_validator("type", mode="before")
     def validate_type(cls, value) -> QuantizationType:
         if isinstance(value, str):
@@ -165,6 +172,8 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         strategy = model.strategy
         group_size = model.group_size
         actorder = model.actorder
+        dynamic = model.dynamic
+        observer = model.observer
 
         # infer strategy
         if strategy is None:
@@ -201,6 +210,27 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
                 "activation ordering"
             )
 
+        if dynamic:
+            if strategy not in (
+                QuantizationStrategy.TOKEN,
+                QuantizationStrategy.TENSOR,
+            ):
+                raise ValueError(
+                    f"One of {QuantizationStrategy.TOKEN} or "
+                    f"{QuantizationStrategy.TENSOR} must be used for dynamic ",
+                    "quantization",
+                )
+            if observer is not None:
+                warnings.warn(
+                    "No observer is used for dynamic quantization, setting to None"
+                )
+                model.observer = None
+
+        # if we have not set an observer and we
+        # are running static quantization, use minmax
+        if not observer and not dynamic:
+            model.observer = "minmax"
+
         # write back modified values
         model.strategy = strategy
         return model
compressed_tensors/quantization/quant_config.py
CHANGED
@@ -24,7 +24,7 @@ from compressed_tensors.quantization.quant_scheme import (
 from compressed_tensors.quantization.utils import (
     calculate_compression_ratio,
     is_module_quantized,
-    iter_named_leaf_modules,
+    iter_named_quantizable_modules,
     module_type,
     parse_out_kv_cache_args,
 )
@@ -177,7 +177,9 @@ class QuantizationConfig(BaseModel):
         quantization_status = None
         ignore = {}
         quantization_type_names = set()
-        for name, submodule in iter_named_leaf_modules(model):
+        for name, submodule in iter_named_quantizable_modules(
+            model, include_children=True, include_attn=True
+        ):
             layer_type = module_type(submodule)
             if not is_module_quantized(submodule):
                 if layer_type not in ignore:
@@ -199,6 +201,13 @@ class QuantizationConfig(BaseModel):
         if len(quant_scheme_to_layers) == 0:  # No quantized layers
             return None
 
+        # kv-cache only, no weight/activation quantization
+        if (
+            len(quantization_type_names) == 1
+            and "attention" in list(quantization_type_names)[0].lower()
+        ):
+            quantization_type_names.add("Linear")
+
         # clean up ignore list, we can leave out layers types if none of the
         # instances are quantized
         consolidated_ignore = []
@@ -241,6 +250,9 @@ class QuantizationConfig(BaseModel):
         )
 
     def requires_calibration_data(self):
+        if self.kv_cache_scheme is not None:
+            return True
+
         for _, scheme in self.config_groups.items():
             if scheme.input_activations is not None:
                 if not scheme.input_activations.dynamic:
compressed_tensors/quantization/quant_scheme.py
CHANGED
@@ -108,7 +108,7 @@ def is_preset_scheme(name: str) -> bool:
 UNQUANTIZED = dict()
 
 # 8 bit integer weights and 8 bit activations quantization
-W8A8 = dict(
+INT8_W8A8 = dict(
     weights=QuantizationArgs(
         num_bits=8,
         type=QuantizationType.INT,
@@ -122,6 +122,7 @@ W8A8 = dict(
         strategy=QuantizationStrategy.TOKEN,
         symmetric=True,
         dynamic=True,
+        observer=None,
     ),
 )
 
@@ -149,7 +150,7 @@ W4A16 = dict(
 )
 
 # 4 bit integer weights and 8 bit activations quantization
-W4A8 = dict(
+INT8_W4A8 = dict(
     weights=QuantizationArgs(
         num_bits=4,
         type=QuantizationType.INT,
@@ -164,6 +165,7 @@ W4A8 = dict(
         strategy=QuantizationStrategy.TOKEN,
         symmetric=True,
         dynamic=True,
+        observer=None,
     ),
 )
 
@@ -200,6 +202,7 @@ FP8_DYNAMIC = dict(
         strategy=QuantizationStrategy.TOKEN,
         symmetric=True,
         dynamic=True,
+        observer=None,
     ),
 )
 
@@ -210,8 +213,9 @@ PRESET_SCHEMES = {
     "W8A16": W8A16,
     "W4A16": W4A16,
     # Integer weight and activation schemes
-    "W8A8": W8A8,
-    "W4A8": W4A8,
+    "W8A8": INT8_W8A8,
+    "INT8": INT8_W8A8,  # alias for W8A8
+    "W4A8": INT8_W4A8,
     # Float weight and activation schemes
     "FP8": FP8,
     "FP8_DYNAMIC": FP8_DYNAMIC,
compressed_tensors/quantization/utils/helpers.py
CHANGED
@@ -13,8 +13,7 @@
 # limitations under the License.
 
 import logging
-import re
-from typing import List, Optional, Tuple
+from typing import Generator, List, Optional, Tuple
 
 import torch
 from compressed_tensors.quantization.observers.base import Observer
@@ -28,7 +27,6 @@ __all__ = [
     "infer_quantization_status",
     "is_module_quantized",
     "is_model_quantized",
-    "iter_named_leaf_modules",
     "module_type",
     "calculate_compression_ratio",
     "get_torch_bit_depth",
@@ -36,9 +34,14 @@ __all__ = [
     "parse_out_kv_cache_args",
     "KV_CACHE_TARGETS",
    "is_kv_cache_quant_scheme",
+    "iter_named_leaf_modules",
+    "iter_named_quantizable_modules",
 ]
 
+# target the self_attn layer
+# QuantizedKVParameterCache is responsible for obtaining the k_scale and v_scale
+KV_CACHE_TARGETS = ["re:.*self_attn$"]
+
 _LOGGER: logging.Logger = logging.getLogger(__name__)
 
 
@@ -106,11 +109,10 @@ def module_type(module: Module) -> str:
     return type(module).__name__
 
 
-def iter_named_leaf_modules(model: Module) -> Tuple[str, Module]:
+def iter_named_leaf_modules(model: Module) -> Generator[Tuple[str, Module], None, None]:
     """
     Yields modules that do not have any submodules except observers. The observers
     themselves are not yielded
-
     :param model: model to get leaf modules of
     :returns: generator tuple of (name, leaf_submodule)
     """
@@ -128,6 +130,37 @@ def iter_named_leaf_modules(model: Module) -> Tuple[str, Module]:
             yield name, submodule
 
 
+def iter_named_quantizable_modules(
+    model: Module, include_children: bool = True, include_attn: bool = False
+) -> Generator[Tuple[str, Module], None, None]:
+    """
+    Yield name and submodule of
+    - leaf modules, set by include_children
+    - attention modyles, set by include_attn
+
+    :param model: model to get leaf modules of
+    :param include_children: flag to get the leaf modules
+    :param inlcude_attn: flag to get the attention modules
+    :returns: generator tuple of (name, submodule)
+    """
+    for name, submodule in model.named_modules():
+        if include_children:
+            children = list(submodule.children())
+            if len(children) == 0 and not isinstance(submodule, Observer):
+                yield name, submodule
+            else:
+                has_non_observer_children = False
+                for child in children:
+                    if not isinstance(child, Observer):
+                        has_non_observer_children = True
+
+                if not has_non_observer_children:
+                    yield name, submodule
+        if include_attn:
+            if name.endswith("self_attn"):
+                yield name, submodule
+
+
 def get_torch_bit_depth(value: torch.Tensor) -> int:
     """
     Determine the number of bits used to represent the dtype of a tensor
@@ -204,19 +237,11 @@ def is_kv_cache_quant_scheme(scheme: QuantizationScheme) -> bool:
     :param scheme: The QuantizationScheme to investigate
     :return: boolean flag
     """
-        is_match_targets = any(
-            [re.match(pattern[3:], scheme.targets[0]) for pattern in KV_CACHE_TARGETS]
-        )
-    else:
-        # match on the exact KV_CACHE_TARGETS
-        # if there are multiple targets
-        is_match_targets = set(KV_CACHE_TARGETS) == set(scheme.targets)
+    for target in scheme.targets:
+        if target in KV_CACHE_TARGETS:
+            return True
 
-    return is_match_targets and is_match_output_activations
+    return False
 
 
 def parse_out_kv_cache_args(
compressed_tensors/utils/helpers.py
CHANGED
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional
+from typing import Any, Optional
 
 import torch
 from transformers import AutoConfig
@@ -23,6 +23,7 @@ __all__ = [
     "fix_fsdp_module_name",
     "tensor_follows_mask_structure",
     "replace_module",
+    "is_compressed_tensors_config",
 ]
 
 FSDP_WRAPPER_NAME = "_fsdp_wrapped_module"
@@ -103,3 +104,18 @@ def replace_module(model: torch.nn.Module, name: str, new_module: torch.nn.Module):
         parent = model
         child_name = name
     setattr(parent, child_name, new_module)
+
+
+def is_compressed_tensors_config(compression_config: Any) -> bool:
+    """
+    Returns True if CompressedTensorsConfig is available from transformers and
+    compression_config is an instance of CompressedTensorsConfig
+
+    See: https://github.com/huggingface/transformers/pull/31704
+    """
+    try:
+        from transformers.utils.quantization_config import CompressedTensorsConfig
+
+        return isinstance(compression_config, CompressedTensorsConfig)
+    except ImportError:
+        return False
compressed_tensors/version.py
CHANGED