compressed-tensors 0.7.1__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. compressed_tensors/compressors/model_compressors/model_compressor.py +17 -5
  2. compressed_tensors/compressors/quantized_compressors/naive_quantized.py +4 -2
  3. compressed_tensors/compressors/quantized_compressors/pack_quantized.py +2 -0
  4. compressed_tensors/compressors/sparse_quantized_compressors/marlin_24.py +1 -1
  5. compressed_tensors/config/base.py +60 -2
  6. compressed_tensors/linear/compressed_linear.py +3 -1
  7. compressed_tensors/quantization/__init__.py +0 -1
  8. compressed_tensors/quantization/lifecycle/__init__.py +0 -2
  9. compressed_tensors/quantization/lifecycle/apply.py +3 -17
  10. compressed_tensors/quantization/lifecycle/forward.py +24 -87
  11. compressed_tensors/quantization/lifecycle/initialize.py +21 -24
  12. compressed_tensors/quantization/quant_args.py +27 -25
  13. compressed_tensors/quantization/quant_config.py +2 -2
  14. compressed_tensors/quantization/quant_scheme.py +17 -24
  15. compressed_tensors/quantization/utils/helpers.py +125 -8
  16. compressed_tensors/registry/registry.py +1 -1
  17. compressed_tensors/utils/helpers.py +33 -1
  18. compressed_tensors/version.py +1 -1
  19. {compressed_tensors-0.7.1.dist-info → compressed_tensors-0.8.1.dist-info}/METADATA +1 -1
  20. {compressed_tensors-0.7.1.dist-info → compressed_tensors-0.8.1.dist-info}/RECORD +23 -31
  21. {compressed_tensors-0.7.1.dist-info → compressed_tensors-0.8.1.dist-info}/WHEEL +1 -1
  22. compressed_tensors/quantization/cache.py +0 -201
  23. compressed_tensors/quantization/lifecycle/calibration.py +0 -70
  24. compressed_tensors/quantization/lifecycle/frozen.py +0 -55
  25. compressed_tensors/quantization/observers/__init__.py +0 -21
  26. compressed_tensors/quantization/observers/base.py +0 -213
  27. compressed_tensors/quantization/observers/helpers.py +0 -149
  28. compressed_tensors/quantization/observers/min_max.py +0 -104
  29. compressed_tensors/quantization/observers/mse.py +0 -162
  30. {compressed_tensors-0.7.1.dist-info → compressed_tensors-0.8.1.dist-info}/LICENSE +0 -0
  31. {compressed_tensors-0.7.1.dist-info → compressed_tensors-0.8.1.dist-info}/top_level.txt +0 -0
@@ -24,7 +24,6 @@ import compressed_tensors
 import torch
 import transformers
 from compressed_tensors.base import (
-    COMPRESSION_CONFIG_NAME,
     COMPRESSION_VERSION_NAME,
     QUANTIZATION_CONFIG_NAME,
     QUANTIZATION_METHOD_NAME,
@@ -39,6 +38,7 @@ from compressed_tensors.quantization import (
     apply_quantization_config,
     load_pretrained_quantization,
 )
+from compressed_tensors.quantization.quant_args import QuantizationArgs
 from compressed_tensors.quantization.utils import (
     is_module_quantized,
     iter_named_leaf_modules,
@@ -103,12 +103,14 @@ class ModelCompressor:
         :return: compressor for the configs, or None if model is not compressed
         """
         config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        compression_config = getattr(config, COMPRESSION_CONFIG_NAME, None)
+        compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None)
+
         return cls.from_compression_config(compression_config)

     @classmethod
     def from_compression_config(
-        cls, compression_config: Union[Dict[str, Any], "CompressedTensorsConfig"]
+        cls,
+        compression_config: Union[Dict[str, Any], "CompressedTensorsConfig"],
     ):
         """
         :param compression_config:
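
Note: the compressor lookup now reuses the quantization-config attribute of the Hugging Face model config instead of a separate compression key. A minimal sketch of the pattern, assuming `QUANTIZATION_CONFIG_NAME` resolves to the standard `quantization_config` attribute (an assumption, not shown in this diff):

    from transformers import AutoConfig

    def read_compression_config(model_id: str):
        # Load only the config; "quantization_config" stands in for QUANTIZATION_CONFIG_NAME
        config = AutoConfig.from_pretrained(model_id)
        # Returns None for models that carry no quantization/compression metadata
        return getattr(config, "quantization_config", None)
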
@@ -265,7 +267,11 @@ class ModelCompressor:
         state_dict = model.state_dict()

         compressed_state_dict = state_dict
-        quantized_modules_to_args = map_modules_to_quant_args(model)
+
+        quantized_modules_to_args: Dict[
+            str, QuantizationArgs
+        ] = map_modules_to_quant_args(model)
+
         if self.quantization_compressor is not None:
             compressed_state_dict = self.quantization_compressor.compress(
                 state_dict, names_to_scheme=quantized_modules_to_args
@@ -369,7 +375,13 @@ class ModelCompressor:
             update_parameter_data(module, data, param_name)


-def map_modules_to_quant_args(model: Module) -> Dict:
+def map_modules_to_quant_args(model: Module) -> Dict[str, QuantizationArgs]:
+    """
+    Given a pytorch model, map out the submodule name (usually linear layers)
+    to the QuantizationArgs
+
+    :param model: pytorch model
+    """
     quantized_modules_to_args = {}
     for name, submodule in iter_named_leaf_modules(model):
         if is_module_quantized(submodule):
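
Note: the hunk only shows the start of the loop body. A hypothetical stand-in that mirrors the documented behaviour with plain PyTorch (the internals of `iter_named_leaf_modules` and `is_module_quantized` are not part of this diff):

    from typing import Dict

    from torch.nn import Module


    def map_linear_names_to_args(model: Module) -> Dict[str, object]:
        """Illustrative stand-in: map each quantized leaf module's name to the
        weight QuantizationArgs attached to its quantization_scheme."""
        mapping = {}
        for name, submodule in model.named_modules():
            if len(list(submodule.children())) > 0:
                continue  # only leaf modules, mirroring iter_named_leaf_modules
            scheme = getattr(submodule, "quantization_scheme", None)
            if scheme is not None and getattr(scheme, "weights", None) is not None:
                mapping[name] = scheme.weights
        return mapping
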
@@ -93,9 +93,11 @@ class NaiveQuantizationCompressor(BaseQuantizationCompressor):
                 args=quantization_args,
                 dtype=quantization_args.pytorch_dtype(),
             )
+        else:
+            quantized_weight = weight

-            if device is not None:
-                quantized_weight = quantized_weight.to(device)
+        if device is not None:
+            quantized_weight = quantized_weight.to(device)

         return {"weight": quantized_weight}

@@ -94,6 +94,8 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
                 args=quantization_args,
                 dtype=torch.int8,
             )
+        else:
+            quantized_weight = weight

         packed_weight = pack_to_int32(quantized_weight, quantization_args.num_bits)
         weight_shape = torch.tensor(weight.shape)
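
Note: in both compressors the new `else` branch guarantees that `quantized_weight` is always bound, so the later device move and return no longer depend on the quantization branch having run. A minimal sketch of the control flow with illustrative names (`already_quantized` stands in for the real guard):

    import torch


    def compress_weight(weight: torch.Tensor, already_quantized: bool, device=None):
        # Sketch of the 0.8.1 control flow: quantized_weight is bound on every path.
        if not already_quantized:
            quantized_weight = weight.round().to(torch.int8)  # stand-in for quantize(...)
        else:
            quantized_weight = weight  # new fallback: weight is already quantized

        if device is not None:
            quantized_weight = quantized_weight.to(device)

        return {"weight": quantized_weight}
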
@@ -238,7 +238,7 @@ def pack_scales_24(scales, quantization_args, w_shape):
     _, scale_perm_2_4, scale_perm_single_2_4 = get_permutations_24(num_bits)

     if (
-        quantization_args.strategy is QuantizationStrategy.GROUP
+        quantization_args.strategy == QuantizationStrategy.GROUP
         and quantization_args.group_size < size_k
     ):
         scales = scales.reshape((-1, len(scale_perm_2_4)))[:, scale_perm_2_4]
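
Note: `QuantizationStrategy` is a `str`-backed enum and `QuantizationArgs` is declared with `use_enum_values=True`, so the `strategy` field can hold a plain string; an identity check (`is`) then fails even when the values match, while `==` still works. A small stand-alone illustration (stand-in enum, not the library's):

    from enum import Enum


    class Strategy(str, Enum):
        GROUP = "group"
        CHANNEL = "channel"


    strategy = "group"                     # e.g. deserialized from a config
    print(strategy is Strategy.GROUP)      # False: different objects
    print(strategy == Strategy.GROUP)      # True: str-backed enum compares by value
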
@@ -12,16 +12,17 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- from enum import Enum
15
+ from enum import Enum, unique
16
16
  from typing import List, Optional
17
17
 
18
18
  from compressed_tensors.registry import RegistryMixin
19
19
  from pydantic import BaseModel
20
20
 
21
21
 
22
- __all__ = ["SparsityCompressionConfig", "CompressionFormat"]
22
+ __all__ = ["SparsityCompressionConfig", "CompressionFormat", "SparsityStructure"]
23
23
 
24
24
 
25
+ @unique
25
26
  class CompressionFormat(Enum):
26
27
  dense = "dense"
27
28
  sparse_bitmask = "sparse-bitmask"
@@ -32,6 +33,63 @@ class CompressionFormat(Enum):
     marlin_24 = "marlin-24"


+@unique
+class SparsityStructure(Enum):
+    """
+    An enumeration to represent different sparsity structures.
+
+    Attributes
+    ----------
+    TWO_FOUR : str
+        Represents a 2:4 sparsity structure.
+    ZERO_ZERO : str
+        Represents a 0:0 sparsity structure.
+    UNSTRUCTURED : str
+        Represents an unstructured sparsity structure.
+
+    Examples
+    --------
+    >>> SparsityStructure('2:4')
+    <SparsityStructure.TWO_FOUR: '2:4'>
+
+    >>> SparsityStructure('unstructured')
+    <SparsityStructure.UNSTRUCTURED: 'unstructured'>
+
+    >>> SparsityStructure('2:4') == SparsityStructure.TWO_FOUR
+    True
+
+    >>> SparsityStructure('UNSTRUCTURED') == SparsityStructure.UNSTRUCTURED
+    True
+
+    >>> SparsityStructure(None) == SparsityStructure.UNSTRUCTURED
+    True
+
+    >>> SparsityStructure('invalid')
+    Traceback (most recent call last):
+        ...
+    ValueError: invalid is not a valid SparsityStructure
+    """
+
+    TWO_FOUR = "2:4"
+    UNSTRUCTURED = "unstructured"
+    ZERO_ZERO = "0:0"
+
+    def __new__(cls, value):
+        obj = object.__new__(cls)
+        obj._value_ = value.lower() if value is not None else value
+        return obj
+
+    @classmethod
+    def _missing_(cls, value):
+        # Handle None and case-insensitive values
+        if value is None:
+            return cls.UNSTRUCTURED
+        for member in cls:
+            if member.value == value.lower():
+                return member
+        raise ValueError(f"{value} is not a valid {cls.__name__}")
+
+
 class SparsityCompressionConfig(RegistryMixin, BaseModel):
     """
     Base data class for storing sparsity compression parameters
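
Note: the doctests above double as usage examples; `__new__` lowercases stored values and `_missing_` maps `None` and mixed-case inputs, so raw config values can be normalized directly. A short sketch (assumes compressed-tensors 0.8.1 is installed):

    # Normalize raw sparsity-structure strings through the new enum
    from compressed_tensors.config.base import SparsityStructure

    for raw in ("2:4", "UNSTRUCTURED", None):
        print(raw, "->", SparsityStructure(raw))
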
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from typing import Dict, Tuple
+
 import torch
 from compressed_tensors.compressors.base import BaseCompressor
 from compressed_tensors.quantization import (
@@ -53,7 +55,7 @@ class CompressedLinear(Linear):
         )

         # get the shape and dtype of compressed parameters
-        compression_params = module.compressor.compression_param_info(
+        compression_params: Dict[str, Tuple] = module.compressor.compression_param_info(
             module.weight.shape, quantization_scheme.weights
         )

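
Note: the new annotation documents that `compression_param_info` reports, per compressed parameter name, the metadata needed to allocate placeholder tensors. A hedged sketch of how such a mapping could be consumed, assuming a `(shape, dtype)` tuple layout (the diff only pins `Dict[str, Tuple]`):

    from typing import Dict, Tuple

    import torch
    from torch.nn import Module, Parameter


    def register_compressed_params(
        module: Module, compression_params: Dict[str, Tuple[torch.Size, torch.dtype]]
    ) -> None:
        # Assumed layout: name -> (shape, dtype); adjust if the compressor reports more.
        for name, (shape, dtype) in compression_params.items():
            param = Parameter(torch.empty(shape, dtype=dtype), requires_grad=False)
            module.register_parameter(name, param)
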
@@ -19,4 +19,3 @@ from .quant_args import *
 from .quant_config import *
 from .quant_scheme import *
 from .lifecycle import *
-from .cache import QuantizedKVParameterCache
@@ -15,9 +15,7 @@
 # flake8: noqa
 # isort: skip_file

-from .calibration import *
 from .forward import *
-from .frozen import *
 from .initialize import *
 from .compressed import *
 from .apply import *
@@ -22,13 +22,9 @@ from typing import Union

 import torch
 from compressed_tensors.config import CompressionFormat
-from compressed_tensors.quantization.lifecycle.calibration import (
-    set_module_for_calibration,
-)
 from compressed_tensors.quantization.lifecycle.compressed import (
     compress_quantized_weights,
 )
-from compressed_tensors.quantization.lifecycle.frozen import freeze_module_quantization
 from compressed_tensors.quantization.lifecycle.initialize import (
     initialize_module_for_quantization,
 )
@@ -110,7 +106,8 @@ def apply_quantization_config(
     model: Module, config: Union[QuantizationConfig, None], run_compressed: bool = False
 ) -> OrderedDict:
     """
-    Initializes the model for quantization in-place based on the given config
+    Initializes the model for quantization in-place based on the given config.
+    Optionally coverts quantizable modules to compressed_linear modules

     :param model: model to apply quantization config to
     :param config: quantization config
@@ -233,6 +230,7 @@ def apply_quantization_status(model: Module, status: QuantizationStatus):
     :param model: model to apply quantization to
     :param status: status to update the module to
     """
+
     current_status = infer_quantization_status(model)

     if status >= QuantizationStatus.INITIALIZED > current_status:
@@ -243,18 +241,6 @@ def apply_quantization_status(model: Module, status: QuantizationStatus):
             )
         )

-    if current_status < status >= QuantizationStatus.CALIBRATION > current_status:
-        # only quantize weights up front when our end goal state is calibration,
-        # weight quantization parameters are already loaded for frozen/compressed
-        quantize_weights_upfront = status == QuantizationStatus.CALIBRATION
-        model.apply(
-            lambda module: set_module_for_calibration(
-                module, quantize_weights_upfront=quantize_weights_upfront
-            )
-        )
-    if current_status < status >= QuantizationStatus.FROZEN > current_status:
-        model.apply(freeze_module_quantization)
-
     if current_status < status >= QuantizationStatus.COMPRESSED > current_status:
         model.apply(compress_quantized_weights)

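
Note: the calibration and frozen transitions are removed from this lifecycle; only the compression step remains, guarded by a Python chained comparison that reads as "upgrading to at least COMPRESSED and not there yet". A tiny illustration with integers standing in for the ordered status values:

    # Chained comparison used in apply_quantization_status, with integers standing
    # in for the ordered QuantizationStatus values (INITIALIZED=1, COMPRESSED=3).
    INITIALIZED, COMPRESSED = 1, 3

    def should_compress(current_status: int, target_status: int) -> bool:
        # Equivalent to: current_status < target_status
        #            and target_status >= COMPRESSED
        #            and COMPRESSED > current_status
        return current_status < target_status >= COMPRESSED > current_status

    print(should_compress(INITIALIZED, COMPRESSED))  # True: upgrade to compressed
    print(should_compress(COMPRESSED, COMPRESSED))   # False: already compressed
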
@@ -14,14 +14,9 @@

 from functools import wraps
 from math import ceil
-from typing import Callable, Optional
+from typing import Optional

 import torch
-from compressed_tensors.quantization.cache import QuantizedKVParameterCache
-from compressed_tensors.quantization.observers.helpers import (
-    calculate_range,
-    compute_dynamic_scales_and_zp,
-)
 from compressed_tensors.quantization.quant_args import (
     QuantizationArgs,
     QuantizationStrategy,
@@ -29,7 +24,11 @@ from compressed_tensors.quantization.quant_args import (
 )
 from compressed_tensors.quantization.quant_config import QuantizationStatus
 from compressed_tensors.quantization.quant_scheme import QuantizationScheme
-from compressed_tensors.utils import safe_permute, update_parameter_data
+from compressed_tensors.quantization.utils import (
+    calculate_range,
+    compute_dynamic_scales_and_zp,
+)
+from compressed_tensors.utils import safe_permute
 from torch.nn import Module


@@ -38,7 +37,7 @@ __all__ = [
     "dequantize",
     "fake_quantize",
     "wrap_module_forward_quantized",
-    "maybe_calibrate_or_quantize",
+    "forward_quantize",
 ]


@@ -275,15 +274,13 @@ def wrap_module_forward_quantized(module: Module, scheme: QuantizationScheme):
         compressed = module.quantization_status == QuantizationStatus.COMPRESSED

         if scheme.input_activations is not None:
-            # calibrate and (fake) quantize input activations when applicable
-            input_ = maybe_calibrate_or_quantize(
-                module, input_, "input", scheme.input_activations
-            )
+            # prehook should calibrate activations before forward call
+            input_ = forward_quantize(module, input_, "input", scheme.input_activations)

         if scheme.weights is not None and not compressed:
             # calibrate and (fake) quantize weights when applicable
             unquantized_weight = self.weight.data.clone()
-            self.weight.data = maybe_calibrate_or_quantize(
+            self.weight.data = forward_quantize(
                 module, self.weight, "weight", scheme.weights
             )

@@ -291,64 +288,23 @@ def wrap_module_forward_quantized(module: Module, scheme: QuantizationScheme):
         output = forward_func_orig.__get__(module, module.__class__)(
             input_, *args[1:], **kwargs
         )
-        if scheme.output_activations is not None:
-
-            # calibrate and (fake) quantize output activations when applicable
-            # kv_cache scales updated on model self_attn forward call in
-            # wrap_module_forward_quantized_attn
-            output = maybe_calibrate_or_quantize(
-                module, output, "output", scheme.output_activations
-            )

         # restore back to unquantized_value
         if scheme.weights is not None and not compressed:
             self.weight.data = unquantized_weight

-        return output
-
-    # bind wrapped forward to module class so reference to `self` is correct
-    bound_wrapped_forward = wrapped_forward.__get__(module, module.__class__)
-    # set forward to wrapped forward
-    setattr(module, "forward", bound_wrapped_forward)
-
-
-def wrap_module_forward_quantized_attn(module: Module, scheme: QuantizationScheme):
-    # expects a module already initialized and injected with the parameters in
-    # initialize_module_for_quantization
-    if hasattr(module.forward, "__func__"):
-        forward_func_orig = module.forward.__func__
-    else:
-        forward_func_orig = module.forward.func
-
-    @wraps(forward_func_orig)  # ensures docstring, names, etc are propagated
-    def wrapped_forward(self, *args, **kwargs):
-
-        # kv cache stored under weights
-        if module.quantization_status == QuantizationStatus.CALIBRATION:
-            quantization_args: QuantizationArgs = scheme.output_activations
-            past_key_value: QuantizedKVParameterCache = quantization_args.get_kv_cache()
-            kwargs["past_key_value"] = past_key_value
-
-            # QuantizedKVParameterCache used for obtaining k_scale, v_scale only,
-            # does not store quantized_key_states and quantized_value_state
-            kwargs["use_cache"] = False
-
-            attn_forward: Callable = forward_func_orig.__get__(module, module.__class__)
-
-            past_key_value.reset_states()
-
-            rtn = attn_forward(*args, **kwargs)
-
-            update_parameter_data(
-                module, past_key_value.k_scales[module.layer_idx], "k_scale"
-            )
-            update_parameter_data(
-                module, past_key_value.v_scales[module.layer_idx], "v_scale"
+        if scheme.output_activations is not None:
+            # forward-hook should calibrate/forward_quantize
+            if (
+                module.quantization_status == QuantizationStatus.CALIBRATION
+                and not scheme.output_activations.dynamic
+            ):
+                return output
+
+            output = forward_quantize(
+                module, output, "output", scheme.output_activations
             )
-
-            return rtn
-
-        return forward_func_orig.__get__(module, module.__class__)(*args, **kwargs)
+        return output

     # bind wrapped forward to module class so reference to `self` is correct
     bound_wrapped_forward = wrapped_forward.__get__(module, module.__class__)
@@ -356,12 +312,9 @@ def wrap_module_forward_quantized_attn(module: Module, scheme: QuantizationSchem
     setattr(module, "forward", bound_wrapped_forward)


-def maybe_calibrate_or_quantize(
+def forward_quantize(
     module: Module, value: torch.Tensor, base_name: str, args: "QuantizationArgs"
 ) -> torch.Tensor:
-    # don't run quantization if we haven't entered calibration mode
-    if module.quantization_status == QuantizationStatus.INITIALIZED:
-        return value

     # in compressed mode, the weight is already compressed and quantized so we don't
     # need to run fake quantization
@@ -379,29 +332,13 @@
     g_idx = getattr(module, "weight_g_idx", None)

     if args.dynamic:
-        # dynamic quantization - no need to invoke observer
+        # dynamic quantization - determine the scale/zp on the fly
         scale, zero_point = compute_dynamic_scales_and_zp(value=value, args=args)
     else:
-        # static quantization - get previous scale and zero point from layer
+        # static quantization - get scale and zero point from layer
         scale = getattr(module, f"{base_name}_scale")
         zero_point = getattr(module, f"{base_name}_zero_point", None)

-    if (
-        module.quantization_status == QuantizationStatus.CALIBRATION
-        and base_name != "weight"
-    ):
-        # calibration mode - get new quant params from observer
-        observer = getattr(module, f"{base_name}_observer")
-
-        updated_scale, updated_zero_point = observer(value, g_idx=g_idx)
-
-        # update scale and zero point
-        update_parameter_data(module, updated_scale, f"{base_name}_scale")
-        update_parameter_data(module, updated_zero_point, f"{base_name}_zero_point")
-
-        scale = updated_scale
-        zero_point = updated_zero_point
-
     return fake_quantize(
         x=value,
         scale=scale,
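
Note: with observers removed from this package, `forward_quantize` no longer updates scales itself; it fake-quantizes with whatever parameters the module already carries, or derives them on the fly for dynamic schemes, and calibration is expected to happen in externally registered hooks. A simplified, symmetric-int8 sketch of that flow (illustrative only, not the library's exact code):

    import torch


    def forward_quantize_sketch(module, value: torch.Tensor, base_name: str, dynamic: bool):
        """Simplified view of the new forward_quantize flow (illustrative only)."""
        if dynamic:
            # dynamic quantization: derive scale/zero-point from the tensor on the fly
            scale = value.abs().amax() / 127.0
            zero_point = torch.tensor(0)
        else:
            # static quantization: reuse parameters already attached to the module
            scale = getattr(module, f"{base_name}_scale")
            zero_point = getattr(module, f"{base_name}_zero_point", torch.tensor(0))

        # fake quantization: round to the integer grid, then map back to floats
        quantized = torch.clamp(torch.round(value / scale + zero_point), -128, 127)
        return (quantized - zero_point) * scale
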
@@ -14,13 +14,12 @@


 import logging
+from enum import Enum
 from typing import Optional

 import torch
-from compressed_tensors.quantization.cache import KVCacheScaleType
 from compressed_tensors.quantization.lifecycle.forward import (
     wrap_module_forward_quantized,
-    wrap_module_forward_quantized_attn,
 )
 from compressed_tensors.quantization.quant_args import (
     ActivationOrdering,
@@ -36,12 +35,19 @@ from torch.nn import Module, Parameter

 __all__ = [
     "initialize_module_for_quantization",
+    "is_attention_module",
+    "KVCacheScaleType",
 ]


 _LOGGER = logging.getLogger(__name__)


+class KVCacheScaleType(Enum):
+    KEY = "k_scale"
+    VALUE = "v_scale"
+
+
 def initialize_module_for_quantization(
     module: Module,
     scheme: Optional[QuantizationScheme] = None,
@@ -66,15 +72,13 @@ initialize_module_for_quantization(
         return

     if is_attention_module(module):
-        # wrap forward call of module to perform
         # quantized actions based on calltime status
-        wrap_module_forward_quantized_attn(module, scheme)
         _initialize_attn_scales(module)

     else:

         if scheme.input_activations is not None:
-            _initialize_scale_zero_point_observer(
+            _initialize_scale_zero_point(
                 module,
                 "input",
                 scheme.input_activations,
@@ -85,7 +89,7 @@
             weight_shape = None
             if isinstance(module, torch.nn.Linear):
                 weight_shape = module.weight.shape
-            _initialize_scale_zero_point_observer(
+            _initialize_scale_zero_point(
                 module,
                 "weight",
                 scheme.weights,
@@ -101,7 +105,7 @@

         if scheme.output_activations is not None:
             if not is_kv_cache_quant_scheme(scheme):
-                _initialize_scale_zero_point_observer(
+                _initialize_scale_zero_point(
                     module, "output", scheme.output_activations
                 )

@@ -109,6 +113,7 @@
     module.quantization_status = QuantizationStatus.INITIALIZED

     offloaded = False
+    # What is this doing/why isn't this in the attn case?
     if is_module_offloaded(module):
         try:
             from accelerate.hooks import add_hook_to_module, remove_hook_from_module
@@ -146,21 +151,21 @@
         module._hf_hook.weights_map = new_prefix_dict


-def _initialize_scale_zero_point_observer(
+def is_attention_module(module: Module):
+    return "attention" in module.__class__.__name__.lower() and (
+        hasattr(module, "k_proj")
+        or hasattr(module, "v_proj")
+        or hasattr(module, "qkv_proj")
+    )
+
+
+def _initialize_scale_zero_point(
     module: Module,
     base_name: str,
     quantization_args: QuantizationArgs,
     weight_shape: Optional[torch.Size] = None,
     force_zero_point: bool = True,
 ):
-
-    # initialize observer module and attach as submodule
-    observer = quantization_args.get_observer()
-    # no need to register an observer for dynamic quantization
-    if observer:
-        module.register_module(f"{base_name}_observer", observer)
-
-    # no need to register a scale and zero point for a dynamic quantization
     if quantization_args.dynamic:
         return

@@ -209,14 +214,6 @@
         module.register_parameter(f"{base_name}_g_idx", init_g_idx)


-def is_attention_module(module: Module):
-    return "attention" in module.__class__.__name__.lower() and (
-        hasattr(module, "k_proj")
-        or hasattr(module, "v_proj")
-        or hasattr(module, "qkv_proj")
-    )
-
-
 def _initialize_attn_scales(module: Module) -> None:
     """Initlaize k_scale, v_scale for self_attn"""

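
Note: the attention-specific forward wrapper is gone; attention modules are now only detected by the heuristic above and given raw `k_scale`/`v_scale` parameters for external calibration to fill. A quick check of the heuristic against a toy module (the toy class is made up for illustration):

    from torch.nn import Linear, Module


    class TinySelfAttention(Module):
        def __init__(self):
            super().__init__()
            self.k_proj = Linear(8, 8)
            self.v_proj = Linear(8, 8)


    def is_attention_module(module: Module) -> bool:
        # Same heuristic as the diff: class name mentions "attention" and the
        # module exposes one of the usual projection layers.
        return "attention" in module.__class__.__name__.lower() and (
            hasattr(module, "k_proj")
            or hasattr(module, "v_proj")
            or hasattr(module, "qkv_proj")
        )


    print(is_attention_module(TinySelfAttention()))  # True
    print(is_attention_module(Linear(8, 8)))         # False
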
@@ -17,6 +17,7 @@ from enum import Enum
 from typing import Any, Dict, Optional, Union

 import torch
+from compressed_tensors.utils import Aliasable
 from pydantic import BaseModel, Field, field_validator, model_validator


@@ -53,17 +54,29 @@ class QuantizationStrategy(str, Enum):
     TOKEN = "token"


-class ActivationOrdering(str, Enum):
+class ActivationOrdering(Aliasable, str, Enum):
     """
     Enum storing strategies for activation ordering

     Group: reorder groups and weight\n
-    Weight: only reorder weight, not groups. Slightly lower latency and
-    accuracy compared to group actorder\n
+    Weight: only reorder weight, not groups. Slightly lower accuracy but also lower
+    latency when compared to group actorder\n
+    Dynamic: alias for Group\n
+    Static: alias for Weight\n
     """

     GROUP = "group"
     WEIGHT = "weight"
+    # aliases
+    DYNAMIC = "dynamic"
+    STATIC = "static"
+
+    @staticmethod
+    def get_aliases() -> Dict[str, str]:
+        return {
+            "dynamic": "group",
+            "static": "weight",
+        }


 class QuantizationArgs(BaseModel, use_enum_values=True):
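
Note: the legacy "dynamic"/"static" names keep working as aliases of "group"/"weight" through the `Aliasable` mixin. The diff only shows the alias table, not how `Aliasable` resolves it, so the sketch below uses a hypothetical stand-in mixin to show the intended behaviour:

    from enum import Enum
    from typing import Dict


    class AliasableSketch:
        """Hypothetical stand-in for compressed_tensors.utils.Aliasable: treat
        alias members as equal to the member they point at."""

        @staticmethod
        def get_aliases() -> Dict[str, str]:
            raise NotImplementedError

        def __eq__(self, other):
            aliases = self.get_aliases()
            own = aliases.get(self.value, self.value)
            other_value = getattr(other, "value", other)
            return own == aliases.get(other_value, other_value)

        def __hash__(self):
            return hash(self.get_aliases().get(self.value, self.value))


    class ActOrder(AliasableSketch, str, Enum):
        GROUP = "group"
        WEIGHT = "weight"
        DYNAMIC = "dynamic"  # alias for GROUP
        STATIC = "static"    # alias for WEIGHT

        @staticmethod
        def get_aliases() -> Dict[str, str]:
            return {"dynamic": "group", "static": "weight"}


    print(ActOrder.DYNAMIC == ActOrder.GROUP)  # True: resolved through the alias table
    print(ActOrder.STATIC == ActOrder.WEIGHT)  # True
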
@@ -114,20 +127,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         """
         :return: torch quantization FakeQuantize built based on these QuantizationArgs
         """
-        from compressed_tensors.quantization.observers.base import Observer
-
-        # No observer required for the dynamic case
-        if self.dynamic:
-            self.observer = None
-            return self.observer
-
-        return Observer.load_from_registry(self.observer, quantization_args=self)
-
-    def get_kv_cache(self):
-        """Get the singleton KV Cache"""
-        from compressed_tensors.quantization.cache import QuantizedKVParameterCache
-
-        return QuantizedKVParameterCache(self)
+        return self.observer

     @field_validator("type", mode="before")
     def validate_type(cls, value) -> QuantizationType:
@@ -210,6 +210,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
                 "activation ordering"
             )

+        # infer observer w.r.t. dynamic
         if dynamic:
             if strategy not in (
                 QuantizationStrategy.TOKEN,
@@ -221,18 +222,19 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
                     "quantization",
                 )
             if observer is not None:
-                warnings.warn(
-                    "No observer is used for dynamic quantization, setting to None"
-                )
-                model.observer = None
+                if observer != "memoryless":  # avoid annoying users with old configs
+                    warnings.warn(
+                        "No observer is used for dynamic quantization, setting to None"
+                    )
+                observer = None

-        # if we have not set an observer and we
-        # are running static quantization, use minmax
-        if not observer and not dynamic:
-            model.observer = "minmax"
+        elif observer is None:
+            # default to minmax for non-dynamic cases
+            observer = "minmax"

         # write back modified values
         model.strategy = strategy
+        model.observer = observer
         return model

     def pytorch_dtype(self) -> torch.dtype:
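
Note: observer selection is now decided in the validator and written back onto the model: dynamic schemes drop the observer (warning only for values other than the old "memoryless" default), while static schemes fall back to "minmax". A small sketch that mirrors those branches (not the pydantic code itself):

    def resolve_observer(dynamic: bool, observer):
        # Mirrors the validator branches shown in the hunk above.
        if dynamic:
            if observer is not None and observer != "memoryless":
                print("warning: no observer is used for dynamic quantization")
            return None              # dynamic quantization computes scales on the fly
        if observer is None:
            return "minmax"          # static quantization defaults to the minmax observer
        return observer

    print(resolve_observer(dynamic=True, observer="minmax"))      # None (with warning)
    print(resolve_observer(dynamic=True, observer="memoryless"))  # None, no warning
    print(resolve_observer(dynamic=False, observer=None))         # "minmax"
    print(resolve_observer(dynamic=False, observer="mse"))        # "mse"
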
@@ -132,9 +132,9 @@ class QuantizationConfig(BaseModel):
         `k_proj` and `v_proj` in their names. If this is not the case
         and kv_cache_scheme != None, the quantization of kv cache will fail
     :global_compression_ratio: optional informational config to report the model
-    compression ratio acheived by the quantization config
+        compression ratio acheived by the quantization config
     :ignore: optional list of layers to ignore from config_groups. Layers in this list
-    are not quantized even if they match up with a target in config_groups
+        are not quantized even if they match up with a target in config_groups
     """

     config_groups: Dict[str, Union[QuantizationScheme, List[str]]]