compressed-tensors 0.11.1a20250820__py3-none-any.whl → 0.11.1a20250828__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. compressed_tensors/compressors/model_compressors/model_compressor.py +178 -156
  2. compressed_tensors/compressors/quantized_compressors/base.py +2 -2
  3. compressed_tensors/compressors/quantized_compressors/nvfp4_quantized.py +9 -9
  4. compressed_tensors/compressors/quantized_compressors/pack_quantized.py +4 -3
  5. compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py +1 -1
  6. compressed_tensors/compressors/sparse_quantized_compressors/marlin_24.py +1 -1
  7. compressed_tensors/quantization/lifecycle/apply.py +48 -142
  8. compressed_tensors/quantization/lifecycle/forward.py +5 -4
  9. compressed_tensors/quantization/lifecycle/initialize.py +7 -6
  10. compressed_tensors/quantization/quant_args.py +7 -5
  11. compressed_tensors/quantization/quant_scheme.py +4 -3
  12. compressed_tensors/quantization/utils/helpers.py +0 -1
  13. compressed_tensors/registry/registry.py +1 -1
  14. compressed_tensors/transform/transform_config.py +1 -1
  15. compressed_tensors/transform/utils/matrix.py +1 -1
  16. compressed_tensors/utils/match.py +57 -8
  17. compressed_tensors/utils/offload.py +0 -1
  18. compressed_tensors/utils/safetensors_load.py +0 -1
  19. compressed_tensors/version.py +1 -1
  20. {compressed_tensors-0.11.1a20250820.dist-info → compressed_tensors-0.11.1a20250828.dist-info}/METADATA +1 -1
  21. {compressed_tensors-0.11.1a20250820.dist-info → compressed_tensors-0.11.1a20250828.dist-info}/RECORD +24 -24
  22. {compressed_tensors-0.11.1a20250820.dist-info → compressed_tensors-0.11.1a20250828.dist-info}/WHEEL +0 -0
  23. {compressed_tensors-0.11.1a20250820.dist-info → compressed_tensors-0.11.1a20250828.dist-info}/licenses/LICENSE +0 -0
  24. {compressed_tensors-0.11.1a20250820.dist-info → compressed_tensors-0.11.1a20250828.dist-info}/top_level.txt +0 -0
@@ -42,8 +42,6 @@ from compressed_tensors.quantization import (
     apply_quantization_config,
     load_pretrained_quantization_parameters,
 )
-from compressed_tensors.quantization.lifecycle import expand_target_names
-from compressed_tensors.quantization.utils import is_module_quantized
 from compressed_tensors.transform import TransformConfig
 from compressed_tensors.utils import (
     align_module_device,
@@ -60,6 +58,7 @@ from compressed_tensors.utils.helpers import (
     fix_fsdp_module_name,
     is_compressed_tensors_config,
 )
+from compressed_tensors.utils.match import match_named_modules
 from torch import Tensor
 from torch.nn import Module
 from tqdm import tqdm
@@ -309,7 +308,7 @@ class ModelCompressor:
         if quantization_config is not None:
             # If a list of compression_format is not provided, we resolve the
             # relevant quantization formats using the config groups from the config
-            # and if those are not defined, we fall-back to the global quantization format
+            # and if those are not defined, we fall-back to the global quantization fmt
             if not self.compression_formats:
                 self.compression_formats = self._fetch_unique_quantization_formats()

@@ -342,13 +341,15 @@ class ModelCompressor:
             self.sparsity_compressor
             and self.sparsity_config.format != CompressionFormat.dense.value
         ):
-            sparse_targets = expand_target_names(
+            sparse_targets = match_named_modules(
                 model=model,
                 targets=self.sparsity_config.targets,
                 ignore=self.sparsity_config.ignore,
             )
+
             missing_keys.update(
-                merge_names(target, "weight") for target in sparse_targets
+                merge_names(target_name, "weight")
+                for target_name, _module in sparse_targets
             )

         # Determine missing keys due to pack quantization
@@ -358,13 +359,14 @@ class ModelCompressor:
             == CompressionFormat.pack_quantized.value
         ):
             for scheme in self.quantization_config.config_groups.values():
-                quant_targets = expand_target_names(
+                quant_targets = match_named_modules(
                     model=model,
                     targets=scheme.targets,
                     ignore=self.quantization_config.ignore,
                 )
                 missing_keys.update(
-                    merge_names(target, "weight") for target in quant_targets
+                    merge_names(target_name, "weight")
+                    for target_name, _module in quant_targets
                 )

         return list(missing_keys)
@@ -395,29 +397,29 @@ class ModelCompressor:
             self.sparsity_compressor
             and self.sparsity_config.format != CompressionFormat.dense.value
         ):
-            sparse_targets: Set[str] = expand_target_names(
+            sparse_targets = match_named_modules(
                 model=model,
                 targets=self.sparsity_config.targets,
                 ignore=self.sparsity_config.ignore,
             )
             unexpected_keys.update(
-                merge_names(target, param)
-                for target in sparse_targets
+                merge_names(target_name, param)
+                for target_name, _module in sparse_targets
                 for param in self.sparsity_compressor.compression_param_names
             )

         # Identify unexpected keys from quantization compression
         if self.quantization_compressor:
             for scheme in self.quantization_config.config_groups.values():
-                quant_targets: Set[str] = expand_target_names(
+                quant_targets = match_named_modules(
                     model=model,
                     targets=scheme.targets,
                     ignore=self.quantization_config.ignore,
                 )
                 for quant_compressor in self.quantization_compressor.values():
                     unexpected_keys.update(
-                        merge_names(target, param)
-                        for target in quant_targets
+                        merge_names(target_name, param)
+                        for target_name, _module in quant_targets
                         for param in quant_compressor.compression_param_names
                         if param != "weight"
                     )
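Across the hunks above, ModelCompressor's key-resolution paths switch from expand_target_names, which returned a set of module names, to match_named_modules, which yields (name, module) pairs, so each call site now unpacks the tuple and keeps only the name. A minimal sketch of the new pattern, assuming match_named_modules matches plain class names such as "Linear" the way quantization targets do elsewhere in the library (the toy Sequential model, targets, and ignore values are illustrative, not from this diff):

    import torch
    from compressed_tensors.utils import merge_names
    from compressed_tensors.utils.match import match_named_modules

    model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Linear(8, 8))

    # match_named_modules yields (name, module) pairs; keep only the name when
    # building the expected checkpoint keys, as the updated call sites do above
    expected_weight_keys = {
        merge_names(name, "weight")
        for name, _module in match_named_modules(
            model=model, targets=["Linear"], ignore=[]
        )
    }
    print(expected_weight_keys)  # expected contents: "0.weight" and "1.weight"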
@@ -434,73 +436,79 @@ class ModelCompressor:
         :param model: model containing parameters to compress
         """
         module_to_scheme = map_module_to_scheme(model)
-        sparse_compression_targets: Set[str] = expand_target_names(
-            model=model,
-            targets=self.sparsity_config.targets if self.sparsity_config else [],
-            ignore=self.sparsity_config.ignore if self.sparsity_config else [],
-        )
-
-        for prefix, module in tqdm(model.named_modules(), desc="Compressing model"):
-
-            if prefix in module_to_scheme or prefix in sparse_compression_targets:
-                module_device = get_execution_device(module)
-                is_meta = module_device.type == "meta"
-
-                exec_device = "meta" if is_meta else "cpu"
-                onloading_device = "meta" if is_meta else module_device
-
-                # in the future, support compression on same device
-                with align_module_device(module, execution_device=exec_device):
-                    state_dict = {
-                        f"{prefix}.{name}": param
-                        for name, param in module.named_parameters(recurse=False)
-                    }
-
-                    # quantization first
-                    if prefix in module_to_scheme:
-                        if (
-                            not hasattr(module.quantization_scheme, "format")
-                            or module.quantization_scheme.format is None
-                        ):
-                            if len(self.compression_formats) > 1:
-                                raise ValueError(
-                                    "Applying multiple compressors without defining "
-                                    "per module formats is not supported "
-                                )
-                            format = self.compression_formats[0]
-                        else:
-                            format = module.quantization_scheme.format
-
-                        quant_compressor = self.quantization_compressor.get(format)
-                        state_dict = quant_compressor.compress(
-                            state_dict,
-                            names_to_scheme=module_to_scheme,
-                            show_progress=False,
-                            compression_device=exec_device,
-                        )
-
-                    # sparsity second
-                    if prefix in sparse_compression_targets:
-                        state_dict = self.sparsity_compressor.compress(
-                            state_dict,
-                            compression_targets=sparse_compression_targets,
-                            show_progress=False,
-                        )
+        sparse_compression_targets = [
+            module_name
+            for module_name, _module in match_named_modules(
+                model=model,
+                targets=self.sparsity_config.targets if self.sparsity_config else [],
+                ignore=self.sparsity_config.ignore if self.sparsity_config else [],
+            )
+        ]
+        for prefix, module in tqdm(
+            match_named_modules(
+                model,
+                [*sparse_compression_targets, *module_to_scheme.keys()],
+                warn_on_fail=True,
+            ),
+            desc="Compressing model",
+        ):
+            module_device = get_execution_device(module)
+            is_meta = module_device.type == "meta"
+
+            exec_device = "meta" if is_meta else "cpu"
+            onloading_device = "meta" if is_meta else module_device
+
+            # in the future, support compression on same device
+            with align_module_device(module, execution_device=exec_device):
+                state_dict = {
+                    f"{prefix}.{name}": param
+                    for name, param in module.named_parameters(recurse=False)
+                }
+
+                # quantization first
+                if prefix in module_to_scheme:
+                    if (
+                        not hasattr(module.quantization_scheme, "format")
+                        or module.quantization_scheme.format is None
+                    ):
+                        if len(self.compression_formats) > 1:
+                            raise ValueError(
+                                "Applying multiple compressors without defining "
+                                "per module formats is not supported "
+                            )
+                        format = self.compression_formats[0]
+                    else:
+                        format = module.quantization_scheme.format
+
+                    quant_compressor = self.quantization_compressor.get(format)
+                    state_dict = quant_compressor.compress(
+                        state_dict,
+                        names_to_scheme=module_to_scheme,
+                        show_progress=False,
+                        compression_device=exec_device,
+                    )

-                # remove any existing parameters
-                offload_device = get_offloaded_device(module)
-                for name, _ in list(module.named_parameters(recurse=False)):
-                    delete_offload_parameter(module, name)
+                # sparsity second
+                if prefix in sparse_compression_targets:
+                    state_dict = self.sparsity_compressor.compress(
+                        state_dict,
+                        compression_targets=sparse_compression_targets,
+                        show_progress=False,
+                    )

-                # replace with compressed parameters
-                for name, value in state_dict.items():
-                    name = name.removeprefix(f"{prefix}.")
-                    value = value.to(onloading_device)
-                    param = torch.nn.Parameter(value, requires_grad=False)
-                    register_offload_parameter(module, name, param, offload_device)
+            # remove any existing parameters
+            offload_device = get_offloaded_device(module)
+            for name, _ in list(module.named_parameters(recurse=False)):
+                delete_offload_parameter(module, name)

-                module.quantization_status = QuantizationStatus.COMPRESSED
+            # replace with compressed parameters
+            for name, value in state_dict.items():
+                name = name.removeprefix(f"{prefix}.")
+                value = value.to(onloading_device)
+                param = torch.nn.Parameter(value, requires_grad=False)
+                register_offload_parameter(module, name, param, offload_device)

+            module.quantization_status = QuantizationStatus.COMPRESSED
         # TODO: consider sparse compression to also be compression
         if (
             self.quantization_config is not None
@@ -516,67 +524,75 @@ class ModelCompressor:
         :param model: model containing parameters to compress
         """
         module_to_scheme = map_module_to_scheme(model)
-        sparse_compression_targets: Set[str] = expand_target_names(
-            model=model,
-            targets=self.sparsity_config.targets if self.sparsity_config else [],
-            ignore=self.sparsity_config.ignore if self.sparsity_config else [],
-        )
-
-        for prefix, module in tqdm(model.named_modules(), desc="Decompressing model"):
-            if prefix in module_to_scheme or prefix in sparse_compression_targets:
-                # in the future, support decompression on same device
-                with align_module_device(module, execution_device="cpu"):
-                    state_dict = {
-                        f"{prefix}.{name}": param
-                        for name, param in module.named_parameters(recurse=False)
-                    }
-
-                    # sparsity first
-                    if prefix in sparse_compression_targets:
-                        # sparse_compression_targets are automatically inferred by this fn
-                        generator = self.sparsity_compressor.decompress_from_state_dict(
-                            state_dict,
-                        )
-                        # generates (param_path, param_val)
-                        # of compressed and unused params
-                        state_dict = {key: value for key, value in generator}
-
-                    # quantization second
-                    if prefix in module_to_scheme:
-
-                        if (
-                            not hasattr(module.quantization_scheme, "format")
-                            or module.quantization_scheme.format is None
-                        ):
-                            if len(self.compression_formats) > 1:
-                                raise ValueError(
-                                    "Applying multiple compressors without defining "
-                                    "per module formats is not supported "
-                                )
-                            format = self.compression_formats[0]
-                        else:
-                            format = module.quantization_scheme.format
-                        quant_compressor = self.quantization_compressor.get(format)
-                        state_dict = quant_compressor.decompress_module_from_state_dict(
-                            prefix,
-                            state_dict,
-                            scheme=module_to_scheme[prefix],
-                        )
+        sparse_compression_targets = [
+            module_name
+            for module_name, _module in match_named_modules(
+                model=model,
+                targets=self.sparsity_config.targets if self.sparsity_config else [],
+                ignore=self.sparsity_config.ignore if self.sparsity_config else [],
+            )
+        ]
+
+        for prefix, module in tqdm(
+            match_named_modules(
+                model,
+                [*sparse_compression_targets, *module_to_scheme.keys()],
+                warn_on_fail=True,
+            ),
+            desc="Decompressing model",
+        ):
+            # in the future, support decompression on same device
+            with align_module_device(module, execution_device="cpu"):
+                state_dict = {
+                    f"{prefix}.{name}": param
+                    for name, param in module.named_parameters(recurse=False)
+                }
+
+                # sparsity first
+                if prefix in sparse_compression_targets:
+                    # sparse_compression_targets are automatically inferred by this fn
+                    generator = self.sparsity_compressor.decompress_from_state_dict(
+                        state_dict,
+                    )
+                    # generates (param_path, param_val)
+                    # of compressed and unused params
+                    state_dict = {key: value for key, value in generator}
+
+                # quantization second
+                if prefix in module_to_scheme:
+                    if (
+                        not hasattr(module.quantization_scheme, "format")
+                        or module.quantization_scheme.format is None
+                    ):
+                        if len(self.compression_formats) > 1:
+                            raise ValueError(
+                                "Applying multiple compressors without defining "
+                                "per module formats is not supported "
+                            )
+                        format = self.compression_formats[0]
+                    else:
+                        format = module.quantization_scheme.format
+                    quant_compressor = self.quantization_compressor.get(format)
+                    state_dict = quant_compressor.decompress_module_from_state_dict(
+                        prefix,
+                        state_dict,
+                        scheme=module_to_scheme[prefix],
+                    )

-                # remove any existing parameters
-                exec_device = get_execution_device(module)
-                offload_device = get_offloaded_device(module)
-                for name, _ in list(module.named_parameters(recurse=False)):
-                    delete_offload_parameter(module, name)
+            # remove any existing parameters
+            exec_device = get_execution_device(module)
+            offload_device = get_offloaded_device(module)
+            for name, _ in list(module.named_parameters(recurse=False)):
+                delete_offload_parameter(module, name)

-                # replace with decompressed parameters
-                for name, value in state_dict.items():
-                    name = name.removeprefix(f"{prefix}.")
-                    value = value.to(exec_device)
-                    param = torch.nn.Parameter(value, requires_grad=False)
-                    register_offload_parameter(module, name, param, offload_device)
+            # replace with decompressed parameters
+            for name, value in state_dict.items():
+                name = name.removeprefix(f"{prefix}.")
+                value = value.to(exec_device)
+                param = torch.nn.Parameter(value, requires_grad=False)
+                register_offload_parameter(module, name, param, offload_device)

-                module.quantization_status = QuantizationStatus.FROZEN
+            module.quantization_status = QuantizationStatus.FROZEN

     # ----- state dict compression pathways ----- #

@@ -614,11 +630,14 @@ class ModelCompressor:
         )

        if self.sparsity_compressor is not None:
-            sparse_compression_targets: Set[str] = expand_target_names(
-                model=model,
-                targets=self.sparsity_config.targets,
-                ignore=self.sparsity_config.ignore,
-            )
+            sparse_compression_targets: Set[str] = {
+                module_name
+                for module_name, _module in match_named_modules(
+                    model=model,
+                    targets=self.sparsity_config.targets,
+                    ignore=self.sparsity_config.ignore,
+                )
+            }
             state_dict = self.sparsity_compressor.compress(
                 state_dict,
                 compression_targets=sparse_compression_targets,
@@ -641,11 +660,12 @@ class ModelCompressor:
         :param model_path: path to compressed weights
         :param model: pytorch model to load decompressed weights into

-        Note: decompress makes use of both _replace_sparsity_weights and _replace_weights
-        The variations in these methods are a result of the subtle variations between the sparsity
-        and quantization compressors. Specifically, quantization compressors return not just the
-        decompressed weight, but the quantization parameters (e.g scales, zero_point) whereas sparsity
-        compressors only return the decompressed weight.
+        Note: decompress makes use of both _replace_sparsity_weights and
+        _replace_weights. The variations in these methods are a result of the subtle
+        variations between the sparsity and quantization compressors. Specifically,
+        quantization compressors return not just the decompressed weight, but the
+        quantization parameters (e.g scales, zero_point) whereas sparsity compressors
+        only return the decompressed weight.

         """
         model_path = get_safetensors_folder(model_path)
@@ -683,18 +703,20 @@ class ModelCompressor:
             with override_quantization_status(
                 self.quantization_config, QuantizationStatus.FROZEN
             ):
-
-                names_to_scheme = apply_quantization_config(
-                    model, self.quantization_config
-                )
+                apply_quantization_config(model, self.quantization_config)
+                names_to_scheme: Set[QuantizationScheme] = {
+                    name: getattr(module, "quantization_scheme")
+                    for name, module in model.named_modules()
+                    if getattr(module, "quantization_scheme", None) is not None
+                }
                 # Load activation scales/zp or any other quantization parameters
-                # Conditionally load the weight quantization parameters if we have a dense compressor
-                # Or if a sparsity compressor has already been applied
+                # Conditionally load the weight quantization parameters if we have a
+                # dense compressor or if a sparsity compressor has already been applied
                 load_pretrained_quantization_parameters(
                     model,
                     model_path,
-                    # TODO: all weight quantization params will be moved to the compressor in a follow-up
-                    # including initialization
+                    # TODO: all weight quantization params will be moved to the
+                    # compressor in a follow-up including initialization
                     load_weight_quantization=(
                         sparse_decompressed
                         or isinstance(quant_compressor, DenseCompressor)
@@ -786,7 +808,6 @@ class ModelCompressor:
         :param model: The model whose weights are to be updated.
         """
         for name, data in tqdm(dense_weight_generator, desc="Decompressing model"):
-
             split_name = name.split(".")
             prefix, param_name = ".".join(split_name[:-1]), split_name[-1]
             module = operator.attrgetter(prefix)(model)
@@ -822,9 +843,10 @@ class ModelCompressor:
             for param_name, param_data in data.items():
                 if hasattr(module, param_name):
                     # If compressed, will have an incorrect dtype for transformers >4.49
-                    # TODO: we can also just skip initialization of scales/zp if in decompression in init
-                    # to be consistent with loading which happens later as well
-                    # however, update_data does a good shape check - should be moved to the compressor
+                    # TODO: we can also just skip initialization of scales/zp if in
+                    # decompression in init to be consistent with loading which happens
+                    # later as well however, update_data does a good shape check -
+                    # should be moved to the compressor
                     if param_name == "weight":
                         delattr(module, param_name)
                         requires_grad = param_data.dtype in (
@@ -24,7 +24,6 @@ from compressed_tensors.utils import (
     get_nested_weight_mappings,
     merge_names,
 )
-from compressed_tensors.utils.safetensors_load import match_param_name
 from safetensors import safe_open
 from torch import Tensor
 from tqdm import tqdm
@@ -107,7 +106,8 @@ class BaseQuantizationCompressor(BaseCompressor):
                     compressed_dict[name] = value.to(compression_device)
                     continue

-                # compress values on meta if loading from meta otherwise on cpu (memory movement too expensive)
+                # compress values on meta if loading from meta otherwise on cpu (memory
+                # movement too expensive)
                 module_path = prefix[:-1] if prefix.endswith(".") else prefix
                 quant_args = names_to_scheme[module_path].weights
                 compressed_values = self.compress_weight(
@@ -15,7 +15,6 @@

 from typing import Dict, Optional, Tuple

-import numpy
 import torch
 from compressed_tensors.compressors.base import BaseCompressor
 from compressed_tensors.compressors.quantized_compressors.base import (
@@ -92,7 +91,6 @@ class NVFP4PackedCompressor(BaseQuantizationCompressor):
         zero_point: Optional[torch.Tensor] = None,
         g_idx: Optional[torch.Tensor] = None,
     ) -> Dict[str, torch.Tensor]:
-
         quantized_weight = quantize(
             x=weight,
             scale=scale,
@@ -112,7 +110,6 @@ class NVFP4PackedCompressor(BaseQuantizationCompressor):
         compressed_data: Dict[str, Tensor],
         quantization_args: Optional[QuantizationArgs] = None,
     ) -> torch.Tensor:
-
         weight = compressed_data["weight_packed"]
         scale = compressed_data["weight_scale"]
         global_scale = compressed_data["weight_global_scale"]
@@ -126,6 +123,7 @@ class NVFP4PackedCompressor(BaseQuantizationCompressor):
         return decompressed_weight


+@torch.compile(fullgraph=True, dynamic=True)
 def pack_fp4_to_uint8(x: torch.Tensor) -> torch.Tensor:
     """
     Packs a tensor with values in the fp4 range into uint8.
@@ -148,12 +146,11 @@ def pack_fp4_to_uint8(x: torch.Tensor) -> torch.Tensor:

     # Find closest valid FP4 value index for each element
     abs_x = torch.abs(x)
-    abs_indices = torch.zeros_like(abs_x, dtype=torch.long)
-    for i, val in enumerate(kE2M1):
-        abs_indices = torch.where(torch.isclose(abs_x, val), i, abs_indices)
+    abs_diff_x = torch.abs(abs_x.unsqueeze(-1) - kE2M1)  # [m, n, 8]
+    abs_indices = torch.argmin(abs_diff_x, dim=-1)  # [m, n]

     # Apply sign bit (bit 3) to get final 4-bit representation
-    indices = abs_indices + (torch.signbit(x) << 3).to(torch.long)
+    indices = abs_indices + (torch.signbit(x).to(torch.long) << 3)

     # Reshape to prepare for packing pairs of values
     indices = indices.reshape(-1)
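The pack_fp4_to_uint8 change above replaces a Python loop of torch.isclose probes with one broadcasted distance computation plus argmin, which also keeps the function traceable for the new @torch.compile decorator. A small, self-contained check (input values are illustrative) that the two index selections agree for exact FP4 code values:

    import torch

    # the eight non-negative E2M1 (FP4) magnitudes used by the compressor
    kE2M1 = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])

    x = torch.tensor([[0.5, -6.0], [1.5, -0.0]])
    abs_x = torch.abs(x)

    # old approach: probe each code value with torch.isclose in a Python loop
    loop_idx = torch.zeros_like(abs_x, dtype=torch.long)
    for i, val in enumerate(kE2M1):
        loop_idx = torch.where(torch.isclose(abs_x, val), i, loop_idx)

    # new approach: nearest code value via one broadcasted subtraction and argmin
    argmin_idx = torch.argmin(torch.abs(abs_x.unsqueeze(-1) - kE2M1), dim=-1)

    assert torch.equal(loop_idx, argmin_idx)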
@@ -175,14 +172,17 @@ kE2M1ToFloat = torch.tensor(
     [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0], dtype=torch.float32
 )

+
 # reference: : https://github.com/vllm-project/vllm/pull/16362
+@torch.compile(fullgraph=True, dynamic=True)
 def unpack_fp4_from_uint8(
     a: torch.Tensor, m: int, n: int, dtype: Optional[torch.dtype] = torch.bfloat16
 ) -> torch.Tensor:
     """
     Unpacks uint8 values into fp4. Each uint8 consists of two fp4 values
-    (i.e. first four bits correspond to one fp4 value, last four corresond to a consecutive
-    fp4 value). The bits represent an index, which are mapped to an fp4 value.
+    (i.e. first four bits correspond to one fp4 value, last four correspond to a
+    consecutive fp4 value). The bits represent an index, which are mapped to an fp4
+    value.

     :param a: tensor to unpack
     :param m: original dim 0 size of the unpacked tensor
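As the corrected docstring above describes, each packed byte carries two 4-bit indices into the FP4 code table; per pack_fp4_to_uint8, the low three bits of a nibble select the magnitude and bit 3 the sign. A rough sketch of reading both nibbles back out, ignoring the sign bit and the library's exact nibble-ordering convention (the packed byte here is illustrative):

    import torch

    # FP4 (E2M1) magnitudes indexed by the low three bits of each nibble
    kE2M1ToFloat = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])

    packed = torch.tensor([0x31], dtype=torch.uint8)  # two nibbles: 0x1 and 0x3
    low = (packed & 0x0F).to(torch.long)
    high = (packed >> 4).to(torch.long)

    print(kE2M1ToFloat[low & 0x7], kE2M1ToFloat[high & 0x7])  # -> 0.5 and 1.5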
@@ -14,7 +14,6 @@
 import math
 from typing import Dict, Literal, Optional, Tuple, Union

-import numpy as np
 import torch
 from compressed_tensors.compressors.base import BaseCompressor
 from compressed_tensors.compressors.quantized_compressors.base import (
@@ -135,7 +134,8 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
         compressed_dict["weight_shape"] = weight_shape
         compressed_dict["weight_packed"] = packed_weight

-        # We typically don't compress zp; apart from when using the packed_compressor and when storing group/channel zp
+        # We typically don't compress zp; apart from when using the packed_compressor
+        # and when storing group/channel zp
         if not quantization_args.symmetric and quantization_args.strategy in [
             QuantizationStrategy.GROUP.value,
             QuantizationStrategy.CHANNEL.value,
@@ -166,7 +166,8 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
         num_bits = quantization_args.num_bits
         unpacked = unpack_from_int32(weight, num_bits, original_shape)

-        # NOTE: this will fail decompression as we don't currently handle packed zp on decompression
+        # NOTE: this will fail decompression as we don't currently handle packed zp on
+        # decompression
         if not quantization_args.symmetric and quantization_args.strategy in [
             QuantizationStrategy.GROUP.value,
             QuantizationStrategy.CHANNEL.value,
@@ -13,7 +13,7 @@
 # limitations under the License.

 from dataclasses import dataclass
-from typing import Dict, Generator, List, Tuple, Union
+from typing import Dict, List, Tuple, Union

 import torch
 from compressed_tensors.compressors.base import BaseCompressor
@@ -48,7 +48,7 @@ class Marlin24Compressor(BaseCompressor):

     @staticmethod
     def validate_quant_compatability(
-        names_to_scheme: Dict[str, QuantizationScheme]
+        names_to_scheme: Dict[str, QuantizationScheme],
     ) -> bool:
         """
         Checks if every quantized module in the model is compatible with Marlin24