compressed-tensors 0.9.4a20250414__tar.gz → 0.9.5a20250424__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/.github/workflows/build.yml +1 -1
  2. {compressed_tensors-0.9.4a20250414/src/compressed_tensors.egg-info → compressed_tensors-0.9.5a20250424}/PKG-INFO +1 -1
  3. compressed_tensors-0.9.5a20250424/pyproject.toml +3 -0
  4. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/setup.py +1 -0
  5. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/compressors/base.py +6 -1
  6. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/compressors/model_compressors/model_compressor.py +90 -7
  7. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/compressors/quantized_compressors/base.py +21 -6
  8. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py +88 -21
  9. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/compressors/sparse_compressors/base.py +21 -4
  10. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/quantization/lifecycle/apply.py +65 -30
  11. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/quantization/lifecycle/initialize.py +13 -2
  12. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/utils/offload.py +20 -17
  13. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/utils/safetensors_load.py +10 -8
  14. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/version.py +2 -2
  15. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424/src/compressed_tensors.egg-info}/PKG-INFO +1 -1
  16. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_compressors/model_compressors/test_model_compressor.py +2 -0
  17. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_compressors/quantized_compressors/test_fp8_quant.py +1 -1
  18. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_compressors/quantized_compressors/test_int_quant.py +2 -2
  19. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_compressors/quantized_compressors/test_pack_quant.py +62 -4
  20. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_utils/test_offload.py +46 -1
  21. compressed_tensors-0.9.4a20250414/pyproject.toml +0 -10
  22. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/.github/.gitkeep +0 -0
  23. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/.github/actions/test/action.yml +0 -0
  24. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/.github/scripts/step-status +0 -0
  25. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/.github/workflows/build-test.yml +0 -0
  26. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/.github/workflows/report.yml +0 -0
  27. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/.github/workflows/test-check.yaml +0 -0
  28. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/.github/workflows/test.yml +0 -0
  29. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/.github/workflows/trigger-all.yml +0 -0
  30. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/.github/workflows/upload.yml +0 -0
  31. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/.gitignore +0 -0
  32. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/LICENSE +0 -0
  33. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/Makefile +0 -0
  34. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/README.md +0 -0
  35. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/examples/bit_packing/ex_quantize_and_pack.py +0 -0
  36. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/examples/bit_packing/int4_config.json +0 -0
  37. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/examples/bitmask_compression.ipynb +0 -0
  38. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/examples/llama_1.1b/ex_config_quantization.py +0 -0
  39. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/examples/llama_1.1b/ex_llmcompressor_quantization.py +0 -0
  40. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/examples/llama_1.1b/example_quant_config.json +0 -0
  41. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/examples/llama_1.1b/example_quant_recipe.yaml +0 -0
  42. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/examples/quantize_and_pack_int4.ipynb +0 -0
  43. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/setup.cfg +0 -0
  44. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/__init__.py +0 -0
  45. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/README.md +0 -0
  46. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/__init__.py +0 -0
  47. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/base.py +0 -0
  48. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/compressors/__init__.py +0 -0
  49. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/compressors/helpers.py +0 -0
  50. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/compressors/model_compressors/__init__.py +0 -0
  51. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/compressors/quantized_compressors/__init__.py +0 -0
  52. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/compressors/quantized_compressors/naive_quantized.py +0 -0
  53. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/compressors/sparse_compressors/__init__.py +0 -0
  54. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/compressors/sparse_compressors/dense.py +0 -0
  55. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py +0 -0
  56. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/compressors/sparse_compressors/sparse_bitmask.py +0 -0
  57. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/compressors/sparse_quantized_compressors/__init__.py +0 -0
  58. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/compressors/sparse_quantized_compressors/marlin_24.py +0 -0
  59. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/config/__init__.py +0 -0
  60. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/config/base.py +0 -0
  61. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/config/dense.py +0 -0
  62. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/config/sparse_24_bitmask.py +0 -0
  63. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/config/sparse_bitmask.py +0 -0
  64. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/linear/__init__.py +0 -0
  65. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/linear/compressed_linear.py +0 -0
  66. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/quantization/__init__.py +0 -0
  67. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/quantization/lifecycle/__init__.py +0 -0
  68. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/quantization/lifecycle/compressed.py +0 -0
  69. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/quantization/lifecycle/forward.py +0 -0
  70. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/quantization/lifecycle/helpers.py +0 -0
  71. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/quantization/quant_args.py +0 -0
  72. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/quantization/quant_config.py +0 -0
  73. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/quantization/quant_scheme.py +0 -0
  74. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/quantization/utils/__init__.py +0 -0
  75. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/quantization/utils/helpers.py +0 -0
  76. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/registry/__init__.py +0 -0
  77. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/registry/registry.py +0 -0
  78. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/utils/__init__.py +0 -0
  79. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/utils/helpers.py +0 -0
  80. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/utils/permutations_24.py +0 -0
  81. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/utils/permute.py +0 -0
  82. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/utils/semi_structured_conversions.py +0 -0
  83. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors.egg-info/SOURCES.txt +0 -0
  84. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors.egg-info/dependency_links.txt +0 -0
  85. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors.egg-info/requires.txt +0 -0
  86. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors.egg-info/top_level.txt +0 -0
  87. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/__init__.py +0 -0
  88. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/conftest.py +0 -0
  89. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_compressors/__init__.py +0 -0
  90. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_compressors/model_compressors/__init__.py +0 -0
  91. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_compressors/quantized_compressors/__init__.py +0 -0
  92. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_compressors/sparse_compressors/__init__.py +0 -0
  93. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_compressors/sparse_compressors/test_bitmask.py +0 -0
  94. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_compressors/sparse_compressors/test_sparse_24_bitmask.py +0 -0
  95. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_compressors/sparse_quantized_compressors/__init__.py +0 -0
  96. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_compressors/sparse_quantized_compressors/test_marlin_24.py +0 -0
  97. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_configs/__init__.py +0 -0
  98. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_configs/test_base.py +0 -0
  99. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_examples/test_bitmask_compression_ipynb.py +0 -0
  100. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_linear/__init__.py +0 -0
  101. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_linear/test_compressed_linear.py +0 -0
  102. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/__init__.py +0 -0
  103. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/lifecycle/__init__.py +0 -0
  104. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/lifecycle/conftest.py +0 -0
  105. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/lifecycle/test_apply.py +0 -0
  106. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/lifecycle/test_dynamic_lifecycle.py +0 -0
  107. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/lifecycle/test_enabled.py +0 -0
  108. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/lifecycle/test_forward.py +0 -0
  109. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/lifecycle/test_helpers.py +0 -0
  110. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/lifecycle/test_initialize.py +0 -0
  111. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/lifecycle/test_lifecycle.py +0 -0
  112. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/test_configs/__init__.py +0 -0
  113. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/test_configs/test_bit_depths.py +0 -0
  114. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/test_configs/test_strategies.py +0 -0
  115. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/test_quant_args.py +0 -0
  116. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/test_quant_config.py +0 -0
  117. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/test_quant_scheme.py +0 -0
  118. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/test_utils/test_helpers.py +0 -0
  119. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_registry.py +0 -0
  120. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_utils/__init__.py +0 -0
  121. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_utils/test_helpers.py +0 -0
  122. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_utils/test_safetensors_load.py +0 -0
  123. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/testing_utils.py +0 -0
  124. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/utils/copyright.py +0 -0
.github/workflows/build.yml
@@ -76,7 +76,7 @@ jobs:
 
       - name: build
        id: build
-        uses: neuralmagic/nm-actions/actions/build-ml-whl@v1.18.0
+        uses: neuralmagic/nm-actions/actions/build-ml-whl@fix-whl-checks
        with:
          dev: false
          release: ${{ inputs.wf_category == 'RELEASE' }}
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.9.4a20250414
+Version: 0.9.5a20250424
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
pyproject.toml (new file)
@@ -0,0 +1,3 @@
+[tool.black]
+line-length = 88
+target-version = ['py36']
setup.py
@@ -101,6 +101,7 @@ setup(
     use_scm_version={
         "version_scheme": version_func,
         "local_scheme": localversion_func,
+        "version_file": "src/compressed_tensors/version.py",
     },
     author="Neuralmagic, Inc.",
     author_email="support@neuralmagic.com",
src/compressed_tensors/compressors/base.py
@@ -19,6 +19,7 @@ import torch
 from compressed_tensors.config import SparsityCompressionConfig
 from compressed_tensors.quantization import QuantizationArgs, QuantizationConfig
 from compressed_tensors.registry import RegistryMixin
+from compressed_tensors.utils import has_offloaded_params
 from torch import Tensor
 from torch.nn import Module
 
@@ -169,6 +170,10 @@ class BaseCompressor(RegistryMixin, ABC):
         :param module: PyTorch module to decompress
         :return: tensor of the decompressed weight, or None if module is not quantized
         """
+
+        params_device = next(module.parameters()).device
+        device = "cpu" if has_offloaded_params(module) else params_device
+
         if not hasattr(module, "quantization_scheme"):
             return None  # module is not quantized
         quantization_scheme = module.quantization_scheme
@@ -182,7 +187,7 @@ class BaseCompressor(RegistryMixin, ABC):
 
         return self.decompress_weight(
             compressed_data=compressed_data, quantization_args=quantization_args
-        )
+        ).to(device)
 
     def decompress_weight(
         self, compressed_data: Dict[str, Tensor], **kwargs
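
Taken together, these hunks make the module decompression path offload-aware: when a module's parameters are managed by an offloading hook, the decompressed weight is materialized on CPU rather than on the parameters' apparent device. A minimal sketch of the rule, reusing the has_offloaded_params utility imported in the first hunk (the standalone helper name here is illustrative):

    import torch
    from compressed_tensors.utils import has_offloaded_params

    def decompression_device(module: torch.nn.Module) -> torch.device:
        # Offloaded modules hold placeholder tensors, so decompressed
        # data must land on CPU for the offload machinery to manage it.
        if has_offloaded_params(module):
            return torch.device("cpu")
        # Otherwise follow wherever the module's parameters actually live.
        return next(module.parameters()).device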
src/compressed_tensors/compressors/model_compressors/model_compressor.py
@@ -31,13 +31,14 @@ from compressed_tensors.base import (
     SPARSITY_CONFIG_NAME,
 )
 from compressed_tensors.compressors.base import BaseCompressor
+from compressed_tensors.compressors.sparse_compressors import DenseCompressor
 from compressed_tensors.config import CompressionFormat, SparsityCompressionConfig
 from compressed_tensors.quantization import (
     DEFAULT_QUANTIZATION_METHOD,
     QuantizationConfig,
     QuantizationStatus,
     apply_quantization_config,
-    load_pretrained_quantization,
+    load_pretrained_quantization_parameters,
 )
 from compressed_tensors.quantization.lifecycle import expand_target_names
 from compressed_tensors.quantization.quant_args import QuantizationArgs
@@ -47,7 +48,9 @@ from compressed_tensors.quantization.utils import (
 )
 from compressed_tensors.utils import (
     get_safetensors_folder,
+    has_offloaded_params,
     merge_names,
+    register_offload_parameter,
     update_parameter_data,
 )
 from compressed_tensors.utils.helpers import (
@@ -382,6 +385,7 @@ class ModelCompressor:
         compressed_state_dict = self.quantization_compressor.compress(
             state_dict, names_to_scheme=quantized_modules_to_args
         )
+
         if self.quantization_config.format != CompressionFormat.dense.value:
             self.quantization_config.quantization_status = (
                 QuantizationStatus.COMPRESSED
@@ -411,6 +415,13 @@
 
         :param model_path: path to compressed weights
         :param model: pytorch model to load decompressed weights into
+
+        Note: decompress makes use of both _replace_sparsity_weights and _replace_weights.
+        The variations in these methods result from subtle differences between the sparsity
+        and quantization compressors. Specifically, quantization compressors return not just the
+        decompressed weight but also the quantization parameters (e.g. scales, zero_point), whereas
+        sparsity compressors only return the decompressed weight.
+
         """
         model_path = get_safetensors_folder(model_path)
         sparse_decompressed = False
@@ -419,9 +430,16 @@
             self.sparsity_compressor is not None
             and self.sparsity_config.format != CompressionFormat.dense.value
         ):
+            params_to_ignore = None
+            if self.quantization_compressor is not None:
+                params_to_ignore = self.quantization_compressor.compression_param_names
             # Sparse decompression is applied on the model_path
-            dense_gen = self.sparsity_compressor.decompress(model_path)
-            self._replace_weights(dense_gen, model)
+            # The compressor will try and load any quantization parameters as well
+            # params_to_skip_load will skip over quantization params from being loaded
+            dense_gen = self.sparsity_compressor.decompress(
+                model_path, params_to_skip_load=params_to_ignore
+            )
+            self._replace_sparsity_weights(dense_gen, model)
             setattr(model, SPARSITY_CONFIG_NAME, self.sparsity_compressor.config)
             sparse_decompressed = True
 
@@ -430,13 +448,27 @@
             # quantization during apply_quantization_config. This ensures
             # that the dtypes of the weights are not unintentionally updated.
             # The status is restored after quantization params are loaded.
+
             with override_quantization_status(
                 self.quantization_config, QuantizationStatus.FROZEN
             ):
+
                 names_to_scheme = apply_quantization_config(
                     model, self.quantization_config
                 )
-                load_pretrained_quantization(model, model_path)
+                # Load activation scales/zp or any other quantization parameters
+                # Conditionally load the weight quantization parameters if we have a dense compressor
+                # Or if a sparsity compressor has already been applied
+                load_pretrained_quantization_parameters(
+                    model,
+                    model_path,
+                    # TODO: all weight quantization params will be moved to the compressor in a follow-up
+                    # including initialization
+                    load_weight_quantization=(
+                        sparse_decompressed
+                        or isinstance(self.quantization_compressor, DenseCompressor)
+                    ),
+                )
 
             model_path_or_state_dict = (
                 model.state_dict() if sparse_decompressed else model_path
@@ -445,6 +477,8 @@
             dense_gen = self.quantization_compressor.decompress(
                 model_path_or_state_dict, names_to_scheme=names_to_scheme
             )
+            # TODO: all weight quantization params will be moved to the compressor
+            # to prevent duplicate parameter updates in update_parameter_data
             self._replace_weights(dense_gen, model)
 
         def freeze_quantization_status(module):
@@ -500,7 +534,7 @@
         with open(config_file_path, "w") as config_file:
             json.dump(config_data, config_file, indent=2, sort_keys=True)
 
-    def _replace_weights(self, dense_weight_generator, model: Module):
+    def _replace_sparsity_weights(self, dense_weight_generator, model: Module):
         """
         Replace the weights of the model with the
         provided dense weights.
@@ -515,11 +549,60 @@
         :param model: The model whose weights are to be updated.
         """
         for name, data in tqdm(dense_weight_generator, desc="Decompressing model"):
+
             split_name = name.split(".")
             prefix, param_name = ".".join(split_name[:-1]), split_name[-1]
             module = operator.attrgetter(prefix)(model)
-            if hasattr(module, param_name):
-                update_parameter_data(module, data, param_name)
+
+            params_device = next(module.parameters()).device
+            device = "cpu" if has_offloaded_params(module) else params_device
+            delattr(module, param_name)
+            requires_grad = data.dtype in (torch.float16, torch.float32, torch.bfloat16)
+            param = torch.nn.Parameter(data.to(device), requires_grad=requires_grad)
+            register_offload_parameter(module, param_name, param)
+
+    def _replace_weights(self, dense_weight_generator, model: Module):
+        """
+        Replace the weights of the model with the
+        provided dense weights.
+
+        This method iterates over the dense_weight_generator and
+        updates the corresponding weights in the model. If a parameter
+        name does not exist in the model, it will be skipped.
+
+        :param dense_weight_generator (generator): A generator that yields
+            tuples of (name, data), where 'name' is the parameter name and
+            'data' is the updated param data
+        :param model: The model whose weights are to be updated.
+        """
+
+        for name, data in tqdm(dense_weight_generator, desc="Decompressing model"):
+            module = operator.attrgetter(name)(model)
+
+            params_device = next(module.parameters()).device
+            device = "cpu" if has_offloaded_params(module) else params_device
+
+            for param_name, param_data in data.items():
+                if hasattr(module, param_name):
+                    # If compressed, will have an incorrect dtype for transformers >4.49
+                    # TODO: we can also just skip initialization of scales/zp if in decompression in init
+                    # to be consistent with loading which happens later as well
+                    # however, update_data does a good shape check - should be moved to the compressor
+                    if param_name == "weight":
+                        delattr(module, param_name)
+                        requires_grad = param_data.dtype in (
+                            torch.float16,
+                            torch.float32,
+                            torch.bfloat16,
+                        )
+                        param = torch.nn.Parameter(
+                            param_data.to(device), requires_grad=requires_grad
+                        )
+                        register_offload_parameter(module, param_name, param)
+                    else:
+                        # Scales/zero-points should already be registered
+                        # to the correct device
+                        update_parameter_data(module, param_data, param_name)
 
 
 def map_modules_to_quant_args(
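
The docstring note added to decompress is the crux of this file's changes: the sparsity and quantization decompressors now yield differently shaped items, hence the split into _replace_sparsity_weights and _replace_weights. A schematic of the two contracts (the example names are illustrative, not taken from the library):

    # Sparsity decompressor: one (param_name, tensor) pair at a time, e.g.
    #   ("model.layers.0.mlp.down_proj.weight", dense_weight)
    for name, data in sparsity_generator:
        module_path, param_name = name.rsplit(".", 1)
        ...  # re-register the single dense tensor on the module

    # Quantization decompressor: one (module_name, param_dict) pair, e.g.
    #   ("model.layers.0.mlp.down_proj",
    #    {"weight": w, "weight_scale": s, "weight_zero_point": zp})
    for module_path, params in quantization_generator:
        for param_name, tensor in params.items():
            ...  # re-register "weight", update scales/zp in place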
src/compressed_tensors/compressors/quantized_compressors/base.py
@@ -14,11 +14,11 @@
 
 import logging
 from pathlib import Path
-from typing import Any, Dict, Generator, Tuple, Union
+from typing import Any, Dict, Generator, Optional, Tuple, Union
 
 import torch
 from compressed_tensors.compressors.base import BaseCompressor
-from compressed_tensors.quantization import QuantizationArgs
+from compressed_tensors.quantization import QuantizationArgs, QuantizationStrategy
 from compressed_tensors.utils import (
     get_nested_mappings_from_state_dict,
     get_nested_weight_mappings,
@@ -132,8 +132,10 @@ class BaseQuantizationCompressor(BaseCompressor):
                     compressed_dict[merge_names(prefix, key)] = value
                 else:
                     compressed_dict[name] = value.to("cpu")
-            # only save if asym
-            elif is_weight_zp and quant_args_zp.symmetric:
+            # only save zp if asym and not packed zp
+            elif is_weight_zp and (
+                quant_args_zp.symmetric or self._check_if_zp_pack_quantized(quant_args)
+            ):
                 continue
             # only save if asym
             elif is_input_zp and input_args_zp.symmetric:
@@ -145,6 +147,17 @@ class BaseQuantizationCompressor(BaseCompressor):
 
         return compressed_dict
 
+    def _check_if_zp_pack_quantized(self, quant_args):
+        from compressed_tensors.compressors import PackedQuantizationCompressor
+
+        if isinstance(self, PackedQuantizationCompressor):
+            if not quant_args.symmetric and quant_args.strategy in [
+                QuantizationStrategy.GROUP.value,
+                QuantizationStrategy.CHANNEL.value,
+            ]:
+                return True
+        return False
+
     def decompress(
         self,
         path_to_model_or_tensors: Union[str, Path, Dict[str, Any]],
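
In practice the new helper returns True only for the packed compressor when the scheme is asymmetric with a group or channel strategy; otherwise the zero-point is either implicit (symmetric) or stored unpacked. A hypothetical check, with QuantizationArgs fields taken from the condition above (constructor usage assumed):

    asym = QuantizationArgs(num_bits=4, symmetric=False, strategy="group", group_size=128)
    compressor._check_if_zp_pack_quantized(asym)  # True: zp is packed rather than saved raw

    sym = QuantizationArgs(num_bits=4, symmetric=True, strategy="group", group_size=128)
    compressor._check_if_zp_pack_quantized(sym)   # False: symmetric, no zp is stored at all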
@@ -186,7 +199,8 @@ class BaseQuantizationCompressor(BaseCompressor):
             decompressed = self.decompress_weight(
                 compressed_data=weight_data, quantization_args=quant_args
             )
-            yield merge_names(weight_name, "weight"), decompressed
+            weight_data["weight"] = decompressed
+            yield weight_name, weight_data
 
     def _decompress_from_state_dict(self, state_dict, names_to_scheme):
         weight_mappings = get_nested_mappings_from_state_dict(
@@ -202,4 +216,5 @@ class BaseQuantizationCompressor(BaseCompressor):
             decompressed = self.decompress_weight(
                 compressed_data=weight_data, quantization_args=quant_args
             )
-            yield merge_names(weight_name, "weight"), decompressed
+            weight_data["weight"] = decompressed
+            yield weight_name, weight_data
src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import math
-from typing import Dict, Optional, Tuple
+from typing import Dict, Literal, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -21,7 +21,7 @@ from compressed_tensors.compressors.quantized_compressors.base import (
     BaseQuantizationCompressor,
 )
 from compressed_tensors.config import CompressionFormat
-from compressed_tensors.quantization import QuantizationArgs
+from compressed_tensors.quantization import QuantizationArgs, QuantizationStrategy
 from compressed_tensors.quantization.lifecycle.forward import dequantize, quantize
 from compressed_tensors.quantization.utils import can_quantize
 from torch import Tensor
@@ -65,10 +65,26 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
         """
         pack_factor = 32 // quantization_args.num_bits
         packed_size = math.ceil(weight_shape[1] / pack_factor)
-        return {
+        packed_size_zp = math.ceil(weight_shape[0] / pack_factor)
+        output = {
             "weight_packed": (torch.Size((weight_shape[0], packed_size)), torch.int32),
             "weight_shape": (torch.Size((2,)), torch.int32),
         }
+        if not quantization_args.symmetric and quantization_args.strategy in [
+            QuantizationStrategy.GROUP.value,
+            QuantizationStrategy.CHANNEL.value,
+        ]:
+            zp_factor = (
+                quantization_args.group_size
+                if quantization_args.strategy == QuantizationStrategy.GROUP.value
+                else weight_shape[-1]
+            )
+
+            output["weight_zero_point"] = (
+                torch.Size((packed_size_zp, weight_shape[-1] // zp_factor)),
+                torch.int32,
+            )
+        return output
 
     def compress_weight(
         self,
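
Working the new shape math through a concrete case: for an asymmetric 4-bit weight of shape (4096, 4096) with group_size 128, pack_factor = 32 // 4 = 8, so weight_packed has shape (4096, ceil(4096 / 8)) = (4096, 512). The zero-point carries one column per group, 4096 // 128 = 32, and is packed down dim 0 by the same factor, giving weight_zero_point shape (ceil(4096 / 8), 32) = (512, 32).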
@@ -104,6 +120,7 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
             quantized_weight = weight
 
         packed_weight = pack_to_int32(quantized_weight, quantization_args.num_bits)
+
         weight_shape = torch.tensor(weight.shape)
         if device is not None:
             packed_weight = packed_weight.to(device)
@@ -112,6 +129,15 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
         compressed_dict["weight_shape"] = weight_shape
         compressed_dict["weight_packed"] = packed_weight
 
+        # zp is typically not compressed, except when the packed compressor stores group/channel zp
+        if not quantization_args.symmetric and quantization_args.strategy in [
+            QuantizationStrategy.GROUP.value,
+            QuantizationStrategy.CHANNEL.value,
+        ]:
+            packed_zp = pack_to_int32(
+                zero_point, quantization_args.num_bits, packed_dim=0
+            )
+            compressed_dict["weight_zero_point"] = packed_zp
         return compressed_dict
 
     def decompress_weight(
@@ -133,6 +159,21 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
         original_shape = torch.Size(compressed_data["weight_shape"])
         num_bits = quantization_args.num_bits
         unpacked = unpack_from_int32(weight, num_bits, original_shape)
+
+        # NOTE: this will fail decompression as we don't currently handle packed zp on decompression
+        if not quantization_args.symmetric and quantization_args.strategy in [
+            QuantizationStrategy.GROUP.value,
+            QuantizationStrategy.CHANNEL.value,
+        ]:
+            raise ValueError(
+                "Decompression of packed zero points is currently not supported"
+            )
+            assert zero_point is not None
+            original_zp_shape = (original_shape[0], scale.shape[-1])
+            zero_point = unpack_from_int32(
+                zero_point, num_bits, original_zp_shape, packed_dim=0
+            )
+
         decompressed_weight = dequantize(
             x_q=unpacked, scale=scale, zero_point=zero_point, g_idx=g_idx
         )
@@ -140,7 +181,11 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
         return decompressed_weight
 
 
-def pack_to_int32(value: torch.Tensor, num_bits: int) -> torch.Tensor:
+def pack_to_int32(
+    value: torch.Tensor,
+    num_bits: int,
+    packed_dim: Union[Literal[0], Literal[1]] = 1,
+) -> torch.Tensor:
     """
     Packs a tensor of quantized weights stored in int8 into int32s with padding
 
@@ -176,14 +221,19 @@ def pack_to_int32(value: torch.Tensor, num_bits: int) -> torch.Tensor:
     pack_factor = 32 // num_bits
 
     # pad input tensor and initialize packed output
-    packed_size = math.ceil(value.shape[1] / pack_factor)
-    padding = packed_size * pack_factor - value.shape[1]
+    packed_size = math.ceil(value.shape[packed_dim] / pack_factor)
+    padding = packed_size * pack_factor - value.shape[packed_dim]
     value = np.pad(value, pad_width=[(0, 0), (0, padding)], constant_values=0)
 
     # pack values
-    packed = np.zeros((value.shape[0], packed_size), dtype=np.uint32)
-    for i in range(pack_factor):
-        packed |= value[:, i::pack_factor] << num_bits * i
+    if packed_dim == 1:
+        packed = np.zeros((value.shape[0], packed_size), dtype=np.uint32)
+        for i in range(pack_factor):
+            packed |= value[:, i::pack_factor] << num_bits * i
+    else:
+        packed = np.zeros((packed_size, value.shape[1]), dtype=np.uint32)
+        for i in range(pack_factor):
+            packed |= value[i::pack_factor, :] << num_bits * i
 
     # convert back to signed and torch
     packed = np.ascontiguousarray(packed).view(np.int32)
@@ -191,7 +241,10 @@ def pack_to_int32(value: torch.Tensor, num_bits: int) -> torch.Tensor:
 
 
 def unpack_from_int32(
-    value: torch.Tensor, num_bits: int, shape: torch.Size
+    value: torch.Tensor,
+    num_bits: int,
+    shape: torch.Size,
+    packed_dim: Union[Literal[0], Literal[1]] = 1,
 ) -> torch.Tensor:
     """
     Unpacks a tensor of packed int32 weights into individual int8s, maintaining the
@@ -216,17 +269,31 @@ def unpack_from_int32(
 
     # unpack
     mask = (1 << num_bits) - 1
-    unpacked = torch.zeros(
-        (value.shape[0], value.shape[1] * pack_factor),
-        device=value.device,
-        dtype=torch.int32,
-    )
-    for i in range(pack_factor):
-        unpacked[:, i::pack_factor] = (value >> (num_bits * i)) & mask
-
-    # remove padding
-    original_row_size = int(shape[1])
-    unpacked = unpacked[:, :original_row_size]
+
+    if packed_dim == 1:
+        unpacked = torch.zeros(
+            (value.shape[0], value.shape[1] * pack_factor),
+            device=value.device,
+            dtype=torch.int32,
+        )
+        for i in range(pack_factor):
+            unpacked[:, i::pack_factor] = (value >> (num_bits * i)) & mask
+
+        # remove padding
+        original_row_size = int(shape[1])
+        unpacked = unpacked[:, :original_row_size]
+    else:
+        unpacked = torch.zeros(
+            (value.shape[0] * pack_factor, value.shape[1]),
+            device=value.device,
+            dtype=torch.int32,
+        )
+        for i in range(pack_factor):
+            unpacked[i::pack_factor, :] = (value >> (num_bits * i)) & mask
+
+        # remove padding
+        original_row_size = int(shape[0])
+        unpacked = unpacked[:original_row_size, :]
 
     # bits are packed in unsigned format, reformat to signed
     # update the value range from unsigned to signed
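
To make the bit layout concrete, here is a self-contained round-trip of the same interleaved scheme for the default packed_dim=1 case (a simplified reimplementation for illustration, not the library functions):

    import numpy as np

    def pack(values: np.ndarray, num_bits: int) -> np.ndarray:
        pack_factor = 32 // num_bits                       # e.g. 8 nibbles per int32
        packed_size = -(-values.shape[1] // pack_factor)   # ceil division
        pad = packed_size * pack_factor - values.shape[1]
        values = np.pad(values, [(0, 0), (0, pad)]).astype(np.uint32)
        packed = np.zeros((values.shape[0], packed_size), dtype=np.uint32)
        for i in range(pack_factor):
            # packed column j holds original column j * pack_factor + i
            # in bits [num_bits * i, num_bits * (i + 1))
            packed |= values[:, i::pack_factor] << (num_bits * i)
        return packed.view(np.int32)

    def unpack(packed: np.ndarray, num_bits: int, cols: int) -> np.ndarray:
        pack_factor = 32 // num_bits
        mask = (1 << num_bits) - 1
        u = packed.view(np.uint32)
        out = np.zeros((u.shape[0], u.shape[1] * pack_factor), dtype=np.uint32)
        for i in range(pack_factor):
            out[:, i::pack_factor] = (u >> (num_bits * i)) & mask
        return out[:, :cols]                               # strip the padding

    vals = np.random.randint(0, 16, size=(3, 10)).astype(np.uint32)  # 4-bit range
    assert np.array_equal(unpack(pack(vals, 4), 4, 10), vals)
    # packed_dim=0 in the library applies the same scheme down the rows instead.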
src/compressed_tensors/compressors/sparse_compressors/base.py
@@ -98,7 +98,11 @@ class BaseSparseCompressor(BaseCompressor):
         return compressed_dict
 
     def decompress(
-        self, path_to_model_or_tensors: str, device: str = "cpu", **kwargs
+        self,
+        path_to_model_or_tensors: str,
+        device: str = "cpu",
+        params_to_skip_load: Optional[Tuple] = None,
+        **kwargs,
     ) -> Generator[Tuple[str, Tensor], None, None]:
         """
         Reads a bitmask compressed state dict located
@@ -108,6 +112,11 @@ class BaseSparseCompressor(BaseCompressor):
         :param model_path: path to compressed safetensors model (directory with
             one or more safetensors files) or compressed tensors file
         :param device: device to load decompressed weights onto
+        :param params_to_skip_load: a list of non-sparsity parameters (e.g. quantization
+            parameters) that we want to skip loading. As the sparsity compressor does
+            not handle quantized decompression, this should contain any quantization
+            parameters when decompressing stacked compressors. We want these parameters
+            to be handled by the quantization decompressor
         :return: iterator for generating decompressed weights
         """
         weight_mappings, ignored_params = get_nested_weight_mappings(
@@ -121,13 +130,21 @@ class BaseSparseCompressor(BaseCompressor):
                 full_name = merge_names(weight_name, param_name)
                 with safe_open(safe_path, framework="pt", device=device) as f:
                     weight_data[param_name] = f.get_tensor(full_name)
+
             decompressed = self.decompress_weight(weight_data)
             yield merge_names(weight_name, "weight"), decompressed
 
         for ignored_param_name, safe_path in ignored_params.items():
-            with safe_open(safe_path, framework="pt", device=device) as f:
-                value = f.get_tensor(ignored_param_name)
-            yield ignored_param_name, value
+            should_skip = False
+            if params_to_skip_load is not None:
+                for param_to_skip in params_to_skip_load:
+                    if param_to_skip in ignored_param_name:
+                        should_skip = True
+
+            if not should_skip:
+                with safe_open(safe_path, framework="pt", device=device) as f:
+                    value = f.get_tensor(ignored_param_name)
+                yield ignored_param_name, value
 
     @staticmethod
     def should_compress(name: str, expanded_targets: Optional[Set[str]] = None) -> bool:
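
The skip check above is a plain substring match against the full parameter name, so passing the quantization compressor's compression_param_names filters its entries out of the sparsity pass. A toy run of the same filtering logic (the parameter names are invented for illustration):

    params_to_skip_load = ("weight_scale", "weight_zero_point")
    ignored_params = [
        "model.layers.0.self_attn.q_proj.weight_scale",
        "model.layers.0.self_attn.q_proj.weight_zero_point",
        "lm_head.weight",
    ]
    loaded = [
        name for name in ignored_params
        if not any(skip in name for skip in params_to_skip_load)
    ]
    assert loaded == ["lm_head.weight"]  # scales/zp are left for the quantization pass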