compressed-tensors 0.12.3a20251023__tar.gz → 0.12.3a20251030__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {compressed_tensors-0.12.3a20251023/src/compressed_tensors.egg-info → compressed_tensors-0.12.3a20251030}/PKG-INFO +1 -1
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py +4 -9
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/quantization/lifecycle/forward.py +1 -1
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/quantization/quant_args.py +9 -3
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/quantization/utils/__init__.py +1 -0
- compressed_tensors-0.12.3a20251030/src/compressed_tensors/quantization/utils/mxfp4_utils.py +97 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/transform/factory/base.py +34 -3
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/transform/factory/hadamard.py +0 -1
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/transform/factory/matrix_multiply.py +2 -3
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/transform/transform_args.py +11 -4
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/transform/utils/matrix.py +13 -21
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/version.py +1 -1
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030/src/compressed_tensors.egg-info}/PKG-INFO +1 -1
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors.egg-info/SOURCES.txt +3 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_compressors/quantized_compressors/test_pack_quant.py +95 -5
- compressed_tensors-0.12.3a20251030/tests/test_compressors/quantized_compressors/test_packed_asym_decompression.py +172 -0
- compressed_tensors-0.12.3a20251030/tests/test_quantization/test_utils/test_mxfp4_utils.py +79 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_transform/conftest.py +21 -1
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_transform/factory/test_correctness.py +74 -7
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/.github/.gitkeep +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/.github/actions/test/action.yml +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/.github/scripts/step-status +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/.github/workflows/build-test.yml +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/.github/workflows/build.yml +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/.github/workflows/post-release-nightly-build.yml +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/.github/workflows/quality-check.yaml +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/.github/workflows/test-check.yaml +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/.github/workflows/test.yml +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/.github/workflows/trigger-all.yml +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/.gitignore +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/LICENSE +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/Makefile +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/README.md +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/examples/bit_packing/ex_quantize_and_pack.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/examples/bit_packing/int4_config.json +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/examples/bitmask_compression.ipynb +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/examples/llama_1.1b/ex_config_quantization.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/examples/llama_1.1b/ex_llmcompressor_quantization.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/examples/llama_1.1b/example_quant_config.json +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/examples/llama_1.1b/example_quant_recipe.yaml +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/examples/quantize_and_pack_int4.ipynb +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/pyproject.toml +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/setup.cfg +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/setup.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/README.md +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/base.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/base.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/helpers.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/model_compressors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/model_compressors/model_compressor.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/quantized_compressors/base.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/quantized_compressors/fp4_quantized.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/quantized_compressors/naive_quantized.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/sparse_compressors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/sparse_compressors/base.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/sparse_compressors/dense.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/sparse_compressors/sparse_bitmask.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/sparse_quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/sparse_quantized_compressors/marlin_24.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/config/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/config/base.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/config/dense.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/config/format.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/config/sparse_24_bitmask.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/config/sparse_bitmask.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/linear/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/linear/compressed_linear.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/logger.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/modeling/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/modeling/attention.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/modeling/kvcache.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/quantization/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/quantization/lifecycle/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/quantization/lifecycle/apply.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/quantization/lifecycle/compressed.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/quantization/lifecycle/helpers.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/quantization/lifecycle/initialize.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/quantization/quant_config.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/quantization/quant_metadata.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/quantization/quant_scheme.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/quantization/utils/helpers.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/registry/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/registry/registry.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/transform/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/transform/apply.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/transform/factory/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/transform/factory/random_hadamard.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/transform/transform_config.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/transform/transform_scheme.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/transform/utils/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/transform/utils/hadamard.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/transform/utils/hadamards.safetensors +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/utils/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/utils/helpers.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/utils/internal.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/utils/match.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/utils/offload.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/utils/permutations_24.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/utils/safetensors_load.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/utils/semi_structured_conversions.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/utils/type.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors.egg-info/dependency_links.txt +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors.egg-info/requires.txt +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors.egg-info/top_level.txt +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/conftest.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/mock_observer.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_compressors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_compressors/model_compressors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_compressors/model_compressors/test_model_compressor.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_compressors/quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_compressors/quantized_compressors/test_fp4_quant.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_compressors/quantized_compressors/test_fp8_quant.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_compressors/quantized_compressors/test_int_quant.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_compressors/sparse_compressors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_compressors/sparse_compressors/test_bitmask.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_compressors/sparse_compressors/test_sparse_24_bitmask.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_compressors/sparse_quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_compressors/sparse_quantized_compressors/test_marlin_24.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_configs/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_configs/test_base.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_configs/test_infer_quant.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_examples/test_bitmask_compression_ipynb.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_linear/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_linear/test_compressed_linear.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_modeling/test_attention_and_cache.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/lifecycle/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/lifecycle/conftest.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/lifecycle/test_apply.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/lifecycle/test_dynamic_lifecycle.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/lifecycle/test_enabled.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/lifecycle/test_forward.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/lifecycle/test_initialize.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/lifecycle/test_lifecycle.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/lifecycle/test_static_lifecycle.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/test_configs/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/test_configs/test_bit_depths.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/test_configs/test_strategies.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/test_quant_args.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/test_quant_config.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/test_quant_scheme.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/test_utils/test_helpers.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_registry.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_transform/factory/test_memory.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_transform/factory/test_serialization.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_transform/test_transform_args.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_transform/test_transform_config.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_transform/test_transform_scheme.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_transform/utils/test_hadamard.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_utils/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_utils/test_helpers.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_utils/test_match.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_utils/test_offload.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_utils/test_safetensors_load.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/test_utils/test_type.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/tests/testing_utils.py +0 -0
- {compressed_tensors-0.12.3a20251023 → compressed_tensors-0.12.3a20251030}/utils/copyright.py +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.12.3a20251023
+Version: 0.12.3a20251030
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/vllm-project/compressed-tensors
 Author: Neuralmagic, Inc.
src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py

@@ -134,8 +134,6 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
         compressed_dict["weight_shape"] = weight_shape
         compressed_dict["weight_packed"] = packed_weight
 
-        # We typically don't compress zp; apart from when using the packed_compressor
-        # and when storing group/channel zp
         if not quantization_args.symmetric and quantization_args.strategy in [
             QuantizationStrategy.GROUP.value,
             QuantizationStrategy.CHANNEL.value,
@@ -143,7 +141,7 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
             packed_zp = pack_to_int32(
                 zero_point, quantization_args.num_bits, packed_dim=0
             )
-            compressed_dict["weight_zero_point"] = packed_zp
+            compressed_dict["weight_zero_point"] = packed_zp.contiguous()
         return compressed_dict
 
     def decompress_weight(
@@ -166,16 +164,13 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
         num_bits = quantization_args.num_bits
         unpacked = unpack_from_int32(weight, num_bits, original_shape)
 
-        # NOTE: this will fail decompression as we don't currently handle packed zp on
-        # decompression
         if not quantization_args.symmetric and quantization_args.strategy in [
             QuantizationStrategy.GROUP.value,
             QuantizationStrategy.CHANNEL.value,
         ]:
-
-
-            )
-            assert zero_point is not None
+            assert (
+                zero_point is not None
+            ), "Asymmetric quantization requires zero-point values"
             original_zp_shape = (original_shape[0], scale.shape[-1])
             zero_point = unpack_from_int32(
                 zero_point, num_bits, original_zp_shape, packed_dim=0
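The compressor now always serializes group/channel zero points packed along dim 0 (made contiguous before saving), and decompression unpacks them back to the original zero-point shape. A minimal round-trip sketch, assuming `pack_to_int32`/`unpack_from_int32` are importable from the same `pack_quantized` module shown above (the import path is an assumption; the call signatures match the diff):

```python
import torch

# assumed import location: the pack/unpack helpers live alongside the compressor
from compressed_tensors.compressors.quantized_compressors.pack_quantized import (
    pack_to_int32,
    unpack_from_int32,
)

num_bits = 4
# hypothetical per-group zero points for a (512, 1024) weight with group_size=128
zero_point = torch.randint(-8, 8, (512, 8), dtype=torch.int8)

packed = pack_to_int32(zero_point, num_bits, packed_dim=0)             # int32 storage
restored = unpack_from_int32(packed, num_bits, zero_point.shape, packed_dim=0)

assert packed.dtype == torch.int32
assert torch.equal(restored, zero_point)
```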
src/compressed_tensors/quantization/lifecycle/forward.py

@@ -278,7 +278,7 @@ def _process_quantization(
         if columns % group_size != 0:
             raise ValueError(
                 "tensor column shape must be divisble "
-                f"by the given group_size {group_size}"
+                f"by the given group_size {group_size} but got {columns}"
             )
 
     # support column-order (default) quantization as well as other orderings
src/compressed_tensors/quantization/quant_args.py

@@ -25,6 +25,7 @@ from pydantic import BaseModel, ConfigDict, Field, field_validator, model_valida
 __all__ = [
     "FP8_E4M3_DATA",
     "FP4_E2M1_DATA",
+    "BFLOAT16_DATA",
     "FloatArgs",
     "QuantizationType",
     "QuantizationStrategy",
@@ -38,9 +39,9 @@ __all__ = [
 class FloatArgs:
     exponent: int
     mantissa: int
-    bits: int
-    max: float
-    min: float
+    bits: Optional[int] = None
+    max: Optional[float] = None
+    min: Optional[float] = None
     dtype: Optional[torch.dtype] = None
 
 
@@ -76,6 +77,11 @@ class FP8_E4M3_DATA(FloatArgs):
     dtype = torch.float8_e4m3fn
 
 
+class BFLOAT16_DATA(FloatArgs):
+    exponent = 8
+    mantissa = 7
+
+
 class QuantizationType(str, Enum):
     """
     Enum storing quantization type options
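`BFLOAT16_DATA` only defines exponent/mantissa widths (the other `FloatArgs` fields are now optional), which is all the MXFP4 utilities in the new module below need to build their bit masks. A small sketch of that arithmetic, assuming `FP4_E2M1_DATA.mantissa == 1` (the E2M1 layout); the printed values are what `round_to_power_2` computes:

```python
from compressed_tensors.quantization.quant_args import BFLOAT16_DATA, FP4_E2M1_DATA

# bfloat16 layout: 1 sign bit + 8 exponent bits + 7 mantissa bits
mask = ((1 << (BFLOAT16_DATA.exponent + 1)) - 1) << BFLOAT16_DATA.mantissa
addend = 1 << (BFLOAT16_DATA.mantissa - FP4_E2M1_DATA.mantissa - 1)

print(hex(mask))   # 0xff80 -- keeps sign + exponent bits, drops the mantissa
print(addend)      # 32 -- rounding bias added before masking in round_to_power_2
```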
src/compressed_tensors/quantization/utils/mxfp4_utils.py (new file)

@@ -0,0 +1,97 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from compressed_tensors.quantization.quant_args import BFLOAT16_DATA, FP4_E2M1_DATA
+
+
+__all__ = ["convert_mxfp4_exp_scale", "generate_mxfp4_scales", "round_to_power_2"]
+
+# Reference: https://github.com/vllm-project/vllm/blob/main/tests/quantization/reference_mxfp4.py  # noqa: E501
+
+
+def convert_mxfp4_exp_scale(
+    scale: torch.Tensor, dtype: torch.dtype = torch.bfloat16
+) -> torch.Tensor:
+    """
+    Converts mxfp4 scales. Scales are powers of 2, with the
+    exponents stored in uint8. Converts to dense dtype so that
+    they can be applied to the weights and activations during QDQ
+
+    :param scale: uint8 exponent scale
+    :param dtype: dense dtype
+    """
+    assert scale.dtype == torch.uint8
+    scale_exp = scale.to(torch.int32) - 127
+    scale = 2.00 ** (scale_exp.to(torch.float))
+    return scale.to(dtype)
+
+
+def round_to_power_2(x: torch.Tensor) -> torch.Tensor:
+    """
+    Round values to the closest power of 2.
+    This is done by masking the values with BFLOAT16_SIGN_EXPONENT_MASK
+    which essentially removes the mantissa and keeps the exponent.
+    i.e the closest power of 2 for the input_value.
+
+    E.g:
+    0.0825 = 1.32 (mantissa) x 2**-4 (exponent)
+    0.0825 ==> -4 (exponent) + 127 = 123 = 01111011 (8 bits for bfloat16)
+    0.0825 ==> 0.32 (mantissa) = 0101001 (7 bits for bfloat16)
+    0.0825 == 0b01111011_0101001 (bfloat16)
+    0b01111011_0101001 & 111111111_0000000 == 0b01111011_0000000
+    Keep the exponent + sign bit to give you the closest power of 2, 0.0625
+
+    :param x: tensor to round to closest power of 2
+    """
+    assert x.dtype == torch.bfloat16
+    x = x.view(torch.uint16).to(torch.int32)
+
+    # Find closest power of 2
+    BFLOAT16_VAL_TO_ADD = 1 << (BFLOAT16_DATA.mantissa - FP4_E2M1_DATA.mantissa - 1)
+    # Add value to push the value to the next exponent
+    BFLOAT16_SIGN_EXPONENT_MASK = (
+        (1 << (BFLOAT16_DATA.exponent + 1)) - 1
+    ) << BFLOAT16_DATA.mantissa
+    # mask to only keep exponent - we conservatively round down
+    # to better represent smaller numbers / prevent overflow
+    block_max_uint = torch.bitwise_and(
+        x + BFLOAT16_VAL_TO_ADD, BFLOAT16_SIGN_EXPONENT_MASK
+    )
+    return block_max_uint.to(torch.uint16).view(torch.bfloat16)
+
+
+def generate_mxfp4_scales(x: torch.Tensor) -> torch.Tensor:
+    """
+    Generate mxfp4 scales. The scales require the following steps
+    1. Round to the closest power of 2
+    2. Convert to exponent
+    3. Store in uint8
+
+    Called when calculating qparams using observers.
+
+    :param x: tensor to round to closest power of 2
+    :returns uint8 scales as exponents
+    """
+    # Round to closest power of 2
+    scale_power_2 = round_to_power_2(x)
+    # Convert to exponent
+    scale_exp = 127 + torch.floor(torch.log2(scale_power_2)).to(torch.int32) - 2
+    # Clamp and store in uint8, as expected by mxfp4
+    scale_exp = torch.clamp(
+        scale_exp,
+        max=torch.iinfo(torch.uint8).max,
+        min=torch.iinfo(torch.uint8).min,
+    )
+    return scale_exp.to(torch.uint8)
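A short usage sketch of the new helpers, assuming the module is importable at the path added to SOURCES.txt; the per-block maxima are made-up values (the functions assert a `bfloat16` input):

```python
import torch
from compressed_tensors.quantization.utils.mxfp4_utils import (
    convert_mxfp4_exp_scale,
    generate_mxfp4_scales,
)

# hypothetical per-block absolute maxima, as an observer might produce them
block_max = torch.tensor([0.0825, 1.5, 6.0], dtype=torch.bfloat16)

scales_u8 = generate_mxfp4_scales(block_max)   # uint8 exponents
scales = convert_mxfp4_exp_scale(scales_u8)    # dense powers of two for QDQ

print(scales_u8, scales)
```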
src/compressed_tensors/transform/factory/base.py

@@ -18,6 +18,14 @@ from typing import List, Optional
 import torch
 import torch.nn.utils.parametrize as P
 import tqdm
+from compressed_tensors.modeling.attention import (
+    initialize_hooked_attention,
+    register_query_hook,
+)
+from compressed_tensors.modeling.kvcache import (
+    initialize_hooked_kv_cache,
+    register_key_hook,
+)
 from compressed_tensors.registry.registry import RegistryMixin, T
 from compressed_tensors.transform import (
     TransformArgs,
@@ -36,6 +44,7 @@ from compressed_tensors.utils import (
 from compressed_tensors.utils.internal import InternalModule
 from torch import Tensor
 from torch.nn import Module, Parameter
+from transformers import PreTrainedModel
 
 
 __all__ = ["TransformFactory", "TransformBase"]
@@ -97,12 +106,13 @@ class TransformFactory(RegistryMixin, ABC):
 
         desc = f"Applying {self.name} transforms"
         for module, arg in tqdm.tqdm(modules_args, desc=desc, disable=(not use_tqdm)):
-            self._apply_to_module(module, arg)
+            self._apply_to_module(model, module, arg)
 
-    def _apply_to_module(self, module: Module, args: TransformArgs):
+    def _apply_to_module(self, model: Module, module: Module, args: TransformArgs):
         """
         Create transforms and apply them to the module
 
+        :param model: model which module belongs to
         :param module: target module to apply transforms to
         :param args: defines how the transform will be applied to the target module
         """
@@ -156,7 +166,28 @@ class TransformFactory(RegistryMixin, ABC):
 
             module.register_forward_hook(output_hook)
 
-        #
+        # register query hook to attention
+        elif args.location == TransformLocation.Q_ATTN:
+            if not isinstance(model, PreTrainedModel):
+                raise ValueError(f"Cannot hook attention of model: {model}")
+
+            def query_hook(_, query_states):
+                return transform(query_states)
+
+            initialize_hooked_attention(model, module)
+            register_query_hook(module, query_hook)
+
+        # register key hook to kvcache
+        elif args.location == TransformLocation.K_CACHE:
+            if not isinstance(model, PreTrainedModel):
+                raise ValueError(f"Cannot hook attention of model: {model}")
+
+            def key_hook(_, key_states):
+                return transform(key_states)
+
+            initialize_hooked_kv_cache(model, module)
+            register_key_hook(module, key_hook)
+
         else:
             raise NotImplementedError()
 
@@ -51,7 +51,6 @@ class HadamardFactory(TransformFactory):
|
|
|
51
51
|
:param module: parent module that transform will be applied to
|
|
52
52
|
:param args: defines how the transform will be applied to the module
|
|
53
53
|
"""
|
|
54
|
-
assert hasattr(module, "weight")
|
|
55
54
|
size = get_transform_size(module, args.location, self.scheme.head_dim)
|
|
56
55
|
exec_device = get_execution_device(module)
|
|
57
56
|
device = get_offloaded_device(module)
|
|
src/compressed_tensors/transform/factory/matrix_multiply.py

@@ -50,7 +50,6 @@ class RandomMatrixFactory(TransformFactory):
         :param module: parent module that transform will be applied to
         :param args: defines how the transform will be applied to the module
         """
-        assert hasattr(module, "weight")
         size = get_transform_size(module, args.location, self.scheme.head_dim)
         device = get_offloaded_device(module)
         precision = self.scheme.precision if args.is_online() else torch.float64
@@ -68,8 +67,8 @@ class RandomMatrixFactory(TransformFactory):
             (size, size),
             generator=self.generator,
             dtype=precision,
-            device=device,
-        )
+            device=self.generator.device,
+        ).to(device)
         return Parameter(data, requires_grad=self.scheme.requires_grad)
 
     def _create_inverse(self, weight: Parameter) -> Parameter:
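The random-matrix change samples the weight on the generator's own device and only then moves it to the (possibly offloaded) target device; in PyTorch, a generator pinned to one device cannot sample directly onto a different one. A minimal illustration of that constraint, with made-up sizes:

```python
import torch

gen = torch.Generator(device="cpu").manual_seed(0)

# sampling with device != gen.device raises a RuntimeError, so sample on
# gen.device first and move afterwards (mirroring the diff above)
data = torch.rand((4, 4), generator=gen, dtype=torch.float64, device=gen.device)
data = data.to("cpu")  # move to the offload target device
```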
src/compressed_tensors/transform/transform_args.py

@@ -45,6 +45,16 @@ class TransformLocation(str, Enum):
     K_CACHE = "k_cache"
     Q_ATTN = "q_attn"
 
+    def is_online(self) -> bool:
+        """
+        Returns True if the transform location is online
+        (applied at runtime), False otherwise
+        """
+        return self not in (
+            TransformLocation.WEIGHT_INPUT,
+            TransformLocation.WEIGHT_OUTPUT,
+        )
+
 
 class TransformArgs(BaseModel, use_enum_values=True):
     """
@@ -70,9 +80,6 @@ class TransformArgs(BaseModel, use_enum_values=True):
         return value
 
     def is_online(self) -> bool:
-        return self.location not in (
-            TransformLocation.WEIGHT_INPUT,
-            TransformLocation.WEIGHT_OUTPUT,
-        )
+        return TransformLocation(self.location).is_online()
 
     model_config = ConfigDict(extra="forbid")
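With the check now living on the enum, the location values and `TransformArgs` share one definition of "online" (applied at runtime) versus offline weight fusion. A tiny sketch; the `targets` field of `TransformArgs` is assumed from its usual schema and is not shown in this diff:

```python
from compressed_tensors.transform import TransformArgs, TransformLocation

# weight locations are fused offline; everything else runs during the forward pass
assert not TransformLocation.WEIGHT_INPUT.is_online()
assert TransformLocation.Q_ATTN.is_online()

# hypothetical args targeting query projections
args = TransformArgs(targets=["re:.*q_proj$"], location="q_attn")
assert args.is_online()  # delegates to TransformLocation.is_online()
```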
src/compressed_tensors/transform/utils/matrix.py

@@ -34,6 +34,8 @@ def get_transform_size(
     :param head_dim: size of head when transform is applied to mha
     :return: size of matrix
     """
+    size = None
+
     if isinstance(module, torch.nn.Linear):
         if location in (TransformLocation.INPUT, TransformLocation.WEIGHT_INPUT):
             size = module.in_features
@@ -44,11 +46,13 @@ def get_transform_size(
             size = module.num_embeddings
         else:
             size = module.embedding_dim
-
-        raise NotImplementedError(
+    elif head_dim is None:
+        raise NotImplementedError(
+            f"Transforms on {type(module)} are not supported without head_dim"
+        )
 
     if head_dim is not None:
-        if size % head_dim != 0:
+        if size is not None and size % head_dim != 0:
             raise ValueError(
                 f"{head_dim} must divide {size} for {type(module)} at {location}"
             )
@@ -105,11 +109,11 @@ def apply_transform_weight(
 
     assert transform_weight.shape[0] == transform_weight.shape[1]
 
-    if
-
-        return _multihead_matmul(value, transform_weight)
+    if TransformLocation(location).is_online():
+        return _multihead_matmul(value, transform_weight)
 
-
+    if module_type == torch.nn.Linear:
+        if location == TransformLocation.WEIGHT_INPUT:
             # equivalent to (transform_weight @ value.T).T
             return _multihead_matmul(value, transform_weight.T)
 
@@ -117,26 +121,14 @@ def apply_transform_weight(
             # equivalent to (value.T @ transform_weight).T
             return _multihead_matmul(transform_weight.T, value)
 
-        elif location == TransformLocation.OUTPUT:
-            return _multihead_matmul(value, transform_weight)
-
     # similar derivation to torch.nn.Linear, but `y = (x W)`
     elif module_type == torch.nn.Embedding:
-        if location == TransformLocation.
-            return _multihead_matmul(
-
-        elif location == TransformLocation.WEIGHT_INPUT:
-            return _multihead_matmul(
-                transform_weight,
-                value,
-            )
+        if location == TransformLocation.WEIGHT_INPUT:
+            return _multihead_matmul(transform_weight, value)
 
         elif location == TransformLocation.WEIGHT_OUTPUT:
             return _multihead_matmul(value, transform_weight)
 
-        elif location == TransformLocation.OUTPUT:
-            return _multihead_matmul(value, transform_weight)
-
     raise NotImplementedError(
         f"Applying transforms to {module_type} {location} is not supported"
     )
src/compressed_tensors.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.12.3a20251023
+Version: 0.12.3a20251030
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/vllm-project/compressed-tensors
 Author: Neuralmagic, Inc.
src/compressed_tensors.egg-info/SOURCES.txt

@@ -75,6 +75,7 @@ src/compressed_tensors/quantization/lifecycle/helpers.py
 src/compressed_tensors/quantization/lifecycle/initialize.py
 src/compressed_tensors/quantization/utils/__init__.py
 src/compressed_tensors/quantization/utils/helpers.py
+src/compressed_tensors/quantization/utils/mxfp4_utils.py
 src/compressed_tensors/registry/__init__.py
 src/compressed_tensors/registry/registry.py
 src/compressed_tensors/transform/__init__.py
@@ -113,6 +114,7 @@ tests/test_compressors/quantized_compressors/test_fp4_quant.py
 tests/test_compressors/quantized_compressors/test_fp8_quant.py
 tests/test_compressors/quantized_compressors/test_int_quant.py
 tests/test_compressors/quantized_compressors/test_pack_quant.py
+tests/test_compressors/quantized_compressors/test_packed_asym_decompression.py
 tests/test_compressors/sparse_compressors/__init__.py
 tests/test_compressors/sparse_compressors/test_bitmask.py
 tests/test_compressors/sparse_compressors/test_sparse_24_bitmask.py
@@ -142,6 +144,7 @@ tests/test_quantization/test_configs/__init__.py
 tests/test_quantization/test_configs/test_bit_depths.py
 tests/test_quantization/test_configs/test_strategies.py
 tests/test_quantization/test_utils/test_helpers.py
+tests/test_quantization/test_utils/test_mxfp4_utils.py
 tests/test_transform/conftest.py
 tests/test_transform/test_transform_args.py
 tests/test_transform/test_transform_config.py
tests/test_compressors/quantized_compressors/test_pack_quant.py

@@ -15,6 +15,7 @@
 
 import math
 import shutil
+import tempfile
 from collections import OrderedDict
 
 import pytest
@@ -170,12 +171,13 @@ def test_reload_match(tmp_path, num_bits):
     )
     save_file(compressed_state_dict, tmp_path / "model.safetensors")
 
-    reconstructed_dense_gen = compressor.decompress(
-        tmp_path, names_to_scheme=quantized_modules_to_scheme
-    )
     reconstructed_dense = {}
-
-
+    with tempfile.TemporaryDirectory():
+        reconstructed_dense_gen = compressor.decompress(
+            tmp_path, names_to_scheme=quantized_modules_to_scheme
+        )
+        for name, value in reconstructed_dense_gen:
+            reconstructed_dense[name] = value
 
     fake_quant_dummy = fake_quantize(
         dense_state_dict["dummy.weight"],
@@ -473,3 +475,91 @@ def test_unpack_from_int32(num_bits, values, expected_tensor):
     unpacked_tensor = unpack_from_int32(values, num_bits, expected_tensor.shape)
     assert torch.equal(unpacked_tensor, unpacked_tensor)
     assert unpacked_tensor.dtype == unpacked_tensor.dtype
+
+
+@pytest.mark.parametrize(
+    "strategy,group_size",
+    [
+        (QuantizationStrategy.GROUP, 128),
+        (QuantizationStrategy.CHANNEL, None),
+    ],
+)
+def test_asymmetric_zero_point_decompression(strategy, group_size, tmp_path):
+    """
+    Test that zero-point packing and unpacking works correctly for asymmetric
+    quantization with GROUP and CHANNEL strategies.
+    """
+    shape = (512, 1024)
+
+    if strategy == QuantizationStrategy.CHANNEL:
+        expected_zp_shape = (shape[0], 1)
+    elif strategy == QuantizationStrategy.GROUP:
+        num_groups = shape[1] // group_size
+        expected_zp_shape = (shape[0], max(num_groups, 1))
+
+    dense_state_dict = {
+        "dummy.weight": torch.randn(shape),
+        "dummy.weight_scale": torch.rand(expected_zp_shape).to(torch.float32),
+        "dummy.weight_zero_point": torch.randint(-8, 8, expected_zp_shape).to(
+            torch.int8
+        ),
+    }
+
+    quant_config = get_dummy_quant_config(
+        num_bits=4, strategy=strategy.value, symmetric=False, group_size=group_size
+    )
+
+    compressor = PackedQuantizationCompressor(config=quant_config)
+    quantized_modules_to_scheme = {"dummy": quant_config.config_groups["group_1"]}
+    compressed_state_dict = compressor.compress(
+        dense_state_dict.copy(), names_to_scheme=quantized_modules_to_scheme
+    )
+
+    assert "dummy.weight_zero_point" in compressed_state_dict
+    assert compressed_state_dict["dummy.weight_zero_point"].dtype == torch.int32
+
+    save_file(compressed_state_dict, tmp_path / "model.safetensors")
+
+    reconstructed_dense_gen = compressor.decompress(
+        tmp_path, names_to_scheme=quantized_modules_to_scheme
+    )
+    reconstructed_dense = {}
+    for name, value in reconstructed_dense_gen:
+        reconstructed_dense[name] = value
+
+    assert "dummy" in reconstructed_dense
+    assert "weight" in reconstructed_dense["dummy"]
+
+    assert reconstructed_dense["dummy"]["weight"].shape == shape
+
+    shutil.rmtree(tmp_path)
+
+
+@pytest.mark.parametrize(
+    "num_bits,strategy",
+    [
+        (4, QuantizationStrategy.GROUP),
+        (4, QuantizationStrategy.CHANNEL),
+        (8, QuantizationStrategy.GROUP),
+        (8, QuantizationStrategy.CHANNEL),
+    ],
+)
+def test_zero_point_pack_unpack_consistency(num_bits, strategy):
+    """
+    Test that packing and unpacking zero-points preserves values correctly.
+    """
+    if strategy == QuantizationStrategy.GROUP:
+        shape = (512, 8)
+    else:
+        shape = (512, 1)
+
+    max_val = (1 << (num_bits - 1)) - 1
+    min_val = -(1 << (num_bits - 1))
+    original_zp = torch.randint(min_val, max_val + 1, shape).to(torch.int8)
+
+    packed_zp = pack_to_int32(original_zp, num_bits, packed_dim=0)
+
+    unpacked_zp = unpack_from_int32(packed_zp, num_bits, shape, packed_dim=0)
+
+    assert torch.equal(original_zp, unpacked_zp)
+    assert unpacked_zp.dtype == torch.int8