compressed-tensors 0.12.3a20251028.tar.gz → 0.12.3a20251030.tar.gz

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
Files changed (163)
  1. {compressed_tensors-0.12.3a20251028/src/compressed_tensors.egg-info → compressed_tensors-0.12.3a20251030}/PKG-INFO +1 -1
  2. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py +4 -9
  3. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/quantization/lifecycle/forward.py +1 -1
  4. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/transform/factory/base.py +34 -3
  5. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/transform/factory/hadamard.py +0 -1
  6. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/transform/factory/matrix_multiply.py +2 -3
  7. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/transform/transform_args.py +11 -4
  8. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/transform/utils/matrix.py +13 -21
  9. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/version.py +1 -1
  10. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030/src/compressed_tensors.egg-info}/PKG-INFO +1 -1
  11. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors.egg-info/SOURCES.txt +1 -0
  12. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_compressors/quantized_compressors/test_pack_quant.py +95 -5
  13. compressed_tensors-0.12.3a20251030/tests/test_compressors/quantized_compressors/test_packed_asym_decompression.py +172 -0
  14. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_transform/conftest.py +21 -1
  15. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_transform/factory/test_correctness.py +74 -7
  16. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/.github/.gitkeep +0 -0
  17. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/.github/actions/test/action.yml +0 -0
  18. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/.github/scripts/step-status +0 -0
  19. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/.github/workflows/build-test.yml +0 -0
  20. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/.github/workflows/build.yml +0 -0
  21. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/.github/workflows/post-release-nightly-build.yml +0 -0
  22. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/.github/workflows/quality-check.yaml +0 -0
  23. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/.github/workflows/test-check.yaml +0 -0
  24. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/.github/workflows/test.yml +0 -0
  25. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/.github/workflows/trigger-all.yml +0 -0
  26. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/.gitignore +0 -0
  27. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/LICENSE +0 -0
  28. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/Makefile +0 -0
  29. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/README.md +0 -0
  30. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/examples/bit_packing/ex_quantize_and_pack.py +0 -0
  31. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/examples/bit_packing/int4_config.json +0 -0
  32. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/examples/bitmask_compression.ipynb +0 -0
  33. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/examples/llama_1.1b/ex_config_quantization.py +0 -0
  34. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/examples/llama_1.1b/ex_llmcompressor_quantization.py +0 -0
  35. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/examples/llama_1.1b/example_quant_config.json +0 -0
  36. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/examples/llama_1.1b/example_quant_recipe.yaml +0 -0
  37. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/examples/quantize_and_pack_int4.ipynb +0 -0
  38. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/pyproject.toml +0 -0
  39. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/setup.cfg +0 -0
  40. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/setup.py +0 -0
  41. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/__init__.py +0 -0
  42. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/README.md +0 -0
  43. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/__init__.py +0 -0
  44. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/base.py +0 -0
  45. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/__init__.py +0 -0
  46. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/base.py +0 -0
  47. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/helpers.py +0 -0
  48. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/model_compressors/__init__.py +0 -0
  49. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/model_compressors/model_compressor.py +0 -0
  50. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/quantized_compressors/__init__.py +0 -0
  51. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/quantized_compressors/base.py +0 -0
  52. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/quantized_compressors/fp4_quantized.py +0 -0
  53. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/quantized_compressors/naive_quantized.py +0 -0
  54. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/sparse_compressors/__init__.py +0 -0
  55. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/sparse_compressors/base.py +0 -0
  56. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/sparse_compressors/dense.py +0 -0
  57. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py +0 -0
  58. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/sparse_compressors/sparse_bitmask.py +0 -0
  59. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/sparse_quantized_compressors/__init__.py +0 -0
  60. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/compressors/sparse_quantized_compressors/marlin_24.py +0 -0
  61. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/config/__init__.py +0 -0
  62. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/config/base.py +0 -0
  63. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/config/dense.py +0 -0
  64. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/config/format.py +0 -0
  65. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/config/sparse_24_bitmask.py +0 -0
  66. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/config/sparse_bitmask.py +0 -0
  67. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/linear/__init__.py +0 -0
  68. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/linear/compressed_linear.py +0 -0
  69. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/logger.py +0 -0
  70. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/modeling/__init__.py +0 -0
  71. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/modeling/attention.py +0 -0
  72. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/modeling/kvcache.py +0 -0
  73. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/quantization/__init__.py +0 -0
  74. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/quantization/lifecycle/__init__.py +0 -0
  75. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/quantization/lifecycle/apply.py +0 -0
  76. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/quantization/lifecycle/compressed.py +0 -0
  77. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/quantization/lifecycle/helpers.py +0 -0
  78. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/quantization/lifecycle/initialize.py +0 -0
  79. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/quantization/quant_args.py +0 -0
  80. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/quantization/quant_config.py +0 -0
  81. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/quantization/quant_metadata.py +0 -0
  82. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/quantization/quant_scheme.py +0 -0
  83. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/quantization/utils/__init__.py +0 -0
  84. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/quantization/utils/helpers.py +0 -0
  85. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/quantization/utils/mxfp4_utils.py +0 -0
  86. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/registry/__init__.py +0 -0
  87. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/registry/registry.py +0 -0
  88. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/transform/__init__.py +0 -0
  89. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/transform/apply.py +0 -0
  90. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/transform/factory/__init__.py +0 -0
  91. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/transform/factory/random_hadamard.py +0 -0
  92. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/transform/transform_config.py +0 -0
  93. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/transform/transform_scheme.py +0 -0
  94. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/transform/utils/__init__.py +0 -0
  95. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/transform/utils/hadamard.py +0 -0
  96. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/transform/utils/hadamards.safetensors +0 -0
  97. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/utils/__init__.py +0 -0
  98. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/utils/helpers.py +0 -0
  99. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/utils/internal.py +0 -0
  100. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/utils/match.py +0 -0
  101. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/utils/offload.py +0 -0
  102. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/utils/permutations_24.py +0 -0
  103. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/utils/safetensors_load.py +0 -0
  104. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/utils/semi_structured_conversions.py +0 -0
  105. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors/utils/type.py +0 -0
  106. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors.egg-info/dependency_links.txt +0 -0
  107. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors.egg-info/requires.txt +0 -0
  108. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/src/compressed_tensors.egg-info/top_level.txt +0 -0
  109. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/__init__.py +0 -0
  110. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/conftest.py +0 -0
  111. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/mock_observer.py +0 -0
  112. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_compressors/__init__.py +0 -0
  113. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_compressors/model_compressors/__init__.py +0 -0
  114. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_compressors/model_compressors/test_model_compressor.py +0 -0
  115. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_compressors/quantized_compressors/__init__.py +0 -0
  116. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_compressors/quantized_compressors/test_fp4_quant.py +0 -0
  117. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_compressors/quantized_compressors/test_fp8_quant.py +0 -0
  118. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_compressors/quantized_compressors/test_int_quant.py +0 -0
  119. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_compressors/sparse_compressors/__init__.py +0 -0
  120. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_compressors/sparse_compressors/test_bitmask.py +0 -0
  121. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_compressors/sparse_compressors/test_sparse_24_bitmask.py +0 -0
  122. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_compressors/sparse_quantized_compressors/__init__.py +0 -0
  123. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_compressors/sparse_quantized_compressors/test_marlin_24.py +0 -0
  124. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_configs/__init__.py +0 -0
  125. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_configs/test_base.py +0 -0
  126. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_configs/test_infer_quant.py +0 -0
  127. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_examples/test_bitmask_compression_ipynb.py +0 -0
  128. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_linear/__init__.py +0 -0
  129. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_linear/test_compressed_linear.py +0 -0
  130. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_modeling/test_attention_and_cache.py +0 -0
  131. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/__init__.py +0 -0
  132. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/lifecycle/__init__.py +0 -0
  133. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/lifecycle/conftest.py +0 -0
  134. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/lifecycle/test_apply.py +0 -0
  135. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/lifecycle/test_dynamic_lifecycle.py +0 -0
  136. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/lifecycle/test_enabled.py +0 -0
  137. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/lifecycle/test_forward.py +0 -0
  138. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/lifecycle/test_initialize.py +0 -0
  139. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/lifecycle/test_lifecycle.py +0 -0
  140. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/lifecycle/test_static_lifecycle.py +0 -0
  141. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/test_configs/__init__.py +0 -0
  142. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/test_configs/test_bit_depths.py +0 -0
  143. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/test_configs/test_strategies.py +0 -0
  144. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/test_quant_args.py +0 -0
  145. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/test_quant_config.py +0 -0
  146. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/test_quant_scheme.py +0 -0
  147. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/test_utils/test_helpers.py +0 -0
  148. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_quantization/test_utils/test_mxfp4_utils.py +0 -0
  149. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_registry.py +0 -0
  150. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_transform/factory/test_memory.py +0 -0
  151. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_transform/factory/test_serialization.py +0 -0
  152. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_transform/test_transform_args.py +0 -0
  153. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_transform/test_transform_config.py +0 -0
  154. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_transform/test_transform_scheme.py +0 -0
  155. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_transform/utils/test_hadamard.py +0 -0
  156. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_utils/__init__.py +0 -0
  157. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_utils/test_helpers.py +0 -0
  158. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_utils/test_match.py +0 -0
  159. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_utils/test_offload.py +0 -0
  160. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_utils/test_safetensors_load.py +0 -0
  161. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/test_utils/test_type.py +0 -0
  162. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/tests/testing_utils.py +0 -0
  163. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251030}/utils/copyright.py +0 -0
PKG-INFO:
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.12.3a20251028
+Version: 0.12.3a20251030
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/vllm-project/compressed-tensors
 Author: Neuralmagic, Inc.
src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py:
@@ -134,8 +134,6 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
         compressed_dict["weight_shape"] = weight_shape
         compressed_dict["weight_packed"] = packed_weight
 
-        # We typically don't compress zp; apart from when using the packed_compressor
-        # and when storing group/channel zp
         if not quantization_args.symmetric and quantization_args.strategy in [
             QuantizationStrategy.GROUP.value,
             QuantizationStrategy.CHANNEL.value,
@@ -143,7 +141,7 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
             packed_zp = pack_to_int32(
                 zero_point, quantization_args.num_bits, packed_dim=0
             )
-            compressed_dict["weight_zero_point"] = packed_zp
+            compressed_dict["weight_zero_point"] = packed_zp.contiguous()
         return compressed_dict
 
     def decompress_weight(
@@ -166,16 +164,13 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
         num_bits = quantization_args.num_bits
         unpacked = unpack_from_int32(weight, num_bits, original_shape)
 
-        # NOTE: this will fail decompression as we don't currently handle packed zp on
-        # decompression
         if not quantization_args.symmetric and quantization_args.strategy in [
             QuantizationStrategy.GROUP.value,
             QuantizationStrategy.CHANNEL.value,
         ]:
-            raise ValueError(
-                "Decompression of packed zero points is currently not supported"
-            )
-            assert zero_point is not None
+            assert (
+                zero_point is not None
+            ), "Asymmetric quantization requires zero-point values"
             original_zp_shape = (original_shape[0], scale.shape[-1])
             zero_point = unpack_from_int32(
                 zero_point, num_bits, original_zp_shape, packed_dim=0
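Note: the change above turns packed zero-point decompression into a supported path rather than a hard error. A minimal round-trip sketch of the packing behavior it relies on, assuming `pack_to_int32`/`unpack_from_int32` are importable from this module as the new tests below do:

```python
import torch
from compressed_tensors.compressors.quantized_compressors.pack_quantized import (
    pack_to_int32,
    unpack_from_int32,
)

# 4-bit asymmetric zero points, one per (row, group); values fit in [-8, 7]
zp = torch.randint(-8, 8, (512, 8)).to(torch.int8)

# pack along dim 0, as compress_weight now stores them, then restore
packed = pack_to_int32(zp, 4, packed_dim=0).contiguous()
restored = unpack_from_int32(packed, 4, zp.shape, packed_dim=0)

assert packed.dtype == torch.int32
assert torch.equal(restored, zp)  # lossless round trip
```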
src/compressed_tensors/quantization/lifecycle/forward.py:
@@ -278,7 +278,7 @@ def _process_quantization(
         if columns % group_size != 0:
             raise ValueError(
                 "tensor column shape must be divisble "
-                f"by the given group_size {group_size}"
+                f"by the given group_size {group_size} but got {columns}"
             )
 
     # support column-order (default) quantization as well as other orderings
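Note: for intuition on the check above, illustrative arithmetic only (not package code):

```python
# a (512, 1024) weight with group_size=128 has 1024 // 128 == 8 groups per row
columns, group_size = 1024, 128
assert columns % group_size == 0   # quantizes cleanly
assert 1000 % group_size != 0      # would raise the ValueError, now reporting 1000
```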
src/compressed_tensors/transform/factory/base.py:
@@ -18,6 +18,14 @@ from typing import List, Optional
 import torch
 import torch.nn.utils.parametrize as P
 import tqdm
+from compressed_tensors.modeling.attention import (
+    initialize_hooked_attention,
+    register_query_hook,
+)
+from compressed_tensors.modeling.kvcache import (
+    initialize_hooked_kv_cache,
+    register_key_hook,
+)
 from compressed_tensors.registry.registry import RegistryMixin, T
 from compressed_tensors.transform import (
     TransformArgs,
@@ -36,6 +44,7 @@ from compressed_tensors.utils import (
 from compressed_tensors.utils.internal import InternalModule
 from torch import Tensor
 from torch.nn import Module, Parameter
+from transformers import PreTrainedModel
 
 
 __all__ = ["TransformFactory", "TransformBase"]
@@ -97,12 +106,13 @@ class TransformFactory(RegistryMixin, ABC):
 
         desc = f"Applying {self.name} transforms"
         for module, arg in tqdm.tqdm(modules_args, desc=desc, disable=(not use_tqdm)):
-            self._apply_to_module(module, arg)
+            self._apply_to_module(model, module, arg)
 
-    def _apply_to_module(self, module: Module, args: TransformArgs):
+    def _apply_to_module(self, model: Module, module: Module, args: TransformArgs):
         """
         Create transforms and apply them to the module
 
+        :param model: model which module belongs to
         :param module: target module to apply transforms to
         :param args: defines how the transform will be applied to the target module
         """
@@ -156,7 +166,28 @@ class TransformFactory(RegistryMixin, ABC):
 
             module.register_forward_hook(output_hook)
 
-        # other locations such as q_attn and k_attn have not been implemented
+        # register query hook to attention
+        elif args.location == TransformLocation.Q_ATTN:
+            if not isinstance(model, PreTrainedModel):
+                raise ValueError(f"Cannot hook attention of model: {model}")
+
+            def query_hook(_, query_states):
+                return transform(query_states)
+
+            initialize_hooked_attention(model, module)
+            register_query_hook(module, query_hook)
+
+        # register key hook to kvcache
+        elif args.location == TransformLocation.K_CACHE:
+            if not isinstance(model, PreTrainedModel):
+                raise ValueError(f"Cannot hook attention of model: {model}")
+
+            def key_hook(_, key_states):
+                return transform(key_states)
+
+            initialize_hooked_kv_cache(model, module)
+            register_key_hook(module, key_hook)
+
         else:
            raise NotImplementedError()
 
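Note: these new branches let a transform scheme target attention queries and cached keys at runtime. A hedged sketch of such a scheme; the regex target and head size are hypothetical, and the field names follow `TransformArgs`/`TransformScheme` as used elsewhere in the package:

```python
from compressed_tensors.transform import TransformArgs, TransformScheme

# rotate queries and cached keys with the same transform so that
# Q @ K^T is unchanged; requires a transformers PreTrainedModel (checked above)
scheme = TransformScheme(
    type="hadamard",
    head_dim=128,  # assumed per-head size
    apply=[
        TransformArgs(targets=["re:.*self_attn$"], location="q_attn"),
        TransformArgs(targets=["re:.*self_attn$"], location="k_cache"),
    ],
)
```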
src/compressed_tensors/transform/factory/hadamard.py:
@@ -51,7 +51,6 @@ class HadamardFactory(TransformFactory):
         :param module: parent module that transform will be applied to
         :param args: defines how the transform will be applied to the module
         """
-        assert hasattr(module, "weight")
         size = get_transform_size(module, args.location, self.scheme.head_dim)
         exec_device = get_execution_device(module)
         device = get_offloaded_device(module)
src/compressed_tensors/transform/factory/matrix_multiply.py:
@@ -50,7 +50,6 @@ class RandomMatrixFactory(TransformFactory):
         :param module: parent module that transform will be applied to
         :param args: defines how the transform will be applied to the module
         """
-        assert hasattr(module, "weight")
         size = get_transform_size(module, args.location, self.scheme.head_dim)
         device = get_offloaded_device(module)
         precision = self.scheme.precision if args.is_online() else torch.float64
@@ -68,8 +67,8 @@ class RandomMatrixFactory(TransformFactory):
             (size, size),
             generator=self.generator,
             dtype=precision,
-            device=device,
-        )
+            device=self.generator.device,
+        ).to(device)
         return Parameter(data, requires_grad=self.scheme.requires_grad)
 
     def _create_inverse(self, weight: Parameter) -> Parameter:
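Note: sampling on the generator's own device and moving afterwards presumably avoids a device mismatch (`torch.rand` expects the generator and output device to agree) and keeps the values reproducible regardless of where the parameter is offloaded. A standalone illustration in plain torch:

```python
import torch

def sample(seed: int) -> torch.Tensor:
    gen = torch.Generator(device="cpu").manual_seed(seed)
    # create on gen.device first, then move to the (possibly different) target
    return torch.rand((4, 4), generator=gen, device=gen.device).to("cpu")

assert torch.equal(sample(42), sample(42))  # deterministic across calls
```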
src/compressed_tensors/transform/transform_args.py:
@@ -45,6 +45,16 @@ class TransformLocation(str, Enum):
     K_CACHE = "k_cache"
     Q_ATTN = "q_attn"
 
+    def is_online(self) -> bool:
+        """
+        Returns True if the transform location is online
+        (applied at runtime), False otherwise
+        """
+        return self not in (
+            TransformLocation.WEIGHT_INPUT,
+            TransformLocation.WEIGHT_OUTPUT,
+        )
+
 
 class TransformArgs(BaseModel, use_enum_values=True):
     """
@@ -70,9 +80,6 @@ class TransformArgs(BaseModel, use_enum_values=True):
         return value
 
     def is_online(self) -> bool:
-        return self.location not in (
-            TransformLocation.WEIGHT_INPUT,
-            TransformLocation.WEIGHT_OUTPUT,
-        )
+        return TransformLocation(self.location).is_online()
 
     model_config = ConfigDict(extra="forbid")
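Note: moving the classification onto the enum lets callers that only hold a location, rather than a full `TransformArgs`, ask the same question, as `apply_transform_weight` below now does. A small usage sketch, assuming `TransformLocation` is exported from `compressed_tensors.transform`:

```python
from compressed_tensors.transform import TransformLocation

# weight-side transforms are fused offline; every other location runs online
assert not TransformLocation.WEIGHT_INPUT.is_online()
assert not TransformLocation.WEIGHT_OUTPUT.is_online()
assert TransformLocation.INPUT.is_online()
assert TransformLocation.K_CACHE.is_online()
```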
src/compressed_tensors/transform/utils/matrix.py:
@@ -34,6 +34,8 @@ def get_transform_size(
     :param head_dim: size of head when transform is applied to mha
     :return: size of matrix
     """
+    size = None
+
     if isinstance(module, torch.nn.Linear):
         if location in (TransformLocation.INPUT, TransformLocation.WEIGHT_INPUT):
             size = module.in_features
@@ -44,11 +46,13 @@ def get_transform_size(
             size = module.num_embeddings
         else:
             size = module.embedding_dim
-    else:
-        raise NotImplementedError(f"Transforms on {type(module)} are not supported")
+    elif head_dim is None:
+        raise NotImplementedError(
+            f"Transforms on {type(module)} are not supported without head_dim"
+        )
 
     if head_dim is not None:
-        if size % head_dim != 0:
+        if size is not None and size % head_dim != 0:
             raise ValueError(
                 f"{head_dim} must divide {size} for {type(module)} at {location}"
             )
@@ -105,11 +109,11 @@ def apply_transform_weight(
 
     assert transform_weight.shape[0] == transform_weight.shape[1]
 
-    if module_type == torch.nn.Linear:
-        if location == TransformLocation.INPUT:
-            return _multihead_matmul(value, transform_weight)
+    if TransformLocation(location).is_online():
+        return _multihead_matmul(value, transform_weight)
 
-        elif location == TransformLocation.WEIGHT_INPUT:
+    if module_type == torch.nn.Linear:
+        if location == TransformLocation.WEIGHT_INPUT:
             # equivalent to (transform_weight @ value.T).T
             return _multihead_matmul(value, transform_weight.T)
 
@@ -117,26 +121,14 @@ def apply_transform_weight(
             # equivalent to (value.T @ transform_weight).T
             return _multihead_matmul(transform_weight.T, value)
 
-        elif location == TransformLocation.OUTPUT:
-            return _multihead_matmul(value, transform_weight)
-
     # similar derivation to torch.nn.Linear, but `y = (x W)`
     elif module_type == torch.nn.Embedding:
-        if location == TransformLocation.INPUT:
-            return _multihead_matmul(value, transform_weight)
-
-        elif location == TransformLocation.WEIGHT_INPUT:
-            return _multihead_matmul(
-                transform_weight,
-                value,
-            )
+        if location == TransformLocation.WEIGHT_INPUT:
+            return _multihead_matmul(transform_weight, value)
 
         elif location == TransformLocation.WEIGHT_OUTPUT:
             return _multihead_matmul(value, transform_weight)
 
-        elif location == TransformLocation.OUTPUT:
-            return _multihead_matmul(value, transform_weight)
-
     raise NotImplementedError(
         f"Applying transforms to {module_type} {location} is not supported"
    )
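Note: the `WEIGHT_INPUT` identity referenced in the comments can be checked numerically. For a Linear layer y = x Wᵀ, folding an input-side transform U into the weight means replacing W with W Uᵀ; a plain-torch sanity check, independent of the package:

```python
import torch

torch.manual_seed(0)
W = torch.randn(8, 6)  # Linear weight, shape (out_features, in_features)
U = torch.randn(6, 6)  # square input-side transform
x = torch.randn(3, 6)

y_online = (x @ U) @ W.T   # transform applied to activations at runtime
y_fused = x @ (W @ U.T).T  # same transform folded into the weight
assert torch.allclose(y_online, y_fused, atol=1e-5)
```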
src/compressed_tensors/version.py:
@@ -17,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
 
-__version__ = version = '0.12.3.a20251028'
+__version__ = version = '0.12.3.a20251030'
 __version_tuple__ = version_tuple = (0, 12, 3)
src/compressed_tensors.egg-info/PKG-INFO:
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.12.3a20251028
+Version: 0.12.3a20251030
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/vllm-project/compressed-tensors
 Author: Neuralmagic, Inc.
src/compressed_tensors.egg-info/SOURCES.txt:
@@ -114,6 +114,7 @@ tests/test_compressors/quantized_compressors/test_fp4_quant.py
 tests/test_compressors/quantized_compressors/test_fp8_quant.py
 tests/test_compressors/quantized_compressors/test_int_quant.py
 tests/test_compressors/quantized_compressors/test_pack_quant.py
+tests/test_compressors/quantized_compressors/test_packed_asym_decompression.py
 tests/test_compressors/sparse_compressors/__init__.py
 tests/test_compressors/sparse_compressors/test_bitmask.py
 tests/test_compressors/sparse_compressors/test_sparse_24_bitmask.py
tests/test_compressors/quantized_compressors/test_pack_quant.py:
@@ -15,6 +15,7 @@
 
 import math
 import shutil
+import tempfile
 from collections import OrderedDict
 
 import pytest
@@ -170,12 +171,13 @@ def test_reload_match(tmp_path, num_bits):
     )
     save_file(compressed_state_dict, tmp_path / "model.safetensors")
 
-    reconstructed_dense_gen = compressor.decompress(
-        tmp_path, names_to_scheme=quantized_modules_to_scheme
-    )
     reconstructed_dense = {}
-    for name, value in reconstructed_dense_gen:
-        reconstructed_dense[name] = value
+    with tempfile.TemporaryDirectory():
+        reconstructed_dense_gen = compressor.decompress(
+            tmp_path, names_to_scheme=quantized_modules_to_scheme
+        )
+        for name, value in reconstructed_dense_gen:
+            reconstructed_dense[name] = value
 
     fake_quant_dummy = fake_quantize(
         dense_state_dict["dummy.weight"],
@@ -473,3 +475,91 @@ def test_unpack_from_int32(num_bits, values, expected_tensor):
     unpacked_tensor = unpack_from_int32(values, num_bits, expected_tensor.shape)
     assert torch.equal(unpacked_tensor, unpacked_tensor)
     assert unpacked_tensor.dtype == unpacked_tensor.dtype
+
+
+@pytest.mark.parametrize(
+    "strategy,group_size",
+    [
+        (QuantizationStrategy.GROUP, 128),
+        (QuantizationStrategy.CHANNEL, None),
+    ],
+)
+def test_asymmetric_zero_point_decompression(strategy, group_size, tmp_path):
+    """
+    Test that zero-point packing and unpacking works correctly for asymmetric
+    quantization with GROUP and CHANNEL strategies.
+    """
+    shape = (512, 1024)
+
+    if strategy == QuantizationStrategy.CHANNEL:
+        expected_zp_shape = (shape[0], 1)
+    elif strategy == QuantizationStrategy.GROUP:
+        num_groups = shape[1] // group_size
+        expected_zp_shape = (shape[0], max(num_groups, 1))
+
+    dense_state_dict = {
+        "dummy.weight": torch.randn(shape),
+        "dummy.weight_scale": torch.rand(expected_zp_shape).to(torch.float32),
+        "dummy.weight_zero_point": torch.randint(-8, 8, expected_zp_shape).to(
+            torch.int8
+        ),
+    }
+
+    quant_config = get_dummy_quant_config(
+        num_bits=4, strategy=strategy.value, symmetric=False, group_size=group_size
+    )
+
+    compressor = PackedQuantizationCompressor(config=quant_config)
+    quantized_modules_to_scheme = {"dummy": quant_config.config_groups["group_1"]}
+    compressed_state_dict = compressor.compress(
+        dense_state_dict.copy(), names_to_scheme=quantized_modules_to_scheme
+    )
+
+    assert "dummy.weight_zero_point" in compressed_state_dict
+    assert compressed_state_dict["dummy.weight_zero_point"].dtype == torch.int32
+
+    save_file(compressed_state_dict, tmp_path / "model.safetensors")
+
+    reconstructed_dense_gen = compressor.decompress(
+        tmp_path, names_to_scheme=quantized_modules_to_scheme
+    )
+    reconstructed_dense = {}
+    for name, value in reconstructed_dense_gen:
+        reconstructed_dense[name] = value
+
+    assert "dummy" in reconstructed_dense
+    assert "weight" in reconstructed_dense["dummy"]
+
+    assert reconstructed_dense["dummy"]["weight"].shape == shape
+
+    shutil.rmtree(tmp_path)
+
+
+@pytest.mark.parametrize(
+    "num_bits,strategy",
+    [
+        (4, QuantizationStrategy.GROUP),
+        (4, QuantizationStrategy.CHANNEL),
+        (8, QuantizationStrategy.GROUP),
+        (8, QuantizationStrategy.CHANNEL),
+    ],
+)
+def test_zero_point_pack_unpack_consistency(num_bits, strategy):
+    """
+    Test that packing and unpacking zero-points preserves values correctly.
+    """
+    if strategy == QuantizationStrategy.GROUP:
+        shape = (512, 8)
+    else:
+        shape = (512, 1)
+
+    max_val = (1 << (num_bits - 1)) - 1
+    min_val = -(1 << (num_bits - 1))
+    original_zp = torch.randint(min_val, max_val + 1, shape).to(torch.int8)
+
+    packed_zp = pack_to_int32(original_zp, num_bits, packed_dim=0)
+
+    unpacked_zp = unpack_from_int32(packed_zp, num_bits, shape, packed_dim=0)
+
+    assert torch.equal(original_zp, unpacked_zp)
+    assert unpacked_zp.dtype == torch.int8
tests/test_compressors/quantized_compressors/test_packed_asym_decompression.py (new file):
@@ -0,0 +1,172 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+End-to-end tests for asymmetric quantization with zero-point decompression.
+"""
+
+import pytest
+import torch
+from compressed_tensors.compressors.model_compressors.model_compressor import (
+    ModelCompressor,
+)
+from compressed_tensors.config import CompressionFormat
+from compressed_tensors.quantization import (
+    QuantizationArgs,
+    QuantizationConfig,
+    QuantizationScheme,
+    QuantizationStrategy,
+    apply_quantization_config,
+)
+from torch.nn import Linear, Module
+
+
+class SimpleModel(Module):
+    """Simple model for testing"""
+
+    def __init__(self, input_dim=512, hidden_dim=256, output_dim=128):
+        super().__init__()
+        self.layer1 = Linear(input_dim, hidden_dim, bias=False)
+        self.layer2 = Linear(hidden_dim, output_dim, bias=False)
+
+    def forward(self, x):
+        x = self.layer1(x)
+        x = torch.relu(x)
+        x = self.layer2(x)
+        return x
+
+
+def create_asymmetric_quant_config(
+    num_bits=4, strategy=QuantizationStrategy.GROUP, group_size=128
+) -> QuantizationConfig:
+    """Create an asymmetric quantization config"""
+    config_groups = {
+        "group_1": QuantizationScheme(
+            targets=["Linear"],
+            weights=QuantizationArgs(
+                num_bits=num_bits,
+                strategy=strategy.value,
+                group_size=(
+                    group_size if strategy == QuantizationStrategy.GROUP else None
+                ),
+                symmetric=False,
+            ),
+        ),
+    }
+    return QuantizationConfig(config_groups=config_groups)
+
+
+@pytest.mark.parametrize(
+    "strategy,group_size",
+    [
+        (QuantizationStrategy.GROUP, 128),
+        (QuantizationStrategy.CHANNEL, None),
+    ],
+)
+def test_end_to_end_asymmetric_quantization(
+    strategy,
+    group_size,
+    mock_per_group_calibration,
+    mock_per_channel_calibration,
+):
+    """
+    Test end-to-end workflow: quantize -> compress -> decompress in memory
+    """
+    model = SimpleModel()
+    original_weights = {
+        "layer1": model.layer1.weight.detach().clone(),
+        "layer2": model.layer2.weight.detach().clone(),
+    }
+
+    quant_config = create_asymmetric_quant_config(
+        num_bits=4, strategy=strategy, group_size=group_size
+    )
+    # Set pack-quantized format for ModelCompressor usage
+    quant_config.format = CompressionFormat.pack_quantized.value
+    apply_quantization_config(model, quant_config)
+
+    if strategy == QuantizationStrategy.GROUP:
+        mock_per_group_calibration(
+            model.layer1, "weight", model.layer1.weight, group_size
+        )
+        mock_per_group_calibration(
+            model.layer2, "weight", model.layer2.weight, group_size
+        )
+    else:
+        mock_per_channel_calibration(model.layer1, "weight", model.layer1.weight)
+        mock_per_channel_calibration(model.layer2, "weight", model.layer2.weight)
+
+    # Compress and decompress in memory using ModelCompressor
+    mc = ModelCompressor(quantization_config=quant_config)
+    mc.compress_model(model)
+
+    # Verify compression created zero-point parameters
+    assert hasattr(model.layer1, "weight_zero_point")
+    assert hasattr(model.layer2, "weight_zero_point")
+    assert model.layer1.weight_zero_point.dtype == torch.int32
+    assert model.layer2.weight_zero_point.dtype == torch.int32
+
+    # Decompress in memory
+    mc.decompress_model(model)
+
+    # Verify decompression restored weights correctly
+    assert model.layer1.weight.shape == original_weights["layer1"].shape
+    assert model.layer2.weight.shape == original_weights["layer2"].shape
+    assert model.layer1.weight.dtype.is_floating_point
+    assert model.layer2.weight.dtype.is_floating_point
+    assert not torch.isnan(model.layer1.weight).any()
+    assert not torch.isnan(model.layer2.weight).any()
+    assert not torch.isinf(model.layer1.weight).any()
+    assert not torch.isinf(model.layer2.weight).any()
+
+
+@pytest.mark.parametrize("num_bits", [4, 8])
+def test_asymmetric_quantization_accuracy(num_bits, mock_per_group_calibration):
+    """
+    Test that asymmetric quantization with zero-point preserves accuracy better
+    than symmetric quantization for biased weight distributions.
+    """
+    shape = (256, 512)
+    biased_weights = torch.randn(shape) + 2.0
+
+    quant_config = create_asymmetric_quant_config(
+        num_bits=num_bits,
+        strategy=QuantizationStrategy.GROUP,
+        group_size=128,
+    )
+    quant_config.format = CompressionFormat.pack_quantized.value
+
+    class SingleLayer(Module):
+        def __init__(self):
+            super().__init__()
+            self.layer = Linear(shape[1], shape[0], bias=False)
+
+    model = SingleLayer()
+    apply_quantization_config(model, quant_config)
+
+    with torch.no_grad():
+        model.layer.weight.copy_(biased_weights)
+        mock_per_group_calibration(model.layer, "weight", model.layer.weight, 128)
+
+    # Compress and decompress in memory using ModelCompressor
+    mc = ModelCompressor(quantization_config=quant_config)
+    mc.compress_model(model)
+    mc.decompress_model(model)
+
+    decompressed_weights = model.layer.weight
+    assert decompressed_weights.shape == shape
+    assert not torch.isnan(decompressed_weights).any()
+    assert not torch.isinf(decompressed_weights).any()
+    threshold = torch.std(torch.rand(shape) - torch.rand(shape))
+    assert torch.std(biased_weights - decompressed_weights) < threshold
tests/test_transform/conftest.py:
@@ -62,7 +62,9 @@ class MockAttention(torch.nn.Module):
             num_attention_heads * self.head_dim, hidden_size, bias=False
         )
 
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+    def forward(
+        self, hidden_states: torch.Tensor, past_key_values=None
+    ) -> torch.Tensor:
         batch_size, seq_len, hidden_size = hidden_states.shape
         hidden_shape = (batch_size, seq_len, -1, self.head_dim)
 
@@ -70,6 +72,9 @@ class MockAttention(torch.nn.Module):
         query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
         key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
         value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
 
+        if past_key_values is not None:
+            past_key_values.update(key_states, value_states, 0, {})
+
         key_states = self.repeat_kv(key_states, self.num_key_value_groups)
         value_states = self.repeat_kv(value_states, self.num_key_value_groups)
 
@@ -97,6 +102,21 @@ class MockAttention(torch.nn.Module):
         return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
 
 
+class MockAttentionModel(PreTrainedModel):
+    config_class = PretrainedConfig
+
+    def __init__(self, hidden_size, num_attention_heads, num_key_value_heads):
+        super().__init__(PretrainedConfig())
+        self.self_attn = MockAttention(
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            num_key_value_heads=num_key_value_heads,
+        )
+
+    def forward(self, x):
+        return self.self_attn(x)
+
+
 @pytest.fixture(scope="function")
 def model_apply():
     model = TransformableModel(2, 4, 8, 16, 32, 64)