compressed-tensors 0.12.3a20251028__tar.gz → 0.12.3a20251110__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (163)
  1. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/.github/workflows/test-check.yaml +1 -1
  2. {compressed_tensors-0.12.3a20251028/src/compressed_tensors.egg-info → compressed_tensors-0.12.3a20251110}/PKG-INFO +1 -1
  3. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/quantized_compressors/base.py +8 -1
  4. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/quantized_compressors/fp4_quantized.py +3 -2
  5. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py +4 -9
  6. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/quantization/lifecycle/forward.py +7 -10
  7. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/quantization/lifecycle/initialize.py +11 -17
  8. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/quantization/quant_args.py +73 -8
  9. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/quantization/quant_config.py +0 -1
  10. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/quantization/quant_scheme.py +7 -0
  11. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/quantization/utils/helpers.py +45 -43
  12. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/transform/factory/base.py +34 -3
  13. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/transform/factory/hadamard.py +0 -1
  14. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/transform/factory/matrix_multiply.py +2 -3
  15. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/transform/transform_args.py +11 -4
  16. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/transform/utils/matrix.py +13 -21
  17. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/version.py +1 -1
  18. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110/src/compressed_tensors.egg-info}/PKG-INFO +1 -1
  19. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors.egg-info/SOURCES.txt +1 -5
  20. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_compressors/model_compressors/test_model_compressor.py +13 -2
  21. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_compressors/quantized_compressors/test_pack_quant.py +95 -5
  22. compressed_tensors-0.12.3a20251110/tests/test_compressors/quantized_compressors/test_packed_asym_decompression.py +172 -0
  23. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/lifecycle/test_apply.py +40 -7
  24. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/lifecycle/test_dynamic_lifecycle.py +1 -1
  25. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/lifecycle/test_initialize.py +15 -3
  26. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/lifecycle/test_lifecycle.py +1 -1
  27. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/lifecycle/test_static_lifecycle.py +5 -0
  28. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_transform/conftest.py +21 -1
  29. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_transform/factory/test_correctness.py +74 -7
  30. compressed_tensors-0.12.3a20251028/.github/workflows/build-test.yml +0 -57
  31. compressed_tensors-0.12.3a20251028/.github/workflows/build.yml +0 -134
  32. compressed_tensors-0.12.3a20251028/.github/workflows/post-release-nightly-build.yml +0 -15
  33. compressed_tensors-0.12.3a20251028/.github/workflows/test.yml +0 -187
  34. compressed_tensors-0.12.3a20251028/.github/workflows/trigger-all.yml +0 -45
  35. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/.github/.gitkeep +0 -0
  36. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/.github/actions/test/action.yml +0 -0
  37. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/.github/scripts/step-status +0 -0
  38. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/.github/workflows/quality-check.yaml +0 -0
  39. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/.gitignore +0 -0
  40. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/LICENSE +0 -0
  41. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/Makefile +0 -0
  42. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/README.md +0 -0
  43. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/examples/bit_packing/ex_quantize_and_pack.py +0 -0
  44. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/examples/bit_packing/int4_config.json +0 -0
  45. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/examples/bitmask_compression.ipynb +0 -0
  46. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/examples/llama_1.1b/ex_config_quantization.py +0 -0
  47. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/examples/llama_1.1b/ex_llmcompressor_quantization.py +0 -0
  48. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/examples/llama_1.1b/example_quant_config.json +0 -0
  49. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/examples/llama_1.1b/example_quant_recipe.yaml +0 -0
  50. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/examples/quantize_and_pack_int4.ipynb +0 -0
  51. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/pyproject.toml +0 -0
  52. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/setup.cfg +0 -0
  53. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/setup.py +0 -0
  54. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/__init__.py +0 -0
  55. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/README.md +0 -0
  56. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/__init__.py +0 -0
  57. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/base.py +0 -0
  58. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/__init__.py +0 -0
  59. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/base.py +0 -0
  60. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/helpers.py +0 -0
  61. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/model_compressors/__init__.py +0 -0
  62. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/model_compressors/model_compressor.py +0 -0
  63. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/quantized_compressors/__init__.py +0 -0
  64. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/quantized_compressors/naive_quantized.py +0 -0
  65. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/sparse_compressors/__init__.py +0 -0
  66. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/sparse_compressors/base.py +0 -0
  67. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/sparse_compressors/dense.py +0 -0
  68. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py +0 -0
  69. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/sparse_compressors/sparse_bitmask.py +0 -0
  70. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/sparse_quantized_compressors/__init__.py +0 -0
  71. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/compressors/sparse_quantized_compressors/marlin_24.py +0 -0
  72. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/config/__init__.py +0 -0
  73. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/config/base.py +0 -0
  74. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/config/dense.py +0 -0
  75. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/config/format.py +0 -0
  76. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/config/sparse_24_bitmask.py +0 -0
  77. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/config/sparse_bitmask.py +0 -0
  78. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/linear/__init__.py +0 -0
  79. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/linear/compressed_linear.py +0 -0
  80. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/logger.py +0 -0
  81. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/modeling/__init__.py +0 -0
  82. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/modeling/attention.py +0 -0
  83. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/modeling/kvcache.py +0 -0
  84. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/quantization/__init__.py +0 -0
  85. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/quantization/lifecycle/__init__.py +0 -0
  86. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/quantization/lifecycle/apply.py +0 -0
  87. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/quantization/lifecycle/compressed.py +0 -0
  88. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/quantization/lifecycle/helpers.py +0 -0
  89. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/quantization/quant_metadata.py +0 -0
  90. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/quantization/utils/__init__.py +0 -0
  91. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/quantization/utils/mxfp4_utils.py +0 -0
  92. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/registry/__init__.py +0 -0
  93. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/registry/registry.py +0 -0
  94. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/transform/__init__.py +0 -0
  95. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/transform/apply.py +0 -0
  96. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/transform/factory/__init__.py +0 -0
  97. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/transform/factory/random_hadamard.py +0 -0
  98. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/transform/transform_config.py +0 -0
  99. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/transform/transform_scheme.py +0 -0
  100. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/transform/utils/__init__.py +0 -0
  101. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/transform/utils/hadamard.py +0 -0
  102. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/transform/utils/hadamards.safetensors +0 -0
  103. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/utils/__init__.py +0 -0
  104. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/utils/helpers.py +0 -0
  105. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/utils/internal.py +0 -0
  106. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/utils/match.py +0 -0
  107. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/utils/offload.py +0 -0
  108. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/utils/permutations_24.py +0 -0
  109. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/utils/safetensors_load.py +0 -0
  110. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/utils/semi_structured_conversions.py +0 -0
  111. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors/utils/type.py +0 -0
  112. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors.egg-info/dependency_links.txt +0 -0
  113. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors.egg-info/requires.txt +0 -0
  114. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/src/compressed_tensors.egg-info/top_level.txt +0 -0
  115. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/__init__.py +0 -0
  116. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/conftest.py +0 -0
  117. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/mock_observer.py +0 -0
  118. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_compressors/__init__.py +0 -0
  119. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_compressors/model_compressors/__init__.py +0 -0
  120. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_compressors/quantized_compressors/__init__.py +0 -0
  121. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_compressors/quantized_compressors/test_fp4_quant.py +0 -0
  122. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_compressors/quantized_compressors/test_fp8_quant.py +0 -0
  123. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_compressors/quantized_compressors/test_int_quant.py +0 -0
  124. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_compressors/sparse_compressors/__init__.py +0 -0
  125. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_compressors/sparse_compressors/test_bitmask.py +0 -0
  126. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_compressors/sparse_compressors/test_sparse_24_bitmask.py +0 -0
  127. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_compressors/sparse_quantized_compressors/__init__.py +0 -0
  128. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_compressors/sparse_quantized_compressors/test_marlin_24.py +0 -0
  129. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_configs/__init__.py +0 -0
  130. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_configs/test_base.py +0 -0
  131. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_configs/test_infer_quant.py +0 -0
  132. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_examples/test_bitmask_compression_ipynb.py +0 -0
  133. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_linear/__init__.py +0 -0
  134. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_linear/test_compressed_linear.py +0 -0
  135. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_modeling/test_attention_and_cache.py +0 -0
  136. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/__init__.py +0 -0
  137. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/lifecycle/__init__.py +0 -0
  138. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/lifecycle/conftest.py +0 -0
  139. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/lifecycle/test_enabled.py +0 -0
  140. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/lifecycle/test_forward.py +0 -0
  141. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/test_configs/__init__.py +0 -0
  142. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/test_configs/test_bit_depths.py +0 -0
  143. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/test_configs/test_strategies.py +0 -0
  144. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/test_quant_args.py +0 -0
  145. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/test_quant_config.py +0 -0
  146. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/test_quant_scheme.py +0 -0
  147. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/test_utils/test_helpers.py +0 -0
  148. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_quantization/test_utils/test_mxfp4_utils.py +0 -0
  149. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_registry.py +0 -0
  150. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_transform/factory/test_memory.py +0 -0
  151. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_transform/factory/test_serialization.py +0 -0
  152. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_transform/test_transform_args.py +0 -0
  153. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_transform/test_transform_config.py +0 -0
  154. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_transform/test_transform_scheme.py +0 -0
  155. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_transform/utils/test_hadamard.py +0 -0
  156. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_utils/__init__.py +0 -0
  157. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_utils/test_helpers.py +0 -0
  158. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_utils/test_match.py +0 -0
  159. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_utils/test_offload.py +0 -0
  160. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_utils/test_safetensors_load.py +0 -0
  161. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/test_utils/test_type.py +0 -0
  162. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/tests/testing_utils.py +0 -0
  163. {compressed_tensors-0.12.3a20251028 → compressed_tensors-0.12.3a20251110}/utils/copyright.py +0 -0
.github/workflows/test-check.yaml

@@ -12,7 +12,7 @@ on:
 
 jobs:
   python-tests:
-    runs-on: ubuntu-22.04
+    runs-on: ibm-wdc-k8s-vllm-h100-solo
     env:
       HF_TOKEN: ${{ secrets.HF_RED_HAT_READ_ONLY }}
     steps:
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.12.3a20251028
+Version: 0.12.3a20251110
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/vllm-project/compressed-tensors
 Author: Neuralmagic, Inc.
src/compressed_tensors/compressors/quantized_compressors/base.py

@@ -90,7 +90,6 @@ class BaseQuantizationCompressor(BaseCompressor):
         desc = "Compressing with quantization"
         for name in tqdm(uncompressed_names, desc=desc, disable=(not show_progress)):
             value = model_state[name]
-
             # compress weights
             if name.endswith("weight"):
                 prefix = name.removesuffix("weight")
@@ -129,10 +128,18 @@ class BaseQuantizationCompressor(BaseCompressor):
             if name.endswith("zero_point") and self._skip_zp(name, names_to_scheme):
                 continue
 
+            if name.endswith("weight_scale") and self._skip_scale():
+                continue
+
             compressed_dict[name] = value.to(compression_device)
 
         return compressed_dict
 
+    def _skip_scale(self):
+        from compressed_tensors.compressors import NVFP4PackedCompressor
+
+        return isinstance(self, NVFP4PackedCompressor)
+
     def _skip_zp(
         self, name: str, names_to_scheme: Dict[str, QuantizationScheme]
     ) -> bool:
src/compressed_tensors/compressors/quantized_compressors/fp4_quantized.py

@@ -26,7 +26,7 @@ from compressed_tensors.quantization.lifecycle.forward import dequantize, quanti
 from torch import Tensor
 
 
-__all__ = ["pack_fp4_to_uint8", "unpack_fp4_from_uint8"]
+__all__ = ["pack_fp4_to_uint8", "unpack_fp4_from_uint8", "NVFP4PackedCompressor"]
 
 FLOAT_TO_E2M1 = [
     0.0,
@@ -103,6 +103,7 @@ class NVFP4PackedCompressor(BaseQuantizationCompressor):
         if device is not None:
             weight_packed = weight_packed.to(device)
         compressed_dict["weight_packed"] = weight_packed
+        compressed_dict["weight_scale"] = scale.to(quantization_args.scale_dtype)
         return compressed_dict
 
     def decompress_weight(
@@ -111,8 +112,8 @@ class NVFP4PackedCompressor(BaseQuantizationCompressor):
         quantization_args: Optional[QuantizationArgs] = None,
     ) -> torch.Tensor:
         weight = compressed_data["weight_packed"]
-        scale = compressed_data["weight_scale"]
         global_scale = compressed_data["weight_global_scale"]
+        scale = compressed_data["weight_scale"]
         m, n = weight.shape
         # TODO: use a user provided dequant dtype
         unpacked = unpack_fp4_from_uint8(weight, m, n * 2)
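
Illustrative note, not part of the package diff: under the NVFP4 presets added in quant_scheme.py further down, the new compress_weight line amounts to casting the per-group scale to FP8 E4M3 before it is serialized. A minimal sketch of that cast, with hypothetical tensor shapes:

    import torch

    # hypothetical per-group NVFP4 scales produced during calibration
    scale = torch.rand(128, 16, dtype=torch.bfloat16)

    # roughly what the added compress_weight line does when
    # quantization_args.scale_dtype is FP8 E4M3 (see the NVFP4 presets below)
    weight_scale = scale.to(torch.float8_e4m3fn)
    assert weight_scale.dtype == torch.float8_e4m3fn
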
src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py

@@ -134,8 +134,6 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
         compressed_dict["weight_shape"] = weight_shape
         compressed_dict["weight_packed"] = packed_weight
 
-        # We typically don't compress zp; apart from when using the packed_compressor
-        # and when storing group/channel zp
         if not quantization_args.symmetric and quantization_args.strategy in [
             QuantizationStrategy.GROUP.value,
             QuantizationStrategy.CHANNEL.value,
@@ -143,7 +141,7 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
             packed_zp = pack_to_int32(
                 zero_point, quantization_args.num_bits, packed_dim=0
             )
-            compressed_dict["weight_zero_point"] = packed_zp
+            compressed_dict["weight_zero_point"] = packed_zp.contiguous()
         return compressed_dict
 
     def decompress_weight(
@@ -166,16 +164,13 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
         num_bits = quantization_args.num_bits
         unpacked = unpack_from_int32(weight, num_bits, original_shape)
 
-        # NOTE: this will fail decompression as we don't currently handle packed zp on
-        # decompression
         if not quantization_args.symmetric and quantization_args.strategy in [
             QuantizationStrategy.GROUP.value,
             QuantizationStrategy.CHANNEL.value,
         ]:
-            raise ValueError(
-                "Decompression of packed zero points is currently not supported"
-            )
-            assert zero_point is not None
+            assert (
+                zero_point is not None
+            ), "Asymmetric quantization requires zero-point values"
             original_zp_shape = (original_shape[0], scale.shape[-1])
             zero_point = unpack_from_int32(
                 zero_point, num_bits, original_zp_shape, packed_dim=0
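
Together these hunks make packed asymmetric zero points survive the round trip; the new tests/test_compressors/quantized_compressors/test_packed_asym_decompression.py (file 22 in the list above) exercises that path end to end. A rough sketch of the pack/unpack symmetry the decompressor now relies on, assuming pack_to_int32 and unpack_from_int32 are importable from the module shown above and accept in-range int8 values (shapes and values here are made up):

    import torch
    # assumed import path, matching the module in the hunks above
    from compressed_tensors.compressors.quantized_compressors.pack_quantized import (
        pack_to_int32,
        unpack_from_int32,
    )

    num_bits = 4
    # hypothetical group-wise zero points: (output_dim, num_groups), int4 range
    zero_point = torch.randint(-8, 8, (64, 4), dtype=torch.int8)

    packed_zp = pack_to_int32(zero_point, num_bits, packed_dim=0).contiguous()
    restored = unpack_from_int32(packed_zp, num_bits, zero_point.shape, packed_dim=0)
    # in-range values are expected to survive the round trip
    assert torch.equal(restored.to(torch.int8), zero_point)
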
src/compressed_tensors/quantization/lifecycle/forward.py

@@ -21,7 +21,7 @@ from compressed_tensors.quantization.quant_args import (
     DynamicType,
     QuantizationArgs,
     QuantizationStrategy,
-    round_to_quantized_type,
+    round_to_quantized_type_args,
 )
 from compressed_tensors.quantization.quant_config import QuantizationStatus
 from compressed_tensors.quantization.quant_scheme import QuantizationScheme
@@ -278,7 +278,7 @@ def _process_quantization(
         if columns % group_size != 0:
             raise ValueError(
                 "tensor column shape must be divisble "
-                f"by the given group_size {group_size}"
+                f"by the given group_size {group_size} but got {columns}"
             )
 
         # support column-order (default) quantization as well as other orderings
@@ -466,20 +466,17 @@ def _quantize(
     # if a global scale is optionally provided, use it
     # to further scale the local `scale` parameter
     if global_scale is not None:
-        scale = scale.to(global_scale.dtype) / global_scale
+        scale = scale / global_scale
 
     scaled = x / scale
 
     if zero_point is not None:
         scaled += zero_point.to(x.dtype)
 
-    # clamp first because cast isn't guaranteed to be saturated (ie for fp8)
-    clamped_value = torch.clamp(
-        scaled,
-        q_min,
-        q_max,
+    # clamp and round
+    quantized_value = round_to_quantized_type_args(
+        tensor=scaled, args=args, min=q_min, max=q_max
     )
-    quantized_value = round_to_quantized_type(clamped_value, args)
 
     if dtype is not None:
         quantized_value = quantized_value.to(dtype)
@@ -499,7 +496,7 @@ def _dequantize(
     # if a global scale is optionally provided, use it
     # to further scale the local `scale` parameter
     if global_scale is not None:
-        scale = scale.to(global_scale.dtype) / global_scale
+        scale = scale / global_scale
 
     dequant_value = x_q.to(scale.dtype)
 
src/compressed_tensors/quantization/lifecycle/initialize.py

@@ -24,7 +24,6 @@ from compressed_tensors.modeling import (
     QuantizedKVCache,
 )
 from compressed_tensors.quantization import (
-    FP8_E4M3_DATA,
     ActivationOrdering,
     DynamicType,
     QuantizationArgs,
@@ -36,7 +35,7 @@ from compressed_tensors.quantization import (
 from compressed_tensors.quantization.lifecycle.forward import (
     wrap_module_forward_quantized,
 )
-from compressed_tensors.quantization.utils import is_fp4, strategy_cdiv
+from compressed_tensors.quantization.utils import strategy_cdiv
 from compressed_tensors.utils import (
     disable_hf_hook,
     get_execution_device,
@@ -250,20 +249,13 @@ def initialize_qparams(
 
     # 2. Identify quantization scale and zp dtype
     scale_dtype = observed_dtype
-
-    if is_fp4(quantization_args=quantization_args):
-        scale_dtype = zp_dtype = FP8_E4M3_DATA.dtype
-    else:
-        # TODO: consider erroring out in the future as if the dtype if not one of these,
-        # there is likely bug
-        if scale_dtype not in [
-            torch.float16,
-            torch.bfloat16,
-            torch.float32,
-            torch.float64,
-        ]:
-            scale_dtype = torch.bfloat16
-        zp_dtype = quantization_args.pytorch_dtype()
+    if scale_dtype not in [
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+        torch.float64,
+    ]:
+        scale_dtype = torch.float16
 
     # 3. Initializes scale/zp for the module
     init_scale = Parameter(
@@ -274,7 +266,9 @@ def initialize_qparams(
 
     if force_zero_point or not quantization_args.symmetric:
         init_zero_point = Parameter(
-            torch.zeros(expected_shape, device=device, dtype=zp_dtype),
+            torch.zeros(
+                expected_shape, device=device, dtype=quantization_args.zp_dtype
+            ),
             requires_grad=False,
         )
         register_offload_parameter(module, f"{base_name}_zero_point", init_zero_point)
src/compressed_tensors/quantization/quant_args.py

@@ -19,7 +19,15 @@ from typing import Any, Dict, List, Optional, Union
 import torch
 from compressed_tensors.utils import Aliasable
 from compressed_tensors.utils.helpers import deprecated
-from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
+from compressed_tensors.utils.type import TorchDtype
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    Field,
+    field_serializer,
+    field_validator,
+    model_validator,
+)
 
 
 __all__ = [
@@ -30,7 +38,8 @@ __all__ = [
     "QuantizationType",
     "QuantizationStrategy",
     "QuantizationArgs",
-    "round_to_quantized_type",
+    "round_to_quantized_type_args",
+    "round_to_quantized_type_dtype",
     "ActivationOrdering",
     "DynamicType",
 ]
@@ -174,6 +183,8 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
     block_structure: Optional[List[int]] = None
     dynamic: Union[DynamicType, bool] = False
     actorder: Union[ActivationOrdering, bool, None] = None
+    scale_dtype: Optional[TorchDtype] = None
+    zp_dtype: Optional[TorchDtype] = None
     observer: Optional[str] = Field(
         default=None,
         description=(
@@ -189,6 +200,12 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         ),
     )
 
+    @field_serializer("zp_dtype")
+    def serialize_dtype(self, dtype: torch.dtype):
+        if self.symmetric:
+            return None
+        return str(dtype)
+
     @field_validator("type", mode="before")
     def validate_type(cls, value) -> QuantizationType:
         if isinstance(value, str):
@@ -266,6 +283,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         dynamic = model.dynamic
         observer = model.observer
         dynamic = model.dynamic
+        zp_dtype = model.zp_dtype
 
         # infer strategy
         if strategy is None:
@@ -353,9 +371,16 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
             # default to minmax for non-dynamic cases
             observer = "minmax"
 
+        if zp_dtype is None:
+            if model.num_bits == 4 and model.type == QuantizationType.FLOAT:
+                zp_dtype = FP8_E4M3_DATA.dtype
+            else:
+                zp_dtype = model.pytorch_dtype()
+
         # write back modified values
         model.strategy = strategy
         model.observer = observer
+        model.zp_dtype = zp_dtype
         return model
 
     def pytorch_dtype(self) -> torch.dtype:
@@ -381,18 +406,56 @@
     model_config = ConfigDict(extra="forbid")
 
 
-def round_to_quantized_type(
-    tensor: torch.Tensor, args: QuantizationArgs
+def round_to_quantized_type_dtype(
+    tensor: torch.Tensor,
+    dtype: torch.dtype,
+    cast_to_original_dtype: Optional[bool] = True,
 ) -> torch.Tensor:
     """
-    Rounds each element of the input tensor to the nearest quantized representation,
-    keeping to original dtype
+    Rounds an input tensor to the nearest quantized representation given a dtype.
+    The original dtype is kept post-rounding.
 
     :param tensor: tensor to round
-    :param args: QuantizationArgs to pull appropriate dtype from
+    :param dtype: dtype to use for rounding
+    :param cast_to_original_dtype: whether or not we cast the rounded tensor to
+        the original dtype
    :return: rounded tensor
    """
    original_dtype = tensor.dtype
+    if torch.is_floating_point(torch.tensor([], dtype=dtype)):
+        finfo = torch.finfo(dtype)
+        rounded = torch.clamp(tensor, finfo.min, finfo.max).to(dtype)
+    else:
+        iinfo = torch.iinfo(dtype)
+        rounded = torch.round(torch.clamp(tensor, iinfo.min, iinfo.max)).to(dtype)
+
+    if cast_to_original_dtype:
+        return rounded.to(original_dtype)
+    return rounded
+
+
+def round_to_quantized_type_args(
+    tensor: torch.Tensor,
+    args: QuantizationArgs,
+    min: torch.Tensor,
+    max: torch.Tensor,
+    cast_to_original_dtype: Optional[bool] = True,
+) -> torch.Tensor:
+    """
+    Rounds an input tensor to the nearest quantized representation given
+    qunatization args. The original dtype is kept post-rounding.
+
+    :param tensor: tensor to round
+    :param args: quantization args to use for rounding
+    :param min: min value to use for clamping
+    :param max: max value to use for clamping
+    :param cast_to_original_dtype: whether or not we cast the rounded tensor to
+        the original dtype
+    :return: rounded tensor
+    """
+
+    original_dtype = tensor.dtype
+    tensor = torch.clamp(tensor, min, max)
     if args.type == QuantizationType.FLOAT:
         if args.num_bits == 8:
             rounded = tensor.to(FP8_E4M3_DATA.dtype)
@@ -405,4 +468,6 @@ def round_to_quantized_type(
     else:
         raise ValueError(f"Invalid quantization type {args.type}")
 
-    return rounded.to(original_dtype)
+    if cast_to_original_dtype:
+        return rounded.to(original_dtype)
+    return rounded
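
Illustrative sketch, not part of the package diff: how the new scale_dtype/zp_dtype fields and the clamp-and-round helper above might be exercised. Import paths follow the hunks; the argument values and ranges below are made up, and the asserted defaults simply follow the validator logic shown above.

    import torch
    from compressed_tensors.quantization.quant_args import (
        QuantizationArgs,
        round_to_quantized_type_args,
    )

    # zp_dtype left unset: the model validator above falls back to pytorch_dtype()
    int4_args = QuantizationArgs(num_bits=4, type="int", symmetric=False, group_size=128)
    assert int4_args.zp_dtype == torch.int8

    # FP4 args can pin both dtypes explicitly, as the NVFP4 presets below do
    fp4_args = QuantizationArgs(
        num_bits=4,
        type="float",
        symmetric=True,
        group_size=16,
        scale_dtype=torch.float8_e4m3fn,
        zp_dtype=torch.float8_e4m3fn,
    )

    # the single clamp-and-round call that forward.py now uses in place of the old
    # torch.clamp + round_to_quantized_type pair (made-up int8 range and values)
    args = QuantizationArgs(num_bits=8, type="int", symmetric=True, strategy="tensor")
    scaled = torch.randn(4, 4) * 300
    quantized = round_to_quantized_type_args(
        tensor=scaled, args=args, min=torch.tensor(-128.0), max=torch.tensor(127.0)
    )
    assert quantized.dtype == scaled.dtype  # cast_to_original_dtype defaults to True
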
src/compressed_tensors/quantization/quant_config.py

@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from collections import defaultdict
 from enum import Enum
 from typing import Annotated, Any, Dict, List, Optional, Set, Union
src/compressed_tensors/quantization/quant_scheme.py

@@ -18,6 +18,7 @@ from typing import List, Optional
 
 from compressed_tensors.config import CompressionFormat
 from compressed_tensors.quantization.quant_args import (
+    FP8_E4M3_DATA,
     DynamicType,
     QuantizationArgs,
     QuantizationStrategy,
@@ -160,6 +161,8 @@ NVFP4A16 = dict(
         symmetric=True,
         dynamic=False,
         group_size=16,
+        scale_dtype=FP8_E4M3_DATA.dtype,
+        zp_dtype=FP8_E4M3_DATA.dtype,
     )
 )
 
@@ -173,6 +176,8 @@ NVFP4 = dict(
         dynamic=False,
         group_size=16,
         observer="static_minmax",
+        scale_dtype=FP8_E4M3_DATA.dtype,
+        zp_dtype=FP8_E4M3_DATA.dtype,
     ),
     input_activations=QuantizationArgs(
         num_bits=4,
@@ -182,6 +187,8 @@ NVFP4 = dict(
         dynamic=DynamicType.LOCAL,
         group_size=16,
         observer="static_minmax",
+        scale_dtype=FP8_E4M3_DATA.dtype,
+        zp_dtype=FP8_E4M3_DATA.dtype,
     ),
 )
 
src/compressed_tensors/quantization/utils/helpers.py

@@ -24,6 +24,7 @@ from compressed_tensors.quantization.quant_args import (
     QuantizationArgs,
     QuantizationStrategy,
     QuantizationType,
+    round_to_quantized_type_dtype,
 )
 from compressed_tensors.quantization.quant_scheme import QuantizationScheme
 from compressed_tensors.utils import deprecated
@@ -46,7 +47,6 @@ __all__ = [
     "calculate_range",
     "calculate_qparams",
     "generate_gparam",
-    "is_fp4",
     "strategy_cdiv",
 ]
 
@@ -57,13 +57,6 @@ KV_CACHE_TARGETS = ["re:.*self_attn$"]
 _LOGGER: logging.Logger = logging.getLogger(__name__)
 
 
-def is_fp4(quantization_args: QuantizationArgs):
-    return (
-        quantization_args.num_bits == 4
-        and quantization_args.type == QuantizationType.FLOAT
-    )
-
-
 def calculate_qparams(
     min_vals: Tensor,
     max_vals: Tensor,
@@ -92,52 +85,50 @@ def calculate_qparams(
     bit_min, bit_max = calculate_range(quantization_args, device)
     bit_range = bit_max - bit_min
 
-    if is_fp4(quantization_args=quantization_args):
-        zp_dtype = FP8_E4M3_DATA.dtype
-    else:
-        zp_dtype = quantization_args.pytorch_dtype()
-
+    # 1. Generate scale and zero-point
     if quantization_args.symmetric:
         max_val_pos = torch.max(torch.abs(min_vals), torch.abs(max_vals))
-
-        if is_fp4(quantization_args=quantization_args) and global_scale is not None:
-            # Conditionally scale the generated local scale by a global_scale
-            scales = global_scale * (max_val_pos / FP4_E2M1_DATA.max)
-            scales = torch.clamp(scales, max=FP8_E4M3_DATA.max, min=FP8_E4M3_DATA.min)
-            scales = scales.to(FP8_E4M3_DATA.dtype)
-
-        else:
-            scales = max_val_pos / (float(bit_range) / 2)
-
-        # TODO: in the case of MoEs, the global_scale may also be 0/need to be clamped
-        if scales.dtype == FP8_E4M3_DATA.dtype:
-            # torch.clamp not supported for FP8
-            # use the next largest fp8 value from 0
-            scales = torch.where(
-                scales == 0,
-                torch.tensor(0.125, dtype=FP8_E4M3_DATA.dtype, device=device),
-                scales,
-            )
-        else:
-            scales = torch.clamp(scales, min=torch.finfo(torch.float32).eps)
-
+        scales = max_val_pos / (float(bit_range) / 2)
         zero_points = torch.zeros(scales.shape, device=device, dtype=min_vals.dtype)
     else:
-        if is_fp4(quantization_args=quantization_args):
+        if (
+            quantization_args.num_bits == 4
+            and quantization_args.type == QuantizationType.FLOAT
+        ):
             raise NotImplementedError(
                 "Asymmetric Quantization is not supported for FP4"
             )
-
         scales = (max_vals - min_vals) / float(bit_range)
-        scales = torch.clamp(scales, min=torch.finfo(torch.float32).eps)
         zero_points = bit_min - (min_vals / scales)
         zero_points = torch.clamp(zero_points, bit_min, bit_max)
 
-    # match zero-points to quantized type
-    # if casting to int, use round instead of truncate
-    if quantization_args.type == QuantizationType.INT:
-        zero_points = torch.round(zero_points)
-    zero_points = zero_points.to(zp_dtype)
+    # 2. Conditionally scale the generated local scale by a global_scale
+    if global_scale is not None:
+        scales = global_scale * scales
+
+    # 3. Conditionally round the scale to the quantized dtype, if scale_dtype is set
+    if quantization_args.scale_dtype is not None:
+        scales = round_to_quantized_type_dtype(
+            scales, dtype=quantization_args.scale_dtype
+        )
+
+    # 4. Update any 0s with small values to
+    # prevent div by 0
+    eps = _get_dtype_eps(
+        dtype=quantization_args.scale_dtype
+        if quantization_args.scale_dtype is not None
+        else scales.dtype
+    )
+    scales = torch.where(
+        scales == 0,
+        torch.tensor(eps, dtype=scales.dtype, device=device),
+        scales,
+    )
+
+    # 5. Round the zp to zp_dtype
+    zero_points = round_to_quantized_type_dtype(
+        zero_points, dtype=quantization_args.zp_dtype, cast_to_original_dtype=False
+    )
 
     if scales.ndim == 0:
         scales = scales.reshape(1)
@@ -455,3 +446,14 @@ def strategy_cdiv(
         logger.bind(log_once=True).warning(message)
 
     return dividend
+
+
+def _get_dtype_eps(dtype: torch.dtype) -> float:
+    if dtype == FP8_E4M3_DATA.dtype:
+        return 0.125
+    elif dtype == FP4_E2M1_DATA.dtype:
+        return 0.25
+    elif torch.is_floating_point(torch.tensor([], dtype=dtype)):
+        return torch.finfo(dtype).eps
+    else:
+        return 1
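
For reference, a stand-alone mirror of the epsilon selection introduced above (illustrative only; it reimplements the logic rather than importing the private _get_dtype_eps helper). FP8 E4M3 scales use the fixed constant 0.125 and FP4 E2M1 scales use 0.25, while other float dtypes fall back to torch.finfo(...).eps:

    import torch

    def dtype_eps(dtype: torch.dtype) -> float:
        # mirror of _get_dtype_eps for ordinary torch dtypes
        if dtype == torch.float8_e4m3fn:
            return 0.125
        if torch.is_floating_point(torch.tensor([], dtype=dtype)):
            return torch.finfo(dtype).eps
        return 1

    # replace zero scales with the dtype-appropriate epsilon, as step 4 above does
    scales = torch.tensor([0.0, 0.5, 0.0], dtype=torch.bfloat16)
    eps = dtype_eps(scales.dtype)
    scales = torch.where(scales == 0, torch.tensor(eps, dtype=scales.dtype), scales)
    assert (scales != 0).all()
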
src/compressed_tensors/transform/factory/base.py

@@ -18,6 +18,14 @@ from typing import List, Optional
 import torch
 import torch.nn.utils.parametrize as P
 import tqdm
+from compressed_tensors.modeling.attention import (
+    initialize_hooked_attention,
+    register_query_hook,
+)
+from compressed_tensors.modeling.kvcache import (
+    initialize_hooked_kv_cache,
+    register_key_hook,
+)
 from compressed_tensors.registry.registry import RegistryMixin, T
 from compressed_tensors.transform import (
     TransformArgs,
@@ -36,6 +44,7 @@ from compressed_tensors.utils import (
 from compressed_tensors.utils.internal import InternalModule
 from torch import Tensor
 from torch.nn import Module, Parameter
+from transformers import PreTrainedModel
 
 
 __all__ = ["TransformFactory", "TransformBase"]
@@ -97,12 +106,13 @@ class TransformFactory(RegistryMixin, ABC):
 
         desc = f"Applying {self.name} transforms"
         for module, arg in tqdm.tqdm(modules_args, desc=desc, disable=(not use_tqdm)):
-            self._apply_to_module(module, arg)
+            self._apply_to_module(model, module, arg)
 
-    def _apply_to_module(self, module: Module, args: TransformArgs):
+    def _apply_to_module(self, model: Module, module: Module, args: TransformArgs):
        """
        Create transforms and apply them to the module
 
+        :param model: model which module belongs to
         :param module: target module to apply transforms to
         :param args: defines how the transform will be applied to the target module
         """
@@ -156,7 +166,28 @@ class TransformFactory(RegistryMixin, ABC):
 
             module.register_forward_hook(output_hook)
 
-        # other locations such as q_attn and k_attn have not been implemented
+        # register query hook to attention
+        elif args.location == TransformLocation.Q_ATTN:
+            if not isinstance(model, PreTrainedModel):
+                raise ValueError(f"Cannot hook attention of model: {model}")
+
+            def query_hook(_, query_states):
+                return transform(query_states)
+
+            initialize_hooked_attention(model, module)
+            register_query_hook(module, query_hook)
+
+        # register key hook to kvcache
+        elif args.location == TransformLocation.K_CACHE:
+            if not isinstance(model, PreTrainedModel):
+                raise ValueError(f"Cannot hook attention of model: {model}")
+
+            def key_hook(_, key_states):
+                return transform(key_states)
+
+            initialize_hooked_kv_cache(model, module)
+            register_key_hook(module, key_hook)
+
         else:
             raise NotImplementedError()
 
src/compressed_tensors/transform/factory/hadamard.py

@@ -51,7 +51,6 @@ class HadamardFactory(TransformFactory):
         :param module: parent module that transform will be applied to
         :param args: defines how the transform will be applied to the module
         """
-        assert hasattr(module, "weight")
         size = get_transform_size(module, args.location, self.scheme.head_dim)
         exec_device = get_execution_device(module)
         device = get_offloaded_device(module)
src/compressed_tensors/transform/factory/matrix_multiply.py

@@ -50,7 +50,6 @@ class RandomMatrixFactory(TransformFactory):
         :param module: parent module that transform will be applied to
         :param args: defines how the transform will be applied to the module
         """
-        assert hasattr(module, "weight")
         size = get_transform_size(module, args.location, self.scheme.head_dim)
         device = get_offloaded_device(module)
         precision = self.scheme.precision if args.is_online() else torch.float64
@@ -68,8 +67,8 @@ class RandomMatrixFactory(TransformFactory):
             (size, size),
             generator=self.generator,
             dtype=precision,
-            device=device,
-        )
+            device=self.generator.device,
+        ).to(device)
         return Parameter(data, requires_grad=self.scheme.requires_grad)
 
     def _create_inverse(self, weight: Parameter) -> Parameter:
src/compressed_tensors/transform/transform_args.py

@@ -45,6 +45,16 @@ class TransformLocation(str, Enum):
     K_CACHE = "k_cache"
     Q_ATTN = "q_attn"
 
+    def is_online(self) -> bool:
+        """
+        Returns True if the transform location is online
+        (applied at runtime), False otherwise
+        """
+        return self not in (
+            TransformLocation.WEIGHT_INPUT,
+            TransformLocation.WEIGHT_OUTPUT,
+        )
+
 
 class TransformArgs(BaseModel, use_enum_values=True):
     """
@@ -70,9 +80,6 @@ class TransformArgs(BaseModel, use_enum_values=True):
         return value
 
     def is_online(self) -> bool:
-        return self.location not in (
-            TransformLocation.WEIGHT_INPUT,
-            TransformLocation.WEIGHT_OUTPUT,
-        )
+        return TransformLocation(self.location).is_online()
 
     model_config = ConfigDict(extra="forbid")
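
A small illustrative sketch of the relocated helper (assuming the public import path compressed_tensors.transform is unchanged): is_online() can now be queried directly on the TransformLocation enum, and TransformArgs.is_online() simply delegates to it.

    from compressed_tensors.transform import TransformArgs, TransformLocation

    # weight locations are folded into the weight ahead of time ("offline");
    # everything else is applied at runtime ("online")
    assert not TransformLocation.WEIGHT_INPUT.is_online()
    assert TransformLocation.Q_ATTN.is_online()

    # TransformArgs.is_online() now just delegates to the enum
    args = TransformArgs(targets=["re:.*q_proj$"], location="input")
    assert args.is_online()
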