compressed-tensors 0.9.5a20250519__py3-none-any.whl → 0.9.5a20250521__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

compressed_tensors/compressors/quantized_compressors/base.py

@@ -99,6 +99,7 @@ class BaseQuantizationCompressor(BaseCompressor):
  scale = model_state.get(prefix + "weight_scale", None)
  g_idx = model_state.get(prefix + "weight_g_idx", None)
  zp = model_state.get(prefix + "weight_zero_point", None)
+ global_scale = model_state.get(prefix + "weight_global_scale", None)

  # if scale does not exist, then weight cannot be compressed
  if scale is None:

@@ -112,6 +113,7 @@ class BaseQuantizationCompressor(BaseCompressor):
  weight=value,
  scale=scale,
  zero_point=zp,
+ global_scale=global_scale,
  g_idx=g_idx,
  quantization_args=quant_args,
  device="cpu",

compressed_tensors/compressors/quantized_compressors/naive_quantized.py

@@ -78,6 +78,7 @@ class NaiveQuantizationCompressor(BaseQuantizationCompressor):
  zero_point: Optional[Tensor] = None,
  g_idx: Optional[torch.Tensor] = None,
  device: Optional[torch.device] = None,
+ global_scale: Optional[torch.Tensor] = None,
  ) -> Dict[str, torch.Tensor]:
  """
  Compresses a single uncompressed weight

@@ -90,6 +91,11 @@ class NaiveQuantizationCompressor(BaseQuantizationCompressor):
  :param device: optional device to move compressed output to
  :return: dictionary of compressed weight data
  """
+ if global_scale is not None:
+ raise ValueError(
+ "global_scale is not supported for the NaiveQuantizationCompressor"
+ )
+
  if can_quantize(weight, quantization_args):
  quantized_weight = quantize(
  x=weight,

compressed_tensors/compressors/quantized_compressors/pack_quantized.py

@@ -94,6 +94,7 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
  zero_point: Optional[Tensor] = None,
  g_idx: Optional[torch.Tensor] = None,
  device: Optional[torch.device] = None,
+ global_scale: Optional[torch.Tensor] = None,
  ) -> Dict[str, torch.Tensor]:
  """
  Compresses a single uncompressed weight

@@ -106,6 +107,11 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
  :param device: optional device to move compressed output to
  :return: dictionary of compressed weight data
  """
+ if global_scale is not None:
+ raise ValueError(
+ "global_scale is not supported for the PackedQuantizationCompressor"
+ )
+
  compressed_dict = {}
  if can_quantize(weight, quantization_args):
  quantized_weight = quantize(

compressed_tensors/quantization/lifecycle/apply.py

@@ -27,8 +27,14 @@ from compressed_tensors.quantization.lifecycle.compressed import (
  )
  from compressed_tensors.quantization.lifecycle.initialize import (
  initialize_module_for_quantization,
+ update_fused_layer_weight_global_scales,
+ )
+ from compressed_tensors.quantization.quant_args import (
+ FP4_E2M1_DATA,
+ FP8_E4M3_DATA,
+ QuantizationArgs,
+ QuantizationType,
  )
- from compressed_tensors.quantization.quant_args import QuantizationArgs
  from compressed_tensors.quantization.quant_config import (
  QuantizationConfig,
  QuantizationStatus,

@@ -266,6 +272,9 @@ def apply_quantization_status(model: Module, status: QuantizationStatus):
  )
  )

+ if status == QuantizationStatus.INITIALIZED:
+ update_fused_layer_weight_global_scales(model)
+
  if current_status < status >= QuantizationStatus.COMPRESSED > current_status:
  model.apply(compress_quantized_weights)

compressed_tensors/quantization/lifecycle/forward.py

@@ -20,6 +20,7 @@ import torch
  from compressed_tensors.quantization.quant_args import (
  QuantizationArgs,
  QuantizationStrategy,
+ QuantizationType,
  round_to_quantized_type,
  )
  from compressed_tensors.quantization.quant_config import QuantizationStatus

@@ -49,6 +50,7 @@ def quantize(
  args: QuantizationArgs,
  dtype: Optional[torch.dtype] = None,
  g_idx: Optional[torch.Tensor] = None,
+ global_scale: Optional[torch.Tensor] = None,
  ) -> torch.Tensor:
  """
  Quantize the input tensor x using the QuantizationStrategy specified in args.

@@ -63,6 +65,7 @@ def quantize(
  :param args: quantization args dictating how to quantize x
  :param dtype: optional dtype to cast the quantized output to
  :param g_idx: optional mapping from column index to group index
+ :param global_scale: optional constant to scale the quantization scale during QDQ
  :return: fake quantized tensor
  """

@@ -75,6 +78,7 @@ def quantize(
  do_quantize=True,
  do_dequantize=False,
  g_idx=g_idx,
+ global_scale=global_scale,
  )

@@ -86,6 +90,7 @@ def dequantize(
  args: Optional[QuantizationArgs] = None,
  dtype: Optional[torch.dtype] = None,
  g_idx: Optional[torch.Tensor] = None,
+ global_scale: Optional[torch.Tensor] = None,
  ) -> torch.Tensor:
  """
  Dequantize a quantized input tensor x_q based on the strategy specified in args. If

@@ -97,6 +102,7 @@ def dequantize(
  :param args: quantization args used to quantize x_q
  :param dtype: optional dtype to cast the dequantized output to
  :param g_idx: optional mapping from column index to group index
+ :param global_scale: optional constant to scale the quantization scale during QDQ
  :return: dequantized float tensor
  """
  if args is None:

@@ -128,6 +134,7 @@ def dequantize(
  do_dequantize=True,
  dtype=dtype,
  g_idx=g_idx,
+ global_scale=global_scale,
  )

@@ -138,6 +145,7 @@ def fake_quantize(
  zero_point: torch.Tensor,
  args: QuantizationArgs,
  g_idx: Optional[torch.Tensor] = None,
+ global_scale: Optional[torch.Tensor] = None,
  ) -> torch.Tensor:
  """
  Fake quantize the input tensor x by quantizing then dequantizing with

@@ -151,6 +159,7 @@ def fake_quantize(
  :param zero_point: zero point tensor
  :param args: quantization args dictating how to quantize x
  :param g_idx: optional mapping from column index to group index
+ :param global_scale: optional constant to scale the quantization scale during QDQ
  :return: fake quantized tensor
  """
  return _process_quantization(

@@ -161,6 +170,7 @@ def fake_quantize(
  do_quantize=True,
  do_dequantize=True,
  g_idx=g_idx,
+ global_scale=global_scale,
  )

@@ -174,6 +184,7 @@ def _process_quantization(
  dtype: Optional[torch.dtype] = None,
  do_quantize: bool = True,
  do_dequantize: bool = True,
+ global_scale: Optional[torch.Tensor] = None,
  ) -> torch.Tensor:
  q_min, q_max = calculate_range(args, x.device)
  group_size = args.group_size

@@ -221,18 +232,21 @@ def _process_quantization(
  end = start + group_count
  if do_quantize:
  output[:, start:end] = _quantize(
- x[:, start:end],
- sc,
- zp,
- q_min,
- q_max,
- args,
+ x=x[:, start:end],
+ scale=sc,
+ zero_point=zp,
+ q_min=q_min,
+ q_max=q_max,
+ args=args,
  dtype=dtype,
+ global_scale=global_scale,
  )

  if do_dequantize:
  input = output[:, start:end] if do_quantize else x[:, start:end]
- output[:, start:end] = _dequantize(input, sc, zp)
+ output[:, start:end] = _dequantize(
+ x_q=input, scale=sc, zero_point=zp, global_scale=global_scale
+ )

  if not is_column_order:
  output = safe_permute(output, torch.argsort(perm), dim=1)

@@ -240,16 +254,22 @@ def _process_quantization(
  else: # covers channel, token and tensor strategies
  if do_quantize:
  output = _quantize(
- x,
- scale,
- zero_point,
- q_min,
- q_max,
- args,
+ x=x,
+ scale=scale,
+ zero_point=zero_point,
+ q_min=q_min,
+ q_max=q_max,
+ args=args,
  dtype=dtype,
+ global_scale=global_scale,
  )
  if do_dequantize:
- output = _dequantize(output if do_quantize else x, scale, zero_point)
+ output = _dequantize(
+ output if do_quantize else x,
+ scale=scale,
+ zero_point=zero_point,
+ global_scale=global_scale,
+ )

  return output

@@ -330,6 +350,7 @@ def forward_quantize(
  return value

  g_idx = getattr(module, "weight_g_idx", None)
+ global_scale = getattr(module, f"{base_name}_global_scale", None)

  if args.dynamic:
  # dynamic quantization - determine the scale/zp on the fly

@@ -345,6 +366,7 @@ def forward_quantize(
  zero_point=zero_point,
  args=args,
  g_idx=g_idx,
+ global_scale=global_scale,
  )

@@ -357,11 +379,18 @@ def _quantize(
  q_max: torch.Tensor,
  args: QuantizationArgs,
  dtype: Optional[torch.dtype] = None,
+ global_scale: Optional[torch.Tensor] = None,
  ) -> torch.Tensor:

+ # if a global scale is optionally provided, use it
+ # to further scale the local `scale` parameter
+ if global_scale:
+ scale = scale.to(global_scale.dtype) / global_scale
+
  scaled = x / scale
  if zero_point is not None:
  scaled += zero_point.to(x.dtype)
+
  # clamp first because cast isn't guaranteed to be saturated (ie for fp8)
  clamped_value = torch.clamp(
  scaled,

@@ -381,7 +410,14 @@ def _dequantize(
  scale: torch.Tensor,
  zero_point: torch.Tensor = None,
  dtype: Optional[torch.dtype] = None,
+ global_scale: Optional[torch.Tensor] = None,
  ) -> torch.Tensor:
+
+ # if a global scale is optionally provided, use it
+ # to further scale the local `scale` parameter
+ if global_scale:
+ scale = scale.to(global_scale.dtype) / global_scale
+
  dequant_value = x_q.to(scale.dtype)

  if zero_point is not None:
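
Outside the library, the effect of the new argument is easy to reproduce: the stored local scale was pre-multiplied by the global scale when it was calculated, so dividing it back out at QDQ time recovers the true per-group scale. A self-contained sketch in plain torch (E2M1 max of 6.0 assumed, and simple `torch.round` used where the library rounds to the target quantized dtype):

    import torch

    FP4_MAX = 6.0  # assumed E2M1 max, matching FP4_E2M1_DATA

    def qdq_with_global_scale(x, scale, global_scale, q_min, q_max):
        # Mirror of the hunks above: the stored scale is divided by the global
        # scale before quantize/dequantize (zero point omitted; FP4 is symmetric).
        scale = scale.to(global_scale.dtype) / global_scale
        q = torch.clamp(torch.round(x / scale), q_min, q_max)  # simplified rounding
        return q * scale

    x = torch.randn(4, 16)
    global_scale = torch.tensor(100.0)  # illustrative value
    # Stored scale as calculate_qparams would produce it: pre-multiplied by global_scale.
    local_scale = global_scale * (x.abs().amax(dim=1, keepdim=True) / FP4_MAX)

    out = qdq_with_global_scale(x, local_scale, global_scale, q_min=-FP4_MAX, q_max=FP4_MAX)
    print(torch.mean((x - out) ** 2))  # small reconstruction error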

compressed_tensors/quantization/lifecycle/initialize.py

@@ -16,24 +16,33 @@
  import logging
  import math
  from enum import Enum
- from typing import Optional
+ from typing import List, Optional

  import torch
  from compressed_tensors.quantization.lifecycle.forward import (
  wrap_module_forward_quantized,
  )
  from compressed_tensors.quantization.quant_args import (
+ FP4_E2M1_DATA,
+ FP8_E4M3_DATA,
  ActivationOrdering,
  QuantizationArgs,
  QuantizationStrategy,
+ QuantizationType,
  )
  from compressed_tensors.quantization.quant_config import QuantizationStatus
  from compressed_tensors.quantization.quant_scheme import QuantizationScheme
- from compressed_tensors.quantization.utils import is_kv_cache_quant_scheme
+ from compressed_tensors.quantization.utils import (
+ generate_global_scale,
+ is_fp4,
+ is_kv_cache_quant_scheme,
+ iter_named_quantizable_modules,
+ )
  from compressed_tensors.utils import (
  disable_hf_hook,
  get_execution_device,
  register_offload_parameter,
+ update_parameter_data,
  )
  from torch.nn import Module, Parameter

@@ -42,6 +51,7 @@ __all__ = [
  "initialize_module_for_quantization",
  "is_attention_module",
  "KVCacheScaleType",
+ "update_fused_layer_weight_global_scales",
  ]

@@ -170,7 +180,24 @@ def _initialize_scale_zero_point(
  # TODO: consider erroring out in the future if the dtype is not one of these,
  # as there is likely a bug

- if scale_dtype not in [torch.float16, torch.bfloat16, torch.float32]:
+ if is_fp4(quantization_args=quantization_args) and base_name == "weight":
+ scale_dtype = FP8_E4M3_DATA.dtype
+ # When applying weight-only FP4 quantization, generate a global_scale
+ # This scale is applied during runtime to ensure that the generated
+ # local scale falls properly within the FP8 range (i.e max value is FP8_max)
+ # which is the expected dtype of NVFP4A16 scales
+ value = generate_global_scale(input_tensor=module.weight)
+ value = value.to(device)
+ init_global_scale = Parameter(value, requires_grad=False)
+ register_offload_parameter(
+ module, f"{base_name}_global_scale", init_global_scale
+ )
+
+ if scale_dtype not in [
+ torch.float16,
+ torch.bfloat16,
+ torch.float32,
+ ] and not is_fp4(quantization_args=quantization_args):
  scale_dtype = torch.float16

  # initializes empty scale, zero point, and g_idx parameters for the module
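
For an FP4-quantized weight, the new branch switches the local scale dtype to FP8 (E4M3) and registers one tensor-wide global scale computed from the weight's amax. A rough standalone sketch of that initialization step (constants written out literally; `register_offload_parameter` replaced by a plain `Parameter` for illustration):

    import torch

    FP8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0
    FP4_E2M1_MAX = 6.0                                   # no torch FP4 dtype; value assumed

    weight = torch.randn(256, 512)

    # Weight-only FP4: local scales will be stored as FP8, and a single global
    # scale maps the tensor-wide amax onto that representable FP8 range.
    scale_dtype = torch.float8_e4m3fn
    global_scale = (FP8_E4M3_MAX * FP4_E2M1_MAX / weight.abs().max()).to(torch.float32)

    weight_global_scale = torch.nn.Parameter(global_scale, requires_grad=False)
    print(scale_dtype, weight_global_scale.item())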

@@ -181,7 +208,11 @@ def _initialize_scale_zero_point(
  register_offload_parameter(module, f"{base_name}_scale", init_scale)

  if force_zero_point or not quantization_args.symmetric:
- zp_dtype = quantization_args.pytorch_dtype()
+ if is_fp4(quantization_args=quantization_args):
+ zp_dtype = FP8_E4M3_DATA.dtype
+ else:
+ zp_dtype = quantization_args.pytorch_dtype()
+
  init_zero_point = Parameter(
  torch.zeros(expected_shape, device=device, dtype=zp_dtype),
  requires_grad=False,

@@ -219,3 +250,91 @@ def _initialize_attn_scales(module: Module) -> None:
  requires_grad=False,
  )
  register_offload_parameter(module, KVCacheScaleType.VALUE.value, init_scale)
+
+
+ # TODO: Potentially introduce an argument to turn this off
+ # Only relevant for NVFP4A16 currently
+ def update_fused_layer_weight_global_scales(model: torch.nn.Module):
+ """
+ When running NVFP4A16 quantization, update the global scale
+ such that q,k,v layers are treated as one tensor with the same
+ global_scale and gate_proj/up_proj layers are treated as one tensor
+ with the same global scale. This is a requirement currently set
+ by vLLM and may be removed, or potentially made optional,
+ in the future.
+
+ :param model: model to quantize
+ """
+
+ def _is_attention_module(module: Module):
+ return "attention" in module.__class__.__name__.lower() and (
+ hasattr(module, "k_proj")
+ or hasattr(module, "v_proj")
+ or hasattr(module, "qkv_proj")
+ )
+
+ def _is_mlp_module(module: Module):
+ return "mlp" in module.__class__.__name__.lower() and (
+ hasattr(module, "gate_proj") or hasattr(module, "up_proj")
+ )
+
+ def _valid_fp4_quant(layer_list: List[torch.nn.Linear]):
+ """
+ Return True if all the linear layers in the layer_list are
+ NVFP4A16 quantized.
+ """
+ for layer in layer_list:
+ scheme = getattr(layer, "quantization_scheme", None)
+ if scheme is None:
+ return False
+
+ weight_quant_args = scheme.weights
+
+ if weight_quant_args is None:
+ return False
+
+ if not is_fp4(quantization_args=weight_quant_args):
+ return False
+ return True
+
+ for name, submodule in iter_named_quantizable_modules(
+ model,
+ include_attn=True,
+ include_mlp=True,
+ ):
+
+ if _is_attention_module(submodule):
+ # already fused/treated as one layer
+ if hasattr(submodule, "qkv_proj"):
+ continue
+
+ if not _valid_fp4_quant(
+ [submodule.q_proj, submodule.v_proj, submodule.k_proj]
+ ):
+ continue
+
+ q_weight = submodule.q_proj.weight.data
+ v_weight = submodule.v_proj.weight.data
+ k_weight = submodule.k_proj.weight.data
+
+ value = generate_global_scale(
+ input_tensor=torch.cat((q_weight, v_weight, k_weight), dim=0)
+ )
+
+ update_parameter_data(submodule.q_proj, value, "weight_global_scale")
+ update_parameter_data(submodule.k_proj, value, "weight_global_scale")
+ update_parameter_data(submodule.v_proj, value, "weight_global_scale")
+
+ if _is_mlp_module(submodule):
+ if not _valid_fp4_quant([submodule.gate_proj, submodule.up_proj]):
+ continue
+
+ gate_data = submodule.gate_proj.weight.data
+ up_data = submodule.up_proj.weight.data
+
+ value = generate_global_scale(
+ input_tensor=torch.cat((gate_data, up_data), dim=0)
+ )
+
+ update_parameter_data(submodule.gate_proj, value, "weight_global_scale")
+ update_parameter_data(submodule.up_proj, value, "weight_global_scale")
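
Stripped of the module iteration, the fusing step is just one amax over the concatenated member weights, turned into a single shared global scale. A standalone sketch with made-up shapes, reusing the 448 * 6 / amax formula implied by `generate_global_scale`:

    import torch

    def shared_global_scale(*weights, scale_max=448.0, quant_max=6.0):
        # One tensor-wide amax across all fused members -> one shared global scale.
        amax = torch.cat(weights, dim=0).abs().max().to(torch.float32)
        return (scale_max * quant_max / amax).to(torch.float32)

    q_w = torch.randn(128, 64)
    k_w = torch.randn(32, 64)   # GQA-style smaller k/v projections, purely illustrative
    v_w = torch.randn(32, 64)

    gs = shared_global_scale(q_w, k_w, v_w)
    # In the library this value is written back to q_proj/k_proj/v_proj via
    # update_parameter_data(..., "weight_global_scale"); here it is just printed.
    print(gs)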

compressed_tensors/quantization/quant_args.py

@@ -26,6 +26,7 @@ __all__ = [
  "FP8_DTYPE",
  "FP8_E4M3_DATA",
  "FP4_E2M1_DATA",
+ "FloatArgs",
  "QuantizationType",
  "QuantizationStrategy",
  "QuantizationArgs",

@@ -268,8 +269,8 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
  observer = None

  elif observer is None:
- # default to minmax for non-dynamic cases
- observer = "minmax"
+ # default to mse for non-dynamic cases
+ observer = "mse"

  # write back modified values
  model.strategy = strategy
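
The observer default only matters for static (non-dynamic) schemes, where a previously unset observer now resolves to "mse" instead of "minmax". Assuming the public constructor and import path (both outside this diff), the change is visible as:

    from compressed_tensors.quantization import QuantizationArgs  # import path assumed

    args = QuantizationArgs(num_bits=4, type="float", strategy="group", group_size=16)
    # The validator fills in the observer for static schemes:
    print(args.observer)  # "minmax" before this release, "mse" from this release on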

compressed_tensors/quantization/utils/helpers.py

@@ -17,7 +17,9 @@ from typing import Generator, List, Optional, Tuple

  import torch
  from compressed_tensors.quantization.quant_args import (
- FP8_DTYPE,
+ FP4_E2M1_DATA,
+ FP8_E4M3_DATA,
+ FloatArgs,
  QuantizationArgs,
  QuantizationStrategy,
  QuantizationType,

@@ -44,6 +46,8 @@ __all__ = [
  "compute_dynamic_scales_and_zp",
  "calculate_range",
  "calculate_qparams",
+ "generate_global_scale",
+ "is_fp4",
  ]

  # target the self_attn layer

@@ -53,8 +57,18 @@ KV_CACHE_TARGETS = ["re:.*self_attn$"]
  _LOGGER: logging.Logger = logging.getLogger(__name__)


+ def is_fp4(quantization_args: QuantizationArgs):
+ return (
+ quantization_args.num_bits == 4
+ and quantization_args.type == QuantizationType.FLOAT
+ )
+
+
  def calculate_qparams(
- min_vals: Tensor, max_vals: Tensor, quantization_args: QuantizationArgs
+ min_vals: Tensor,
+ max_vals: Tensor,
+ quantization_args: QuantizationArgs,
+ global_scale: Optional[Tensor] = None,
  ) -> Tuple[FloatTensor, IntTensor]:
  """
  :param min_vals: tensor of min value(s) to calculate scale(s) and zero point(s)

@@ -62,7 +76,11 @@ def calculate_qparams(
  :param max_vals: tensor of max value(s) to calculate scale(s) and zero point(s)
  from
  :param quantization_args: settings for quantization
- :return: tuple of the calculated scale(s) and zero point(s)
+ :param global_scale: additional global scale to scale the locally generated scale;
+ currently only applied/supported for FP4
+
+ :return: tuple of the calculated scale(s) and zero point(s). For FP4, the calculated
+ scale is of dtype FP8
  """
  # based on the implementations for consuming quantized values,
  # 0.0 must always be representable within the quantized range

@@ -73,14 +91,40 @@ def calculate_qparams(

  bit_min, bit_max = calculate_range(quantization_args, device)
  bit_range = bit_max - bit_min
- zp_dtype = quantization_args.pytorch_dtype()
+
+ if is_fp4(quantization_args=quantization_args):
+ zp_dtype = FP8_E4M3_DATA.dtype
+ else:
+ zp_dtype = quantization_args.pytorch_dtype()

  if quantization_args.symmetric:
  max_val_pos = torch.max(torch.abs(min_vals), torch.abs(max_vals))
- scales = max_val_pos / (float(bit_range) / 2)
- scales = torch.clamp(scales, min=torch.finfo(torch.float32).eps)
+
+ if is_fp4(quantization_args=quantization_args) and global_scale is not None:
+ # Conditionally scale the generated local scale by a global_scale
+ scales = global_scale * (max_val_pos / FP4_E2M1_DATA.max)
+ scales = scales.to(FP8_E4M3_DATA.dtype)
+ else:
+ scales = max_val_pos / (float(bit_range) / 2)
+
+ if scales.dtype == FP8_E4M3_DATA.dtype:
+ # torch.clamp not supported for FP8
+ # use the next largest fp8 value from 0
+ scales = torch.where(
+ scales == 0,
+ torch.tensor(0.125, dtype=FP8_E4M3_DATA.dtype, device=device),
+ scales,
+ )
+ else:
+ scales = torch.clamp(scales, min=torch.finfo(torch.float32).eps)
+
  zero_points = torch.zeros(scales.shape, device=device, dtype=min_vals.dtype)
  else:
+ if is_fp4(quantization_args=quantization_args):
+ raise NotImplementedError(
+ "Asymmetric Quantization is not supported for FP4"
+ )
+
  scales = (max_vals - min_vals) / float(bit_range)
  scales = torch.clamp(scales, min=torch.finfo(torch.float32).eps)
  zero_points = bit_min - (min_vals / scales)
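
Numerically, the FP4 branch produces FP8-typed local scales and cannot rely on `torch.clamp`, hence the `torch.where` fallback. A small standalone sketch of that computation (E2M1 max of 6.0 and the 0.125 zero-replacement value taken from the hunk; the inputs are chosen to land on exactly representable E4M3 values):

    import torch

    FP4_MAX = 6.0
    group_amax = torch.tensor([3.0, 0.0, 1.5])      # per-group |max| values, illustrative
    global_scale = torch.tensor(448.0)

    scales = global_scale * (group_amax / FP4_MAX)  # -> [224., 0., 112.]
    scales = scales.to(torch.float8_e4m3fn)

    # torch.clamp is not implemented for FP8, so exact zeros are replaced with the
    # smallest positive value used by the hunk (0.125) via torch.where instead.
    scales = torch.where(
        scales == 0,
        torch.tensor(0.125, dtype=torch.float8_e4m3fn),
        scales,
    )
    print(scales.to(torch.float32))                 # [224., 0.125, 112.]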

@@ -144,14 +188,16 @@ def calculate_range(quantization_args: QuantizationArgs, device: str) -> Tuple:
  q_max = torch.tensor(bit_range / 2 - 1, device=device)
  q_min = torch.tensor(-bit_range / 2, device=device)
  elif quantization_args.type == QuantizationType.FLOAT:
- if quantization_args.num_bits != 8:
- raise ValueError(
- "Floating point quantization is only supported for 8 bits,"
- f"got {quantization_args.num_bits}"
+ if quantization_args.num_bits == 8:
+ q_max = torch.tensor(FP8_E4M3_DATA.max, device=device)
+ q_min = torch.tensor(FP8_E4M3_DATA.min, device=device)
+ elif quantization_args.num_bits == 4:
+ q_max = torch.tensor(FP4_E2M1_DATA.max, device=device)
+ q_min = torch.tensor(FP4_E2M1_DATA.min, device=device)
+ else:
+ raise NotImplementedError(
+ "Range calculation only supported for 4 and 8 bits"
  )
- fp_range_info = torch.finfo(FP8_DTYPE)
- q_max = torch.tensor(fp_range_info.max, device=device)
- q_min = torch.tensor(fp_range_info.min, device=device)
  else:
  raise ValueError(f"Invalid quantization type {quantization_args.type}")

@@ -249,7 +295,10 @@ def iter_named_leaf_modules(model: Module) -> Generator[Tuple[str, Module], None


  def iter_named_quantizable_modules(
- model: Module, include_children: bool = True, include_attn: bool = False
+ model: Module,
+ include_children: bool = True,
+ include_attn: bool = False,
+ include_mlp: bool = False,
  ) -> Generator[Tuple[str, Module], None, None]:
  """
  Yield name and submodule of

@@ -282,6 +331,9 @@ def iter_named_quantizable_modules(
  if include_attn:
  if name.endswith("self_attn"):
  yield name, submodule
+ if include_mlp:
+ if name.endswith("mlp"):
+ yield name, submodule


  def get_torch_bit_depth(value: torch.Tensor) -> int:

@@ -396,3 +448,24 @@ def parse_out_kv_cache_args(
  kv_cache_args = None

  return kv_cache_args, quant_scheme_to_layers
+
+
+ def generate_global_scale(
+ input_tensor: torch.Tensor,
+ scale_data: Optional[FloatArgs] = FP8_E4M3_DATA,
+ quant_data: Optional[FloatArgs] = FP4_E2M1_DATA,
+ dtype: Optional[torch.dtype] = torch.float32,
+ ):
+ """
+ Generate a global scale for an entire tensor (input_tensor).
+ Goal of the scale is to ensure that the quantization (local) scale
+ falls into the appropriate dtype range.
+
+ E.g. for NVFP4, group (local) scales are in dtype FP8. The global_scale
+ attempts to use the entire FP8 dtype range while mapping a per-group max
+ to the FP4 max.
+ """
+ scale_dtype = scale_data.dtype
+ tensor_amax = torch.abs(input_tensor.data).max().to(dtype)
+ global_scale = scale_data.max * quant_data.max / tensor_amax
+ return global_scale.to(dtype)
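
Worked through with concrete numbers (FP8 E4M3 max 448, FP4 E2M1 max 6, matching the defaults above), the global scale is simply the factor that maps the tensor-wide amax onto the largest representable local scale:

    import torch

    def global_scale_for(weight: torch.Tensor) -> torch.Tensor:
        # Same formula as generate_global_scale, with the default constants written out.
        tensor_amax = weight.abs().max().to(torch.float32)
        return (448.0 * 6.0 / tensor_amax).to(torch.float32)

    w = torch.full((8, 8), 0.5)
    w[0, 0] = 10.0                      # tensor-wide amax
    print(global_scale_for(w))          # 448 * 6 / 10 = 268.8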

compressed_tensors/registry/registry.py

@@ -19,7 +19,7 @@ of neuralmagic utilities

  import importlib
  from collections import defaultdict
- from typing import Any, Dict, List, Optional, Type, Union
+ from typing import Any, Dict, List, Optional, TypeVar, Union


  __all__ = [

@@ -32,8 +32,9 @@ __all__ = [
  ]


- _ALIAS_REGISTRY: Dict[Type, Dict[str, str]] = defaultdict(dict)
- _REGISTRY: Dict[Type, Dict[str, Any]] = defaultdict(dict)
+ _ALIAS_REGISTRY: Dict[type, Dict[str, str]] = defaultdict(dict)
+ _REGISTRY: Dict[type, Dict[str, Any]] = defaultdict(dict)
+ T = TypeVar("T", bound="RegistryMixin")


  def standardize_lookup_name(name: str) -> str:
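
Replacing the bare `object` return type with a `TypeVar` bound to `RegistryMixin` mainly benefits static checkers: subclasses get their own type back from the registry helpers. A toy illustration of the pattern (not the library's code):

    from typing import TypeVar

    T = TypeVar("T", bound="Base")

    class Base:
        @classmethod
        def load(cls: type[T]) -> T:
            # Returning T instead of object lets mypy/pyright infer that
            # Child.load() produces a Child rather than a plain object.
            return cls()

    class Child(Base):
        def run(self) -> str:
            return "ok"

    child = Child.load()     # inferred type: Child
    print(child.run())       # attribute access type-checks without a cast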

@@ -159,7 +160,7 @@ class RegistryMixin:
  )

  @classmethod
- def load_from_registry(cls, name: str, **constructor_kwargs) -> object:
+ def load_from_registry(cls: type[T], name: str, **constructor_kwargs) -> T:
  """
  :param name: name of registered class to load
  :param constructor_kwargs: arguments to pass to the constructor retrieved

@@ -172,7 +173,7 @@ class RegistryMixin:
  return constructor(**constructor_kwargs)

  @classmethod
- def get_value_from_registry(cls, name: str):
+ def get_value_from_registry(cls: type[T], name: str) -> T:
  """
  :param name: name to retrieve from the registry
  :return: value retrieved from the registry for the given name, raises

@@ -200,7 +201,7 @@ class RegistryMixin:


  def register(
- parent_class: Type,
+ parent_class: type,
  value: Any,
  name: Optional[str] = None,
  alias: Union[List[str], str, None] = None,

@@ -240,7 +241,7 @@ def register(


  def get_from_registry(
- parent_class: Type, name: str, require_subclass: bool = False
+ parent_class: type, name: str, require_subclass: bool = False
  ) -> Any:
  """
  :param parent_class: class that the name is registered under

@@ -276,7 +277,7 @@ def get_from_registry(
  return retrieved_value


- def registered_names(parent_class: Type) -> List[str]:
+ def registered_names(parent_class: type) -> List[str]:
  """
  :param parent_class: class to look up the registry of
  :return: all names registered to the given class

@@ -284,7 +285,7 @@ def registered_names(parent_class: Type) -> List[str]:
  return list(_REGISTRY[parent_class].keys())


- def registered_aliases(parent_class: Type) -> List[str]:
+ def registered_aliases(parent_class: type) -> List[str]:
  """
  :param parent_class: class to look up the registry of
  :return: all aliases registered to the given class

@@ -297,7 +298,7 @@ def registered_aliases(parent_class: Type) -> List[str]:


  def register_alias(
- name: str, parent_class: Type, alias: Union[str, List[str], None] = None
+ name: str, parent_class: type, alias: Union[str, List[str], None] = None
  ):
  """
  Updates the mapping from the alias(es) to the given name.

@@ -352,7 +353,7 @@ def _import_and_get_value_from_module(module_path: str, value_name: str) -> Any:
  return value


- def _validate_subclass(parent_class: Type, child_class: Type):
+ def _validate_subclass(parent_class: type, child_class: type):
  if not issubclass(child_class, parent_class):
  raise ValueError(
  f"class {child_class} is not a subclass of the class it is "

compressed_tensors/version.py

@@ -17,5 +17,5 @@ __version__: str
  __version_tuple__: VERSION_TUPLE
  version_tuple: VERSION_TUPLE

- __version__ = version = '0.9.5.a20250519'
+ __version__ = version = '0.9.5.a20250521'
  __version_tuple__ = version_tuple = (0, 9, 5)

METADATA (dist-info)

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: compressed-tensors
- Version: 0.9.5a20250519
+ Version: 0.9.5a20250521
  Summary: Library for utilization of compressed safetensors of neural network models
  Home-page: https://github.com/neuralmagic/compressed-tensors
  Author: Neuralmagic, Inc.

RECORD (dist-info)

@@ -1,16 +1,16 @@
  compressed_tensors/__init__.py,sha256=UtKmifNeBCSE2TZSAfduVNNzHY-3V7bLjZ7n7RuXLOE,812
  compressed_tensors/base.py,sha256=73HYH7HY7O2roC89yG_piPFnZwrBfn_i7HmKl90SKc0,875
- compressed_tensors/version.py,sha256=nXAnufttJXt-FtZQ-qInj1Xx7rNF_ERhtqkUZcqWiEc,521
+ compressed_tensors/version.py,sha256=FJ5OPohL511E88TFF_Jipl_3ikvZ6NgmdrYxPbi2vo8,521
  compressed_tensors/compressors/__init__.py,sha256=smSygTSfcfuujRrAXDc6uZm4L_ccV1tWZewqVnOb4lM,825
  compressed_tensors/compressors/base.py,sha256=nvWsv4xEw1Tkxkxth6TmHplDYXfBeP22xWxOsZERyDY,7204
  compressed_tensors/compressors/helpers.py,sha256=OK6qxX9j3bHwF9JfIYSGMgBJe2PWjlTA3byXKCJaTIQ,5431
  compressed_tensors/compressors/model_compressors/__init__.py,sha256=5RGGPFu4YqEt_aOdFSQYFYFDjcZFJN0CsMqRtDZz3Js,666
  compressed_tensors/compressors/model_compressors/model_compressor.py,sha256=BBJd3Ei6FtqVQLBkOm80G6pSJ11IMTGuTA-FL4n6_5g,32704
  compressed_tensors/compressors/quantized_compressors/__init__.py,sha256=KvaFBL_Q84LxRGJOV035M8OBoCkAx8kOkfphswgkKWk,745
- compressed_tensors/compressors/quantized_compressors/base.py,sha256=4YWT95GIhHETI7glsk_ITrnUzzN1MhEypt-0z9eKqOI,9134
- compressed_tensors/compressors/quantized_compressors/naive_quantized.py,sha256=fd0KlkSx6bvZ3xwIkK3jEUdPSUPs56Eua4dEDOtzKW0,5150
+ compressed_tensors/compressors/quantized_compressors/base.py,sha256=n_sVSzySHUBgXt-nkLggM1DtB0aEgQmiKhTzcnQU9Dc,9266
+ compressed_tensors/compressors/quantized_compressors/naive_quantized.py,sha256=0ANDcuD8aXPqTYNPY6GnX9iS6eXJw6P0TzNV_rYS2l8,5369
  compressed_tensors/compressors/quantized_compressors/nvfp4_quantized.py,sha256=Gw-lVzk5jrKUlM5UTCiJBmhM5gHzB9mn8r298MVUbDI,6395
- compressed_tensors/compressors/quantized_compressors/pack_quantized.py,sha256=SPIHlk8ewip2LcjgkCw02K21EkfUSFSd9qQqL0Pt5eM,11162
+ compressed_tensors/compressors/quantized_compressors/pack_quantized.py,sha256=_66tQ8bxslDUdas-ULORXblPw9kdNNn1UJJU9-ZOGPY,11380
  compressed_tensors/compressors/sparse_compressors/__init__.py,sha256=Atuz-OdEgn8OCUhx7Ovd6gXdyImAI186uCR-uR0t_Nk,737
  compressed_tensors/compressors/sparse_compressors/base.py,sha256=YNZWcHjDleAlqbgRZQ6oJf44MQb_UDNvJGOqhl26uFA,8098
  compressed_tensors/compressors/sparse_compressors/dense.py,sha256=rPaxbP7P52prWNs4lGaiBbpNvsQLElFMwOrq1oBP2Yg,1733

@@ -26,19 +26,19 @@ compressed_tensors/config/sparse_bitmask.py,sha256=pZUboRNZTu6NajGOQEFExoPknak5y
  compressed_tensors/linear/__init__.py,sha256=fH6rjBYAxuwrTzBTlTjTgCYNyh6TCvCqajCz4Im4YrA,617
  compressed_tensors/linear/compressed_linear.py,sha256=1yo9RyjA0aQ--iuIknFfcSorJn43Mn4CoV-q4JlTJ_o,4052
  compressed_tensors/quantization/__init__.py,sha256=83J5bPB7PavN2TfCoW7_vEDhfYpm4TDrqYO9vdSQ5bk,760
- compressed_tensors/quantization/quant_args.py,sha256=CepGBAURFGxBzTyFXxHwsUs6wYEJ46_jPbEvJYMG0Tw,10491
+ compressed_tensors/quantization/quant_args.py,sha256=5-mq43RmbI81z9Xl9pYNv4bqIP5AIT65FgT--4ERsE8,10502
  compressed_tensors/quantization/quant_config.py,sha256=MxSUcb5dOqMN6LFyD5K2h8X0TvEtcWIAoiUJqD2dHGE,10159
  compressed_tensors/quantization/quant_scheme.py,sha256=Fx7Ma4bDlFB6OWkHKhOB6_0AOVIOPRgNE_qTwmDLSbc,6586
  compressed_tensors/quantization/lifecycle/__init__.py,sha256=_uItzFWusyV74Zco_pHLOTdE9a83cL-R-ZdyQrBkIyw,772
- compressed_tensors/quantization/lifecycle/apply.py,sha256=DOoxH4jM8r0270GGGUFOpRrgwaisiJi7TV-Q6E8qM8E,18067
+ compressed_tensors/quantization/lifecycle/apply.py,sha256=-OKZ-FFFfIIoeGTrho8lXx6HVWZQp3Xkn3Q-G0hU-CM,18294
  compressed_tensors/quantization/lifecycle/compressed.py,sha256=Fj9n66IN0EWsOAkBHg3O0GlOQpxstqjCcs0ttzMXrJ0,2296
- compressed_tensors/quantization/lifecycle/forward.py,sha256=DOWouUqfaLA4Qhg-ojVVBdhhSAlgZqFC26vZARxE0ko,12961
+ compressed_tensors/quantization/lifecycle/forward.py,sha256=WY-HY5kXY2Zs9HMpaq44bpolQUAQ1ELrNZC7GM5C4jw,14494
  compressed_tensors/quantization/lifecycle/helpers.py,sha256=C0mhy2vJ0fCjVeN4kFNhw8Eq1wkteBGHiZ36RVLThRY,944
- compressed_tensors/quantization/lifecycle/initialize.py,sha256=PaOs3WqlWZFBq9Zc2W_WImdyzSCdZIkqCP5r2jnmokw,7789
+ compressed_tensors/quantization/lifecycle/initialize.py,sha256=dWXxjYLemjmtrSnb8vyuvNoNTSm8ywmUswze3soKY4o,12041
  compressed_tensors/quantization/utils/__init__.py,sha256=VdtEmP0bvuND_IGQnyqUPc5lnFp-1_yD7StKSX4x80w,656
- compressed_tensors/quantization/utils/helpers.py,sha256=-wX0H7zVysJ67jRRCGbx6BfxbMU_1sqffTf5YUIpPiU,14391
+ compressed_tensors/quantization/utils/helpers.py,sha256=w3Ucpdog88b0MnZdJ37VzgtYi1fqrwJafYdfWPc0hTk,16852
  compressed_tensors/registry/__init__.py,sha256=FwLSNYqfIrb5JD_6OK_MT4_svvKTN_nEhpgQlQvGbjI,658
- compressed_tensors/registry/registry.py,sha256=vRcjVB1ITfSbfYUaGndBBmqhip_5vsS62weorVg0iXo,11896
+ compressed_tensors/registry/registry.py,sha256=0s15BxdGgzBv8RL4kUJCYcuDOFUh_KZYvNvLEeRqWTc,11956
  compressed_tensors/utils/__init__.py,sha256=gS4gSU2pwcAbsKj-6YMaqhm25udFy6ISYaWBf-myRSM,808
  compressed_tensors/utils/helpers.py,sha256=RrNvzD08naEjEiXdU-FdZjQVda1nQywu1hA_GCDj0vg,10415
  compressed_tensors/utils/offload.py,sha256=JNQ66_6vhSsizhlUaMgyEdBuFolYxbgUuT1mAZrCfKY,15436

@@ -46,8 +46,8 @@ compressed_tensors/utils/permutations_24.py,sha256=kx6fsfDHebx94zsSzhXGyCyuC9sVy
  compressed_tensors/utils/permute.py,sha256=V6tJLKo3Syccj-viv4F7ZKZgJeCB-hl-dK8RKI_kBwI,2355
  compressed_tensors/utils/safetensors_load.py,sha256=DMfZBuUbA6qp_BG_zIWT3ckiEE33K9ob34s-OgzReO4,12057
  compressed_tensors/utils/semi_structured_conversions.py,sha256=XKNffPum54kPASgqKzgKvyeqWPAkair2XEQXjkp7ho8,13489
- compressed_tensors-0.9.5a20250519.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- compressed_tensors-0.9.5a20250519.dist-info/METADATA,sha256=9A6h2qW5-4_2UfY2lCyQSWJuu0RMUsGzvI8YteN27Dg,7004
- compressed_tensors-0.9.5a20250519.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
- compressed_tensors-0.9.5a20250519.dist-info/top_level.txt,sha256=w2i-GyPs2s1UwVxvutSvN_lM22SXC2hQFBmoMcPnV7Y,19
- compressed_tensors-0.9.5a20250519.dist-info/RECORD,,
+ compressed_tensors-0.9.5a20250521.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ compressed_tensors-0.9.5a20250521.dist-info/METADATA,sha256=Xl6EbYwMlKhFyy6VXtxD2x0TsiTDG36YszGdub5wLqM,7004
+ compressed_tensors-0.9.5a20250521.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
+ compressed_tensors-0.9.5a20250521.dist-info/top_level.txt,sha256=w2i-GyPs2s1UwVxvutSvN_lM22SXC2hQFBmoMcPnV7Y,19
+ compressed_tensors-0.9.5a20250521.dist-info/RECORD,,

WHEEL (dist-info)

@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (80.7.1)
+ Generator: setuptools (80.8.0)
  Root-Is-Purelib: true
  Tag: py3-none-any