compressed-tensors 0.9.5a20250514__tar.gz → 0.9.5a20250520__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {compressed_tensors-0.9.5a20250514/src/compressed_tensors.egg-info → compressed_tensors-0.9.5a20250520}/PKG-INFO +1 -1
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/model_compressors/model_compressor.py +7 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/quantized_compressors/base.py +2 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/quantized_compressors/naive_quantized.py +6 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py +6 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/quantization/lifecycle/apply.py +10 -1
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/quantization/lifecycle/forward.py +50 -14
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/quantization/lifecycle/initialize.py +120 -4
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/quantization/quant_args.py +1 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/quantization/utils/helpers.py +87 -14
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/version.py +1 -1
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520/src/compressed_tensors.egg-info}/PKG-INFO +1 -1
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/lifecycle/test_initialize.py +15 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/test_utils/test_helpers.py +21 -2
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/.github/.gitkeep +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/.github/actions/test/action.yml +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/.github/scripts/step-status +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/.github/workflows/build-test.yml +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/.github/workflows/build.yml +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/.github/workflows/report.yml +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/.github/workflows/test-check.yaml +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/.github/workflows/test.yml +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/.github/workflows/trigger-all.yml +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/.github/workflows/upload.yml +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/.gitignore +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/LICENSE +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/Makefile +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/README.md +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/examples/bit_packing/ex_quantize_and_pack.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/examples/bit_packing/int4_config.json +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/examples/bitmask_compression.ipynb +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/examples/llama_1.1b/ex_config_quantization.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/examples/llama_1.1b/ex_llmcompressor_quantization.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/examples/llama_1.1b/example_quant_config.json +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/examples/llama_1.1b/example_quant_recipe.yaml +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/examples/quantize_and_pack_int4.ipynb +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/pyproject.toml +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/setup.cfg +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/setup.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/README.md +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/base.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/base.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/helpers.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/model_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/quantized_compressors/nvfp4_quantized.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/sparse_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/sparse_compressors/base.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/sparse_compressors/dense.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/sparse_compressors/sparse_bitmask.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/sparse_quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/sparse_quantized_compressors/marlin_24.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/config/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/config/base.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/config/dense.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/config/sparse_24_bitmask.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/config/sparse_bitmask.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/linear/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/linear/compressed_linear.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/quantization/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/quantization/lifecycle/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/quantization/lifecycle/compressed.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/quantization/lifecycle/helpers.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/quantization/quant_config.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/quantization/quant_scheme.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/quantization/utils/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/registry/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/registry/registry.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/utils/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/utils/helpers.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/utils/offload.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/utils/permutations_24.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/utils/permute.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/utils/safetensors_load.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/utils/semi_structured_conversions.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors.egg-info/SOURCES.txt +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors.egg-info/dependency_links.txt +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors.egg-info/requires.txt +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors.egg-info/top_level.txt +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/conftest.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_compressors/model_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_compressors/model_compressors/test_model_compressor.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_compressors/quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_compressors/quantized_compressors/test_fp8_quant.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_compressors/quantized_compressors/test_int_quant.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_compressors/quantized_compressors/test_nvfp4_quant.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_compressors/quantized_compressors/test_pack_quant.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_compressors/sparse_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_compressors/sparse_compressors/test_bitmask.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_compressors/sparse_compressors/test_sparse_24_bitmask.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_compressors/sparse_quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_compressors/sparse_quantized_compressors/test_marlin_24.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_configs/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_configs/test_base.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_examples/test_bitmask_compression_ipynb.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_linear/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_linear/test_compressed_linear.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/lifecycle/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/lifecycle/conftest.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/lifecycle/test_apply.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/lifecycle/test_dynamic_lifecycle.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/lifecycle/test_enabled.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/lifecycle/test_forward.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/lifecycle/test_helpers.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/lifecycle/test_lifecycle.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/test_configs/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/test_configs/test_bit_depths.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/test_configs/test_strategies.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/test_quant_args.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/test_quant_config.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/test_quant_scheme.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_registry.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_utils/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_utils/test_helpers.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_utils/test_offload.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_utils/test_safetensors_load.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/testing_utils.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/utils/copyright.py +0 -0
{compressed_tensors-0.9.5a20250514/src/compressed_tensors.egg-info → compressed_tensors-0.9.5a20250520}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.9.5a20250514
+Version: 0.9.5a20250520
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
{compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/model_compressors/model_compressor.py

@@ -421,6 +421,13 @@ class ModelCompressor:

                 module.quantization_status = QuantizationStatus.COMPRESSED

+        # TODO: consider sparse compression to also be compression
+        if (
+            self.quantization_config is not None
+            and self.quantization_config.format != CompressionFormat.dense.value
+        ):
+            self.quantization_config.quantization_status = QuantizationStatus.COMPRESSED
+
     def decompress_model(self, model: Module):
         """
         Decompress a model in memory. Because the model structure is modified in place,
{compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/quantized_compressors/base.py

@@ -99,6 +99,7 @@ class BaseQuantizationCompressor(BaseCompressor):
            scale = model_state.get(prefix + "weight_scale", None)
            g_idx = model_state.get(prefix + "weight_g_idx", None)
            zp = model_state.get(prefix + "weight_zero_point", None)
+           global_scale = model_state.get(prefix + "weight_global_scale", None)

            # is scale does not exist, then weight cannot be compressed
            if scale is None:

@@ -112,6 +113,7 @@ class BaseQuantizationCompressor(BaseCompressor):
                    weight=value,
                    scale=scale,
                    zero_point=zp,
+                   global_scale=global_scale,
                    g_idx=g_idx,
                    quantization_args=quant_args,
                    device="cpu",
{compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/quantized_compressors/naive_quantized.py

@@ -78,6 +78,7 @@ class NaiveQuantizationCompressor(BaseQuantizationCompressor):
        zero_point: Optional[Tensor] = None,
        g_idx: Optional[torch.Tensor] = None,
        device: Optional[torch.device] = None,
+       global_scale: Optional[torch.Tensor] = None,
    ) -> Dict[str, torch.Tensor]:
        """
        Compresses a single uncompressed weight

@@ -90,6 +91,11 @@ class NaiveQuantizationCompressor(BaseQuantizationCompressor):
        :param device: optional device to move compressed output to
        :return: dictionary of compressed weight data
        """
+       if global_scale is not None:
+           raise ValueError(
+               "global_scale is not supported for the NaiveQuantizationCompressor"
+           )
+
        if can_quantize(weight, quantization_args):
            quantized_weight = quantize(
                x=weight,
{compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py

@@ -94,6 +94,7 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
        zero_point: Optional[Tensor] = None,
        g_idx: Optional[torch.Tensor] = None,
        device: Optional[torch.device] = None,
+       global_scale: Optional[torch.Tensor] = None,
    ) -> Dict[str, torch.Tensor]:
        """
        Compresses a single uncompressed weight

@@ -106,6 +107,11 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
        :param device: optional device to move compressed output to
        :return: dictionary of compressed weight data
        """
+       if global_scale is not None:
+           raise ValueError(
+               "global_scale is not supported for the PackQuantizationCompressor"
+           )
+
        compressed_dict = {}
        if can_quantize(weight, quantization_args):
            quantized_weight = quantize(
{compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/quantization/lifecycle/apply.py

@@ -27,8 +27,14 @@ from compressed_tensors.quantization.lifecycle.compressed import (
 )
 from compressed_tensors.quantization.lifecycle.initialize import (
     initialize_module_for_quantization,
+    update_fused_layer_weight_global_scales,
+)
+from compressed_tensors.quantization.quant_args import (
+    FP4_E2M1_DATA,
+    FP8_E4M3_DATA,
+    QuantizationArgs,
+    QuantizationType,
 )
-from compressed_tensors.quantization.quant_args import QuantizationArgs
 from compressed_tensors.quantization.quant_config import (
     QuantizationConfig,
     QuantizationStatus,

@@ -266,6 +272,9 @@ def apply_quantization_status(model: Module, status: QuantizationStatus):
             )
         )

+    if status == QuantizationStatus.INITIALIZED:
+        update_fused_layer_weight_global_scales(model)
+
     if current_status < status >= QuantizationStatus.COMPRESSED > current_status:
         model.apply(compress_quantized_weights)
{compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/quantization/lifecycle/forward.py

@@ -20,6 +20,7 @@ import torch
 from compressed_tensors.quantization.quant_args import (
     QuantizationArgs,
     QuantizationStrategy,
+    QuantizationType,
     round_to_quantized_type,
 )
 from compressed_tensors.quantization.quant_config import QuantizationStatus

@@ -49,6 +50,7 @@ def quantize(
     args: QuantizationArgs,
     dtype: Optional[torch.dtype] = None,
     g_idx: Optional[torch.Tensor] = None,
+    global_scale: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     """
     Quantize the input tensor x using the QuantizationStrategy specified in args.

@@ -63,6 +65,7 @@ def quantize(
     :param args: quantization args dictating how to quantize x
     :param dtype: optional dtype to cast the quantized output to
     :param g_idx: optional mapping from column index to group index
+    :param global_scale: optional constant to scale the quantization scale during QDQ
     :return: fake quantized tensor
     """

@@ -75,6 +78,7 @@ def quantize(
         do_quantize=True,
         do_dequantize=False,
         g_idx=g_idx,
+        global_scale=global_scale,
     )


@@ -86,6 +90,7 @@ def dequantize(
     args: Optional[QuantizationArgs] = None,
     dtype: Optional[torch.dtype] = None,
     g_idx: Optional[torch.Tensor] = None,
+    global_scale: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     """
     Dequantize a quantized input tensor x_q based on the strategy specified in args. If

@@ -97,6 +102,7 @@ def dequantize(
     :param args: quantization args used to quantize x_q
     :param dtype: optional dtype to cast the dequantized output to
     :param g_idx: optional mapping from column index to group index
+    :param global_scale: optional constant to scale the quantization scale during QDQ
     :return: dequantized float tensor
     """
     if args is None:

@@ -128,6 +134,7 @@ def dequantize(
         do_dequantize=True,
         dtype=dtype,
         g_idx=g_idx,
+        global_scale=global_scale,
     )


@@ -138,6 +145,7 @@ def fake_quantize(
     zero_point: torch.Tensor,
     args: QuantizationArgs,
     g_idx: Optional[torch.Tensor] = None,
+    global_scale: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     """
     Fake quantize the input tensor x by quantizing then dequantizing with

@@ -151,6 +159,7 @@ def fake_quantize(
     :param zero_point: zero point tensor
     :param args: quantization args dictating how to quantize x
     :param g_idx: optional mapping from column index to group index
+    :param global_scale: optional constant to scale the quantization scale during QDQ
     :return: fake quantized tensor
     """
     return _process_quantization(

@@ -161,6 +170,7 @@ def fake_quantize(
         do_quantize=True,
         do_dequantize=True,
         g_idx=g_idx,
+        global_scale=global_scale,
     )


@@ -174,6 +184,7 @@ def _process_quantization(
     dtype: Optional[torch.dtype] = None,
     do_quantize: bool = True,
     do_dequantize: bool = True,
+    global_scale: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     q_min, q_max = calculate_range(args, x.device)
     group_size = args.group_size

@@ -221,18 +232,21 @@
             end = start + group_count
             if do_quantize:
                 output[:, start:end] = _quantize(
-                    x[:, start:end],
-                    sc,
-                    zp,
-                    q_min,
-                    q_max,
-                    args,
+                    x=x[:, start:end],
+                    scale=sc,
+                    zero_point=zp,
+                    q_min=q_min,
+                    q_max=q_max,
+                    args=args,
                     dtype=dtype,
+                    global_scale=global_scale,
                 )

             if do_dequantize:
                 input = output[:, start:end] if do_quantize else x[:, start:end]
-                output[:, start:end] = _dequantize(input, sc, zp)
+                output[:, start:end] = _dequantize(
+                    x_q=input, scale=sc, zero_point=zp, global_scale=global_scale
+                )

         if not is_column_order:
             output = safe_permute(output, torch.argsort(perm), dim=1)

@@ -240,16 +254,22 @@
     else:  # covers channel, token and tensor strategies
         if do_quantize:
             output = _quantize(
-                x,
-                scale,
-                zero_point,
-                q_min,
-                q_max,
-                args,
+                x=x,
+                scale=scale,
+                zero_point=zero_point,
+                q_min=q_min,
+                q_max=q_max,
+                args=args,
                 dtype=dtype,
+                global_scale=global_scale,
             )
         if do_dequantize:
-            output = _dequantize(output if do_quantize else x, scale, zero_point)
+            output = _dequantize(
+                output if do_quantize else x,
+                scale=scale,
+                zero_point=zero_point,
+                global_scale=global_scale,
+            )

     return output

@@ -330,6 +350,7 @@ def forward_quantize(
         return value

     g_idx = getattr(module, "weight_g_idx", None)
+    global_scale = getattr(module, f"{base_name}_global_scale", None)

     if args.dynamic:
         # dynamic quantization - determine the scale/zp on the fly

@@ -345,6 +366,7 @@ def forward_quantize(
         zero_point=zero_point,
         args=args,
         g_idx=g_idx,
+        global_scale=global_scale,
     )


@@ -357,11 +379,18 @@ def _quantize(
     q_max: torch.Tensor,
     args: QuantizationArgs,
     dtype: Optional[torch.dtype] = None,
+    global_scale: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:

+    # if a global scale is optionally provided, use it
+    # to further scale the local `scale` parameter
+    if global_scale:
+        scale = scale.to(global_scale.dtype) / global_scale
+
     scaled = x / scale
     if zero_point is not None:
         scaled += zero_point.to(x.dtype)
+
     # clamp first because cast isn't guaranteed to be saturated (ie for fp8)
     clamped_value = torch.clamp(
         scaled,

@@ -381,7 +410,14 @@ def _dequantize(
     scale: torch.Tensor,
     zero_point: torch.Tensor = None,
     dtype: Optional[torch.dtype] = None,
+    global_scale: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
+
+    # if a global scale is optionally provided, use it
+    # to further scale the local `scale` parameter
+    if global_scale:
+        scale = scale.to(global_scale.dtype) / global_scale
+
     dequant_value = x_q.to(scale.dtype)

     if zero_point is not None:
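Note on the forward.py changes above: when a global scale is present, `_quantize` and `_dequantize` simply divide the stored local scale by it before the usual quantize/dequantize arithmetic. Below is a minimal, self-contained sketch of that folding in plain PyTorch; the rounding is simplified (the library uses `round_to_quantized_type`), and the ±6.0 range is the FP4 E2M1 limit referenced elsewhere in this diff.

```python
import torch


def qdq(x, scale, q_min=-6.0, q_max=6.0, global_scale=None):
    # Sketch of the diffed behavior: an optional global scale further divides
    # the stored local scale before quantization and dequantization.
    if global_scale is not None:
        scale = scale.to(global_scale.dtype) / global_scale
    x_q = torch.clamp(torch.round(x / scale), q_min, q_max)  # quantize
    return x_q * scale                                       # dequantize


x = torch.randn(4, 8)
scale = x.abs().max() / 6.0                 # naive symmetric per-tensor scale
print((qdq(x, scale) - x).abs().max())      # error bounded by roughly scale / 2

gs = torch.tensor(100.0)
# Storing scale * gs and passing gs as the global scale divides back out,
# so the quantization error is unchanged.
print((qdq(x, scale * gs, global_scale=gs) - x).abs().max())
```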
{compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/quantization/lifecycle/initialize.py

@@ -16,24 +16,33 @@
 import logging
 import math
 from enum import Enum
-from typing import Optional
+from typing import List, Optional

 import torch
 from compressed_tensors.quantization.lifecycle.forward import (
     wrap_module_forward_quantized,
 )
 from compressed_tensors.quantization.quant_args import (
+    FP4_E2M1_DATA,
+    FP8_E4M3_DATA,
     ActivationOrdering,
     QuantizationArgs,
     QuantizationStrategy,
+    QuantizationType,
 )
 from compressed_tensors.quantization.quant_config import QuantizationStatus
 from compressed_tensors.quantization.quant_scheme import QuantizationScheme
-from compressed_tensors.quantization.utils import is_kv_cache_quant_scheme
+from compressed_tensors.quantization.utils import (
+    generate_global_scale,
+    is_fp4,
+    is_kv_cache_quant_scheme,
+    iter_named_quantizable_modules,
+)
 from compressed_tensors.utils import (
     disable_hf_hook,
     get_execution_device,
     register_offload_parameter,
+    update_parameter_data,
 )
 from torch.nn import Module, Parameter

@@ -42,6 +51,7 @@ __all__ = [
     "initialize_module_for_quantization",
     "is_attention_module",
     "KVCacheScaleType",
+    "update_fused_layer_weight_global_scales",
 ]


@@ -170,7 +180,24 @@ def _initialize_scale_zero_point(
     # TODO: consider erroring out in the future as if the dtype if not one fo these,
     # there is likely bug

-    if scale_dtype not in [torch.float16, torch.bfloat16, torch.float32]:
+    if is_fp4(quantization_args=quantization_args) and base_name == "weight":
+        scale_dtype = FP8_E4M3_DATA.dtype
+        # When applying weight-only FP4 quantization, generate a global_scale
+        # This scale is applied during runtime to ensure that the generated
+        # local scale falls properly within the FP8 range (i.e max value is FP8_max)
+        # which is the expected dtype of NVFP4A16 scales
+        value = generate_global_scale(input_tensor=module.weight)
+        value = value.to(device)
+        init_global_scale = Parameter(value, requires_grad=False)
+        register_offload_parameter(
+            module, f"{base_name}_global_scale", init_global_scale
+        )
+
+    if scale_dtype not in [
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+    ] and not is_fp4(quantization_args=quantization_args):
         scale_dtype = torch.float16

     # initializes empty scale, zero point, and g_idx parameters for the module

@@ -181,7 +208,11 @@ def _initialize_scale_zero_point(
     register_offload_parameter(module, f"{base_name}_scale", init_scale)

     if force_zero_point or not quantization_args.symmetric:
-        zp_dtype = quantization_args.pytorch_dtype()
+        if is_fp4(quantization_args=quantization_args):
+            zp_dtype = FP8_E4M3_DATA.dtype
+        else:
+            zp_dtype = quantization_args.pytorch_dtype()
+
         init_zero_point = Parameter(
             torch.zeros(expected_shape, device=device, dtype=zp_dtype),
             requires_grad=False,

@@ -219,3 +250,88 @@ def _initialize_attn_scales(module: Module) -> None:
         requires_grad=False,
     )
     register_offload_parameter(module, KVCacheScaleType.VALUE.value, init_scale)
+
+
+# TODO: Potentially introduce an argument to turn this off
+# Only relevant for NVFP4A16 currently
+def update_fused_layer_weight_global_scales(model: torch.nn.Module):
+    """
+    When running NVFP4A16 quantization, update the global scale
+    such that q,k,v layers are treated as one tensor with the same
+    global_scale and gate_proj/up_proj layers are treated as one tensor
+    with the same global scale. This is requirement currently being set
+    by vLLM and may be removed in the future OR potentially make it
+    an optional step.
+
+    :param model: model to quantize
+    """
+
+    def _is_attention_module(module: Module):
+        return "attention" in module.__class__.__name__.lower() and (
+            hasattr(module, "k_proj")
+            or hasattr(module, "v_proj")
+            or hasattr(module, "qkv_proj")
+        )
+
+    def _is_mlp_module(module: Module):
+        return "mlp" in module.__class__.__name__.lower() and (
+            hasattr(module, "gate_proj") or hasattr(module, "up_proj")
+        )
+
+    def _valid_fp4_quant(layer_list: List[torch.nn.Linear]):
+        """
+        Return True if all the linear layers in the layer_list are
+        NVFP4A16 quantized.
+        """
+        for layer in layer_list:
+            scheme = getattr(layer, "quantization_scheme", None)
+            if scheme is None:
+                return False
+
+            weight_quant_args = scheme.weights
+
+            if weight_quant_args is None:
+                return False
+
+            if not is_fp4(quantization_args=weight_quant_args):
+                return False
+        return True
+
+    for name, submodule in iter_named_quantizable_modules(
+        model,
+        include_attn=True,
+        include_mlp=True,
+    ):
+
+        if _is_attention_module(submodule):
+
+            if not _valid_fp4_quant(
+                [submodule.q_proj, submodule.v_proj, submodule.k_proj]
+            ):
+                continue
+
+            q_weight = submodule.q_proj.weight.data
+            v_weight = submodule.v_proj.weight.data
+            k_weight = submodule.k_proj.weight.data
+
+            value = generate_global_scale(
+                input_tensor=torch.cat((q_weight, v_weight, k_weight), dim=0)
+            )
+
+            update_parameter_data(submodule.q_proj, value, "weight_global_scale")
+            update_parameter_data(submodule.k_proj, value, "weight_global_scale")
+            update_parameter_data(submodule.v_proj, value, "weight_global_scale")
+
+        if _is_mlp_module(submodule):
+            if not _valid_fp4_quant([submodule.gate_proj, submodule.up_proj]):
+                continue
+
+            gate_data = submodule.gate_proj.weight.data
+            up_data = submodule.up_proj.weight.data
+
+            value = generate_global_scale(
+                input_tensor=torch.cat((gate_data, up_data), dim=0)
+            )
+
+            update_parameter_data(submodule.gate_proj, value, "weight_global_scale")
+            update_parameter_data(submodule.up_proj, value, "weight_global_scale")
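For the fused-layer update added above, sharing one global scale across q/k/v (or gate/up) is equivalent to letting the projection with the largest absolute max dictate the scale, since the global scale is inversely proportional to the tensor-wide amax. A quick stand-alone check of that point, using the same `448 * 6 / amax` formula that `generate_global_scale` uses in this release (plain tensors, not the library API; the layer shapes are arbitrary):

```python
import torch

FP8_MAX, FP4_MAX = 448.0, 6.0  # FP8_E4M3_DATA.max and FP4_E2M1_DATA.max


def global_scale(w: torch.Tensor) -> torch.Tensor:
    # same formula as generate_global_scale: scale_data.max * quant_data.max / amax
    return FP8_MAX * FP4_MAX / w.abs().max().to(torch.float32)


q_w, k_w, v_w = torch.randn(8, 16), torch.randn(8, 16), 3 * torch.randn(8, 16)
fused = global_scale(torch.cat((q_w, k_w, v_w), dim=0))

# The fused value equals the smallest per-projection global scale, i.e. the one
# driven by the projection with the largest absolute max (v_proj here).
assert torch.isclose(fused, min(global_scale(w) for w in (q_w, k_w, v_w)))
```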
{compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/quantization/utils/helpers.py

@@ -17,7 +17,9 @@ from typing import Generator, List, Optional, Tuple

 import torch
 from compressed_tensors.quantization.quant_args import (
-    FP8_DTYPE,
+    FP4_E2M1_DATA,
+    FP8_E4M3_DATA,
+    FloatArgs,
     QuantizationArgs,
     QuantizationStrategy,
     QuantizationType,

@@ -44,6 +46,8 @@ __all__ = [
     "compute_dynamic_scales_and_zp",
     "calculate_range",
     "calculate_qparams",
+    "generate_global_scale",
+    "is_fp4",
 ]

 # target the self_attn layer

@@ -53,8 +57,18 @@ KV_CACHE_TARGETS = ["re:.*self_attn$"]
 _LOGGER: logging.Logger = logging.getLogger(__name__)


+def is_fp4(quantization_args: QuantizationArgs):
+    return (
+        quantization_args.num_bits == 4
+        and quantization_args.type == QuantizationType.FLOAT
+    )
+
+
 def calculate_qparams(
-    min_vals: Tensor, max_vals: Tensor, quantization_args: QuantizationArgs
+    min_vals: Tensor,
+    max_vals: Tensor,
+    quantization_args: QuantizationArgs,
+    global_scale: Optional[Tensor] = None,
 ) -> Tuple[FloatTensor, IntTensor]:
     """
     :param min_vals: tensor of min value(s) to calculate scale(s) and zero point(s)

@@ -62,7 +76,11 @@ def calculate_qparams(
     :param max_vals: tensor of max value(s) to calculate scale(s) and zero point(s)
         from
     :param quantization_args: settings to quantization
-    :return: tuple of the calculated scale(s) and zero point(s)
+    :param global_scale: additional global scale to scale the locally generated scale
+        currently only applied/supported for Fp4
+
+    :return: tuple of the calculated scale(s) and zero point(s). For FP4, the calculated
+        scale if of dtype FP8
     """
     # based on the implementations for consuming quantized values,
     # 0.0 must always be representable within the quantized range

@@ -73,14 +91,40 @@

     bit_min, bit_max = calculate_range(quantization_args, device)
     bit_range = bit_max - bit_min
-    zp_dtype = quantization_args.pytorch_dtype()
+
+    if is_fp4(quantization_args=quantization_args):
+        zp_dtype = FP8_E4M3_DATA.dtype
+    else:
+        zp_dtype = quantization_args.pytorch_dtype()

     if quantization_args.symmetric:
         max_val_pos = torch.max(torch.abs(min_vals), torch.abs(max_vals))
-        scales = max_val_pos / (float(bit_range) / 2)
-        scales = torch.clamp(scales, min=torch.finfo(torch.float32).eps)
+
+        if is_fp4(quantization_args=quantization_args) and global_scale is not None:
+            # Conditionally scale the generated local scale by a global_scale
+            scales = global_scale * (max_val_pos / FP4_E2M1_DATA.max)
+            scales = scales.to(FP8_E4M3_DATA.dtype)
+        else:
+            scales = max_val_pos / (float(bit_range) / 2)
+
+        if scales.dtype == FP8_E4M3_DATA.dtype:
+            # torch.clamp not supported for FP8
+            # use the next largest fp8 value from 0
+            scales = torch.where(
+                scales == 0,
+                torch.tensor(0.125, dtype=FP8_E4M3_DATA.dtype, device=device),
+                scales,
+            )
+        else:
+            scales = torch.clamp(scales, min=torch.finfo(torch.float32).eps)
+
         zero_points = torch.zeros(scales.shape, device=device, dtype=min_vals.dtype)
     else:
+        if is_fp4(quantization_args=quantization_args):
+            raise NotImplementedError(
+                "Asymmetric Quantization is not supported for FP4"
+            )
+
         scales = (max_vals - min_vals) / float(bit_range)
         scales = torch.clamp(scales, min=torch.finfo(torch.float32).eps)
         zero_points = bit_min - (min_vals / scales)

@@ -144,14 +188,16 @@ def calculate_range(quantization_args: QuantizationArgs, device: str) -> Tuple:
         q_max = torch.tensor(bit_range / 2 - 1, device=device)
         q_min = torch.tensor(-bit_range / 2, device=device)
     elif quantization_args.type == QuantizationType.FLOAT:
-        if quantization_args.num_bits
-
-
-
+        if quantization_args.num_bits == 8:
+            q_max = torch.tensor(FP8_E4M3_DATA.max, device=device)
+            q_min = torch.tensor(FP8_E4M3_DATA.min, device=device)
+        elif quantization_args.num_bits == 4:
+            q_max = torch.tensor(FP4_E2M1_DATA.max, device=device)
+            q_min = torch.tensor(FP4_E2M1_DATA.min, device=device)
+        else:
+            raise NotImplementedError(
+                "Range calculation only supported for 4 and 8 bits"
             )
-        fp_range_info = torch.finfo(FP8_DTYPE)
-        q_max = torch.tensor(fp_range_info.max, device=device)
-        q_min = torch.tensor(fp_range_info.min, device=device)
     else:
         raise ValueError(f"Invalid quantization type {quantization_args.type}")

@@ -249,7 +295,10 @@ def iter_named_leaf_modules(model: Module) -> Generator[Tuple[str, Module], None


 def iter_named_quantizable_modules(
-    model: Module, include_children: bool = True, include_attn: bool = False
+    model: Module,
+    include_children: bool = True,
+    include_attn: bool = False,
+    include_mlp: bool = False,
 ) -> Generator[Tuple[str, Module], None, None]:
     """
     Yield name and submodule of

@@ -282,6 +331,9 @@ def iter_named_quantizable_modules(
         if include_attn:
             if name.endswith("self_attn"):
                 yield name, submodule
+        if include_mlp:
+            if name.endswith("mlp"):
+                yield name, submodule


 def get_torch_bit_depth(value: torch.Tensor) -> int:

@@ -396,3 +448,24 @@ def parse_out_kv_cache_args(
         kv_cache_args = None

     return kv_cache_args, quant_scheme_to_layers
+
+
+def generate_global_scale(
+    input_tensor: torch.Tensor,
+    scale_data: Optional[FloatArgs] = FP8_E4M3_DATA,
+    quant_data: Optional[FloatArgs] = FP4_E2M1_DATA,
+    dtype: Optional[torch.dtype] = torch.float32,
+):
+    """
+    Generate a global scale for an entire tensor (input_tensor).
+    Goal of the scale is to ensure that the quantization (local) scale
+    falls into the approproiate dtype range.
+
+    E.g. for NVFP4, group (local) scales are in dtype FP8. The global_scale
+    attempts to use the entire FP8 dtype range while mapping a per-group max
+    to the FP4 max.
+    """
+    scale_dtype = scale_data.dtype
+    tensor_amax = torch.abs(input_tensor.data).max().to(dtype)
+    global_scale = scale_data.max * quant_data.max / tensor_amax
+    return global_scale.to(dtype)
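Taken together with the forward.py changes, the intent of `generate_global_scale` and the FP4 branch of `calculate_qparams` is a round trip: the global scale maps the tensor-wide max onto the FP8 * FP4 budget, the per-group scale `global_scale * (group_max / FP4_max)` then lands inside the FP8 range, and dividing by the global scale again at QDQ time recovers roughly `group_max / FP4_max`. A small worked check of that arithmetic in plain PyTorch (assuming a build that exposes `torch.float8_e4m3fn`; group size 16 is just an illustrative choice):

```python
import torch

FP8_MAX, FP4_MAX = 448.0, 6.0  # FP8_E4M3_DATA.max and FP4_E2M1_DATA.max

weight = torch.randn(64, 64)
tensor_amax = weight.abs().max().to(torch.float32)

# generate_global_scale: map the tensor-wide max onto the FP8 * FP4 budget
global_scale = FP8_MAX * FP4_MAX / tensor_amax

# calculate_qparams, symmetric FP4 branch: per-group scales stored as FP8
group_max = weight.reshape(-1, 16).abs().max(dim=1).values
local_scale_fp8 = (global_scale * (group_max / FP4_MAX)).to(torch.float8_e4m3fn)
assert local_scale_fp8.float().max() <= FP8_MAX  # fits the FP8 range by construction

# forward.py QDQ: dividing by the global scale recovers ~group_max / FP4_MAX
effective_scale = local_scale_fp8.to(torch.float32) / global_scale
print(torch.allclose(effective_scale, group_max / FP4_MAX, rtol=0.1))
```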
{compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520/src/compressed_tensors.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.9.5a20250514
+Version: 0.9.5a20250520
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
{compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/lifecycle/test_initialize.py

@@ -16,12 +16,15 @@
 import math

 import pytest
+import torch
 from compressed_tensors.quantization import (
+    FP8_E4M3_DATA,
     ActivationOrdering,
     QuantizationArgs,
     QuantizationScheme,
     QuantizationStatus,
     QuantizationStrategy,
+    QuantizationType,
 )
 from compressed_tensors.quantization.lifecycle.initialize import (
     initialize_module_for_quantization,

@@ -152,6 +155,10 @@ def test_initialize_module_for_quantization_offloaded(
         QuantizationArgs(strategy="group", group_size=2, actorder="weight"),
         None,
     ),
+    (
+        QuantizationArgs(strategy="group", group_size=16, type="float", num_bits=4),
+        None,
+    ),
     (
         QuantizationArgs(strategy="block"),
         QuantizationArgs(strategy="block"),

@@ -177,6 +184,14 @@ def test_initialize_quantization_parameters(weights, input_activations):
             continue
         q_param_name = Q_PARAM_NAMES[q_type]

+        if args.num_bits == 4 and args.type == QuantizationType.FLOAT:
+            assert hasattr(layer, "weight_global_scale")
+            assert layer.weight_global_scale.dtype == torch.float32
+            assert layer.weight_global_scale.numel() == 1
+            assert layer.weight_scale.dtype == FP8_E4M3_DATA.dtype
+        else:
+            assert not hasattr(layer, "weight_global_scale")
+
         # scale and zero point
         if args.strategy == QuantizationStrategy.TENSOR:
             expected_shape = (1,)
{compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/test_utils/test_helpers.py

@@ -14,8 +14,16 @@

 import pytest
 import torch
-from compressed_tensors.quantization import QuantizationArgs, QuantizationStrategy
-from compressed_tensors.quantization.utils import calculate_qparams
+from compressed_tensors.quantization import (
+    FP4_E2M1_DATA,
+    FP8_E4M3_DATA,
+    QuantizationArgs,
+    QuantizationStrategy,
+)
+from compressed_tensors.quantization.utils import (
+    calculate_qparams,
+    generate_global_scale,
+)


 @pytest.mark.parametrize(

@@ -56,3 +64,14 @@ def test_calculate_qparams(keepdims, strategy, exp_shape):
     scale, zp = calculate_qparams(min_val, max_val, args)
     assert scale.shape == exp_shape
     assert zp.shape == exp_shape
+
+
+def test_fused_global_scales():
+    layer = torch.nn.Linear(7, 8)
+    max_tensor_value = torch.abs(layer.weight.data).max()
+    # use defaults
+    global_scale = generate_global_scale(layer.weight)
+    # max value should be = (448 * 6) / global_scale
+    assert max_tensor_value == pytest.approx(
+        FP4_E2M1_DATA.max * FP8_E4M3_DATA.max / global_scale, abs=0.001
+    )
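Worth noting on the new test: `test_fused_global_scales` simply inverts the formula introduced in `generate_global_scale`. With global_scale = FP8_E4M3_DATA.max * FP4_E2M1_DATA.max / amax, it follows that amax = (448 * 6) / global_scale, which is exactly the identity the assertion checks against the weight's absolute max.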