compressed-tensors 0.9.5a20250530__tar.gz → 0.9.5a20250603__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. {compressed_tensors-0.9.5a20250530/src/compressed_tensors.egg-info → compressed_tensors-0.9.5a20250603}/PKG-INFO +1 -1
  2. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/quantization/lifecycle/apply.py +1 -10
  3. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/quantization/lifecycle/forward.py +35 -24
  4. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/quantization/lifecycle/initialize.py +7 -113
  5. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/quantization/quant_args.py +1 -0
  6. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/quantization/utils/helpers.py +9 -7
  7. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/utils/offload.py +134 -1
  8. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/version.py +1 -1
  9. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603/src/compressed_tensors.egg-info}/PKG-INFO +1 -1
  10. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_compressors/quantized_compressors/test_fp8_quant.py +3 -3
  11. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_compressors/quantized_compressors/test_int_quant.py +2 -2
  12. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_quantization/lifecycle/test_forward.py +12 -12
  13. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_quantization/test_utils/test_helpers.py +3 -5
  14. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_utils/test_offload.py +95 -5
  15. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/.github/.gitkeep +0 -0
  16. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/.github/actions/test/action.yml +0 -0
  17. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/.github/scripts/step-status +0 -0
  18. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/.github/workflows/build-test.yml +0 -0
  19. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/.github/workflows/build.yml +0 -0
  20. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/.github/workflows/report.yml +0 -0
  21. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/.github/workflows/test-check.yaml +0 -0
  22. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/.github/workflows/test.yml +0 -0
  23. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/.github/workflows/trigger-all.yml +0 -0
  24. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/.github/workflows/upload.yml +0 -0
  25. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/.gitignore +0 -0
  26. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/LICENSE +0 -0
  27. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/Makefile +0 -0
  28. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/README.md +0 -0
  29. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/examples/bit_packing/ex_quantize_and_pack.py +0 -0
  30. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/examples/bit_packing/int4_config.json +0 -0
  31. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/examples/bitmask_compression.ipynb +0 -0
  32. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/examples/llama_1.1b/ex_config_quantization.py +0 -0
  33. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/examples/llama_1.1b/ex_llmcompressor_quantization.py +0 -0
  34. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/examples/llama_1.1b/example_quant_config.json +0 -0
  35. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/examples/llama_1.1b/example_quant_recipe.yaml +0 -0
  36. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/examples/quantize_and_pack_int4.ipynb +0 -0
  37. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/pyproject.toml +0 -0
  38. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/setup.cfg +0 -0
  39. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/setup.py +0 -0
  40. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/__init__.py +0 -0
  41. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/README.md +0 -0
  42. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/__init__.py +0 -0
  43. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/base.py +0 -0
  44. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/compressors/__init__.py +0 -0
  45. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/compressors/base.py +0 -0
  46. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/compressors/helpers.py +0 -0
  47. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/compressors/model_compressors/__init__.py +0 -0
  48. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/compressors/model_compressors/model_compressor.py +0 -0
  49. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/compressors/quantized_compressors/__init__.py +0 -0
  50. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/compressors/quantized_compressors/base.py +0 -0
  51. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/compressors/quantized_compressors/naive_quantized.py +0 -0
  52. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/compressors/quantized_compressors/nvfp4_quantized.py +0 -0
  53. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py +0 -0
  54. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/compressors/sparse_compressors/__init__.py +0 -0
  55. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/compressors/sparse_compressors/base.py +0 -0
  56. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/compressors/sparse_compressors/dense.py +0 -0
  57. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py +0 -0
  58. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/compressors/sparse_compressors/sparse_bitmask.py +0 -0
  59. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/compressors/sparse_quantized_compressors/__init__.py +0 -0
  60. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/compressors/sparse_quantized_compressors/marlin_24.py +0 -0
  61. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/config/__init__.py +0 -0
  62. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/config/base.py +0 -0
  63. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/config/dense.py +0 -0
  64. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/config/sparse_24_bitmask.py +0 -0
  65. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/config/sparse_bitmask.py +0 -0
  66. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/linear/__init__.py +0 -0
  67. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/linear/compressed_linear.py +0 -0
  68. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/quantization/__init__.py +0 -0
  69. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/quantization/lifecycle/__init__.py +0 -0
  70. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/quantization/lifecycle/compressed.py +0 -0
  71. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/quantization/lifecycle/helpers.py +0 -0
  72. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/quantization/quant_config.py +0 -0
  73. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/quantization/quant_scheme.py +0 -0
  74. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/quantization/utils/__init__.py +0 -0
  75. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/registry/__init__.py +0 -0
  76. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/registry/registry.py +0 -0
  77. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/transform/__init__.py +0 -0
  78. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/transform/transform_args.py +0 -0
  79. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/transform/transform_config.py +0 -0
  80. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/transform/transform_scheme.py +0 -0
  81. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/utils/__init__.py +0 -0
  82. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/utils/helpers.py +0 -0
  83. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/utils/permutations_24.py +0 -0
  84. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/utils/permute.py +0 -0
  85. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/utils/safetensors_load.py +0 -0
  86. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors/utils/semi_structured_conversions.py +0 -0
  87. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors.egg-info/SOURCES.txt +0 -0
  88. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors.egg-info/dependency_links.txt +0 -0
  89. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors.egg-info/requires.txt +0 -0
  90. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/src/compressed_tensors.egg-info/top_level.txt +0 -0
  91. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/__init__.py +0 -0
  92. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/conftest.py +0 -0
  93. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_compressors/__init__.py +0 -0
  94. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_compressors/model_compressors/__init__.py +0 -0
  95. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_compressors/model_compressors/test_model_compressor.py +0 -0
  96. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_compressors/quantized_compressors/__init__.py +0 -0
  97. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_compressors/quantized_compressors/test_nvfp4_quant.py +0 -0
  98. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_compressors/quantized_compressors/test_pack_quant.py +0 -0
  99. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_compressors/sparse_compressors/__init__.py +0 -0
  100. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_compressors/sparse_compressors/test_bitmask.py +0 -0
  101. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_compressors/sparse_compressors/test_sparse_24_bitmask.py +0 -0
  102. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_compressors/sparse_quantized_compressors/__init__.py +0 -0
  103. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_compressors/sparse_quantized_compressors/test_marlin_24.py +0 -0
  104. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_configs/__init__.py +0 -0
  105. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_configs/test_base.py +0 -0
  106. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_examples/test_bitmask_compression_ipynb.py +0 -0
  107. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_linear/__init__.py +0 -0
  108. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_linear/test_compressed_linear.py +0 -0
  109. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_quantization/__init__.py +0 -0
  110. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_quantization/lifecycle/__init__.py +0 -0
  111. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_quantization/lifecycle/conftest.py +0 -0
  112. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_quantization/lifecycle/test_apply.py +0 -0
  113. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_quantization/lifecycle/test_dynamic_lifecycle.py +0 -0
  114. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_quantization/lifecycle/test_enabled.py +0 -0
  115. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_quantization/lifecycle/test_helpers.py +0 -0
  116. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_quantization/lifecycle/test_initialize.py +0 -0
  117. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_quantization/lifecycle/test_lifecycle.py +0 -0
  118. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_quantization/test_configs/__init__.py +0 -0
  119. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_quantization/test_configs/test_bit_depths.py +0 -0
  120. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_quantization/test_configs/test_strategies.py +0 -0
  121. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_quantization/test_quant_args.py +0 -0
  122. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_quantization/test_quant_config.py +0 -0
  123. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_quantization/test_quant_scheme.py +0 -0
  124. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_registry.py +0 -0
  125. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_transform/test_transform_args.py +0 -0
  126. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_transform/test_transform_config.py +0 -0
  127. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_transform/test_transform_scheme.py +0 -0
  128. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_utils/__init__.py +0 -0
  129. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_utils/test_helpers.py +0 -0
  130. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/test_utils/test_safetensors_load.py +0 -0
  131. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/tests/testing_utils.py +0 -0
  132. {compressed_tensors-0.9.5a20250530 → compressed_tensors-0.9.5a20250603}/utils/copyright.py +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.9.5a20250530
+Version: 0.9.5a20250603
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
src/compressed_tensors/quantization/lifecycle/apply.py

@@ -27,14 +27,8 @@ from compressed_tensors.quantization.lifecycle.compressed import (
 )
 from compressed_tensors.quantization.lifecycle.initialize import (
     initialize_module_for_quantization,
-    update_fused_layer_weight_global_scales,
-)
-from compressed_tensors.quantization.quant_args import (
-    FP4_E2M1_DATA,
-    FP8_E4M3_DATA,
-    QuantizationArgs,
-    QuantizationType,
 )
+from compressed_tensors.quantization.quant_args import QuantizationArgs
 from compressed_tensors.quantization.quant_config import (
     QuantizationConfig,
     QuantizationStatus,
@@ -272,9 +266,6 @@ def apply_quantization_status(model: Module, status: QuantizationStatus):
            )
        )

-    if status == QuantizationStatus.INITIALIZED:
-        update_fused_layer_weight_global_scales(model)
-
    if current_status < status >= QuantizationStatus.COMPRESSED > current_status:
        model.apply(compress_quantized_weights)

src/compressed_tensors/quantization/lifecycle/forward.py

@@ -227,31 +227,42 @@ def _process_quantization(
            perm = torch.argsort(g_idx)
            x = safe_permute(x, perm, dim=1)

-        # TODO: experiment with vectorizing for loop for performance
-        end = 0
-        for index, group_count in enumerate(group_sizes):
-            sc = scale[:, index].view(-1, 1)
-            zp = zero_point[:, index].view(-1, 1) if zero_point is not None else None
-
-            start = end
-            end = start + group_count
-            if do_quantize:
-                output[:, start:end] = _quantize(
-                    x=x[:, start:end],
-                    scale=sc,
-                    zero_point=zp,
-                    q_min=q_min,
-                    q_max=q_max,
-                    args=args,
-                    dtype=dtype,
-                    global_scale=global_scale,
-                )
+        x = torch.reshape(
+            x,
+            (
+                x.shape[0],
+                ceil(x.shape[1] / group_size),
+                group_size,
+            ),
+        )

-            if do_dequantize:
-                input = output[:, start:end] if do_quantize else x[:, start:end]
-                output[:, start:end] = _dequantize(
-                    x_q=input, scale=sc, zero_point=zp, global_scale=global_scale
-                )
+        if do_quantize:
+            output = _quantize(
+                x=x,
+                scale=scale.unsqueeze(-1),
+                zero_point=zero_point.unsqueeze(-1) if zero_point is not None else None,
+                dtype=dtype,
+                global_scale=global_scale,
+                q_min=q_min,
+                q_max=q_max,
+                args=args,
+            )
+
+        if do_dequantize:
+            input = output if do_quantize else x
+            output = _dequantize(
+                x_q=input,
+                scale=scale.unsqueeze(-1),
+                zero_point=zero_point.unsqueeze(-1) if zero_point is not None else None,
+                global_scale=global_scale,
+            )
+
+        output = torch.reshape(
+            output,
+            (output.shape[0], output.shape[1] * output.shape[2]),
+        )
+
+        output = output.to(output_dtype)

        if not is_column_order:
            output = safe_permute(output, torch.argsort(perm), dim=1)
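Note: the hunk above replaces the per-group Python loop with a single reshape to (rows, num_groups, group_size) followed by one broadcasted quantize/dequantize call. Below is a minimal standalone sketch of the same idea, using symmetric int8 fake-quantization with illustrative names rather than the library's _quantize/_dequantize helpers, and assuming the column count divides evenly by group_size (the library handles the remainder with ceil()).

import torch

def group_quant_dequant(x: torch.Tensor, scale: torch.Tensor, group_size: int) -> torch.Tensor:
    # x: (rows, cols), scale: (rows, cols // group_size)
    rows, cols = x.shape
    num_groups = cols // group_size
    # view each row as (num_groups, group_size) so one broadcasted op covers all groups
    x_grouped = x.reshape(rows, num_groups, group_size)
    sc = scale.unsqueeze(-1)  # (rows, num_groups, 1), mirroring scale.unsqueeze(-1) above
    q = torch.clamp(torch.round(x_grouped / sc), -128, 127)
    return (q * sc).reshape(rows, cols)  # fold groups back into columns

x = torch.randn(4, 256)
scale = x.reshape(4, 2, 128).abs().amax(dim=-1) / 127  # one scale per (row, group)
out = group_quant_dequant(x, scale, group_size=128)

This is also why the test fixtures later in this diff switch group scales and zero points from shape (512, 8, 1) to (512, 8): the trailing singleton dimension is now added inside _process_quantization via unsqueeze(-1).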
src/compressed_tensors/quantization/lifecycle/initialize.py

@@ -23,26 +23,18 @@ from compressed_tensors.quantization.lifecycle.forward import (
     wrap_module_forward_quantized,
 )
 from compressed_tensors.quantization.quant_args import (
-    FP4_E2M1_DATA,
     FP8_E4M3_DATA,
     ActivationOrdering,
     QuantizationArgs,
     QuantizationStrategy,
-    QuantizationType,
 )
 from compressed_tensors.quantization.quant_config import QuantizationStatus
 from compressed_tensors.quantization.quant_scheme import QuantizationScheme
-from compressed_tensors.quantization.utils import (
-    generate_global_scale,
-    is_fp4,
-    is_kv_cache_quant_scheme,
-    iter_named_quantizable_modules,
-)
+from compressed_tensors.quantization.utils import is_fp4, is_kv_cache_quant_scheme
 from compressed_tensors.utils import (
     disable_hf_hook,
     get_execution_device,
     register_offload_parameter,
-    update_parameter_data,
 )
 from torch.nn import Module, Parameter

@@ -51,7 +43,6 @@ __all__ = [
     "initialize_module_for_quantization",
     "is_attention_module",
     "KVCacheScaleType",
-    "update_fused_layer_weight_global_scales",
 ]


@@ -162,22 +153,13 @@ def _initialize_scale_zero_point(
     # initialize on execution device to avoid performing quantized ops on cpu
     device = get_execution_device(module)

-    # 1. Create global_scales for tensor_group
+    # 1. Create global_scales for tensor_group - generates
+    # a per tensor scale
     if quantization_args.strategy == QuantizationStrategy.TENSOR_GROUP:
-        # TODO: should move to llmcompressor
-        if base_name == "weight":
-            # When applying weight-only FP4 quantization, generate a global_scale
-            # This scale is applied during runtime to ensure that the generated
-            # local scale falls properly within the FP8 range (i.e max value is FP8_max)
-            # which is the expected dtype of NVFP4A16 scales
-            value = generate_global_scale(input_tensor=module.weight)
-            value = value.to(device)
-            init_global_scale = Parameter(value, requires_grad=False)
-        else:
-            init_global_scale = Parameter(
-                torch.empty(1, dtype=torch.float32, device=device),
-                requires_grad=False,
-            )
+        init_global_scale = Parameter(
+            torch.empty(1, dtype=torch.float32, device=device),
+            requires_grad=False,
+        )
        register_offload_parameter(
            module, f"{base_name}_global_scale", init_global_scale
        )
@@ -258,91 +240,3 @@ def _initialize_attn_scales(module: Module) -> None:
        requires_grad=False,
    )
    register_offload_parameter(module, KVCacheScaleType.VALUE.value, init_scale)
-
-
-# TODO: Potentially introduce an argument to turn this off
-# Only relevant for NVFP4A16 currently
-def update_fused_layer_weight_global_scales(model: torch.nn.Module):
-    """
-    When running NVFP4A16 quantization, update the global scale
-    such that q,k,v layers are treated as one tensor with the same
-    global_scale and gate_proj/up_proj layers are treated as one tensor
-    with the same global scale. This is requirement currently being set
-    by vLLM and may be removed in the future OR potentially make it
-    an optional step.
-
-    :param model: model to quantize
-    """
-
-    def _is_attention_module(module: Module):
-        return "attention" in module.__class__.__name__.lower() and (
-            hasattr(module, "k_proj")
-            or hasattr(module, "v_proj")
-            or hasattr(module, "qkv_proj")
-        )
-
-    def _is_mlp_module(module: Module):
-        return "mlp" in module.__class__.__name__.lower() and (
-            hasattr(module, "gate_proj") or hasattr(module, "up_proj")
-        )
-
-    def _valid_fp4_quant(layer_list: List[torch.nn.Linear]):
-        """
-        Return True if all the linear layers in the layer_list are
-        NVFP4A16 quantized.
-        """
-        for layer in layer_list:
-            scheme = getattr(layer, "quantization_scheme", None)
-            if scheme is None:
-                return False
-
-            weight_quant_args = scheme.weights
-
-            if weight_quant_args is None:
-                return False
-
-            if not is_fp4(quantization_args=weight_quant_args):
-                return False
-        return True
-
-    for name, submodule in iter_named_quantizable_modules(
-        model,
-        include_attn=True,
-        include_mlp=True,
-    ):
-
-        if _is_attention_module(submodule):
-            # already fused/treated as one layer
-            if hasattr(submodule, "qkv_proj"):
-                continue
-
-            if not _valid_fp4_quant(
-                [submodule.q_proj, submodule.v_proj, submodule.k_proj]
-            ):
-                continue
-
-            q_weight = submodule.q_proj.weight.data
-            v_weight = submodule.v_proj.weight.data
-            k_weight = submodule.k_proj.weight.data
-
-            value = generate_global_scale(
-                input_tensor=torch.cat((q_weight, v_weight, k_weight), dim=0)
-            )
-
-            update_parameter_data(submodule.q_proj, value, "weight_global_scale")
-            update_parameter_data(submodule.k_proj, value, "weight_global_scale")
-            update_parameter_data(submodule.v_proj, value, "weight_global_scale")
-
-        if _is_mlp_module(submodule):
-            if not _valid_fp4_quant([submodule.gate_proj, submodule.up_proj]):
-                continue
-
-            gate_data = submodule.gate_proj.weight.data
-            up_data = submodule.up_proj.weight.data
-
-            value = generate_global_scale(
-                input_tensor=torch.cat((gate_data, up_data), dim=0)
-            )
-
-            update_parameter_data(submodule.gate_proj, value, "weight_global_scale")
-            update_parameter_data(submodule.up_proj, value, "weight_global_scale")
src/compressed_tensors/quantization/quant_args.py

@@ -53,6 +53,7 @@ class FP4_E2M1_DATA(FloatArgs):
     min = -6.0

     @staticmethod
+    @torch.compile
     def cast_to_fp4(x):
         sign = torch.sign(x)
         x = torch.abs(x)
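For context, cast_to_fp4 (now wrapped with @torch.compile so its elementwise ops can be fused into a single kernel) rounds values onto the FP4 E2M1 grid, whose representable magnitudes are {0, 0.5, 1, 1.5, 2, 3, 4, 6}. The sketch below is an illustrative round-to-nearest reference built on a lookup over those values; it is not the library's branch-based implementation and does not reproduce its exact tie-breaking behavior.

import torch

# representable E2M1 magnitudes (sign handled separately)
FP4_VALUES = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])

def cast_to_fp4_reference(x: torch.Tensor) -> torch.Tensor:
    sign = torch.sign(x)
    mag = torch.abs(x).clamp(max=6.0)
    # pick the nearest representable magnitude for every element
    idx = torch.argmin((mag.unsqueeze(-1) - FP4_VALUES).abs(), dim=-1)
    return sign * FP4_VALUES[idx]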
src/compressed_tensors/quantization/utils/helpers.py

@@ -47,7 +47,7 @@ __all__ = [
     "compute_dynamic_scales_and_zp",
     "calculate_range",
     "calculate_qparams",
-    "generate_global_scale",
+    "generate_gparam",
     "is_fp4",
 ]

@@ -81,7 +81,7 @@ def calculate_qparams(
         currently only applied/supported for Fp4

     :return: tuple of the calculated scale(s) and zero point(s). For FP4, the calculated
-        scale if of dtype FP8
+        scale is of dtype FP8
     """
     # based on the implementations for consuming quantized values,
     # 0.0 must always be representable within the quantized range
@@ -475,8 +475,9 @@ def parse_out_kv_cache_args(
     return kv_cache_args, quant_scheme_to_layers


-def generate_global_scale(
-    input_tensor: torch.Tensor,
+def generate_gparam(
+    updated_min_val: torch.Tensor,
+    updated_max_val: torch.Tensor,
     scale_data: Optional[FloatArgs] = FP8_E4M3_DATA,
     quant_data: Optional[FloatArgs] = FP4_E2M1_DATA,
     dtype: Optional[torch.dtype] = torch.float32,
@@ -490,7 +491,8 @@ def generate_global_scale(
     attempts to use the entire FP8 dtype range while mapping a per-group max
     to the FP4 max.
     """
-    scale_dtype = scale_data.dtype
-    tensor_amax = torch.abs(input_tensor.data).max().to(dtype)
-    global_scale = scale_data.max * quant_data.max / tensor_amax
+    min_vals = torch.min(updated_min_val, torch.zeros_like(updated_min_val))
+    max_vals = torch.max(updated_max_val, torch.zeros_like(updated_max_val))
+    max_val_pos = torch.max(torch.abs(min_vals), torch.abs(max_vals))
+    global_scale = scale_data.max * quant_data.max / max_val_pos
     return global_scale.to(dtype)
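With this rename, generate_gparam derives the global scale from already-observed min/max statistics instead of re-reading the whole tensor. A rough standalone sketch of the math, with the E4M3 (448) and E2M1 (6) maxima hard-coded for illustration rather than taken from the library's FloatArgs objects:

import torch

FP8_E4M3_MAX = 448.0
FP4_E2M1_MAX = 6.0

def generate_gparam_sketch(min_val: torch.Tensor, max_val: torch.Tensor) -> torch.Tensor:
    # clamp so 0.0 always lies inside the observed range, then take the absolute max
    min_vals = torch.min(min_val, torch.zeros_like(min_val))
    max_vals = torch.max(max_val, torch.zeros_like(max_val))
    amax = torch.max(min_vals.abs(), max_vals.abs())
    # map the observed amax onto the FP8 scale range times the FP4 value range
    return (FP8_E4M3_MAX * FP4_E2M1_MAX / amax).to(torch.float32)

weight = torch.randn(8, 7)
gscale = generate_gparam_sketch(*torch.aminmax(weight))
# round-trip check mirroring test_fused_global_scales at the end of this diff
assert torch.isclose(weight.abs().max(), FP8_E4M3_MAX * FP4_E2M1_MAX / gscale)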
src/compressed_tensors/utils/offload.py

@@ -28,15 +28,18 @@ Utilities associated with offloading functionality provided by `accelerate`.
 import contextlib
 import warnings
 from functools import wraps
-from typing import Any, Callable, Dict, Iterable, Literal, Optional, Union
+from typing import Any, Callable, Dict, Iterable, List, Literal, Optional, Union

 import torch


 try:
+    from accelerate import dispatch_model
     from accelerate.hooks import (
         AlignDevicesHook,
         add_hook_to_module,
+        attach_align_device_hook,
+        named_module_tensors,
         remove_hook_from_module,
     )
     from accelerate.utils import (
@@ -54,6 +57,9 @@ except ImportError:
     OffloadedWeightsLoader = None
     PrefixedDataset = None
     set_module_tensor_to_device = None
+    named_module_tensors = None
+    dispatch_model = None
+    attach_align_device_hook = None


 __all__ = [
@@ -70,6 +76,9 @@ __all__ = [
     "disable_offload",
     "align_modules",
     "align_module_device",
+    "register_offload_module",
+    "delete_offload_module",
+    "force_cpu_offload",
 ]


@@ -77,6 +86,11 @@ def check_accelerate(fallback: Any):
     def decorator(func: Callable[[Any], Any]):
         if not _has_accelerate:

+            if fallback == "error":
+                raise ValueError(
+                    "Please install `accelerate` in order to use this function"
+                )
+
             @wraps(func)
             def fallback_fn(*args, **kwargs):
                 return fallback
@@ -346,6 +360,7 @@ def delete_from_weights_map(
        )


+@check_accelerate(fallback=contextlib.nullcontext())
 @contextlib.contextmanager
 def disable_offload(module: torch.nn.Module):
     """
@@ -362,6 +377,7 @@ def disable_offload(module: torch.nn.Module):
     yield


+@check_accelerate(fallback=contextlib.nullcontext())
 @contextlib.contextmanager
 def align_modules(
     modules: Union[torch.nn.Module, Iterable[torch.nn.Module]],
@@ -383,6 +399,123 @@ def align_modules(
     yield


+def register_offload_module(base: torch.nn.Module, name: str, module: torch.nn.Module):
+    """
+    Register a submodule with offloading if the parent module is offloaded
+
+    :param base: module to attach submodule to
+    :param name: name of submodule
+    :param module: submodule to attach
+    """
+
+    if has_offloaded_params(base):
+        hook: AlignDevicesHook = base._hf_hook
+        assert hook.offload
+        assert hook.weights_map is not None
+        assert hook.tied_params_map is not None
+
+        # offloading kwargs for submodule
+        place_submodules = False
+        offload_buffers = True
+
+        # copy device offloading arguments from parent
+        current_device = next(base.parameters()).device  # assume base has parameters
+        offload_device = get_offloaded_device(base)
+
+        # offload parameters to weights map
+        for param_name, param in named_module_tensors(
+            module, include_buffers=offload_buffers, recurse=place_submodules
+        ):
+            offloaded = param.to(offload_device)
+            hook.tied_params_map[offloaded.data_ptr()] = {}  # (1)
+            offload_to_weights_map(hook.weights_map, f"{name}.{param_name}", offloaded)
+
+            # if the parent places submodules, offload here
+            if hook.place_submodules:
+                set_module_tensor_to_device(module, param_name, current_device)
+
+        # if the parent does not place submodules, then add a hook
+        # parameters are offloaded by `add_hook_to_module`
+        if not hook.place_submodules:
+            weights_map = PrefixedDataset(
+                hook.weights_map.dataset, prefix=f"{hook.weights_map.prefix}{name}."
+            )
+
+            submodule_hook = AlignDevicesHook(
+                execution_device=hook.execution_device,
+                offload=hook.offload,
+                io_same_device=False,
+                weights_map=weights_map,
+                offload_buffers=offload_buffers,
+                place_submodules=place_submodules,
+                skip_keys=None,
+                tied_params_map=hook.tied_params_map,
+            )
+            add_hook_to_module(module, submodule_hook)
+
+    base.register_module(name, module)
+
+    # (1): Since we cannot know which pointers are shared when we add parameters in an
+    # online way, assume that all pointers are shared. This comes at no runtime cost
+
+
+def delete_offload_module(base: torch.nn.Module, name: str):
+    """
+    Delete a submodule from a model which may contain offloading
+    :param base: parent module to delete submodule from
+    :param name: name of submodule on parent
+    """
+    module: torch.nn.Module = getattr(base, name)
+
+    for param_name, _ in list(module.named_parameters()):
+        delete_offload_parameter(module, param_name)
+
+    delattr(base, name)
+
+
+@check_accelerate(fallback="error")
+def force_cpu_offload(
+    module: torch.nn.Module, execution_device: torch.device
+) -> torch.nn.Module:
+    """
+    Force cpu offloading a module, primarily used for testing
+
+    :param module: module containing parameters to offload
+    :param execution_device: execution device submodules
+    :return: module with hooks to perform cpu offloading
+    """
+    # edge case: there is a bug in `dispatch_model` which causes
+    # the function to only work if the model contains submodules
+    if next(module.children(), None) is None:
+        attach_align_device_hook(
+            module,
+            execution_device=execution_device,
+            offload=True,
+            weights_map=module.state_dict(),
+            tied_params_map={},
+        )
+        return module
+
+    device_map = {}
+
+    def collect_device_map(name: List[str], module: torch.nn.Module):
+        if next(module.parameters(recurse=False), None) is not None:
+            device_map[".".join(name)] = "cpu"
+            return
+
+        else:
+            for submodule_name, submodule in module.named_children():
+                name.append(submodule_name)
+                collect_device_map(name, submodule)
+                name.pop()
+
+    collect_device_map([], module)
+
+    return dispatch_model(
+        module, device_map, main_device=execution_device, force_hooks=True
+    )
+
+
 """ Upstreamed Functions """

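A hedged usage sketch for the three new offload helpers, assuming `accelerate` is installed and that the functions are importable from the offload module shown above (they are also added to `__all__`). force_cpu_offload is described in the diff as primarily a testing utility: parameters live on CPU and are aligned to the execution device around each forward pass.

import torch
from compressed_tensors.utils.offload import (
    delete_offload_module,
    force_cpu_offload,
    register_offload_module,
)

# dispatch a small model with CPU offloading hooks
model = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.Linear(16, 16))
model = force_cpu_offload(model, execution_device=torch.device("cpu"))

# attach a new submodule; if the parent is offloaded, its parameters are added
# to the parent's weights map and the submodule receives an AlignDevicesHook
register_offload_module(model, "extra", torch.nn.Linear(16, 16))

# remove it again, deleting its offloaded parameters first
delete_offload_module(model, "extra")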
src/compressed_tensors/version.py

@@ -17,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE

-__version__ = version = '0.9.5.a20250530'
+__version__ = version = '0.9.5.a20250603'
 __version_tuple__ = version_tuple = (0, 9, 5)
src/compressed_tensors.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.9.5a20250530
+Version: 0.9.5a20250603
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
tests/test_compressors/quantized_compressors/test_fp8_quant.py

@@ -61,8 +61,8 @@ def make_dummy_g_idx(columns: int, group_size: int) -> torch.Tensor:
        [
            QuantizationStrategy.GROUP,
            128,
-            torch.rand((512, 8, 1)) * 0.01,
-            torch.zeros((512, 8, 1), dtype=torch.int8),
+            torch.rand((512, 8)) * 0.01,
+            torch.zeros((512, 8), dtype=torch.int8),
        ],
        [
            QuantizationStrategy.CHANNEL,
@@ -79,7 +79,7 @@ def test_quant_format(strategy, group_size, sc, zp):
        "dummy.weight_zero_point": torch.tensor(zp, dtype=torch.float32),
    }
    if group_size is not None:
-        dense_state_dict["dummy.weight_g_idx"] = make_dummy_g_idx(512, group_size)
+        dense_state_dict["dummy.weight_g_idx"] = make_dummy_g_idx(1024, group_size)

    quant_config = get_dummy_quant_config(strategy=strategy, group_size=group_size)

tests/test_compressors/quantized_compressors/test_int_quant.py

@@ -53,8 +53,8 @@ def get_dummy_quant_config(strategy, group_size=None, symmetric=True):
            QuantizationStrategy.GROUP,
            True,
            128,
-            torch.rand((512, 8, 1)) * 0.01,
-            torch.zeros((512, 8, 1), dtype=torch.int8),
+            torch.rand((512, 8)) * 0.01,
+            torch.zeros((512, 8), dtype=torch.int8),
        ],
        [
            QuantizationStrategy.CHANNEL,
tests/test_quantization/lifecycle/test_forward.py

@@ -108,8 +108,8 @@ def test_forward_quantize(
            "int",
            QuantizationStrategy.GROUP,
            128,
-            torch.rand((512, 8, 1)) * 0.01,
-            torch.zeros((512, 8, 1)),
+            torch.rand((512, 8)) * 0.01,
+            torch.zeros((512, 8)),
            None,
        ),
        (
@@ -117,8 +117,8 @@ def test_forward_quantize(
            "int",
            QuantizationStrategy.GROUP,
            128,
-            torch.rand((512, 8, 1)) * 0.01,
-            torch.zeros((512, 8, 1)),
+            torch.rand((512, 8)) * 0.01,
+            torch.zeros((512, 8)),
            make_dummy_g_idx(1024, 128),
        ),
        (
@@ -135,8 +135,8 @@ def test_forward_quantize(
            "float",
            QuantizationStrategy.GROUP,
            128,
-            torch.rand((512, 8, 1)) * 0.01,
-            torch.zeros((512, 8, 1)),
+            torch.rand((512, 8)) * 0.01,
+            torch.zeros((512, 8)),
            None,
        ),
        (
@@ -144,8 +144,8 @@ def test_forward_quantize(
            "float",
            QuantizationStrategy.GROUP,
            128,
-            torch.rand((512, 8, 1)) * 0.01,
-            torch.zeros((512, 8, 1)),
+            torch.rand((512, 8)) * 0.01,
+            torch.zeros((512, 8)),
            make_dummy_g_idx(1024, 128),
        ),
    ],
@@ -174,8 +174,8 @@ def test_quantize(num_bits, type, strategy, group_size, scale, zero_point, g_idx
            "int",
            QuantizationStrategy.GROUP,
            128,
-            torch.rand((512, 8, 1)) * 0.01,
-            torch.zeros((512, 8, 1)),
+            torch.rand((512, 8)) * 0.01,
+            torch.zeros((512, 8)),
            None,
        ),
        (
@@ -183,8 +183,8 @@ def test_quantize(num_bits, type, strategy, group_size, scale, zero_point, g_idx
            "int",
            QuantizationStrategy.GROUP,
            128,
-            torch.rand((512, 8, 1)) * 0.01,
-            torch.zeros((512, 8, 1)),
+            torch.rand((512, 8)) * 0.01,
+            torch.zeros((512, 8)),
            make_dummy_g_idx(1024, 128),
        ),
    ],
tests/test_quantization/test_utils/test_helpers.py

@@ -20,10 +20,7 @@ from compressed_tensors.quantization import (
     QuantizationArgs,
     QuantizationStrategy,
 )
-from compressed_tensors.quantization.utils import (
-    calculate_qparams,
-    generate_global_scale,
-)
+from compressed_tensors.quantization.utils import calculate_qparams, generate_gparam


 @pytest.mark.parametrize(
@@ -70,7 +67,8 @@ def test_fused_global_scales():
     layer = torch.nn.Linear(7, 8)
     max_tensor_value = torch.abs(layer.weight.data).max()
     # use defaults
-    global_scale = generate_global_scale(layer.weight)
+    min_val, max_val = torch.aminmax(layer.weight)
+    global_scale = generate_gparam(min_val.data, max_val.data)
     # max value should be = (448 * 6) / global_scale
     assert max_tensor_value == pytest.approx(
         FP4_E2M1_DATA.max * FP8_E4M3_DATA.max / global_scale, abs=0.001