compressed-tensors 0.12.3a20251003__py3-none-any.whl → 0.12.3a20251007__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- a/compressed_tensors/quantization/lifecycle/initialize.py
+++ b/compressed_tensors/quantization/lifecycle/initialize.py
@@ -14,14 +14,13 @@
 
 
 import logging
-import math
-import warnings
-from typing import Optional
+from typing import Optional, Tuple
 
 import torch
 from compressed_tensors.quantization import (
     FP8_E4M3_DATA,
     ActivationOrdering,
+    DynamicType,
     KVCacheScaleType,
     QuantizationArgs,
     QuantizationMetadata,
@@ -32,7 +31,11 @@ from compressed_tensors.quantization import (
 from compressed_tensors.quantization.lifecycle.forward import (
     wrap_module_forward_quantized,
 )
-from compressed_tensors.quantization.utils import is_fp4, is_kv_cache_quant_scheme
+from compressed_tensors.quantization.utils import (
+    is_fp4,
+    is_kv_cache_quant_scheme,
+    strategy_cdiv,
+)
 from compressed_tensors.utils import (
     disable_hf_hook,
     get_execution_device,
@@ -44,6 +47,7 @@ from torch.nn import Module, Parameter
 __all__ = [
     "initialize_module_for_quantization",
     "is_attention_module",
+    "initialize_qparams",
 ]
 
 
@@ -69,10 +73,8 @@ def initialize_module_for_quantization(
     :param force_zero_point: whether to force initialization of a zero point for
         symmetric quantization
     """
-    # TODO: don't initialize parameters when running decompression
    scheme = scheme or getattr(module, "quantization_scheme", None)
     if scheme is None:
-        # no scheme passed and layer not targeted for quantization - skip
         return
 
     QuantizationMetadata.clear_all_qparams(module)
@@ -82,38 +84,52 @@
         _initialize_attn_scales(module)
 
     else:
+        if not isinstance(module, torch.nn.Linear):
+            _LOGGER.warning(f"Attempting to quantize module of type {type(module)}")
+
+        # use weight to determine observed shapes and dtype
+        if hasattr(module, "weight"):
+            weight = module.weight
+            assert isinstance(weight, torch.Tensor)
+        else:
+            # Note that a weight is required for both weight and activation
+            # quantization in order to know the dtype of activation scales
+            _LOGGER.warning(
+                f"module type {type(module)} targeted for quantization but "
+                f"has no attribute weight, skipping quantization for {type(module)}"
+            )
+            return
+
         if scheme.input_activations is not None:
-            _initialize_scale_zero_point(
+            initialize_qparams(
                 module,
                 "input",
                 scheme.input_activations,
+                observed_shape=weight.shape[-1:],
+                observed_dtype=weight.dtype,
                 force_zero_point=force_zero_point,
             )
 
         if scheme.weights is not None:
-            if hasattr(module, "weight"):
-                weight_shape = None
-                if isinstance(module, torch.nn.Linear):
-                    weight_shape = module.weight.shape
-                _initialize_scale_zero_point(
-                    module,
-                    "weight",
-                    scheme.weights,
-                    weight_shape=weight_shape,
-                    force_zero_point=force_zero_point,
-                )
-            else:
-                _LOGGER.warning(
-                    f"module type {type(module)} targeted for weight quantization but "
-                    "has no attribute weight, skipping weight quantization "
-                    f"for {type(module)}"
-                )
-
-        if scheme.output_activations is not None:
-            if not is_kv_cache_quant_scheme(scheme):
-                _initialize_scale_zero_point(
-                    module, "output", scheme.output_activations
-                )
+            initialize_qparams(
+                module,
+                "weight",
+                scheme.weights,
+                observed_shape=weight.shape,
+                observed_dtype=weight.dtype,
+                force_zero_point=force_zero_point,
+            )
+
+        output_is_kv_cache = is_kv_cache_quant_scheme(scheme)
+        if scheme.output_activations is not None and not output_is_kv_cache:
+            initialize_qparams(
+                module,
+                "output",
+                scheme.output_activations,
+                observed_shape=weight.shape[:-1],
+                observed_dtype=weight.dtype,
+                force_zero_point=force_zero_point,
+            )
 
     module.quantization_scheme = scheme
     module.quantization_status = QuantizationStatus.INITIALIZED
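
For orientation, a minimal sketch (not part of the package) of the observed shapes the rewritten branch derives from a hypothetical nn.Linear weight; only torch is assumed, and the layer sizes are illustrative:

    import torch

    # hypothetical layer: torch.nn.Linear(in_features=4096, out_features=11008)
    weight = torch.empty(11008, 4096, dtype=torch.bfloat16)

    # shapes forwarded to initialize_qparams by the new code path above:
    input_observed = weight.shape[-1:]   # torch.Size([4096]): activations enter with in_features
    weight_observed = weight.shape       # torch.Size([11008, 4096])
    output_observed = weight.shape[:-1]  # torch.Size([11008]): outputs leave with out_features

Note that after this change a weight attribute is required even for pure activation quantization, since its dtype determines the dtype of the activation scales.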
@@ -132,22 +148,40 @@ def is_attention_module(module: Module):
     )
 
 
-def _initialize_scale_zero_point(
+def initialize_qparams(
     module: Module,
     base_name: str,
     quantization_args: QuantizationArgs,
-    weight_shape: Optional[torch.Size] = None,
+    observed_shape: Tuple[int],
+    observed_dtype: torch.dtype,
     force_zero_point: bool = True,
 ):
-    if quantization_args.dynamic is True:
-        return
+    """
+    Initialize quantization parameters for a given base name according to the passed
+    quantization args. The shape and dtype of the observed weight/activation must also
+    be provided.
+
+    Scales will always be initialized. Global scales are initialized depending on args.
+    Zero points will be initialized if not symmetric or if `force_zero_point` is True.
+
+    :param module: module to register qparams to
+    :param base_name: base name of qparams, for example "input", "weight", "k", "v"
+    :param quantization_args: arguments for quantization
+    :param observed_shape: last (right-most) known dimensions of the observed weight/act
+    :param observed_dtype: dtype of the observed weight/act
+    :param force_zero_point: force the zero_point parameter to be initialized
+    """
+    strategy = quantization_args.strategy
+    dynamic = quantization_args.dynamic
+    actorder = quantization_args.actorder
+    device = get_execution_device(module)  # avoid performing initialization ops on cpu
 
-    # initialize on execution device to avoid performing quantized ops on cpu
-    device = get_execution_device(module)
+    # Skip all initialization for fully dynamic quantization
+    if dynamic is True:
+        return
 
-    # 1. Create global_scales for tensor_group - generates
-    # a per tensor scale
-    if quantization_args.strategy == QuantizationStrategy.TENSOR_GROUP:
+    # 0. Create global scale for tensor-group quantization
+    if strategy == QuantizationStrategy.TENSOR_GROUP:
         init_global_scale = Parameter(
             torch.empty(1, dtype=torch.float32, device=device),
             requires_grad=False,
@@ -156,56 +190,55 @@ def _initialize_scale_zero_point(
             module, f"{base_name}_global_scale", init_global_scale
         )
 
-    # 2. Infer expected scale/zero point shape
-    if quantization_args.strategy == QuantizationStrategy.TOKEN:
+    # Skip scale/zp initialization for locally dynamic quantization
+    if dynamic == DynamicType.LOCAL:
+        return
+
+    # 1. Infer expected scale/zp shape
+    if strategy == QuantizationStrategy.TENSOR:
+        expected_shape = (1,)
+
+    elif strategy == QuantizationStrategy.TOKEN:
         expected_shape = (1, 1)
+
+    elif strategy == QuantizationStrategy.CHANNEL:
+        if len(observed_shape) < 2:
+            raise ValueError("Channel quant requires at least 2 observed dimensions")
+
+        expected_shape = (observed_shape[-2], 1)
+
+    elif strategy in (QuantizationStrategy.GROUP, QuantizationStrategy.TENSOR_GROUP):
+        assert quantization_args.group_size is not None
+        if len(observed_shape) < 1:
+            raise ValueError("Group quant requires at least 1 observed dimension")
+
+        group_size = quantization_args.group_size
+        num_groups = strategy_cdiv(observed_shape[-1], group_size, strategy)
+        expected_shape = (*observed_shape[:-1], num_groups)
+
+        # initialize activation ordering if applicable
+        if actorder == ActivationOrdering.GROUP:
+            init_g_idx = Parameter(
+                torch.full((observed_shape[-1],), -1, device=device, dtype=torch.int),
+                requires_grad=False,
+            )
+            register_offload_parameter(module, f"{base_name}_g_idx", init_g_idx)
+
+    elif strategy == QuantizationStrategy.BLOCK:
+        assert quantization_args.block_structure is not None
+        if len(observed_shape) < 2:
+            raise ValueError("Block quant requires at least 2 observed dimensions")
+
+        block_structure = quantization_args.block_structure
+        num_rows = strategy_cdiv(observed_shape[-2], block_structure[-2], strategy)
+        num_cols = strategy_cdiv(observed_shape[-1], block_structure[-1], strategy)
+        expected_shape = (num_rows, num_cols)
+
     else:
-        expected_shape = 1
-
-    if base_name == "weight" and weight_shape is not None:
-        if quantization_args.strategy == QuantizationStrategy.CHANNEL:
-            # (output_channels, 1) - only for weights
-            expected_shape = (weight_shape[0], 1)
-        elif quantization_args.strategy in (
-            QuantizationStrategy.TENSOR_GROUP,
-            QuantizationStrategy.GROUP,
-        ):
-            # GROUP/TENSOR_GROUP for both weights and activations
-            num_groups = math.ceil(weight_shape[1] / quantization_args.group_size)
-            expected_shape = (weight_shape[0], max(num_groups, 1))
-        elif quantization_args.strategy == QuantizationStrategy.BLOCK:
-            # For block quantization, scale shape should match number of blocks - only
-            # for weights
-            if quantization_args.block_structure is None:
-                raise ValueError(
-                    "Block quantization requires block_structure to be specified"
-                )
-            block_height, block_width = quantization_args.block_structure
-            rows, cols = weight_shape[-2], weight_shape[-1]
-            num_rows_blocks = math.ceil(rows / block_height)
-            num_cols_blocks = math.ceil(cols / block_width)
-
-            # Warn if dimensions don't divide evenly
-            if rows % block_height != 0 or cols % block_width != 0:
-                warnings.warn(
-                    f"Block quantization: tensor shape {weight_shape} does not divide"
-                    f"evenly by block structure {quantization_args.block_structure}. "
-                    f"Some blocks will be incomplete which may affect quantization"
-                    "quality.",
-                    UserWarning,
-                )
-
-            expected_shape = (num_rows_blocks, num_cols_blocks)
-        elif quantization_args.strategy == QuantizationStrategy.BLOCK:
-            warnings.warn(
-                f"BLOCK quantization not supported for {base_name} activations. "
-                f"Falling back to tensor-level quantization.",
-                UserWarning,
-            )
-            expected_shape = 1
+        assert False, f"Unknown strategy {strategy}"
 
-    # 3. Identify quantization scale and zp dtype
-    scale_dtype = module.weight.dtype
+    # 2. Identify quantization scale and zp dtype
+    scale_dtype = observed_dtype
 
     if is_fp4(quantization_args=quantization_args):
         scale_dtype = zp_dtype = FP8_E4M3_DATA.dtype
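
As a worked example of the shape inference above, a sketch assuming a hypothetical (11008, 4096) weight with group_size=128 and block_structure=[128, 128]; strategy_cdiv reduces to math.ceil here because the dimensions divide evenly:

    import math

    observed_shape = (11008, 4096)  # (out_features, in_features), illustrative only

    tensor_shape = (1,)                              # TENSOR
    token_shape = (1, 1)                             # TOKEN
    channel_shape = (observed_shape[-2], 1)          # CHANNEL -> (11008, 1)

    num_groups = math.ceil(observed_shape[-1] / 128)  # GROUP/TENSOR_GROUP -> 32
    group_shape = (*observed_shape[:-1], num_groups)  # (11008, 32)

    block_shape = (math.ceil(observed_shape[-2] / 128),   # BLOCK -> (86, 32)
                   math.ceil(observed_shape[-1] / 128))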
@@ -221,14 +254,12 @@ def _initialize_scale_zero_point(
             scale_dtype = torch.bfloat16
         zp_dtype = quantization_args.pytorch_dtype()
 
-    # 4. Initializes empty scale, zero point, and g_idx parameters for the module
-    # do not init scales for quantzation_args.dynamic == DynamicType.local
-    if not quantization_args.dynamic:
-        init_scale = Parameter(
-            torch.empty(expected_shape, dtype=scale_dtype, device=device),
-            requires_grad=False,
-        )
-        register_offload_parameter(module, f"{base_name}_scale", init_scale)
+    # 3. Initializes scale/zp for the module
+    init_scale = Parameter(
+        torch.empty(expected_shape, dtype=scale_dtype, device=device),
+        requires_grad=False,
+    )
+    register_offload_parameter(module, f"{base_name}_scale", init_scale)
 
     if force_zero_point or not quantization_args.symmetric:
         init_zero_point = Parameter(
@@ -237,16 +268,6 @@
         )
         register_offload_parameter(module, f"{base_name}_zero_point", init_zero_point)
 
-    # only grouped activation ordering has g_idx
-    if quantization_args.actorder == ActivationOrdering.GROUP:
-        g_idx_shape = (weight_shape[1],)
-        g_idx_dtype = torch.int
-        init_g_idx = Parameter(
-            torch.full(g_idx_shape, -1, device=device, dtype=g_idx_dtype),
-            requires_grad=False,
-        )
-        register_offload_parameter(module, f"{base_name}_g_idx", init_g_idx)
-
 
 def _initialize_attn_scales(module: Module) -> None:
     """Initialize k_scale, v_scale for self_attn"""
--- a/compressed_tensors/quantization/utils/helpers.py
+++ b/compressed_tensors/quantization/utils/helpers.py
@@ -27,6 +27,7 @@ from compressed_tensors.quantization.quant_args import (
 )
 from compressed_tensors.quantization.quant_scheme import QuantizationScheme
 from compressed_tensors.utils import deprecated
+from loguru import logger
 from torch import FloatTensor, IntTensor, Tensor
 from torch.nn import Module
 
@@ -47,6 +48,7 @@ __all__ = [
     "calculate_qparams",
     "generate_gparam",
     "is_fp4",
+    "strategy_cdiv",
 ]
 
 # target the self_attn layer
@@ -461,3 +463,26 @@ def generate_gparam(
     max_val_pos = torch.max(torch.abs(min_vals), torch.abs(max_vals))
     global_scale = scale_data.max * quant_data.max / max_val_pos
     return global_scale.to(dtype).reshape([1])
+
+
+def strategy_cdiv(
+    value: int,
+    divisor: int,
+    strategy: Optional[QuantizationStrategy],
+    strict: bool = False,
+) -> int:
+    dividend = math.ceil(value / divisor)
+    if dividend * divisor != value:
+        message = (
+            f"{strategy} quantization strategy requires strict division of "
+            f"weight/activation size {value} and group/block size {divisor}. "
+            "consider reducing the group/block size or ignoring modules with "
+            f"weights not divisible by {divisor}"
+        )
+        if strict:
+            raise ValueError(message)
+
+        else:
+            logger.bind(log_once=True).warning(message)
+
+    return dividend
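
A usage sketch for the new helper; the import paths follow the __all__ addition and the initialize.py import above, and the sizes are illustrative:

    from compressed_tensors.quantization import QuantizationStrategy
    from compressed_tensors.quantization.utils import strategy_cdiv

    strategy_cdiv(4096, 128, QuantizationStrategy.GROUP)              # 32, divides evenly
    strategy_cdiv(100, 32, QuantizationStrategy.GROUP)                # 4, logs a warning once
    strategy_cdiv(100, 32, QuantizationStrategy.GROUP, strict=True)   # raises ValueError

Despite its name, the returned `dividend` is the ceiling quotient, matching the previous math.ceil behavior but with divisibility reporting.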
--- a/compressed_tensors/version.py
+++ b/compressed_tensors/version.py
@@ -17,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
 
-__version__ = version = '0.12.3.a20251003'
+__version__ = version = '0.12.3.a20251007'
 __version_tuple__ = version_tuple = (0, 12, 3)
--- a/compressed_tensors-0.12.3a20251003.dist-info/METADATA
+++ b/compressed_tensors-0.12.3a20251007.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.12.3a20251003
+Version: 0.12.3a20251007
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
--- a/compressed_tensors-0.12.3a20251003.dist-info/RECORD
+++ b/compressed_tensors-0.12.3a20251007.dist-info/RECORD
@@ -1,7 +1,7 @@
 compressed_tensors/__init__.py,sha256=SRqNYFVvxAaLa4SImhoiIBKfoOSj7EUdx0CxXjGC2PA,884
 compressed_tensors/base.py,sha256=-gxWvDF4LCkyeDP8YlGzvBBKxo4Dk9h4NINPD61drFU,921
 compressed_tensors/logger.py,sha256=sTm1Od1cV0aDxBm3YN-PPvsOATxY_2tBV62TQE4HiPw,4032
-compressed_tensors/version.py,sha256=8cvrsHpab_kfxFsOLRqkafZiztV6QoX8qSdtWAslAE8,523
+compressed_tensors/version.py,sha256=wUAf4k-SKBfmo_lva-t_YeLxw6mVgmYfFIXdh6YQLP4,523
 compressed_tensors/compressors/__init__.py,sha256=smSygTSfcfuujRrAXDc6uZm4L_ccV1tWZewqVnOb4lM,825
 compressed_tensors/compressors/base.py,sha256=nvWsv4xEw1Tkxkxth6TmHplDYXfBeP22xWxOsZERyDY,7204
 compressed_tensors/compressors/helpers.py,sha256=OK6qxX9j3bHwF9JfIYSGMgBJe2PWjlTA3byXKCJaTIQ,5431
@@ -37,9 +37,9 @@ compressed_tensors/quantization/lifecycle/apply.py,sha256=1zRc7tQbE5OAVJ5VRgU9FZ
 compressed_tensors/quantization/lifecycle/compressed.py,sha256=_gTH0CnLe8MxkTY1hrCCeSYAMzuvIwoCTT4hxW1TPk4,2354
 compressed_tensors/quantization/lifecycle/forward.py,sha256=MAw049L4a9ha4P5D4MjOMoIcSwv9_ZXizahYzHJaaQI,17550
 compressed_tensors/quantization/lifecycle/helpers.py,sha256=C0mhy2vJ0fCjVeN4kFNhw8Eq1wkteBGHiZ36RVLThRY,944
-compressed_tensors/quantization/lifecycle/initialize.py,sha256=1vCHN1LLSIoh6IgGAyE-Jan64HvuiChwdGhslbYkXhk,9832
+compressed_tensors/quantization/lifecycle/initialize.py,sha256=xebqRiQz3hiSTYwCQQsovg-IKJtHkAbuj6eWygf5yKY,10259
 compressed_tensors/quantization/utils/__init__.py,sha256=VdtEmP0bvuND_IGQnyqUPc5lnFp-1_yD7StKSX4x80w,656
-compressed_tensors/quantization/utils/helpers.py,sha256=fyAjq0YbrZG6_UbEU7n6Z1i-Bg4PLA-mQiu7mHS9fs0,16420
+compressed_tensors/quantization/utils/helpers.py,sha256=BA-twfAKk-HMBr_OZHZnSQN7F1a0l5zB1kJhml6j-cI,17146
 compressed_tensors/registry/__init__.py,sha256=FwLSNYqfIrb5JD_6OK_MT4_svvKTN_nEhpgQlQvGbjI,658
 compressed_tensors/registry/registry.py,sha256=cWnlwZ66lgG0w9OAUEAgq5XVxqsgFm1o8ZYdNhkNvJY,11957
 compressed_tensors/transform/__init__.py,sha256=v2wfl4CMfA6KbD7Hxx_MbRev63y_6QLDlccZq-WTtdw,907
@@ -65,8 +65,8 @@ compressed_tensors/utils/permutations_24.py,sha256=kx6fsfDHebx94zsSzhXGyCyuC9sVy
 compressed_tensors/utils/safetensors_load.py,sha256=Vql34aCTDHwmTZXJHzCyBISJo7iA7EQ78LdTlMjdpZo,12023
 compressed_tensors/utils/semi_structured_conversions.py,sha256=XKNffPum54kPASgqKzgKvyeqWPAkair2XEQXjkp7ho8,13489
 compressed_tensors/utils/type.py,sha256=bNwoo_FWlvLuDpYAGGzZJITRg0JA_Ngk9LGPo-kvjeU,2554
-compressed_tensors-0.12.3a20251003.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-compressed_tensors-0.12.3a20251003.dist-info/METADATA,sha256=8jYvFdUw29ME2nRPHzr3TcPKJ6wtt_iP2pfjIF6WzY0,7027
-compressed_tensors-0.12.3a20251003.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-compressed_tensors-0.12.3a20251003.dist-info/top_level.txt,sha256=w2i-GyPs2s1UwVxvutSvN_lM22SXC2hQFBmoMcPnV7Y,19
-compressed_tensors-0.12.3a20251003.dist-info/RECORD,,
+compressed_tensors-0.12.3a20251007.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+compressed_tensors-0.12.3a20251007.dist-info/METADATA,sha256=8uUWt8bF7sZhbMQd2Llj4PDLC7I4ALKJE_eFNa8DBWI,7027
+compressed_tensors-0.12.3a20251007.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+compressed_tensors-0.12.3a20251007.dist-info/top_level.txt,sha256=w2i-GyPs2s1UwVxvutSvN_lM22SXC2hQFBmoMcPnV7Y,19
+compressed_tensors-0.12.3a20251007.dist-info/RECORD,,