compressed-tensors 0.9.4a20250414__py3-none-any.whl → 0.9.5a20250424__py3-none-any.whl
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- compressed_tensors/compressors/base.py +6 -1
- compressed_tensors/compressors/model_compressors/model_compressor.py +90 -7
- compressed_tensors/compressors/quantized_compressors/base.py +21 -6
- compressed_tensors/compressors/quantized_compressors/pack_quantized.py +88 -21
- compressed_tensors/compressors/sparse_compressors/base.py +21 -4
- compressed_tensors/quantization/lifecycle/apply.py +65 -30
- compressed_tensors/quantization/lifecycle/initialize.py +13 -2
- compressed_tensors/utils/offload.py +20 -17
- compressed_tensors/utils/safetensors_load.py +10 -8
- compressed_tensors/version.py +2 -2
- {compressed_tensors-0.9.4a20250414.dist-info → compressed_tensors-0.9.5a20250424.dist-info}/METADATA +1 -1
- {compressed_tensors-0.9.4a20250414.dist-info → compressed_tensors-0.9.5a20250424.dist-info}/RECORD +15 -15
- {compressed_tensors-0.9.4a20250414.dist-info → compressed_tensors-0.9.5a20250424.dist-info}/WHEEL +1 -1
- {compressed_tensors-0.9.4a20250414.dist-info → compressed_tensors-0.9.5a20250424.dist-info}/licenses/LICENSE +0 -0
- {compressed_tensors-0.9.4a20250414.dist-info → compressed_tensors-0.9.5a20250424.dist-info}/top_level.txt +0 -0
compressed_tensors/compressors/base.py
CHANGED
@@ -19,6 +19,7 @@ import torch
 from compressed_tensors.config import SparsityCompressionConfig
 from compressed_tensors.quantization import QuantizationArgs, QuantizationConfig
 from compressed_tensors.registry import RegistryMixin
+from compressed_tensors.utils import has_offloaded_params
 from torch import Tensor
 from torch.nn import Module

@@ -169,6 +170,10 @@ class BaseCompressor(RegistryMixin, ABC):
         :param module: PyTorch module to decompress
         :return: tensor of the decompressed weight, or None if module is not quantized
         """
+
+        params_device = next(module.parameters()).device
+        device = "cpu" if has_offloaded_params(module) else params_device
+
         if not hasattr(module, "quantization_scheme"):
             return None  # module is not quantized
         quantization_scheme = module.quantization_scheme
@@ -182,7 +187,7 @@ class BaseCompressor(RegistryMixin, ABC):

         return self.decompress_weight(
             compressed_data=compressed_data, quantization_args=quantization_args
-        )
+        ).to(device)

     def decompress_weight(
         self, compressed_data: Dict[str, Tensor], **kwargs
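The change above makes `decompress_module` pick a target device before decompressing: CPU when the module's parameters are offloaded, otherwise the device of the module's own parameters. A minimal, dependency-free sketch of that device-selection idea, using a hypothetical `_has_offload_hook` check in place of the library's `has_offloaded_params` (which inspects the accelerate hook):

```python
import torch


def _has_offload_hook(module: torch.nn.Module) -> bool:
    # stand-in for compressed_tensors.utils.has_offloaded_params, which checks
    # whether accelerate has attached an offloading hook to the module
    return getattr(module, "_hf_hook", None) is not None


def pick_decompression_device(module: torch.nn.Module) -> torch.device:
    # offloaded modules keep their real weights off-device, so decompressed
    # tensors are materialized on CPU; otherwise reuse the parameters' device
    params_device = next(module.parameters()).device
    return torch.device("cpu") if _has_offload_hook(module) else params_device


linear = torch.nn.Linear(8, 8)
print(pick_decompression_device(linear))  # cpu (or the module's device if it was moved)
```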
compressed_tensors/compressors/model_compressors/model_compressor.py
CHANGED
@@ -31,13 +31,14 @@ from compressed_tensors.base import (
     SPARSITY_CONFIG_NAME,
 )
 from compressed_tensors.compressors.base import BaseCompressor
+from compressed_tensors.compressors.sparse_compressors import DenseCompressor
 from compressed_tensors.config import CompressionFormat, SparsityCompressionConfig
 from compressed_tensors.quantization import (
     DEFAULT_QUANTIZATION_METHOD,
     QuantizationConfig,
     QuantizationStatus,
     apply_quantization_config,
-    load_pretrained_quantization,
+    load_pretrained_quantization_parameters,
 )
 from compressed_tensors.quantization.lifecycle import expand_target_names
 from compressed_tensors.quantization.quant_args import QuantizationArgs
@@ -47,7 +48,9 @@ from compressed_tensors.quantization.utils import (
 )
 from compressed_tensors.utils import (
     get_safetensors_folder,
+    has_offloaded_params,
     merge_names,
+    register_offload_parameter,
     update_parameter_data,
 )
 from compressed_tensors.utils.helpers import (
@@ -382,6 +385,7 @@ class ModelCompressor:
             compressed_state_dict = self.quantization_compressor.compress(
                 state_dict, names_to_scheme=quantized_modules_to_args
             )
+
             if self.quantization_config.format != CompressionFormat.dense.value:
                 self.quantization_config.quantization_status = (
                     QuantizationStatus.COMPRESSED
@@ -411,6 +415,13 @@ class ModelCompressor:

         :param model_path: path to compressed weights
         :param model: pytorch model to load decompressed weights into
+
+        Note: decompress makes use of both _replace_sparsity_weights and _replace_weights
+        The variations in these methods are a result of the subtle variations between the sparsity
+        and quantization compressors. Specifically, quantization compressors return not just the
+        decompressed weight, but the quantization parameters (e.g scales, zero_point) whereas sparsity
+        compressors only return the decompressed weight.
+
         """
         model_path = get_safetensors_folder(model_path)
         sparse_decompressed = False
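The docstring note added above describes the key asymmetry in the decompression path: a sparsity compressor's generator yields one decompressed weight tensor per parameter name, while a quantization compressor's generator yields, per module, a dict that also carries the quantization parameters. A rough illustration of the two item shapes the two `_replace_*` helpers expect (module and parameter names are invented for the example):

```python
import torch

# what a sparsity decompressor yields: ("<module>.<param>", tensor)
sparse_item = ("model.layers.0.mlp.down_proj.weight", torch.zeros(16, 16))

# what a quantization decompressor yields: ("<module>", {param_name: tensor, ...})
quant_item = (
    "model.layers.0.mlp.down_proj",
    {
        "weight": torch.zeros(16, 16, dtype=torch.bfloat16),
        "weight_scale": torch.ones(16, 1, dtype=torch.bfloat16),
        "weight_zero_point": torch.zeros(16, 1, dtype=torch.int8),
    },
)

name, data = sparse_item
print(name.rsplit(".", 1))          # module prefix and parameter name are split apart
module_name, params = quant_item
print(module_name, sorted(params))  # one entry per decompressed/quantization parameter
```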
@@ -419,9 +430,16 @@
             self.sparsity_compressor is not None
             and self.sparsity_config.format != CompressionFormat.dense.value
         ):
+            params_to_ignore = None
+            if self.quantization_compressor is not None:
+                params_to_ignore = self.quantization_compressor.compression_param_names
             # Sparse decompression is applied on the model_path
-            dense_gen = self.sparsity_compressor.decompress(model_path)
-            self._replace_weights(dense_gen, model)
+            # The compressor will try and load any quantization parameters as well
+            # params_to_skip_load will skip over quantization params from being loaded
+            dense_gen = self.sparsity_compressor.decompress(
+                model_path, params_to_skip_load=params_to_ignore
+            )
+            self._replace_sparsity_weights(dense_gen, model)
             setattr(model, SPARSITY_CONFIG_NAME, self.sparsity_compressor.config)
             sparse_decompressed = True

@@ -430,13 +448,27 @@ class ModelCompressor:
             # quantization during apply_quantization_config. This ensures
             # that the dtypes of the weights are not unintentionally updated.
             # The status is restored after quantization params are loaded.
+
             with override_quantization_status(
                 self.quantization_config, QuantizationStatus.FROZEN
             ):
+
                 names_to_scheme = apply_quantization_config(
                     model, self.quantization_config
                 )
-                load_pretrained_quantization(model, model_path)
+                # Load activation scales/zp or any other quantization parameters
+                # Conditionally load the weight quantization parameters if we have a dense compressor
+                # Or if a sparsity compressor has already been applied
+                load_pretrained_quantization_parameters(
+                    model,
+                    model_path,
+                    # TODO: all weight quantization params will be moved to the compressor in a follow-up
+                    # including initialization
+                    load_weight_quantization=(
+                        sparse_decompressed
+                        or isinstance(self.quantization_compressor, DenseCompressor)
+                    ),
+                )

             model_path_or_state_dict = (
                 model.state_dict() if sparse_decompressed else model_path
@@ -445,6 +477,8 @@ class ModelCompressor:
             dense_gen = self.quantization_compressor.decompress(
                 model_path_or_state_dict, names_to_scheme=names_to_scheme
             )
+            # TODO: all weight quantization params will be moved to the compressor
+            # to prevent duplicate parameter updates in update_parameter_data
            self._replace_weights(dense_gen, model)

        def freeze_quantization_status(module):
@@ -500,7 +534,7 @@ class ModelCompressor:
         with open(config_file_path, "w") as config_file:
             json.dump(config_data, config_file, indent=2, sort_keys=True)

-    def _replace_weights(self, dense_weight_generator, model: Module):
+    def _replace_sparsity_weights(self, dense_weight_generator, model: Module):
         """
         Replace the weights of the model with the
         provided dense weights.
@@ -515,11 +549,60 @@ class ModelCompressor:
         :param model: The model whose weights are to be updated.
         """
         for name, data in tqdm(dense_weight_generator, desc="Decompressing model"):
+
             split_name = name.split(".")
             prefix, param_name = ".".join(split_name[:-1]), split_name[-1]
             module = operator.attrgetter(prefix)(model)
-
-
+
+            params_device = next(module.parameters()).device
+            device = "cpu" if has_offloaded_params(module) else params_device
+            delattr(module, param_name)
+            requires_grad = data.dtype in (torch.float16, torch.float32, torch.bfloat16)
+            param = torch.nn.Parameter(data.to(device), requires_grad=requires_grad)
+            register_offload_parameter(module, param_name, param)
+
+    def _replace_weights(self, dense_weight_generator, model: Module):
+        """
+        Replace the weights of the model with the
+        provided dense weights.
+
+        This method iterates over the dense_weight_generator and
+        updates the corresponding weights in the model. If a parameter
+        name does not exist in the model, it will be skipped.
+
+        :param dense_weight_generator (generator): A generator that yields
+            tuples of (name, data), where 'name' is the parameter name and
+            'data' is the updated param data
+        :param model: The model whose weights are to be updated.
+        """
+
+        for name, data in tqdm(dense_weight_generator, desc="Decompressing model"):
+            module = operator.attrgetter(name)(model)
+
+            params_device = next(module.parameters()).device
+            device = "cpu" if has_offloaded_params(module) else params_device
+
+            for param_name, param_data in data.items():
+                if hasattr(module, param_name):
+                    # If compressed, will have an incorrect dtype for transformers >4.49
+                    # TODO: we can also just skip initialization of scales/zp if in decompression in init
+                    # to be consistent with loading which happens later as well
+                    # however, update_data does a good shape check - should be moved to the compressor
+                    if param_name == "weight":
+                        delattr(module, param_name)
+                        requires_grad = param_data.dtype in (
+                            torch.float16,
+                            torch.float32,
+                            torch.bfloat16,
+                        )
+                        param = torch.nn.Parameter(
+                            param_data.to(device), requires_grad=requires_grad
+                        )
+                        register_offload_parameter(module, param_name, param)
+                    else:
+                        # Should already be registered to the correct device for
+                        # for scales/zero-points
+                        update_parameter_data(module, param_data, param_name)


 def map_modules_to_quant_args(
compressed_tensors/compressors/quantized_compressors/base.py
CHANGED
@@ -14,11 +14,11 @@

 import logging
 from pathlib import Path
-from typing import Any, Dict, Generator, Tuple, Union
+from typing import Any, Dict, Generator, Optional, Tuple, Union

 import torch
 from compressed_tensors.compressors.base import BaseCompressor
-from compressed_tensors.quantization import QuantizationArgs
+from compressed_tensors.quantization import QuantizationArgs, QuantizationStrategy
 from compressed_tensors.utils import (
     get_nested_mappings_from_state_dict,
     get_nested_weight_mappings,
@@ -132,8 +132,10 @@ class BaseQuantizationCompressor(BaseCompressor):
                     compressed_dict[merge_names(prefix, key)] = value
                 else:
                     compressed_dict[name] = value.to("cpu")
-            # only save if asym
-            elif is_weight_zp and quant_args_zp.symmetric:
+            # only save zp if asym and not packed zp
+            elif is_weight_zp and (
+                quant_args_zp.symmetric or self._check_if_zp_pack_quantized(quant_args)
+            ):
                 continue
             # only save if asym
             elif is_input_zp and input_args_zp.symmetric:
@@ -145,6 +147,17 @@ class BaseQuantizationCompressor(BaseCompressor):

         return compressed_dict

+    def _check_if_zp_pack_quantized(self, quant_args):
+        from compressed_tensors.compressors import PackedQuantizationCompressor
+
+        if isinstance(self, PackedQuantizationCompressor):
+            if not quant_args.symmetric and quant_args.strategy in [
+                QuantizationStrategy.GROUP.value,
+                QuantizationStrategy.CHANNEL.value,
+            ]:
+                return True
+        return False
+
     def decompress(
         self,
         path_to_model_or_tensors: Union[str, Path, Dict[str, Any]],
@@ -186,7 +199,8 @@ class BaseQuantizationCompressor(BaseCompressor):
                 decompressed = self.decompress_weight(
                     compressed_data=weight_data, quantization_args=quant_args
                 )
-                yield merge_names(weight_name, "weight"), decompressed
+                weight_data["weight"] = decompressed
+                yield weight_name, weight_data

     def _decompress_from_state_dict(self, state_dict, names_to_scheme):
         weight_mappings = get_nested_mappings_from_state_dict(
@@ -202,4 +216,5 @@ class BaseQuantizationCompressor(BaseCompressor):
             decompressed = self.decompress_weight(
                 compressed_data=weight_data, quantization_args=quant_args
             )
-            yield merge_names(weight_name, "weight"), decompressed
+            weight_data["weight"] = decompressed
+            yield weight_name, weight_data
compressed_tensors/compressors/quantized_compressors/pack_quantized.py
CHANGED
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import math
-from typing import Dict, Optional, Tuple
+from typing import Dict, Literal, Optional, Tuple, Union

 import numpy as np
 import torch
@@ -21,7 +21,7 @@ from compressed_tensors.compressors.quantized_compressors.base import (
     BaseQuantizationCompressor,
 )
 from compressed_tensors.config import CompressionFormat
-from compressed_tensors.quantization import QuantizationArgs
+from compressed_tensors.quantization import QuantizationArgs, QuantizationStrategy
 from compressed_tensors.quantization.lifecycle.forward import dequantize, quantize
 from compressed_tensors.quantization.utils import can_quantize
 from torch import Tensor
@@ -65,10 +65,26 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
         """
         pack_factor = 32 // quantization_args.num_bits
         packed_size = math.ceil(weight_shape[1] / pack_factor)
-        return {
+        packed_size_zp = math.ceil(weight_shape[0] / pack_factor)
+        output = {
             "weight_packed": (torch.Size((weight_shape[0], packed_size)), torch.int32),
             "weight_shape": (torch.Size((2,)), torch.int32),
         }
+        if not quantization_args.symmetric and quantization_args.strategy in [
+            QuantizationStrategy.GROUP.value,
+            QuantizationStrategy.CHANNEL.value,
+        ]:
+            zp_factor = (
+                quantization_args.group_size
+                if quantization_args.strategy == QuantizationStrategy.GROUP.value
+                else weight_shape[-1]
+            )
+
+            output["weight_zero_point"] = (
+                torch.Size((packed_size_zp, weight_shape[-1] // zp_factor)),
+                torch.int32,
+            )
+        return output

     def compress_weight(
         self,
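As a concrete check of the shapes above: the packed zero point has one row per `32 // num_bits` original output rows and one column per quantization group. A small worked example with assumed dimensions (not taken from the diff):

```python
import math

# assumed example dimensions: a [4096, 11008] weight, 4-bit, group_size=128
num_bits, group_size = 4, 128
rows, cols = 4096, 11008

pack_factor = 32 // num_bits                    # 8 int4 values per int32
packed_size = math.ceil(cols / pack_factor)     # packed weight columns -> 1376
packed_size_zp = math.ceil(rows / pack_factor)  # packed zero-point rows -> 512
num_groups = cols // group_size                 # zero-point columns -> 86

print((rows, packed_size))           # weight_packed shape: (4096, 1376)
print((packed_size_zp, num_groups))  # weight_zero_point shape: (512, 86)
```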
@@ -104,6 +120,7 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
             quantized_weight = weight

         packed_weight = pack_to_int32(quantized_weight, quantization_args.num_bits)
+
         weight_shape = torch.tensor(weight.shape)
         if device is not None:
             packed_weight = packed_weight.to(device)
@@ -112,6 +129,15 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
         compressed_dict["weight_shape"] = weight_shape
         compressed_dict["weight_packed"] = packed_weight

+        # We typically don't compress zp; apart from when using the packed_compressor and when storing group/channel zp
+        if not quantization_args.symmetric and quantization_args.strategy in [
+            QuantizationStrategy.GROUP.value,
+            QuantizationStrategy.CHANNEL.value,
+        ]:
+            packed_zp = pack_to_int32(
+                zero_point, quantization_args.num_bits, packed_dim=0
+            )
+            compressed_dict["weight_zero_point"] = packed_zp
         return compressed_dict

     def decompress_weight(
@@ -133,6 +159,21 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
         original_shape = torch.Size(compressed_data["weight_shape"])
         num_bits = quantization_args.num_bits
         unpacked = unpack_from_int32(weight, num_bits, original_shape)
+
+        # NOTE: this will fail decompression as we don't currently handle packed zp on decompression
+        if not quantization_args.symmetric and quantization_args.strategy in [
+            QuantizationStrategy.GROUP.value,
+            QuantizationStrategy.CHANNEL.value,
+        ]:
+            raise ValueError(
+                "Decompression of packed zero points is currently not supported"
+            )
+            assert zero_point is not None
+            original_zp_shape = (original_shape[0], scale.shape[-1])
+            zero_point = unpack_from_int32(
+                zero_point, num_bits, original_zp_shape, packed_dim=0
+            )
+
         decompressed_weight = dequantize(
             x_q=unpacked, scale=scale, zero_point=zero_point, g_idx=g_idx
         )
@@ -140,7 +181,11 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
         return decompressed_weight


-def pack_to_int32(value: torch.Tensor, num_bits: int) -> torch.Tensor:
+def pack_to_int32(
+    value: torch.Tensor,
+    num_bits: int,
+    packed_dim: Union[Literal[0], Literal[1]] = 1,
+) -> torch.Tensor:
     """
     Packs a tensor of quantized weights stored in int8 into int32s with padding

@@ -176,14 +221,19 @@ def pack_to_int32(value: torch.Tensor, num_bits: int) -> torch.Tensor:
     pack_factor = 32 // num_bits

     # pad input tensor and initialize packed output
-    packed_size = math.ceil(value.shape[1] / pack_factor)
-    padding = packed_size * pack_factor - value.shape[1]
+    packed_size = math.ceil(value.shape[packed_dim] / pack_factor)
+    padding = packed_size * pack_factor - value.shape[packed_dim]
     value = np.pad(value, pad_width=[(0, 0), (0, padding)], constant_values=0)

     # pack values
-    packed = np.zeros((value.shape[0], packed_size), dtype=np.uint32)
-    for i in range(pack_factor):
-        packed |= value[:, i::pack_factor] << num_bits * i
+    if packed_dim == 1:
+        packed = np.zeros((value.shape[0], packed_size), dtype=np.uint32)
+        for i in range(pack_factor):
+            packed |= value[:, i::pack_factor] << num_bits * i
+    else:
+        packed = np.zeros((packed_size, value.shape[1]), dtype=np.uint32)
+        for i in range(pack_factor):
+            packed |= value[i::pack_factor, :] << num_bits * i

     # convert back to signed and torch
     packed = np.ascontiguousarray(packed).view(np.int32)
@@ -191,7 +241,10 @@ def unpack_from_int32(


 def unpack_from_int32(
-    value: torch.Tensor, num_bits: int, shape: torch.Size
+    value: torch.Tensor,
+    num_bits: int,
+    shape: torch.Size,
+    packed_dim: Union[Literal[0], Literal[1]] = 1,
 ) -> torch.Tensor:
     """
     Unpacks a tensor of packed int32 weights into individual int8s, maintaining the
@@ -216,17 +269,31 @@ def unpack_from_int32(

     # unpack
     mask = (1 << num_bits) - 1
-    unpacked = torch.zeros(
-        (value.shape[0], value.shape[1] * pack_factor),
-        device=value.device,
-        dtype=torch.int32,
-    )
-    for i in range(pack_factor):
-        unpacked[:, i::pack_factor] = (value >> (num_bits * i)) & mask
-
-    # remove padding
-    original_row_size = int(shape[1])
-    unpacked = unpacked[:, :original_row_size]
+
+    if packed_dim == 1:
+        unpacked = torch.zeros(
+            (value.shape[0], value.shape[1] * pack_factor),
+            device=value.device,
+            dtype=torch.int32,
+        )
+        for i in range(pack_factor):
+            unpacked[:, i::pack_factor] = (value >> (num_bits * i)) & mask
+
+        # remove padding
+        original_row_size = int(shape[1])
+        unpacked = unpacked[:, :original_row_size]
+    else:
+        unpacked = torch.zeros(
+            (value.shape[0] * pack_factor, value.shape[1]),
+            device=value.device,
+            dtype=torch.int32,
+        )
+        for i in range(pack_factor):
+            unpacked[i::pack_factor, :] = (value >> (num_bits * i)) & mask
+
+        # remove padding
+        original_row_size = int(shape[0])
+        unpacked = unpacked[:original_row_size, :]

     # bits are packed in unsigned format, reformat to signed
     # update the value range from unsigned to signed
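The `packed_dim` switch above is the core of the change: low-bit values are bit-shifted into 32-bit words either along rows (`packed_dim=1`, used for weights) or along columns (`packed_dim=0`, now used for packed zero points). A standalone round-trip sketch of the same idea for 4-bit values along dim 1, simplified to unsigned values with no padding or sign handling:

```python
import numpy as np

num_bits = 4
pack_factor = 32 // num_bits  # 8 values per int32

# 2 rows x 8 columns of 4-bit values (kept unsigned here for simplicity)
values = np.arange(16, dtype=np.uint32).reshape(2, 8)

# pack along dim 1: fold each group of 8 columns into one uint32 word
packed = np.zeros((2, 1), dtype=np.uint32)
for i in range(pack_factor):
    packed |= values[:, i::pack_factor] << (num_bits * i)

# unpack by shifting back out and masking to 4 bits
mask = (1 << num_bits) - 1
unpacked = np.zeros_like(values)
for i in range(pack_factor):
    unpacked[:, i::pack_factor] = (packed >> (num_bits * i)) & mask

assert np.array_equal(values, unpacked)
print([hex(int(w)) for w in packed[:, 0]])  # ['0x76543210', '0xfedcba98']
```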
compressed_tensors/compressors/sparse_compressors/base.py
CHANGED
@@ -98,7 +98,11 @@ class BaseSparseCompressor(BaseCompressor):
         return compressed_dict

     def decompress(
-        self, path_to_model_or_tensors: str, device: str = "cpu", **kwargs
+        self,
+        path_to_model_or_tensors: str,
+        device: str = "cpu",
+        params_to_skip_load: Optional[Tuple] = None,
+        **kwargs,
     ) -> Generator[Tuple[str, Tensor], None, None]:
         """
         Reads a bitmask compressed state dict located
@@ -108,6 +112,11 @@ class BaseSparseCompressor(BaseCompressor):
         :param model_path: path to compressed safetensors model (directory with
             one or more safetensors files) or compressed tensors file
         :param device: device to load decompressed weights onto
+        :param params_to_skip_load: a list of non-sparsity parameters (e.g quantization
+            parameters) that we want to skip loading. As the sparsity compresssor does
+            not handle quantized decompression, this should contain any quantization
+            parameters when decompressing stacked compressors. We want these parameters
+            to be handled by the quantization decompressor
         :return: iterator for generating decompressed weights
         """
         weight_mappings, ignored_params = get_nested_weight_mappings(
@@ -121,13 +130,21 @@ class BaseSparseCompressor(BaseCompressor):
             full_name = merge_names(weight_name, param_name)
             with safe_open(safe_path, framework="pt", device=device) as f:
                 weight_data[param_name] = f.get_tensor(full_name)
+
             decompressed = self.decompress_weight(weight_data)
             yield merge_names(weight_name, "weight"), decompressed

         for ignored_param_name, safe_path in ignored_params.items():
-            with safe_open(safe_path, framework="pt", device=device) as f:
-                value = f.get_tensor(ignored_param_name)
-            yield ignored_param_name, value
+            should_skip = False
+            if params_to_skip_load is not None:
+                for param_to_skip in params_to_skip_load:
+                    if param_to_skip in ignored_param_name:
+                        should_skip = True
+
+            if not should_skip:
+                with safe_open(safe_path, framework="pt", device=device) as f:
+                    value = f.get_tensor(ignored_param_name)
+                yield ignored_param_name, value

     @staticmethod
     def should_compress(name: str, expanded_targets: Optional[Set[str]] = None) -> bool:
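The skip logic above is a plain substring match: a parameter is left for the quantization compressor to load if any entry of `params_to_skip_load` appears in its full name. A self-contained sketch of that filtering (parameter names are invented for the example):

```python
# assumed inputs: ignored (non-sparsity) parameter names found on disk, and the
# compression parameter names reported by a quantization compressor
ignored_params = [
    "model.layers.0.self_attn.q_proj.weight_scale",
    "model.layers.0.self_attn.q_proj.weight_zero_point",
    "model.layers.0.self_attn.q_proj.bias",
]
params_to_skip_load = ("weight_scale", "weight_zero_point", "weight_packed")

loaded_by_sparsity_pass = [
    name
    for name in ignored_params
    if not any(skip in name for skip in params_to_skip_load)
]
print(loaded_by_sparsity_pass)  # ['model.layers.0.self_attn.q_proj.bias']
```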
compressed_tensors/quantization/lifecycle/apply.py
CHANGED
@@ -44,11 +44,12 @@ from compressed_tensors.quantization.utils import (
 from compressed_tensors.utils.helpers import fix_fsdp_module_name, replace_module
 from compressed_tensors.utils.offload import update_parameter_data
 from compressed_tensors.utils.safetensors_load import get_safetensors_folder
+from safetensors import safe_open
 from torch.nn import Module


 __all__ = [
-    "load_pretrained_quantization",
+    "load_pretrained_quantization_parameters",
     "apply_quantization_config",
     "apply_quantization_status",
     "find_name_or_class_matches",
@@ -57,50 +58,62 @@ __all__ = [
 ]

 from compressed_tensors.quantization.utils.helpers import is_module_quantized
-from compressed_tensors.utils.safetensors_load import get_quantization_state_dict
+from compressed_tensors.utils.safetensors_load import (
+    get_quantization_parameter_to_path_mapping,
+)


 _LOGGER = logging.getLogger(__name__)


-def load_pretrained_quantization(model: Module, model_name_or_path: Optional[str] = None):
+def load_pretrained_quantization_parameters(
+    model: Module,
+    model_name_or_path: Optional[str] = None,
+    load_weight_quantization: Optional[bool] = False,
+):
     """
     Loads the quantization parameters (scale and zero point) from model_name_or_path to
-    a model that has already been initialized with a quantization config
+    a model that has already been initialized with a quantization config.
+
+    NOTE: Will always load inputs/output parameters.
+    Will conditioanlly load weight parameters, if load_weight_quantization is set to True.

     :param model: model to load pretrained quantization parameters to
     :param model_name_or_path: Hugging Face stub or local folder containing a quantized
-        model, which is used to load quantization parameters from
+        model, which is used to load quantization parameters
+    :param load_weight_quantization: whether or not the weight quantization parameters shoud
+        be laoded
     """
     model_path = get_safetensors_folder(model_name_or_path)
-    state_dict = get_quantization_state_dict(model_path)
+    mapping = get_quantization_parameter_to_path_mapping(model_path)

     for name, submodule in iter_named_leaf_modules(model):
         if not is_module_quantized(submodule):
             continue
-        if submodule.quantization_scheme.weights is not None:
-            base_name = "weight"
-            _load_quant_args_from_state_dict(
-                base_name=base_name,
-                module_name=name,
-                module=submodule,
-                state_dict=state_dict,
-            )
         if submodule.quantization_scheme.input_activations is not None:
             base_name = "input"
-            _load_quant_args_from_state_dict(
+            _load_quant_args_from_mapping(
                 base_name=base_name,
                 module_name=name,
                 module=submodule,
-                state_dict=state_dict,
+                mapping=mapping,
             )
         if submodule.quantization_scheme.output_activations is not None:
             base_name = "output"
-            _load_quant_args_from_state_dict(
+            _load_quant_args_from_mapping(
                 base_name=base_name,
                 module_name=name,
                 module=submodule,
-                state_dict=state_dict,
+                mapping=mapping,
+            )
+
+        if load_weight_quantization and submodule.quantization_scheme.weights:
+            base_name = "weight"
+            _load_quant_args_from_mapping(
+                base_name=base_name,
+                module_name=name,
+                module=submodule,
+                mapping=mapping,
             )

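Instead of materializing a full quantization state dict, the new loader resolves each parameter to the safetensors shard that contains it and reads only that tensor. The per-tensor read uses the standard safetensors API; a minimal sketch, where the shard path and tensor name are placeholders:

```python
from safetensors import safe_open

# hypothetical shard path and fully qualified parameter name
shard_path = "model-00001-of-00002.safetensors"
tensor_name = "model.layers.0.self_attn.q_proj.weight_scale"

with safe_open(shard_path, framework="pt", device="cpu") as f:
    if tensor_name in f.keys():
        scale = f.get_tensor(tensor_name)  # loads just this tensor from the shard
        print(scale.shape, scale.dtype)
```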
@@ -237,9 +250,19 @@ def apply_quantization_status(model: Module, status: QuantizationStatus):

     if status >= QuantizationStatus.INITIALIZED > current_status:
         force_zero_point_init = status != QuantizationStatus.COMPRESSED
+
+        # When decompressing, we set the scale_dtype as the model's dtype
+        # This is because the normal workflow of using the weight's dtype
+        # will be incorrect as the model weight will be compressed
+        # Therfore, use the dtype set by the user using the PretrainedModel
+        scale_dtype = None
+        if status == QuantizationStatus.FROZEN:
+            if hasattr(model, "dtype"):
+                scale_dtype = model.dtype
+
         model.apply(
             lambda module: initialize_module_for_quantization(
-                module, force_zero_point=force_zero_point_init
+                module, force_zero_point=force_zero_point_init, scale_dtype=scale_dtype
             )
         )

@@ -344,9 +367,10 @@ def _infer_status(model: Module) -> Optional[QuantizationStatus]:
         return None


-def _load_quant_args_from_state_dict(
-    base_name: str, module_name: str, module: Module, state_dict: Dict
+def _load_quant_args_from_mapping(
+    base_name: str, module_name: str, module: Module, mapping: Dict
 ):
+    # TODO: skip update and just register here, don't do it in initialize
     """
     Loads scale and zero point from a state_dict into the specified module

@@ -354,26 +378,37 @@ def _load_quant_args_from_state_dict(
         output_activations
     :param module_name: pytorch module name to look up in state_dict
     :module: pytorch module associated with module_name
-    :state_dict: state_dict to search for matching quantization parameters
+    :mapping: mapping to search fetch paths on disk for a given parameter
     """
     scale_name = f"{base_name}_scale"
     zp_name = f"{base_name}_zero_point"
     g_idx_name = f"{base_name}_g_idx"

-    state_dict_scale = state_dict.get(f"{module_name}.{scale_name}", None)
-    state_dict_zp = state_dict.get(f"{module_name}.{zp_name}", None)
-    state_dict_g_idx = state_dict.get(f"{module_name}.{g_idx_name}", None)
+    state_dict_scale_path = mapping.get(f"{module_name}.{scale_name}", None)
+    state_dict_zp_path = mapping.get(f"{module_name}.{zp_name}", None)
+    state_dict_g_idx_path = mapping.get(f"{module_name}.{g_idx_name}", None)
+
+    if state_dict_g_idx_path is not None:
+        with safe_open(state_dict_g_idx_path, framework="pt", device="cpu") as f:
+            state_dict_g_idx = f.get_tensor(f"{module_name}.{g_idx_name}")
+
+        update_parameter_data(module, state_dict_g_idx, g_idx_name)

-    if state_dict_scale is not None:
+    if state_dict_scale_path is not None:
         # module is quantized
+        with safe_open(state_dict_scale_path, framework="pt", device="cpu") as f:
+            state_dict_scale = f.get_tensor(f"{module_name}.{scale_name}")
+
         update_parameter_data(module, state_dict_scale, scale_name)
-        if state_dict_zp is None:
+
+        if state_dict_zp_path is None:
             # fill in zero point for symmetric quantization
             state_dict_zp = torch.zeros_like(state_dict_scale, device="cpu")
-        update_parameter_data(module, state_dict_zp, zp_name)
+        else:
+            with safe_open(state_dict_zp_path, framework="pt", device="cpu") as f:
+                state_dict_zp = f.get_tensor(f"{module_name}.{zp_name}")

-    if state_dict_g_idx is not None:
-        update_parameter_data(module, state_dict_g_idx, g_idx_name)
+        update_parameter_data(module, state_dict_zp, zp_name)


 def _scheme_from_targets(
compressed_tensors/quantization/lifecycle/initialize.py
CHANGED
@@ -56,6 +56,7 @@ def initialize_module_for_quantization(
     module: Module,
     scheme: Optional[QuantizationScheme] = None,
     force_zero_point: bool = True,
+    scale_dtype: Optional[torch.dtype] = None,
 ):
     """
     attaches appropriate scales, zero points, and observers to a layer
@@ -69,7 +70,10 @@
         if not provided, the layer will be skipped
     :param force_zero_point: whether to force initialization of a zero point for
         symmetric quantization
+    :param scale_dtype: dtype to used for the scales, if overriding the
+        weight dtype as the scale dtype
     """
+    # TODO: don't initialize parameters when running decompression
     scheme = scheme or getattr(module, "quantization_scheme", None)
     if scheme is None:
         # no scheme passed and layer not targeted for quantization - skip
@@ -87,7 +91,9 @@
             "input",
             scheme.input_activations,
             force_zero_point=force_zero_point,
+            scale_dtype=scale_dtype,
         )
+
     if scheme.weights is not None:
         if hasattr(module, "weight"):
             weight_shape = None
@@ -99,6 +105,7 @@
             scheme.weights,
             weight_shape=weight_shape,
             force_zero_point=force_zero_point,
+            scale_dtype=scale_dtype,
         )
     else:
         _LOGGER.warning(
@@ -110,7 +117,7 @@
     if scheme.output_activations is not None:
         if not is_kv_cache_quant_scheme(scheme):
             _initialize_scale_zero_point(
-                module, "output", scheme.output_activations
+                module, "output", scheme.output_activations, scale_dtype=scale_dtype
             )

     module.quantization_scheme = scheme
@@ -136,6 +143,7 @@ def _initialize_scale_zero_point(
     quantization_args: QuantizationArgs,
     weight_shape: Optional[torch.Size] = None,
     force_zero_point: bool = True,
+    scale_dtype: Optional[torch.dtype] = None,
 ):
     if quantization_args.dynamic:
         return
@@ -160,7 +168,10 @@
             num_groups = weight_shape[1] // quantization_args.group_size
             expected_shape = (weight_shape[0], max(num_groups, 1))

-    scale_dtype = module.weight.dtype
+    scale_dtype = scale_dtype if scale_dtype is not None else module.weight.dtype
+    # TODO: consider erroring out in the future as if the dtype if not one fo these,
+    # there is likely bug
+
     if scale_dtype not in [torch.float16, torch.bfloat16, torch.float32]:
         scale_dtype = torch.float16

compressed_tensors/utils/offload.py
CHANGED
@@ -94,22 +94,6 @@ def is_module_offloaded(module: torch.nn.Module) -> bool:
     return has_offloaded_params(module)


-def get_execution_device(module: torch.nn.Module) -> torch.device:
-    """
-    :param module: module to check
-    :return: device module is loaded onto during forward pass
-    """
-    if has_offloaded_params(module):
-        return module._hf_hook.execution_device
-    device = next(module.parameters()).device
-
-    # offload only gets set for leaf modules, fallback to checking for device type
-    if device.type == "meta":
-        return module._hf_hook.execution_device
-
-    return device
-
-
 def get_offloaded_device(module: torch.nn.Module) -> torch.device:
     """
     :param module: module to check
@@ -158,6 +142,26 @@ def update_parameter_data(
 """ Candidates for Upstreaming """


+def get_execution_device(module: torch.nn.Module) -> torch.device:
+    """
+    Get the device which inputs should be moved to before module execution
+
+    :param module: module to check, may be offloaded
+    :return: onload device of module
+    """
+    if has_offloaded_params(module):
+        return module._hf_hook.execution_device
+
+    first_param = next(module.parameters(), None)
+    if first_param is None:
+        warnings.warn(
+            f"Unable able to infer execution device of {module}, falling back to CPU"
+        )
+        return torch.device("cpu")
+
+    return first_param.device
+
+
 def register_offload_parameter(
     module: torch.nn.Module,
     name: str,
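Relocated to the "Candidates for Upstreaming" section, `get_execution_device` now falls back to CPU (with a warning) for parameterless modules instead of relying on the meta-device check. A dependency-free sketch of the same lookup order, using a hypothetical `_hf_hook` attribute to stand in for the accelerate offload hook:

```python
import warnings

import torch


def execution_device(module: torch.nn.Module) -> torch.device:
    # 1) offloaded module: trust the (assumed) accelerate hook's execution device
    hook = getattr(module, "_hf_hook", None)
    if hook is not None and getattr(hook, "execution_device", None) is not None:
        return torch.device(hook.execution_device)

    # 2) otherwise use the device of the first parameter, if any
    first_param = next(module.parameters(), None)
    if first_param is not None:
        return first_param.device

    # 3) no parameters at all: warn and default to CPU
    warnings.warn(f"Unable to infer execution device of {module}, falling back to CPU")
    return torch.device("cpu")


print(execution_device(torch.nn.Linear(4, 4)))  # cpu
print(execution_device(torch.nn.ReLU()))        # cpu, with a warning
```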
@@ -200,7 +204,6 @@ def update_offload_parameter(
         provided, then infer device from parameters on module
     """
     param = getattr(module, name)
-    data = data.to(param.dtype)
     if param.data.shape != data.shape:
         warnings.warn(
             f"Shape of parameter being updated {param.data.shape} does not match shape "
compressed_tensors/utils/safetensors_load.py
CHANGED
@@ -31,7 +31,7 @@ __all__ = [
     "get_weight_mappings",
     "get_nested_weight_mappings",
     "get_nested_mappings_from_state_dict",
-    "get_quantization_state_dict",
+    "get_quantization_parameter_to_path_mapping",
     "is_quantization_param",
 ]

@@ -279,16 +279,18 @@ def get_nested_mappings_from_state_dict(
     return nested_weight_mappings


-def get_quantization_state_dict(model_path: str) -> Dict[str, Tensor]:
+def get_quantization_parameter_to_path_mapping(model_path: str) -> Dict[str, str]:
+    """
+    Given a model path, return a mapping between a parameter and its path
+    on disk
+    """
     weight_mappings = get_weight_mappings(model_path)
-    state_dict = {}
+    mapping = {}
     for weight_name, safe_path in weight_mappings.items():
-        if not is_quantization_param(weight_name):
+        if is_quantization_param(weight_name):
+            mapping[weight_name] = safe_path
             continue
-        with safe_open(safe_path, framework="pt", device="cpu") as f:
-            state_dict[weight_name] = f.get_tensor(weight_name)
-
-    return state_dict
+    return mapping


 def is_quantization_param(name: str) -> bool:
compressed_tensors/version.py
CHANGED
@@ -17,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE

-__version__ = version = '0.9.4.a20250414'
-__version_tuple__ = version_tuple = (0, 9, 4)
+__version__ = version = '0.9.5.a20250424'
+__version_tuple__ = version_tuple = (0, 9, 5)
{compressed_tensors-0.9.4a20250414.dist-info → compressed_tensors-0.9.5a20250424.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.9.4a20250414
+Version: 0.9.5a20250424
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
{compressed_tensors-0.9.4a20250414.dist-info → compressed_tensors-0.9.5a20250424.dist-info}/RECORD
RENAMED
@@ -1,17 +1,17 @@
 compressed_tensors/__init__.py,sha256=UtKmifNeBCSE2TZSAfduVNNzHY-3V7bLjZ7n7RuXLOE,812
 compressed_tensors/base.py,sha256=73HYH7HY7O2roC89yG_piPFnZwrBfn_i7HmKl90SKc0,875
-compressed_tensors/version.py,sha256=
+compressed_tensors/version.py,sha256=fMpLfUNedNFTmTmQeHxGZnMaXAKOKiqpI9xyx46F2gI,521
 compressed_tensors/compressors/__init__.py,sha256=smSygTSfcfuujRrAXDc6uZm4L_ccV1tWZewqVnOb4lM,825
-compressed_tensors/compressors/base.py,sha256=
+compressed_tensors/compressors/base.py,sha256=nvWsv4xEw1Tkxkxth6TmHplDYXfBeP22xWxOsZERyDY,7204
 compressed_tensors/compressors/helpers.py,sha256=OK6qxX9j3bHwF9JfIYSGMgBJe2PWjlTA3byXKCJaTIQ,5431
 compressed_tensors/compressors/model_compressors/__init__.py,sha256=5RGGPFu4YqEt_aOdFSQYFYFDjcZFJN0CsMqRtDZz3Js,666
-compressed_tensors/compressors/model_compressors/model_compressor.py,sha256=
+compressed_tensors/compressors/model_compressors/model_compressor.py,sha256=gZvhGSMYIWvLiH0Xl2dmh7PxfyLHAX5nFBvIUUDE6Qc,27451
 compressed_tensors/compressors/quantized_compressors/__init__.py,sha256=09UJq68Pht6Bf-4iP9xYl3tetKsncNPHD8IAGbePsr4,714
-compressed_tensors/compressors/quantized_compressors/base.py,sha256=
+compressed_tensors/compressors/quantized_compressors/base.py,sha256=PWSPLQ7zBBjHfQyHUqr9D-mGYLe5WczJHMSRZWCOxOI,9189
 compressed_tensors/compressors/quantized_compressors/naive_quantized.py,sha256=fd0KlkSx6bvZ3xwIkK3jEUdPSUPs56Eua4dEDOtzKW0,5150
-compressed_tensors/compressors/quantized_compressors/pack_quantized.py,sha256=
+compressed_tensors/compressors/quantized_compressors/pack_quantized.py,sha256=SPIHlk8ewip2LcjgkCw02K21EkfUSFSd9qQqL0Pt5eM,11162
 compressed_tensors/compressors/sparse_compressors/__init__.py,sha256=Atuz-OdEgn8OCUhx7Ovd6gXdyImAI186uCR-uR0t_Nk,737
-compressed_tensors/compressors/sparse_compressors/base.py,sha256=
+compressed_tensors/compressors/sparse_compressors/base.py,sha256=PMiWIaW2XSF_esYJlQ12RVW7opeAzavdbkRFtelMFX0,6655
 compressed_tensors/compressors/sparse_compressors/dense.py,sha256=_uW_HISeDNz4yboSZWoh6GwrkUE6HFibzPQSKrHOCkg,1505
 compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py,sha256=mEKSSgpXookqYSJw3mlyP6cYYKD-eaIvpQMvi4JO6TY,8807
 compressed_tensors/compressors/sparse_compressors/sparse_bitmask.py,sha256=S8vW0FI9ep_XtUQOxj0P5utJt3vKEYOHjWEPp-Xd9aY,5820
@@ -29,24 +29,24 @@ compressed_tensors/quantization/quant_args.py,sha256=sKpb8DcNObidjXjNol1Tn_Iih3Z
 compressed_tensors/quantization/quant_config.py,sha256=MxSUcb5dOqMN6LFyD5K2h8X0TvEtcWIAoiUJqD2dHGE,10159
 compressed_tensors/quantization/quant_scheme.py,sha256=yz0oMbbwp7QZXXd2k5KIJu-Q6aTqg2929VdUzZ7vysM,6324
 compressed_tensors/quantization/lifecycle/__init__.py,sha256=_uItzFWusyV74Zco_pHLOTdE9a83cL-R-ZdyQrBkIyw,772
-compressed_tensors/quantization/lifecycle/apply.py,sha256=
+compressed_tensors/quantization/lifecycle/apply.py,sha256=OR-6QmN9pFRGteYMBAatu2T5qHutQt7Iw3jH4DILvEk,18071
 compressed_tensors/quantization/lifecycle/compressed.py,sha256=Fj9n66IN0EWsOAkBHg3O0GlOQpxstqjCcs0ttzMXrJ0,2296
 compressed_tensors/quantization/lifecycle/forward.py,sha256=DOWouUqfaLA4Qhg-ojVVBdhhSAlgZqFC26vZARxE0ko,12961
 compressed_tensors/quantization/lifecycle/helpers.py,sha256=C0mhy2vJ0fCjVeN4kFNhw8Eq1wkteBGHiZ36RVLThRY,944
-compressed_tensors/quantization/lifecycle/initialize.py,sha256=
+compressed_tensors/quantization/lifecycle/initialize.py,sha256=SY4-FJWpVSupQjuvy7rrIc0pFYU9cRL5Lo1KyfUSvoU,8010
 compressed_tensors/quantization/utils/__init__.py,sha256=VdtEmP0bvuND_IGQnyqUPc5lnFp-1_yD7StKSX4x80w,656
 compressed_tensors/quantization/utils/helpers.py,sha256=-wX0H7zVysJ67jRRCGbx6BfxbMU_1sqffTf5YUIpPiU,14391
 compressed_tensors/registry/__init__.py,sha256=FwLSNYqfIrb5JD_6OK_MT4_svvKTN_nEhpgQlQvGbjI,658
 compressed_tensors/registry/registry.py,sha256=vRcjVB1ITfSbfYUaGndBBmqhip_5vsS62weorVg0iXo,11896
 compressed_tensors/utils/__init__.py,sha256=gS4gSU2pwcAbsKj-6YMaqhm25udFy6ISYaWBf-myRSM,808
 compressed_tensors/utils/helpers.py,sha256=RrNvzD08naEjEiXdU-FdZjQVda1nQywu1hA_GCDj0vg,10415
-compressed_tensors/utils/offload.py,sha256=
+compressed_tensors/utils/offload.py,sha256=Fmb4jBJhH5OdSQFaecFSHK_UreSyZdynEkadZ_oKcvM,14153
 compressed_tensors/utils/permutations_24.py,sha256=kx6fsfDHebx94zsSzhXGyCyuC9sVyah6BUUir_StT28,2530
 compressed_tensors/utils/permute.py,sha256=V6tJLKo3Syccj-viv4F7ZKZgJeCB-hl-dK8RKI_kBwI,2355
-compressed_tensors/utils/safetensors_load.py,sha256=
+compressed_tensors/utils/safetensors_load.py,sha256=rwj0ufU5561ScWDoCG7tzLBRDtiykNno2Iq4PM_JA7E,11499
 compressed_tensors/utils/semi_structured_conversions.py,sha256=XKNffPum54kPASgqKzgKvyeqWPAkair2XEQXjkp7ho8,13489
-compressed_tensors-0.9.
-compressed_tensors-0.9.
-compressed_tensors-0.9.
-compressed_tensors-0.9.
-compressed_tensors-0.9.
+compressed_tensors-0.9.5a20250424.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+compressed_tensors-0.9.5a20250424.dist-info/METADATA,sha256=P0oAhrS28ZU90nUEi9yjIu3CE-968yZTsTLTx1Uj1nM,7004
+compressed_tensors-0.9.5a20250424.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
+compressed_tensors-0.9.5a20250424.dist-info/top_level.txt,sha256=w2i-GyPs2s1UwVxvutSvN_lM22SXC2hQFBmoMcPnV7Y,19
+compressed_tensors-0.9.5a20250424.dist-info/RECORD,,
File without changes
|
File without changes
|