PyPI - compressed-tensors-nightly - Versions diffs - 0.5.0.20240814__py3-none-any.whl → 0.5.0.20240830__py3-none-any.whl - Mend

compressed-tensors-nightly 0.5.0.20240814__py3-none-any.whl → 0.5.0.20240830__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

compressed_tensors/compressors/base.py CHANGED Viewed

@@ -12,20 +12,53 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Dict, Generator, Tuple, Union
+import logging
+from typing import Dict, Generator, Optional, Tuple, Union
+import torch
 from compressed_tensors.config import SparsityCompressionConfig
-from compressed_tensors.quantization import QuantizationConfig
+from compressed_tensors.quantization import QuantizationArgs, QuantizationConfig
 from compressed_tensors.registry import RegistryMixin
+from compressed_tensors.utils import get_nested_weight_mappings, merge_names
+from safetensors import safe_open
 from torch import Tensor
+from torch.nn.modules import Module
+from tqdm import tqdm
+_LOGGER: logging.Logger = logging.getLogger(__name__)
 __all__ = ["Compressor"]
 class Compressor(RegistryMixin):
     """
-    Base class representing a model compression algorithm
+    Base class representing a model compression algorithm. Each child class should
+    implement compression_param_info, compress_weight and decompress_weight.
+    Compressors support compressing/decompressing a full module state dict or a single
+    quantized PyTorch leaf module.
+    Model Load Lifecycle (run_compressed=False):
+        - ModelCompressor.decompress()
+            - apply_quantization_config()
+            - Compressor.decompress()
+                - Compressor.decompress_weight()
+    Model Save Lifecycle:
+        - ModelCompressor.compress()
+            - Compressor.compress()
+                - Compressor.compress_weight()
+    Module Lifecycle (run_compressed=True):
+        - apply_quantization_config()
+        - compressed_module = CompressedLinear(module)
+            - initialize_module_for_quantization()
+            - Compressor.compression_param_info()
+            - register_parameters()
+        - compressed_module.forward()
+            -compressed_module.decompress()
     :param config: config specifying compression parameters
     """
@@ -35,26 +68,183 @@ class Compressor(RegistryMixin):
     ):
         self.config = config
-    def compress(self, model_state: Dict[str, Tensor], **kwargs) -> Dict[str, Tensor]:
+    def compression_param_info(
+        self,
+        weight_shape: torch.Size,
+        quantization_args: Optional[QuantizationArgs] = None,
+    ) -> Dict[str, Tuple[torch.Size, torch.dtype]]:
+        """
+        Creates a dictionary of expected shapes and dtypes for each compression
+            parameter used by the compressor
+        :param weight_shape: uncompressed weight shape
+        :param quantization_args: quantization parameters for the weight
+        :return: dictionary mapping compressed parameter names to shape and dtype
+        """
+        raise NotImplementedError()
+    def compress(
+        self,
+        model_state: Dict[str, Tensor],
+        names_to_scheme: Dict[str, QuantizationArgs],
+        **kwargs,
+    ) -> Dict[str, Tensor]:
         """
         Compresses a dense state dict
         :param model_state: state dict of uncompressed model
+        :param names_to_scheme: quantization args for each quantized weight, needed for
+            quantize function to calculate bit depth
         :return: compressed state dict
         """
-        raise NotImplementedError()
+        compressed_dict = {}
+        weight_suffix = ".weight"
+        _LOGGER.debug(
+            f"Compressing model with {len(model_state)} parameterized layers..."
+        )
+        for name, value in tqdm(model_state.items(), desc="Compressing model"):
+            if name.endswith(weight_suffix):
+                prefix = name[: -(len(weight_suffix))]
+                scale = model_state.get(merge_names(prefix, "weight_scale"), None)
+                zp = model_state.get(merge_names(prefix, "weight_zero_point"), None)
+                if scale is not None:
+                    # weight is quantized, compress it
+                    quant_args = names_to_scheme[prefix]
+                    compressed_data = self.compress_weight(
+                        weight=value,
+                        scale=scale,
+                        zero_point=zp,
+                        quantization_args=quant_args,
+                        device="cpu",
+                    )
+                    for key, value in compressed_data.items():
+                        compressed_dict[merge_names(prefix, key)] = value
+                else:
+                    compressed_dict[name] = value.to("cpu")
+            elif name.endswith("zero_point") and torch.all(value == 0):
+                # all zero_points are 0, no need to include in
+                # compressed state_dict
+                continue
+            else:
+                compressed_dict[name] = value.to("cpu")
+        return compressed_dict
     def decompress(
-        self, path_to_model_or_tensors: str, device: str = "cpu", **kwargs
+        self,
+        path_to_model_or_tensors: str,
+        names_to_scheme: Dict[str, QuantizationArgs],
+        device: str = "cpu",
     ) -> Generator[Tuple[str, Tensor], None, None]:
         """
         Reads a compressed state dict located at path_to_model_or_tensors
         and returns a generator for sequentially decompressing back to a
         dense state dict
-        :param model_path: path to compressed safetensors model (directory with
-            one or more safetensors files) or compressed tensors file
+        :param path_to_model_or_tensors: path to compressed safetensors model (directory
+            with one or more safetensors files) or compressed tensors file
+        :param names_to_scheme: quantization args for each quantized weight
         :param device: optional device to load intermediate weights into
         :return: compressed state dict
         """
+        weight_mappings = get_nested_weight_mappings(
+            path_to_model_or_tensors, self.COMPRESSION_PARAM_NAMES
+        )
+        for weight_name in weight_mappings.keys():
+            weight_data = {}
+            for param_name, safe_path in weight_mappings[weight_name].items():
+                full_name = merge_names(weight_name, param_name)
+                with safe_open(safe_path, framework="pt", device=device) as f:
+                    weight_data[param_name] = f.get_tensor(full_name)
+            if "weight_scale" in weight_data:
+                quant_args = names_to_scheme[weight_name]
+                decompressed = self.decompress_weight(
+                    compressed_data=weight_data, quantization_args=quant_args
+                )
+                yield merge_names(weight_name, "weight"), decompressed
+    def compress_weight(
+        self,
+        weight: Tensor,
+        scale: Tensor,
+        zero_point: Optional[Tensor] = None,
+        g_idx: Optional[torch.Tensor] = None,
+        quantization_args: Optional[QuantizationArgs] = None,
+    ) -> Dict[str, torch.Tensor]:
+        """
+        Compresses a single uncompressed weight
+        :param weight: uncompressed weight tensor
+        :param scale: quantization scale for weight
+        :param zero_point: quantization zero point for weight
+        :param g_idx: optional mapping from column index to group index
+        :param quantization_args: quantization parameters for weight
+        :return: dictionary of compressed weight data
+        """
         raise NotImplementedError()
+    def decompress_weight(
+        self,
+        compressed_data: Dict[str, Tensor],
+        quantization_args: Optional[QuantizationArgs] = None,
+    ) -> torch.Tensor:
+        """
+        Decompresses a single compressed weight
+        :param compressed_data: dictionary of data needed for decompression
+        :param quantization_args: quantization parameters for the weight
+        :return: tensor of the decompressed weight
+        """
+        raise NotImplementedError()
+    def compress_module(self, module: Module) -> Optional[Dict[str, torch.Tensor]]:
+        """
+        Compresses a single quantized leaf PyTorch module. If the module is not
+        quantized, this function has no effect.
+        :param module: PyTorch module to compress
+        :return: dictionary of compressed weight data, or None if module is not
+            quantized
+        """
+        if not hasattr(module, "quantization_scheme"):
+            return None  # module is not quantized
+        quantization_scheme = module.quantization_scheme
+        if not hasattr(quantization_scheme, "weights"):
+            return None  # weights are not quantized
+        quantization_args = quantization_scheme.weights
+        weight = getattr(module, "weight", None)
+        weight_scale = getattr(module, "weight_scale", None)
+        weight_zero_point = getattr(module, "weight_zero_point", None)
+        return self.compress_weight(
+            weight=weight,
+            scale=weight_scale,
+            zero_point=weight_zero_point,
+            quantization_args=quantization_args,
+        )
+    def decompress_module(self, module: Module):
+        """
+        Decompresses a single compressed leaf PyTorch module. If the module is not
+        quantized, this function has no effect.
+        :param module: PyTorch module to decompress
+        :return: tensor of the decompressed weight, or None if module is not quantized
+        """
+        if not hasattr(module, "quantization_scheme"):
+            return None  # module is not quantized
+        quantization_scheme = module.quantization_scheme
+        if not hasattr(quantization_scheme, "weights"):
+            return None  # weights are not quantized
+        quantization_args = quantization_scheme.weights
+        compressed_data = {}
+        for name, parameter in module.named_parameters():
+            compressed_data[name] = parameter
+        return self.decompress_weight(
+            compressed_data=compressed_data, quantization_args=quantization_args
+        )

compressed_tensors/compressors/model_compressor.py CHANGED Viewed

@@ -28,7 +28,7 @@ from compressed_tensors.base import (
     SPARSITY_CONFIG_NAME,
 )
 from compressed_tensors.compressors import Compressor
-from compressed_tensors.config import SparsityCompressionConfig
+from compressed_tensors.config import CompressionFormat, SparsityCompressionConfig
 from compressed_tensors.quantization import (
     QuantizationConfig,
     QuantizationStatus,
@@ -176,6 +176,9 @@ class ModelCompressor:
         if hasattr(compression_config, SPARSITY_CONFIG_NAME):
             # for loaded HFQuantizer config
             return getattr(compression_config, SPARSITY_CONFIG_NAME)
+        if SPARSITY_CONFIG_NAME in compression_config:
+            # for loaded HFQuantizer config from dict
+            return compression_config[SPARSITY_CONFIG_NAME]
         # SparseAutoModel format
         return compression_config.get(SPARSITY_CONFIG_NAME, None)
@@ -189,6 +192,10 @@ class ModelCompressor:
             # for loaded HFQuantizer config
             return getattr(compression_config, QUANTIZATION_CONFIG_NAME)
+        if QUANTIZATION_CONFIG_NAME in compression_config:
+            # for loaded HFQuantizer config from dict
+            return compression_config[QUANTIZATION_CONFIG_NAME]
         # SparseAutoModel format
         quantization_config = deepcopy(compression_config)
         quantization_config.pop(SPARSITY_CONFIG_NAME, None)
@@ -234,12 +241,69 @@ class ModelCompressor:
             compressed_state_dict = self.quantization_compressor.compress(
                 state_dict, names_to_scheme=quantized_modules_to_args
             )
+            if self.quantization_config.format != CompressionFormat.dense.value:
+                self.quantization_config.quantization_status = (
+                    QuantizationStatus.COMPRESSED
+                )
         if self.sparsity_compressor is not None:
             compressed_state_dict = self.sparsity_compressor.compress(
                 compressed_state_dict
             )
+        # HACK (mgoin): Post-process step for kv cache scales to take the
+        # k/v_proj module `output_scale` parameters, and store them in the
+        # parent attention module as `k_scale` and `v_scale`
+        #
+        # Example:
+        #  Replace `model.layers.0.self_attn.k_proj.output_scale`
+        #  with    `model.layers.0.self_attn.k_scale`
+        if (
+            self.quantization_config is not None
+            and self.quantization_config.kv_cache_scheme is not None
+        ):
+            # HACK (mgoin): We assume the quantized modules in question
+            # will be k_proj and v_proj since those are the default targets.
+            # We check that both of these modules have output activation
+            # quantization, and additionally check that q_proj doesn't.
+            q_proj_has_no_quant_output = 0
+            k_proj_has_quant_output = 0
+            v_proj_has_quant_output = 0
+            for name, module in model.named_modules():
+                if not hasattr(module, "quantization_scheme"):
+                    continue
+                out_act = module.quantization_scheme.output_activations
+                if name.endswith(".q_proj") and out_act is None:
+                    q_proj_has_no_quant_output += 1
+                elif name.endswith(".k_proj") and out_act is not None:
+                    k_proj_has_quant_output += 1
+                elif name.endswith(".v_proj") and out_act is not None:
+                    v_proj_has_quant_output += 1
+            assert (
+                q_proj_has_no_quant_output > 0
+                and k_proj_has_quant_output > 0
+                and v_proj_has_quant_output > 0
+            )
+            assert (
+                q_proj_has_no_quant_output
+                == k_proj_has_quant_output
+                == v_proj_has_quant_output
+            )
+            # Move all .k/v_proj.output_scale parameters to .k/v_scale
+            working_state_dict = {}
+            for key in compressed_state_dict.keys():
+                if key.endswith(".k_proj.output_scale"):
+                    new_key = key.replace(".k_proj.output_scale", ".k_scale")
+                    working_state_dict[new_key] = compressed_state_dict[key]
+                elif key.endswith(".v_proj.output_scale"):
+                    new_key = key.replace(".v_proj.output_scale", ".v_scale")
+                    working_state_dict[new_key] = compressed_state_dict[key]
+                else:
+                    working_state_dict[key] = compressed_state_dict[key]
+            compressed_state_dict = working_state_dict
         # HACK: Override the dtype_byte_size function in transformers to
         # support float8 types. Fix is posted upstream
         # https://github.com/huggingface/transformers/pull/30488

compressed_tensors/compressors/naive_quantized.py CHANGED Viewed

@@ -13,7 +13,7 @@
 # limitations under the License.
 import logging
-from typing import Dict, Generator, Tuple
+from typing import Dict, Optional, Tuple
 import torch
 from compressed_tensors.compressors import Compressor
@@ -21,10 +21,7 @@ from compressed_tensors.config import CompressionFormat
 from compressed_tensors.quantization import QuantizationArgs
 from compressed_tensors.quantization.lifecycle.forward import dequantize, quantize
 from compressed_tensors.quantization.utils import can_quantize
-from compressed_tensors.utils import get_nested_weight_mappings, merge_names
-from safetensors import safe_open
 from torch import Tensor
-from tqdm import tqdm
 __all__ = [
@@ -44,86 +41,85 @@ class QuantizationCompressor(Compressor):
     type to the type specified by the layer's QuantizationArgs.
     """
-    COMPRESSION_PARAM_NAMES = ["weight", "weight_scale", "weight_zero_point"]
+    COMPRESSION_PARAM_NAMES = [
+        "weight",
+        "weight_scale",
+        "weight_zero_point",
+        "weight_g_idx",
+    ]
-    def compress(
+    def compression_param_info(
         self,
-        model_state: Dict[str, Tensor],
-        names_to_scheme: Dict[str, QuantizationArgs],
-        **kwargs,
-    ) -> Dict[str, Tensor]:
+        weight_shape: torch.Size,
+        quantization_args: Optional[QuantizationArgs] = None,
+    ) -> Dict[str, Tuple[torch.Size, torch.dtype]]:
         """
-        Compresses a dense state dict
+        Creates a dictionary of expected shapes and dtypes for each compression
+            parameter used by the compressor
-        :param model_state: state dict of uncompressed model
-        :param names_to_scheme: quantization args for each quantized weight, needed for
-        quantize function to calculate bit depth
-        :return: compressed state dict
+        :param weight_shape: uncompressed weight shape
+        :param quantization_args: quantization parameters for the weight
+        :return: dictionary mapping compressed parameter names to shape and dtype
         """
-        compressed_dict = {}
-        weight_suffix = ".weight"
-        _LOGGER.debug(
-            f"Compressing model with {len(model_state)} parameterized layers..."
-        )
+        dtype = quantization_args.pytorch_dtype()
+        return {"weight": (weight_shape, dtype)}
-        for name, value in tqdm(model_state.items(), desc="Compressing model"):
-            if name.endswith(weight_suffix):
-                prefix = name[: -(len(weight_suffix))]
-                scale = model_state.get(merge_names(prefix, "weight_scale"), None)
-                zp = model_state.get(merge_names(prefix, "weight_zero_point"), None)
-                if scale is not None and zp is not None:
-                    # weight is quantized, compress it
-                    quant_args = names_to_scheme[prefix]
-                    if can_quantize(value, quant_args):
-                        # only quantize if not already quantized
-                        value = quantize(
-                            x=value,
-                            scale=scale,
-                            zero_point=zp,
-                            args=quant_args,
-                            dtype=quant_args.pytorch_dtype(),
-                        )
-            elif name.endswith("zero_point"):
-                if torch.all(value == 0):
-                    # all zero_points are 0, no need to include in
-                    # compressed state_dict
-                    continue
-            compressed_dict[name] = value.to("cpu")
-        return compressed_dict
-    def decompress(
-        self, path_to_model_or_tensors: str, device: str = "cpu", **kwargs
-    ) -> Generator[Tuple[str, Tensor], None, None]:
+    def compress_weight(
+        self,
+        weight: Tensor,
+        scale: Tensor,
+        zero_point: Optional[Tensor] = None,
+        g_idx: Optional[torch.Tensor] = None,
+        quantization_args: Optional[QuantizationArgs] = None,
+        device: Optional[torch.device] = None,
+    ) -> Dict[str, torch.Tensor]:
         """
-        Reads a compressed state dict located at path_to_model_or_tensors
-        and returns a generator for sequentially decompressing back to a
-        dense state dict
-        :param model_path: path to compressed safetensors model (directory with
-            one or more safetensors files) or compressed tensors file
-        :param device: optional device to load intermediate weights into
-        :return: compressed state dict
+        Compresses a single uncompressed weight
+        :param weight: uncompressed weight tensor
+        :param scale: quantization scale for weight
+        :param zero_point: quantization zero point for weight
+        :param g_idx: optional mapping from column index to group index
+        :param quantization_args: quantization parameters for weight
+        :param device: optional device to move compressed output to
+        :return: dictionary of compressed weight data
+        """
+        if can_quantize(weight, quantization_args):
+            quantized_weight = quantize(
+                x=weight,
+                scale=scale,
+                zero_point=zero_point,
+                g_idx=g_idx,
+                args=quantization_args,
+                dtype=quantization_args.pytorch_dtype(),
+            )
+            if device is not None:
+                quantized_weight = quantized_weight.to(device)
+        return {"weight": quantized_weight}
+    def decompress_weight(
+        self,
+        compressed_data: Dict[str, Tensor],
+        quantization_args: Optional[QuantizationArgs] = None,
+    ) -> torch.Tensor:
         """
-        weight_mappings = get_nested_weight_mappings(
-            path_to_model_or_tensors, self.COMPRESSION_PARAM_NAMES
+        Decompresses a single compressed weight
+        :param compressed_data: dictionary of data needed for decompression
+        :param quantization_args: quantization parameters for the weight
+        :return: tensor of the decompressed weight
+        """
+        weight = compressed_data["weight"]
+        scale = compressed_data["weight_scale"]
+        zero_point = compressed_data.get("weight_zero_point", None)
+        g_idx = compressed_data.get("weight_g_idx", None)
+        decompressed_weight = dequantize(
+            x_q=weight, scale=scale, zero_point=zero_point, g_idx=g_idx
         )
-        for weight_name in weight_mappings.keys():
-            weight_data = {}
-            for param_name, safe_path in weight_mappings[weight_name].items():
-                full_name = merge_names(weight_name, param_name)
-                with safe_open(safe_path, framework="pt", device=device) as f:
-                    weight_data[param_name] = f.get_tensor(full_name)
-            if "weight_scale" in weight_data:
-                zero_point = weight_data.get("weight_zero_point", None)
-                scale = weight_data["weight_scale"]
-                decompressed = dequantize(
-                    x_q=weight_data["weight"],
-                    scale=scale,
-                    zero_point=zero_point,
-                )
-                yield merge_names(weight_name, "weight"), decompressed
+        return decompressed_weight
 @Compressor.register(name=CompressionFormat.int_quantized.value)

compressed-tensors-nightly 0.5.0.20240814__py3-none-any.whl → 0.5.0.20240830__py3-none-any.whl

compressed-tensors-nightly 0.5.0.20240814__py3-none-any.whl → 0.5.0.20240830__py3-none-any.whl