compressed-tensors-nightly 0.4.0.20240623__py3-none-any.whl → 0.4.0.20240627__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as published to their public registry. It is provided for informational purposes only.
@@ -15,3 +15,4 @@
 SPARSITY_CONFIG_NAME = "sparsity_config"
 QUANTIZATION_CONFIG_NAME = "quantization_config"
 COMPRESSION_CONFIG_NAME = "compression_config"
+KV_CACHE_SCHEME_NAME = "kv_cache_scheme"
@@ -45,7 +45,7 @@ class Compressor(RegistryMixin):
         raise NotImplementedError()
 
     def decompress(
-        self, path_to_model_or_tensors: str, device: str = "cpu"
+        self, path_to_model_or_tensors: str, device: str = "cpu", **kwargs
     ) -> Generator[Tuple[str, Tensor], None, None]:
         """
         Reads a compressed state dict located at path_to_model_or_tensors
@@ -29,6 +29,6 @@ class DenseCompressor(Compressor):
         return model_state
 
     def decompress(
-        self, path_to_model_or_tensors: str, device: str = "cpu"
+        self, path_to_model_or_tensors: str, device: str = "cpu", **kwargs
     ) -> Generator[Tuple[str, Tensor], None, None]:
         return iter([])
@@ -107,7 +107,7 @@ class Marlin24Compressor(Compressor):
     def compress(
         self,
         model_state: Dict[str, Tensor],
-        model_quant_args: Dict[str, QuantizationArgs],
+        names_to_scheme: Dict[str, QuantizationArgs],
         **kwargs,
     ) -> Dict[str, Tensor]:
         """
@@ -115,11 +115,11 @@ class Marlin24Compressor(Compressor):
         with the Marlin24 kernel
 
         :param model_state: state dict of uncompressed model
-        :param model_quant_args: quantization args for each quantized weight, needed for
+        :param names_to_scheme: quantization args for each quantized weight, needed for
             quantize function to calculate bit depth
         :return: compressed state dict
         """
-        self.validate_quant_compatability(model_quant_args)
+        self.validate_quant_compatability(names_to_scheme)
 
         compressed_dict = {}
         weight_suffix = ".weight"
@@ -139,7 +139,7 @@ class Marlin24Compressor(Compressor):
                     value = value.to(torch.float16)
 
                     # quantize weight, keeping it as a float16 for now
-                    quant_args = model_quant_args[prefix]
+                    quant_args = names_to_scheme[prefix]
                     value = quantize(
                         x=value, scale=scale, zero_point=zp, args=quant_args
                     )
@@ -175,7 +175,7 @@ class Marlin24Compressor(Compressor):
         return compressed_dict
 
     def decompress(
-        self, path_to_model_or_tensors: str, device: str = "cpu"
+        self, path_to_model_or_tensors: str, device: str = "cpu", **kwargs
     ) -> Generator[Tuple[str, Tensor], None, None]:
         raise NotImplementedError(
             "Decompression is not implemented for the Marlin24 Compressor."
@@ -231,7 +231,7 @@ class ModelCompressor:
         quantized_modules_to_args = map_modules_to_quant_args(model)
         if self.quantization_compressor is not None:
             compressed_state_dict = self.quantization_compressor.compress(
-                state_dict, model_quant_args=quantized_modules_to_args
+                state_dict, names_to_scheme=quantized_modules_to_args
             )
 
         if self.sparsity_compressor is not None:
@@ -260,9 +260,11 @@ class ModelCompressor:
             setattr(model, SPARSITY_CONFIG_NAME, self.sparsity_compressor.config)
 
         if self.quantization_compressor is not None:
-            apply_quantization_config(model, self.quantization_config)
+            names_to_scheme = apply_quantization_config(model, self.quantization_config)
             load_pretrained_quantization(model, model_path)
-            dense_gen = self.quantization_compressor.decompress(model_path)
+            dense_gen = self.quantization_compressor.decompress(
+                model_path, names_to_scheme=names_to_scheme
+            )
             self._replace_weights(dense_gen, model)
 
         def update_status(module):
@@ -49,14 +49,14 @@ class QuantizationCompressor(Compressor):
     def compress(
         self,
         model_state: Dict[str, Tensor],
-        model_quant_args: Dict[str, QuantizationArgs],
+        names_to_scheme: Dict[str, QuantizationArgs],
         **kwargs,
     ) -> Dict[str, Tensor]:
         """
         Compresses a dense state dict
 
         :param model_state: state dict of uncompressed model
-        :param model_quant_args: quantization args for each quantized weight, needed for
+        :param names_to_scheme: quantization args for each quantized weight, needed for
             quantize function to calculate bit depth
         :return: compressed state dict
         """
@@ -73,7 +73,7 @@ class QuantizationCompressor(Compressor):
                 zp = model_state.get(merge_names(prefix, "weight_zero_point"), None)
                 if scale is not None and zp is not None:
                     # weight is quantized, compress it
-                    quant_args = model_quant_args[prefix]
+                    quant_args = names_to_scheme[prefix]
                     if can_quantize(value, quant_args):
                         # only quantize if not already quantized
                         value = quantize(
@@ -93,7 +93,7 @@ class QuantizationCompressor(Compressor):
         return compressed_dict
 
     def decompress(
-        self, path_to_model_or_tensors: str, device: str = "cpu"
+        self, path_to_model_or_tensors: str, device: str = "cpu", **kwargs
     ) -> Generator[Tuple[str, Tensor], None, None]:
         """
         Reads a compressed state dict located at path_to_model_or_tensors
@@ -29,7 +29,13 @@ from torch import Tensor
 from tqdm import tqdm
 
 
-__all__ = ["PackedQuantizationCompressor", "pack_4bit_ints", "unpack_4bit_ints"]
+__all__ = [
+    "PackedQuantizationCompressor",
+    "pack_4bit_ints",
+    "pack_8bit_ints",
+    "unpack_4bit_ints",
+    "unpack_8bit_ints",
+]
 
 _LOGGER: logging.Logger = logging.getLogger(__name__)
 
@@ -50,14 +56,14 @@ class PackedQuantizationCompressor(Compressor):
     def compress(
         self,
         model_state: Dict[str, Tensor],
-        model_quant_args: Dict[str, QuantizationArgs],
+        names_to_scheme: Dict[str, QuantizationArgs],
         **kwargs,
     ) -> Dict[str, Tensor]:
         """
         Compresses a dense state dict
 
         :param model_state: state dict of uncompressed model
-        :param model_quant_args: quantization args for each quantized weight, needed for
+        :param names_to_scheme: quantization args for each quantized weight, needed for
             quantize function to calculate bit depth
         :return: compressed state dict
         """
@@ -75,7 +81,7 @@ class PackedQuantizationCompressor(Compressor):
                 shape = torch.tensor(value.shape)
                 if scale is not None and zp is not None:
                     # weight is quantized, compress it
-                    quant_args = model_quant_args[prefix]
+                    quant_args = names_to_scheme[prefix]
                     if can_quantize(value, quant_args):
                         # convert weight to an int if not already compressed
                         value = quantize(
@@ -85,7 +91,11 @@ class PackedQuantizationCompressor(Compressor):
                             args=quant_args,
                             dtype=torch.int8,
                         )
-                    value = pack_4bit_ints(value.cpu())
+
+                    if quant_args.num_bits == 8:
+                        value = pack_8bit_ints(value.cpu())
+                    else:
+                        value = pack_4bit_ints(value.cpu())
                     compressed_dict[merge_names(prefix, "weight_shape")] = shape
                     compressed_dict[merge_names(prefix, "weight_packed")] = value
                     continue
@@ -101,7 +111,10 @@ class PackedQuantizationCompressor(Compressor):
         return compressed_dict
 
     def decompress(
-        self, path_to_model_or_tensors: str, device: str = "cpu"
+        self,
+        path_to_model_or_tensors: str,
+        names_to_scheme: Dict[str, QuantizationArgs],
+        device: str = "cpu",
     ) -> Generator[Tuple[str, Tensor], None, None]:
         """
         Reads a compressed state dict located at path_to_model_or_tensors
@@ -119,6 +132,7 @@ class PackedQuantizationCompressor(Compressor):
         for weight_name in weight_mappings.keys():
             weight_data = {}
             for param_name, safe_path in weight_mappings[weight_name].items():
+                weight_data["num_bits"] = names_to_scheme.get(weight_name).num_bits
                 full_name = merge_names(weight_name, param_name)
                 with safe_open(safe_path, framework="pt", device=device) as f:
                     weight_data[param_name] = f.get_tensor(full_name)
@@ -127,8 +141,12 @@ class PackedQuantizationCompressor(Compressor):
                 zero_point = weight_data.get("weight_zero_point", None)
                 scale = weight_data["weight_scale"]
                 weight = weight_data["weight_packed"]
+                num_bits = weight_data["num_bits"]
                 original_shape = torch.Size(weight_data["weight_shape"])
-                unpacked = unpack_4bit_ints(weight, original_shape)
+                if num_bits == 4:
+                    unpacked = unpack_4bit_ints(weight, original_shape)
+                else:
+                    unpacked = unpack_8bit_ints(weight, original_shape)
                 decompressed = dequantize(
                     x_q=unpacked,
                     scale=scale,
@@ -137,6 +155,19 @@ class PackedQuantizationCompressor(Compressor):
                 yield merge_names(weight_name, "weight"), decompressed
 
 
+def pack_8bit_ints(value: torch.Tensor) -> torch.Tensor:
+    """
+    Packs a tensor of int8 into int32s with padding
+
+    :param value: tensor to pack
+    :returns: packed int32 tensor
+    """
+    # need to convert to unsigned 8bit to use numpy's pack/unpack
+    value_uint = (value - 128).to(torch.uint8)
+    bits = np.unpackbits(value_uint, axis=-1, bitorder="little")
+    return _pack_bits(bits_to_pack=bits)
+
+
 def pack_4bit_ints(value: torch.Tensor) -> torch.Tensor:
     """
     Packs a tensor of int4 weights stored in int8 into int32s with padding
@@ -152,22 +183,31 @@ def pack_4bit_ints(value: torch.Tensor) -> torch.Tensor:
     bits = np.unpackbits(temp.numpy(), axis=-1, bitorder="little")
     ranges = np.array([range(x, x + 4) for x in range(0, bits.shape[1], 8)]).flatten()
     only_4_bits = bits[:, ranges]  # top 4 bits are 0 because we're really uint4
+    return _pack_bits(bits_to_pack=only_4_bits)
 
-    # pad each row to fill a full 32bit int
-    pack_depth = 32
-    padding = (
-        math.ceil(only_4_bits.shape[1] / pack_depth) * pack_depth - only_4_bits.shape[1]
-    )
-    padded_bits = np.pad(
-        only_4_bits, pad_width=[(0, 0), (0, padding)], constant_values=0
-    )
 
-    # after packbits each uint8 is two packed uint4s
-    # then we keep the bit pattern the same but convert to int32
-    compressed = np.packbits(padded_bits, axis=-1, bitorder="little")
-    compressed = np.ascontiguousarray(compressed).view(np.int32)
+def unpack_8bit_ints(value: torch.Tensor, shape: torch.Size) -> torch.Tensor:
+    """
+    Unpacks a tensor packed int8 weights in int32
 
-    return torch.from_numpy(compressed)
+    :param value: tensor to upack
+    :param shape: shape to unpack into, used to remove padding
+    :returns: unpacked int8 tensor
+    """
+    if value.dtype is not torch.int32:
+        raise ValueError(
+            f"Expected {torch.int32} but got {value.dtype}, Aborting unpack."
+        )
+
+    # unpack bits and undo padding to nearest int32 bits
+    individual_depth = 8
+    as_uint8 = value.numpy().view(np.uint8)
+    bits = np.unpackbits(as_uint8, axis=-1, bitorder="little")
+    original_row_size = int(shape[1] * individual_depth)
+    bits = bits[:, :original_row_size]
+    bits = np.packbits(bits, axis=-1, bitorder="little")
+    final = (bits - 128).astype(np.int8)
+    return torch.from_numpy(final)
 
 
 def unpack_4bit_ints(value: torch.Tensor, shape: torch.Size) -> torch.Tensor:
@@ -206,3 +246,27 @@ def unpack_4bit_ints(value: torch.Tensor, shape: torch.Size) -> torch.Tensor:
     final = repacked.astype(np.int8) - 8
 
     return torch.from_numpy(final)
+
+
+def _pack_bits(bits_to_pack: torch.Tensor) -> torch.Tensor:
+    """
+    Pack a tensor of bits to int32.
+
+    :param bits_to_pack: tensor of bits to pack
+    """
+    # pad each row to fill a full 32bit int
+    pack_depth = 32
+    padding = (
+        math.ceil(bits_to_pack.shape[1] / pack_depth) * pack_depth
+        - bits_to_pack.shape[1]
+    )
+    padded_bits = np.pad(
+        bits_to_pack, pad_width=[(0, 0), (0, padding)], constant_values=0
+    )
+
+    # after packbits each uint8 is two packed uint4s
+    # then we keep the bit pattern the same but convert to int32
+    compressed = np.packbits(padded_bits, axis=-1, bitorder="little")
+    compressed = np.ascontiguousarray(compressed).view(np.int32)
+
+    return torch.from_numpy(compressed)
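Note: the 8-bit path added above mirrors the existing 4-bit path, with the shared padding-and-packing step factored into `_pack_bits`. A minimal round-trip sketch of how `pack_8bit_ints` / `unpack_8bit_ints` could be exercised (the tensor values are made up for illustration and are not part of the diff):

    import torch
    from compressed_tensors.compressors.pack_quantized import (
        pack_8bit_ints,
        unpack_8bit_ints,
    )

    # fake int8 "quantized" weights with a 2D shape, as produced by quantize(...)
    weights = torch.randint(-8, 8, (4, 64), dtype=torch.int8)

    packed = pack_8bit_ints(weights)                    # int32, rows padded to 32-bit words
    restored = unpack_8bit_ints(packed, weights.shape)  # original shape strips the padding
    assert torch.equal(restored, weights)               # round-trips back to the int8 values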
@@ -72,7 +72,7 @@ class BitmaskCompressor(Compressor):
         return compressed_dict
 
     def decompress(
-        self, path_to_model_or_tensors: str, device: str = "cpu"
+        self, path_to_model_or_tensors: str, device: str = "cpu", **kwargs
     ) -> Generator[Tuple[str, Tensor], None, None]:
         """
         Reads a bitmask compressed state dict located
@@ -15,7 +15,9 @@
 import logging
 import re
 from collections import OrderedDict
-from typing import Dict, Iterable, Optional
+from typing import Dict, Iterable, List, Optional
+from typing import OrderedDict as OrderedDictType
+from typing import Union
 
 import torch
 from compressed_tensors.quantization.lifecycle.calibration import (
@@ -28,12 +30,16 @@ from compressed_tensors.quantization.lifecycle.frozen import freeze_module_quant
 from compressed_tensors.quantization.lifecycle.initialize import (
     initialize_module_for_quantization,
 )
+from compressed_tensors.quantization.quant_args import QuantizationArgs
 from compressed_tensors.quantization.quant_config import (
     QuantizationConfig,
     QuantizationStatus,
 )
+from compressed_tensors.quantization.quant_scheme import QuantizationScheme
 from compressed_tensors.quantization.utils import (
+    KV_CACHE_TARGETS,
     infer_quantization_status,
+    is_kv_cache_quant_scheme,
     iter_named_leaf_modules,
 )
 from compressed_tensors.utils.helpers import fix_fsdp_module_name
@@ -45,7 +51,7 @@ __all__ = [
     "load_pretrained_quantization",
     "apply_quantization_config",
     "apply_quantization_status",
-    "find_first_name_or_class_match",
+    "find_name_or_class_matches",
 ]
 
 from compressed_tensors.quantization.utils.helpers import is_module_quantized
@@ -96,7 +102,7 @@ def load_pretrained_quantization(model: Module, model_name_or_path: str):
             )
 
 
-def apply_quantization_config(model: Module, config: QuantizationConfig):
+def apply_quantization_config(model: Module, config: QuantizationConfig) -> Dict:
    """
    Initializes the model for quantization in-place based on the given config
 
@@ -106,6 +112,8 @@ def apply_quantization_config(model: Module, config: QuantizationConfig):
     # build mapping of targets to schemes for easier matching
     # use ordered dict to preserve target ordering in config
     target_to_scheme = OrderedDict()
+    config = process_quantization_config(config)
+    names_to_scheme = OrderedDict()
     for scheme in config.config_groups.values():
         for target in scheme.targets:
             target_to_scheme[target] = scheme
@@ -116,13 +124,16 @@ def apply_quantization_config(model: Module, config: QuantizationConfig):
     for name, submodule in iter_named_leaf_modules(model):
         # potentially fix module name to remove FSDP wrapper prefix
         name = fix_fsdp_module_name(name)
-        if find_first_name_or_class_match(name, submodule, config.ignore):
+        if find_name_or_class_matches(name, submodule, config.ignore):
             ignored_submodules.append(name)
             continue  # layer matches ignore list, continue
-        target = find_first_name_or_class_match(name, submodule, target_to_scheme)
-        if target is not None:
+        targets = find_name_or_class_matches(name, submodule, target_to_scheme)
+        if targets:
             # target matched - add layer and scheme to target list
-            submodule.quantization_scheme = target_to_scheme[target]
+            submodule.quantization_scheme = _scheme_from_targets(
+                target_to_scheme, targets, name
+            )
+            names_to_scheme[name] = submodule.quantization_scheme.weights
 
     if config.ignore is not None and ignored_submodules is not None:
         if set(config.ignore) - set(ignored_submodules):
@@ -132,7 +143,42 @@ def apply_quantization_config(model: Module, config: QuantizationConfig):
                 f"{set(config.ignore) - set(ignored_submodules)}"
             )
     # apply current quantization status across all targeted layers
+
     apply_quantization_status(model, config.quantization_status)
+    return names_to_scheme
+
+
+def process_quantization_config(config: QuantizationConfig) -> QuantizationConfig:
+    """
+    Preprocess the raw QuantizationConfig
+
+    :param config: the raw QuantizationConfig
+    :return: the processed QuantizationConfig
+    """
+    if config.kv_cache_scheme is not None:
+        config = process_kv_cache_config(config)
+
+    return config
+
+
+def process_kv_cache_config(
+    config: QuantizationConfig, targets: Union[List[str], str] = KV_CACHE_TARGETS
+) -> QuantizationConfig:
+    """
+    Reformulate the `config.kv_cache` as a `config_group`
+    and add it to the set of existing `config.groups`
+
+    :param config: the QuantizationConfig
+    :return: the QuantizationConfig with additional "kv_cache" group
+    """
+    kv_cache_dict = config.kv_cache_scheme.model_dump()
+    kv_cache_scheme = QuantizationScheme(
+        output_activations=QuantizationArgs(**kv_cache_dict),
+        targets=targets,
+    )
+    kv_cache_group = dict(kv_cache=kv_cache_scheme)
+    config.config_groups.update(kv_cache_group)
+    return config
 
 
 def apply_quantization_status(model: Module, status: QuantizationStatus):
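For context, `process_kv_cache_config` rewrites a top-level `kv_cache_scheme` into an ordinary "kv_cache" config group whose scheme targets `KV_CACHE_TARGETS` and quantizes output activations. A minimal sketch (values are hypothetical; an empty `config_groups` is used only to keep the example short):

    from compressed_tensors.quantization.lifecycle.apply import process_quantization_config
    from compressed_tensors.quantization.quant_args import QuantizationArgs
    from compressed_tensors.quantization.quant_config import QuantizationConfig

    config = QuantizationConfig(
        config_groups={},
        kv_cache_scheme=QuantizationArgs(num_bits=8),
    )
    config = process_quantization_config(config)

    kv_group = config.config_groups["kv_cache"]
    print(kv_group.targets)                      # ["re:.*k_proj", "re:.*v_proj"]
    print(kv_group.output_activations.num_bits)  # 8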
@@ -156,36 +202,45 @@ def apply_quantization_status(model: Module, status: QuantizationStatus):
         model.apply(compress_quantized_weights)
 
 
-def find_first_name_or_class_match(
+def find_name_or_class_matches(
     name: str, module: Module, targets: Iterable[str], check_contains: bool = False
-) -> Optional[str]:
-    # first element of targets that matches the given name
-    # if no name matches returns first target that matches the class name
-    # returns None otherwise
+) -> List[str]:
+    """
+    Returns all targets that match the given name or the class name.
+    Returns empty list otherwise.
+    The order of the output `matches` list matters.
+    The entries are sorted in the following order:
+        1. matches on exact strings
+        2. matches on regex patterns
+        3. matches on module names
+    """
+    targets = sorted(targets, key=lambda x: ("re:" in x, x))
     if isinstance(targets, Iterable):
-        return _find_first_match(name, targets) or _find_first_match(
+        matches = _find_matches(name, targets) + _find_matches(
             module.__class__.__name__, targets, check_contains
         )
+        matches = [match for match in matches if match is not None]
+        return matches
 
 
-def _find_first_match(
+def _find_matches(
     value: str, targets: Iterable[str], check_contains: bool = False
-) -> Optional[str]:
-    # returns first element of target that matches value either
+) -> List[str]:
+    # returns all the targets that match value either
     # exactly or as a regex after 're:'. if check_contains is set to True,
     # additionally checks if the target string is contained with value.
-
+    matches = []
     for target in targets:
         if target.startswith("re:"):
             pattern = target[3:]
             if re.match(pattern, value):
-                return target
+                matches.append(target)
         elif check_contains:
            if target.lower() in value.lower():
-                return target
+                matches.append(target)
        elif target == value:
-            return target
-    return None
+            matches.append(target)
+    return matches
 
 
 def _infer_status(model: Module) -> Optional[QuantizationStatus]:
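The replacement helper returns every matching target rather than only the first one, with exact-string matches ordered ahead of `re:` patterns and class-name matches last. An illustrative sketch (module and target names are hypothetical):

    import torch
    from compressed_tensors.quantization.lifecycle.apply import find_name_or_class_matches

    module = torch.nn.Linear(8, 8)
    targets = ["re:.*k_proj", "model.layers.0.self_attn.k_proj", "Linear"]

    matches = find_name_or_class_matches(
        "model.layers.0.self_attn.k_proj", module, targets
    )
    # exact name match, then the regex match, then the class-name match
    print(matches)  # ["model.layers.0.self_attn.k_proj", "re:.*k_proj", "Linear"]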
@@ -223,3 +278,68 @@ def _load_quant_args_from_state_dict(
             zp.data = zp_from_state.to(device).to(zp.dtype)
         else:  # fill with zeros matching scale shape
             zp.data = torch.zeros_like(scale, dtype=zp.dtype).to(device)
+
+
+def _scheme_from_targets(
+    target_to_scheme: OrderedDictType[str, QuantizationScheme],
+    targets: List[str],
+    name: str,
+) -> QuantizationScheme:
+    if len(targets) == 1:
+        # if `targets` iterable contains a single element
+        # use it as the key
+        return target_to_scheme[targets[0]]
+
+    # otherwise, we need to merge QuantizationSchemes corresponding
+    # to multiple targets. This is most likely because `name` module
+    # is being target both as an ordinary quantization target, as well
+    # as kv cache quantization target
+    schemes_to_merge = [target_to_scheme[target] for target in targets]
+    return _merge_schemes(schemes_to_merge, name)
+
+
+def _merge_schemes(
+    schemes_to_merge: List[QuantizationScheme], name: str
+) -> QuantizationScheme:
+
+    kv_cache_quantization_scheme = [
+        scheme for scheme in schemes_to_merge if is_kv_cache_quant_scheme(scheme)
+    ]
+    if not kv_cache_quantization_scheme:
+        # if the schemes_to_merge do not contain any
+        # kv cache QuantizationScheme
+        # return the first scheme (the prioritized one,
+        # since the order of schemes_to_merge matters)
+        return schemes_to_merge[0]
+    else:
+        # fetch the kv cache QuantizationScheme and the highest
+        # priority non-kv cache QuantizationScheme and merge them
+        kv_cache_quantization_scheme = kv_cache_quantization_scheme[0]
+        quantization_scheme = [
+            scheme
+            for scheme in schemes_to_merge
+            if not is_kv_cache_quant_scheme(scheme)
+        ][0]
+        schemes_to_merge = [kv_cache_quantization_scheme, quantization_scheme]
+        merged_scheme = {}
+        for scheme in schemes_to_merge:
+            scheme_dict = {
+                k: v for k, v in scheme.model_dump().items() if v is not None
+            }
+            # when merging multiple schemes, the final target will be
+            # the `name` argument - hence erase the original targets
+            del scheme_dict["targets"]
+            # make sure that schemes do not "clash" with each other
+            overlapping_keys = set(merged_scheme.keys()) & set(scheme_dict.keys())
+            if overlapping_keys:
+                raise ValueError(
+                    f"The module: {name} is being modified by two clashing "
+                    f"quantization schemes, that jointly try to override "
+                    f"properties: {overlapping_keys}. Fix the quantization config "
+                    "so that it is not ambiguous."
+                )
+            merged_scheme.update(scheme_dict)
+
+        merged_scheme.update(targets=[name])
+
+        return QuantizationScheme(**merged_scheme)
@@ -16,6 +16,7 @@ from enum import Enum
 from typing import Dict, List, Optional, Union
 
 from compressed_tensors.config import CompressionFormat
+from compressed_tensors.quantization.quant_args import QuantizationArgs
 from compressed_tensors.quantization.quant_scheme import (
     QuantizationScheme,
     preset_name_to_scheme,
@@ -25,6 +26,7 @@ from compressed_tensors.quantization.utils import (
     is_module_quantized,
     iter_named_leaf_modules,
     module_type,
+    parse_out_kv_cache_args,
 )
 from pydantic import BaseModel, Field
 from torch.nn import Module
@@ -117,7 +119,18 @@ class QuantizationConfig(BaseModel):
         other quantization configs
     :param format: specifies how the quantized model is stored on disk
     :quantization_status: specifies the current status of all quantized layers. It is
-        assumed all layers are in the same state.
+        assumed all layers are in the same state.
+    :param kv_cache_scheme: optional QuantizationArgs, that specify the
+        quantization of the kv cache. If None, kv cache is not quantized.
+        When applying kv cache quantization to transformer AutoModelForCausalLM,
+        the kv_cache_scheme gets converted into a QuantizationScheme that:
+            - targets the `q_proj` and `k_proj` modules of the model. The outputs
+              of those modules are the keys and values that might be cached
+            - quantizes the outputs of the aformentioned layers, so that
+              keys and values are compressed before storing them in the cache
+        There is an explicit assumption that the model contains modules with
+        `k_proj` and `v_proj` in their names. If this is not the case
+        and kv_cache_scheme != None, the quantization of kv cache will fail
     :global_compression_ratio: optional informational config to report the model
         compression ratio acheived by the quantization config
     :ignore: optional list of layers to ignore from config_groups. Layers in this list
@@ -126,6 +139,7 @@ class QuantizationConfig(BaseModel):
 
     config_groups: Dict[str, Union[QuantizationScheme, List[str]]]
     quant_method: str = DEFAULT_QUANTIZATION_METHOD
+    kv_cache_scheme: Optional[QuantizationArgs] = None
     format: str = DEFAULT_QUANTIZATION_FORMAT
     quantization_status: QuantizationStatus = QuantizationStatus.INITIALIZED
     global_compression_ratio: Optional[float] = None
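With the new field, a config can carry the kv cache scheme alongside its regular config groups, and it is serialized with the rest of the quantization config. A small sketch (field values are illustrative only):

    from compressed_tensors.quantization.quant_args import QuantizationArgs
    from compressed_tensors.quantization.quant_config import QuantizationConfig
    from compressed_tensors.quantization.quant_scheme import QuantizationScheme

    config = QuantizationConfig(
        config_groups={
            "group_0": QuantizationScheme(
                targets=["Linear"], weights=QuantizationArgs(num_bits=4)
            )
        },
        kv_cache_scheme=QuantizationArgs(num_bits=8),
    )
    print(config.model_dump()["kv_cache_scheme"]["num_bits"])  # 8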
@@ -154,7 +168,7 @@ class QuantizationConfig(BaseModel):
     ) -> Optional["QuantizationConfig"]:
         """
         Converts a model into its associated QuantizationConfig based on the
-        QuantizationScheme attached to each quanitzed module
+        QuantizationScheme attached to each quantized module
 
         :param model: model to calculate quantization scheme of
         :return: filled out QuantizationScheme for the input model
@@ -195,6 +209,13 @@ class QuantizationConfig(BaseModel):
             # else we leave it off the ignore list, doesn't fall under any of the
             # existing quantization schemes so it won't be quantized
 
+        kv_cache_args, quant_scheme_to_layers = parse_out_kv_cache_args(
+            quant_scheme_to_layers
+        )
+        kv_cache_scheme = (
+            kv_cache_args.model_dump() if kv_cache_args is not None else kv_cache_args
+        )
+
         config_groups = {}
         for idx, scheme in enumerate(quant_scheme_to_layers):
             group_name = "group_" + str(idx)
@@ -213,6 +234,7 @@ class QuantizationConfig(BaseModel):
         return QuantizationConfig(
             config_groups=config_groups,
             quantization_status=quantization_status,
+            kv_cache_scheme=kv_cache_scheme,
             global_compression_ratio=compression_ratio,
             format=format,
             ignore=consolidated_ignore,
@@ -17,7 +17,6 @@ from typing import List, Optional
 
 from compressed_tensors.quantization.quant_args import (
     QuantizationArgs,
-    QuantizationStrategy,
     QuantizationType,
 )
 from pydantic import BaseModel
@@ -13,10 +13,13 @@
 # limitations under the License.
 
 import logging
-from typing import Optional, Tuple
+import re
+from typing import List, Optional, Tuple
 
 import torch
 from compressed_tensors.quantization.observers.base import Observer
+from compressed_tensors.quantization.quant_args import QuantizationArgs
+from compressed_tensors.quantization.quant_scheme import QuantizationScheme
 from torch.nn import Module
 from tqdm import tqdm
 
@@ -30,8 +33,12 @@ __all__ = [
     "calculate_compression_ratio",
     "get_torch_bit_depth",
     "can_quantize",
+    "parse_out_kv_cache_args",
+    "KV_CACHE_TARGETS",
+    "is_kv_cache_quant_scheme",
 ]
 
+KV_CACHE_TARGETS = ["re:.*k_proj", "re:.*v_proj"]
 _LOGGER: logging.Logger = logging.getLogger(__name__)
 
 
@@ -182,3 +189,62 @@ def calculate_compression_ratio(model: Module) -> float:
         total_uncompressed += uncompressed_bits * num_weights
 
     return total_uncompressed / total_compressed
+
+
+def is_kv_cache_quant_scheme(scheme: QuantizationScheme) -> bool:
+    """
+    Check whether the QuantizationScheme targets the kv cache.
+    It does if all the following criteria are met:
+    - the scheme targets either exactly match the KV_CACHE_TARGETS
+      or the match KV_CACHE_TARGETS regex pattern
+    - the scheme quantizes output_activations (we want to quantize the
+      outputs from the KV_CACHE_TARGETS, as their correspond to the
+      keys and values that are to be saved in the cache)
+
+    :param scheme: The QuantizationScheme to investigate
+    :return: boolean flag
+    """
+    if len(scheme.targets) == 1:
+        # match on the KV_CACHE_TARGETS regex pattern
+        # if there is only one target
+        is_match_targets = any(
+            [re.match(pattern[3:], scheme.targets[0]) for pattern in KV_CACHE_TARGETS]
+        )
+    else:
+        # match on the exact KV_CACHE_TARGETS
+        # if there are multiple targets
+        is_match_targets = set(KV_CACHE_TARGETS) == set(scheme.targets)
+
+    is_match_output_activations = scheme.output_activations is not None
+    return is_match_targets and is_match_output_activations
+
+
+def parse_out_kv_cache_args(
+    quant_scheme_to_layers: List[QuantizationScheme],
+) -> Tuple[Optional[QuantizationArgs], List[QuantizationScheme]]:
+    """
+    If possible, parse out the kv cache specific QuantizationArgs
+    from the list of the QuantizationSchemes. If no kv cache
+    specific QuantizationArgs available, this function acts
+    as an identity function
+
+    :param quant_scheme_to_layers: list of QuantizationSchemes
+    :return: kv_cache_args (optional) and the (remaining or original)
+        list of the QuantizationSchemes
+    """
+    kv_cache_quant_scheme_to_layers = [
+        scheme for scheme in quant_scheme_to_layers if is_kv_cache_quant_scheme(scheme)
+    ]
+    quant_scheme_to_layers = [
+        scheme
+        for scheme in quant_scheme_to_layers
+        if not is_kv_cache_quant_scheme(scheme)
+    ]
+
+    if kv_cache_quant_scheme_to_layers:
+        kv_cache_quant_scheme_to_layers = kv_cache_quant_scheme_to_layers[0]
+        kv_cache_args = kv_cache_quant_scheme_to_layers.output_activations
+    else:
+        kv_cache_args = None
+
+    return kv_cache_args, quant_scheme_to_layers
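Taken together, `is_kv_cache_quant_scheme` recognizes a scheme that targets `KV_CACHE_TARGETS` and quantizes output activations, and `parse_out_kv_cache_args` splits such a scheme out of a list of schemes. A short sketch (the schemes are hypothetical):

    from compressed_tensors.quantization.quant_args import QuantizationArgs
    from compressed_tensors.quantization.quant_scheme import QuantizationScheme
    from compressed_tensors.quantization.utils import (
        is_kv_cache_quant_scheme,
        parse_out_kv_cache_args,
    )

    kv_scheme = QuantizationScheme(
        targets=["re:.*k_proj", "re:.*v_proj"],
        output_activations=QuantizationArgs(num_bits=8),
    )
    weight_scheme = QuantizationScheme(
        targets=["Linear"], weights=QuantizationArgs(num_bits=4)
    )

    print(is_kv_cache_quant_scheme(kv_scheme))      # True
    print(is_kv_cache_quant_scheme(weight_scheme))  # False

    kv_cache_args, remaining = parse_out_kv_cache_args([kv_scheme, weight_scheme])
    print(kv_cache_args.num_bits)  # 8
    print(len(remaining))          # 1, only the weight scheme is left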
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
 from typing import Optional
 
 from transformers import AutoConfig
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: compressed-tensors-nightly
-Version: 0.4.0.20240623
+Version: 0.4.0.20240627
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
@@ -1,15 +1,15 @@
 compressed_tensors/__init__.py,sha256=SV1csvHUVCd8kHXz6UDZim1HZ_fAVG3vfk-j_4Bb6hY,789
-compressed_tensors/base.py,sha256=OA2TOLP1gP3LSH7gp508eqr2ZtDQ-pqRHElCp-aB0vs,755
+compressed_tensors/base.py,sha256=Mq4mfVQcJhNpha-BXzpOfpmFIdl01o09BJE7D2oQ_00,796
 compressed_tensors/version.py,sha256=cJJf0y0NnXErTtQtVQjOvrq9hMIkhXIfBwuu4Tuxl24,1586
 compressed_tensors/compressors/__init__.py,sha256=wmX4VnkUTS63xBwK5-6w8FP78bNZpcdcqvf2KOEC5E4,1133
-compressed_tensors/compressors/base.py,sha256=LWEgbpgTxzmoqQ7Xhq2OQszUgWoDtFuGCiV1Y8nlBGw,2134
-compressed_tensors/compressors/dense.py,sha256=G_XHbvuENyupIKlXSITOQgvPkNkcMEOLcLWQr70V9EE,1257
+compressed_tensors/compressors/base.py,sha256=-rqT2h9G2iwDkwrVj0d0jxxn9h0dccJA1mqOzVEkwGM,2144
+compressed_tensors/compressors/dense.py,sha256=xcWECjcRY4INN6jC7vHx5wvUX3NmnKlxA9SVE1A6m2Q,1267
 compressed_tensors/compressors/helpers.py,sha256=k9avlkmeYj6vkOAvl-MgcixtP7ib24SCfhzZ-RusXfw,5403
-compressed_tensors/compressors/marlin_24.py,sha256=X_BjtFB3Mn0hqiLz56UM3jGX2eNmGLnvEIPfbg7di6U,9444
-compressed_tensors/compressors/model_compressor.py,sha256=83AWAhlrR3QTNelfMGCh_10G-VfMIRXRTvV0ZZinCU8,13338
-compressed_tensors/compressors/naive_quantized.py,sha256=N3y5LxsCaTUJHT30sqEhnviZsyoz1v2eUaayE7-f8Xs,5562
-compressed_tensors/compressors/pack_quantized.py,sha256=ODb03_WaBQ1l99Gmp49olAUZ2TB_67z9qNZbc56X7NU,8275
-compressed_tensors/compressors/sparse_bitmask.py,sha256=H9oZSTYI1oRCzAMbd4zThUnZd1h2rfs8DmA3tPcvuNE,8637
+compressed_tensors/compressors/marlin_24.py,sha256=PULMP1fp1sNWz-xOxvM0JXhOrUbq6sPwOTscYSifgDw,9450
+compressed_tensors/compressors/model_compressor.py,sha256=t4dH7Yh637JV53VPyys-gkoMPJHGf_tlWWufLRyIdUM,13418
+compressed_tensors/compressors/naive_quantized.py,sha256=6_1wuTF96-lw-UzzrsiEX_ipciKiQQJoZ8uotVwtbyQ,5569
+compressed_tensors/compressors/pack_quantized.py,sha256=ZRqqBVPB6B-nZQOSdu7WhKrKWIm2-ZVrUQHATxO2Boc,10297
+compressed_tensors/compressors/sparse_bitmask.py,sha256=kiDwBlFV0sJGLcIdDYxIiuF64ccgwDfqq1hWRQThYDc,8647
 compressed_tensors/compressors/utils/__init__.py,sha256=-mbGDZh1hd9T6u62Ht_iBIK255UmMg0f5bLkSs1f9Cc,731
 compressed_tensors/compressors/utils/helpers.py,sha256=4fq7KclSIK__jemCG9pwYlgWLrQjsaAMxhIrhjdw0BQ,1506
 compressed_tensors/compressors/utils/permutations_24.py,sha256=kx6fsfDHebx94zsSzhXGyCyuC9sVyah6BUUir_StT28,2530
@@ -20,10 +20,10 @@ compressed_tensors/config/dense.py,sha256=NgSxnFCnckU9-iunxEaqiFwqgdO7YYxlWKR74j
 compressed_tensors/config/sparse_bitmask.py,sha256=pZUboRNZTu6NajGOQEFExoPknak5ynVAUeiiYpS1Gt8,1308
 compressed_tensors/quantization/__init__.py,sha256=83J5bPB7PavN2TfCoW7_vEDhfYpm4TDrqYO9vdSQ5bk,760
 compressed_tensors/quantization/quant_args.py,sha256=Vc_tWSTcbZZsMJlACpLq4JEPvGx87izc8VEx-mcXjoM,5621
-compressed_tensors/quantization/quant_config.py,sha256=hL42sXp1wAZxyrkHarw7tAMRcwSVEr0MT3wmrmL3NhE,8285
-compressed_tensors/quantization/quant_scheme.py,sha256=Yhaj3QJn4lifGMoQ8mlXXOdLDZA6iGMthb_0hlAzvVk,3811
+compressed_tensors/quantization/quant_config.py,sha256=PU3BchHm09ks6_yAderrHoIZI07zBlU9ejC87v3A-54,9568
+compressed_tensors/quantization/quant_scheme.py,sha256=TU9W3bOWCY2l5Vrha0ufRtW1ac4gew1uwW8N3JGbZvg,3785
 compressed_tensors/quantization/lifecycle/__init__.py,sha256=ggRGWRqhCxCaTTDWRcgTVX3axnS2xV6rc5YvdzK7fSg,798
-compressed_tensors/quantization/lifecycle/apply.py,sha256=eQfuIGcX6KBKeMta1svviXXRpKO3og2CRrxhKlGcE_k,8756
+compressed_tensors/quantization/lifecycle/apply.py,sha256=fyv5ujZC0__oG1ESOTmMyMsKK7DGAxG7uQI7_sxT7Mw,13308
 compressed_tensors/quantization/lifecycle/calibration.py,sha256=mLns4jlaWmBwOW8Jtlm5bMX-JET1AiZYUBO7qa-XuxI,1776
 compressed_tensors/quantization/lifecycle/compressed.py,sha256=VreB10xPwgSLQQlTu20UCrFpRS--cA7-lx5s7nrPPrg,2247
 compressed_tensors/quantization/lifecycle/forward.py,sha256=tcjL_qyE3ODourNprt2bndF7_ALlUEGY2_Yag4exLoE,11908
@@ -35,14 +35,14 @@ compressed_tensors/quantization/observers/helpers.py,sha256=DSNGNJpZyT2Lyu0c82dH
 compressed_tensors/quantization/observers/memoryless.py,sha256=jH_c6K3gxf4W3VNXQ7tbnP-J_86QTrEfjBn6Kh1C-H8,2165
 compressed_tensors/quantization/observers/min_max.py,sha256=UK7zCMzxv9GGn6BflBxdajV20RiWaCY2RHcvZodCP1w,3669
 compressed_tensors/quantization/utils/__init__.py,sha256=VdtEmP0bvuND_IGQnyqUPc5lnFp-1_yD7StKSX4x80w,656
-compressed_tensors/quantization/utils/helpers.py,sha256=NzAH18Cn_-mTAR87y6IlcQU5gC393XSjgNKC9CRkr78,6017
+compressed_tensors/quantization/utils/helpers.py,sha256=YjXABJQUnelof-z7qcwck6fnrFLh4uMSrOmPiqNp_RY,8591
 compressed_tensors/registry/__init__.py,sha256=FwLSNYqfIrb5JD_6OK_MT4_svvKTN_nEhpgQlQvGbjI,658
 compressed_tensors/registry/registry.py,sha256=fxjOjh2wklCvJhQxwofdy-zV8q7MkQ85SLG77nml2iA,11890
 compressed_tensors/utils/__init__.py,sha256=5DrYjoZbaEvSkJcC-GRSbM_RBHVF4tG9gMd3zsJnjLw,665
-compressed_tensors/utils/helpers.py,sha256=5ull5yFT31M2zVxKeFvpvvlvX5f1Sk1LGuj_wrfZWCY,2267
+compressed_tensors/utils/helpers.py,sha256=dt4uxSIeqvqDmeJBJ6UUVHEOnMI7EtMSzEDv6PRUu14,2266
 compressed_tensors/utils/safetensors_load.py,sha256=0MheXwx1jeY12PeISppiSIZHs6rmN2YddwPpFb9V67I,8527
-compressed_tensors_nightly-0.4.0.20240623.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-compressed_tensors_nightly-0.4.0.20240623.dist-info/METADATA,sha256=TKdmWA3qynRUK6FyOoPODvDpc8DB0sKjjiX2hN3uU7A,5668
-compressed_tensors_nightly-0.4.0.20240623.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-compressed_tensors_nightly-0.4.0.20240623.dist-info/top_level.txt,sha256=w2i-GyPs2s1UwVxvutSvN_lM22SXC2hQFBmoMcPnV7Y,19
-compressed_tensors_nightly-0.4.0.20240623.dist-info/RECORD,,
+compressed_tensors_nightly-0.4.0.20240627.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+compressed_tensors_nightly-0.4.0.20240627.dist-info/METADATA,sha256=pRkLnBBttymxaUP8mHpKe_NQ4Mfa6gV3TMoBj6o3NCU,5668
+compressed_tensors_nightly-0.4.0.20240627.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+compressed_tensors_nightly-0.4.0.20240627.dist-info/top_level.txt,sha256=w2i-GyPs2s1UwVxvutSvN_lM22SXC2hQFBmoMcPnV7Y,19
+compressed_tensors_nightly-0.4.0.20240627.dist-info/RECORD,,