PyPI - compressed-tensors - Versions diffs - 0.9.2__py3-none-any.whl → 0.9.3__py3-none-any.whl - Mend

compressed-tensors 0.9.2py3-none-any.whl → 0.9.3py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

compressed_tensors/compressors/model_compressors/model_compressor.py CHANGED Viewed

@@ -19,7 +19,7 @@ import os
 import re
 from contextlib import contextmanager
 from copy import deepcopy
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, TypeVar, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, TypeVar, Union
 import compressed_tensors
 import torch
@@ -522,10 +522,13 @@ class ModelCompressor:
                 update_parameter_data(module, data, param_name)
-def map_modules_to_quant_args(model: Module) -> Dict[str, QuantizationArgs]:
+def map_modules_to_quant_args(
+    model: Module,
+) -> Dict[str, Union[QuantizationArgs, Tuple[QuantizationArgs, QuantizationArgs]]]:
     """
     Given a pytorch model, map out the submodule name (usually linear layers)
-     to the QuantizationArgs
+    to the weight QuantizationArgs. If running input activation quantization, will also
+    map to the input QuantizationArgs in a tuple.
     :param model: pytorch model
     """
@@ -535,6 +538,12 @@ def map_modules_to_quant_args(model: Module) -> Dict[str, QuantizationArgs]:
             if submodule.quantization_scheme.weights is not None:
                 name = fix_fsdp_module_name(name)
                 quantized_modules_to_args[name] = submodule.quantization_scheme.weights
+                if submodule.quantization_scheme.input_activations is not None:
+                    weight_args = quantized_modules_to_args.get(name)
+                    quantized_modules_to_args[name] = (
+                        weight_args,
+                        submodule.quantization_scheme.input_activations,
+                    )
     return quantized_modules_to_args

compressed_tensors/compressors/quantized_compressors/base.py CHANGED Viewed

@@ -82,11 +82,32 @@ class BaseQuantizationCompressor(BaseCompressor):
         """
         compressed_dict = {}
         weight_suffix = ".weight"
+        input_zp_suffix = ".input_zero_point"
+        weight_zp_suffix = ".weight_zero_point"
         _LOGGER.debug(
             f"Compressing model with {len(model_state)} parameterized layers..."
         )
         for name, value in tqdm(model_state.items(), desc="Quantized Compression"):
+            # check if the parameter we're compressing is the weight zp
+            # or the input zp
+            is_weight_zp = name.endswith(weight_zp_suffix)
+            is_input_zp = name.endswith(input_zp_suffix)
+            # if we're saving the weight zp, fetch weight quant args
+            if is_weight_zp:
+                quant_args_zp = names_to_scheme.get(name[: -(len(weight_zp_suffix))])
+                if isinstance(quant_args_zp, tuple):
+                    # If tuple, first value is weight args, second is input args
+                    quant_args_zp = quant_args_zp[0]
+            # if we're saving the input zp, fetch input quant args
+            if is_input_zp:
+                input_args_zp = names_to_scheme.get(name[: -(len(input_zp_suffix))])
+                if isinstance(input_args_zp, tuple):
+                    # If tuple, first value is weight args, second is input args
+                    input_args_zp = input_args_zp[-1]
             if name.endswith(weight_suffix):
                 prefix = name[: -(len(weight_suffix))]
                 scale = model_state.get(merge_names(prefix, "weight_scale"), None)
@@ -94,7 +115,11 @@ class BaseQuantizationCompressor(BaseCompressor):
                 g_idx = model_state.get(merge_names(prefix, "weight_g_idx"), None)
                 if scale is not None:
                     # weight is quantized, compress it
-                    quant_args = names_to_scheme[prefix]
+                    if isinstance(names_to_scheme[prefix], tuple):
+                        quant_args = names_to_scheme[prefix][0]
+                    else:
+                        quant_args = names_to_scheme[prefix]
                     compressed_data = self.compress_weight(
                         weight=value,
                         scale=scale,
@@ -107,7 +132,11 @@ class BaseQuantizationCompressor(BaseCompressor):
                         compressed_dict[merge_names(prefix, key)] = value
                 else:
                     compressed_dict[name] = value.to("cpu")
-            elif name.endswith("zero_point") and torch.all(value == 0):
+            # only save if asym
+            elif is_weight_zp and quant_args_zp.symmetric:
+                continue
+            # only save if asym
+            elif is_input_zp and input_args_zp.symmetric:
                 continue
             elif name.endswith("g_idx") and torch.any(value <= -1):
                 continue

compressed_tensors/linear/compressed_linear.py CHANGED Viewed

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import warnings
 from typing import Dict, Tuple
 import torch
@@ -21,6 +22,7 @@ from compressed_tensors.quantization import (
     QuantizationStatus,
     initialize_module_for_quantization,
 )
+from compressed_tensors.utils import register_offload_parameter
 from torch import Tensor
 from torch.nn import Parameter
 from torch.nn.functional import linear
@@ -32,11 +34,16 @@ class CompressedLinear(Linear):
     Wrapper module for running a compressed forward pass of a quantized Linear module.
     The wrapped layer will decompressed on each forward call.
-    :param module: dense linear module to replace
-    :param quantization_scheme: quantization config for the module to wrap
-    :param quantization_format: compression format module is stored as
     """
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        warnings.warn(
+            "CompressedLinear should not be initialized directly. "
+            "Use the from_linear method instead.",
+            UserWarning,
+        )
     @classmethod
     @torch.no_grad()
     def from_linear(
@@ -45,6 +52,12 @@ class CompressedLinear(Linear):
         quantization_scheme: QuantizationScheme,
         quantization_format: str,
     ):
+        """
+        :param module: dense linear module to replace
+        :param quantization_scheme: quantization config for the module to wrap
+        :param quantization_format: compression format module is stored as
+        :return: CompressedLinear module wrapping the input module
+        """
         module.__class__ = CompressedLinear
         module.compressor = BaseCompressor.load_from_registry(quantization_format)
         device = next(module.parameters()).device
@@ -68,7 +81,7 @@ class CompressedLinear(Linear):
             param = Parameter(
                 torch.empty(shape, device=device, dtype=dtype), requires_grad=False
             )
-            module.register_parameter(name, param)
+            register_offload_parameter(module, name, param)
         # mark module as compressed
         module.quantization_status = QuantizationStatus.COMPRESSED
@@ -85,5 +98,11 @@ class CompressedLinear(Linear):
         """
         Decompresses the weight, then runs the wrapped forward pass
         """
-        uncompressed_weight = self.compressor.decompress_module(self)
-        return linear(input, uncompressed_weight, self.bias)
+        if self.quantization_status == QuantizationStatus.COMPRESSED:
+            weight_data = self.compressor.decompress_module(self)
+            param = Parameter(weight_data, requires_grad=False)
+            register_offload_parameter(self, "weight", param)
+            self.quantization_status = QuantizationStatus.FROZEN
+        return linear(input, self.weight, self.bias)

compressed_tensors/quantization/lifecycle/initialize.py CHANGED Viewed

@@ -203,11 +203,10 @@ def _initialize_attn_scales(module: Module) -> None:
         torch.empty(expected_shape, dtype=scale_dtype, device=device),
         requires_grad=False,
     )
-    module.register_parameter(KVCacheScaleType.KEY.value, init_scale)
+    register_offload_parameter(module, KVCacheScaleType.KEY.value, init_scale)
     init_scale = Parameter(
         torch.empty(expected_shape, dtype=scale_dtype, device=device),
         requires_grad=False,
     )
-    module.register_parameter(KVCacheScaleType.VALUE.value, init_scale)
+    register_offload_parameter(module, KVCacheScaleType.VALUE.value, init_scale)

compressed_tensors/version.py CHANGED Viewed

@@ -17,7 +17,7 @@ Functionality for storing and setting the version info for SparseML
 """
-version_base = "0.9.2"
+version_base = "0.9.3"
 is_release = True  # change to True to set the generated version as a release version

{compressed_tensors-0.9.2.dist-info → compressed_tensors-0.9.3.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.9.2
+Version: 0.9.3
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
@@ -26,6 +26,7 @@ Dynamic: description
 Dynamic: description-content-type
 Dynamic: home-page
 Dynamic: license
+Dynamic: license-file
 Dynamic: provides-extra
 Dynamic: requires-dist
 Dynamic: summary

{compressed_tensors-0.9.2.dist-info → compressed_tensors-0.9.3.dist-info}/RECORD RENAMED Viewed

@@ -1,13 +1,13 @@
 compressed_tensors/__init__.py,sha256=UtKmifNeBCSE2TZSAfduVNNzHY-3V7bLjZ7n7RuXLOE,812
 compressed_tensors/base.py,sha256=73HYH7HY7O2roC89yG_piPFnZwrBfn_i7HmKl90SKc0,875
-compressed_tensors/version.py,sha256=BaFkY2H2ed1_O6ZxJgT-GUlT7oI0xJIeXGPO8yXqBE0,1585
+compressed_tensors/version.py,sha256=X4y5lqlF1QFUgl25iumzagpg3dzyVoLP6i82HZEhCJA,1585
 compressed_tensors/compressors/__init__.py,sha256=smSygTSfcfuujRrAXDc6uZm4L_ccV1tWZewqVnOb4lM,825
 compressed_tensors/compressors/base.py,sha256=x8dQrWVEurynXw03yHJZTaAmrRTOsdZJoHjmvs0IKwk,7002
 compressed_tensors/compressors/helpers.py,sha256=OK6qxX9j3bHwF9JfIYSGMgBJe2PWjlTA3byXKCJaTIQ,5431
 compressed_tensors/compressors/model_compressors/__init__.py,sha256=5RGGPFu4YqEt_aOdFSQYFYFDjcZFJN0CsMqRtDZz3Js,666
-compressed_tensors/compressors/model_compressors/model_compressor.py,sha256=AmIE1SoNRH1fNgQALfNkdQo8y5tePVpdWUgLIOtf5rg,22569
+compressed_tensors/compressors/model_compressors/model_compressor.py,sha256=n0gcrKwefJuO6b4LNjCynJQf7NNqNHDcoLlzZgTCPGc,23080
 compressed_tensors/compressors/quantized_compressors/__init__.py,sha256=09UJq68Pht6Bf-4iP9xYl3tetKsncNPHD8IAGbePsr4,714
-compressed_tensors/compressors/quantized_compressors/base.py,sha256=cp8S1Kr3HhlMHIz7k4vGo-qxxdknEC3qP1QLIhNnwRA,7217
+compressed_tensors/compressors/quantized_compressors/base.py,sha256=GXTSWgFAhksbno94Ulpth9-YM4a7NsDlx4oQGGB0swQ,8567
 compressed_tensors/compressors/quantized_compressors/naive_quantized.py,sha256=fd0KlkSx6bvZ3xwIkK3jEUdPSUPs56Eua4dEDOtzKW0,5150
 compressed_tensors/compressors/quantized_compressors/pack_quantized.py,sha256=zH2PocRe_T5yt1-3kLdZH9AUQWQyaVOi4U9nEJiYaWA,8509
 compressed_tensors/compressors/sparse_compressors/__init__.py,sha256=Atuz-OdEgn8OCUhx7Ovd6gXdyImAI186uCR-uR0t_Nk,737
@@ -23,7 +23,7 @@ compressed_tensors/config/dense.py,sha256=NgSxnFCnckU9-iunxEaqiFwqgdO7YYxlWKR74j
 compressed_tensors/config/sparse_24_bitmask.py,sha256=Lhj39zT2V1hxftprvxvneyhv45ShlXOKd75DBbDTyTE,1401
 compressed_tensors/config/sparse_bitmask.py,sha256=pZUboRNZTu6NajGOQEFExoPknak5ynVAUeiiYpS1Gt8,1308
 compressed_tensors/linear/__init__.py,sha256=fH6rjBYAxuwrTzBTlTjTgCYNyh6TCvCqajCz4Im4YrA,617
-compressed_tensors/linear/compressed_linear.py,sha256=MJa-UfoKhIkdUWRD1shrXXri2cOwR5GK0a4t4bNYosM,3268
+compressed_tensors/linear/compressed_linear.py,sha256=_m6XpNcI53eeSHO8VdiuAM6UBTdpDhn5Ivd8iRMwEKc,3980
 compressed_tensors/quantization/__init__.py,sha256=83J5bPB7PavN2TfCoW7_vEDhfYpm4TDrqYO9vdSQ5bk,760
 compressed_tensors/quantization/quant_args.py,sha256=sKpb8DcNObidjXjNol1Tn_Iih3ZXBycSp-fyz68TGhY,9117
 compressed_tensors/quantization/quant_config.py,sha256=vx06wBo91p4LCb3Vzd-2eCTUeIf_Sz2ZXRP263eQyjQ,10385
@@ -33,7 +33,7 @@ compressed_tensors/quantization/lifecycle/apply.py,sha256=lZmCCSm1_o79iUAy460w6B
 compressed_tensors/quantization/lifecycle/compressed.py,sha256=Fj9n66IN0EWsOAkBHg3O0GlOQpxstqjCcs0ttzMXrJ0,2296
 compressed_tensors/quantization/lifecycle/forward.py,sha256=DOWouUqfaLA4Qhg-ojVVBdhhSAlgZqFC26vZARxE0ko,12961
 compressed_tensors/quantization/lifecycle/helpers.py,sha256=C0mhy2vJ0fCjVeN4kFNhw8Eq1wkteBGHiZ36RVLThRY,944
-compressed_tensors/quantization/lifecycle/initialize.py,sha256=hymYtayTSumm8KCYAYPY267aWmlsJpt8oQFiRblk8qE,7452
+compressed_tensors/quantization/lifecycle/initialize.py,sha256=sK3PLm69N91QepBuq-83Qd2Br6XcOmRDpD5qo_WWNJo,7469
 compressed_tensors/quantization/utils/__init__.py,sha256=VdtEmP0bvuND_IGQnyqUPc5lnFp-1_yD7StKSX4x80w,656
 compressed_tensors/quantization/utils/helpers.py,sha256=DBP-sGRpGAY01K0LFE7qqonNj4hkTYL_mXrMs2LtAD8,14100
 compressed_tensors/registry/__init__.py,sha256=FwLSNYqfIrb5JD_6OK_MT4_svvKTN_nEhpgQlQvGbjI,658
@@ -45,8 +45,8 @@ compressed_tensors/utils/permutations_24.py,sha256=kx6fsfDHebx94zsSzhXGyCyuC9sVy
 compressed_tensors/utils/permute.py,sha256=V6tJLKo3Syccj-viv4F7ZKZgJeCB-hl-dK8RKI_kBwI,2355
 compressed_tensors/utils/safetensors_load.py,sha256=5SeM2hzLh77Ne8Vk7qR6-km7cf8bhov41ExpWITqX3A,11470
 compressed_tensors/utils/semi_structured_conversions.py,sha256=XKNffPum54kPASgqKzgKvyeqWPAkair2XEQXjkp7ho8,13489
-compressed_tensors-0.9.2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-compressed_tensors-0.9.2.dist-info/METADATA,sha256=BmuThcrHjHQVScbgaWfyCRGuLo-cB6OnujZJPOKe1bQ,6975
-compressed_tensors-0.9.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-compressed_tensors-0.9.2.dist-info/top_level.txt,sha256=w2i-GyPs2s1UwVxvutSvN_lM22SXC2hQFBmoMcPnV7Y,19
-compressed_tensors-0.9.2.dist-info/RECORD,,
+compressed_tensors-0.9.3.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+compressed_tensors-0.9.3.dist-info/METADATA,sha256=zs3aFaG-BGV9hqJbW9Zwzex0TVcM5sPZhiaeVx2qjR0,6997
+compressed_tensors-0.9.3.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+compressed_tensors-0.9.3.dist-info/top_level.txt,sha256=w2i-GyPs2s1UwVxvutSvN_lM22SXC2hQFBmoMcPnV7Y,19
+compressed_tensors-0.9.3.dist-info/RECORD,,

{compressed_tensors-0.9.2.dist-info → compressed_tensors-0.9.3.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.8.0)
+Generator: setuptools (78.1.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

{compressed_tensors-0.9.2.dist-info → compressed_tensors-0.9.3.dist-info/licenses}/LICENSE RENAMED Viewed

File without changes

{compressed_tensors-0.9.2.dist-info → compressed_tensors-0.9.3.dist-info}/top_level.txt RENAMED Viewed

File without changes

compressed-tensors 0.9.2__py3-none-any.whl → 0.9.3__py3-none-any.whl

compressed-tensors 0.9.2py3-none-any.whl → 0.9.3py3-none-any.whl