compressed-tensors-nightly 0.7.1.20241017__py3-none-any.whl → 0.7.1.20241020__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- compressed_tensors/quantization/cache.py +0 -1
- compressed_tensors/quantization/lifecycle/calibration.py +12 -0
- compressed_tensors/quantization/lifecycle/forward.py +57 -26
- compressed_tensors/quantization/lifecycle/frozen.py +3 -7
- compressed_tensors/quantization/lifecycle/initialize.py +14 -14
- compressed_tensors/quantization/observers/mse.py +3 -3
- {compressed_tensors_nightly-0.7.1.20241017.dist-info → compressed_tensors_nightly-0.7.1.20241020.dist-info}/METADATA +1 -1
- {compressed_tensors_nightly-0.7.1.20241017.dist-info → compressed_tensors_nightly-0.7.1.20241020.dist-info}/RECORD +11 -11
- {compressed_tensors_nightly-0.7.1.20241017.dist-info → compressed_tensors_nightly-0.7.1.20241020.dist-info}/LICENSE +0 -0
- {compressed_tensors_nightly-0.7.1.20241017.dist-info → compressed_tensors_nightly-0.7.1.20241020.dist-info}/WHEEL +0 -0
- {compressed_tensors_nightly-0.7.1.20241017.dist-info → compressed_tensors_nightly-0.7.1.20241020.dist-info}/top_level.txt +0 -0
compressed_tensors/quantization/lifecycle/calibration.py

@@ -53,7 +53,19 @@ def set_module_for_calibration(module: Module, quantize_weights_upfront: bool =

     if quantize_weights_upfront and module.quantization_scheme.weights is not None:
         # set weight scale and zero_point up front, calibration data doesn't affect it
+        if not hasattr(module, "weight_observer"):
+            from compressed_tensors.quantization.lifecycle.initialize import (
+                initialize_observers,
+            )
+
+            initialize_observers(
+                module=module,
+                base_name="weight",
+                quantization_args=module.quantization_scheme.weights,
+            )
+
         observer = module.weight_observer
+
         g_idx = getattr(module, "weight_g_idx", None)

         offloaded = is_module_offloaded(module)
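In short, set_module_for_calibration no longer assumes a weight observer was attached during initialization; it creates one on demand through the new initialize_observers helper. A minimal, self-contained sketch of that lazy-attach pattern (MinMaxObserver and ensure_weight_observer are illustrative stand-ins, not library APIs):

```python
import torch
from torch.nn import Linear, Module


class MinMaxObserver(Module):
    """Toy stand-in for an observer: derive a symmetric int8 scale/zero-point."""

    def forward(self, value: torch.Tensor):
        max_abs = value.abs().amax()
        scale = (max_abs / 127.0).clamp(min=1e-8)
        zero_point = torch.zeros_like(scale, dtype=torch.int8)
        return scale, zero_point


def ensure_weight_observer(module: Module) -> Module:
    # Lazily attach the observer the first time calibration needs it,
    # mirroring the hasattr() guard added in the hunk above.
    if not hasattr(module, "weight_observer"):
        module.register_module("weight_observer", MinMaxObserver())
    return module.weight_observer


layer = Linear(16, 8)
observer = ensure_weight_observer(layer)
scale, zero_point = observer(layer.weight)
print(scale.item(), zero_point.item())
```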
compressed_tensors/quantization/lifecycle/forward.py

@@ -38,7 +38,8 @@ __all__ = [
     "dequantize",
     "fake_quantize",
     "wrap_module_forward_quantized",
-    "maybe_calibrate_or_quantize",
+    "forward_quantize",
+    "calibrate_activations",
 ]

@@ -276,14 +277,24 @@ def wrap_module_forward_quantized(module: Module, scheme: QuantizationScheme):

         if scheme.input_activations is not None:
             # calibrate and (fake) quantize input activations when applicable
-            input_ = maybe_calibrate_or_quantize(
-                module, input_, "input", scheme.input_activations
-            )
+            # NOTE: will be moved out of compressed-tensors
+            if (
+                module.quantization_status == QuantizationStatus.CALIBRATION
+                and not scheme.input_activations.dynamic
+            ):
+                calibrate_activations(
+                    module=module,
+                    value=input_,
+                    base_name="input",
+                    quantization_args=scheme.input_activations,
+                )
+
+            input_ = forward_quantize(module, input_, "input", scheme.input_activations)

         if scheme.weights is not None and not compressed:
             # calibrate and (fake) quantize weights when applicable
             unquantized_weight = self.weight.data.clone()
-            self.weight.data = maybe_calibrate_or_quantize(
+            self.weight.data = forward_quantize(
                 module, self.weight, "weight", scheme.weights
             )

@@ -296,7 +307,19 @@ def wrap_module_forward_quantized(module: Module, scheme: QuantizationScheme):
             # calibrate and (fake) quantize output activations when applicable
             # kv_cache scales updated on model self_attn forward call in
             # wrap_module_forward_quantized_attn
-            output = maybe_calibrate_or_quantize(
+
+            if (
+                module.quantization_status == QuantizationStatus.CALIBRATION
+                and not scheme.output_activations.dynamic
+            ):
+                calibrate_activations(
+                    module=module,
+                    value=output,
+                    base_name="output",
+                    quantization_args=scheme.ouput_activations,
+                )
+
+            output = forward_quantize(
                 module, output, "output", scheme.output_activations
             )

@@ -356,12 +379,36 @@ def wrap_module_forward_quantized_attn(module: Module, scheme: QuantizationScheme):
     setattr(module, "forward", bound_wrapped_forward)


-def maybe_calibrate_or_quantize(
+def calibrate_activations(
+    module: Module,
+    value: torch.Tensor,
+    base_name: str,
+    quantization_args: QuantizationArgs,
+):
+    # If empty tensor, can't update zp/scale
+    # Case for MoEs
+    if value.numel() == 0:
+        return
+    # calibration mode - get new quant params from observer
+    if not hasattr(module, f"{base_name}_observer"):
+        from compressed_tensors.quantization.lifecycle import initialize_observers
+
+        initialize_observers(
+            module=module, base_name=base_name, quantization_args=quantization_args
+        )
+
+    observer = getattr(module, f"{base_name}_observer")
+
+    updated_scale, updated_zero_point = observer(value)
+
+    # update scale and zero point
+    update_parameter_data(module, updated_scale, f"{base_name}_scale")
+    update_parameter_data(module, updated_zero_point, f"{base_name}_zero_point")
+
+
+def forward_quantize(
     module: Module, value: torch.Tensor, base_name: str, args: "QuantizationArgs"
 ) -> torch.Tensor:
-    # don't run quantization if we haven't entered calibration mode
-    if module.quantization_status == QuantizationStatus.INITIALIZED:
-        return value

     # in compressed mode, the weight is already compressed and quantized so we don't
     # need to run fake quantization
@@ -386,22 +433,6 @@ def maybe_calibrate_or_quantize(
     scale = getattr(module, f"{base_name}_scale")
     zero_point = getattr(module, f"{base_name}_zero_point", None)

-    if (
-        module.quantization_status == QuantizationStatus.CALIBRATION
-        and base_name != "weight"
-    ):
-        # calibration mode - get new quant params from observer
-        observer = getattr(module, f"{base_name}_observer")
-
-        updated_scale, updated_zero_point = observer(value, g_idx=g_idx)
-
-        # update scale and zero point
-        update_parameter_data(module, updated_scale, f"{base_name}_scale")
-        update_parameter_data(module, updated_zero_point, f"{base_name}_zero_point")
-
-        scale = updated_scale
-        zero_point = updated_zero_point
-
     return fake_quantize(
         x=value,
         scale=scale,
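The net effect in forward.py: during calibration, a non-dynamic activation first passes through its observer to refresh the stored scale and zero-point (calibrate_activations), and the value is then fake-quantized with those parameters (forward_quantize). A self-contained sketch of that two-step flow, using a toy min-max observer and plain tensors rather than the library's modules:

```python
import torch


def observe_minmax(value: torch.Tensor, num_bits: int = 8):
    """Toy observer: derive an asymmetric scale/zero-point from the batch."""
    qmin, qmax = 0, 2**num_bits - 1
    min_val, max_val = value.amin(), value.amax()
    scale = ((max_val - min_val) / (qmax - qmin)).clamp(min=1e-8)
    zero_point = torch.round(qmin - min_val / scale).clamp(qmin, qmax)
    return scale, zero_point


def fake_quantize(value: torch.Tensor, scale, zero_point, num_bits: int = 8):
    """Quantize then immediately dequantize (simulated quantization error)."""
    qmin, qmax = 0, 2**num_bits - 1
    q = torch.clamp(torch.round(value / scale + zero_point), qmin, qmax)
    return (q - zero_point) * scale


calibrating = True  # analogous to QuantizationStatus.CALIBRATION
x = torch.randn(4, 16)

if calibrating:
    scale, zero_point = observe_minmax(x)   # step 1: refresh quant params
x_q = fake_quantize(x, scale, zero_point)   # step 2: always fake-quantize
print((x - x_q).abs().max())
```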
compressed_tensors/quantization/lifecycle/frozen.py

@@ -41,15 +41,11 @@ def freeze_module_quantization(module: Module):
         return

     # delete observers from module if not dynamic
-    if scheme.input_activations and not scheme.input_activations.dynamic:
+    if hasattr(module, "input_observer") and not scheme.input_activations.dynamic:
         delattr(module, "input_observer")
-    if scheme.weights and not scheme.weights.dynamic:
+    if hasattr(module, "weight_observer") and not scheme.weights.dynamic:
         delattr(module, "weight_observer")
-    if (
-        scheme.output_activations
-        and not is_kv_cache_quant_scheme(scheme)
-        and not scheme.output_activations.dynamic
-    ):
+    if hasattr(module, "output_observer") and not scheme.output_activations.dynamic:
         delattr(module, "output_observer")

     module.quantization_status = QuantizationStatus.FROZEN
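Freezing now only deletes observers that actually exist on the module (the hasattr guard), which matters because observers may never have been attached for some tensors. A tiny illustration of the guard on a plain torch module (names are illustrative only):

```python
import torch
from torch.nn import Linear, Module

layer = Linear(8, 4)
layer.register_module("weight_observer", Module())  # pretend calibration attached one

# Only remove observers that are actually present; "input_observer" was never created.
for name in ("input_observer", "weight_observer", "output_observer"):
    if hasattr(layer, name):
        delattr(layer, name)

print(hasattr(layer, "weight_observer"))  # False
```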
compressed_tensors/quantization/lifecycle/initialize.py

@@ -34,9 +34,7 @@ from compressed_tensors.utils import get_execution_device, is_module_offloaded
 from torch.nn import Module, Parameter


-__all__ = [
-    "initialize_module_for_quantization",
-]
+__all__ = ["initialize_module_for_quantization", "initialize_observers"]


 _LOGGER = logging.getLogger(__name__)
@@ -74,7 +72,7 @@ def initialize_module_for_quantization(
     else:

         if scheme.input_activations is not None:
-            _initialize_scale_zero_point_observer(
+            _initialize_scale_zero_point(
                 module,
                 "input",
                 scheme.input_activations,
@@ -85,7 +83,7 @@ def initialize_module_for_quantization(
             weight_shape = None
             if isinstance(module, torch.nn.Linear):
                 weight_shape = module.weight.shape
-            _initialize_scale_zero_point_observer(
+            _initialize_scale_zero_point(
                 module,
                 "weight",
                 scheme.weights,
@@ -101,7 +99,7 @@ def initialize_module_for_quantization(

         if scheme.output_activations is not None:
             if not is_kv_cache_quant_scheme(scheme):
-                _initialize_scale_zero_point_observer(
+                _initialize_scale_zero_point(
                     module, "output", scheme.output_activations
                 )

@@ -146,21 +144,23 @@ def initialize_module_for_quantization(
         module._hf_hook.weights_map = new_prefix_dict


-def _initialize_scale_zero_point_observer(
+def initialize_observers(
     module: Module,
     base_name: str,
     quantization_args: QuantizationArgs,
-    weight_shape: Optional[torch.Size] = None,
-    force_zero_point: bool = True,
 ):
-
     # initialize observer module and attach as submodule
     observer = quantization_args.get_observer()
-
-    if observer:
-        module.register_module(f"{base_name}_observer", observer)
+    module.register_module(f"{base_name}_observer", observer)

-
+
+def _initialize_scale_zero_point(
+    module: Module,
+    base_name: str,
+    quantization_args: QuantizationArgs,
+    weight_shape: Optional[torch.Size] = None,
+    force_zero_point: bool = True,
+):
     if quantization_args.dynamic:
         return

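initialize.py now separates the two concerns: initialize_observers attaches the observer as a named submodule, while _initialize_scale_zero_point registers the scale and zero-point state (and still does nothing for dynamic schemes). A rough, self-contained illustration of that split on a plain torch module (the helper names below are stand-ins, not the library's):

```python
import torch
from torch.nn import Linear, Module, Parameter


class ToyObserver(Module):
    """Placeholder observer; the real library builds one via quantization_args.get_observer()."""

    def forward(self, value: torch.Tensor):
        return value.abs().amax() / 127.0, torch.zeros((), dtype=torch.int8)


def attach_observer(module: Module, base_name: str) -> None:
    # analogous to initialize_observers: the observer lives as a named submodule
    module.register_module(f"{base_name}_observer", ToyObserver())


def attach_scale_zero_point(module: Module, base_name: str, dynamic: bool = False) -> None:
    # analogous to _initialize_scale_zero_point: dynamic schemes keep no stored params
    if dynamic:
        return
    module.register_parameter(f"{base_name}_scale", Parameter(torch.ones(1), requires_grad=False))
    module.register_parameter(f"{base_name}_zero_point", Parameter(torch.zeros(1), requires_grad=False))


layer = Linear(16, 8)
attach_observer(layer, "weight")
attach_scale_zero_point(layer, "weight")
print(list(dict(layer.named_parameters()).keys()))
```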
compressed_tensors/quantization/observers/mse.py

@@ -70,9 +70,9 @@ class MovingAverageMSEObserver(Observer):
         absolute_min_val = torch.amin(observed, dim=reduce_dims, keepdims=True)
         absolute_max_val = torch.amax(observed, dim=reduce_dims, keepdims=True)

-        best = torch.
-        min_val = torch.
-        max_val = torch.
+        best = torch.full_like(absolute_min_val, torch.finfo(absolute_min_val.dtype).max)
+        min_val = torch.ones_like(absolute_min_val)
+        max_val = torch.zeros_like(absolute_max_val)
         for i in range(int(self.maxshrink * self.grid)):
             p = 1 - i / self.grid
             shrinked_min_val = p * absolute_min_val
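The new lines seed the MSE search: `best` starts at the dtype's maximum so any candidate error beats it, and the loop then shrinks the observed range by a factor p = 1 - i/grid, keeping the min/max pair with the lowest quantization error. A condensed, self-contained sketch of that shrink-and-score loop (plain tensors, simple round-to-nearest quantization; parameter names are illustrative):

```python
import torch


def mse_range_search(observed: torch.Tensor, grid: int = 100, maxshrink: float = 0.8, num_bits: int = 8):
    """Search over shrunken [min, max] ranges and keep the pair with the lowest MSE."""
    qmin, qmax = 0, 2**num_bits - 1
    absolute_min_val, absolute_max_val = observed.amin(), observed.amax()

    best = torch.full((), torch.finfo(observed.dtype).max)  # start "infinitely" bad
    min_val, max_val = absolute_min_val, absolute_max_val

    for i in range(int(maxshrink * grid)):
        p = 1 - i / grid  # shrink factor, as in the observer loop above
        shrinked_min_val = p * absolute_min_val
        shrinked_max_val = p * absolute_max_val

        # fake-quantize with the candidate range and score the reconstruction error
        scale = ((shrinked_max_val - shrinked_min_val) / (qmax - qmin)).clamp(min=1e-8)
        zero_point = torch.round(qmin - shrinked_min_val / scale)
        q = torch.clamp(torch.round(observed / scale + zero_point), qmin, qmax)
        err = ((q - zero_point) * scale - observed).pow(2).mean()

        if err < best:
            best, min_val, max_val = err, shrinked_min_val, shrinked_max_val

    return min_val, max_val


lo, hi = mse_range_search(torch.randn(1024))
print(lo.item(), hi.item())
```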
{compressed_tensors_nightly-0.7.1.20241017.dist-info → compressed_tensors_nightly-0.7.1.20241020.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: compressed-tensors-nightly
-Version: 0.7.1.20241017
+Version: 0.7.1.20241020
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
{compressed_tensors_nightly-0.7.1.20241017.dist-info → compressed_tensors_nightly-0.7.1.20241020.dist-info}/RECORD

@@ -23,23 +23,23 @@ compressed_tensors/config/sparse_bitmask.py,sha256=pZUboRNZTu6NajGOQEFExoPknak5y
 compressed_tensors/linear/__init__.py,sha256=fH6rjBYAxuwrTzBTlTjTgCYNyh6TCvCqajCz4Im4YrA,617
 compressed_tensors/linear/compressed_linear.py,sha256=0jTTf6XxOAjAYs3tvFtgiNMAO4W10sSeR-pdH2M413g,3218
 compressed_tensors/quantization/__init__.py,sha256=nWP_fsl6Nn0ksEgZPzerGiETdvF-ZfNwPnwGlRiR5pY,805
-compressed_tensors/quantization/cache.py,sha256=
+compressed_tensors/quantization/cache.py,sha256=Sf_9Nfe3RpX04V84iUJMgLN9pWNMFIYvZW02LXcPUQw,6830
 compressed_tensors/quantization/quant_args.py,sha256=k7NuZn8OqjgzmAVaN2-jHPQ1bgDkMuUoLJtLnhkvIOI,9085
 compressed_tensors/quantization/quant_config.py,sha256=NCiMvUMnnz5kTyAkDylxjtEGQnjgsIYIeNR2zyHEdTQ,10371
 compressed_tensors/quantization/quant_scheme.py,sha256=5ggPz5sqEfTUgvJJeiPIINA74QtO-08hb3szsm7UHGE,6000
 compressed_tensors/quantization/lifecycle/__init__.py,sha256=MXE2E7GfIfRRfhrdGy2Og3AZOz5N59B0ZGFcsD89y6c,821
 compressed_tensors/quantization/lifecycle/apply.py,sha256=czaayvpeUYyWRJhO_klffw6esptOgA9sBKL5TWQcRdw,15805
-compressed_tensors/quantization/lifecycle/calibration.py,sha256=
+compressed_tensors/quantization/lifecycle/calibration.py,sha256=gPSD3kiH4VuU6nq-OLbOmhBGaMXsebEwLm4PkEnUhf0,3043
 compressed_tensors/quantization/lifecycle/compressed.py,sha256=Fj9n66IN0EWsOAkBHg3O0GlOQpxstqjCcs0ttzMXrJ0,2296
-compressed_tensors/quantization/lifecycle/forward.py,sha256=
-compressed_tensors/quantization/lifecycle/frozen.py,sha256=
+compressed_tensors/quantization/lifecycle/forward.py,sha256=8GjOnx4rwOZZqSDTdnejNOY2DVTjNDzH0DfY_rQam6k,16575
+compressed_tensors/quantization/lifecycle/frozen.py,sha256=8myzxsz5h5Odh5cIB2lDHb7xLRYBYnAhA1PO8YGuCtM,1839
 compressed_tensors/quantization/lifecycle/helpers.py,sha256=C0mhy2vJ0fCjVeN4kFNhw8Eq1wkteBGHiZ36RVLThRY,944
-compressed_tensors/quantization/lifecycle/initialize.py,sha256=
+compressed_tensors/quantization/lifecycle/initialize.py,sha256=lKoFy18PjbSklyum7f4hoLuWtHShBKax7JDTBzPlCqM,8839
 compressed_tensors/quantization/observers/__init__.py,sha256=DYrttzq-8MHLZUzpX-xzzm4hrw6HcXkMkux82KBKb1M,738
 compressed_tensors/quantization/observers/base.py,sha256=5ovQicWPYHjIxr6-EkQ4lgOX0PpI9g23iSzKpxjM1Zg,8420
 compressed_tensors/quantization/observers/helpers.py,sha256=o9hg4E9b5cCb5PaEAj6jHiUWkNrKtYtv0b1pGg-T9B4,5516
 compressed_tensors/quantization/observers/min_max.py,sha256=sQXqU3z-voxIDfR_9mQzwQUflZj2sASm_G8CYaXntFw,3865
-compressed_tensors/quantization/observers/mse.py,sha256=
+compressed_tensors/quantization/observers/mse.py,sha256=9JRbvXo0VKLrgsTNuVlQ7AV87wwjRUuQludG0v7IJbI,6058
 compressed_tensors/quantization/utils/__init__.py,sha256=VdtEmP0bvuND_IGQnyqUPc5lnFp-1_yD7StKSX4x80w,656
 compressed_tensors/quantization/utils/helpers.py,sha256=y4LEyC2oUd876ZMdALWKGH3Ct5EgBJZV4id_NUjTGH8,9531
 compressed_tensors/registry/__init__.py,sha256=FwLSNYqfIrb5JD_6OK_MT4_svvKTN_nEhpgQlQvGbjI,658
@@ -51,8 +51,8 @@ compressed_tensors/utils/permutations_24.py,sha256=kx6fsfDHebx94zsSzhXGyCyuC9sVy
 compressed_tensors/utils/permute.py,sha256=V6tJLKo3Syccj-viv4F7ZKZgJeCB-hl-dK8RKI_kBwI,2355
 compressed_tensors/utils/safetensors_load.py,sha256=m08ANVuTBxQdoa6LufDgcNJ7wCLDJolyZljB8VEybAU,8578
 compressed_tensors/utils/semi_structured_conversions.py,sha256=XKNffPum54kPASgqKzgKvyeqWPAkair2XEQXjkp7ho8,13489
-compressed_tensors_nightly-0.7.1.
-compressed_tensors_nightly-0.7.1.
-compressed_tensors_nightly-0.7.1.
-compressed_tensors_nightly-0.7.1.
-compressed_tensors_nightly-0.7.1.
+compressed_tensors_nightly-0.7.1.20241020.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+compressed_tensors_nightly-0.7.1.20241020.dist-info/METADATA,sha256=Oir-JDg1u_Tfz5K2YbciN27lTbthuV4fU0UmsvDRu9M,6799
+compressed_tensors_nightly-0.7.1.20241020.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
+compressed_tensors_nightly-0.7.1.20241020.dist-info/top_level.txt,sha256=w2i-GyPs2s1UwVxvutSvN_lM22SXC2hQFBmoMcPnV7Y,19
+compressed_tensors_nightly-0.7.1.20241020.dist-info/RECORD,,