compressed-tensors-nightly 0.7.1.20241018__tar.gz → 0.7.1.20241020__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. {compressed-tensors-nightly-0.7.1.20241018/src/compressed_tensors_nightly.egg-info → compressed-tensors-nightly-0.7.1.20241020}/PKG-INFO +1 -1
  2. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/cache.py +0 -1
  3. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/lifecycle/calibration.py +12 -0
  4. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/lifecycle/forward.py +57 -26
  5. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/lifecycle/frozen.py +3 -7
  6. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/lifecycle/initialize.py +14 -14
  7. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/observers/mse.py +3 -3
  8. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020/src/compressed_tensors_nightly.egg-info}/PKG-INFO +1 -1
  9. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/LICENSE +0 -0
  10. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/README.md +0 -0
  11. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/pyproject.toml +0 -0
  12. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/setup.cfg +0 -0
  13. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/setup.py +0 -0
  14. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/__init__.py +0 -0
  15. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/base.py +0 -0
  16. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/compressors/__init__.py +0 -0
  17. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/compressors/base.py +0 -0
  18. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/compressors/helpers.py +0 -0
  19. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/compressors/model_compressors/__init__.py +0 -0
  20. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/compressors/model_compressors/model_compressor.py +0 -0
  21. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/compressors/quantized_compressors/__init__.py +0 -0
  22. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/compressors/quantized_compressors/base.py +0 -0
  23. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/compressors/quantized_compressors/naive_quantized.py +0 -0
  24. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py +0 -0
  25. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/compressors/sparse_compressors/__init__.py +0 -0
  26. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/compressors/sparse_compressors/base.py +0 -0
  27. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/compressors/sparse_compressors/dense.py +0 -0
  28. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/compressors/sparse_compressors/sparse_bitmask.py +0 -0
  29. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/compressors/sparse_quantized_compressors/__init__.py +0 -0
  30. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/compressors/sparse_quantized_compressors/marlin_24.py +0 -0
  31. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/config/__init__.py +0 -0
  32. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/config/base.py +0 -0
  33. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/config/dense.py +0 -0
  34. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/config/sparse_bitmask.py +0 -0
  35. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/linear/__init__.py +0 -0
  36. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/linear/compressed_linear.py +0 -0
  37. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/__init__.py +0 -0
  38. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/lifecycle/__init__.py +0 -0
  39. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/lifecycle/apply.py +0 -0
  40. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/lifecycle/compressed.py +0 -0
  41. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/lifecycle/helpers.py +0 -0
  42. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/observers/__init__.py +0 -0
  43. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/observers/base.py +0 -0
  44. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/observers/helpers.py +0 -0
  45. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/observers/min_max.py +0 -0
  46. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/quant_args.py +0 -0
  47. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/quant_config.py +0 -0
  48. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/quant_scheme.py +0 -0
  49. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/utils/__init__.py +0 -0
  50. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/utils/helpers.py +0 -0
  51. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/registry/__init__.py +0 -0
  52. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/registry/registry.py +0 -0
  53. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/utils/__init__.py +0 -0
  54. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/utils/helpers.py +0 -0
  55. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/utils/offload.py +0 -0
  56. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/utils/permutations_24.py +0 -0
  57. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/utils/permute.py +0 -0
  58. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/utils/safetensors_load.py +0 -0
  59. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/utils/semi_structured_conversions.py +0 -0
  60. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/version.py +0 -0
  61. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors_nightly.egg-info/SOURCES.txt +0 -0
  62. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors_nightly.egg-info/dependency_links.txt +0 -0
  63. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors_nightly.egg-info/requires.txt +0 -0
  64. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors_nightly.egg-info/top_level.txt +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: compressed-tensors-nightly
-Version: 0.7.1.20241018
+Version: 0.7.1.20241020
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
src/compressed_tensors/quantization/cache.py
@@ -28,7 +28,6 @@ class KVCacheScaleType(Enum):
 
 
 class QuantizedKVParameterCache(HFDyanmicCache):
-
     """
     Quantized KV cache used in the forward call based on HF's dynamic cache.
     Quantization strategy (tensor, group, channel) set from Quantization arg's strategy
src/compressed_tensors/quantization/lifecycle/calibration.py
@@ -53,7 +53,19 @@ def set_module_for_calibration(module: Module, quantize_weights_upfront: bool =
 
     if quantize_weights_upfront and module.quantization_scheme.weights is not None:
         # set weight scale and zero_point up front, calibration data doesn't affect it
+        if not hasattr(module, "weight_observer"):
+            from compressed_tensors.quantization.lifecycle.initialize import (
+                initialize_observers,
+            )
+
+            initialize_observers(
+                module=module,
+                base_name="weight",
+                quantization_args=module.quantization_scheme.weights,
+            )
+
         observer = module.weight_observer
+
         g_idx = getattr(module, "weight_g_idx", None)
 
         offloaded = is_module_offloaded(module)
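
Note: with this change, set_module_for_calibration no longer assumes the weight observer was registered during initialization; it creates one on demand. A minimal usage sketch, assuming a module whose quantization scheme includes weights ("layer" below is a hypothetical quantized torch.nn.Linear):

    # lazy-observer sketch: before this diff, a missing "weight_observer"
    # attribute here raised AttributeError; now it is initialized on demand
    set_module_for_calibration(layer, quantize_weights_upfront=True)
    # layer.weight_scale / layer.weight_zero_point are now populated from
    # the freshly attached weight observer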
src/compressed_tensors/quantization/lifecycle/forward.py
@@ -38,7 +38,8 @@ __all__ = [
     "dequantize",
     "fake_quantize",
     "wrap_module_forward_quantized",
-    "maybe_calibrate_or_quantize",
+    "forward_quantize",
+    "calibrate_activations",
 ]
 
 
@@ -276,14 +277,24 @@ def wrap_module_forward_quantized(module: Module, scheme: QuantizationScheme):
 
         if scheme.input_activations is not None:
             # calibrate and (fake) quantize input activations when applicable
-            input_ = maybe_calibrate_or_quantize(
-                module, input_, "input", scheme.input_activations
-            )
+            # NOTE: will be moved out of compressed-tensors
+            if (
+                module.quantization_status == QuantizationStatus.CALIBRATION
+                and not scheme.input_activations.dynamic
+            ):
+                calibrate_activations(
+                    module=module,
+                    value=input_,
+                    base_name="input",
+                    quantization_args=scheme.input_activations,
+                )
+
+            input_ = forward_quantize(module, input_, "input", scheme.input_activations)
 
         if scheme.weights is not None and not compressed:
             # calibrate and (fake) quantize weights when applicable
             unquantized_weight = self.weight.data.clone()
-            self.weight.data = maybe_calibrate_or_quantize(
+            self.weight.data = forward_quantize(
                 module, self.weight, "weight", scheme.weights
             )
 
@@ -296,7 +307,19 @@ def wrap_module_forward_quantized(module: Module, scheme: QuantizationScheme):
             # calibrate and (fake) quantize output activations when applicable
             # kv_cache scales updated on model self_attn forward call in
             # wrap_module_forward_quantized_attn
-            output = maybe_calibrate_or_quantize(
+
+            if (
+                module.quantization_status == QuantizationStatus.CALIBRATION
+                and not scheme.output_activations.dynamic
+            ):
+                calibrate_activations(
+                    module=module,
+                    value=output,
+                    base_name="output",
+                    quantization_args=scheme.output_activations,
+                )
+
+            output = forward_quantize(
                 module, output, "output", scheme.output_activations
             )
 
@@ -356,12 +379,36 @@ def wrap_module_forward_quantized_attn(module: Module, scheme: QuantizationSchem
     setattr(module, "forward", bound_wrapped_forward)
 
 
-def maybe_calibrate_or_quantize(
+def calibrate_activations(
+    module: Module,
+    value: torch.Tensor,
+    base_name: str,
+    quantization_args: QuantizationArgs,
+):
+    # If empty tensor, can't update zp/scale
+    # Case for MoEs
+    if value.numel() == 0:
+        return
+    # calibration mode - get new quant params from observer
+    if not hasattr(module, f"{base_name}_observer"):
+        from compressed_tensors.quantization.lifecycle import initialize_observers
+
+        initialize_observers(
+            module=module, base_name=base_name, quantization_args=quantization_args
+        )
+
+    observer = getattr(module, f"{base_name}_observer")
+
+    updated_scale, updated_zero_point = observer(value)
+
+    # update scale and zero point
+    update_parameter_data(module, updated_scale, f"{base_name}_scale")
+    update_parameter_data(module, updated_zero_point, f"{base_name}_zero_point")
+
+
+def forward_quantize(
     module: Module, value: torch.Tensor, base_name: str, args: "QuantizationArgs"
 ) -> torch.Tensor:
-    # don't run quantization if we haven't entered calibration mode
-    if module.quantization_status == QuantizationStatus.INITIALIZED:
-        return value
 
     # in compressed mode, the weight is already compressed and quantized so we don't
     # need to run fake quantization
@@ -386,22 +433,6 @@ def maybe_calibrate_or_quantize(
     scale = getattr(module, f"{base_name}_scale")
     zero_point = getattr(module, f"{base_name}_zero_point", None)
 
-    if (
-        module.quantization_status == QuantizationStatus.CALIBRATION
-        and base_name != "weight"
-    ):
-        # calibration mode - get new quant params from observer
-        observer = getattr(module, f"{base_name}_observer")
-
-        updated_scale, updated_zero_point = observer(value, g_idx=g_idx)
-
-        # update scale and zero point
-        update_parameter_data(module, updated_scale, f"{base_name}_scale")
-        update_parameter_data(module, updated_zero_point, f"{base_name}_zero_point")
-
-        scale = updated_scale
-        zero_point = updated_zero_point
-
     return fake_quantize(
         x=value,
         scale=scale,
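
Note: taken together, the forward.py hunks split the old maybe_calibrate_or_quantize into two single-purpose functions: calibrate_activations updates observer statistics and the module's scale/zero_point parameters, while forward_quantize only applies fake quantization with whatever parameters are already present. A hedged usage sketch ("layer" and "calibration_batches" are assumed, not part of the library):

    # hypothetical calibration loop over a quantized layer "layer"
    layer.quantization_status = QuantizationStatus.CALIBRATION
    for batch in calibration_batches:  # assumed iterable of input tensors
        calibrate_activations(
            module=layer,
            value=batch,
            base_name="input",
            quantization_args=layer.quantization_scheme.input_activations,
        )
    # forward_quantize then fake-quantizes using the scale / zero_point
    # the observer just wrote onto the module
    q = forward_quantize(
        layer, batch, "input", layer.quantization_scheme.input_activations
    )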
src/compressed_tensors/quantization/lifecycle/frozen.py
@@ -41,15 +41,11 @@ def freeze_module_quantization(module: Module):
         return
 
     # delete observers from module if not dynamic
-    if scheme.input_activations and not scheme.input_activations.dynamic:
+    if hasattr(module, "input_observer") and not scheme.input_activations.dynamic:
         delattr(module, "input_observer")
-    if scheme.weights and not scheme.weights.dynamic:
+    if hasattr(module, "weight_observer") and not scheme.weights.dynamic:
         delattr(module, "weight_observer")
-    if (
-        scheme.output_activations
-        and not is_kv_cache_quant_scheme(scheme)
-        and not scheme.output_activations.dynamic
-    ):
+    if hasattr(module, "output_observer") and not scheme.output_activations.dynamic:
         delattr(module, "output_observer")
 
     module.quantization_status = QuantizationStatus.FROZEN
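
Note: freezing now keys observer cleanup off the attributes actually present on the module rather than re-deriving eligibility from the scheme, so observers that were never registered (e.g. kv-cache output schemes) no longer need special-casing. A simplified sketch of the guard pattern, which omits the per-scheme dynamic check shown in the diff:

    # generic hasattr-guarded teardown; "module" is any quantized submodule
    for base_name in ("input", "weight", "output"):
        if hasattr(module, f"{base_name}_observer"):
            delattr(module, f"{base_name}_observer")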
src/compressed_tensors/quantization/lifecycle/initialize.py
@@ -34,9 +34,7 @@ from compressed_tensors.utils import get_execution_device, is_module_offloaded
 from torch.nn import Module, Parameter
 
 
-__all__ = [
-    "initialize_module_for_quantization",
-]
+__all__ = ["initialize_module_for_quantization", "initialize_observers"]
 
 
 _LOGGER = logging.getLogger(__name__)
@@ -74,7 +72,7 @@ def initialize_module_for_quantization(
     else:
 
         if scheme.input_activations is not None:
-            _initialize_scale_zero_point_observer(
+            _initialize_scale_zero_point(
                 module,
                 "input",
                 scheme.input_activations,
@@ -85,7 +83,7 @@ def initialize_module_for_quantization(
             weight_shape = None
             if isinstance(module, torch.nn.Linear):
                 weight_shape = module.weight.shape
-            _initialize_scale_zero_point_observer(
+            _initialize_scale_zero_point(
                 module,
                 "weight",
                 scheme.weights,
@@ -101,7 +99,7 @@ def initialize_module_for_quantization(
 
         if scheme.output_activations is not None:
             if not is_kv_cache_quant_scheme(scheme):
-                _initialize_scale_zero_point_observer(
+                _initialize_scale_zero_point(
                     module, "output", scheme.output_activations
                 )
 
@@ -146,21 +144,23 @@ def initialize_module_for_quantization(
         module._hf_hook.weights_map = new_prefix_dict
 
 
-def _initialize_scale_zero_point_observer(
+def initialize_observers(
     module: Module,
     base_name: str,
     quantization_args: QuantizationArgs,
-    weight_shape: Optional[torch.Size] = None,
-    force_zero_point: bool = True,
 ):
-
     # initialize observer module and attach as submodule
     observer = quantization_args.get_observer()
-    # no need to register an observer for dynamic quantization
-    if observer:
-        module.register_module(f"{base_name}_observer", observer)
+    module.register_module(f"{base_name}_observer", observer)
 
-    # no need to register a scale and zero point for a dynamic quantization
+
+def _initialize_scale_zero_point(
+    module: Module,
+    base_name: str,
+    quantization_args: QuantizationArgs,
+    weight_shape: Optional[torch.Size] = None,
+    force_zero_point: bool = True,
+):
     if quantization_args.dynamic:
         return
 
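
Note: the old _initialize_scale_zero_point_observer has been split in two: initialize_observers (now public, used by the lazy paths above) only attaches the observer submodule, and _initialize_scale_zero_point creates the quantization parameters. A sketch of calling the public half on its own ("linear_layer" and "weight_args" are assumed to exist):

    from compressed_tensors.quantization.lifecycle import initialize_observers

    initialize_observers(
        module=linear_layer,            # hypothetical torch.nn.Linear
        base_name="weight",
        quantization_args=weight_args,  # a QuantizationArgs instance
    )
    assert hasattr(linear_layer, "weight_observer")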
src/compressed_tensors/quantization/observers/mse.py
@@ -70,9 +70,9 @@ class MovingAverageMSEObserver(Observer):
         absolute_min_val = torch.amin(observed, dim=reduce_dims, keepdims=True)
         absolute_max_val = torch.amax(observed, dim=reduce_dims, keepdims=True)
 
-        best = torch.full(absolute_min_val.shape, float("inf"))
-        min_val = torch.ones(absolute_min_val.shape)
-        max_val = torch.zeros(absolute_max_val.shape)
+        best = torch.full_like(absolute_min_val, torch.finfo(absolute_min_val.dtype).max)
+        min_val = torch.ones_like(absolute_min_val)
+        max_val = torch.zeros_like(absolute_max_val)
         for i in range(int(self.maxshrink * self.grid)):
             p = 1 - i / self.grid
             shrinked_min_val = p * absolute_min_val
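
Note: the *_like constructors inherit both dtype and device from the observed tensor, so the MSE grid search no longer allocates CPU float32 scratch tensors when calibrating a CUDA or half-precision model, and torch.finfo(...).max replaces the float("inf") sentinel. A small illustration (assumes a CUDA device is available):

    import torch

    observed = torch.randn(4, 1, dtype=torch.float16, device="cuda")
    best_old = torch.full(observed.shape, float("inf"))  # CPU float32
    best_new = torch.full_like(observed, torch.finfo(observed.dtype).max)
    assert best_new.device == observed.device
    assert best_new.dtype == observed.dtype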
src/compressed_tensors_nightly.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: compressed-tensors-nightly
-Version: 0.7.1.20241018
+Version: 0.7.1.20241020
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.