PyPI - compressed-tensors-nightly - Versions diffs - 0.7.0.20241011__tar.gz → 0.7.0.20241013__tar.gz - Mend

compressed-tensors-nightly 0.7.0.20241011__tar.gz → 0.7.0.20241013__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (65) hide show

{compressed-tensors-nightly-0.7.0.20241011/src/compressed_tensors_nightly.egg-info → compressed-tensors-nightly-0.7.0.20241013}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: compressed-tensors-nightly
-Version: 0.7.0.20241011
+Version: 0.7.0.20241013
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.

{compressed-tensors-nightly-0.7.0.20241011 → compressed-tensors-nightly-0.7.0.20241013}/src/compressed_tensors/quantization/lifecycle/forward.py RENAMED Viewed

@@ -18,7 +18,10 @@ from typing import Callable, Optional
 import torch
 from compressed_tensors.quantization.cache import QuantizedKVParameterCache
-from compressed_tensors.quantization.observers.helpers import calculate_range
+from compressed_tensors.quantization.observers.helpers import (
+    calculate_range,
+    compute_dynamic_scales_and_zp,
+)
 from compressed_tensors.quantization.quant_args import (
     QuantizationArgs,
     QuantizationStrategy,
@@ -376,9 +379,8 @@ def maybe_calibrate_or_quantize(
     g_idx = getattr(module, "weight_g_idx", None)
     if args.dynamic:
-        # dynamic quantization - get scale and zero point directly from observer
-        observer = getattr(module, f"{base_name}_observer")
-        scale, zero_point = observer(value, g_idx=g_idx)
+        # dynamic quantization - no need to invoke observer
+        scale, zero_point = compute_dynamic_scales_and_zp(value=value, args=args)
     else:
         # static quantization - get previous scale and zero point from layer
         scale = getattr(module, f"{base_name}_scale")

{compressed-tensors-nightly-0.7.0.20241011 → compressed-tensors-nightly-0.7.0.20241013}/src/compressed_tensors/quantization/lifecycle/initialize.py RENAMED Viewed

@@ -153,12 +153,16 @@ def _initialize_scale_zero_point_observer(
     weight_shape: Optional[torch.Size] = None,
     force_zero_point: bool = True,
 ):
     # initialize observer module and attach as submodule
     observer = quantization_args.get_observer()
-    module.register_module(f"{base_name}_observer", observer)
+    # no need to register an observer for dynamic quantization
+    if observer:
+        module.register_module(f"{base_name}_observer", observer)
+    # no need to register a scale and zero point for a dynamic quantization
     if quantization_args.dynamic:
-        return  # no need to register a scale and zero point for a dynamic observer
+        return
     device = next(module.parameters()).device
     if is_module_offloaded(module):
@@ -173,10 +177,7 @@ def _initialize_scale_zero_point_observer(
             expected_shape = (weight_shape[0], 1)
         elif quantization_args.strategy == QuantizationStrategy.GROUP:
             num_groups = weight_shape[1] // quantization_args.group_size
-            expected_shape = (
-                weight_shape[0],
-                max(num_groups, 1)
-            )
+            expected_shape = (weight_shape[0], max(num_groups, 1))
     scale_dtype = module.weight.dtype
     if scale_dtype not in [torch.float16, torch.bfloat16, torch.float32]:

{compressed-tensors-nightly-0.7.0.20241011 → compressed-tensors-nightly-0.7.0.20241013}/src/compressed_tensors/quantization/observers/__init__.py RENAMED Viewed

@@ -17,6 +17,5 @@
 from .helpers import *
 from .base import *
-from .memoryless import *
 from .min_max import *
 from .mse import *

{compressed-tensors-nightly-0.7.0.20241011 → compressed-tensors-nightly-0.7.0.20241013}/src/compressed_tensors/quantization/observers/helpers.py RENAMED Viewed

@@ -13,18 +13,56 @@
 # limitations under the License.
 from collections import Counter
-from typing import Tuple
+from typing import Optional, Tuple
 import torch
 from compressed_tensors.quantization.quant_args import (
     FP8_DTYPE,
     QuantizationArgs,
+    QuantizationStrategy,
     QuantizationType,
 )
 from torch import FloatTensor, IntTensor, Tensor
-__all__ = ["calculate_qparams", "get_observer_token_count", "calculate_range"]
+__all__ = [
+    "calculate_qparams",
+    "get_observer_token_count",
+    "calculate_range",
+    "compute_dynamic_scales_and_zp",
+]
+def compute_dynamic_scales_and_zp(value: Tensor, args: QuantizationArgs):
+    """
+    Returns the computed scales and zero points for dynamic activation
+    quantization.
+    :param value: tensor to calculate quantization parameters for
+    :param args: quantization args
+        (the dimensions to reduce along are derived from args.strategy;
+        returned scale and zero point will be shaped (1,) along the
+        reduced dimensions)
+    :return: tuple of scale and zero point derived from the observed tensor
+    """
+    if args.strategy == QuantizationStrategy.TOKEN:
+        dim = {1, 2}
+        reduce_dims = tuple(idx for idx in range(value.ndim) if idx not in dim)
+    elif args.strategy == QuantizationStrategy.TENSOR:
+        reduce_dims = None
+    else:
+        raise ValueError(
+            f"One of {QuantizationStrategy.TOKEN} or {QuantizationStrategy.TENSOR} ",
+            "must be used for dynamic quantization",
+        )
+    if not reduce_dims:
+        min_val, max_val = torch.aminmax(value)
+    else:
+        min_val = torch.amin(value, dim=reduce_dims, keepdims=True)
+        max_val = torch.amax(value, dim=reduce_dims, keepdims=True)
+    return calculate_qparams(min_val, max_val, args)
 def get_observer_token_count(module: torch.nn.Module) -> Counter:

{compressed-tensors-nightly-0.7.0.20241011 → compressed-tensors-nightly-0.7.0.20241013}/src/compressed_tensors/quantization/quant_args.py RENAMED Viewed

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import warnings
 from enum import Enum
 from typing import Any, Dict, Optional, Union
@@ -94,7 +95,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
     block_structure: Optional[str] = None
     dynamic: bool = False
     actorder: Union[ActivationOrdering, bool, None] = None
-    observer: str = Field(
+    observer: Optional[str] = Field(
         default="minmax",
         description=(
             "The class to use to compute the quantization param - "
@@ -115,10 +116,10 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         """
         from compressed_tensors.quantization.observers.base import Observer
+        # No observer required for the dynamic case
         if self.dynamic:
-            # override defualt observer for dynamic, you never want minmax which
-            # keeps state across samples for dynamic
-            self.observer = "memoryless"
+            self.observer = None
+            return self.observer
         return Observer.load_from_registry(self.observer, quantization_args=self)
@@ -171,6 +172,8 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         strategy = model.strategy
         group_size = model.group_size
         actorder = model.actorder
+        dynamic = model.dynamic
+        observer = model.observer
         # infer strategy
         if strategy is None:
@@ -207,6 +210,27 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
                 "activation ordering"
             )
+        if dynamic:
+            if strategy not in (
+                QuantizationStrategy.TOKEN,
+                QuantizationStrategy.TENSOR,
+            ):
+                raise ValueError(
+                    f"One of {QuantizationStrategy.TOKEN} or "
+                    f"{QuantizationStrategy.TENSOR} must be used for dynamic ",
+                    "quantization",
+                )
+            if observer is not None:
+                warnings.warn(
+                    "No observer is used for dynamic quantization, setting to None"
+                )
+                model.observer = None
+        # if we have not set an observer and we
+        # are running static quantization, use minmax
+        if not observer and not dynamic:
+            model.observer = "minmax"
         # write back modified values
         model.strategy = strategy
         return model

{compressed-tensors-nightly-0.7.0.20241011 → compressed-tensors-nightly-0.7.0.20241013}/src/compressed_tensors/quantization/quant_scheme.py RENAMED Viewed

@@ -122,6 +122,7 @@ INT8_W8A8 = dict(
         strategy=QuantizationStrategy.TOKEN,
         symmetric=True,
         dynamic=True,
+        observer=None,
     ),
 )
@@ -164,6 +165,7 @@ INT8_W4A8 = dict(
         strategy=QuantizationStrategy.TOKEN,
         symmetric=True,
         dynamic=True,
+        observer=None,
     ),
 )
@@ -200,6 +202,7 @@ FP8_DYNAMIC = dict(
         strategy=QuantizationStrategy.TOKEN,
         symmetric=True,
         dynamic=True,
+        observer=None,
     ),
 )

{compressed-tensors-nightly-0.7.0.20241011 → compressed-tensors-nightly-0.7.0.20241013/src/compressed_tensors_nightly.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: compressed-tensors-nightly
-Version: 0.7.0.20241011
+Version: 0.7.0.20241013
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.

{compressed-tensors-nightly-0.7.0.20241011 → compressed-tensors-nightly-0.7.0.20241013}/src/compressed_tensors_nightly.egg-info/SOURCES.txt RENAMED Viewed

@@ -43,7 +43,6 @@ src/compressed_tensors/quantization/lifecycle/initialize.py
 src/compressed_tensors/quantization/observers/__init__.py
 src/compressed_tensors/quantization/observers/base.py
 src/compressed_tensors/quantization/observers/helpers.py
-src/compressed_tensors/quantization/observers/memoryless.py
 src/compressed_tensors/quantization/observers/min_max.py
 src/compressed_tensors/quantization/observers/mse.py
 src/compressed_tensors/quantization/utils/__init__.py

compressed-tensors-nightly-0.7.0.20241011/src/compressed_tensors/quantization/observers/memoryless.py DELETED Viewed

@@ -1,56 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from typing import Any, Optional, Tuple
-import torch
-from compressed_tensors.quantization.observers.base import Observer
-from compressed_tensors.quantization.observers.helpers import calculate_qparams
-from torch import FloatTensor, IntTensor, Tensor
-__all__ = ["MemorylessObserver"]
-@Observer.register("memoryless", alias=["dynamic"])
-class MemorylessObserver(Observer):
-    """
-    Implements a quantization observer that sets the scale and
-    zero point based on the latest observed value without tracking state
-    """
-    def calculate_qparams(
-        self,
-        observed: Tensor,
-        tensor_id: Optional[Any] = None,
-        reduce_dims: Optional[Tuple[int]] = None,
-    ) -> Tuple[FloatTensor, IntTensor]:
-        """
-        Returns the min and max values of observed tensor
-        :param observed: observed tensor to calculate quantization parameters for
-        :param tensor_id: optional id for tensor; not used for memoryless
-        :param reduce_dims: optional tuple of dimensions to reduce along,
-            returned scale and zero point will be shaped (1,) along the
-            reduced dimensions
-        :return: tuple of scale and zero point derived from the observed tensor
-        """
-        if not reduce_dims:
-            min_val, max_val = torch.aminmax(observed)
-        else:
-            min_val = torch.amin(observed, dim=reduce_dims, keepdims=True)
-            max_val = torch.amax(observed, dim=reduce_dims, keepdims=True)
-        return calculate_qparams(min_val, max_val, self.quantization_args)