compressed-tensors 0.3.1__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {compressed-tensors-0.3.1/src/compressed_tensors.egg-info → compressed-tensors-0.3.2}/PKG-INFO +4 -1
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/README.md +3 -0
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/setup.py +1 -1
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/compressors/__init__.py +1 -6
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/compressors/base.py +25 -1
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/compressors/dense.py +1 -1
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/compressors/helpers.py +0 -24
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/compressors/sparse_bitmask.py +3 -2
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/quantization/lifecycle/forward.py +18 -12
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/quantization/lifecycle/frozen.py +9 -9
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/quantization/lifecycle/initialize.py +7 -4
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/quantization/observers/memoryless.py +2 -2
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/quantization/quant_args.py +11 -0
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2/src/compressed_tensors.egg-info}/PKG-INFO +4 -1
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/LICENSE +0 -0
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/pyproject.toml +0 -0
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/setup.cfg +0 -0
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/__init__.py +0 -0
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/base.py +0 -0
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/config/__init__.py +0 -0
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/config/base.py +0 -0
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/config/dense.py +0 -0
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/config/sparse_bitmask.py +0 -0
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/quantization/__init__.py +0 -0
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/quantization/lifecycle/__init__.py +0 -0
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/quantization/lifecycle/apply.py +0 -0
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/quantization/lifecycle/calibration.py +0 -0
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/quantization/observers/__init__.py +0 -0
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/quantization/observers/base.py +0 -0
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/quantization/observers/helpers.py +0 -0
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/quantization/observers/min_max.py +0 -0
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/quantization/quant_config.py +0 -0
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/quantization/quant_scheme.py +0 -0
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/quantization/utils/__init__.py +0 -0
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/quantization/utils/helpers.py +0 -0
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/registry/__init__.py +0 -0
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/registry/registry.py +0 -0
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/utils/__init__.py +0 -0
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/utils/safetensors_load.py +0 -0
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors.egg-info/SOURCES.txt +0 -0
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors.egg-info/dependency_links.txt +0 -0
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors.egg-info/requires.txt +0 -0
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors.egg-info/top_level.txt +0 -0
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/tests/test_bitmask.py +0 -0
- {compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/tests/test_registry.py +0 -0
{compressed-tensors-0.3.1/src/compressed_tensors.egg-info → compressed-tensors-0.3.2}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: compressed-tensors
-Version: 0.3.1
+Version: 0.3.2
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
@@ -94,4 +94,7 @@ save_compressed_model(model, "compressed_model.safetensors", compression_format=
 state_dict = dict(load_compressed("compressed_model.safetensors", compression_config))
 ```
 
+For more in-depth tutorial on bitmask compression, refer to the [notebook](https://github.com/neuralmagic/compressed-tensors/blob/d707c5b84bc3fef164aebdcd97cb6eaa571982f8/examples/bitmask_compression.ipynb).
+
+
 
{compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/README.md
RENAMED
@@ -80,3 +80,6 @@ save_compressed_model(model, "compressed_model.safetensors", compression_format=
 # load compressed model weights (`dict` turns generator into a dictionary)
 state_dict = dict(load_compressed("compressed_model.safetensors", compression_config))
 ```
+
+For more in-depth tutorial on bitmask compression, refer to the [notebook](https://github.com/neuralmagic/compressed-tensors/blob/d707c5b84bc3fef164aebdcd97cb6eaa571982f8/examples/bitmask_compression.ipynb).
+
{compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/compressors/__init__.py
RENAMED
@@ -16,10 +16,5 @@
 
 from .base import ModelCompressor
 from .dense import DenseCompressor
-from .helpers import (
-    infer_compressor_from_model_config,
-    load_compressed,
-    save_compressed,
-    save_compressed_model,
-)
+from .helpers import load_compressed, save_compressed, save_compressed_model
 from .sparse_bitmask import BitmaskCompressor, BitmaskTensor
{compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/compressors/base.py
RENAMED
@@ -22,6 +22,7 @@ from compressed_tensors.utils import get_safetensors_folder
 from torch import Tensor
 from torch.nn import Module, Parameter
 from tqdm import tqdm
+from transformers import AutoConfig
 
 
 __all__ = ["ModelCompressor"]
@@ -34,6 +35,29 @@ class ModelCompressor(RegistryMixin):
     :param config: config specifying compression parameters
     """
 
+    @classmethod
+    def from_pretrained(
+        cls, pretrained_model_name_or_path: str
+    ) -> Optional["ModelCompressor"]:
+        """
+        Given a path to a model config, extract a sparsity config if it exists and
+        return the associated ModelCompressor
+
+        :param pretrained_model_name_or_path: path to model config on disk or HF hub
+        :return: matching compressor if config contains a sparsity config
+        """
+        config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
+        sparsity_config = getattr(config, SPARSITY_CONFIG_NAME, None)
+        if sparsity_config is None:
+            return None
+
+        format = sparsity_config.get("format")
+        sparsity_config = CompressionConfig.load_from_registry(
+            format, **sparsity_config
+        )
+        compressor = cls.load_from_registry(format, config=sparsity_config)
+        return compressor
+
     def __init__(self, config: Optional[CompressionConfig] = None):
         self.config = config
 
@@ -47,7 +71,7 @@ class ModelCompressor(RegistryMixin):
         raise NotImplementedError()
 
     def decompress(
-        self, path_to_model_or_tensors: str
+        self, path_to_model_or_tensors: str, device: str = "cpu"
     ) -> Generator[Tuple[str, Tensor], None, None]:
         """
         Reads a compressed state dict located at path_to_model_or_tensors
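This moves compressor inference from a standalone helper (removed from helpers.py below) onto the class itself. A minimal usage sketch of the new classmethod, assuming a hypothetical local model directory whose config.json carries a sparsity config:

```python
from compressed_tensors.compressors import ModelCompressor

# "./my-sparse-model" is a hypothetical path; from_pretrained returns None
# when the model config carries no sparsity config
compressor = ModelCompressor.from_pretrained("./my-sparse-model")
if compressor is not None:
    # decompress now takes an optional device argument, defaulting to "cpu"
    for name, tensor in compressor.decompress("./my-sparse-model", device="cpu"):
        print(name, tuple(tensor.shape))
```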
{compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/compressors/dense.py
RENAMED
@@ -29,6 +29,6 @@ class DenseCompressor(ModelCompressor):
         return model_state
 
     def decompress(
-        self, path_to_model_or_tensors: str, device: str
+        self, path_to_model_or_tensors: str, device: str = "cpu"
     ) -> Generator[Tuple[str, Tensor], None, None]:
         return iter([])
{compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/compressors/helpers.py
RENAMED
@@ -16,45 +16,21 @@ from pathlib import Path
 from typing import Dict, Generator, Optional, Tuple, Union
 
 import torch
-from compressed_tensors.base import SPARSITY_CONFIG_NAME
 from compressed_tensors.compressors import ModelCompressor
 from compressed_tensors.config import CompressionConfig, CompressionFormat
 from compressed_tensors.utils.safetensors_load import get_weight_mappings
 from safetensors import safe_open
 from safetensors.torch import save_file
 from torch import Tensor
-from transformers import AutoConfig
 
 
 __all__ = [
-    "infer_compressor_from_model_config",
     "load_compressed",
     "save_compressed",
    "save_compressed_model",
 ]
 
 
-def infer_compressor_from_model_config(
-    pretrained_model_name_or_path: str,
-) -> Optional[ModelCompressor]:
-    """
-    Given a path to a model config, extract a sparsity config if it exists and return
-    the associated ModelCompressor
-
-    :param pretrained_model_name_or_path: path to model config on disk or HF hub
-    :return: matching compressor if config contains a sparsity config
-    """
-    config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
-    sparsity_config = getattr(config, SPARSITY_CONFIG_NAME, None)
-    if sparsity_config is None:
-        return None
-
-    format = sparsity_config.get("format")
-    sparsity_config = CompressionConfig.load_from_registry(format, **sparsity_config)
-    compressor = ModelCompressor.load_from_registry(format, config=sparsity_config)
-    return compressor
-
-
 def save_compressed(
     tensors: Dict[str, Tensor],
     save_path: Union[str, Path],
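For code that imported the removed helper, the migration is a one-line switch to the classmethod added in base.py above; a sketch (the model identifier is illustrative):

```python
# 0.3.1
from compressed_tensors.compressors import infer_compressor_from_model_config
compressor = infer_compressor_from_model_config("path/or/hf-model-id")

# 0.3.2
from compressed_tensors.compressors import ModelCompressor
compressor = ModelCompressor.from_pretrained("path/or/hf-model-id")
```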
{compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/compressors/sparse_bitmask.py
RENAMED
@@ -75,8 +75,9 @@ class BitmaskCompressor(ModelCompressor):
         self, path_to_model_or_tensors: str, device: str = "cpu"
     ) -> Generator[Tuple[str, Tensor], None, None]:
         """
-        Reads a bitmask compressed state dict located at path_to_model_or_tensors
-        and returns a generator for sequentially decompressing back to a dense state dict
+        Reads a bitmask compressed state dict located
+        at path_to_model_or_tensors and returns a generator
+        for sequentially decompressing back to a dense state dict
 
         :param model_path: path to compressed safetensors model (directory with
             one or more safetensors files) or compressed tensors file
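The expanded docstring makes the contract explicit: decompression is lazy, yielding one dense tensor at a time. A sketch of consuming the generator, reusing the `compressed_model.safetensors` file from the README example (config-less construction relies on the optional config in `ModelCompressor.__init__`):

```python
from compressed_tensors.compressors import BitmaskCompressor

compressor = BitmaskCompressor()  # config is Optional per ModelCompressor.__init__

# tensors are yielded sequentially, so peak memory stays near one dense tensor;
# wrap in dict() only if the full state dict is actually needed at once
state_dict = {}
for name, tensor in compressor.decompress("compressed_model.safetensors"):
    state_dict[name] = tensor
```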
{compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/quantization/lifecycle/forward.py
RENAMED
@@ -111,7 +111,7 @@ def wrap_module_forward_quantized(module: Module, scheme: QuantizationScheme):
 
 
 def _maybe_calibrate_or_quantize(
-    module: Module, value: Module, base_name: str, args: "QuantizationArgs"
+    module: Module, value: torch.Tensor, base_name: str, args: "QuantizationArgs"
 ) -> torch.Tensor:
     # only run quantized for the included stages
     if module.quantization_status not in {
@@ -120,17 +120,23 @@ def _maybe_calibrate_or_quantize(
     }:
         return value
 
-    device = next(module.parameters()).device
-    scale = getattr(module, f"{base_name}_scale")
-    zero_point = getattr(module, f"{base_name}_zero_point")
-
-    if module.quantization_status == QuantizationStatus.CALIBRATION:
-        # get observer and get new quant params from observation
+    if args.dynamic:
+        # dynamic quantization - get scale and zero point directly from observer
         observer = getattr(module, f"{base_name}_observer")
-        updated_scale, updated_zero_point = observer(value)
-
-        # update scale and zero point
-        scale.data = updated_scale.to(device)
-        zero_point.data = updated_zero_point.to(device)
+        scale, zero_point = observer(value)
+    else:
+        # static quantization - get previous scale and zero point from layer
+        scale = getattr(module, f"{base_name}_scale")
+        zero_point = getattr(module, f"{base_name}_zero_point")
+
+        if module.quantization_status == QuantizationStatus.CALIBRATION:
+            # calibration mode - get new quant params from observer
+            observer = getattr(module, f"{base_name}_observer")
+            updated_scale, updated_zero_point = observer(value)
+
+            # update scale and zero point
+            device = next(module.parameters()).device
+            scale.data = updated_scale.to(device)
+            zero_point.data = updated_zero_point.to(device)
 
     return fake_quantize(value, scale, zero_point, args)
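The branch added here is the heart of the new dynamic-quantization support: the static path reads persisted `{base_name}_scale`/`{base_name}_zero_point` parameters (refreshing them from the observer only during calibration), while the dynamic path recomputes both from the current sample on every forward pass. A self-contained sketch of that decision using a plain min/max rule; the helper names and the simplified `fake_quantize` are stand-ins, not the library's implementations:

```python
from typing import Optional, Tuple

import torch


def minmax_qparams(value: torch.Tensor, num_bits: int = 8):
    # memoryless rule: derive scale/zero_point from this sample alone
    qmin, qmax = 0, 2**num_bits - 1
    scale = (value.max() - value.min()).clamp(min=1e-8) / (qmax - qmin)
    zero_point = (qmin - value.min() / scale).round().to(torch.int32)
    return scale, zero_point


def fake_quantize(value, scale, zero_point, num_bits: int = 8):
    # quantize then dequantize, so the rounding error is visible in float space
    qmin, qmax = 0, 2**num_bits - 1
    q = (value / scale + zero_point).round().clamp(qmin, qmax)
    return (q - zero_point) * scale


def quantize_activation(
    value: torch.Tensor,
    dynamic: bool,
    static_qparams: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
) -> torch.Tensor:
    if dynamic:
        # dynamic: observe a fresh quantization range on every sample
        scale, zero_point = minmax_qparams(value)
    else:
        # static: reuse the scale/zero_point captured during calibration
        scale, zero_point = static_qparams
    return fake_quantize(value, scale, zero_point)
```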
{compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/quantization/lifecycle/frozen.py
RENAMED
@@ -30,17 +30,17 @@ def freeze_module_quantization(module: Module):
 
     :param module: module to freeze quantization for
     """
-    if not getattr(module, "quantization_scheme", None):
+    scheme = getattr(module, "quantization_scheme", None)
+    if not scheme:
         # no quantization scheme nothing to do
         return
 
-    # delete observers from module
-    observer_names = []
-    for submodule_name, _ in module.named_modules():
-        if "observer" in submodule_name:
-            observer_names.append(submodule_name)
-
-    for observer_name in observer_names:
-        delattr(module, observer_name)
+    # delete observers from module if not dynamic
+    if scheme.input_activations and not scheme.input_activations.dynamic:
+        delattr(module, "input_observer")
+    if scheme.weights and not scheme.weights.dynamic:
+        delattr(module, "weight_observer")
+    if scheme.output_activations and not scheme.output_activations.dynamic:
+        delattr(module, "output_observer")
 
     module.quantization_status = QuantizationStatus.FROZEN
{compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/quantization/lifecycle/initialize.py
RENAMED
@@ -80,6 +80,13 @@ def initialize_module_for_quantization(
 def _initialize_scale_zero_point_observer(
     module: Module, base_name: str, quantization_args: QuantizationArgs
 ):
+    # initialize observer module and attach as submodule
+    observer = quantization_args.get_observer()
+    module.register_module(f"{base_name}_observer", observer)
+
+    if quantization_args.dynamic:
+        return  # no need to register a scale and zero point for a dynamic observer
+
     device = next(module.parameters()).device
 
     # initializes empty scale and zero point parameters for the module
@@ -90,7 +97,3 @@ def _initialize_scale_zero_point_observer(
         torch.empty(0, device=device, dtype=int), requires_grad=False
     )
     module.register_parameter(f"{base_name}_zero_point", init_zero_point)
-
-    # initialize observer module and attach as submodule
-    observer = quantization_args.get_observer()
-    module.register_module(f"{base_name}_observer", observer)
{compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/quantization/observers/memoryless.py
RENAMED
@@ -23,10 +23,10 @@ from torch import FloatTensor, IntTensor, Tensor
 __all__ = ["MemorylessObserver"]
 
 
-@Observer.register("memoryless")
+@Observer.register("memoryless", alias=["dynamic"])
 class MemorylessObserver(Observer):
     """
-    Implements a dynamic quantization observer that sets the scale and
+    Implements a quantization observer that sets the scale and
     zero point based on the latest observed value without tracking state
     """
 
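The new alias means "dynamic" and "memoryless" should resolve to the same observer class; a short sketch, assuming the registry honors aliases at lookup time (the import paths follow the module layout shown in this diff):

```python
from compressed_tensors.quantization.observers.base import Observer
from compressed_tensors.quantization.quant_args import QuantizationArgs

args = QuantizationArgs(num_bits=8, dynamic=True)

# both "dynamic" and "memoryless" load MemorylessObserver after this change
obs = Observer.load_from_registry("dynamic", quantization_args=args)
```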
{compressed-tensors-0.3.1 → compressed-tensors-0.3.2}/src/compressed_tensors/quantization/quant_args.py
RENAMED
@@ -53,6 +53,11 @@ class QuantizationArgs(BaseModel):
     :param group_size: group length to use for the group strategy
     :param block_structure: 2d block structure to use for the block strategy, must be
         of the format "2x4", "8x16", etc.
+    :param dynamic: set True to perform dynamic quantization - values will not be
+        calibrated during calibration phase, instead during inference new quantization
+        ranges will be observed with every sample. Defaults to False for static
+        quantization. Note that enabling dynamic quantization will change the default
+        observer to a memoryless one
     """
 
     num_bits: int = 8
@@ -61,6 +66,7 @@ class QuantizationArgs(BaseModel):
     strategy: QuantizationStrategy = QuantizationStrategy.TENSOR
     group_size: Optional[int] = None
     block_structure: Optional[str] = None
+    dynamic: bool = False
     observer: str = Field(
         default="minmax",
         description=(
@@ -82,4 +88,9 @@ class QuantizationArgs(BaseModel):
         """
         from compressed_tensors.quantization.observers.base import Observer
 
+        if self.observer == "minmax" and self.dynamic:
+            # override defualt observer for dynamic, you never want minmax which
+            # keeps state across samples for dynamic
+            self.observer = "memoryless"
+
         return Observer.load_from_registry(self.observer, quantization_args=self)
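Together with the alias above, this makes `dynamic=True` self-configuring: `get_observer()` swaps the default "minmax" observer for the stateless one before loading from the registry. A sketch of the observable effect:

```python
from compressed_tensors.quantization.quant_args import QuantizationArgs

static_args = QuantizationArgs(num_bits=8)
dynamic_args = QuantizationArgs(num_bits=8, dynamic=True)

observer = dynamic_args.get_observer()
print(static_args.observer)   # "minmax" (unchanged default)
print(dynamic_args.observer)  # "memoryless" (overridden for dynamic)
```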
{compressed-tensors-0.3.1 → compressed-tensors-0.3.2/src/compressed_tensors.egg-info}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: compressed-tensors
-Version: 0.3.1
+Version: 0.3.2
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
@@ -94,4 +94,7 @@ save_compressed_model(model, "compressed_model.safetensors", compression_format=
 state_dict = dict(load_compressed("compressed_model.safetensors", compression_config))
 ```
 
+For more in-depth tutorial on bitmask compression, refer to the [notebook](https://github.com/neuralmagic/compressed-tensors/blob/d707c5b84bc3fef164aebdcd97cb6eaa571982f8/examples/bitmask_compression.ipynb).
+
+
 
|