compressed-tensors 0.3.2__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. {compressed-tensors-0.3.2/src/compressed_tensors.egg-info → compressed-tensors-0.4.0}/PKG-INFO +41 -4
  2. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/README.md +42 -2
  3. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/setup.py +25 -4
  4. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/base.py +2 -1
  5. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/compressors/__init__.py +5 -1
  6. compressed-tensors-0.4.0/src/compressed_tensors/compressors/base.py +60 -0
  7. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/compressors/dense.py +4 -4
  8. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/compressors/helpers.py +12 -12
  9. compressed-tensors-0.4.0/src/compressed_tensors/compressors/int_quantized.py +126 -0
  10. compressed-tensors-0.4.0/src/compressed_tensors/compressors/marlin_24.py +250 -0
  11. compressed-tensors-0.4.0/src/compressed_tensors/compressors/model_compressor.py +315 -0
  12. compressed-tensors-0.4.0/src/compressed_tensors/compressors/pack_quantized.py +212 -0
  13. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/compressors/sparse_bitmask.py +4 -4
  14. compressed-tensors-0.4.0/src/compressed_tensors/compressors/utils/__init__.py +19 -0
  15. compressed-tensors-0.4.0/src/compressed_tensors/compressors/utils/helpers.py +43 -0
  16. compressed-tensors-0.4.0/src/compressed_tensors/compressors/utils/permutations_24.py +65 -0
  17. compressed-tensors-0.4.0/src/compressed_tensors/compressors/utils/semi_structured_conversions.py +341 -0
  18. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/config/base.py +7 -4
  19. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/config/dense.py +4 -4
  20. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/config/sparse_bitmask.py +3 -3
  21. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/quantization/lifecycle/__init__.py +1 -0
  22. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/quantization/lifecycle/apply.py +75 -19
  23. compressed-tensors-0.4.0/src/compressed_tensors/quantization/lifecycle/compressed.py +69 -0
  24. compressed-tensors-0.4.0/src/compressed_tensors/quantization/lifecycle/forward.py +328 -0
  25. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/quantization/lifecycle/frozen.py +4 -0
  26. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/quantization/lifecycle/initialize.py +33 -5
  27. compressed-tensors-0.4.0/src/compressed_tensors/quantization/observers/base.py +134 -0
  28. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/quantization/observers/helpers.py +6 -1
  29. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/quantization/observers/memoryless.py +17 -9
  30. compressed-tensors-0.4.0/src/compressed_tensors/quantization/observers/min_max.py +96 -0
  31. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/quantization/quant_args.py +33 -4
  32. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/quantization/quant_config.py +69 -21
  33. compressed-tensors-0.4.0/src/compressed_tensors/quantization/quant_scheme.py +119 -0
  34. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/quantization/utils/helpers.py +77 -8
  35. compressed-tensors-0.4.0/src/compressed_tensors/utils/helpers.py +63 -0
  36. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/utils/safetensors_load.py +3 -2
  37. compressed-tensors-0.4.0/src/compressed_tensors/version.py +53 -0
  38. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0/src/compressed_tensors.egg-info}/PKG-INFO +41 -4
  39. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors.egg-info/SOURCES.txt +12 -3
  40. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors.egg-info/requires.txt +5 -5
  41. compressed-tensors-0.3.2/src/compressed_tensors/compressors/base.py +0 -103
  42. compressed-tensors-0.3.2/src/compressed_tensors/quantization/lifecycle/forward.py +0 -142
  43. compressed-tensors-0.3.2/src/compressed_tensors/quantization/observers/base.py +0 -69
  44. compressed-tensors-0.3.2/src/compressed_tensors/quantization/observers/min_max.py +0 -65
  45. compressed-tensors-0.3.2/src/compressed_tensors/quantization/quant_scheme.py +0 -39
  46. compressed-tensors-0.3.2/tests/test_bitmask.py +0 -120
  47. compressed-tensors-0.3.2/tests/test_registry.py +0 -53
  48. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/LICENSE +0 -0
  49. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/pyproject.toml +0 -0
  50. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/setup.cfg +0 -0
  51. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/__init__.py +0 -0
  52. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/config/__init__.py +0 -0
  53. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/quantization/__init__.py +0 -0
  54. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/quantization/lifecycle/calibration.py +0 -0
  55. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/quantization/observers/__init__.py +0 -0
  56. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/quantization/utils/__init__.py +0 -0
  57. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/registry/__init__.py +0 -0
  58. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/registry/registry.py +0 -0
  59. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/utils/__init__.py +0 -0
  60. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors.egg-info/dependency_links.txt +0 -0
  61. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors.egg-info/top_level.txt +0 -0
{compressed-tensors-0.3.2/src/compressed_tensors.egg-info → compressed-tensors-0.4.0}/PKG-INFO
@@ -1,17 +1,16 @@
 Metadata-Version: 2.1
 Name: compressed-tensors
-Version: 0.3.2
+Version: 0.4.0
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
 Author-email: support@neuralmagic.com
 License: Apache 2.0
-Platform: UNKNOWN
 Description-Content-Type: text/markdown
 Provides-Extra: dev
 License-File: LICENSE
 
-# compressed-tensors
+# compressed_tensors
 
 This repository extends a [safetensors](https://github.com/huggingface/safetensors) format to efficiently store sparse and/or quantized tensors on disk. `compressed-tensors` format supports multiple compression types to minimize the disk space and facilitate the tensor manipulation.
 
@@ -81,7 +80,7 @@ from compressed_tensors import save_compressed_model, load_compressed, BitmaskCo
 from transformers import AutoModelForCausalLM
 
 model_name = "neuralmagic/llama2.c-stories110M-pruned50"
-model = AutoModelForCausalLM.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
 
 original_state_dict = model.state_dict()
 
@@ -97,4 +96,42 @@ state_dict = dict(load_compressed("compressed_model.safetensors", compression_co
 For more in-depth tutorial on bitmask compression, refer to the [notebook](https://github.com/neuralmagic/compressed-tensors/blob/d707c5b84bc3fef164aebdcd97cb6eaa571982f8/examples/bitmask_compression.ipynb).
 
 
+## Saving a Compressed Model with PTQ
+
+We can use compressed-tensors to run basic post training quantization (PTQ) and save the quantized model compressed on disk
+
+```python
+model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
+model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda:0", torch_dtype="auto")
+
+config = QuantizationConfig.parse_file("./examples/bit_packing/int4_config.json")
+config.quantization_status = QuantizationStatus.CALIBRATION
+apply_quantization_config(model, config)
+
+dataset = load_dataset("ptb_text_only")["train"]
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+def tokenize_function(examples):
+    return tokenizer(examples["sentence"], padding=False, truncation=True, max_length=1024)
+
+tokenized_dataset = dataset.map(tokenize_function, batched=True)
+data_loader = DataLoader(tokenized_dataset, batch_size=1, collate_fn=DefaultDataCollator())
+
+with torch.no_grad():
+    for idx, sample in tqdm(enumerate(data_loader), desc="Running calibration"):
+        sample = {key: value.to(device) for key,value in sample.items()}
+        _ = model(**sample)
+
+        if idx >= 512:
+            break
+
+model.apply(freeze_module_quantization)
+model.apply(compress_quantized_weights)
+
+output_dir = "./ex_llama1.1b_w4a16_packed_quantize"
+compressor = ModelCompressor(quantization_config=config)
+compressed_state_dict = compressor.compress(model)
+model.save_pretrained(output_dir, state_dict=compressed_state_dict)
+```
+
+For more in-depth tutorial on quantization compression, refer to the [notebook](./examples/quantize_and_pack_int4.ipynb).
{compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/README.md
@@ -1,4 +1,4 @@
-# compressed-tensors
+# compressed_tensors
 
 This repository extends a [safetensors](https://github.com/huggingface/safetensors) format to efficiently store sparse and/or quantized tensors on disk. `compressed-tensors` format supports multiple compression types to minimize the disk space and facilitate the tensor manipulation.
 
@@ -68,7 +68,7 @@ from compressed_tensors import save_compressed_model, load_compressed, BitmaskCo
 from transformers import AutoModelForCausalLM
 
 model_name = "neuralmagic/llama2.c-stories110M-pruned50"
-model = AutoModelForCausalLM.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
 
 original_state_dict = model.state_dict()
 
@@ -83,3 +83,43 @@ state_dict = dict(load_compressed("compressed_model.safetensors", compression_co
 For more in-depth tutorial on bitmask compression, refer to the [notebook](https://github.com/neuralmagic/compressed-tensors/blob/d707c5b84bc3fef164aebdcd97cb6eaa571982f8/examples/bitmask_compression.ipynb).
 
+
+## Saving a Compressed Model with PTQ
+
+We can use compressed-tensors to run basic post training quantization (PTQ) and save the quantized model compressed on disk
+
+```python
+model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
+model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda:0", torch_dtype="auto")
+
+config = QuantizationConfig.parse_file("./examples/bit_packing/int4_config.json")
+config.quantization_status = QuantizationStatus.CALIBRATION
+apply_quantization_config(model, config)
+
+dataset = load_dataset("ptb_text_only")["train"]
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+def tokenize_function(examples):
+    return tokenizer(examples["sentence"], padding=False, truncation=True, max_length=1024)
+
+tokenized_dataset = dataset.map(tokenize_function, batched=True)
+data_loader = DataLoader(tokenized_dataset, batch_size=1, collate_fn=DefaultDataCollator())
+
+with torch.no_grad():
+    for idx, sample in tqdm(enumerate(data_loader), desc="Running calibration"):
+        sample = {key: value.to(device) for key,value in sample.items()}
+        _ = model(**sample)
+
+        if idx >= 512:
+            break
+
+model.apply(freeze_module_quantization)
+model.apply(compress_quantized_weights)
+
+output_dir = "./ex_llama1.1b_w4a16_packed_quantize"
+compressor = ModelCompressor(quantization_config=config)
+compressed_state_dict = compressor.compress(model)
+model.save_pretrained(output_dir, state_dict=compressed_state_dict)
+```
+
+For more in-depth tutorial on quantization compression, refer to the [notebook](./examples/quantize_and_pack_int4.ipynb).
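The PTQ snippet added to the README above is abbreviated: it omits its imports and uses a `device` variable it never defines. Below is a hedged, self-contained version; the exact import paths for `apply_quantization_config`, `freeze_module_quantization`, and `compress_quantized_weights` are assumptions based on the lifecycle modules this release adds, not something the diff itself confirms.

```python
# Hedged, self-contained version of the README PTQ example. Import locations
# marked "assumed" are inferred from the new lifecycle modules in 0.4.0.
import torch
from compressed_tensors.compressors import ModelCompressor
from compressed_tensors.quantization import (  # assumed export locations
    QuantizationConfig,
    QuantizationStatus,
    apply_quantization_config,
    compress_quantized_weights,
    freeze_module_quantization,
)
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, DefaultDataCollator

device = "cuda:0"  # the README snippet references `device` without defining it
model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map=device, torch_dtype="auto"
)

# attach observers and quantization hooks in calibration mode
config = QuantizationConfig.parse_file("./examples/bit_packing/int4_config.json")
config.quantization_status = QuantizationStatus.CALIBRATION
apply_quantization_config(model, config)

dataset = load_dataset("ptb_text_only")["train"]
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(
        examples["sentence"], padding=False, truncation=True, max_length=1024
    )

tokenized_dataset = dataset.map(tokenize_function, batched=True)
data_loader = DataLoader(
    tokenized_dataset, batch_size=1, collate_fn=DefaultDataCollator()
)

# run ~512 samples through the model so the observers see real activations
with torch.no_grad():
    for idx, sample in tqdm(enumerate(data_loader), desc="Running calibration"):
        sample = {key: value.to(device) for key, value in sample.items()}
        _ = model(**sample)
        if idx >= 512:
            break

# lock in scales/zero-points, then convert weights to their quantized dtype
model.apply(freeze_module_quantization)
model.apply(compress_quantized_weights)

output_dir = "./ex_llama1.1b_w4a16_packed_quantize"
compressor = ModelCompressor(quantization_config=config)
compressed_state_dict = compressor.compress(model)
model.save_pretrained(output_dir, state_dict=compressed_state_dict)
```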
{compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/setup.py
@@ -12,9 +12,30 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
+import os
 from setuptools import setup, find_packages
 from typing import List, Dict, Tuple
+from utils.artifacts import get_release_and_version
+
+
+package_path = os.path.join(
+    os.path.dirname(os.path.realpath(__file__)), "src", "compressed_tensors"
+)
+(
+    is_release,
+    version,
+    version_major,
+    version_minor,
+    version_bug,
+) = get_release_and_version(package_path)
+
+version_nm_deps = f"{version_major}.{version_minor}.0"
+
+if is_release:
+    _PACKAGE_NAME = "compressed-tensors"
+else:
+    _PACKAGE_NAME = "compressed-tensors-nightly"
 
 
 def _setup_long_description() -> Tuple[str, str]:
     return open("README.md", "r", encoding="utf-8").read(), "text/markdown"
@@ -25,14 +46,14 @@ def _setup_packages() -> List:
     )
 
 def _setup_install_requires() -> List:
-    return ["torch>=1.7.0", "transformers<4.41", "pydantic<2.7"]
+    return ["torch>=1.7.0", "transformers", "pydantic>=2.0"]
 
 def _setup_extras() -> Dict:
     return {"dev": ["black==22.12.0", "isort==5.8.0", "wheel>=0.36.2", "flake8>=3.8.3", "pytest>=6.0.0", "nbconvert>=7.16.3"]}
 
 setup(
-    name="compressed-tensors",
-    version="0.3.2",
+    name=_PACKAGE_NAME,
+    version=version,
     author="Neuralmagic, Inc.",
     author_email="support@neuralmagic.com",
     license="Apache 2.0",
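setup.py now derives the version and package name from a build helper, `utils.artifacts.get_release_and_version`, which is not included in this diff; releases publish as `compressed-tensors` and other builds as `compressed-tensors-nightly`. A hypothetical sketch of such a helper is below, assuming it parses the `src/compressed_tensors/version.py` file added in 0.4.0; the `version` and `is_release` attribute names inside that file are assumptions.

```python
# Hypothetical sketch of utils/artifacts.get_release_and_version; the real
# helper is not part of this diff. Assumes version.py defines a string
# `version` and a boolean `is_release` (assumed names).
import re
from typing import Tuple


def get_release_and_version(package_path: str) -> Tuple[bool, str, str, str, str]:
    # read version.py as text so setup.py works before the package is installed
    with open(f"{package_path}/version.py", encoding="utf-8") as f:
        contents = f.read()

    version = re.search(r'version\s*=\s*"([^"]+)"', contents).group(1)
    is_release = re.search(r"is_release\s*=\s*True", contents) is not None
    major, minor, bug = version.split(".")[:3]
    return is_release, version, major, minor, bug
```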
{compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/base.py
@@ -13,4 +13,5 @@
 # limitations under the License.
 
 SPARSITY_CONFIG_NAME = "sparsity_config"
-QUANTIZATION_CONFIG_NAME = "sparseml_quantization_config"
+QUANTIZATION_CONFIG_NAME = "quantization_config"
+COMPRESSION_CONFIG_NAME = "compression_config"
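These constants name the keys under which compression metadata is serialized into a model's `config.json`, so the rename from `sparseml_quantization_config` to `quantization_config` changes the on-disk format. A hedged illustration of reading them back follows; whether the quantization entry sits at the top level or nested under `compression_config` is an assumption here, since this hunk only shows the key names.

```python
# Hedged illustration: look up compression metadata in a saved model's
# config.json using the renamed constants. The exact nesting is assumed.
import json

from compressed_tensors.base import (
    COMPRESSION_CONFIG_NAME,   # "compression_config"
    QUANTIZATION_CONFIG_NAME,  # "quantization_config"
    SPARSITY_CONFIG_NAME,      # "sparsity_config"
)

with open("./ex_llama1.1b_w4a16_packed_quantize/config.json") as f:
    hf_config = json.load(f)

# look for quantization metadata directly, then under the compression key
quant_cfg = hf_config.get(QUANTIZATION_CONFIG_NAME) or hf_config.get(
    COMPRESSION_CONFIG_NAME, {}
).get(QUANTIZATION_CONFIG_NAME)
sparsity_cfg = hf_config.get(COMPRESSION_CONFIG_NAME, {}).get(SPARSITY_CONFIG_NAME)
```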
{compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/compressors/__init__.py
@@ -14,7 +14,11 @@
 
 # flake8: noqa
 
-from .base import ModelCompressor
+from .base import Compressor
 from .dense import DenseCompressor
 from .helpers import load_compressed, save_compressed, save_compressed_model
+from .int_quantized import IntQuantizationCompressor
+from .marlin_24 import Marlin24Compressor
+from .model_compressor import ModelCompressor, map_modules_to_quant_args
+from .pack_quantized import PackedQuantizationCompressor
 from .sparse_bitmask import BitmaskCompressor, BitmaskTensor
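This import list doubles as the registration point: importing `compressed_tensors.compressors` executes each `@Compressor.register(...)` decorator, after which a compressor can be constructed from a format string. A minimal sketch using the `dense` format, which is registered in `dense.py` below:

```python
# Minimal sketch of the registry pattern these imports enable.
import torch
from compressed_tensors.compressors import Compressor
from compressed_tensors.config import CompressionFormat

# "dense" resolves to DenseCompressor, registered via @Compressor.register
compressor = Compressor.load_from_registry(CompressionFormat.dense.value)
state_dict = {"layer.weight": torch.ones(8, 8)}
assert compressor.compress(state_dict) is state_dict  # identity compressor
```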
compressed-tensors-0.4.0/src/compressed_tensors/compressors/base.py (new file)
@@ -0,0 +1,60 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, Generator, Tuple, Union
+
+from compressed_tensors.config import SparsityCompressionConfig
+from compressed_tensors.quantization import QuantizationConfig
+from compressed_tensors.registry import RegistryMixin
+from torch import Tensor
+
+
+__all__ = ["Compressor"]
+
+
+class Compressor(RegistryMixin):
+    """
+    Base class representing a model compression algorithm
+
+    :param config: config specifying compression parameters
+    """
+
+    def __init__(
+        self, config: Union[SparsityCompressionConfig, QuantizationConfig, None] = None
+    ):
+        self.config = config
+
+    def compress(self, model_state: Dict[str, Tensor], **kwargs) -> Dict[str, Tensor]:
+        """
+        Compresses a dense state dict
+
+        :param model_state: state dict of uncompressed model
+        :return: compressed state dict
+        """
+        raise NotImplementedError()
+
+    def decompress(
+        self, path_to_model_or_tensors: str, device: str = "cpu"
+    ) -> Generator[Tuple[str, Tensor], None, None]:
+        """
+        Reads a compressed state dict located at path_to_model_or_tensors
+        and returns a generator for sequentially decompressing back to a
+        dense state dict
+
+        :param path_to_model_or_tensors: path to compressed safetensors model
+            (directory with one or more safetensors files) or compressed tensors file
+        :param device: optional device to load intermediate weights into
+        :return: generator of decompressed (weight name, tensor) pairs
+        """
+        raise NotImplementedError()
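`DenseCompressor` in the next hunk is the smallest concrete implementation of this interface. For a custom format, a subclass registers itself by name and implements both hooks; a hedged sketch follows, where "my-format" is a hypothetical registry name and the streaming pattern is borrowed from `load_compressed` in `helpers.py`:

```python
# Hedged sketch of a custom Compressor subclass; "my-format" is hypothetical.
from typing import Dict, Generator, Tuple

from compressed_tensors.compressors import Compressor
from compressed_tensors.utils.safetensors_load import get_weight_mappings
from safetensors import safe_open
from torch import Tensor


@Compressor.register(name="my-format")
class PassthroughCompressor(Compressor):
    """Stores tensors unmodified; exists only to show the two hooks."""

    def compress(self, model_state: Dict[str, Tensor], **kwargs) -> Dict[str, Tensor]:
        # a real compressor would re-encode tensors here
        return dict(model_state)

    def decompress(
        self, path_to_model_or_tensors: str, device: str = "cpu"
    ) -> Generator[Tuple[str, Tensor], None, None]:
        # stream tensors back out one at a time, as the base contract requires
        weight_mappings = get_weight_mappings(path_to_model_or_tensors)
        for weight_name, safe_path in weight_mappings.items():
            with safe_open(safe_path, framework="pt", device=device) as f:
                yield weight_name, f.get_tensor(weight_name)
```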
{compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/compressors/dense.py
@@ -14,18 +14,18 @@
 
 from typing import Dict, Generator, Tuple
 
-from compressed_tensors.compressors import ModelCompressor
+from compressed_tensors.compressors import Compressor
 from compressed_tensors.config import CompressionFormat
 from torch import Tensor
 
 
-@ModelCompressor.register(name=CompressionFormat.dense_sparsity.value)
-class DenseCompressor(ModelCompressor):
+@Compressor.register(name=CompressionFormat.dense.value)
+class DenseCompressor(Compressor):
     """
     Identity compressor for dense models, returns the original state_dict
     """
 
-    def compress(self, model_state: Dict[str, Tensor]) -> Dict[str, Tensor]:
+    def compress(self, model_state: Dict[str, Tensor], **kwargs) -> Dict[str, Tensor]:
         return model_state
 
     def decompress(
{compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/compressors/helpers.py
@@ -16,8 +16,8 @@ from pathlib import Path
 from typing import Dict, Generator, Optional, Tuple, Union
 
 import torch
-from compressed_tensors.compressors import ModelCompressor
-from compressed_tensors.config import CompressionConfig, CompressionFormat
+from compressed_tensors.compressors import Compressor
+from compressed_tensors.config import CompressionFormat, SparsityCompressionConfig
 from compressed_tensors.utils.safetensors_load import get_weight_mappings
 from safetensors import safe_open
 from safetensors.torch import save_file
@@ -48,20 +48,20 @@ def save_compressed(
     if tensors is None or len(tensors) == 0:
         raise ValueError("No tensors or empty tensors provided to compress")
 
-    # if no compression_format specified, default to `dense_sparsity`
-    compression_format = compression_format or CompressionFormat.dense_sparsity.value
+    # if no compression_format specified, default to `dense`
+    compression_format = compression_format or CompressionFormat.dense.value
 
     if not (
-        compression_format in ModelCompressor.registered_names()
-        or compression_format in ModelCompressor.registered_aliases()
+        compression_format in Compressor.registered_names()
+        or compression_format in Compressor.registered_aliases()
     ):
         raise ValueError(
             f"Unknown compression format: {compression_format}. "
-            f"Must be one of {set(ModelCompressor.registered_names() + ModelCompressor.registered_aliases())}"  # noqa E501
+            f"Must be one of {set(Compressor.registered_names() + Compressor.registered_aliases())}"  # noqa E501
         )
 
     # compress
-    compressor = ModelCompressor.load_from_registry(compression_format)
+    compressor = Compressor.load_from_registry(compression_format)
     # save compressed tensors
     compressed_tensors = compressor.compress(tensors)
     save_file(compressed_tensors, save_path)
@@ -69,7 +69,7 @@ def save_compressed(
 
 def load_compressed(
     compressed_tensors: Union[str, Path],
-    compression_config: CompressionConfig = None,
+    compression_config: SparsityCompressionConfig = None,
     device: Optional[str] = "cpu",
 ) -> Generator[Tuple[str, Tensor], None, None]:
     """
@@ -90,9 +90,9 @@ def load_compressed(
 
     if (
         compression_config is None
-        or compression_config.format == CompressionFormat.dense_sparsity.value
+        or compression_config.format == CompressionFormat.dense.value
     ):
-        # if no compression_config specified, or `dense_sparsity` format specified,
+        # if no compression_config specified, or `dense` format specified,
         # assume tensors are not compressed on disk
         weight_mappings = get_weight_mappings(compressed_tensors)
         for weight_name, file_with_weight_name in weight_mappings.items():
@@ -102,7 +102,7 @@
     else:
         # decompress tensors
        compression_format = compression_config.format
-        compressor = ModelCompressor.load_from_registry(
+        compressor = Compressor.load_from_registry(
             compression_format, config=compression_config
         )
         yield from compressor.decompress(compressed_tensors, device=device)
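Taken together, `save_compressed` and `load_compressed` give a round trip through the renamed default `dense` format. A short sketch follows; the `save_compressed` argument order (tensors, then save path) is assumed from this hunk, which validates `tensors` and ends in `save_file(compressed_tensors, save_path)`:

```python
# Round-trip sketch through the default `dense` format.
import torch
from compressed_tensors.compressors import load_compressed, save_compressed

tensors = {"embedding.weight": torch.randn(16, 8)}

# no compression_format given -> defaults to CompressionFormat.dense.value
save_compressed(tensors, "model.safetensors")

# no compression_config given -> tensors are read back as-is from disk
restored = dict(load_compressed("model.safetensors"))
assert torch.equal(tensors["embedding.weight"], restored["embedding.weight"])
```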
compressed-tensors-0.4.0/src/compressed_tensors/compressors/int_quantized.py (new file)
@@ -0,0 +1,126 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from typing import Dict, Generator, Tuple
+
+import torch
+from compressed_tensors.compressors import Compressor
+from compressed_tensors.config import CompressionFormat
+from compressed_tensors.quantization import QuantizationArgs
+from compressed_tensors.quantization.lifecycle.forward import dequantize, quantize
+from compressed_tensors.quantization.utils import can_quantize
+from compressed_tensors.utils import get_nested_weight_mappings, merge_names
+from safetensors import safe_open
+from torch import Tensor
+from tqdm import tqdm
+
+
+__all__ = ["IntQuantizationCompressor"]
+
+_LOGGER: logging.Logger = logging.getLogger(__name__)
+
+
+@Compressor.register(name=CompressionFormat.int_quantized.value)
+class IntQuantizationCompressor(Compressor):
+    """
+    Integer compression for quantized models. Weight of each quantized layer is
+    converted from its original float type to the format specified by the layer's
+    quantization scheme.
+    """
+
+    COMPRESSION_PARAM_NAMES = ["weight", "weight_scale", "weight_zero_point"]
+
+    def compress(
+        self,
+        model_state: Dict[str, Tensor],
+        model_quant_args: Dict[str, QuantizationArgs],
+        **kwargs,
+    ) -> Dict[str, Tensor]:
+        """
+        Compresses a dense state dict
+
+        :param model_state: state dict of uncompressed model
+        :param model_quant_args: quantization args for each quantized weight, needed
+            for quantize function to calculate bit depth
+        :return: compressed state dict
+        """
+        compressed_dict = {}
+        weight_suffix = ".weight"
+        _LOGGER.debug(
+            f"Compressing model with {len(model_state)} parameterized layers..."
+        )
+
+        for name, value in tqdm(model_state.items(), desc="Compressing model"):
+            if name.endswith(weight_suffix):
+                prefix = name[: -(len(weight_suffix))]
+                scale = model_state.get(merge_names(prefix, "weight_scale"), None)
+                zp = model_state.get(merge_names(prefix, "weight_zero_point"), None)
+                if scale is not None and zp is not None:
+                    # weight is quantized, compress it
+                    quant_args = model_quant_args[prefix]
+                    if can_quantize(value, quant_args):
+                        # only quantize if not already quantized
+                        value = quantize(
+                            x=value,
+                            scale=scale,
+                            zero_point=zp,
+                            args=quant_args,
+                            dtype=torch.int8,
+                        )
+            elif name.endswith("zero_point"):
+                if torch.all(value == 0):
+                    # all zero_points are 0, no need to include in
+                    # compressed state_dict
+                    continue
+            compressed_dict[name] = value.to("cpu")
+
+        return compressed_dict
+
+    def decompress(
+        self, path_to_model_or_tensors: str, device: str = "cpu"
+    ) -> Generator[Tuple[str, Tensor], None, None]:
+        """
+        Reads a compressed state dict located at path_to_model_or_tensors
+        and returns a generator for sequentially decompressing back to a
+        dense state dict
+
+        :param path_to_model_or_tensors: path to compressed safetensors model
+            (directory with one or more safetensors files) or compressed tensors file
+        :param device: optional device to load intermediate weights into
+        :return: generator of decompressed (weight name, tensor) pairs
+        """
+        weight_mappings = get_nested_weight_mappings(
+            path_to_model_or_tensors, self.COMPRESSION_PARAM_NAMES
+        )
+        for weight_name in weight_mappings.keys():
+            weight_data = {}
+            for param_name, safe_path in weight_mappings[weight_name].items():
+                full_name = merge_names(weight_name, param_name)
+                with safe_open(safe_path, framework="pt", device=device) as f:
+                    weight_data[param_name] = f.get_tensor(full_name)
+
+            if "weight_scale" in weight_data:
+                zero_point = weight_data.get("weight_zero_point", None)
+                scale = weight_data["weight_scale"]
+                if zero_point is None:
+                    # zero_point assumed to be 0 if not included in state_dict
+                    zero_point = torch.zeros_like(scale)
+
+                decompressed = dequantize(
+                    x_q=weight_data["weight"],
+                    scale=scale,
+                    zero_point=zero_point,
+                )
+                yield merge_names(weight_name, "weight"), decompressed
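This compressor leans entirely on the `quantize`/`dequantize` pair from `quantization/lifecycle/forward.py`, which this release adds but this hunk does not show. Below is a self-contained sketch of the affine quantization math they are assumed to implement, mirroring the round trip that `compress` and `decompress` perform:

```python
# The round trip IntQuantizationCompressor performs, in miniature. The real
# quantize()/dequantize() live in quantization/lifecycle/forward.py; standard
# affine quantization is assumed here.
import torch


def quantize(x: torch.Tensor, scale, zero_point, dtype=torch.int8) -> torch.Tensor:
    # q = clamp(round(x / scale) + zero_point, qmin, qmax)
    info = torch.iinfo(dtype)
    q = torch.round(x / scale) + zero_point
    return torch.clamp(q, info.min, info.max).to(dtype)


def dequantize(x_q: torch.Tensor, scale, zero_point) -> torch.Tensor:
    # x ~= (q - zero_point) * scale
    return (x_q.to(torch.float32) - zero_point) * scale


weight = torch.randn(4, 4)
scale = weight.abs().max() / 127.0   # per-tensor symmetric scale
zero_point = torch.zeros(1)          # compress() drops all-zero zero_points

w_q = quantize(weight, scale, zero_point)    # what compress() stores as int8
w_hat = dequantize(w_q, scale, zero_point)   # what decompress() yields
assert torch.allclose(weight, w_hat, atol=float(scale))
```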