compressed-tensors 0.3.2__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. {compressed-tensors-0.3.2/src/compressed_tensors.egg-info → compressed-tensors-0.4.0}/PKG-INFO +41 -4
  2. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/README.md +42 -2
  3. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/setup.py +25 -4
  4. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/base.py +2 -1
  5. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/compressors/__init__.py +5 -1
  6. compressed-tensors-0.4.0/src/compressed_tensors/compressors/base.py +60 -0
  7. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/compressors/dense.py +4 -4
  8. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/compressors/helpers.py +12 -12
  9. compressed-tensors-0.4.0/src/compressed_tensors/compressors/int_quantized.py +126 -0
  10. compressed-tensors-0.4.0/src/compressed_tensors/compressors/marlin_24.py +250 -0
  11. compressed-tensors-0.4.0/src/compressed_tensors/compressors/model_compressor.py +315 -0
  12. compressed-tensors-0.4.0/src/compressed_tensors/compressors/pack_quantized.py +212 -0
  13. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/compressors/sparse_bitmask.py +4 -4
  14. compressed-tensors-0.4.0/src/compressed_tensors/compressors/utils/__init__.py +19 -0
  15. compressed-tensors-0.4.0/src/compressed_tensors/compressors/utils/helpers.py +43 -0
  16. compressed-tensors-0.4.0/src/compressed_tensors/compressors/utils/permutations_24.py +65 -0
  17. compressed-tensors-0.4.0/src/compressed_tensors/compressors/utils/semi_structured_conversions.py +341 -0
  18. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/config/base.py +7 -4
  19. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/config/dense.py +4 -4
  20. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/config/sparse_bitmask.py +3 -3
  21. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/quantization/lifecycle/__init__.py +1 -0
  22. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/quantization/lifecycle/apply.py +75 -19
  23. compressed-tensors-0.4.0/src/compressed_tensors/quantization/lifecycle/compressed.py +69 -0
  24. compressed-tensors-0.4.0/src/compressed_tensors/quantization/lifecycle/forward.py +328 -0
  25. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/quantization/lifecycle/frozen.py +4 -0
  26. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/quantization/lifecycle/initialize.py +33 -5
  27. compressed-tensors-0.4.0/src/compressed_tensors/quantization/observers/base.py +134 -0
  28. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/quantization/observers/helpers.py +6 -1
  29. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/quantization/observers/memoryless.py +17 -9
  30. compressed-tensors-0.4.0/src/compressed_tensors/quantization/observers/min_max.py +96 -0
  31. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/quantization/quant_args.py +33 -4
  32. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/quantization/quant_config.py +69 -21
  33. compressed-tensors-0.4.0/src/compressed_tensors/quantization/quant_scheme.py +119 -0
  34. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/quantization/utils/helpers.py +77 -8
  35. compressed-tensors-0.4.0/src/compressed_tensors/utils/helpers.py +63 -0
  36. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/utils/safetensors_load.py +3 -2
  37. compressed-tensors-0.4.0/src/compressed_tensors/version.py +53 -0
  38. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0/src/compressed_tensors.egg-info}/PKG-INFO +41 -4
  39. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors.egg-info/SOURCES.txt +12 -3
  40. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors.egg-info/requires.txt +5 -5
  41. compressed-tensors-0.3.2/src/compressed_tensors/compressors/base.py +0 -103
  42. compressed-tensors-0.3.2/src/compressed_tensors/quantization/lifecycle/forward.py +0 -142
  43. compressed-tensors-0.3.2/src/compressed_tensors/quantization/observers/base.py +0 -69
  44. compressed-tensors-0.3.2/src/compressed_tensors/quantization/observers/min_max.py +0 -65
  45. compressed-tensors-0.3.2/src/compressed_tensors/quantization/quant_scheme.py +0 -39
  46. compressed-tensors-0.3.2/tests/test_bitmask.py +0 -120
  47. compressed-tensors-0.3.2/tests/test_registry.py +0 -53
  48. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/LICENSE +0 -0
  49. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/pyproject.toml +0 -0
  50. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/setup.cfg +0 -0
  51. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/__init__.py +0 -0
  52. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/config/__init__.py +0 -0
  53. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/quantization/__init__.py +0 -0
  54. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/quantization/lifecycle/calibration.py +0 -0
  55. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/quantization/observers/__init__.py +0 -0
  56. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/quantization/utils/__init__.py +0 -0
  57. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/registry/__init__.py +0 -0
  58. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/registry/registry.py +0 -0
  59. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/utils/__init__.py +0 -0
  60. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors.egg-info/dependency_links.txt +0 -0
  61. {compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors.egg-info/top_level.txt +0 -0
{compressed-tensors-0.3.2/src/compressed_tensors.egg-info → compressed-tensors-0.4.0}/PKG-INFO
@@ -1,17 +1,16 @@
 Metadata-Version: 2.1
 Name: compressed-tensors
-Version: 0.3.2
+Version: 0.4.0
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
 Author-email: support@neuralmagic.com
 License: Apache 2.0
-Platform: UNKNOWN
 Description-Content-Type: text/markdown
 Provides-Extra: dev
 License-File: LICENSE
 
-# compressed-tensors
+# compressed_tensors
 
 This repository extends a [safetensors](https://github.com/huggingface/safetensors) format to efficiently store sparse and/or quantized tensors on disk. `compressed-tensors` format supports multiple compression types to minimize the disk space and facilitate the tensor manipulation.
 
@@ -81,7 +80,7 @@ from compressed_tensors import save_compressed_model, load_compressed, BitmaskCo
 from transformers import AutoModelForCausalLM
 
 model_name = "neuralmagic/llama2.c-stories110M-pruned50"
-model = AutoModelForCausalLM.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
 
 original_state_dict = model.state_dict()
 
@@ -97,4 +96,42 @@ state_dict = dict(load_compressed("compressed_model.safetensors", compression_co
 For more in-depth tutorial on bitmask compression, refer to the [notebook](https://github.com/neuralmagic/compressed-tensors/blob/d707c5b84bc3fef164aebdcd97cb6eaa571982f8/examples/bitmask_compression.ipynb).
 
 
+## Saving a Compressed Model with PTQ
+
+We can use compressed-tensors to run basic post training quantization (PTQ) and save the quantized model compressed on disk
+
+```python
+model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
+model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda:0", torch_dtype="auto")
+
+config = QuantizationConfig.parse_file("./examples/bit_packing/int4_config.json")
+config.quantization_status = QuantizationStatus.CALIBRATION
+apply_quantization_config(model, config)
+
+dataset = load_dataset("ptb_text_only")["train"]
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+def tokenize_function(examples):
+    return tokenizer(examples["sentence"], padding=False, truncation=True, max_length=1024)
+
+tokenized_dataset = dataset.map(tokenize_function, batched=True)
+data_loader = DataLoader(tokenized_dataset, batch_size=1, collate_fn=DefaultDataCollator())
+
+with torch.no_grad():
+    for idx, sample in tqdm(enumerate(data_loader), desc="Running calibration"):
+        sample = {key: value.to(device) for key,value in sample.items()}
+        _ = model(**sample)
+
+        if idx >= 512:
+            break
+
+model.apply(freeze_module_quantization)
+model.apply(compress_quantized_weights)
+
+output_dir = "./ex_llama1.1b_w4a16_packed_quantize"
+compressor = ModelCompressor(quantization_config=config)
+compressed_state_dict = compressor.compress(model)
+model.save_pretrained(output_dir, state_dict=compressed_state_dict)
+```
+
+For more in-depth tutorial on quantization compression, refer to the [notebook](./examples/quantize_and_pack_int4.ipynb).
{compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/README.md
@@ -1,4 +1,4 @@
-# compressed-tensors
+# compressed_tensors
 
 This repository extends a [safetensors](https://github.com/huggingface/safetensors) format to efficiently store sparse and/or quantized tensors on disk. `compressed-tensors` format supports multiple compression types to minimize the disk space and facilitate the tensor manipulation.
 
@@ -68,7 +68,7 @@ from compressed_tensors import save_compressed_model, load_compressed, BitmaskCo
 from transformers import AutoModelForCausalLM
 
 model_name = "neuralmagic/llama2.c-stories110M-pruned50"
-model = AutoModelForCausalLM.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
 
 original_state_dict = model.state_dict()
 
@@ -83,3 +83,43 @@ state_dict = dict(load_compressed("compressed_model.safetensors", compression_co
 For more in-depth tutorial on bitmask compression, refer to the [notebook](https://github.com/neuralmagic/compressed-tensors/blob/d707c5b84bc3fef164aebdcd97cb6eaa571982f8/examples/bitmask_compression.ipynb).
 
+
+## Saving a Compressed Model with PTQ
+
+We can use compressed-tensors to run basic post training quantization (PTQ) and save the quantized model compressed on disk
+
+```python
+model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
+model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda:0", torch_dtype="auto")
+
+config = QuantizationConfig.parse_file("./examples/bit_packing/int4_config.json")
+config.quantization_status = QuantizationStatus.CALIBRATION
+apply_quantization_config(model, config)
+
+dataset = load_dataset("ptb_text_only")["train"]
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+def tokenize_function(examples):
+    return tokenizer(examples["sentence"], padding=False, truncation=True, max_length=1024)
+
+tokenized_dataset = dataset.map(tokenize_function, batched=True)
+data_loader = DataLoader(tokenized_dataset, batch_size=1, collate_fn=DefaultDataCollator())
+
+with torch.no_grad():
+    for idx, sample in tqdm(enumerate(data_loader), desc="Running calibration"):
+        sample = {key: value.to(device) for key,value in sample.items()}
+        _ = model(**sample)
+
+        if idx >= 512:
+            break
+
+model.apply(freeze_module_quantization)
+model.apply(compress_quantized_weights)
+
+output_dir = "./ex_llama1.1b_w4a16_packed_quantize"
+compressor = ModelCompressor(quantization_config=config)
+compressed_state_dict = compressor.compress(model)
+model.save_pretrained(output_dir, state_dict=compressed_state_dict)
+```
+
+For more in-depth tutorial on quantization compression, refer to the [notebook](./examples/quantize_and_pack_int4.ipynb).
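The PTQ snippet added to the README above is abbreviated: it omits its imports and uses a `device` variable it never defines. Below is a hedged, self-contained version; the exact import paths for `apply_quantization_config`, `freeze_module_quantization`, and `compress_quantized_weights` are assumptions based on the lifecycle modules this release adds, not something the diff itself confirms.

```python
# Hedged, self-contained version of the README PTQ example. Import locations
# marked "assumed" are inferred from the new lifecycle modules in 0.4.0.
import torch
from compressed_tensors.compressors import ModelCompressor
from compressed_tensors.quantization import (  # assumed export locations
    QuantizationConfig,
    QuantizationStatus,
    apply_quantization_config,
    compress_quantized_weights,
    freeze_module_quantization,
)
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, DefaultDataCollator

device = "cuda:0"  # the README snippet references `device` without defining it
model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map=device, torch_dtype="auto"
)

# attach observers and quantization hooks in calibration mode
config = QuantizationConfig.parse_file("./examples/bit_packing/int4_config.json")
config.quantization_status = QuantizationStatus.CALIBRATION
apply_quantization_config(model, config)

dataset = load_dataset("ptb_text_only")["train"]
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(
        examples["sentence"], padding=False, truncation=True, max_length=1024
    )

tokenized_dataset = dataset.map(tokenize_function, batched=True)
data_loader = DataLoader(
    tokenized_dataset, batch_size=1, collate_fn=DefaultDataCollator()
)

# run ~512 samples through the model so the observers see real activations
with torch.no_grad():
    for idx, sample in tqdm(enumerate(data_loader), desc="Running calibration"):
        sample = {key: value.to(device) for key, value in sample.items()}
        _ = model(**sample)
        if idx >= 512:
            break

# lock in scales/zero-points, then convert weights to their quantized dtype
model.apply(freeze_module_quantization)
model.apply(compress_quantized_weights)

output_dir = "./ex_llama1.1b_w4a16_packed_quantize"
compressor = ModelCompressor(quantization_config=config)
compressed_state_dict = compressor.compress(model)
model.save_pretrained(output_dir, state_dict=compressed_state_dict)
```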
{compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/setup.py
@@ -12,9 +12,30 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
+import os
 from setuptools import setup, find_packages
 from typing import List, Dict, Tuple
+from utils.artifacts import get_release_and_version
+
+
+package_path = os.path.join(
+    os.path.dirname(os.path.realpath(__file__)), "src", "compressed_tensors"
+)
+(
+    is_release,
+    version,
+    version_major,
+    version_minor,
+    version_bug,
+) = get_release_and_version(package_path)
+
+version_nm_deps = f"{version_major}.{version_minor}.0"
+
+if is_release:
+    _PACKAGE_NAME = "compressed-tensors"
+else:
+    _PACKAGE_NAME = "compressed-tensors-nightly"
 
 
 def _setup_long_description() -> Tuple[str, str]:
     return open("README.md", "r", encoding="utf-8").read(), "text/markdown"
@@ -25,14 +46,14 @@ def _setup_packages() -> List:
     )
 
 def _setup_install_requires() -> List:
-    return ["torch>=1.7.0", "transformers<4.41", "pydantic<2.7"]
+    return ["torch>=1.7.0", "transformers", "pydantic>=2.0"]
 
 def _setup_extras() -> Dict:
     return {"dev": ["black==22.12.0", "isort==5.8.0", "wheel>=0.36.2", "flake8>=3.8.3", "pytest>=6.0.0", "nbconvert>=7.16.3"]}
 
 setup(
-    name="compressed-tensors",
-    version="0.3.2",
+    name=_PACKAGE_NAME,
+    version=version,
     author="Neuralmagic, Inc.",
     author_email="support@neuralmagic.com",
     license="Apache 2.0",
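setup.py now derives the version and package name from a build helper, `utils.artifacts.get_release_and_version`, which is not included in this diff; releases publish as `compressed-tensors` and other builds as `compressed-tensors-nightly`. A hypothetical sketch of such a helper is below, assuming it parses the `src/compressed_tensors/version.py` file added in 0.4.0; the `version` and `is_release` attribute names inside that file are assumptions.

```python
# Hypothetical sketch of utils/artifacts.get_release_and_version; the real
# helper is not part of this diff. Assumes version.py defines a string
# `version` and a boolean `is_release` (assumed names).
import re
from typing import Tuple


def get_release_and_version(package_path: str) -> Tuple[bool, str, str, str, str]:
    # read version.py as text so setup.py works before the package is installed
    with open(f"{package_path}/version.py", encoding="utf-8") as f:
        contents = f.read()

    version = re.search(r'version\s*=\s*"([^"]+)"', contents).group(1)
    is_release = re.search(r"is_release\s*=\s*True", contents) is not None
    major, minor, bug = version.split(".")[:3]
    return is_release, version, major, minor, bug
```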
{compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/base.py
@@ -13,4 +13,5 @@
 # limitations under the License.
 
 SPARSITY_CONFIG_NAME = "sparsity_config"
-QUANTIZATION_CONFIG_NAME = "sparseml_quantization_config"
+QUANTIZATION_CONFIG_NAME = "quantization_config"
+COMPRESSION_CONFIG_NAME = "compression_config"
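These constants name the keys under which compression metadata is serialized into a model's `config.json`, so the rename from `sparseml_quantization_config` to `quantization_config` changes the on-disk format. A hedged illustration of reading them back follows; whether the quantization entry sits at the top level or nested under `compression_config` is an assumption here, since this hunk only shows the key names.

```python
# Hedged illustration: look up compression metadata in a saved model's
# config.json using the renamed constants. The exact nesting is assumed.
import json

from compressed_tensors.base import (
    COMPRESSION_CONFIG_NAME,   # "compression_config"
    QUANTIZATION_CONFIG_NAME,  # "quantization_config"
    SPARSITY_CONFIG_NAME,      # "sparsity_config"
)

with open("./ex_llama1.1b_w4a16_packed_quantize/config.json") as f:
    hf_config = json.load(f)

# look for quantization metadata directly, then under the compression key
quant_cfg = hf_config.get(QUANTIZATION_CONFIG_NAME) or hf_config.get(
    COMPRESSION_CONFIG_NAME, {}
).get(QUANTIZATION_CONFIG_NAME)
sparsity_cfg = hf_config.get(COMPRESSION_CONFIG_NAME, {}).get(SPARSITY_CONFIG_NAME)
```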
{compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/compressors/__init__.py
@@ -14,7 +14,11 @@
 
 # flake8: noqa
 
-from .base import ModelCompressor
+from .base import Compressor
 from .dense import DenseCompressor
 from .helpers import load_compressed, save_compressed, save_compressed_model
+from .int_quantized import IntQuantizationCompressor
+from .marlin_24 import Marlin24Compressor
+from .model_compressor import ModelCompressor, map_modules_to_quant_args
+from .pack_quantized import PackedQuantizationCompressor
 from .sparse_bitmask import BitmaskCompressor, BitmaskTensor
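This import list doubles as the registration point: importing `compressed_tensors.compressors` executes each `@Compressor.register(...)` decorator, after which a compressor can be constructed from a format string. A minimal sketch using the `dense` format, which is registered in `dense.py` below:

```python
# Minimal sketch of the registry pattern these imports enable.
import torch
from compressed_tensors.compressors import Compressor
from compressed_tensors.config import CompressionFormat

# "dense" resolves to DenseCompressor, registered via @Compressor.register
compressor = Compressor.load_from_registry(CompressionFormat.dense.value)
state_dict = {"layer.weight": torch.ones(8, 8)}
assert compressor.compress(state_dict) is state_dict  # identity compressor
```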
compressed-tensors-0.4.0/src/compressed_tensors/compressors/base.py (new file)
@@ -0,0 +1,60 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, Generator, Tuple, Union
+
+from compressed_tensors.config import SparsityCompressionConfig
+from compressed_tensors.quantization import QuantizationConfig
+from compressed_tensors.registry import RegistryMixin
+from torch import Tensor
+
+
+__all__ = ["Compressor"]
+
+
+class Compressor(RegistryMixin):
+    """
+    Base class representing a model compression algorithm
+
+    :param config: config specifying compression parameters
+    """
+
+    def __init__(
+        self, config: Union[SparsityCompressionConfig, QuantizationConfig, None] = None
+    ):
+        self.config = config
+
+    def compress(self, model_state: Dict[str, Tensor], **kwargs) -> Dict[str, Tensor]:
+        """
+        Compresses a dense state dict
+
+        :param model_state: state dict of uncompressed model
+        :return: compressed state dict
+        """
+        raise NotImplementedError()
+
+    def decompress(
+        self, path_to_model_or_tensors: str, device: str = "cpu"
+    ) -> Generator[Tuple[str, Tensor], None, None]:
+        """
+        Reads a compressed state dict located at path_to_model_or_tensors
+        and returns a generator for sequentially decompressing back to a
+        dense state dict
+
+        :param path_to_model_or_tensors: path to compressed safetensors model
+            (directory with one or more safetensors files) or compressed tensors file
+        :param device: optional device to load intermediate weights into
+        :return: generator of decompressed (weight name, tensor) pairs
+        """
+        raise NotImplementedError()
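`DenseCompressor` in the next hunk is the smallest concrete implementation of this interface. For a custom format, a subclass registers itself by name and implements both hooks; a hedged sketch follows, where "my-format" is a hypothetical registry name and the streaming pattern is borrowed from `load_compressed` in `helpers.py`:

```python
# Hedged sketch of a custom Compressor subclass; "my-format" is hypothetical.
from typing import Dict, Generator, Tuple

from compressed_tensors.compressors import Compressor
from compressed_tensors.utils.safetensors_load import get_weight_mappings
from safetensors import safe_open
from torch import Tensor


@Compressor.register(name="my-format")
class PassthroughCompressor(Compressor):
    """Stores tensors unmodified; exists only to show the two hooks."""

    def compress(self, model_state: Dict[str, Tensor], **kwargs) -> Dict[str, Tensor]:
        # a real compressor would re-encode tensors here
        return dict(model_state)

    def decompress(
        self, path_to_model_or_tensors: str, device: str = "cpu"
    ) -> Generator[Tuple[str, Tensor], None, None]:
        # stream tensors back out one at a time, as the base contract requires
        weight_mappings = get_weight_mappings(path_to_model_or_tensors)
        for weight_name, safe_path in weight_mappings.items():
            with safe_open(safe_path, framework="pt", device=device) as f:
                yield weight_name, f.get_tensor(weight_name)
```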
{compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/compressors/dense.py
@@ -14,18 +14,18 @@
 
 from typing import Dict, Generator, Tuple
 
-from compressed_tensors.compressors import ModelCompressor
+from compressed_tensors.compressors import Compressor
 from compressed_tensors.config import CompressionFormat
 from torch import Tensor
 
 
-@ModelCompressor.register(name=CompressionFormat.dense_sparsity.value)
-class DenseCompressor(ModelCompressor):
+@Compressor.register(name=CompressionFormat.dense.value)
+class DenseCompressor(Compressor):
     """
     Identity compressor for dense models, returns the original state_dict
     """
 
-    def compress(self, model_state: Dict[str, Tensor]) -> Dict[str, Tensor]:
+    def compress(self, model_state: Dict[str, Tensor], **kwargs) -> Dict[str, Tensor]:
         return model_state
 
     def decompress(
{compressed-tensors-0.3.2 → compressed-tensors-0.4.0}/src/compressed_tensors/compressors/helpers.py
@@ -16,8 +16,8 @@ from pathlib import Path
 from typing import Dict, Generator, Optional, Tuple, Union
 
 import torch
-from compressed_tensors.compressors import ModelCompressor
-from compressed_tensors.config import CompressionConfig, CompressionFormat
+from compressed_tensors.compressors import Compressor
+from compressed_tensors.config import CompressionFormat, SparsityCompressionConfig
 from compressed_tensors.utils.safetensors_load import get_weight_mappings
 from safetensors import safe_open
 from safetensors.torch import save_file
@@ -48,20 +48,20 @@ def save_compressed(
     if tensors is None or len(tensors) == 0:
         raise ValueError("No tensors or empty tensors provided to compress")
 
-    # if no compression_format specified, default to `dense_sparsity`
-    compression_format = compression_format or CompressionFormat.dense_sparsity.value
+    # if no compression_format specified, default to `dense`
+    compression_format = compression_format or CompressionFormat.dense.value
 
     if not (
-        compression_format in ModelCompressor.registered_names()
-        or compression_format in ModelCompressor.registered_aliases()
+        compression_format in Compressor.registered_names()
+        or compression_format in Compressor.registered_aliases()
     ):
         raise ValueError(
             f"Unknown compression format: {compression_format}. "
-            f"Must be one of {set(ModelCompressor.registered_names() + ModelCompressor.registered_aliases())}"  # noqa E501
+            f"Must be one of {set(Compressor.registered_names() + Compressor.registered_aliases())}"  # noqa E501
         )
 
     # compress
-    compressor = ModelCompressor.load_from_registry(compression_format)
+    compressor = Compressor.load_from_registry(compression_format)
     # save compressed tensors
     compressed_tensors = compressor.compress(tensors)
     save_file(compressed_tensors, save_path)
@@ -69,7 +69,7 @@ def save_compressed(
 
 def load_compressed(
     compressed_tensors: Union[str, Path],
-    compression_config: CompressionConfig = None,
+    compression_config: SparsityCompressionConfig = None,
     device: Optional[str] = "cpu",
 ) -> Generator[Tuple[str, Tensor], None, None]:
     """
@@ -90,9 +90,9 @@ def load_compressed(
 
     if (
         compression_config is None
-        or compression_config.format == CompressionFormat.dense_sparsity.value
+        or compression_config.format == CompressionFormat.dense.value
     ):
-        # if no compression_config specified, or `dense_sparsity` format specified,
+        # if no compression_config specified, or `dense` format specified,
         # assume tensors are not compressed on disk
         weight_mappings = get_weight_mappings(compressed_tensors)
         for weight_name, file_with_weight_name in weight_mappings.items():
@@ -102,7 +102,7 @@
     else:
         # decompress tensors
        compression_format = compression_config.format
-        compressor = ModelCompressor.load_from_registry(
+        compressor = Compressor.load_from_registry(
             compression_format, config=compression_config
         )
         yield from compressor.decompress(compressed_tensors, device=device)
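Taken together, `save_compressed` and `load_compressed` give a round trip through the renamed default `dense` format. A short sketch follows; the `save_compressed` argument order (tensors, then save path) is assumed from this hunk, which validates `tensors` and ends in `save_file(compressed_tensors, save_path)`:

```python
# Round-trip sketch through the default `dense` format.
import torch
from compressed_tensors.compressors import load_compressed, save_compressed

tensors = {"embedding.weight": torch.randn(16, 8)}

# no compression_format given -> defaults to CompressionFormat.dense.value
save_compressed(tensors, "model.safetensors")

# no compression_config given -> tensors are read back as-is from disk
restored = dict(load_compressed("model.safetensors"))
assert torch.equal(tensors["embedding.weight"], restored["embedding.weight"])
```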
compressed-tensors-0.4.0/src/compressed_tensors/compressors/int_quantized.py (new file)
@@ -0,0 +1,126 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from typing import Dict, Generator, Tuple
+
+import torch
+from compressed_tensors.compressors import Compressor
+from compressed_tensors.config import CompressionFormat
+from compressed_tensors.quantization import QuantizationArgs
+from compressed_tensors.quantization.lifecycle.forward import dequantize, quantize
+from compressed_tensors.quantization.utils import can_quantize
+from compressed_tensors.utils import get_nested_weight_mappings, merge_names
+from safetensors import safe_open
+from torch import Tensor
+from tqdm import tqdm
+
+
+__all__ = ["IntQuantizationCompressor"]
+
+_LOGGER: logging.Logger = logging.getLogger(__name__)
+
+
+@Compressor.register(name=CompressionFormat.int_quantized.value)
+class IntQuantizationCompressor(Compressor):
+    """
+    Integer compression for quantized models. Weight of each quantized layer is
+    converted from its original float type to the format specified by the layer's
+    quantization scheme.
+    """
+
+    COMPRESSION_PARAM_NAMES = ["weight", "weight_scale", "weight_zero_point"]
+
+    def compress(
+        self,
+        model_state: Dict[str, Tensor],
+        model_quant_args: Dict[str, QuantizationArgs],
+        **kwargs,
+    ) -> Dict[str, Tensor]:
+        """
+        Compresses a dense state dict
+
+        :param model_state: state dict of uncompressed model
+        :param model_quant_args: quantization args for each quantized weight, needed
+            for quantize function to calculate bit depth
+        :return: compressed state dict
+        """
+        compressed_dict = {}
+        weight_suffix = ".weight"
+        _LOGGER.debug(
+            f"Compressing model with {len(model_state)} parameterized layers..."
+        )
+
+        for name, value in tqdm(model_state.items(), desc="Compressing model"):
+            if name.endswith(weight_suffix):
+                prefix = name[: -(len(weight_suffix))]
+                scale = model_state.get(merge_names(prefix, "weight_scale"), None)
+                zp = model_state.get(merge_names(prefix, "weight_zero_point"), None)
+                if scale is not None and zp is not None:
+                    # weight is quantized, compress it
+                    quant_args = model_quant_args[prefix]
+                    if can_quantize(value, quant_args):
+                        # only quantize if not already quantized
+                        value = quantize(
+                            x=value,
+                            scale=scale,
+                            zero_point=zp,
+                            args=quant_args,
+                            dtype=torch.int8,
+                        )
+            elif name.endswith("zero_point"):
+                if torch.all(value == 0):
+                    # all zero_points are 0, no need to include in
+                    # compressed state_dict
+                    continue
+            compressed_dict[name] = value.to("cpu")
+
+        return compressed_dict
+
+    def decompress(
+        self, path_to_model_or_tensors: str, device: str = "cpu"
+    ) -> Generator[Tuple[str, Tensor], None, None]:
+        """
+        Reads a compressed state dict located at path_to_model_or_tensors
+        and returns a generator for sequentially decompressing back to a
+        dense state dict
+
+        :param path_to_model_or_tensors: path to compressed safetensors model
+            (directory with one or more safetensors files) or compressed tensors file
+        :param device: optional device to load intermediate weights into
+        :return: generator of decompressed (weight name, tensor) pairs
+        """
+        weight_mappings = get_nested_weight_mappings(
+            path_to_model_or_tensors, self.COMPRESSION_PARAM_NAMES
+        )
+        for weight_name in weight_mappings.keys():
+            weight_data = {}
+            for param_name, safe_path in weight_mappings[weight_name].items():
+                full_name = merge_names(weight_name, param_name)
+                with safe_open(safe_path, framework="pt", device=device) as f:
+                    weight_data[param_name] = f.get_tensor(full_name)
+
+            if "weight_scale" in weight_data:
+                zero_point = weight_data.get("weight_zero_point", None)
+                scale = weight_data["weight_scale"]
+                if zero_point is None:
+                    # zero_point assumed to be 0 if not included in state_dict
+                    zero_point = torch.zeros_like(scale)
+
+                decompressed = dequantize(
+                    x_q=weight_data["weight"],
+                    scale=scale,
+                    zero_point=zero_point,
+                )
+                yield merge_names(weight_name, "weight"), decompressed
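This compressor leans entirely on the `quantize`/`dequantize` pair from `quantization/lifecycle/forward.py`, which this release adds but this hunk does not show. Below is a self-contained sketch of the affine quantization math they are assumed to implement, mirroring the round trip that `compress` and `decompress` perform:

```python
# The round trip IntQuantizationCompressor performs, in miniature. The real
# quantize()/dequantize() live in quantization/lifecycle/forward.py; standard
# affine quantization is assumed here.
import torch


def quantize(x: torch.Tensor, scale, zero_point, dtype=torch.int8) -> torch.Tensor:
    # q = clamp(round(x / scale) + zero_point, qmin, qmax)
    info = torch.iinfo(dtype)
    q = torch.round(x / scale) + zero_point
    return torch.clamp(q, info.min, info.max).to(dtype)


def dequantize(x_q: torch.Tensor, scale, zero_point) -> torch.Tensor:
    # x ~= (q - zero_point) * scale
    return (x_q.to(torch.float32) - zero_point) * scale


weight = torch.randn(4, 4)
scale = weight.abs().max() / 127.0   # per-tensor symmetric scale
zero_point = torch.zeros(1)          # compress() drops all-zero zero_points

w_q = quantize(weight, scale, zero_point)    # what compress() stores as int8
w_hat = dequantize(w_q, scale, zero_point)   # what decompress() yields
assert torch.allclose(weight, w_hat, atol=float(scale))
```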