compressed-tensors 0.9.4a20250412__tar.gz → 0.9.4a20250421__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/.github/workflows/upload.yml +1 -1
- {compressed_tensors-0.9.4a20250412/src/compressed_tensors.egg-info → compressed_tensors-0.9.4a20250421}/PKG-INFO +1 -1
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/compressors/base.py +6 -1
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/compressors/model_compressors/model_compressor.py +90 -7
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/compressors/quantized_compressors/base.py +21 -6
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py +88 -21
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/compressors/sparse_compressors/base.py +21 -4
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/quantization/lifecycle/apply.py +65 -30
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/quantization/lifecycle/initialize.py +13 -2
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/utils/offload.py +0 -1
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/utils/safetensors_load.py +10 -8
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/version.py +2 -2
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421/src/compressed_tensors.egg-info}/PKG-INFO +1 -1
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_compressors/model_compressors/test_model_compressor.py +2 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_compressors/quantized_compressors/test_fp8_quant.py +1 -1
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_compressors/quantized_compressors/test_int_quant.py +2 -2
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_compressors/quantized_compressors/test_pack_quant.py +62 -4
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/.github/.gitkeep +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/.github/actions/test/action.yml +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/.github/scripts/step-status +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/.github/workflows/build-test.yml +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/.github/workflows/build.yml +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/.github/workflows/report.yml +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/.github/workflows/test-check.yaml +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/.github/workflows/test.yml +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/.github/workflows/trigger-all.yml +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/.gitignore +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/LICENSE +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/Makefile +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/README.md +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/examples/bit_packing/ex_quantize_and_pack.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/examples/bit_packing/int4_config.json +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/examples/bitmask_compression.ipynb +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/examples/llama_1.1b/ex_config_quantization.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/examples/llama_1.1b/ex_llmcompressor_quantization.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/examples/llama_1.1b/example_quant_config.json +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/examples/llama_1.1b/example_quant_recipe.yaml +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/examples/quantize_and_pack_int4.ipynb +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/pyproject.toml +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/setup.cfg +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/setup.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/__init__.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/README.md +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/__init__.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/base.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/compressors/__init__.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/compressors/helpers.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/compressors/model_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/compressors/quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/compressors/quantized_compressors/naive_quantized.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/compressors/sparse_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/compressors/sparse_compressors/dense.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/compressors/sparse_compressors/sparse_bitmask.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/compressors/sparse_quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/compressors/sparse_quantized_compressors/marlin_24.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/config/__init__.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/config/base.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/config/dense.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/config/sparse_24_bitmask.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/config/sparse_bitmask.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/linear/__init__.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/linear/compressed_linear.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/quantization/__init__.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/quantization/lifecycle/__init__.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/quantization/lifecycle/compressed.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/quantization/lifecycle/forward.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/quantization/lifecycle/helpers.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/quantization/quant_args.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/quantization/quant_config.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/quantization/quant_scheme.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/quantization/utils/__init__.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/quantization/utils/helpers.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/registry/__init__.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/registry/registry.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/utils/__init__.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/utils/helpers.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/utils/permutations_24.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/utils/permute.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/utils/semi_structured_conversions.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors.egg-info/SOURCES.txt +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors.egg-info/dependency_links.txt +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors.egg-info/requires.txt +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors.egg-info/top_level.txt +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/__init__.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/conftest.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_compressors/model_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_compressors/quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_compressors/sparse_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_compressors/sparse_compressors/test_bitmask.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_compressors/sparse_compressors/test_sparse_24_bitmask.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_compressors/sparse_quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_compressors/sparse_quantized_compressors/test_marlin_24.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_configs/__init__.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_configs/test_base.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_examples/test_bitmask_compression_ipynb.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_linear/__init__.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_linear/test_compressed_linear.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_quantization/__init__.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_quantization/lifecycle/__init__.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_quantization/lifecycle/conftest.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_quantization/lifecycle/test_apply.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_quantization/lifecycle/test_dynamic_lifecycle.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_quantization/lifecycle/test_enabled.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_quantization/lifecycle/test_forward.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_quantization/lifecycle/test_helpers.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_quantization/lifecycle/test_initialize.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_quantization/lifecycle/test_lifecycle.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_quantization/test_configs/__init__.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_quantization/test_configs/test_bit_depths.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_quantization/test_configs/test_strategies.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_quantization/test_quant_args.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_quantization/test_quant_config.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_quantization/test_quant_scheme.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_quantization/test_utils/test_helpers.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_registry.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_utils/__init__.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_utils/test_helpers.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_utils/test_offload.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/test_utils/test_safetensors_load.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/tests/testing_utils.py +0 -0
- {compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/utils/copyright.py +0 -0
{compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/.github/workflows/upload.yml
RENAMED
@@ -99,7 +99,7 @@ jobs:
 
       - name: check if whl is new
         id: check-whl
-        uses: neuralmagic/nm-actions/actions/check-whl-on-pypi@v1.
+        uses: neuralmagic/nm-actions/actions/check-whl-on-pypi@v1.19.0
         with:
           whl: ${{ steps.find-asset-whl.outputs.asset }}
 
{compressed_tensors-0.9.4a20250412/src/compressed_tensors.egg-info → compressed_tensors-0.9.4a20250421}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.9.4a20250412
+Version: 0.9.4a20250421
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
{compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/compressors/base.py
RENAMED
@@ -19,6 +19,7 @@ import torch
 from compressed_tensors.config import SparsityCompressionConfig
 from compressed_tensors.quantization import QuantizationArgs, QuantizationConfig
 from compressed_tensors.registry import RegistryMixin
+from compressed_tensors.utils import has_offloaded_params
 from torch import Tensor
 from torch.nn import Module
 
@@ -169,6 +170,10 @@ class BaseCompressor(RegistryMixin, ABC):
         :param module: PyTorch module to decompress
         :return: tensor of the decompressed weight, or None if module is not quantized
         """
+
+        params_device = next(module.parameters()).device
+        device = "cpu" if has_offloaded_params(module) else params_device
+
         if not hasattr(module, "quantization_scheme"):
             return None  # module is not quantized
         quantization_scheme = module.quantization_scheme
@@ -182,7 +187,7 @@ class BaseCompressor(RegistryMixin, ABC):
 
         return self.decompress_weight(
             compressed_data=compressed_data, quantization_args=quantization_args
-        )
+        ).to(device)
 
     def decompress_weight(
         self, compressed_data: Dict[str, Tensor], **kwargs
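The base.py change above restores each decompressed weight to the owning module's execution device, falling back to "cpu" when the module's parameters are offloaded so that offload hooks can manage placement later. A minimal sketch of that device-selection pattern, assuming compressed-tensors is installed; `target_device_for` is a hypothetical helper name, not library API:

```python
import torch
from compressed_tensors.utils import has_offloaded_params  # imported by the diff above


def target_device_for(module: torch.nn.Module) -> torch.device:
    # Offloaded modules should be materialized on CPU; otherwise follow
    # the device of the module's existing parameters.
    if has_offloaded_params(module):
        return torch.device("cpu")
    return next(module.parameters()).device


linear = torch.nn.Linear(4, 4)
decompressed = torch.zeros(4, 4)
# Mirrors the `.to(device)` applied to decompress_weight(...) in the hunk above.
restored = decompressed.to(target_device_for(linear))
print(restored.device)  # cpu: a plain Linear has no offload hooks registered
```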
{compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/compressors/model_compressors/model_compressor.py
RENAMED
@@ -31,13 +31,14 @@ from compressed_tensors.base import (
     SPARSITY_CONFIG_NAME,
 )
 from compressed_tensors.compressors.base import BaseCompressor
+from compressed_tensors.compressors.sparse_compressors import DenseCompressor
 from compressed_tensors.config import CompressionFormat, SparsityCompressionConfig
 from compressed_tensors.quantization import (
     DEFAULT_QUANTIZATION_METHOD,
     QuantizationConfig,
     QuantizationStatus,
     apply_quantization_config,
-    load_pretrained_quantization,
+    load_pretrained_quantization_parameters,
 )
 from compressed_tensors.quantization.lifecycle import expand_target_names
 from compressed_tensors.quantization.quant_args import QuantizationArgs
@@ -47,7 +48,9 @@ from compressed_tensors.quantization.utils import (
 )
 from compressed_tensors.utils import (
     get_safetensors_folder,
+    has_offloaded_params,
     merge_names,
+    register_offload_parameter,
     update_parameter_data,
 )
 from compressed_tensors.utils.helpers import (
@@ -382,6 +385,7 @@ class ModelCompressor:
             compressed_state_dict = self.quantization_compressor.compress(
                 state_dict, names_to_scheme=quantized_modules_to_args
             )
+
             if self.quantization_config.format != CompressionFormat.dense.value:
                 self.quantization_config.quantization_status = (
                     QuantizationStatus.COMPRESSED
@@ -411,6 +415,13 @@ class ModelCompressor:
 
         :param model_path: path to compressed weights
         :param model: pytorch model to load decompressed weights into
+
+        Note: decompress makes use of both _replace_sparsity_weights and _replace_weights
+        The variations in these methods are a result of the subtle variations between the sparsity
+        and quantization compressors. Specifically, quantization compressors return not just the
+        decompressed weight, but the quantization parameters (e.g scales, zero_point) whereas sparsity
+        compressors only return the decompressed weight.
+
         """
         model_path = get_safetensors_folder(model_path)
         sparse_decompressed = False
@@ -419,9 +430,16 @@ class ModelCompressor:
             self.sparsity_compressor is not None
             and self.sparsity_config.format != CompressionFormat.dense.value
         ):
+            params_to_ignore = None
+            if self.quantization_compressor is not None:
+                params_to_ignore = self.quantization_compressor.compression_param_names
             # Sparse decompression is applied on the model_path
-            dense_gen = self.sparsity_compressor.decompress(model_path)
-            self._replace_weights(dense_gen, model)
+            # The compressor will try and load any quantization parameters as well
+            # params_to_skip_load will skip over quantization params from being loaded
+            dense_gen = self.sparsity_compressor.decompress(
+                model_path, params_to_skip_load=params_to_ignore
+            )
+            self._replace_sparsity_weights(dense_gen, model)
             setattr(model, SPARSITY_CONFIG_NAME, self.sparsity_compressor.config)
             sparse_decompressed = True
 
@@ -430,13 +448,27 @@ class ModelCompressor:
             # quantization during apply_quantization_config. This ensures
             # that the dtypes of the weights are not unintentionally updated.
             # The status is restored after quantization params are loaded.
+
             with override_quantization_status(
                 self.quantization_config, QuantizationStatus.FROZEN
             ):
+
                 names_to_scheme = apply_quantization_config(
                     model, self.quantization_config
                 )
-                load_pretrained_quantization(model, model_path)
+                # Load activation scales/zp or any other quantization parameters
+                # Conditionally load the weight quantization parameters if we have a dense compressor
+                # Or if a sparsity compressor has already been applied
+                load_pretrained_quantization_parameters(
+                    model,
+                    model_path,
+                    # TODO: all weight quantization params will be moved to the compressor in a follow-up
+                    # including initialization
+                    load_weight_quantization=(
+                        sparse_decompressed
+                        or isinstance(self.quantization_compressor, DenseCompressor)
+                    ),
+                )
 
             model_path_or_state_dict = (
                 model.state_dict() if sparse_decompressed else model_path
@@ -445,6 +477,8 @@ class ModelCompressor:
             dense_gen = self.quantization_compressor.decompress(
                 model_path_or_state_dict, names_to_scheme=names_to_scheme
            )
+            # TODO: all weight quantization params will be moved to the compressor
+            # to prevent duplicate parameter updates in update_parameter_data
             self._replace_weights(dense_gen, model)
 
             def freeze_quantization_status(module):
@@ -500,7 +534,7 @@ class ModelCompressor:
         with open(config_file_path, "w") as config_file:
             json.dump(config_data, config_file, indent=2, sort_keys=True)
 
-    def _replace_weights(self, dense_weight_generator, model: Module):
+    def _replace_sparsity_weights(self, dense_weight_generator, model: Module):
         """
         Replace the weights of the model with the
         provided dense weights.
@@ -515,11 +549,60 @@ class ModelCompressor:
         :param model: The model whose weights are to be updated.
         """
         for name, data in tqdm(dense_weight_generator, desc="Decompressing model"):
+
             split_name = name.split(".")
             prefix, param_name = ".".join(split_name[:-1]), split_name[-1]
             module = operator.attrgetter(prefix)(model)
-            update_parameter_data(module, data, param_name)
-
+
+            params_device = next(module.parameters()).device
+            device = "cpu" if has_offloaded_params(module) else params_device
+            delattr(module, param_name)
+            requires_grad = data.dtype in (torch.float16, torch.float32, torch.bfloat16)
+            param = torch.nn.Parameter(data.to(device), requires_grad=requires_grad)
+            register_offload_parameter(module, param_name, param)
+
+    def _replace_weights(self, dense_weight_generator, model: Module):
+        """
+        Replace the weights of the model with the
+        provided dense weights.
+
+        This method iterates over the dense_weight_generator and
+        updates the corresponding weights in the model. If a parameter
+        name does not exist in the model, it will be skipped.
+
+        :param dense_weight_generator (generator): A generator that yields
+            tuples of (name, data), where 'name' is the parameter name and
+            'data' is the updated param data
+        :param model: The model whose weights are to be updated.
+        """
+
+        for name, data in tqdm(dense_weight_generator, desc="Decompressing model"):
+            module = operator.attrgetter(name)(model)
+
+            params_device = next(module.parameters()).device
+            device = "cpu" if has_offloaded_params(module) else params_device
+
+            for param_name, param_data in data.items():
+                if hasattr(module, param_name):
+                    # If compressed, will have an incorrect dtype for transformers >4.49
+                    # TODO: we can also just skip initialization of scales/zp if in decompression in init
+                    # to be consistent with loading which happens later as well
+                    # however, update_data does a good shape check - should be moved to the compressor
+                    if param_name == "weight":
+                        delattr(module, param_name)
+                        requires_grad = param_data.dtype in (
+                            torch.float16,
+                            torch.float32,
+                            torch.bfloat16,
+                        )
+                        param = torch.nn.Parameter(
+                            param_data.to(device), requires_grad=requires_grad
+                        )
+                        register_offload_parameter(module, param_name, param)
+                    else:
+                        # Should already be registered to the correct device for
+                        # for scales/zero-points
+                        update_parameter_data(module, param_data, param_name)
 
 
 def map_modules_to_quant_args(
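Taken together, the model_compressor.py hunks make `decompress` run the sparsity compressor first over the checkpoint on disk (skipping any quantization parameters via `params_to_skip_load`), then run the quantization compressor over the resulting in-memory state dict. An illustrative usage sketch, assuming a local checkpoint path; loading details may vary by model:

```python
# Hypothetical usage; the checkpoint path is a placeholder.
from compressed_tensors.compressors import ModelCompressor
from transformers import AutoModelForCausalLM

model_path = "./my-compressed-checkpoint"  # assumed local compressed checkpoint
model = AutoModelForCausalLM.from_pretrained(model_path)

compressor = ModelCompressor.from_pretrained(model_path)
if compressor is not None:
    # Per the diff: sparse decompression runs first (quantization params are
    # skipped via params_to_skip_load), then quantization decompression runs
    # on the state dict produced by the sparsity pass.
    compressor.decompress(model_path, model)
```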
{compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/compressors/quantized_compressors/base.py
RENAMED
@@ -14,11 +14,11 @@
 
 import logging
 from pathlib import Path
-from typing import Any, Dict, Generator, Tuple, Union
+from typing import Any, Dict, Generator, Optional, Tuple, Union
 
 import torch
 from compressed_tensors.compressors.base import BaseCompressor
-from compressed_tensors.quantization import QuantizationArgs
+from compressed_tensors.quantization import QuantizationArgs, QuantizationStrategy
 from compressed_tensors.utils import (
     get_nested_mappings_from_state_dict,
     get_nested_weight_mappings,
@@ -132,8 +132,10 @@ class BaseQuantizationCompressor(BaseCompressor):
                     compressed_dict[merge_names(prefix, key)] = value
                 else:
                     compressed_dict[name] = value.to("cpu")
-            # only save if asym
-            elif is_weight_zp and quant_args_zp.symmetric:
+            # only save zp if asym and not packed zp
+            elif is_weight_zp and (
+                quant_args_zp.symmetric or self._check_if_zp_pack_quantized(quant_args)
+            ):
                 continue
             # only save if asym
             elif is_input_zp and input_args_zp.symmetric:
@@ -145,6 +147,17 @@ class BaseQuantizationCompressor(BaseCompressor):
 
         return compressed_dict
 
+    def _check_if_zp_pack_quantized(self, quant_args):
+        from compressed_tensors.compressors import PackedQuantizationCompressor
+
+        if isinstance(self, PackedQuantizationCompressor):
+            if not quant_args.symmetric and quant_args.strategy in [
+                QuantizationStrategy.GROUP.value,
+                QuantizationStrategy.CHANNEL.value,
+            ]:
+                return True
+        return False
+
     def decompress(
         self,
         path_to_model_or_tensors: Union[str, Path, Dict[str, Any]],
@@ -186,7 +199,8 @@ class BaseQuantizationCompressor(BaseCompressor):
             decompressed = self.decompress_weight(
                 compressed_data=weight_data, quantization_args=quant_args
             )
-            yield merge_names(weight_name, "weight"), decompressed
+            weight_data["weight"] = decompressed
+            yield weight_name, weight_data
 
     def _decompress_from_state_dict(self, state_dict, names_to_scheme):
         weight_mappings = get_nested_mappings_from_state_dict(
@@ -202,4 +216,5 @@ class BaseQuantizationCompressor(BaseCompressor):
             decompressed = self.decompress_weight(
                 compressed_data=weight_data, quantization_args=quant_args
             )
-            yield merge_names(weight_name, "weight"), decompressed
+            weight_data["weight"] = decompressed
+            yield weight_name, weight_data
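The new `_check_if_zp_pack_quantized` determines when `compress` may drop the standalone zero-point tensor: the packed compressor stores asymmetric group/channel zero points packed inside the compressed dict instead. A standalone sketch of that decision, with a local enum mirroring the library's strategy values and `is_packed_compressor` standing in for the `isinstance` check:

```python
from enum import Enum


class QuantizationStrategy(str, Enum):  # mirrors the values used in the diff
    TENSOR = "tensor"
    CHANNEL = "channel"
    GROUP = "group"


def zp_is_pack_quantized(symmetric: bool, strategy: str, is_packed_compressor: bool) -> bool:
    # Packed compressors pack asymmetric group/channel zero points themselves,
    # so compress() can skip saving them as a separate tensor.
    return (
        is_packed_compressor
        and not symmetric
        and strategy in (QuantizationStrategy.GROUP.value, QuantizationStrategy.CHANNEL.value)
    )


assert zp_is_pack_quantized(False, "group", True) is True    # asym grouped: packed
assert zp_is_pack_quantized(True, "group", True) is False    # symmetric: zp is trivial
assert zp_is_pack_quantized(False, "tensor", True) is False  # per-tensor zp: not packed
```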
{compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py
RENAMED
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import math
-from typing import Dict, Optional, Tuple
+from typing import Dict, Literal, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -21,7 +21,7 @@ from compressed_tensors.compressors.quantized_compressors.base import (
     BaseQuantizationCompressor,
 )
 from compressed_tensors.config import CompressionFormat
-from compressed_tensors.quantization import QuantizationArgs
+from compressed_tensors.quantization import QuantizationArgs, QuantizationStrategy
 from compressed_tensors.quantization.lifecycle.forward import dequantize, quantize
 from compressed_tensors.quantization.utils import can_quantize
 from torch import Tensor
@@ -65,10 +65,26 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
         """
         pack_factor = 32 // quantization_args.num_bits
         packed_size = math.ceil(weight_shape[1] / pack_factor)
-        return {
+        packed_size_zp = math.ceil(weight_shape[0] / pack_factor)
+        output = {
             "weight_packed": (torch.Size((weight_shape[0], packed_size)), torch.int32),
             "weight_shape": (torch.Size((2,)), torch.int32),
         }
+        if not quantization_args.symmetric and quantization_args.strategy in [
+            QuantizationStrategy.GROUP.value,
+            QuantizationStrategy.CHANNEL.value,
+        ]:
+            zp_factor = (
+                quantization_args.group_size
+                if quantization_args.strategy == QuantizationStrategy.GROUP.value
+                else weight_shape[-1]
+            )
+
+            output["weight_zero_point"] = (
+                torch.Size((packed_size_zp, weight_shape[-1] // zp_factor)),
+                torch.int32,
+            )
+        return output
 
     def compress_weight(
         self,
@@ -104,6 +120,7 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
             quantized_weight = weight
 
         packed_weight = pack_to_int32(quantized_weight, quantization_args.num_bits)
+
         weight_shape = torch.tensor(weight.shape)
         if device is not None:
             packed_weight = packed_weight.to(device)
@@ -112,6 +129,15 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
         compressed_dict["weight_shape"] = weight_shape
         compressed_dict["weight_packed"] = packed_weight
 
+        # We typically don't compress zp; apart from when using the packed_compressor and when storing group/channel zp
+        if not quantization_args.symmetric and quantization_args.strategy in [
+            QuantizationStrategy.GROUP.value,
+            QuantizationStrategy.CHANNEL.value,
+        ]:
+            packed_zp = pack_to_int32(
+                zero_point, quantization_args.num_bits, packed_dim=0
+            )
+            compressed_dict["weight_zero_point"] = packed_zp
         return compressed_dict
 
     def decompress_weight(
@@ -133,6 +159,21 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
         original_shape = torch.Size(compressed_data["weight_shape"])
         num_bits = quantization_args.num_bits
         unpacked = unpack_from_int32(weight, num_bits, original_shape)
+
+        # NOTE: this will fail decompression as we don't currently handle packed zp on decompression
+        if not quantization_args.symmetric and quantization_args.strategy in [
+            QuantizationStrategy.GROUP.value,
+            QuantizationStrategy.CHANNEL.value,
+        ]:
+            raise ValueError(
+                "Decompression of packed zero points is currently not supported"
+            )
+            assert zero_point is not None
+            original_zp_shape = (original_shape[0], scale.shape[-1])
+            zero_point = unpack_from_int32(
+                zero_point, num_bits, original_zp_shape, packed_dim=0
+            )
+
         decompressed_weight = dequantize(
             x_q=unpacked, scale=scale, zero_point=zero_point, g_idx=g_idx
         )
@@ -140,7 +181,11 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
         return decompressed_weight
 
 
-def pack_to_int32(value: torch.Tensor, num_bits: int) -> torch.Tensor:
+def pack_to_int32(
+    value: torch.Tensor,
+    num_bits: int,
+    packed_dim: Union[Literal[0], Literal[1]] = 1,
+) -> torch.Tensor:
     """
     Packs a tensor of quantized weights stored in int8 into int32s with padding
 
@@ -176,14 +221,19 @@ def pack_to_int32(value: torch.Tensor, num_bits: int) -> torch.Tensor:
     pack_factor = 32 // num_bits
 
     # pad input tensor and initialize packed output
-    packed_size = math.ceil(value.shape[1] / pack_factor)
-    padding = packed_size * pack_factor - value.shape[1]
+    packed_size = math.ceil(value.shape[packed_dim] / pack_factor)
+    padding = packed_size * pack_factor - value.shape[packed_dim]
     value = np.pad(value, pad_width=[(0, 0), (0, padding)], constant_values=0)
 
     # pack values
-    packed = np.zeros((value.shape[0], packed_size), dtype=np.uint32)
-    for i in range(pack_factor):
-        packed |= value[:, i::pack_factor] << num_bits * i
+    if packed_dim == 1:
+        packed = np.zeros((value.shape[0], packed_size), dtype=np.uint32)
+        for i in range(pack_factor):
+            packed |= value[:, i::pack_factor] << num_bits * i
+    else:
+        packed = np.zeros((packed_size, value.shape[1]), dtype=np.uint32)
+        for i in range(pack_factor):
+            packed |= value[i::pack_factor, :] << num_bits * i
 
     # convert back to signed and torch
     packed = np.ascontiguousarray(packed).view(np.int32)
@@ -191,7 +241,10 @@ def pack_to_int32(value: torch.Tensor, num_bits: int) -> torch.Tensor:
 
 
 def unpack_from_int32(
-    value: torch.Tensor, num_bits: int, shape: torch.Size
+    value: torch.Tensor,
+    num_bits: int,
+    shape: torch.Size,
+    packed_dim: Union[Literal[0], Literal[1]] = 1,
 ) -> torch.Tensor:
     """
     Unpacks a tensor of packed int32 weights into individual int8s, maintaining the
@@ -216,17 +269,31 @@ def unpack_from_int32(
 
     # unpack
     mask = (1 << num_bits) - 1
-    unpacked = torch.zeros(
-        (value.shape[0], value.shape[1] * pack_factor),
-        device=value.device,
-        dtype=torch.int32,
-    )
-    for i in range(pack_factor):
-        unpacked[:, i::pack_factor] = (value >> (num_bits * i)) & mask
-
-    # remove padding
-    original_row_size = int(shape[1])
-    unpacked = unpacked[:, :original_row_size]
+
+    if packed_dim == 1:
+        unpacked = torch.zeros(
+            (value.shape[0], value.shape[1] * pack_factor),
+            device=value.device,
+            dtype=torch.int32,
+        )
+        for i in range(pack_factor):
+            unpacked[:, i::pack_factor] = (value >> (num_bits * i)) & mask
+
+        # remove padding
+        original_row_size = int(shape[1])
+        unpacked = unpacked[:, :original_row_size]
+    else:
+        unpacked = torch.zeros(
+            (value.shape[0] * pack_factor, value.shape[1]),
+            device=value.device,
+            dtype=torch.int32,
+        )
+        for i in range(pack_factor):
+            unpacked[i::pack_factor, :] = (value >> (num_bits * i)) & mask
+
+        # remove padding
+        original_row_size = int(shape[0])
+        unpacked = unpacked[:original_row_size, :]
 
     # bits are packed in unsigned format, reformat to signed
     # update the value range from unsigned to signed
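The pack_quantized.py hunks generalize `pack_to_int32`/`unpack_from_int32` to pack along either dimension (`packed_dim=0` is used for the new packed zero points). A simplified, self-contained numpy sketch of the `packed_dim=1` round trip, re-implemented here for illustration rather than copied from the library (the dim-0 case is the same logic transposed); helper names are ours:

```python
import math
import numpy as np


def pack_rows_to_int32(value: np.ndarray, num_bits: int) -> np.ndarray:
    # Pack along dim 1: each output int32 holds 32 // num_bits values per row.
    pack_factor = 32 // num_bits
    packed_size = math.ceil(value.shape[1] / pack_factor)
    padding = packed_size * pack_factor - value.shape[1]
    value = np.pad(value, [(0, 0), (0, padding)], constant_values=0)
    packed = np.zeros((value.shape[0], packed_size), dtype=np.uint32)
    for i in range(pack_factor):
        packed |= value[:, i::pack_factor].astype(np.uint32) << (num_bits * i)
    return packed.view(np.int32)


def unpack_rows_from_int32(packed: np.ndarray, num_bits: int, cols: int) -> np.ndarray:
    pack_factor = 32 // num_bits
    mask = (1 << num_bits) - 1
    unpacked = np.zeros((packed.shape[0], packed.shape[1] * pack_factor), dtype=np.int32)
    for i in range(pack_factor):
        unpacked[:, i::pack_factor] = (packed.view(np.uint32) >> (num_bits * i)) & mask
    return unpacked[:, :cols]  # drop pad columns


# 4-bit values in unsigned form [0, 15]; the library shifts signed int4 into
# this range before packing.
rng = np.random.default_rng(0)
w = rng.integers(0, 16, size=(4, 10), dtype=np.int32)
packed = pack_rows_to_int32(w, num_bits=4)
assert packed.shape == (4, 2)  # ceil(10 / 8) = 2 int32s per row
assert (unpack_rows_from_int32(packed, 4, 10) == w).all()
```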
{compressed_tensors-0.9.4a20250412 → compressed_tensors-0.9.4a20250421}/src/compressed_tensors/compressors/sparse_compressors/base.py
RENAMED
@@ -98,7 +98,11 @@ class BaseSparseCompressor(BaseCompressor):
         return compressed_dict
 
     def decompress(
-        self, path_to_model_or_tensors: str, device: str = "cpu", **kwargs
+        self,
+        path_to_model_or_tensors: str,
+        device: str = "cpu",
+        params_to_skip_load: Optional[Tuple] = None,
+        **kwargs,
     ) -> Generator[Tuple[str, Tensor], None, None]:
         """
         Reads a bitmask compressed state dict located
@@ -108,6 +112,11 @@ class BaseSparseCompressor(BaseCompressor):
         :param model_path: path to compressed safetensors model (directory with
             one or more safetensors files) or compressed tensors file
         :param device: device to load decompressed weights onto
+        :param params_to_skip_load: a list of non-sparsity parameters (e.g quantization
+            parameters) that we want to skip loading. As the sparsity compresssor does
+            not handle quantized decompression, this should contain any quantization
+            parameters when decompressing stacked compressors. We want these parameters
+            to be handled by the quantization decompressor
         :return: iterator for generating decompressed weights
         """
         weight_mappings, ignored_params = get_nested_weight_mappings(
@@ -121,13 +130,21 @@ class BaseSparseCompressor(BaseCompressor):
                 full_name = merge_names(weight_name, param_name)
                 with safe_open(safe_path, framework="pt", device=device) as f:
                     weight_data[param_name] = f.get_tensor(full_name)
+
             decompressed = self.decompress_weight(weight_data)
             yield merge_names(weight_name, "weight"), decompressed
 
         for ignored_param_name, safe_path in ignored_params.items():
-            with safe_open(safe_path, framework="pt", device=device) as f:
-                value = f.get_tensor(ignored_param_name)
-            yield ignored_param_name, value
+            should_skip = False
+            if params_to_skip_load is not None:
+                for param_to_skip in params_to_skip_load:
+                    if param_to_skip in ignored_param_name:
+                        should_skip = True
+
+            if not should_skip:
+                with safe_open(safe_path, framework="pt", device=device) as f:
+                    value = f.get_tensor(ignored_param_name)
+                    yield ignored_param_name, value
 
     @staticmethod
     def should_compress(name: str, expanded_targets: Optional[Set[str]] = None) -> bool: