compressed-tensors 0.9.5a20250513__tar.gz → 0.9.5a20250514__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {compressed_tensors-0.9.5a20250513/src/compressed_tensors.egg-info → compressed_tensors-0.9.5a20250514}/PKG-INFO +1 -1
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/compressors/model_compressors/model_compressor.py +131 -3
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/compressors/quantized_compressors/base.py +14 -7
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/compressors/sparse_compressors/base.py +44 -6
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/compressors/sparse_compressors/dense.py +7 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py +2 -6
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/compressors/sparse_quantized_compressors/marlin_24.py +5 -1
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/linear/compressed_linear.py +3 -2
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/utils/helpers.py +0 -7
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/utils/safetensors_load.py +16 -3
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/version.py +1 -1
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514/src/compressed_tensors.egg-info}/PKG-INFO +1 -1
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_compressors/model_compressors/test_model_compressor.py +113 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_compressors/sparse_compressors/test_sparse_24_bitmask.py +2 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/.github/.gitkeep +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/.github/actions/test/action.yml +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/.github/scripts/step-status +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/.github/workflows/build-test.yml +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/.github/workflows/build.yml +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/.github/workflows/report.yml +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/.github/workflows/test-check.yaml +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/.github/workflows/test.yml +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/.github/workflows/trigger-all.yml +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/.github/workflows/upload.yml +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/.gitignore +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/LICENSE +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/Makefile +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/README.md +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/examples/bit_packing/ex_quantize_and_pack.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/examples/bit_packing/int4_config.json +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/examples/bitmask_compression.ipynb +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/examples/llama_1.1b/ex_config_quantization.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/examples/llama_1.1b/ex_llmcompressor_quantization.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/examples/llama_1.1b/example_quant_config.json +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/examples/llama_1.1b/example_quant_recipe.yaml +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/examples/quantize_and_pack_int4.ipynb +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/pyproject.toml +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/setup.cfg +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/setup.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/README.md +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/base.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/compressors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/compressors/base.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/compressors/helpers.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/compressors/model_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/compressors/quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/compressors/quantized_compressors/naive_quantized.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/compressors/quantized_compressors/nvfp4_quantized.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/compressors/sparse_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/compressors/sparse_compressors/sparse_bitmask.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/compressors/sparse_quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/config/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/config/base.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/config/dense.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/config/sparse_24_bitmask.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/config/sparse_bitmask.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/linear/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/quantization/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/quantization/lifecycle/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/quantization/lifecycle/apply.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/quantization/lifecycle/compressed.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/quantization/lifecycle/forward.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/quantization/lifecycle/helpers.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/quantization/lifecycle/initialize.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/quantization/quant_args.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/quantization/quant_config.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/quantization/quant_scheme.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/quantization/utils/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/quantization/utils/helpers.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/registry/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/registry/registry.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/utils/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/utils/offload.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/utils/permutations_24.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/utils/permute.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/utils/semi_structured_conversions.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors.egg-info/SOURCES.txt +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors.egg-info/dependency_links.txt +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors.egg-info/requires.txt +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors.egg-info/top_level.txt +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/conftest.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_compressors/model_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_compressors/quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_compressors/quantized_compressors/test_fp8_quant.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_compressors/quantized_compressors/test_int_quant.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_compressors/quantized_compressors/test_nvfp4_quant.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_compressors/quantized_compressors/test_pack_quant.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_compressors/sparse_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_compressors/sparse_compressors/test_bitmask.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_compressors/sparse_quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_compressors/sparse_quantized_compressors/test_marlin_24.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_configs/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_configs/test_base.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_examples/test_bitmask_compression_ipynb.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_linear/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_linear/test_compressed_linear.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_quantization/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_quantization/lifecycle/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_quantization/lifecycle/conftest.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_quantization/lifecycle/test_apply.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_quantization/lifecycle/test_dynamic_lifecycle.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_quantization/lifecycle/test_enabled.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_quantization/lifecycle/test_forward.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_quantization/lifecycle/test_helpers.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_quantization/lifecycle/test_initialize.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_quantization/lifecycle/test_lifecycle.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_quantization/test_configs/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_quantization/test_configs/test_bit_depths.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_quantization/test_configs/test_strategies.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_quantization/test_quant_args.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_quantization/test_quant_config.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_quantization/test_quant_scheme.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_quantization/test_utils/test_helpers.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_registry.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_utils/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_utils/test_helpers.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_utils/test_offload.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_utils/test_safetensors_load.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/testing_utils.py +0 -0
- {compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/utils/copyright.py +0 -0
{compressed_tensors-0.9.5a20250513/src/compressed_tensors.egg-info → compressed_tensors-0.9.5a20250514}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.9.5a20250513
+Version: 0.9.5a20250514
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
{compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/compressors/model_compressors/model_compressor.py

@@ -47,6 +47,9 @@ from compressed_tensors.quantization.utils import (
     iter_named_leaf_modules,
 )
 from compressed_tensors.utils import (
+    align_module_device,
+    delete_offload_parameter,
+    get_execution_device,
     get_safetensors_folder,
     has_offloaded_params,
     merge_names,
@@ -98,6 +101,9 @@ class ModelCompressor:
     :param quantization_config: config specifying quantization compression parameters
     """

+    sparsity_config: Optional[SparsityCompressionConfig] = None
+    quantization_config: Optional[QuantizationConfig] = None
+
     @classmethod
     def from_pretrained(
         cls,
@@ -261,6 +267,8 @@ class ModelCompressor:
             quantization_config.format, config=quantization_config
         )

+    # ----- used by hf quantizer ----- #
+
     def get_missing_module_keys(self, model: Module) -> List[str]:
         """
         Identifies the expected missing weight keys in the compressed state_dict.
@@ -270,7 +278,6 @@ class ModelCompressor:
         This function determines which weight keys are missing based on the
         applied compression techniques.

-
         :param model: The PyTorch model to check for missing keys.
         :return: A list of missing keys expected in the compressed state_dict.
         """
@@ -362,8 +369,124 @@ class ModelCompressor:

         return list(unexpected_keys)

+    # ----- model memory compression/decompression pathways ----- #
+
+    def compress_model(self, model: Module):
+        """
+        Compress a model in memory. Because the model structure is modified in place,
+        this method is more memory-efficient than `self.compress`
+
+        :param model: model containing parameters to compress
+        """
+        module_to_scheme = map_module_to_scheme(model)
+        sparse_compression_targets: Set[str] = expand_target_names(
+            model=model,
+            targets=self.sparsity_config.targets if self.sparsity_config else [],
+            ignore=self.sparsity_config.ignore if self.sparsity_config else [],
+        )
+
+        for prefix, module in tqdm(model.named_modules(), desc="Compressing model"):
+            if prefix in module_to_scheme or prefix in sparse_compression_targets:
+                # in the future, support compression on same device
+                with align_module_device(module, execution_device="cpu"):
+                    state_dict = module.state_dict(prefix=f"{prefix}.")
+
+                # quantization first
+                if prefix in module_to_scheme:
+                    state_dict = self.quantization_compressor.compress(
+                        state_dict,
+                        names_to_scheme=module_to_scheme,
+                        show_progress=False,
+                    )
+
+                # sparsity second
+                if prefix in sparse_compression_targets:
+                    state_dict = self.sparsity_compressor.compress(
+                        state_dict,
+                        compression_targets=sparse_compression_targets,
+                        show_progress=False,
+                    )
+
+                # remove any existing parameters
+                device = get_execution_device(module)
+                for name, _ in list(module.named_parameters()):
+                    delattr(module, name)
+
+                # replace with compressed parameters
+                for name, value in state_dict.items():
+                    name = name.removeprefix(f"{prefix}.")
+                    value = value.to(device)
+                    param = torch.nn.Parameter(value, requires_grad=False)
+                    register_offload_parameter(module, name, param)
+
+                module.quantization_status = QuantizationStatus.COMPRESSED
+
+    def decompress_model(self, model: Module):
+        """
+        Decompress a model in memory. Because the model structure is modified in place,
+        this method does not require loading some compression parameters from disk
+
+        :param model: model containing parameters to compress
+        """
+        module_to_scheme = map_module_to_scheme(model)
+        sparse_compression_targets: Set[str] = expand_target_names(
+            model=model,
+            targets=self.sparsity_config.targets if self.sparsity_config else [],
+            ignore=self.sparsity_config.ignore if self.sparsity_config else [],
+        )
+
+        for prefix, module in tqdm(model.named_modules(), desc="Decompressing model"):
+            if prefix in module_to_scheme or prefix in sparse_compression_targets:
+                # in the future, support decompression on same device
+                with align_module_device(module, execution_device="cpu"):
+                    state_dict = module.state_dict(prefix=f"{prefix}.")
+
+                # sparsity first
+                if prefix in sparse_compression_targets:
+                    # sparse_compression_targets are automatically inferred by this fn
+                    generator = self.sparsity_compressor.decompress_from_state_dict(
+                        state_dict,
+                    )
+                    # generates (param_path, param_val)
+                    # of compressed and unused params
+                    state_dict = {key: value for key, value in generator}
+
+                # quantization second
+                if prefix in module_to_scheme:
+                    generator = self.quantization_compressor.decompress_from_state_dict(
+                        state_dict,
+                        names_to_scheme=module_to_scheme,
+                    )
+                    # generates (mod_path, {param_name, param_val})
+                    # of compressed params and used params, but not unused params
+                    # some used params are removed by get_unexpected_file_keys
+                    state_dict = {
+                        merge_names(module_path, param_name): param_value
+                        for module_path, compressed_data in generator
+                        for param_name, param_value in compressed_data.items()
+                    }
+
+                # remove any existing parameters
+                device = get_execution_device(module)
+                for name, _ in list(module.named_parameters()):
+                    delete_offload_parameter(module, name)
+
+                # replace with decompressed parameters
+                for name, value in state_dict.items():
+                    name = name.removeprefix(f"{prefix}.")
+                    value = value.to(device)
+                    param = torch.nn.Parameter(value, requires_grad=False)
+                    register_offload_parameter(module, name, param)
+
+                module.quantization_status = QuantizationStatus.FROZEN
+
+    # ----- state dict compression pathways ----- #
+
     def compress(
-        self, model: Module, state_dict: Optional[Dict[str, Tensor]] = None
+        self,
+        model: Module,
+        state_dict: Optional[Dict[str, Tensor]] = None,
+        show_progress: bool = False,
     ) -> Dict[str, Tensor]:
         """
         Compresses a dense state dict or model with sparsity and/or quantization
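The new in-memory pathway compresses each module's state dict and swaps its parameters in place, so a full uncompressed copy of the model never has to coexist with the compressed one. A minimal usage sketch, mirroring the tests added at the bottom of this diff (the model stub and format string are illustrative, and the import path is an assumption):

    import torch
    from transformers import AutoModelForCausalLM
    from compressed_tensors.compressors import ModelCompressor

    model = AutoModelForCausalLM.from_pretrained(
        "nm-testing/llama2.c-stories42M-gsm8k-quantized-only-uncompressed",
        torch_dtype=torch.float32,
    )
    # positional args: sparsity config (None here), then quantization format
    compressor = ModelCompressor.from_pretrained_model(model, None, "float-quantized")

    compressor.compress_model(model)    # parameters replaced with compressed ones
    compressor.decompress_model(model)  # dense parameters restored in place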
@@ -379,7 +502,9 @@ class ModelCompressor:
         if self.quantization_compressor is not None:
             module_to_scheme = map_module_to_scheme(model)
             state_dict = self.quantization_compressor.compress(
-                state_dict, names_to_scheme=module_to_scheme
+                state_dict,
+                names_to_scheme=module_to_scheme,
+                show_progress=show_progress,
             )

         # TODO: consider sparse compression to also be compression
@@ -397,6 +522,7 @@ class ModelCompressor:
             state_dict = self.sparsity_compressor.compress(
                 state_dict,
                 compression_targets=sparse_compression_targets,
+                show_progress=show_progress,
             )

         # HACK: Override the dtype_byte_size function in transformers to
@@ -406,6 +532,8 @@ class ModelCompressor:

         return state_dict

+    # ----- disk decompression pathways ----- #
+
     def decompress(self, model_path: str, model: Module):
         """
         Overwrites the weights in model with weights decompressed from model_path
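The eager state-dict pathway keeps a backwards-compatible default of `show_progress=False`, so existing callers are unaffected; opting in looks roughly like this (the `model` and `compressor` variables are assumed from the sketch above):

    # progress bars are shown for both the quantization and sparsity passes
    compressed_state_dict = compressor.compress(model, show_progress=True)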
{compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/compressors/quantized_compressors/base.py

@@ -23,7 +23,6 @@ from compressed_tensors.utils import (
     get_nested_mappings_from_state_dict,
     get_nested_weight_mappings,
     merge_names,
-    remove_suffix,
 )
 from safetensors import safe_open
 from torch import Tensor
@@ -71,6 +70,7 @@ class BaseQuantizationCompressor(BaseCompressor):
         self,
         model_state: Dict[str, Tensor],
         names_to_scheme: Dict[str, QuantizationScheme],
+        show_progress: bool = False,
         **kwargs,
     ) -> Dict[str, Tensor]:
         """
@@ -79,18 +79,21 @@ class BaseQuantizationCompressor(BaseCompressor):
         :param model_state: state dict of uncompressed model
         :param names_to_scheme: quantization args for each quantized weight, needed for
             quantize function to calculate bit depth
+        :param show_progress: whether to show tqdm progress
         :return: compressed state dict
         """
+        uncompressed_names = list(model_state.keys())
         compressed_dict = {}
         save_device = "cpu"

-        uncompressed_names = list(model_state.keys())
-        for name in tqdm(uncompressed_names, desc="Quantized Compression"):
+        # compress values
+        desc = "Compressing with quantization"
+        for name in tqdm(uncompressed_names, desc=desc, disable=(not show_progress)):
             value = model_state[name]

             # compress weights
             if name.endswith("weight"):
-                prefix = remove_suffix(name, "weight")
+                prefix = name.removesuffix("weight")

                 # gather qparams
                 scale = model_state.get(prefix + "weight_scale", None)
@@ -182,7 +185,7 @@ class BaseQuantizationCompressor(BaseCompressor):
             )

         else:
-            yield from self._decompress_from_state_dict(
+            yield from self.decompress_from_state_dict(
                 path_to_model_or_tensors, names_to_scheme
             )

@@ -209,7 +212,11 @@ class BaseQuantizationCompressor(BaseCompressor):
             weight_data["weight"] = decompressed
             yield module_path, weight_data

-    def _decompress_from_state_dict(self, state_dict, names_to_scheme):
+    def decompress_from_state_dict(
+        self,
+        state_dict: Dict[str, torch.Tensor],
+        names_to_scheme: Dict[str, QuantizationScheme],
+    ) -> Generator[Tuple[str, Dict[str, torch.Tensor]], None, None]:
         weight_mappings = get_nested_mappings_from_state_dict(
             state_dict, self.compression_param_names
         )
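The former private helper is now part of the public surface with a typed generator signature, yielding `(module_path, weight_data)` pairs; roughly (`compressor`, `state_dict`, and `names_to_scheme` are assumed to exist):

    for module_path, weight_data in compressor.decompress_from_state_dict(
        state_dict, names_to_scheme
    ):
        # weight_data holds the decompressed "weight" plus any remaining qparams
        print(module_path, weight_data["weight"].shape)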
@@ -219,7 +226,7 @@ class BaseQuantizationCompressor(BaseCompressor):
                 weight_data[param_name] = param_value

             if "weight_scale" in weight_data:
-                quant_args = names_to_scheme[module_path]
+                quant_args = names_to_scheme[module_path].weights
                 decompressed = self.decompress_weight(
                     compressed_data=weight_data, quantization_args=quant_args
                 )
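This fixes the lookup for the state-dict pathway: `names_to_scheme` maps module paths to whole `QuantizationScheme` objects, so the weight quantization args sit one attribute deeper:

    scheme = names_to_scheme[module_path]  # QuantizationScheme
    quant_args = scheme.weights            # QuantizationArgs used for decompression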
{compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/compressors/sparse_compressors/base.py

@@ -16,7 +16,11 @@ import logging
 from typing import Dict, Generator, Optional, Set, Tuple

 from compressed_tensors.compressors.base import BaseCompressor
-from compressed_tensors.utils import get_nested_weight_mappings, merge_names
+from compressed_tensors.utils import (
+    get_nested_mappings_from_state_dict,
+    get_nested_weight_mappings,
+    merge_names,
+)
 from safetensors import safe_open
 from torch import Tensor
 from tqdm import tqdm
@@ -63,6 +67,7 @@ class BaseSparseCompressor(BaseCompressor):
         self,
         model_state: Dict[str, Tensor],
         compression_targets: Optional[Set[str]] = None,
+        show_progress: bool = False,
     ) -> Dict[str, Tensor]:
         """
         Compresses a dense state dict using bitmask compression
@@ -76,7 +81,11 @@ class BaseSparseCompressor(BaseCompressor):
         _LOGGER.debug(
             f"Compressing model with {len(model_state)} parameterized layers..."
         )
-        for name, value in tqdm(model_state.items(), desc="Compressing model"):
+        for name, value in tqdm(
+            model_state.items(),
+            desc="Compressing with sparsity",
+            disable=(not show_progress),
+        ):
             if not self.should_compress(name, compression_targets):
                 compressed_dict[name] = value
                 continue
@@ -124,15 +133,15 @@ class BaseSparseCompressor(BaseCompressor):
             self.compression_param_names,
             return_unmatched_params=True,
         )
-        for weight_name in weight_mappings.keys():
+        for module_path in weight_mappings.keys():
             weight_data = {}
-            for param_name, safe_path in weight_mappings[weight_name].items():
-                full_name = merge_names(weight_name, param_name)
+            for param_name, safe_path in weight_mappings[module_path].items():
+                full_name = merge_names(module_path, param_name)
                 with safe_open(safe_path, framework="pt", device=device) as f:
                     weight_data[param_name] = f.get_tensor(full_name)

             decompressed = self.decompress_weight(weight_data)
-            yield merge_names(weight_name, "weight"), decompressed
+            yield merge_names(module_path, "weight"), decompressed

         for ignored_param_name, safe_path in ignored_params.items():
             should_skip = False
@@ -146,6 +155,35 @@ class BaseSparseCompressor(BaseCompressor):
                 value = f.get_tensor(ignored_param_name)
                 yield ignored_param_name, value

+    def decompress_from_state_dict(
+        self,
+        state_dict: Dict[str, Tensor],
+    ) -> Generator[Tuple[str, Dict[str, Tensor]], None, None]:
+        """
+        Decompress the state dict of a module (or model)
+
+        Unlike `self.decompress`, this function does not need to explicitly skip params
+        via params_to_skip_load because it is more convenient for its only caller
+        (ModelCompressor.decompress_model) to retrieve all unused param keys
+
+        :param state_dict: state dict containing parameters to decompress
+        :return: Generator of (param_path, param_val)
+        """
+        weight_mappings, ignored_params = get_nested_mappings_from_state_dict(
+            state_dict, self.compression_param_names, return_unmatched_params=True
+        )
+
+        for module_path in weight_mappings.keys():
+            weight_data = {}
+            for param_name, param_value in weight_mappings[module_path].items():
+                weight_data[param_name] = param_value
+
+            decompressed = self.decompress_weight(weight_data)
+            yield merge_names(module_path, "weight"), decompressed
+
+        for ignored_param_path, ignored_param_value in ignored_params.items():
+            yield ignored_param_path, ignored_param_value
+
     @staticmethod
     def should_compress(name: str, expanded_targets: Optional[Set[str]] = None) -> bool:
         """
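Because unmatched parameters are re-yielded at the end, the only caller (`ModelCompressor.decompress_model`, shown earlier in this diff) can rebuild a module's full state dict in one expression:

    state_dict = dict(self.sparsity_compressor.decompress_from_state_dict(state_dict))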
{compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/compressors/sparse_compressors/dense.py

@@ -40,3 +40,10 @@ class DenseCompressor(BaseCompressor):
         self, path_to_model_or_tensors: str, device: str = "cpu", **kwargs
     ) -> Generator[Tuple[str, Tensor], None, None]:
         return iter([])
+
+    def decompress_from_state_dict(
+        self,
+        state_dict: Dict[str, Tensor],
+    ) -> Generator[Tuple[str, Dict[str, Tensor]], None, None]:
+        for key, value in state_dict.items():
+            yield key, value
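For the dense (uncompressed) format this is an identity pass-through, which keeps `decompress_model` uniform across formats; sketched with an assumed `dense_compressor` instance:

    # every (key, value) pair is yielded back unchanged
    for key, value in dense_compressor.decompress_from_state_dict(state_dict):
        assert value is state_dict[key]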
{compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py

@@ -13,7 +13,7 @@
 # limitations under the License.

 from dataclasses import dataclass
-from typing import Dict, List, Tuple, Union
+from typing import Dict, Generator, List, Tuple, Union

 import torch
 from compressed_tensors.compressors.base import BaseCompressor
@@ -202,11 +202,7 @@ def sparse24_bitmask_decompress(
     decompressed_tensor = torch.zeros(original_shape, dtype=values.dtype)
     decompressed_tensor = decompressed_tensor.to(values.device)
     values = values.flatten()
-    if decompressed_tensor.dtype == FP8_DTYPE:
-        decompressed_tensor[bytemasks_unpacked] = values
-        decompressed_tensor = decompressed_tensor.cuda()
-    else:
-        decompressed_tensor[bytemasks_unpacked] = values
+    decompressed_tensor[bytemasks_unpacked] = values
     return decompressed_tensor

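Both branches of the removed conditional wrote the same values; the only behavioral difference was an implicit `.cuda()` move for FP8 outputs, which now becomes the caller's responsibility (see the test change at the bottom of this diff). A sketch of the resulting contract, assuming the public three-argument signature:

    decompressed = sparse24_bitmask_decompress(values, bitmasks, original_shape)
    assert decompressed.device == values.device  # no implicit FP8 .cuda() move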
{compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/compressors/sparse_quantized_compressors/marlin_24.py

@@ -125,6 +125,7 @@ class Marlin24Compressor(BaseCompressor):
         self,
         model_state: Dict[str, Tensor],
         names_to_scheme: Dict[str, QuantizationScheme],
+        show_progress: bool = False,
         **kwargs,
     ) -> Dict[str, Tensor]:
         """
@@ -134,6 +135,7 @@ class Marlin24Compressor(BaseCompressor):
         :param model_state: state dict of uncompressed model
         :param names_to_scheme: quantization scheme for each quantized weight, needed
             for quantize function to calculate bit depth
+        :param show_progress: whether to show tqdm progress
         :return: compressed state dict
         """
         self.validate_quant_compatability(names_to_scheme)
@@ -144,7 +146,9 @@ class Marlin24Compressor(BaseCompressor):
             f"Compressing model with {len(model_state)} parameterized layers..."
         )

-        for name, value in tqdm(model_state.items(), desc="Compressing model"):
+        for name, value in tqdm(
+            model_state.items(), desc="Compressing model", disable=(not show_progress)
+        ):
             if name.endswith(weight_suffix):
                 prefix = name[: -(len(weight_suffix))]
                 scale = model_state.get(merge_names(prefix, "weight_scale"), None)
{compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/linear/compressed_linear.py

@@ -23,6 +23,7 @@ from compressed_tensors.quantization import (
     initialize_module_for_quantization,
 )
 from compressed_tensors.utils import register_offload_parameter
+from compressed_tensors.utils.offload import get_execution_device
 from torch import Tensor
 from torch.nn import Parameter
 from torch.nn.functional import linear
@@ -60,7 +61,7 @@ class CompressedLinear(Linear):
         """
         module.__class__ = CompressedLinear
         module.compressor = BaseCompressor.load_from_registry(quantization_format)
-        device = next(module.parameters()).device
+        init_device = get_execution_device(module)

         # this will initialize all the scales and zero points
         initialize_module_for_quantization(
@@ -79,7 +80,7 @@ class CompressedLinear(Linear):
         # populate compressed weights and quantization parameters
         for name, (shape, dtype) in compression_params.items():
             param = Parameter(
-                torch.empty(shape, device=device, dtype=dtype), requires_grad=False
+                torch.empty(shape, device=init_device, dtype=dtype), requires_grad=False
             )
             register_offload_parameter(module, name, param)

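`get_execution_device` resolves the device a module actually executes on, which matters for offloaded modules whose materialized parameters can sit on the `meta` device; a rough contrast, assuming an accelerate-offloaded module:

    next(module.parameters()).device  # may be device("meta") when offloaded
    get_execution_device(module)      # the onload/execution device, e.g. cuda:0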
{compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/utils/helpers.py

@@ -38,7 +38,6 @@ __all__ = [
     "shard_tensor",
     "pack_bitmasks",
     "unpack_bitmasks",
-    "remove_suffix",
 ]

 FSDP_WRAPPER_NAME = "_fsdp_wrapped_module"
@@ -329,9 +328,3 @@ def unpack_bitmasks(
     )

     return unpacked_bitmasks_torch
-
-
-def remove_suffix(value: str, suffix: str) -> str:
-    # can replace with str.removesuffix in python3.9+
-    assert value.endswith(suffix)
-    return value[: -len(suffix)]
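With Python 3.9 as the floor, the built-in `str.removesuffix` is a drop-in replacement, except that it returns the string unchanged rather than asserting when the suffix is absent:

    >>> "decoder.layers.0.weight".removesuffix("weight")
    'decoder.layers.0.'
    >>> "decoder.layers.0.bias".removesuffix("weight")  # old helper would assert
    'decoder.layers.0.bias'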
{compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/src/compressed_tensors/utils/safetensors_load.py

@@ -35,6 +35,7 @@ __all__ = [
     "is_quantization_param",
 ]

+NestedStateDictType = Dict[str, Dict[str, Tensor]]
 WeightMappingType = Dict[str, str]
 NestedWeightMappingType = Dict[str, WeightMappingType]

@@ -249,8 +250,10 @@ def get_nested_weight_mappings(


 def get_nested_mappings_from_state_dict(
-    state_dict, params_to_nest
-) -> NestedWeightMappingType:
+    state_dict: Dict[str, Tensor],
+    params_to_nest: Iterable[str],
+    return_unmatched_params: bool = False,
+) -> Union[NestedStateDictType, Tuple[NestedStateDictType, Dict[str, Tensor]]]:
     """
     Takes a state dict and returns a nested mapping from uncompressed
     parameterized layer names to the value of
@@ -266,16 +269,26 @@ def get_nested_mappings_from_state_dict(
     :param state_dict: state dict of the model
     :param params_to_nest: Iterable of parameter names to nest.
     :return: Nested mapping of parameterized layer names to the value of
-        each layer's compression parameters.
+        each layer's compression parameters. If `return_unmatched_params`, then
+        also return a dictionary mapping unused parameter names to their values
     """
     nested_weight_mappings = {}
+    unmatched_params = {}
+
     for key in state_dict.keys():
+        matched = False
         for param_name in params_to_nest:
             module_path = match_param_name(key, param_name)
             if module_path:
                 if module_path not in nested_weight_mappings:
                     nested_weight_mappings[module_path] = {}
                 nested_weight_mappings[module_path][param_name] = state_dict[key]
+                matched = True
+        if return_unmatched_params and not matched:
+            unmatched_params[key] = state_dict[key]
+
+    if return_unmatched_params:
+        return nested_weight_mappings, unmatched_params
     return nested_weight_mappings

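With the flag set, callers get both the nested mapping and every key that matched no nested param name; a small worked example (the tensor variables `w`, `s`, `b` are hypothetical):

    state_dict = {"layer.weight": w, "layer.weight_scale": s, "layer.bias": b}
    nested, unmatched = get_nested_mappings_from_state_dict(
        state_dict, ["weight", "weight_scale"], return_unmatched_params=True
    )
    # nested    == {"layer": {"weight": w, "weight_scale": s}}
    # unmatched == {"layer.bias": b}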
{compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514/src/compressed_tensors.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.9.5a20250513
+Version: 0.9.5a20250514
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
{compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_compressors/model_compressors/test_model_compressor.py

@@ -24,6 +24,7 @@ from compressed_tensors.config import SparsityCompressionConfig
 from compressed_tensors.quantization import QuantizationConfig
 from safetensors.torch import save_file
 from tests.testing_utils import induce_sparsity, requires_hf_quantizer
+from transformers import AutoModelForCausalLM


 def sparsity_config():
@@ -365,3 +366,115 @@ def _get_combined_config(s_config, q_config):
     combined["sparsity_config"] = s_config

     return combined
+
+
+@pytest.mark.parametrize(
+    "model_stub,q_format,s_config",
+    [
+        (
+            "nm-testing/llama2.c-stories42M-gsm8k-quantized-only-uncompressed",
+            "float-quantized",
+            None,
+        ),
+        (
+            "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-uncompressed",
+            None,
+            "sparse-24-bitmask",
+        ),
+        (
+            "nm-testing/llama2.c-stories42M-gsm8k-stacked-uncompressed",
+            "float-quantized",
+            "sparse-24-bitmask",
+        ),
+        (
+            "nm-testing/llama2.c-stories15M-ultrachat-mixed-uncompressed",
+            "pack-quantized",
+            None,
+        ),
+    ],
+)
+def test_compress_model(model_stub, q_format, s_config, tmpdir):
+    model = AutoModelForCausalLM.from_pretrained(model_stub, torch_dtype=torch.float32)
+    compressor = ModelCompressor.from_pretrained_model(model, s_config, q_format)
+
+    # compress model by eagerly compressing state dict
+    true_compressed = dict(compressor.compress(model))
+    true_compressed = {key: value.clone() for key, value in true_compressed.items()}
+
+    # compress model directly
+    compressor.compress_model(model)
+    compressed = dict(model.state_dict())
+
+    # equivalent to eagerly compressing state dict
+    assert compressed.keys() == true_compressed.keys()
+    for key in compressed.keys():
+        assert compressed[key].dtype == true_compressed[key].dtype
+        assert torch.all(compressed[key] == true_compressed[key]), f"{key}"
+
+
+@pytest.mark.parametrize(
+    "model_stub,comp_stub",
+    [
+        (
+            "nm-testing/llama2.c-stories42M-gsm8k-quantized-only-uncompressed",
+            "nm-testing/llama2.c-stories42M-gsm8k-quantized-only-compressed",
+        ),
+        (
+            "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-uncompressed",
+            "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-compressed",
+        ),
+        (
+            "nm-testing/llama2.c-stories42M-gsm8k-stacked-uncompressed",
+            "nm-testing/llama2.c-stories42M-gsm8k-stacked-compressed",
+        ),
+        (
+            "nm-testing/llama2.c-stories15M-ultrachat-mixed-uncompressed",
+            "nm-testing/llama2.c-stories15M-ultrachat-mixed-compressed",
+        ),
+    ],
+)
+def test_decompress_model(model_stub, comp_stub):
+    from transformers.utils.quantization_config import CompressedTensorsConfig
+
+    # decompress from disk
+    # NOTE: transformers adds extra zero points if run_compressed=False or w/ sparsity
+    # https://github.com/huggingface/transformers/blob/main/src/transformers/quantizers/quantizer_compressed_tensors.py#L131-L133
+    # however, decompression does not add zero points in non-asymmetric cases
+    # in order to normalize for this effect in this test, we remove empty weight zps
+    true_decompressed_model = AutoModelForCausalLM.from_pretrained(
+        comp_stub,
+        quantization_config=CompressedTensorsConfig(run_compressed=False),
+        torch_dtype=torch.float32,
+    )
+    true_decompressed = dict(true_decompressed_model.state_dict())
+    true_decompressed = remove_empty_weight_zero_points(true_decompressed)  # see above
+
+    # decompress from memory
+    # NOTE there is no other way to load a compressed model into memory, since
+    # there is no way to turn off decompression for sparse models
+    # https://github.com/huggingface/transformers/blob/main/src/transformers/quantizers/quantizer_compressed_tensors.py#L133
+    model = AutoModelForCausalLM.from_pretrained(model_stub, torch_dtype=torch.float32)
+    compressor = ModelCompressor.from_pretrained(comp_stub)
+    compressor.compress_model(model)
+    compressor.decompress_model(model)
+    decompressed = dict(model.state_dict())
+
+    # remove keys not in model definition
+    # NOTE it would be better if compressors only returned keys to keep, rather than
+    # relying on the model structure + missing keys to catch and remove them later
+    model_keys = true_decompressed_model.state_dict().keys()
+    decompressed = {key: val for key, val in decompressed.items() if key in model_keys}
+
+    # equivalent to decompressing from disk
+    assert decompressed.keys() == true_decompressed.keys()
+    for key in decompressed.keys():
+        assert decompressed[key].dtype == true_decompressed[key].dtype
+        assert torch.all(decompressed[key] == true_decompressed[key]), f"{key}"
+
+
+def remove_empty_weight_zero_points(state_dict):
+    return {
+        name: value
+        for name, value in state_dict.items()
+        if not (name.endswith("weight_zero_point") and torch.all(value == 0))
+    }
{compressed_tensors-0.9.5a20250513 → compressed_tensors-0.9.5a20250514}/tests/test_compressors/sparse_compressors/test_sparse_24_bitmask.py

@@ -47,6 +47,8 @@ def shard_validation():

 def validate_compression(dense_matrix, decompressed_tensor):
     """Validate that the decompressed tensor matches the original dense matrix."""
+    if decompressed_tensor.dtype == FP8_DTYPE:
+        decompressed_tensor = decompressed_tensor.to("cuda")
     dense_matrix = dense_matrix.to(decompressed_tensor.device)
     assert dense_matrix.dtype == decompressed_tensor.dtype, "Dtype mismatch"
     assert dense_matrix.shape == decompressed_tensor.shape, "Shape mismatch"