compressed-tensors 0.9.5a20250514__tar.gz → 0.9.5a20250520__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {compressed_tensors-0.9.5a20250514/src/compressed_tensors.egg-info → compressed_tensors-0.9.5a20250520}/PKG-INFO +1 -1
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/model_compressors/model_compressor.py +7 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/quantized_compressors/base.py +2 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/quantized_compressors/naive_quantized.py +6 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py +6 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/quantization/lifecycle/apply.py +10 -1
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/quantization/lifecycle/forward.py +50 -14
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/quantization/lifecycle/initialize.py +120 -4
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/quantization/quant_args.py +1 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/quantization/utils/helpers.py +87 -14
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/version.py +1 -1
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520/src/compressed_tensors.egg-info}/PKG-INFO +1 -1
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/lifecycle/test_initialize.py +15 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/test_utils/test_helpers.py +21 -2
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/.github/.gitkeep +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/.github/actions/test/action.yml +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/.github/scripts/step-status +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/.github/workflows/build-test.yml +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/.github/workflows/build.yml +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/.github/workflows/report.yml +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/.github/workflows/test-check.yaml +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/.github/workflows/test.yml +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/.github/workflows/trigger-all.yml +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/.github/workflows/upload.yml +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/.gitignore +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/LICENSE +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/Makefile +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/README.md +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/examples/bit_packing/ex_quantize_and_pack.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/examples/bit_packing/int4_config.json +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/examples/bitmask_compression.ipynb +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/examples/llama_1.1b/ex_config_quantization.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/examples/llama_1.1b/ex_llmcompressor_quantization.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/examples/llama_1.1b/example_quant_config.json +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/examples/llama_1.1b/example_quant_recipe.yaml +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/examples/quantize_and_pack_int4.ipynb +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/pyproject.toml +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/setup.cfg +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/setup.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/README.md +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/base.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/base.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/helpers.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/model_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/quantized_compressors/nvfp4_quantized.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/sparse_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/sparse_compressors/base.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/sparse_compressors/dense.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/sparse_compressors/sparse_bitmask.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/sparse_quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/sparse_quantized_compressors/marlin_24.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/config/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/config/base.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/config/dense.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/config/sparse_24_bitmask.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/config/sparse_bitmask.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/linear/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/linear/compressed_linear.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/quantization/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/quantization/lifecycle/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/quantization/lifecycle/compressed.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/quantization/lifecycle/helpers.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/quantization/quant_config.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/quantization/quant_scheme.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/quantization/utils/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/registry/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/registry/registry.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/utils/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/utils/helpers.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/utils/offload.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/utils/permutations_24.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/utils/permute.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/utils/safetensors_load.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/utils/semi_structured_conversions.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors.egg-info/SOURCES.txt +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors.egg-info/dependency_links.txt +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors.egg-info/requires.txt +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors.egg-info/top_level.txt +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/conftest.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_compressors/model_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_compressors/model_compressors/test_model_compressor.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_compressors/quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_compressors/quantized_compressors/test_fp8_quant.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_compressors/quantized_compressors/test_int_quant.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_compressors/quantized_compressors/test_nvfp4_quant.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_compressors/quantized_compressors/test_pack_quant.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_compressors/sparse_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_compressors/sparse_compressors/test_bitmask.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_compressors/sparse_compressors/test_sparse_24_bitmask.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_compressors/sparse_quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_compressors/sparse_quantized_compressors/test_marlin_24.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_configs/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_configs/test_base.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_examples/test_bitmask_compression_ipynb.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_linear/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_linear/test_compressed_linear.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/lifecycle/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/lifecycle/conftest.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/lifecycle/test_apply.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/lifecycle/test_dynamic_lifecycle.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/lifecycle/test_enabled.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/lifecycle/test_forward.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/lifecycle/test_helpers.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/lifecycle/test_lifecycle.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/test_configs/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/test_configs/test_bit_depths.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/test_configs/test_strategies.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/test_quant_args.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/test_quant_config.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/test_quant_scheme.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_registry.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_utils/__init__.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_utils/test_helpers.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_utils/test_offload.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_utils/test_safetensors_load.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/testing_utils.py +0 -0
- {compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/utils/copyright.py +0 -0
{compressed_tensors-0.9.5a20250514/src/compressed_tensors.egg-info → compressed_tensors-0.9.5a20250520}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.9.5a20250514
+Version: 0.9.5a20250520
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
{compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/model_compressors/model_compressor.py

@@ -421,6 +421,13 @@ class ModelCompressor:

                 module.quantization_status = QuantizationStatus.COMPRESSED

+        # TODO: consider sparse compression to also be compression
+        if (
+            self.quantization_config is not None
+            and self.quantization_config.format != CompressionFormat.dense.value
+        ):
+            self.quantization_config.quantization_status = QuantizationStatus.COMPRESSED
+
     def decompress_model(self, model: Module):
         """
         Decompress a model in memory. Because the model structure is modified in place,
{compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/quantized_compressors/base.py

@@ -99,6 +99,7 @@ class BaseQuantizationCompressor(BaseCompressor):
            scale = model_state.get(prefix + "weight_scale", None)
            g_idx = model_state.get(prefix + "weight_g_idx", None)
            zp = model_state.get(prefix + "weight_zero_point", None)
+           global_scale = model_state.get(prefix + "weight_global_scale", None)

            # is scale does not exist, then weight cannot be compressed
            if scale is None:

@@ -112,6 +113,7 @@ class BaseQuantizationCompressor(BaseCompressor):
                    weight=value,
                    scale=scale,
                    zero_point=zp,
+                   global_scale=global_scale,
                    g_idx=g_idx,
                    quantization_args=quant_args,
                    device="cpu",
{compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/quantized_compressors/naive_quantized.py

@@ -78,6 +78,7 @@ class NaiveQuantizationCompressor(BaseQuantizationCompressor):
        zero_point: Optional[Tensor] = None,
        g_idx: Optional[torch.Tensor] = None,
        device: Optional[torch.device] = None,
+       global_scale: Optional[torch.Tensor] = None,
    ) -> Dict[str, torch.Tensor]:
        """
        Compresses a single uncompressed weight

@@ -90,6 +91,11 @@ class NaiveQuantizationCompressor(BaseQuantizationCompressor):
        :param device: optional device to move compressed output to
        :return: dictionary of compressed weight data
        """
+       if global_scale is not None:
+           raise ValueError(
+               "global_scale is not supported for the NaiveQuantizationCompressor"
+           )
+
        if can_quantize(weight, quantization_args):
            quantized_weight = quantize(
                x=weight,
{compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py

@@ -94,6 +94,7 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
        zero_point: Optional[Tensor] = None,
        g_idx: Optional[torch.Tensor] = None,
        device: Optional[torch.device] = None,
+       global_scale: Optional[torch.Tensor] = None,
    ) -> Dict[str, torch.Tensor]:
        """
        Compresses a single uncompressed weight

@@ -106,6 +107,11 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
        :param device: optional device to move compressed output to
        :return: dictionary of compressed weight data
        """
+       if global_scale is not None:
+           raise ValueError(
+               "global_scale is not supported for the PackQuantizationCompressor"
+           )
+
        compressed_dict = {}
        if can_quantize(weight, quantization_args):
            quantized_weight = quantize(
{compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/quantization/lifecycle/apply.py

@@ -27,8 +27,14 @@ from compressed_tensors.quantization.lifecycle.compressed import (
 )
 from compressed_tensors.quantization.lifecycle.initialize import (
     initialize_module_for_quantization,
+    update_fused_layer_weight_global_scales,
+)
+from compressed_tensors.quantization.quant_args import (
+    FP4_E2M1_DATA,
+    FP8_E4M3_DATA,
+    QuantizationArgs,
+    QuantizationType,
 )
-from compressed_tensors.quantization.quant_args import QuantizationArgs
 from compressed_tensors.quantization.quant_config import (
     QuantizationConfig,
     QuantizationStatus,

@@ -266,6 +272,9 @@ def apply_quantization_status(model: Module, status: QuantizationStatus):
             )
         )

+    if status == QuantizationStatus.INITIALIZED:
+        update_fused_layer_weight_global_scales(model)
+
     if current_status < status >= QuantizationStatus.COMPRESSED > current_status:
         model.apply(compress_quantized_weights)
{compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/quantization/lifecycle/forward.py

@@ -20,6 +20,7 @@ import torch
 from compressed_tensors.quantization.quant_args import (
     QuantizationArgs,
     QuantizationStrategy,
+    QuantizationType,
     round_to_quantized_type,
 )
 from compressed_tensors.quantization.quant_config import QuantizationStatus

@@ -49,6 +50,7 @@ def quantize(
     args: QuantizationArgs,
     dtype: Optional[torch.dtype] = None,
     g_idx: Optional[torch.Tensor] = None,
+    global_scale: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     """
     Quantize the input tensor x using the QuantizationStrategy specified in args.

@@ -63,6 +65,7 @@ def quantize(
     :param args: quantization args dictating how to quantize x
     :param dtype: optional dtype to cast the quantized output to
     :param g_idx: optional mapping from column index to group index
+    :param global_scale: optional constant to scale the quantization scale during QDQ
     :return: fake quantized tensor
     """

@@ -75,6 +78,7 @@ def quantize(
         do_quantize=True,
         do_dequantize=False,
         g_idx=g_idx,
+        global_scale=global_scale,
     )


@@ -86,6 +90,7 @@ def dequantize(
     args: Optional[QuantizationArgs] = None,
     dtype: Optional[torch.dtype] = None,
     g_idx: Optional[torch.Tensor] = None,
+    global_scale: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     """
     Dequantize a quantized input tensor x_q based on the strategy specified in args. If

@@ -97,6 +102,7 @@ def dequantize(
     :param args: quantization args used to quantize x_q
     :param dtype: optional dtype to cast the dequantized output to
     :param g_idx: optional mapping from column index to group index
+    :param global_scale: optional constant to scale the quantization scale during QDQ
     :return: dequantized float tensor
     """
     if args is None:

@@ -128,6 +134,7 @@ def dequantize(
         do_dequantize=True,
         dtype=dtype,
         g_idx=g_idx,
+        global_scale=global_scale,
     )


@@ -138,6 +145,7 @@ def fake_quantize(
     zero_point: torch.Tensor,
     args: QuantizationArgs,
     g_idx: Optional[torch.Tensor] = None,
+    global_scale: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     """
     Fake quantize the input tensor x by quantizing then dequantizing with

@@ -151,6 +159,7 @@ def fake_quantize(
     :param zero_point: zero point tensor
     :param args: quantization args dictating how to quantize x
     :param g_idx: optional mapping from column index to group index
+    :param global_scale: optional constant to scale the quantization scale during QDQ
     :return: fake quantized tensor
     """
     return _process_quantization(

@@ -161,6 +170,7 @@ def fake_quantize(
         do_quantize=True,
         do_dequantize=True,
         g_idx=g_idx,
+        global_scale=global_scale,
     )


@@ -174,6 +184,7 @@ def _process_quantization(
     dtype: Optional[torch.dtype] = None,
     do_quantize: bool = True,
     do_dequantize: bool = True,
+    global_scale: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     q_min, q_max = calculate_range(args, x.device)
     group_size = args.group_size

@@ -221,18 +232,21 @@
             end = start + group_count
             if do_quantize:
                 output[:, start:end] = _quantize(
-                    x[:, start:end],
-                    sc,
-                    zp,
-                    q_min,
-                    q_max,
-                    args,
+                    x=x[:, start:end],
+                    scale=sc,
+                    zero_point=zp,
+                    q_min=q_min,
+                    q_max=q_max,
+                    args=args,
                     dtype=dtype,
+                    global_scale=global_scale,
                 )

             if do_dequantize:
                 input = output[:, start:end] if do_quantize else x[:, start:end]
-                output[:, start:end] = _dequantize(input, sc, zp)
+                output[:, start:end] = _dequantize(
+                    x_q=input, scale=sc, zero_point=zp, global_scale=global_scale
+                )

         if not is_column_order:
             output = safe_permute(output, torch.argsort(perm), dim=1)

@@ -240,16 +254,22 @@
     else:  # covers channel, token and tensor strategies
         if do_quantize:
             output = _quantize(
-                x,
-                scale,
-                zero_point,
-                q_min,
-                q_max,
-                args,
+                x=x,
+                scale=scale,
+                zero_point=zero_point,
+                q_min=q_min,
+                q_max=q_max,
+                args=args,
                 dtype=dtype,
+                global_scale=global_scale,
             )
         if do_dequantize:
-            output = _dequantize(output if do_quantize else x, scale, zero_point)
+            output = _dequantize(
+                output if do_quantize else x,
+                scale=scale,
+                zero_point=zero_point,
+                global_scale=global_scale,
+            )

     return output

@@ -330,6 +350,7 @@ def forward_quantize(
         return value

     g_idx = getattr(module, "weight_g_idx", None)
+    global_scale = getattr(module, f"{base_name}_global_scale", None)

     if args.dynamic:
         # dynamic quantization - determine the scale/zp on the fly

@@ -345,6 +366,7 @@ def forward_quantize(
         zero_point=zero_point,
         args=args,
         g_idx=g_idx,
+        global_scale=global_scale,
     )


@@ -357,11 +379,18 @@ def _quantize(
     q_max: torch.Tensor,
     args: QuantizationArgs,
     dtype: Optional[torch.dtype] = None,
+    global_scale: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:

+    # if a global scale is optionally provided, use it
+    # to further scale the local `scale` parameter
+    if global_scale:
+        scale = scale.to(global_scale.dtype) / global_scale
+
     scaled = x / scale
     if zero_point is not None:
         scaled += zero_point.to(x.dtype)
+
     # clamp first because cast isn't guaranteed to be saturated (ie for fp8)
     clamped_value = torch.clamp(
         scaled,

@@ -381,7 +410,14 @@ def _dequantize(
     scale: torch.Tensor,
     zero_point: torch.Tensor = None,
     dtype: Optional[torch.dtype] = None,
+    global_scale: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
+
+    # if a global scale is optionally provided, use it
+    # to further scale the local `scale` parameter
+    if global_scale:
+        scale = scale.to(global_scale.dtype) / global_scale
+
     dequant_value = x_q.to(scale.dtype)

     if zero_point is not None:
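Note on the forward.py changes above: when a global scale is present, `_quantize` and `_dequantize` simply divide the stored local scale by it before the usual quantize/dequantize arithmetic. Below is a minimal, self-contained sketch of that folding in plain PyTorch; the rounding is simplified (the library uses `round_to_quantized_type`), and the ±6.0 range is the FP4 E2M1 limit referenced elsewhere in this diff.

```python
import torch


def qdq(x, scale, q_min=-6.0, q_max=6.0, global_scale=None):
    # Sketch of the diffed behavior: an optional global scale further divides
    # the stored local scale before quantization and dequantization.
    if global_scale is not None:
        scale = scale.to(global_scale.dtype) / global_scale
    x_q = torch.clamp(torch.round(x / scale), q_min, q_max)  # quantize
    return x_q * scale                                       # dequantize


x = torch.randn(4, 8)
scale = x.abs().max() / 6.0                 # naive symmetric per-tensor scale
print((qdq(x, scale) - x).abs().max())      # error bounded by roughly scale / 2

gs = torch.tensor(100.0)
# Storing scale * gs and passing gs as the global scale divides back out,
# so the quantization error is unchanged.
print((qdq(x, scale * gs, global_scale=gs) - x).abs().max())
```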
{compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/quantization/lifecycle/initialize.py

@@ -16,24 +16,33 @@
 import logging
 import math
 from enum import Enum
-from typing import Optional
+from typing import List, Optional

 import torch
 from compressed_tensors.quantization.lifecycle.forward import (
     wrap_module_forward_quantized,
 )
 from compressed_tensors.quantization.quant_args import (
+    FP4_E2M1_DATA,
+    FP8_E4M3_DATA,
     ActivationOrdering,
     QuantizationArgs,
     QuantizationStrategy,
+    QuantizationType,
 )
 from compressed_tensors.quantization.quant_config import QuantizationStatus
 from compressed_tensors.quantization.quant_scheme import QuantizationScheme
-from compressed_tensors.quantization.utils import is_kv_cache_quant_scheme
+from compressed_tensors.quantization.utils import (
+    generate_global_scale,
+    is_fp4,
+    is_kv_cache_quant_scheme,
+    iter_named_quantizable_modules,
+)
 from compressed_tensors.utils import (
     disable_hf_hook,
     get_execution_device,
     register_offload_parameter,
+    update_parameter_data,
 )
 from torch.nn import Module, Parameter

@@ -42,6 +51,7 @@ __all__ = [
     "initialize_module_for_quantization",
     "is_attention_module",
     "KVCacheScaleType",
+    "update_fused_layer_weight_global_scales",
 ]


@@ -170,7 +180,24 @@ def _initialize_scale_zero_point(
     # TODO: consider erroring out in the future as if the dtype if not one fo these,
     # there is likely bug

-    if scale_dtype not in [torch.float16, torch.bfloat16, torch.float32]:
+    if is_fp4(quantization_args=quantization_args) and base_name == "weight":
+        scale_dtype = FP8_E4M3_DATA.dtype
+        # When applying weight-only FP4 quantization, generate a global_scale
+        # This scale is applied during runtime to ensure that the generated
+        # local scale falls properly within the FP8 range (i.e max value is FP8_max)
+        # which is the expected dtype of NVFP4A16 scales
+        value = generate_global_scale(input_tensor=module.weight)
+        value = value.to(device)
+        init_global_scale = Parameter(value, requires_grad=False)
+        register_offload_parameter(
+            module, f"{base_name}_global_scale", init_global_scale
+        )
+
+    if scale_dtype not in [
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+    ] and not is_fp4(quantization_args=quantization_args):
         scale_dtype = torch.float16

     # initializes empty scale, zero point, and g_idx parameters for the module

@@ -181,7 +208,11 @@ def _initialize_scale_zero_point(
     register_offload_parameter(module, f"{base_name}_scale", init_scale)

     if force_zero_point or not quantization_args.symmetric:
-        zp_dtype = quantization_args.pytorch_dtype()
+        if is_fp4(quantization_args=quantization_args):
+            zp_dtype = FP8_E4M3_DATA.dtype
+        else:
+            zp_dtype = quantization_args.pytorch_dtype()
+
         init_zero_point = Parameter(
             torch.zeros(expected_shape, device=device, dtype=zp_dtype),
             requires_grad=False,

@@ -219,3 +250,88 @@ def _initialize_attn_scales(module: Module) -> None:
         requires_grad=False,
     )
     register_offload_parameter(module, KVCacheScaleType.VALUE.value, init_scale)
+
+
+# TODO: Potentially introduce an argument to turn this off
+# Only relevant for NVFP4A16 currently
+def update_fused_layer_weight_global_scales(model: torch.nn.Module):
+    """
+    When running NVFP4A16 quantization, update the global scale
+    such that q,k,v layers are treated as one tensor with the same
+    global_scale and gate_proj/up_proj layers are treated as one tensor
+    with the same global scale. This is requirement currently being set
+    by vLLM and may be removed in the future OR potentially make it
+    an optional step.
+
+    :param model: model to quantize
+    """
+
+    def _is_attention_module(module: Module):
+        return "attention" in module.__class__.__name__.lower() and (
+            hasattr(module, "k_proj")
+            or hasattr(module, "v_proj")
+            or hasattr(module, "qkv_proj")
+        )
+
+    def _is_mlp_module(module: Module):
+        return "mlp" in module.__class__.__name__.lower() and (
+            hasattr(module, "gate_proj") or hasattr(module, "up_proj")
+        )
+
+    def _valid_fp4_quant(layer_list: List[torch.nn.Linear]):
+        """
+        Return True if all the linear layers in the layer_list are
+        NVFP4A16 quantized.
+        """
+        for layer in layer_list:
+            scheme = getattr(layer, "quantization_scheme", None)
+            if scheme is None:
+                return False
+
+            weight_quant_args = scheme.weights
+
+            if weight_quant_args is None:
+                return False
+
+            if not is_fp4(quantization_args=weight_quant_args):
+                return False
+        return True
+
+    for name, submodule in iter_named_quantizable_modules(
+        model,
+        include_attn=True,
+        include_mlp=True,
+    ):
+
+        if _is_attention_module(submodule):
+
+            if not _valid_fp4_quant(
+                [submodule.q_proj, submodule.v_proj, submodule.k_proj]
+            ):
+                continue
+
+            q_weight = submodule.q_proj.weight.data
+            v_weight = submodule.v_proj.weight.data
+            k_weight = submodule.k_proj.weight.data
+
+            value = generate_global_scale(
+                input_tensor=torch.cat((q_weight, v_weight, k_weight), dim=0)
+            )
+
+            update_parameter_data(submodule.q_proj, value, "weight_global_scale")
+            update_parameter_data(submodule.k_proj, value, "weight_global_scale")
+            update_parameter_data(submodule.v_proj, value, "weight_global_scale")
+
+        if _is_mlp_module(submodule):
+            if not _valid_fp4_quant([submodule.gate_proj, submodule.up_proj]):
+                continue
+
+            gate_data = submodule.gate_proj.weight.data
+            up_data = submodule.up_proj.weight.data
+
+            value = generate_global_scale(
+                input_tensor=torch.cat((gate_data, up_data), dim=0)
+            )
+
+            update_parameter_data(submodule.gate_proj, value, "weight_global_scale")
+            update_parameter_data(submodule.up_proj, value, "weight_global_scale")
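For the fused-layer update added above, sharing one global scale across q/k/v (or gate/up) is equivalent to letting the projection with the largest absolute max dictate the scale, since the global scale is inversely proportional to the tensor-wide amax. A quick stand-alone check of that point, using the same `448 * 6 / amax` formula that `generate_global_scale` uses in this release (plain tensors, not the library API; the layer shapes are arbitrary):

```python
import torch

FP8_MAX, FP4_MAX = 448.0, 6.0  # FP8_E4M3_DATA.max and FP4_E2M1_DATA.max


def global_scale(w: torch.Tensor) -> torch.Tensor:
    # same formula as generate_global_scale: scale_data.max * quant_data.max / amax
    return FP8_MAX * FP4_MAX / w.abs().max().to(torch.float32)


q_w, k_w, v_w = torch.randn(8, 16), torch.randn(8, 16), 3 * torch.randn(8, 16)
fused = global_scale(torch.cat((q_w, k_w, v_w), dim=0))

# The fused value equals the smallest per-projection global scale, i.e. the one
# driven by the projection with the largest absolute max (v_proj here).
assert torch.isclose(fused, min(global_scale(w) for w in (q_w, k_w, v_w)))
```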
{compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/src/compressed_tensors/quantization/utils/helpers.py

@@ -17,7 +17,9 @@ from typing import Generator, List, Optional, Tuple

 import torch
 from compressed_tensors.quantization.quant_args import (
-    FP8_DTYPE,
+    FP4_E2M1_DATA,
+    FP8_E4M3_DATA,
+    FloatArgs,
     QuantizationArgs,
     QuantizationStrategy,
     QuantizationType,

@@ -44,6 +46,8 @@ __all__ = [
     "compute_dynamic_scales_and_zp",
     "calculate_range",
     "calculate_qparams",
+    "generate_global_scale",
+    "is_fp4",
 ]

 # target the self_attn layer

@@ -53,8 +57,18 @@ KV_CACHE_TARGETS = ["re:.*self_attn$"]
 _LOGGER: logging.Logger = logging.getLogger(__name__)


+def is_fp4(quantization_args: QuantizationArgs):
+    return (
+        quantization_args.num_bits == 4
+        and quantization_args.type == QuantizationType.FLOAT
+    )
+
+
 def calculate_qparams(
-    min_vals: Tensor, max_vals: Tensor, quantization_args: QuantizationArgs
+    min_vals: Tensor,
+    max_vals: Tensor,
+    quantization_args: QuantizationArgs,
+    global_scale: Optional[Tensor] = None,
 ) -> Tuple[FloatTensor, IntTensor]:
     """
     :param min_vals: tensor of min value(s) to calculate scale(s) and zero point(s)

@@ -62,7 +76,11 @@ def calculate_qparams(
     :param max_vals: tensor of max value(s) to calculate scale(s) and zero point(s)
         from
     :param quantization_args: settings to quantization
-    :return: tuple of the calculated scale(s) and zero point(s)
+    :param global_scale: additional global scale to scale the locally generated scale
+        currently only applied/supported for Fp4
+
+    :return: tuple of the calculated scale(s) and zero point(s). For FP4, the calculated
+        scale if of dtype FP8
     """
     # based on the implementations for consuming quantized values,
     # 0.0 must always be representable within the quantized range

@@ -73,14 +91,40 @@

     bit_min, bit_max = calculate_range(quantization_args, device)
     bit_range = bit_max - bit_min
-    zp_dtype = quantization_args.pytorch_dtype()
+
+    if is_fp4(quantization_args=quantization_args):
+        zp_dtype = FP8_E4M3_DATA.dtype
+    else:
+        zp_dtype = quantization_args.pytorch_dtype()

     if quantization_args.symmetric:
         max_val_pos = torch.max(torch.abs(min_vals), torch.abs(max_vals))
-        scales = max_val_pos / (float(bit_range) / 2)
-        scales = torch.clamp(scales, min=torch.finfo(torch.float32).eps)
+
+        if is_fp4(quantization_args=quantization_args) and global_scale is not None:
+            # Conditionally scale the generated local scale by a global_scale
+            scales = global_scale * (max_val_pos / FP4_E2M1_DATA.max)
+            scales = scales.to(FP8_E4M3_DATA.dtype)
+        else:
+            scales = max_val_pos / (float(bit_range) / 2)
+
+        if scales.dtype == FP8_E4M3_DATA.dtype:
+            # torch.clamp not supported for FP8
+            # use the next largest fp8 value from 0
+            scales = torch.where(
+                scales == 0,
+                torch.tensor(0.125, dtype=FP8_E4M3_DATA.dtype, device=device),
+                scales,
+            )
+        else:
+            scales = torch.clamp(scales, min=torch.finfo(torch.float32).eps)
+
         zero_points = torch.zeros(scales.shape, device=device, dtype=min_vals.dtype)
     else:
+        if is_fp4(quantization_args=quantization_args):
+            raise NotImplementedError(
+                "Asymmetric Quantization is not supported for FP4"
+            )
+
         scales = (max_vals - min_vals) / float(bit_range)
         scales = torch.clamp(scales, min=torch.finfo(torch.float32).eps)
         zero_points = bit_min - (min_vals / scales)

@@ -144,14 +188,16 @@ def calculate_range(quantization_args: QuantizationArgs, device: str) -> Tuple:
         q_max = torch.tensor(bit_range / 2 - 1, device=device)
         q_min = torch.tensor(-bit_range / 2, device=device)
     elif quantization_args.type == QuantizationType.FLOAT:
-        if quantization_args.num_bits
-
-
-
+        if quantization_args.num_bits == 8:
+            q_max = torch.tensor(FP8_E4M3_DATA.max, device=device)
+            q_min = torch.tensor(FP8_E4M3_DATA.min, device=device)
+        elif quantization_args.num_bits == 4:
+            q_max = torch.tensor(FP4_E2M1_DATA.max, device=device)
+            q_min = torch.tensor(FP4_E2M1_DATA.min, device=device)
+        else:
+            raise NotImplementedError(
+                "Range calculation only supported for 4 and 8 bits"
             )
-        fp_range_info = torch.finfo(FP8_DTYPE)
-        q_max = torch.tensor(fp_range_info.max, device=device)
-        q_min = torch.tensor(fp_range_info.min, device=device)
     else:
         raise ValueError(f"Invalid quantization type {quantization_args.type}")

@@ -249,7 +295,10 @@ def iter_named_leaf_modules(model: Module) -> Generator[Tuple[str, Module], None


 def iter_named_quantizable_modules(
-    model: Module, include_children: bool = True, include_attn: bool = False
+    model: Module,
+    include_children: bool = True,
+    include_attn: bool = False,
+    include_mlp: bool = False,
 ) -> Generator[Tuple[str, Module], None, None]:
     """
     Yield name and submodule of

@@ -282,6 +331,9 @@ def iter_named_quantizable_modules(
         if include_attn:
             if name.endswith("self_attn"):
                 yield name, submodule
+        if include_mlp:
+            if name.endswith("mlp"):
+                yield name, submodule


 def get_torch_bit_depth(value: torch.Tensor) -> int:

@@ -396,3 +448,24 @@ def parse_out_kv_cache_args(
         kv_cache_args = None

     return kv_cache_args, quant_scheme_to_layers
+
+
+def generate_global_scale(
+    input_tensor: torch.Tensor,
+    scale_data: Optional[FloatArgs] = FP8_E4M3_DATA,
+    quant_data: Optional[FloatArgs] = FP4_E2M1_DATA,
+    dtype: Optional[torch.dtype] = torch.float32,
+):
+    """
+    Generate a global scale for an entire tensor (input_tensor).
+    Goal of the scale is to ensure that the quantization (local) scale
+    falls into the approproiate dtype range.
+
+    E.g. for NVFP4, group (local) scales are in dtype FP8. The global_scale
+    attempts to use the entire FP8 dtype range while mapping a per-group max
+    to the FP4 max.
+    """
+    scale_dtype = scale_data.dtype
+    tensor_amax = torch.abs(input_tensor.data).max().to(dtype)
+    global_scale = scale_data.max * quant_data.max / tensor_amax
+    return global_scale.to(dtype)
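Taken together with the forward.py changes, the intent of `generate_global_scale` and the FP4 branch of `calculate_qparams` is a round trip: the global scale maps the tensor-wide max onto the FP8 * FP4 budget, the per-group scale `global_scale * (group_max / FP4_max)` then lands inside the FP8 range, and dividing by the global scale again at QDQ time recovers roughly `group_max / FP4_max`. A small worked check of that arithmetic in plain PyTorch (assuming a build that exposes `torch.float8_e4m3fn`; group size 16 is just an illustrative choice):

```python
import torch

FP8_MAX, FP4_MAX = 448.0, 6.0  # FP8_E4M3_DATA.max and FP4_E2M1_DATA.max

weight = torch.randn(64, 64)
tensor_amax = weight.abs().max().to(torch.float32)

# generate_global_scale: map the tensor-wide max onto the FP8 * FP4 budget
global_scale = FP8_MAX * FP4_MAX / tensor_amax

# calculate_qparams, symmetric FP4 branch: per-group scales stored as FP8
group_max = weight.reshape(-1, 16).abs().max(dim=1).values
local_scale_fp8 = (global_scale * (group_max / FP4_MAX)).to(torch.float8_e4m3fn)
assert local_scale_fp8.float().max() <= FP8_MAX  # fits the FP8 range by construction

# forward.py QDQ: dividing by the global scale recovers ~group_max / FP4_MAX
effective_scale = local_scale_fp8.to(torch.float32) / global_scale
print(torch.allclose(effective_scale, group_max / FP4_MAX, rtol=0.1))
```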
{compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520/src/compressed_tensors.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.9.5a20250514
+Version: 0.9.5a20250520
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
{compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/lifecycle/test_initialize.py

@@ -16,12 +16,15 @@
 import math

 import pytest
+import torch
 from compressed_tensors.quantization import (
+    FP8_E4M3_DATA,
     ActivationOrdering,
     QuantizationArgs,
     QuantizationScheme,
     QuantizationStatus,
     QuantizationStrategy,
+    QuantizationType,
 )
 from compressed_tensors.quantization.lifecycle.initialize import (
     initialize_module_for_quantization,

@@ -152,6 +155,10 @@ def test_initialize_module_for_quantization_offloaded(
         QuantizationArgs(strategy="group", group_size=2, actorder="weight"),
         None,
     ),
+    (
+        QuantizationArgs(strategy="group", group_size=16, type="float", num_bits=4),
+        None,
+    ),
     (
         QuantizationArgs(strategy="block"),
         QuantizationArgs(strategy="block"),

@@ -177,6 +184,14 @@ def test_initialize_quantization_parameters(weights, input_activations):
             continue
         q_param_name = Q_PARAM_NAMES[q_type]

+        if args.num_bits == 4 and args.type == QuantizationType.FLOAT:
+            assert hasattr(layer, "weight_global_scale")
+            assert layer.weight_global_scale.dtype == torch.float32
+            assert layer.weight_global_scale.numel() == 1
+            assert layer.weight_scale.dtype == FP8_E4M3_DATA.dtype
+        else:
+            assert not hasattr(layer, "weight_global_scale")
+
         # scale and zero point
         if args.strategy == QuantizationStrategy.TENSOR:
             expected_shape = (1,)
{compressed_tensors-0.9.5a20250514 → compressed_tensors-0.9.5a20250520}/tests/test_quantization/test_utils/test_helpers.py

@@ -14,8 +14,16 @@

 import pytest
 import torch
-from compressed_tensors.quantization import QuantizationArgs, QuantizationStrategy
-from compressed_tensors.quantization.utils import calculate_qparams
+from compressed_tensors.quantization import (
+    FP4_E2M1_DATA,
+    FP8_E4M3_DATA,
+    QuantizationArgs,
+    QuantizationStrategy,
+)
+from compressed_tensors.quantization.utils import (
+    calculate_qparams,
+    generate_global_scale,
+)


 @pytest.mark.parametrize(

@@ -56,3 +64,14 @@ def test_calculate_qparams(keepdims, strategy, exp_shape):
     scale, zp = calculate_qparams(min_val, max_val, args)
     assert scale.shape == exp_shape
     assert zp.shape == exp_shape
+
+
+def test_fused_global_scales():
+    layer = torch.nn.Linear(7, 8)
+    max_tensor_value = torch.abs(layer.weight.data).max()
+    # use defaults
+    global_scale = generate_global_scale(layer.weight)
+    # max value should be = (448 * 6) / global_scale
+    assert max_tensor_value == pytest.approx(
+        FP4_E2M1_DATA.max * FP8_E4M3_DATA.max / global_scale, abs=0.001
+    )
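Worth noting on the new test: `test_fused_global_scales` simply inverts the formula introduced in `generate_global_scale`. With global_scale = FP8_E4M3_DATA.max * FP4_E2M1_DATA.max / amax, it follows that amax = (448 * 6) / global_scale, which is exactly the identity the assertion checks against the weight's absolute max.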