compressed-tensors 0.12.3a20251030.tar.gz → 0.12.3a20251114.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/.github/workflows/test-check.yaml +1 -1
- {compressed_tensors-0.12.3a20251030/src/compressed_tensors.egg-info → compressed_tensors-0.12.3a20251114}/PKG-INFO +1 -1
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/compressors/quantized_compressors/base.py +8 -1
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/compressors/quantized_compressors/fp4_quantized.py +3 -2
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/quantization/lifecycle/forward.py +6 -9
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/quantization/lifecycle/initialize.py +11 -17
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/quantization/quant_args.py +73 -8
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/quantization/quant_config.py +0 -1
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/quantization/quant_scheme.py +7 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/quantization/utils/helpers.py +45 -43
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/utils/helpers.py +40 -1
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/version.py +1 -1
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114/src/compressed_tensors.egg-info}/PKG-INFO +1 -1
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors.egg-info/SOURCES.txt +0 -5
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_compressors/model_compressors/test_model_compressor.py +13 -2
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_quantization/lifecycle/test_apply.py +40 -7
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_quantization/lifecycle/test_dynamic_lifecycle.py +1 -1
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_quantization/lifecycle/test_initialize.py +15 -3
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_quantization/lifecycle/test_lifecycle.py +1 -1
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_quantization/lifecycle/test_static_lifecycle.py +5 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_utils/test_helpers.py +18 -0
- compressed_tensors-0.12.3a20251030/.github/workflows/build-test.yml +0 -57
- compressed_tensors-0.12.3a20251030/.github/workflows/build.yml +0 -134
- compressed_tensors-0.12.3a20251030/.github/workflows/post-release-nightly-build.yml +0 -15
- compressed_tensors-0.12.3a20251030/.github/workflows/test.yml +0 -187
- compressed_tensors-0.12.3a20251030/.github/workflows/trigger-all.yml +0 -45
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/.github/.gitkeep +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/.github/actions/test/action.yml +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/.github/scripts/step-status +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/.github/workflows/quality-check.yaml +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/.gitignore +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/LICENSE +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/Makefile +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/README.md +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/examples/bit_packing/ex_quantize_and_pack.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/examples/bit_packing/int4_config.json +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/examples/bitmask_compression.ipynb +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/examples/llama_1.1b/ex_config_quantization.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/examples/llama_1.1b/ex_llmcompressor_quantization.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/examples/llama_1.1b/example_quant_config.json +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/examples/llama_1.1b/example_quant_recipe.yaml +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/examples/quantize_and_pack_int4.ipynb +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/pyproject.toml +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/setup.cfg +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/setup.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/README.md +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/base.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/compressors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/compressors/base.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/compressors/helpers.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/compressors/model_compressors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/compressors/model_compressors/model_compressor.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/compressors/quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/compressors/quantized_compressors/naive_quantized.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/compressors/sparse_compressors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/compressors/sparse_compressors/base.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/compressors/sparse_compressors/dense.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/compressors/sparse_compressors/sparse_bitmask.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/compressors/sparse_quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/compressors/sparse_quantized_compressors/marlin_24.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/config/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/config/base.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/config/dense.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/config/format.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/config/sparse_24_bitmask.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/config/sparse_bitmask.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/linear/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/linear/compressed_linear.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/logger.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/modeling/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/modeling/attention.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/modeling/kvcache.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/quantization/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/quantization/lifecycle/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/quantization/lifecycle/apply.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/quantization/lifecycle/compressed.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/quantization/lifecycle/helpers.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/quantization/quant_metadata.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/quantization/utils/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/quantization/utils/mxfp4_utils.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/registry/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/registry/registry.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/transform/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/transform/apply.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/transform/factory/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/transform/factory/base.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/transform/factory/hadamard.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/transform/factory/matrix_multiply.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/transform/factory/random_hadamard.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/transform/transform_args.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/transform/transform_config.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/transform/transform_scheme.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/transform/utils/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/transform/utils/hadamard.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/transform/utils/hadamards.safetensors +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/transform/utils/matrix.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/utils/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/utils/internal.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/utils/match.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/utils/offload.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/utils/permutations_24.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/utils/safetensors_load.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/utils/semi_structured_conversions.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors/utils/type.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors.egg-info/dependency_links.txt +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors.egg-info/requires.txt +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/src/compressed_tensors.egg-info/top_level.txt +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/conftest.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/mock_observer.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_compressors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_compressors/model_compressors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_compressors/quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_compressors/quantized_compressors/test_fp4_quant.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_compressors/quantized_compressors/test_fp8_quant.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_compressors/quantized_compressors/test_int_quant.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_compressors/quantized_compressors/test_pack_quant.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_compressors/quantized_compressors/test_packed_asym_decompression.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_compressors/sparse_compressors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_compressors/sparse_compressors/test_bitmask.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_compressors/sparse_compressors/test_sparse_24_bitmask.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_compressors/sparse_quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_compressors/sparse_quantized_compressors/test_marlin_24.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_configs/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_configs/test_base.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_configs/test_infer_quant.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_examples/test_bitmask_compression_ipynb.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_linear/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_linear/test_compressed_linear.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_modeling/test_attention_and_cache.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_quantization/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_quantization/lifecycle/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_quantization/lifecycle/conftest.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_quantization/lifecycle/test_enabled.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_quantization/lifecycle/test_forward.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_quantization/test_configs/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_quantization/test_configs/test_bit_depths.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_quantization/test_configs/test_strategies.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_quantization/test_quant_args.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_quantization/test_quant_config.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_quantization/test_quant_scheme.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_quantization/test_utils/test_helpers.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_quantization/test_utils/test_mxfp4_utils.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_registry.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_transform/conftest.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_transform/factory/test_correctness.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_transform/factory/test_memory.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_transform/factory/test_serialization.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_transform/test_transform_args.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_transform/test_transform_config.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_transform/test_transform_scheme.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_transform/utils/test_hadamard.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_utils/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_utils/test_match.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_utils/test_offload.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_utils/test_safetensors_load.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/test_utils/test_type.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/tests/testing_utils.py +0 -0
- {compressed_tensors-0.12.3a20251030 → compressed_tensors-0.12.3a20251114}/utils/copyright.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.12.3a20251030
+Version: 0.12.3a20251114
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/vllm-project/compressed-tensors
 Author: Neuralmagic, Inc.
src/compressed_tensors/compressors/quantized_compressors/base.py
@@ -90,7 +90,6 @@ class BaseQuantizationCompressor(BaseCompressor):
         desc = "Compressing with quantization"
         for name in tqdm(uncompressed_names, desc=desc, disable=(not show_progress)):
             value = model_state[name]
-
             # compress weights
             if name.endswith("weight"):
                 prefix = name.removesuffix("weight")
@@ -129,10 +128,18 @@ class BaseQuantizationCompressor(BaseCompressor):
             if name.endswith("zero_point") and self._skip_zp(name, names_to_scheme):
                 continue

+            if name.endswith("weight_scale") and self._skip_scale():
+                continue
+
             compressed_dict[name] = value.to(compression_device)

         return compressed_dict

+    def _skip_scale(self):
+        from compressed_tensors.compressors import NVFP4PackedCompressor
+
+        return isinstance(self, NVFP4PackedCompressor)
+
     def _skip_zp(
         self, name: str, names_to_scheme: Dict[str, QuantizationScheme]
     ) -> bool:
src/compressed_tensors/compressors/quantized_compressors/fp4_quantized.py
@@ -26,7 +26,7 @@ from compressed_tensors.quantization.lifecycle.forward import dequantize, quantize
 from torch import Tensor


-__all__ = ["pack_fp4_to_uint8", "unpack_fp4_from_uint8"]
+__all__ = ["pack_fp4_to_uint8", "unpack_fp4_from_uint8", "NVFP4PackedCompressor"]

 FLOAT_TO_E2M1 = [
     0.0,
@@ -103,6 +103,7 @@ class NVFP4PackedCompressor(BaseQuantizationCompressor):
         if device is not None:
             weight_packed = weight_packed.to(device)
         compressed_dict["weight_packed"] = weight_packed
+        compressed_dict["weight_scale"] = scale.to(quantization_args.scale_dtype)
         return compressed_dict

     def decompress_weight(
@@ -111,8 +112,8 @@ class NVFP4PackedCompressor(BaseQuantizationCompressor):
         quantization_args: Optional[QuantizationArgs] = None,
     ) -> torch.Tensor:
         weight = compressed_data["weight_packed"]
-        scale = compressed_data["weight_scale"]
         global_scale = compressed_data["weight_global_scale"]
+        scale = compressed_data["weight_scale"]
         m, n = weight.shape
         # TODO: use a user provided dequant dtype
         unpacked = unpack_fp4_from_uint8(weight, m, n * 2)
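
For reference, a minimal round-trip sketch using the two packing helpers named in the updated __all__ above. This is an illustration only: the call signatures are assumed from the hunk (packed width is half the unpacked width), and the input values are assumed to already lie on the FP4 E2M1 grid.

import torch
from compressed_tensors.compressors.quantized_compressors.fp4_quantized import (
    pack_fp4_to_uint8,
    unpack_fp4_from_uint8,
)

# values assumed to already be on the E2M1 grid (e.g. produced by quantize())
w = torch.tensor([[0.0, 0.5, -1.0, 6.0], [1.5, -2.0, 3.0, -6.0]])
packed = pack_fp4_to_uint8(w)                       # two FP4 values per uint8
restored = unpack_fp4_from_uint8(packed, *w.shape)  # unpack back to the (2, 4) shape
# restored should match w up to the unpack dtype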
src/compressed_tensors/quantization/lifecycle/forward.py
@@ -21,7 +21,7 @@ from compressed_tensors.quantization.quant_args import (
     DynamicType,
     QuantizationArgs,
     QuantizationStrategy,
-    round_to_quantized_type,
+    round_to_quantized_type_args,
 )
 from compressed_tensors.quantization.quant_config import QuantizationStatus
 from compressed_tensors.quantization.quant_scheme import QuantizationScheme
@@ -466,20 +466,17 @@ def _quantize(
     # if a global scale is optionally provided, use it
     # to further scale the local `scale` parameter
     if global_scale is not None:
-        scale = scale
+        scale = scale / global_scale

     scaled = x / scale

     if zero_point is not None:
         scaled += zero_point.to(x.dtype)

-    # clamp
-    clamped_value = torch.clamp(
-        scaled,
-        q_min,
-        q_max,
+    # clamp and round
+    quantized_value = round_to_quantized_type_args(
+        tensor=scaled, args=args, min=q_min, max=q_max
     )
-    quantized_value = round_to_quantized_type(clamped_value, args)

     if dtype is not None:
         quantized_value = quantized_value.to(dtype)
@@ -499,7 +496,7 @@ def _dequantize(
     # if a global scale is optionally provided, use it
     # to further scale the local `scale` parameter
     if global_scale is not None:
-        scale = scale
+        scale = scale / global_scale

     dequant_value = x_q.to(scale.dtype)
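
A standalone sketch of the new clamp-and-round step, assuming the post-diff API. Names and defaults are taken from the round_to_quantized_type_args hunk in quant_args.py below; the INT8 args and the q_min/q_max values are illustrative stand-ins for what calculate_range() would produce.

import torch
from compressed_tensors.quantization import QuantizationArgs
from compressed_tensors.quantization.quant_args import round_to_quantized_type_args

args = QuantizationArgs(num_bits=8, type="int", symmetric=True)
scaled = torch.tensor([-200.0, -3.6, 0.4, 131.0])
q_min, q_max = torch.tensor(-128.0), torch.tensor(127.0)

# clamp to [q_min, q_max] and round in one call; with the default
# cast_to_original_dtype=True the result stays in scaled's float dtype
quantized_value = round_to_quantized_type_args(
    tensor=scaled, args=args, min=q_min, max=q_max
)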
src/compressed_tensors/quantization/lifecycle/initialize.py
@@ -24,7 +24,6 @@ from compressed_tensors.modeling import (
     QuantizedKVCache,
 )
 from compressed_tensors.quantization import (
-    FP8_E4M3_DATA,
     ActivationOrdering,
     DynamicType,
     QuantizationArgs,
@@ -36,7 +35,7 @@ from compressed_tensors.quantization import (
 from compressed_tensors.quantization.lifecycle.forward import (
     wrap_module_forward_quantized,
 )
-from compressed_tensors.quantization.utils import
+from compressed_tensors.quantization.utils import strategy_cdiv
 from compressed_tensors.utils import (
     disable_hf_hook,
     get_execution_device,
@@ -250,20 +249,13 @@ def initialize_qparams(

     # 2. Identify quantization scale and zp dtype
     scale_dtype = observed_dtype
-
-
-
-
-
-
-
-            torch.float16,
-            torch.bfloat16,
-            torch.float32,
-            torch.float64,
-        ]:
-            scale_dtype = torch.bfloat16
-        zp_dtype = quantization_args.pytorch_dtype()
+    if scale_dtype not in [
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+        torch.float64,
+    ]:
+        scale_dtype = torch.float16

     # 3. Initializes scale/zp for the module
     init_scale = Parameter(
@@ -274,7 +266,9 @@ def initialize_qparams(

     if force_zero_point or not quantization_args.symmetric:
         init_zero_point = Parameter(
-            torch.zeros(
+            torch.zeros(
+                expected_shape, device=device, dtype=quantization_args.zp_dtype
+            ),
             requires_grad=False,
         )
         register_offload_parameter(module, f"{base_name}_zero_point", init_zero_point)
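
In isolation, the new qparam dtype handling above reduces to something like the following sketch; expected_shape, the device, and the resolved dtypes are placeholders for what initialize_qparams computes, and the real parameter is registered via register_offload_parameter rather than assigned directly.

import torch
from torch.nn import Parameter

observed_dtype = torch.float8_e4m3fn
scale_dtype = observed_dtype
if scale_dtype not in [torch.float16, torch.bfloat16, torch.float32, torch.float64]:
    scale_dtype = torch.float16                # fallback is now float16, not bfloat16

expected_shape, device = (128, 1), "cpu"       # placeholder group-wise shape and device
zp_dtype = torch.int8                          # placeholder for quantization_args.zp_dtype
init_zero_point = Parameter(
    torch.zeros(expected_shape, device=device, dtype=zp_dtype),
    requires_grad=False,
)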
src/compressed_tensors/quantization/quant_args.py
@@ -19,7 +19,15 @@ from typing import Any, Dict, List, Optional, Union
 import torch
 from compressed_tensors.utils import Aliasable
 from compressed_tensors.utils.helpers import deprecated
-from
+from compressed_tensors.utils.type import TorchDtype
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    Field,
+    field_serializer,
+    field_validator,
+    model_validator,
+)


 __all__ = [
@@ -30,7 +38,8 @@ __all__ = [
     "QuantizationType",
     "QuantizationStrategy",
     "QuantizationArgs",
-    "round_to_quantized_type",
+    "round_to_quantized_type_args",
+    "round_to_quantized_type_dtype",
     "ActivationOrdering",
     "DynamicType",
 ]
@@ -174,6 +183,8 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
     block_structure: Optional[List[int]] = None
     dynamic: Union[DynamicType, bool] = False
     actorder: Union[ActivationOrdering, bool, None] = None
+    scale_dtype: Optional[TorchDtype] = None
+    zp_dtype: Optional[TorchDtype] = None
     observer: Optional[str] = Field(
         default=None,
         description=(
@@ -189,6 +200,12 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         ),
     )

+    @field_serializer("zp_dtype")
+    def serialize_dtype(self, dtype: torch.dtype):
+        if self.symmetric:
+            return None
+        return str(dtype)
+
     @field_validator("type", mode="before")
     def validate_type(cls, value) -> QuantizationType:
         if isinstance(value, str):
@@ -266,6 +283,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         dynamic = model.dynamic
         observer = model.observer
         dynamic = model.dynamic
+        zp_dtype = model.zp_dtype

         # infer strategy
         if strategy is None:
@@ -353,9 +371,16 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
             # default to minmax for non-dynamic cases
             observer = "minmax"

+        if zp_dtype is None:
+            if model.num_bits == 4 and model.type == QuantizationType.FLOAT:
+                zp_dtype = FP8_E4M3_DATA.dtype
+            else:
+                zp_dtype = model.pytorch_dtype()
+
         # write back modified values
         model.strategy = strategy
         model.observer = observer
+        model.zp_dtype = zp_dtype
         return model

     def pytorch_dtype(self) -> torch.dtype:
@@ -381,18 +406,56 @@
     model_config = ConfigDict(extra="forbid")


-def round_to_quantized_type(
-    tensor: torch.Tensor,
+def round_to_quantized_type_dtype(
+    tensor: torch.Tensor,
+    dtype: torch.dtype,
+    cast_to_original_dtype: Optional[bool] = True,
 ) -> torch.Tensor:
     """
-    Rounds
-
+    Rounds an input tensor to the nearest quantized representation given a dtype.
+    The original dtype is kept post-rounding.

     :param tensor: tensor to round
-    :param
+    :param dtype: dtype to use for rounding
+    :param cast_to_original_dtype: whether or not we cast the rounded tensor to
+        the original dtype
     :return: rounded tensor
     """
     original_dtype = tensor.dtype
+    if torch.is_floating_point(torch.tensor([], dtype=dtype)):
+        finfo = torch.finfo(dtype)
+        rounded = torch.clamp(tensor, finfo.min, finfo.max).to(dtype)
+    else:
+        iinfo = torch.iinfo(dtype)
+        rounded = torch.round(torch.clamp(tensor, iinfo.min, iinfo.max)).to(dtype)
+
+    if cast_to_original_dtype:
+        return rounded.to(original_dtype)
+    return rounded
+
+
+def round_to_quantized_type_args(
+    tensor: torch.Tensor,
+    args: QuantizationArgs,
+    min: torch.Tensor,
+    max: torch.Tensor,
+    cast_to_original_dtype: Optional[bool] = True,
+) -> torch.Tensor:
+    """
+    Rounds an input tensor to the nearest quantized representation given
+    qunatization args. The original dtype is kept post-rounding.
+
+    :param tensor: tensor to round
+    :param args: quantization args to use for rounding
+    :param min: min value to use for clamping
+    :param max: max value to use for clamping
+    :param cast_to_original_dtype: whether or not we cast the rounded tensor to
+        the original dtype
+    :return: rounded tensor
+    """
+
+    original_dtype = tensor.dtype
+    tensor = torch.clamp(tensor, min, max)
     if args.type == QuantizationType.FLOAT:
         if args.num_bits == 8:
             rounded = tensor.to(FP8_E4M3_DATA.dtype)
@@ -405,4 +468,6 @@ def round_to_quantized_type(
     else:
         raise ValueError(f"Invalid quantization type {args.type}")

-
+    if cast_to_original_dtype:
+        return rounded.to(original_dtype)
+    return rounded
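
A minimal usage sketch of the new fields and the dtype-based rounding helper, assuming the post-diff API (FP8_E4M3_DATA is imported from compressed_tensors.quantization, as the test changes below do):

import torch
from compressed_tensors.quantization import FP8_E4M3_DATA, QuantizationArgs
from compressed_tensors.quantization.quant_args import round_to_quantized_type_dtype

# FP4 weight args now carry explicit scale/zp dtypes (mirroring the NVFP4 presets below)
args = QuantizationArgs(
    num_bits=4,
    type="float",
    symmetric=True,
    group_size=16,
    scale_dtype=FP8_E4M3_DATA.dtype,
    zp_dtype=FP8_E4M3_DATA.dtype,
)

scales = torch.rand(4, 2, dtype=torch.bfloat16)
# snap scales onto the FP8 grid; the default cast_to_original_dtype=True
# returns them in bfloat16, pass False to keep the FP8 storage dtype
fp8_scales = round_to_quantized_type_dtype(scales, dtype=args.scale_dtype)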
src/compressed_tensors/quantization/quant_config.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from collections import defaultdict
 from enum import Enum
 from typing import Annotated, Any, Dict, List, Optional, Set, Union
src/compressed_tensors/quantization/quant_scheme.py
@@ -18,6 +18,7 @@ from typing import List, Optional

 from compressed_tensors.config import CompressionFormat
 from compressed_tensors.quantization.quant_args import (
+    FP8_E4M3_DATA,
     DynamicType,
     QuantizationArgs,
     QuantizationStrategy,
@@ -160,6 +161,8 @@ NVFP4A16 = dict(
         symmetric=True,
         dynamic=False,
         group_size=16,
+        scale_dtype=FP8_E4M3_DATA.dtype,
+        zp_dtype=FP8_E4M3_DATA.dtype,
     )
 )

@@ -173,6 +176,8 @@ NVFP4 = dict(
         dynamic=False,
         group_size=16,
         observer="static_minmax",
+        scale_dtype=FP8_E4M3_DATA.dtype,
+        zp_dtype=FP8_E4M3_DATA.dtype,
     ),
     input_activations=QuantizationArgs(
         num_bits=4,
@@ -182,6 +187,8 @@ NVFP4 = dict(
         dynamic=DynamicType.LOCAL,
         group_size=16,
         observer="static_minmax",
+        scale_dtype=FP8_E4M3_DATA.dtype,
+        zp_dtype=FP8_E4M3_DATA.dtype,
     ),
 )
src/compressed_tensors/quantization/utils/helpers.py
@@ -24,6 +24,7 @@ from compressed_tensors.quantization.quant_args import (
     QuantizationArgs,
     QuantizationStrategy,
     QuantizationType,
+    round_to_quantized_type_dtype,
 )
 from compressed_tensors.quantization.quant_scheme import QuantizationScheme
 from compressed_tensors.utils import deprecated
@@ -46,7 +47,6 @@ __all__ = [
     "calculate_range",
     "calculate_qparams",
     "generate_gparam",
-    "is_fp4",
     "strategy_cdiv",
 ]

@@ -57,13 +57,6 @@ KV_CACHE_TARGETS = ["re:.*self_attn$"]
 _LOGGER: logging.Logger = logging.getLogger(__name__)


-def is_fp4(quantization_args: QuantizationArgs):
-    return (
-        quantization_args.num_bits == 4
-        and quantization_args.type == QuantizationType.FLOAT
-    )
-
-
 def calculate_qparams(
     min_vals: Tensor,
     max_vals: Tensor,
@@ -92,52 +85,50 @@ def calculate_qparams(
     bit_min, bit_max = calculate_range(quantization_args, device)
     bit_range = bit_max - bit_min

-
-        zp_dtype = FP8_E4M3_DATA.dtype
-    else:
-        zp_dtype = quantization_args.pytorch_dtype()
-
+    # 1. Generate scale and zero-point
     if quantization_args.symmetric:
         max_val_pos = torch.max(torch.abs(min_vals), torch.abs(max_vals))
-
-        if is_fp4(quantization_args=quantization_args) and global_scale is not None:
-            # Conditionally scale the generated local scale by a global_scale
-            scales = global_scale * (max_val_pos / FP4_E2M1_DATA.max)
-            scales = torch.clamp(scales, max=FP8_E4M3_DATA.max, min=FP8_E4M3_DATA.min)
-            scales = scales.to(FP8_E4M3_DATA.dtype)
-
-        else:
-            scales = max_val_pos / (float(bit_range) / 2)
-
-        # TODO: in the case of MoEs, the global_scale may also be 0/need to be clamped
-        if scales.dtype == FP8_E4M3_DATA.dtype:
-            # torch.clamp not supported for FP8
-            # use the next largest fp8 value from 0
-            scales = torch.where(
-                scales == 0,
-                torch.tensor(0.125, dtype=FP8_E4M3_DATA.dtype, device=device),
-                scales,
-            )
-        else:
-            scales = torch.clamp(scales, min=torch.finfo(torch.float32).eps)
-
+        scales = max_val_pos / (float(bit_range) / 2)
         zero_points = torch.zeros(scales.shape, device=device, dtype=min_vals.dtype)
     else:
-        if
+        if (
+            quantization_args.num_bits == 4
+            and quantization_args.type == QuantizationType.FLOAT
+        ):
             raise NotImplementedError(
                 "Asymmetric Quantization is not supported for FP4"
             )
-
         scales = (max_vals - min_vals) / float(bit_range)
-        scales = torch.clamp(scales, min=torch.finfo(torch.float32).eps)
         zero_points = bit_min - (min_vals / scales)
         zero_points = torch.clamp(zero_points, bit_min, bit_max)

-    #
-
-
-
-
+
+    # 2. Conditionally scale the generated local scale by a global_scale
+    if global_scale is not None:
+        scales = global_scale * scales
+
+    # 3. Conditionally round the scale to the quantized dtype, if scale_dtype is set
+    if quantization_args.scale_dtype is not None:
+        scales = round_to_quantized_type_dtype(
+            scales, dtype=quantization_args.scale_dtype
+        )
+
+    # 4. Update any 0s with small values to
+    # prevent div by 0
+    eps = _get_dtype_eps(
+        dtype=quantization_args.scale_dtype
+        if quantization_args.scale_dtype is not None
+        else scales.dtype
+    )
+    scales = torch.where(
+        scales == 0,
+        torch.tensor(eps, dtype=scales.dtype, device=device),
+        scales,
+    )
+
+    # 5. Round the zp to zp_dtype
+    zero_points = round_to_quantized_type_dtype(
+        zero_points, dtype=quantization_args.zp_dtype, cast_to_original_dtype=False
+    )

     if scales.ndim == 0:
         scales = scales.reshape(1)
@@ -455,3 +446,14 @@ def strategy_cdiv(
     logger.bind(log_once=True).warning(message)

     return dividend
+
+
+def _get_dtype_eps(dtype: torch.dtype) -> float:
+    if dtype == FP8_E4M3_DATA.dtype:
+        return 0.125
+    elif dtype == FP4_E2M1_DATA.dtype:
+        return 0.25
+    elif torch.is_floating_point(torch.tensor([], dtype=dtype)):
+        return torch.finfo(dtype).eps
+    else:
+        return 1
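
Standalone illustration of steps 4 and 5 above (not the library call itself): zero scales are swapped for a dtype-appropriate epsilon before any division, and zero points are clamped, rounded, and cast into the configured zp_dtype; the float32 and int8 dtypes here are illustrative.

import torch

scales = torch.tensor([0.0, 0.5, 2.0])
eps = torch.finfo(scales.dtype).eps  # _get_dtype_eps falls back to finfo eps for floats
scales = torch.where(scales == 0, torch.tensor(eps, dtype=scales.dtype), scales)

zero_points = torch.tensor([-3.4, 0.2, 150.9])
zero_points = torch.round(torch.clamp(zero_points, -128, 127)).to(torch.int8)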
src/compressed_tensors/utils/helpers.py
@@ -16,7 +16,17 @@ import contextlib
 import warnings
 from functools import wraps
 from types import MappingProxyType
-from typing import
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    Mapping,
+    Optional,
+    TypeVar,
+)

 import numpy
 import torch
@@ -44,6 +54,7 @@ __all__ = [
     "pack_bitmasks",
     "unpack_bitmasks",
     "patch_attr",
+    "patch_attrs",
     "ParameterizedDefaultDict",
     "get_num_attn_heads",
     "get_num_kv_heads",
@@ -368,6 +379,34 @@ def patch_attr(base: object, attr: str, value: Any):
         delattr(base, attr)


+@contextlib.contextmanager
+def patch_attrs(bases: Iterable[Any], attr: str, values: Iterable[Any]):
+    """
+    Same as `patch_attr` but for a list of objects to patch
+    Patch attribute for a list of objects with list of values.
+    Original values are restored upon exit
+
+    :param bases: objects which has the attribute to patch
+    :param attr: name of the the attribute to patch
+    :param values: used to replace original values. Must be same
+        length as bases
+
+    Usage:
+    >>> from types import SimpleNamespace
+    >>> obj1 = SimpleNamespace()
+    >>> obj2 = SimpleNamespace()
+    >>> with patch_attr([obj1, obj2], "attribute", ["value1", "value2"]):
+    ...     assert obj1.attribute == "value1"
+    ...     assert obj2.attribute == "value2"
+    >>> assert not hasattr(obj1, "attribute")
+    >>> assert not hasattr(obj2, "attribute")
+    """
+    with contextlib.ExitStack() as stack:
+        for base, value in zip(bases, values):
+            stack.enter_context(patch_attr(base, attr, value))
+        yield
+
+
 class ParameterizedDefaultDict(dict):
     """
     Similar to `collections.DefaultDict`, but upon fetching a key which is missing,
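
A usage sketch for the new patch_attrs helper, assuming it is re-exported from compressed_tensors.utils alongside patch_attr as the __all__ addition suggests:

from types import SimpleNamespace
from compressed_tensors.utils import patch_attrs

cfg_a, cfg_b = SimpleNamespace(mode="eager"), SimpleNamespace(mode="eager")

# temporarily override `mode` on both objects; originals are restored on exit
with patch_attrs([cfg_a, cfg_b], "mode", ["compile", "trace"]):
    assert cfg_a.mode == "compile" and cfg_b.mode == "trace"

assert cfg_a.mode == "eager" and cfg_b.mode == "eager"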
src/compressed_tensors.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.12.3a20251030
+Version: 0.12.3a20251114
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/vllm-project/compressed-tensors
 Author: Neuralmagic, Inc.
src/compressed_tensors.egg-info/SOURCES.txt
@@ -8,13 +8,8 @@ setup.py
 .github/.gitkeep
 .github/actions/test/action.yml
 .github/scripts/step-status
-.github/workflows/build-test.yml
-.github/workflows/build.yml
-.github/workflows/post-release-nightly-build.yml
 .github/workflows/quality-check.yaml
 .github/workflows/test-check.yaml
-.github/workflows/test.yml
-.github/workflows/trigger-all.yml
 examples/bitmask_compression.ipynb
 examples/quantize_and_pack_int4.ipynb
 examples/bit_packing/ex_quantize_and_pack.py
tests/test_compressors/model_compressors/test_model_compressor.py
@@ -22,6 +22,7 @@ import torch.nn as nn
 from compressed_tensors.compressors import ModelCompressor
 from compressed_tensors.config import CompressionFormat, SparsityCompressionConfig
 from compressed_tensors.quantization import (
+    FP8_E4M3_DATA,
     QuantizationArgs,
     QuantizationConfig,
     QuantizationScheme,
@@ -425,8 +426,18 @@ def test_multiple_quant_compressors():
         format=CompressionFormat.float_quantized.value,
     )

-    input_activations = QuantizationArgs(
-
+    input_activations = QuantizationArgs(
+        num_bits=4,
+        type="float",
+        scale_dtype=FP8_E4M3_DATA.dtype,
+        zp_dtype=FP8_E4M3_DATA.dtype,
+    )
+    weights = QuantizationArgs(
+        num_bits=4,
+        type="float",
+        scale_dtype=FP8_E4M3_DATA.dtype,
+        zp_dtype=FP8_E4M3_DATA.dtype,
+    )

     scheme_nvfp4 = QuantizationScheme(
         targets=["Linear"],