compressed-tensors 0.12.3a20251212__tar.gz → 0.12.3a20251215__tar.gz
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {compressed_tensors-0.12.3a20251212/src/compressed_tensors.egg-info → compressed_tensors-0.12.3a20251215}/PKG-INFO +1 -1
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/examples/quantize_and_pack_int4.ipynb +51 -93
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/compressors/base.py +1 -33
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/compressors/quantized_compressors/base.py +39 -24
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/compressors/quantized_compressors/fp4_quantized.py +3 -14
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py +7 -35
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/version.py +1 -1
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215/src/compressed_tensors.egg-info}/PKG-INFO +1 -1
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/.github/.gitkeep +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/.github/actions/test/action.yml +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/.github/scripts/step-status +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/.github/workflows/quality-check.yaml +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/.github/workflows/test-check.yaml +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/.gitignore +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/LICENSE +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/Makefile +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/README.md +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/examples/bit_packing/ex_quantize_and_pack.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/examples/bit_packing/int4_config.json +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/examples/bitmask_compression.ipynb +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/examples/llama_1.1b/ex_config_quantization.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/examples/llama_1.1b/ex_llmcompressor_quantization.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/examples/llama_1.1b/example_quant_config.json +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/examples/llama_1.1b/example_quant_recipe.yaml +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/pyproject.toml +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/setup.cfg +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/setup.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/README.md +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/base.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/compressors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/compressors/helpers.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/compressors/model_compressors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/compressors/model_compressors/model_compressor.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/compressors/quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/compressors/quantized_compressors/naive_quantized.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/compressors/sparse_compressors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/compressors/sparse_compressors/base.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/compressors/sparse_compressors/dense.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/compressors/sparse_compressors/sparse_bitmask.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/compressors/sparse_quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/compressors/sparse_quantized_compressors/marlin_24.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/config/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/config/base.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/config/dense.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/config/format.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/config/sparse_24_bitmask.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/config/sparse_bitmask.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/linear/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/linear/compressed_linear.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/logger.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/modeling/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/modeling/attention.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/modeling/kvcache.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/quantization/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/quantization/lifecycle/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/quantization/lifecycle/apply.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/quantization/lifecycle/compressed.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/quantization/lifecycle/forward.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/quantization/lifecycle/helpers.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/quantization/lifecycle/initialize.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/quantization/quant_args.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/quantization/quant_config.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/quantization/quant_metadata.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/quantization/quant_scheme.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/quantization/utils/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/quantization/utils/helpers.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/quantization/utils/mxfp4_utils.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/registry/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/registry/registry.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/transform/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/transform/apply.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/transform/factory/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/transform/factory/base.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/transform/factory/hadamard.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/transform/factory/matrix_multiply.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/transform/factory/random_hadamard.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/transform/transform_args.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/transform/transform_config.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/transform/transform_scheme.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/transform/utils/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/transform/utils/hadamard.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/transform/utils/hadamards.safetensors +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/transform/utils/matrix.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/utils/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/utils/helpers.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/utils/internal.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/utils/match.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/utils/offload.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/utils/permutations_24.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/utils/safetensors_load.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/utils/semi_structured_conversions.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/utils/type.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors.egg-info/SOURCES.txt +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors.egg-info/dependency_links.txt +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors.egg-info/requires.txt +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors.egg-info/top_level.txt +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/conftest.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/mock_observer.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_compressors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_compressors/model_compressors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_compressors/model_compressors/test_model_compressor.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_compressors/quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_compressors/quantized_compressors/test_fp4_quant.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_compressors/quantized_compressors/test_fp8_quant.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_compressors/quantized_compressors/test_int_quant.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_compressors/quantized_compressors/test_pack_quant.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_compressors/quantized_compressors/test_packed_asym_decompression.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_compressors/sparse_compressors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_compressors/sparse_compressors/test_bitmask.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_compressors/sparse_compressors/test_sparse_24_bitmask.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_compressors/sparse_quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_compressors/sparse_quantized_compressors/test_marlin_24.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_configs/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_configs/test_base.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_configs/test_infer_quant.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_examples/test_bitmask_compression_ipynb.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_linear/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_linear/test_compressed_linear.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_modeling/test_attention_and_cache.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_quantization/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_quantization/lifecycle/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_quantization/lifecycle/conftest.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_quantization/lifecycle/test_apply.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_quantization/lifecycle/test_dynamic_lifecycle.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_quantization/lifecycle/test_enabled.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_quantization/lifecycle/test_forward.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_quantization/lifecycle/test_initialize.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_quantization/lifecycle/test_lifecycle.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_quantization/lifecycle/test_static_lifecycle.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_quantization/test_configs/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_quantization/test_configs/test_bit_depths.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_quantization/test_configs/test_strategies.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_quantization/test_quant_args.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_quantization/test_quant_config.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_quantization/test_quant_scheme.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_quantization/test_utils/test_helpers.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_quantization/test_utils/test_mxfp4_utils.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_registry.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_transform/conftest.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_transform/factory/test_correctness.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_transform/factory/test_memory.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_transform/factory/test_serialization.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_transform/test_transform_args.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_transform/test_transform_config.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_transform/test_transform_scheme.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_transform/utils/test_hadamard.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_utils/__init__.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_utils/test_helpers.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_utils/test_match.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_utils/test_offload.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_utils/test_safetensors_load.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/test_utils/test_type.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/tests/testing_utils.py +0 -0
- {compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/utils/copyright.py +0 -0
{compressed_tensors-0.12.3a20251212/src/compressed_tensors.egg-info → compressed_tensors-0.12.3a20251215}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.12.3a20251212
+Version: 0.12.3a20251215
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/vllm-project/compressed-tensors
 Author: Neuralmagic, Inc.
{compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/examples/quantize_and_pack_int4.ipynb
@@ -15,7 +15,7 @@
 },
 {
 "cell_type": "code",
-"execution_count":
+"execution_count": 12,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -25,8 +25,7 @@
 "from compressed_tensors.quantization import (\n",
 "    QuantizationConfig,\n",
 "    QuantizationStatus,\n",
-"    apply_quantization_config,\n",
-"    compress_quantized_weights\n",
+"    apply_quantization_config\n",
 ")\n",
 "from compressed_tensors.compressors import ModelCompressor\n",
 "from transformers import AutoModelForCausalLM, AutoTokenizer, DefaultDataCollator\n",
@@ -37,51 +36,9 @@
 },
 {
 "cell_type": "code",
-"execution_count":
+"execution_count": 13,
 "metadata": {},
 "outputs": [
-{
-"data": {
-"application/vnd.jupyter.widget-view+json": {
-"model_id": "c883cdc8ecd04866bd01d61796b81c26",
-"version_major": 2,
-"version_minor": 0
-},
-"text/plain": [
-"config.json: 0%| | 0.00/560 [00:00<?, ?B/s]"
-]
-},
-"metadata": {},
-"output_type": "display_data"
-},
-{
-"data": {
-"application/vnd.jupyter.widget-view+json": {
-"model_id": "32b18b14b6774ce7b61d2854a1ed5f49",
-"version_major": 2,
-"version_minor": 0
-},
-"text/plain": [
-"model.safetensors: 0%| | 0.00/4.40G [00:00<?, ?B/s]"
-]
-},
-"metadata": {},
-"output_type": "display_data"
-},
-{
-"data": {
-"application/vnd.jupyter.widget-view+json": {
-"model_id": "370c6d18521a4b65833a411728be1ed7",
-"version_major": 2,
-"version_minor": 0
-},
-"text/plain": [
-"generation_config.json: 0%| | 0.00/129 [00:00<?, ?B/s]"
-]
-},
-"metadata": {},
-"output_type": "display_data"
-},
 {
 "data": {
 "text/plain": [
@@ -113,7 +70,7 @@
 ")"
 ]
 },
-"execution_count":
+"execution_count": 13,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -122,7 +79,7 @@
 "# load a dense, unquantized tiny llama model\n",
 "device = \"cuda:0\"\n",
 "model_name = \"TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T\"\n",
-"model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device, torch_dtype
+"model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device, torch_dtype=torch.bfloat16)\n",
 "model"
 ]
 },
@@ -139,7 +96,7 @@
 },
 {
 "cell_type": "code",
-"execution_count":
+"execution_count": 14,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -164,7 +121,7 @@
 },
 {
 "cell_type": "code",
-"execution_count":
+"execution_count": 15,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -177,7 +134,7 @@
 },
 {
 "cell_type": "code",
-"execution_count":
+"execution_count": 16,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -198,14 +155,14 @@
 },
 {
 "cell_type": "code",
-"execution_count":
+"execution_count": 17,
 "metadata": {},
 "outputs": [
 {
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"Running calibration: 512it [00:
+"Running calibration: 512it [00:58, 8.82it/s]\n"
 ]
 }
 ],
@@ -233,20 +190,24 @@
 "\n",
 "Notice that at this point, the weight itself is still a floating point and has not been quantized. \n",
 "\n",
-"To convert the weights to an integer type, we need to apply the `
+"To convert the weights to an integer type, we need to apply the `compress_model` function. After compressing the weights, a forward pass of the model can no longer be run in PyTorch.\n",
+"\n",
+"After compressing the quantized model with the `pack-quantized` format, weights are represented as logical int4 values packed into int32 containers ( `weight_packed` ), with the original shape recorded in `weight_shape`.\n",
+"\n",
+"This packed representation is what gets saved to disk when using ModelCompressor.compress_model(model)."
 ]
 },
 {
 "cell_type": "code",
-"execution_count":
+"execution_count": 18,
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"Scale: tensor([
-"Weight min: -1.
+"Scale: tensor([-3.0465e+26], device='cuda:0', dtype=torch.bfloat16), Zero Point: tensor([0], device='cuda:0', dtype=torch.int8)\n",
+"Weight min: -1.5859375 max: 1.03125 dtype: torch.bfloat16\n"
 ]
 }
 ],
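The markdown cell added above describes the packed layout. As a minimal sketch of why that works (this is illustration only, not the library's pack_to_int32): with num_bits=4 the pack factor is 32 // 4 = 8, so a [2048, 2048] int4 weight packs to a [2048, 256] int32 weight_packed tensor, matching the notebook output below.

import torch

def pack_int4_rows(q: torch.Tensor) -> torch.Tensor:
    # q holds int4 values in an int32 tensor; 8 values share one int32 container
    assert q.shape[-1] % 8 == 0
    nibbles = (q & 0xF).to(torch.int32)                 # keep each value's low 4 bits (two's complement)
    nibbles = nibbles.reshape(*q.shape[:-1], -1, 8)     # group 8 nibbles per container
    shifts = torch.arange(0, 32, 4, dtype=torch.int32)  # bit offsets 0, 4, ..., 28
    return (nibbles << shifts).sum(dim=-1, dtype=torch.int32)

q = torch.randint(-8, 8, (2048, 2048), dtype=torch.int32)
print(pack_int4_rows(q).shape)  # torch.Size([2048, 256])

The 4-bit fields never overlap, so the wrapping int32 sum assembles the exact bit pattern.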
@@ -262,64 +223,62 @@
 },
 {
 "cell_type": "code",
-"execution_count":
+"execution_count": 19,
 "metadata": {},
 "outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"Compressing model: 154it [00:02, 59.75it/s]"
+]
+},
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"
-"
+"Compressed weight scale: tensor([-3.0465e+26], device='cuda:0', dtype=torch.bfloat16), zero point: tensor([0], device='cuda:0', dtype=torch.int8)\n",
+"Compressed weight dtype: torch.int32\n",
+"Compressed weight shape: torch.Size([2048, 256])\n",
+"Uncompressed weight shape: tensor([2048, 2048], device='cuda:0')\n"
+]
+},
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"\n"
 ]
 }
 ],
 "source": [
 "# convert quantized weights to integers\n",
-"
+"compressor = ModelCompressor(quantization_config=config)\n",
+"compressor.compress_model(model)\n",
 "\n",
 "state_dict = model.state_dict()\n",
 "example_layer = \"model.layers.0.self_attn.q_proj.weight\"\n",
 "scale = state_dict[example_layer + \"_scale\"]\n",
 "zero_point = state_dict[example_layer + \"_zero_point\"]\n",
-"weight = state_dict[example_layer]\n",
-"
-"print(f\"
-]
-},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"After compressing the quantized model, the weight matrix has a range of int4 but is stored in an int8. \n",
-"\n",
-"We can further compress the model on disk using the `pack-quantized` format we specified in the config. This compression format will pack the int4 weights into int32"
+"weight = state_dict[example_layer + \"_packed\"]\n",
+"shape = state_dict[example_layer + \"_shape\"]\n",
+"print(f\"Compressed weight scale: {scale}, zero point: {zero_point}\")\n",
+"print(f\"Compressed weight dtype: {weight.dtype}\")\n",
+"print(f\"Compressed weight shape: {weight.shape}\")\n",
+"print(f\"Uncompressed weight shape: {shape}\")"
 ]
 },
 {
 "cell_type": "code",
-"execution_count":
+"execution_count": 20,
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"Compression format: pack-quantized\n"
-]
-},
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"Quantized Compression: 100%|██████████| 509/509 [00:03<00:00, 153.70it/s]\n"
-]
-},
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"Size of the model's weights on disk using safetensors: 712.23 MB\n"
+"Compression format: pack-quantized\n",
+"Size of the model's weights on disk using safetensors: 712.25 MB\n"
 ]
 }
 ],
@@ -330,9 +289,8 @@
 "compression_format = config.format\n",
 "print(f\"Compression format: {compression_format}\")\n",
 "\n",
-"
-"
-"model.save_pretrained(output_dir, state_dict=compressed_state_dict)\n",
+"\n",
+"model.save_pretrained(output_dir, state_dict=model.state_dict())\n",
 "compressor.update_config(output_dir)\n",
 "\n",
 "compressed_size_on_disk_mb = os.path.getsize(os.path.join(output_dir, \"model.safetensors\")) / 1024 / 1024\n",
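The 712.25 MB figure above can be roughly sanity-checked with back-of-the-envelope arithmetic; the parameter split below is an assumption for TinyLlama-1.1B, and group scales (a few extra MB) are ignored.

# hedged size estimate: int4 weights at 0.5 bytes/param, bf16 embeddings at 2 bytes/param
linear_params = 0.97e9  # approx. params in quantized Linear layers (assumption)
embed_params = 0.13e9   # embeddings + lm_head left unquantized (assumption)
size_mb = (linear_params * 0.5 + embed_params * 2) / 1024 / 1024
print(f"{size_mb:.0f} MB")  # ~710 MB, in line with the notebook's 712.25 MB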
@@ -356,7 +314,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.
+"version": "3.12.12"
 }
 },
 "nbformat": 4,
{compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/compressors/base.py
@@ -20,11 +20,6 @@ from compressed_tensors.config import SparsityCompressionConfig
 from compressed_tensors.quantization import QuantizationArgs, QuantizationConfig
 from compressed_tensors.registry import RegistryMixin
 from compressed_tensors.utils import has_offloaded_params
-from compressed_tensors.utils.offload import (
-    delete_offload_parameter,
-    get_offloaded_device,
-    register_offload_parameter,
-)
 from torch import Tensor
 from torch.nn import Module
 
@@ -190,37 +185,10 @@ class BaseCompressor(RegistryMixin, ABC):
         for name, parameter in module.named_parameters():
             compressed_data[name] = parameter

-        original_scale = compressed_data.get("weight_scale")
-        original_zp = compressed_data.get("weight_zero_point")
-
-        # NOTE: decompress_weight may modify compressed_data dict in-place
-        # This is subtle but allows us to update the module's qparams with
-        # the unpacked values.
-        # TODO: Consider refactoring to return modified qparams explicitly
-        result = self.decompress_weight(
+        return self.decompress_weight(
             compressed_data=compressed_data, quantization_args=quantization_args
         ).to(device)

-        # Update module's parameters only if they were modified
-        for param_name, original_param in [
-            ("weight_scale", original_scale),
-            ("weight_zero_point", original_zp),
-        ]:
-            if (
-                param_name in compressed_data
-                and compressed_data[param_name] is not original_param
-            ):
-                # Delete the old parameter and register the updated one
-                delete_offload_parameter(module, param_name)
-                offload_device = get_offloaded_device(module)
-                param = torch.nn.Parameter(
-                    compressed_data[param_name], requires_grad=False
-                )
-                register_offload_parameter(module, param_name, param, offload_device)
-
-        return result
-
     def decompress_weight(
         self, compressed_data: Dict[str, Tensor], **kwargs
     ) -> torch.Tensor:
{compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/compressors/quantized_compressors/base.py
@@ -18,7 +18,7 @@ from typing import Any, Dict, Generator, Tuple, Union

 import torch
 from compressed_tensors.compressors.base import BaseCompressor
-from compressed_tensors.quantization import QuantizationScheme
+from compressed_tensors.quantization import QuantizationScheme, QuantizationStrategy
 from compressed_tensors.utils import (
     get_nested_mappings_from_state_dict,
     get_nested_weight_mappings,
@@ -85,7 +85,6 @@ class BaseQuantizationCompressor(BaseCompressor):
         """
         uncompressed_names = list(model_state.keys())
         compressed_dict = {}
-        compressed_param_names = set()

         # compress values
         desc = "Compressing with quantization"
@@ -120,38 +119,54 @@ class BaseQuantizationCompressor(BaseCompressor):
                     device=compression_device,
                 )

-                # update state dict
+                # update state dict
                 for key, value in compressed_values.items():
-
-                    compressed_dict[full_name] = value.to(compression_device)
-                    compressed_param_names.add(full_name)
+                    compressed_dict[prefix + key] = value.to(compression_device)

             else:
-                #
-                if name
+                # omit saving zero points for symmetric or packed quantization
+                if name.endswith("zero_point") and self._skip_zp(name, names_to_scheme):
                     continue

-
-
-                if name.endswith("weight_zero_point"):
-                    module_path = name.rsplit(".", 1)[0]
-                    if (
-                        module_path in names_to_scheme
-                        and names_to_scheme[module_path].weights.symmetric
-                    ):
-                        continue
-                    # Call compress_zp if available (for PackedQuantizationCompressor)
-                    if module_path in names_to_scheme and hasattr(self, "compress_zp"):
-                        value = self.compress_zp(
-                            value, names_to_scheme[module_path].weights
-                        )
-                        if value is None:
-                            continue
+                if name.endswith("weight_scale") and self._skip_scale():
+                    continue

                 compressed_dict[name] = value.to(compression_device)

         return compressed_dict

+    def _skip_scale(self):
+        from compressed_tensors.compressors import NVFP4PackedCompressor
+
+        return isinstance(self, NVFP4PackedCompressor)
+
+    def _skip_zp(
+        self, name: str, names_to_scheme: Dict[str, QuantizationScheme]
+    ) -> bool:
+        from compressed_tensors.compressors import PackedQuantizationCompressor
+
+        module_name, zp_name = name.rsplit(".", 1) if "." in name else ("", name)
+        scheme = names_to_scheme[module_name]
+
+        if zp_name == "weight_zero_point":
+            args = scheme.weights
+        if zp_name == "input_zero_point":
+            args = scheme.input_activations
+        if zp_name == "output_zero_point":
+            args = scheme.output_activations
+
+        symmetric = args.symmetric
+        packable_strategies = [
+            QuantizationStrategy.GROUP.value,
+            QuantizationStrategy.CHANNEL.value,
+        ]
+        packed = (
+            isinstance(self, PackedQuantizationCompressor)
+            and args.strategy in packable_strategies
+        )
+
+        return symmetric or packed
+
     def decompress(
         self,
         path_to_model_or_tensors: Union[str, Path, Dict[str, Any]],
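The _skip_zp rule added above can be restated with stand-in types. This hedged sketch mirrors only its predicate, not the library's actual classes: a zero point is dropped from the checkpoint when quantization is symmetric (it is always zero) or when a packed compressor with a group/channel strategy already emits it packed alongside the weight.

from dataclasses import dataclass

@dataclass
class Args:
    symmetric: bool
    strategy: str  # "group", "channel", "tensor", ...

def skip_zero_point(args: Args, packed_compressor: bool) -> bool:
    # mirrors: return symmetric or packed
    packable = args.strategy in ("group", "channel")
    return args.symmetric or (packed_compressor and packable)

assert skip_zero_point(Args(symmetric=True, strategy="tensor"), packed_compressor=False)
assert skip_zero_point(Args(symmetric=False, strategy="group"), packed_compressor=True)
assert not skip_zero_point(Args(symmetric=False, strategy="group"), packed_compressor=False)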
{compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/compressors/quantized_compressors/fp4_quantized.py
@@ -56,6 +56,7 @@ class NVFP4PackedCompressor(BaseQuantizationCompressor):
         return (
             "weight_packed",
             "weight_scale",
+            "weight_zero_point",
             "weight_global_scale",
         )
 
@@ -72,12 +73,13 @@ class NVFP4PackedCompressor(BaseQuantizationCompressor):
         :param quantization_args: quantization parameters for the weight
         :return: dictionary mapping compressed parameter names to shape and dtype
         """
-        return {
+        output = {
             "weight_packed": (
                 torch.Size((weight_shape[0], weight_shape[1] // 2)),
                 torch.uint8,
             ),
         }
+        return output

     def compress_scale(
         self,
@@ -112,13 +114,6 @@ class NVFP4PackedCompressor(BaseQuantizationCompressor):
         compressed_dict["weight_scale"] = self.compress_scale(
             scale=scale, quantization_args=quantization_args
         )
-
-        if global_scale is None:
-            raise ValueError(
-                "NVFP4 quantization requires global_scale (TENSOR_GROUP strategy). "
-                "Use TENSOR_GROUP strategy instead of GROUP for FP4 quantization."
-            )
-
         return compressed_dict

     def decompress_weight(
@@ -132,12 +127,6 @@ class NVFP4PackedCompressor(BaseQuantizationCompressor):
         m, n = weight.shape
         # TODO: use a user provided dequant dtype
         unpacked = unpack_fp4_from_uint8(weight, m, n * 2)
-
-        # cast scale dtype to match unpacked dtype for dequantization
-        if scale.dtype != unpacked.dtype:
-            scale = scale.to(unpacked.dtype)
-            compressed_data["weight_scale"] = scale
-
         decompressed_weight = dequantize(
             x_q=unpacked, scale=scale, global_scale=global_scale, dtype=unpacked.dtype
         )
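unpack_fp4_from_uint8 above recovers two FP4 (E2M1) values per byte, which is why n doubles in the n * 2 call. The following is a hedged stand-in sketch, not the library's implementation; in particular the low-nibble-first byte order is an assumption for illustration.

import torch

# the eight positive E2M1 magnitudes; bit 3 of each 4-bit code is the sign
E2M1 = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])

def unpack_fp4(packed: torch.Tensor, m: int, n: int) -> torch.Tensor:
    lo = (packed & 0xF).long()               # first code in each byte (assumed order)
    hi = ((packed >> 4) & 0xF).long()        # second code
    codes = torch.stack([lo, hi], dim=-1).reshape(m, n)
    sign = 1.0 - 2.0 * (codes >> 3).float()  # sign bit -> +1 / -1
    return sign * E2M1[codes & 0x7]

packed = torch.randint(0, 256, (4, 8), dtype=torch.uint8)
print(unpack_fp4(packed, 4, 16).shape)  # torch.Size([4, 16])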
{compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215}/src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py
@@ -64,34 +64,25 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
         """
         pack_factor = 32 // quantization_args.num_bits
         packed_size = math.ceil(weight_shape[1] / pack_factor)
+        packed_size_zp = math.ceil(weight_shape[0] / pack_factor)
         output = {
             "weight_packed": (torch.Size((weight_shape[0], packed_size)), torch.int32),
             "weight_shape": (torch.Size((2,)), torch.int32),
         }
-
-        # Add weight_scale - always needed for quantization
-        if quantization_args.strategy in [
+        if not quantization_args.symmetric and quantization_args.strategy in [
             QuantizationStrategy.GROUP.value,
             QuantizationStrategy.CHANNEL.value,
         ]:
-            shape_factor = (
+            zp_factor = (
                 quantization_args.group_size
                 if quantization_args.strategy == QuantizationStrategy.GROUP.value
                 else weight_shape[-1]
             )
-            scale_cols = math.ceil(weight_shape[-1] / shape_factor)
-            output["weight_scale"] = (
-                torch.Size((weight_shape[0], scale_cols)),
-                quantization_args.scale_dtype,
-            )
-
-            # Add weight_zero_point for asymmetric quantization
-            if not quantization_args.symmetric:
-                output["weight_zero_point"] = (
-                    torch.Size((math.ceil(weight_shape[0] / pack_factor), scale_cols)),
-                    torch.int32,
-                )

+            output["weight_zero_point"] = (
+                torch.Size((packed_size_zp, weight_shape[-1] // zp_factor)),
+                torch.int32,
+            )
         return output

     def compress_weight(
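Plugging assumed example numbers into the compression_param_info shapes above (a 2048x2048 linear layer, asymmetric int4, group_size=128; the concrete figures are illustration, not library output):

import math
import torch

num_bits, rows, cols, group_size = 4, 2048, 2048, 128
pack_factor = 32 // num_bits                    # 8 values per int32
packed_size = math.ceil(cols / pack_factor)     # 256
packed_size_zp = math.ceil(rows / pack_factor)  # 256
zp_cols = cols // group_size                    # 16, one zero point per group
print(torch.Size((rows, packed_size)))          # weight_packed: [2048, 256], int32
print(torch.Size((packed_size_zp, zp_cols)))    # weight_zero_point: [256, 16], int32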
@@ -184,8 +175,6 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
             zero_point = unpack_from_int32(
                 zero_point, num_bits, original_zp_shape, packed_dim=0
             )
-            # Update the compressed_data dict with the unpacked zero_point
-            compressed_data["weight_zero_point"] = zero_point

         decompressed_weight = dequantize(
             x_q=unpacked, scale=scale, zero_point=zero_point, g_idx=g_idx
@@ -193,20 +182,6 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):

         return decompressed_weight

-    def compress_zp(
-        self, zero_point: Tensor, quantization_args: Optional[QuantizationArgs] = None
-    ) -> Optional[Tensor]:
-        if zero_point is None or quantization_args.symmetric:
-            return None
-        if zero_point.dtype == torch.int32:
-            return zero_point
-        if quantization_args.strategy in [
-            QuantizationStrategy.GROUP.value,
-            QuantizationStrategy.CHANNEL.value,
-        ]:
-            return pack_to_int32(zero_point, quantization_args.num_bits, packed_dim=0)
-        return zero_point
-

 def pack_to_int32(
     value: torch.Tensor,
@@ -251,9 +226,6 @@ def pack_to_int32(
     if packed_dim == 0:
         value = value.transpose(0, 1)

-    # Ensure contiguous memory for .view() operation
-    value = value.contiguous()
-
     rows, cols = value.shape
     padded_cols = math.ceil(cols / pack_factor) * pack_factor
     pad_len = padded_cols - cols
{compressed_tensors-0.12.3a20251212 → compressed_tensors-0.12.3a20251215/src/compressed_tensors.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.12.3a20251212
+Version: 0.12.3a20251215
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/vllm-project/compressed-tensors
 Author: Neuralmagic, Inc.