compressed-tensors 0.13.1a20260123__tar.gz → 0.13.1a20260130__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/.github/actions/test/action.yml +1 -1
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/.github/workflows/test-check.yaml +2 -2
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/Makefile +1 -1
- {compressed_tensors-0.13.1a20260123/src/compressed_tensors.egg-info → compressed_tensors-0.13.1a20260130}/PKG-INFO +4 -2
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/setup.py +2 -2
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/linear/compressed_linear.py +0 -6
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/__init__.py +7 -6
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/cache/base.py +3 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/cache/device.py +2 -2
- compressed_tensors-0.13.1a20260130/src/compressed_tensors/offload/cache/dist_cpu.py +53 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/dispatch.py +1 -1
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/lifecycle/apply.py +6 -9
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/lifecycle/forward.py +18 -19
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/lifecycle/initialize.py +7 -7
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/quant_args.py +29 -26
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/quant_config.py +12 -12
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/quant_scheme.py +6 -12
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/utils/helpers.py +13 -11
- compressed_tensors-0.13.1a20260130/src/compressed_tensors/transform/apply.py +36 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/factory/base.py +3 -11
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/helpers.py +9 -18
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/match.py +20 -21
- compressed_tensors-0.13.1a20260130/src/compressed_tensors/utils/offload.py +195 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/safetensors_load.py +12 -12
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/version.py +1 -1
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130/src/compressed_tensors.egg-info}/PKG-INFO +4 -2
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors.egg-info/SOURCES.txt +5 -1
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors.egg-info/requires.txt +3 -1
- compressed_tensors-0.13.1a20260130/tests/test_modeling/test_deepseekv3_kvcache_quant.py +100 -0
- compressed_tensors-0.13.1a20260123/tests/test_offload/cache/test_cpu.py → compressed_tensors-0.13.1a20260130/tests/test_offload/cache/helpers.py +30 -49
- compressed_tensors-0.13.1a20260130/tests/test_offload/cache/test_cpu.py +80 -0
- compressed_tensors-0.13.1a20260130/tests/test_offload/cache/test_dist_cpu.py +139 -0
- compressed_tensors-0.13.1a20260130/tests/test_offload/conftest.py +76 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/test_apply.py +2 -12
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/test_initialize.py +4 -5
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_transform/factory/test_correctness.py +6 -15
- compressed_tensors-0.13.1a20260130/tests/test_transform/factory/test_memory.py +74 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_transform/factory/test_serialization.py +8 -16
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_utils/test_match.py +28 -34
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/testing_utils.py +30 -22
- compressed_tensors-0.13.1a20260123/src/compressed_tensors/transform/apply.py +0 -71
- compressed_tensors-0.13.1a20260123/src/compressed_tensors/utils/offload.py +0 -672
- compressed_tensors-0.13.1a20260123/tests/test_transform/factory/test_memory.py +0 -92
- compressed_tensors-0.13.1a20260123/tests/test_utils/test_offload.py +0 -540
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/.github/.gitkeep +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/.github/mergify.yml +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/.github/scripts/step-status +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/.github/workflows/quality-check.yaml +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/.github/workflows/stale.yml +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/.gitignore +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/LICENSE +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/README.md +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/examples/bit_packing/ex_quantize_and_pack.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/examples/bit_packing/int4_config.json +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/examples/bitmask_compression.ipynb +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/examples/llama_1.1b/ex_config_quantization.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/examples/llama_1.1b/ex_llmcompressor_quantization.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/examples/llama_1.1b/example_quant_config.json +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/examples/llama_1.1b/example_quant_recipe.yaml +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/examples/quantize_and_pack_int4.ipynb +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/pyproject.toml +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/setup.cfg +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/README.md +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/base.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/base.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/helpers.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/model_compressors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/model_compressors/model_compressor.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/quantized_compressors/base.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/quantized_compressors/fp4_quantized.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/quantized_compressors/naive_quantized.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/sparse_compressors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/sparse_compressors/base.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/sparse_compressors/dense.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/sparse_compressors/sparse_bitmask.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/sparse_quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/sparse_quantized_compressors/marlin_24.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/config/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/config/base.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/config/dense.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/config/format.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/config/sparse_24_bitmask.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/config/sparse_bitmask.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/linear/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/logger.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/modeling/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/modeling/attention.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/modeling/kvcache.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/cache/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/cache/cpu.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/module.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/utils.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/lifecycle/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/lifecycle/compressed.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/lifecycle/helpers.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/quant_metadata.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/utils/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/utils/mxfp4_utils.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/registry/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/registry/registry.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/factory/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/factory/hadamard.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/factory/matrix_multiply.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/factory/random_hadamard.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/transform_args.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/transform_config.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/transform_scheme.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/utils/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/utils/hadamard.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/utils/hadamards.safetensors +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/utils/matrix.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/binary_search.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/internal.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/permutations_24.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/semi_structured_conversions.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/type.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors.egg-info/dependency_links.txt +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors.egg-info/top_level.txt +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/conftest.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/mock_observer.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/model_compressors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/model_compressors/test_model_compressor.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/quantized_compressors/test_fp4_quant.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/quantized_compressors/test_fp8_quant.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/quantized_compressors/test_int_quant.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/quantized_compressors/test_pack_quant.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/quantized_compressors/test_packed_asym_decompression.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/sparse_compressors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/sparse_compressors/test_bitmask.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/sparse_compressors/test_sparse_24_bitmask.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/sparse_quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/sparse_quantized_compressors/test_marlin_24.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_configs/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_configs/test_base.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_configs/test_infer_quant.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_examples/test_bitmask_compression_ipynb.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_linear/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_linear/test_compressed_linear.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_modeling/test_attention_and_cache.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_offload/test_dispatch.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_offload/test_interface.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_offload/test_module.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/conftest.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/test_dynamic_lifecycle.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/test_enabled.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/test_forward.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/test_lifecycle.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/test_static_lifecycle.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/test_configs/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/test_configs/test_bit_depths.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/test_configs/test_strategies.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/test_quant_args.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/test_quant_config.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/test_quant_scheme.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/test_utils/test_helpers.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/test_utils/test_mxfp4_utils.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_registry.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_transform/conftest.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_transform/test_transform_args.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_transform/test_transform_config.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_transform/test_transform_scheme.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_transform/utils/test_hadamard.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_utils/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_utils/test_helpers.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_utils/test_safetensors_load.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_utils/test_type.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/utils/copyright.py +0 -0
{compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/.github/workflows/test-check.yaml

```diff
@@ -12,7 +12,7 @@ on:
 
 jobs:
   python-tests:
-    runs-on:
+    runs-on: gcp-k8s-vllm-l4-duo
     env:
       HF_TOKEN: ${{ secrets.HF_RED_HAT_READ_ONLY }}
     steps:
@@ -30,7 +30,7 @@ jobs:
       - name: Set Env
        run: pip3 install --upgrade pip setuptools
      - name: "⚙️ Install dependencies"
-        run: pip3 install .[dev
+        run: pip3 install .[dev]
      - name: clean up
        run: |
          echo "cleaning up disk space as GHA runner has limited disk size."
```
{compressed_tensors-0.13.1a20260123/src/compressed_tensors.egg-info → compressed_tensors-0.13.1a20260130}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.13.1a20260123
+Version: 0.13.1a20260130
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/vllm-project/compressed-tensors
 Author: Neuralmagic, Inc.
@@ -9,7 +9,7 @@ License: Apache 2.0
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: torch<=2.9.1,>=1.7.0
-Requires-Dist: transformers
+Requires-Dist: transformers<5.0.0
 Requires-Dist: pydantic>=2.0
 Requires-Dist: loguru
 Provides-Extra: dev
@@ -19,6 +19,8 @@ Requires-Dist: wheel>=0.36.2; extra == "dev"
 Requires-Dist: flake8>=3.8.3; extra == "dev"
 Requires-Dist: pytest>=6.0.0; extra == "dev"
 Requires-Dist: nbconvert>=7.16.3; extra == "dev"
+Requires-Dist: transformers<5.0; extra == "dev"
+Requires-Dist: accelerate; extra == "dev"
 Provides-Extra: accelerate
 Requires-Dist: accelerate; extra == "accelerate"
 Dynamic: author
```
{compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/setup.py

```diff
@@ -88,11 +88,11 @@ def _setup_packages() -> List:
     )
 
 def _setup_install_requires() -> List:
-    return ["torch>=1.7.0,<=2.9.1", "transformers", "pydantic>=2.0", "loguru"]
+    return ["torch>=1.7.0,<=2.9.1", "transformers<5.0.0", "pydantic>=2.0", "loguru"]
 
 def _setup_extras() -> Dict:
     return {
-        "dev": ["black==22.12.0", "isort==5.8.0", "wheel>=0.36.2", "flake8>=3.8.3", "pytest>=6.0.0", "nbconvert>=7.16.3"],
+        "dev": ["black==22.12.0", "isort==5.8.0", "wheel>=0.36.2", "flake8>=3.8.3", "pytest>=6.0.0", "nbconvert>=7.16.3", "transformers<5.0", "accelerate"],
         "accelerate": ["accelerate"]
     }
 
```
{compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/linear/compressed_linear.py

```diff
@@ -87,12 +87,6 @@ class CompressedLinear(Linear):
         # mark module as compressed
         module.quantization_status = QuantizationStatus.COMPRESSED
 
-        # handles case where forward is wrapped in new_forward by accelerate hooks
-        if hasattr(module, "_old_forward"):
-            module._old_forward = CompressedLinear.forward.__get__(
-                module, CompressedLinear
-            )
-
         return module
 
     def forward(self, input: Tensor) -> Tensor:
```
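The block removed above relied on Python's descriptor protocol to re-bind a method on a live instance (accelerate hooks stash the original forward as `_old_forward`). A minimal sketch of that binding pattern, using hypothetical classes rather than the library's own:

```python
class Base:
    def forward(self):
        return "base"


class Patched(Base):
    def forward(self):
        return "patched"


obj = Base()
# bind Patched.forward to obj, as CompressedLinear.forward.__get__(module, CompressedLinear) did
obj.forward = Patched.forward.__get__(obj, Patched)
assert obj.forward() == "patched"
```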
{compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/__init__.py

```diff
@@ -135,9 +135,7 @@ def register_offload_module(base: torch.nn.Module, name: str, module: torch.nn.M
     """
     cache = base._parameters
     if isinstance(cache, OffloadCache):
-        offload_module(
-            module, cache.onload_device, cache.offload_device, no_split=False
-        )
+        offload_module(module, cache.onload_device, cache.offload_device)
 
     base.register_module(name, module)
 
@@ -178,9 +176,12 @@ def align_module_device(
     if isinstance(module._parameters, OffloadCache):
         assert isinstance(module._buffers, OffloadCache)
         with module._parameters.disable_offloading():
-
-
-
+            if execution_device is not None:
+                with patch_attr(
+                    module._parameters, "onload_device", execution_device
+                ), patch_attr(module._buffers, "onload_device", execution_device):
+                    yield
+            else:
                 yield
 
     else:
```
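`align_module_device` now retargets the cache's `onload_device` with `patch_attr`. A minimal sketch of a `patch_attr`-style helper, assuming restore-on-exit semantics; the library's own implementation may differ in details such as missing-attribute handling:

```python
from contextlib import contextmanager


@contextmanager
def patch_attr(obj, name, value):
    """Temporarily override obj.<name>, restoring the previous value on exit."""
    sentinel = object()
    old = getattr(obj, name, sentinel)
    setattr(obj, name, value)
    try:
        yield
    finally:
        if old is sentinel:
            delattr(obj, name)
        else:
            setattr(obj, name, old)
```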
{compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/cache/base.py

```diff
@@ -67,6 +67,7 @@ class OffloadCache(MutableMapping, ABC):
     """
     from compressed_tensors.offload.cache.cpu import CPUCache
     from compressed_tensors.offload.cache.device import DeviceCache
+    from compressed_tensors.offload.cache.dist_cpu import DistributedCPUCache
 
     device_type = torch.device(device).type if device != "disk" else "disk"
     distributed = dist.is_available() and dist.is_initialized()
@@ -74,6 +75,8 @@ class OffloadCache(MutableMapping, ABC):
     match (device_type, distributed):
         case ("cpu", False):
             return CPUCache
+        case ("cpu", True):
+            return DistributedCPUCache
         case ("cuda", False):
             return DeviceCache
         case _:
```
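For reference, a standalone paraphrase of the dispatch logic above; class names are returned as strings purely for illustration:

```python
import torch
import torch.distributed as dist


def cache_cls_name(device) -> str:
    device_type = torch.device(device).type if device != "disk" else "disk"
    distributed = dist.is_available() and dist.is_initialized()
    match (device_type, distributed):
        case ("cpu", False):
            return "CPUCache"
        case ("cpu", True):
            return "DistributedCPUCache"  # the newly handled combination
        case ("cuda", False):
            return "DeviceCache"
        case _:
            raise NotImplementedError(f"{device_type=}, {distributed=}")


# without torch.distributed initialized, cpu offloading picks the plain cache
assert cache_cls_name("cpu") == "CPUCache"
```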
{compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/cache/device.py

```diff
@@ -35,8 +35,8 @@ class DeviceCache(OffloadCache):
         :param key: cpu tensor to onload
         :return: device tensor
         """
-
-        return offloaded
+        # move because onload_device might be modified after init
+        return send_tensors(offloaded, device=self.onload_device, copy=False)
 
     def offload(self, tensor: torch.Tensor | None) -> torch.Tensor:
         """
```
compressed_tensors-0.13.1a20260130/src/compressed_tensors/offload/cache/dist_cpu.py

```diff
@@ -0,0 +1,53 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.distributed as dist
+from compressed_tensors.offload.cache.cpu import CPUCache
+
+
+class DistributedCPUCache(CPUCache):
+    """
+    Handles offloading and onloading tensors from/to cpu memory shared across processes
+    """
+
+    offload_device = torch.device("cpu")
+
+    def offload(self, tensor: torch.Tensor | None) -> torch.Tensor:
+        if tensor is None:
+            return None
+
+        # slight runtime cost for views
+        tensor = tensor.contiguous()
+
+        if dist.get_rank() == 0:
+            # create shared memory cpu tensor
+            tensor = super().offload(tensor).share_memory_()
+            (handle, filename, nbytes) = tensor.untyped_storage()._share_filename_cpu_()
+            broadcast_obj = [handle, filename, nbytes]
+        else:
+            broadcast_obj = [None, None, None]
+
+        # receive shared memory file handle
+        dist.broadcast_object_list(broadcast_obj, src=0)
+
+        if dist.get_rank() != 0:
+            # reconstruct tensor from shared memory file handle
+            tensor = torch.empty_like(tensor, device=self.offload_device)
+            tensor.set_(torch.UntypedStorage._new_shared_filename_cpu(*broadcast_obj))
+
+        # ensure that rank 0 does not garbage collect before other ranks reconstruct
+        dist.barrier()
+
+        return tensor
```
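The new cache shares one CPU copy of each offloaded tensor across ranks: rank 0 moves the tensor into file-backed shared memory and broadcasts a storage handle, and the other ranks rebuild a view onto the same bytes. A single-process sketch of that round trip, using the same private storage APIs the class itself calls (across real processes a compatible sharing strategy is also required):

```python
import torch

# "rank 0" side: move into shared memory and export a handle
src = torch.arange(4, dtype=torch.float32).share_memory_()
handle, filename, nbytes = src.untyped_storage()._share_filename_cpu_()

# "other rank" side (simulated here in-process): rebuild over the same storage
dst = torch.empty_like(src)
dst.set_(torch.UntypedStorage._new_shared_filename_cpu(handle, filename, nbytes))

assert torch.equal(src, dst)  # identical contents
dst[0] = 42.0
assert src[0] == 42.0         # same underlying shared storage, zero-copy
```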
{compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/dispatch.py

```diff
@@ -39,7 +39,7 @@ ModelType = TypeVar("ModelType", bound=torch.nn.Module)
 def offload_model(
     model: ModelType,
     onload_device: torch.device | str,
-    offload_device:
+    offload_device: torch.device | str | Literal["disk"] = torch.device("cpu"),
 ) -> ModelType:
     """
     Offload a model to the `offload_device`. During forward passes, model weights will
```
{compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/lifecycle/apply.py

```diff
@@ -14,9 +14,6 @@
 
 from collections import OrderedDict
 from copy import deepcopy
-from typing import Dict, List, Optional
-from typing import OrderedDict as OrderedDictType
-from typing import Union
 
 import torch
 from compressed_tensors.config import CompressionFormat
@@ -60,8 +57,8 @@ from compressed_tensors.utils.safetensors_load import (
 
 def load_pretrained_quantization_parameters(
     model: Module,
-    model_name_or_path:
-    load_weight_qparams:
+    model_name_or_path: str | None = None,
+    load_weight_qparams: bool = False,
 ):
     """
     Loads the quantization parameters (scale and zero point) from model_name_or_path to
@@ -110,7 +107,7 @@ def load_pretrained_quantization_parameters(
 
 
 def apply_quantization_config(
-    model: Module, config:
+    model: Module, config: QuantizationConfig | None, run_compressed: bool = False
 ):
     """
     Initializes the model for quantization in-place based on the given config.
@@ -207,7 +204,7 @@ def _apply_kv_cache_scheme(
 
 
 def _load_quant_args_from_mapping(
-    base_name: str, module_name: str, module: Module, mapping:
+    base_name: str, module_name: str, module: Module, mapping: dict
 ):
     # TODO: skip update and just register here, don't do it in initialize
     """
@@ -251,8 +248,8 @@ def _load_quant_args_from_mapping(
 
 
 def _scheme_from_targets(
-    target_to_scheme:
-    targets:
+    target_to_scheme: OrderedDict[str, QuantizationScheme],
+    targets: list[str],
     name: str,
 ) -> QuantizationScheme:
     # return the first scheme (the prioritized one,
```
{compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/lifecycle/forward.py

```diff
@@ -14,7 +14,6 @@
 
 from functools import wraps
 from math import ceil
-from typing import Optional
 
 import torch
 from compressed_tensors.quantization.quant_args import (
@@ -47,9 +46,9 @@ def quantize(
     scale: torch.Tensor,
     zero_point: torch.Tensor,
     args: QuantizationArgs,
-    dtype:
-    g_idx:
-    global_scale:
+    dtype: torch.dtype | None = None,
+    g_idx: torch.Tensor | None = None,
+    global_scale: torch.Tensor | None = None,
 ) -> torch.Tensor:
     """
     Quantize the input tensor x using the QuantizationStrategy specified in args.
@@ -85,11 +84,11 @@ def quantize(
 def dequantize(
     x_q: torch.Tensor,
     scale: torch.Tensor,
-    zero_point:
-    args:
-    dtype:
-    g_idx:
-    global_scale:
+    zero_point: torch.Tensor | None = None,
+    args: QuantizationArgs | None = None,
+    dtype: torch.dtype | None = None,
+    g_idx: torch.Tensor | None = None,
+    global_scale: torch.Tensor | None = None,
 ) -> torch.Tensor:
     """
     Dequantize a quantized input tensor x_q based on the strategy specified in args. If
@@ -159,8 +158,8 @@ def fake_quantize(
     scale: torch.Tensor,
     zero_point: torch.Tensor,
     args: QuantizationArgs,
-    g_idx:
-    global_scale:
+    g_idx: torch.Tensor | None = None,
+    global_scale: torch.Tensor | None = None,
 ) -> torch.Tensor:
     """
     Fake quantize the input tensor x by quantizing then dequantizing with
@@ -195,11 +194,11 @@ def _process_quantization(
     scale: torch.Tensor,
     zero_point: torch.Tensor,
     args: QuantizationArgs,
-    g_idx:
-    dtype:
+    g_idx: torch.Tensor | None = None,
+    dtype: torch.dtype | None = None,
     do_quantize: bool = True,
     do_dequantize: bool = True,
-    global_scale:
+    global_scale: torch.Tensor | None = None,
 ) -> torch.Tensor:
     q_min, q_max = calculate_range(args, x.device)
     group_size = args.group_size
@@ -457,8 +456,8 @@ def _quantize(
     q_min: torch.Tensor,
     q_max: torch.Tensor,
     args: QuantizationArgs,
-    dtype:
-    global_scale:
+    dtype: torch.dtype | None = None,
+    global_scale: torch.Tensor | None = None,
 ) -> torch.Tensor:
 
     # if a global scale is optionally provided, use it
@@ -486,9 +485,9 @@ def _quantize(
 def _dequantize(
     x_q: torch.Tensor,
     scale: torch.Tensor,
-    zero_point: torch.Tensor = None,
-    dtype:
-    global_scale:
+    zero_point: torch.Tensor | None = None,
+    dtype: torch.dtype | None = None,
+    global_scale: torch.Tensor | None = None,
 ) -> torch.Tensor:
 
     # if a global scale is optionally provided, use it
```
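The signatures above only modernize the optional parameters to PEP 604 unions with explicit `None` defaults. A hedged usage sketch of a per-tensor int8 round trip; the scalar min-max scale and the `strategy="tensor"` argument are illustrative assumptions, not taken from the diff:

```python
import torch
from compressed_tensors.quantization import QuantizationArgs
from compressed_tensors.quantization.lifecycle.forward import dequantize, quantize

x = torch.randn(4, 8)
args = QuantizationArgs(num_bits=8, type="int", symmetric=True, strategy="tensor")

scale = (x.abs().amax() / 127.0).reshape(1, 1)
zero_point = torch.zeros_like(scale, dtype=torch.int8)

x_q = quantize(x, scale, zero_point, args)       # dtype/g_idx/global_scale default to None
x_dq = dequantize(x_q, scale, zero_point, args)  # optional args are now `... | None = None`
assert torch.allclose(x, x_dq, atol=float(scale))
```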
{compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/lifecycle/initialize.py

```diff
@@ -14,7 +14,6 @@
 
 
 import logging
-from typing import Optional, Tuple, Union
 
 import torch
 from compressed_tensors.modeling import (
@@ -23,6 +22,7 @@ from compressed_tensors.modeling import (
     QuantizedAttentionImpl,
     QuantizedKVCache,
 )
+from compressed_tensors.offload import unwrap_offload_forward
 from compressed_tensors.quantization import (
     ActivationOrdering,
     DynamicType,
@@ -37,7 +37,6 @@ from compressed_tensors.quantization.lifecycle.forward import (
 )
 from compressed_tensors.quantization.utils import strategy_cdiv
 from compressed_tensors.utils import (
-    disable_hf_hook,
     get_execution_device,
     get_head_dim,
     get_num_attn_heads,
@@ -60,7 +59,7 @@ _LOGGER = logging.getLogger(__name__)
 
 def initialize_module_for_quantization(
     module: Module,
-    scheme:
+    scheme: QuantizationScheme | None = None,
     force_zero_point: bool = True,
 ):
     """
@@ -134,7 +133,7 @@ def initialize_module_for_quantization(
         force_zero_point=force_zero_point,
     )
 
-    with
+    with unwrap_offload_forward(module):
         # wrap forward call of module to perform
         # quantized actions based on calltime status
         wrap_module_forward_quantized(module, scheme)
@@ -148,6 +147,7 @@ def is_attention_module(module: Module):
         hasattr(module, "k_proj")
         or hasattr(module, "v_proj")
         or hasattr(module, "qkv_proj")
+        or hasattr(module, "kv_b_proj")
     )
 
 
@@ -155,7 +155,7 @@ def initialize_qparams(
     module: Module,
     base_name: str,
     quantization_args: QuantizationArgs,
-    observed_shape:
+    observed_shape: tuple[int | None, ...],
     observed_dtype: torch.dtype,
     force_zero_point: bool = True,
 ):
@@ -279,8 +279,8 @@ def initialize_attn_qparams(
 ):
     """Initlaize k_scale, v_scale for self_attn"""
 
-    impl:
-    kv_cache:
+    impl: QuantizedAttentionImpl | None = getattr(module, IMPL_ATTR, None)
+    kv_cache: QuantizedKVCache | None = getattr(module, KV_CACHE_ATTR, None)
 
     if impl is None and kv_cache is None:
         raise ValueError(
```
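The `is_attention_module` hunk broadens attention detection to modules exposing `kv_b_proj`, as used by DeepSeek-V3-style latent attention (exercised by the new tests/test_modeling/test_deepseekv3_kvcache_quant.py). A sketch using only the `hasattr` chain shown in the hunk; any surrounding conditions in the real helper are omitted:

```python
import torch


def looks_like_attention(module: torch.nn.Module) -> bool:
    # only the projection-attribute check from the hunk above
    return (
        hasattr(module, "k_proj")
        or hasattr(module, "v_proj")
        or hasattr(module, "qkv_proj")
        or hasattr(module, "kv_b_proj")
    )


class MLAAttention(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # exposes kv_b_proj instead of separate k_proj/v_proj
        self.kv_b_proj = torch.nn.Linear(64, 64)


assert looks_like_attention(MLAAttention())
```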
{compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/quant_args.py

```diff
@@ -14,7 +14,7 @@
 
 import warnings
 from enum import Enum
-from typing import Any
+from typing import Any
 
 import torch
 from compressed_tensors.utils import Aliasable
@@ -48,10 +48,10 @@ __all__ = [
 class FloatArgs:
     exponent: int
     mantissa: int
-    bits:
-    max:
-    min:
-    dtype:
+    bits: int | None = None
+    max: float | None = None
+    min: float | None = None
+    dtype: torch.dtype | None = None
 
 
 class FP4_E2M1_DATA(FloatArgs):
@@ -147,7 +147,7 @@ class ActivationOrdering(Aliasable, str, Enum):
     STATIC = "static"
 
     @staticmethod
-    def get_aliases() ->
+    def get_aliases() -> dict[str, str]:
         return {
             "dynamic": "group",
             "static": "weight",
@@ -178,21 +178,21 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
     num_bits: int = 8
     type: QuantizationType = QuantizationType.INT
     symmetric: bool = True
-    group_size:
-    strategy:
-    block_structure:
-    dynamic:
-    actorder:
-    scale_dtype:
-    zp_dtype:
-    observer:
+    group_size: int | None = None
+    strategy: QuantizationStrategy | None = None
+    block_structure: list[int] | None = None
+    dynamic: DynamicType | bool = False
+    actorder: ActivationOrdering | bool | None = None
+    scale_dtype: TorchDtype | None = None
+    zp_dtype: TorchDtype | None = None
+    observer: str | None = Field(
         default=None,
         description=(
             "Determines the method of computing quantization parameters (scales and "
             "zero-points). Defaults to min-max when not using dynamic quantization"
         ),
     )
-    observer_kwargs:
+    observer_kwargs: dict[str, Any] = Field(
         default_factory=dict,
         description=(
             "optional dict of kwargs to be passed directly to torch quantization "
@@ -214,7 +214,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         return value
 
     @field_validator("group_size", mode="before")
-    def validate_group(cls, value) ->
+    def validate_group(cls, value) -> int | None:
         if value is None:
             return value
 
@@ -227,7 +227,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         return value
 
     @field_validator("block_structure", mode="before")
-    def validate_block_structure(cls, value) ->
+    def validate_block_structure(cls, value) -> list[int] | None:
         if value is None:
             return value
         # For backward compatibility, allow string format "2x4", "8x16", etc.
@@ -251,14 +251,14 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
     )
 
     @field_validator("strategy", mode="before")
-    def validate_strategy(cls, value) ->
+    def validate_strategy(cls, value) -> QuantizationStrategy | None:
         if isinstance(value, str):
             return QuantizationStrategy(value.lower())
 
         return value
 
     @field_validator("actorder", mode="before")
-    def validate_actorder(cls, value) ->
+    def validate_actorder(cls, value) -> ActivationOrdering | None:
         if isinstance(value, bool):
             return ActivationOrdering.GROUP if value else None
 
@@ -268,7 +268,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         return value
 
     @field_validator("dynamic", mode="before")
-    def validate_dynamic(cls, value) ->
+    def validate_dynamic(cls, value) -> DynamicType | bool:
         if isinstance(value, str):
             return DynamicType(value.lower())
         return value
@@ -329,10 +329,13 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
             raise ValueError(f"Block structure requires block strategy\n{model}")
 
         # validate activation ordering and strategy
-        if actorder is not None and strategy
+        if actorder is not None and strategy not in (
+            QuantizationStrategy.GROUP,
+            QuantizationStrategy.TENSOR_GROUP,
+        ):
             raise ValueError(
-                "Must use group quantization strategy in
-                "activation ordering"
+                "Must use group or tensor_group quantization strategy in "
+                "order to apply activation ordering"
             )
 
         # infer observer w.r.t. dynamic
@@ -369,7 +372,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
 
         elif observer is None:
             # default to minmax for non-dynamic cases
-            observer = "
+            observer = "memoryless_minmax"
 
         if zp_dtype is None:
             if model.num_bits == 4 and model.type == QuantizationType.FLOAT:
@@ -409,7 +412,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
 def round_to_quantized_type_dtype(
     tensor: torch.Tensor,
     dtype: torch.dtype,
-    cast_to_original_dtype:
+    cast_to_original_dtype: bool = True,
 ) -> torch.Tensor:
     """
     Rounds an input tensor to the nearest quantized representation given a dtype.
@@ -439,7 +442,7 @@ def round_to_quantized_type_args(
     args: QuantizationArgs,
     min: torch.Tensor,
     max: torch.Tensor,
-    cast_to_original_dtype:
+    cast_to_original_dtype: bool = True,
 ) -> torch.Tensor:
     """
     Rounds an input tensor to the nearest quantized representation given
```
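The reworked validator admits activation ordering for both the group and tensor_group strategies and rejects everything else with the reworded message. A hedged sketch of the observable behavior (pydantic surfaces the validator's ValueError as a ValidationError, which subclasses ValueError):

```python
from compressed_tensors.quantization import QuantizationArgs

# group strategy with activation ordering: accepted
QuantizationArgs(num_bits=4, strategy="group", group_size=128, actorder=True)

# strategies outside group/tensor_group with activation ordering: rejected
try:
    QuantizationArgs(num_bits=4, strategy="channel", actorder=True)
except ValueError as err:
    print(err)  # "Must use group or tensor_group quantization strategy in order to ..."
```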
{compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/quant_config.py

```diff
@@ -13,7 +13,7 @@
 # limitations under the License.
 from collections import defaultdict
 from enum import Enum
-from typing import Annotated, Any
+from typing import Annotated, Any
 
 from compressed_tensors.config import CompressionFormat
 from compressed_tensors.quantization.quant_args import DynamicType, QuantizationArgs
@@ -55,7 +55,7 @@ class QuantizationStatus(str, Enum):
     COMPRESSED = "compressed"
 
     @classmethod
-    def lifecycle_order(cls) ->
+    def lifecycle_order(cls) -> list["QuantizationStatus"]:
         """
         :return: list of correct quantization lifecycle order
         """
@@ -131,13 +131,13 @@ class QuantizationConfig(BaseModel):
     are not quantized even if they match up with a target in config_groups
     """
 
-    config_groups:
+    config_groups: dict[str, QuantizationScheme | list[str]]
     quant_method: str = DEFAULT_QUANTIZATION_METHOD
-    kv_cache_scheme:
+    kv_cache_scheme: QuantizationArgs | None = None
     format: str = DEFAULT_QUANTIZATION_FORMAT
     quantization_status: QuantizationStatus = QuantizationStatus.INITIALIZED
-    global_compression_ratio:
-    ignore:
+    global_compression_ratio: float | None = None
+    ignore: list[str] | None = Field(default_factory=list)
     # `run_compressed` is a dummy, unused arg for backwards compatibility
     # see: https://github.com/huggingface/transformers/pull/39324
     run_compressed: Annotated[Any, Field(exclude=True)] = None
@@ -161,8 +161,8 @@ class QuantizationConfig(BaseModel):
 
     @staticmethod
     def from_pretrained(
-        model: Module, format:
-    ) ->
+        model: Module, format: str | list | None = None
+    ) -> "QuantizationConfig | None":
         """
         Converts a model into its associated QuantizationConfig based on the
         QuantizationScheme attached to each quantized module
@@ -177,21 +177,21 @@ class QuantizationConfig(BaseModel):
 
         # set of all quantization schemes
         # TODO: make quant config/scheme/args frozen/hashable and use a set
-        quantization_schemes:
+        quantization_schemes: list[QuantizationScheme] = list()
 
         # use any status from modules (in practice, use the last module)
         model_status = None
 
         # set of all quantized types
         # this is later used to create the ignore list
-        quantization_type_names:
+        quantization_type_names: set[str] = set()
 
         # maps types to names which are not quantized
         # this is later used to create the ignore list
-        ignore:
+        ignore: dict[str, list[str]] = defaultdict(list)
 
         # this keeps track of any kvcache schemes
-        kv_cache_scheme:
+        kv_cache_scheme: QuantizationArgs | None = None
 
         for name, submodule in model.named_modules():
             layer_type: str = module_type(submodule)
```