compressed-tensors 0.13.1a20260127__tar.gz → 0.13.1a20260130__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/.github/workflows/test-check.yaml +1 -1
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/Makefile +1 -1
- {compressed_tensors-0.13.1a20260127/src/compressed_tensors.egg-info → compressed_tensors-0.13.1a20260130}/PKG-INFO +1 -1
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/cache/base.py +3 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/cache/device.py +2 -2
- compressed_tensors-0.13.1a20260130/src/compressed_tensors/offload/cache/dist_cpu.py +53 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/lifecycle/apply.py +6 -9
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/lifecycle/initialize.py +5 -5
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/quant_args.py +29 -26
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/quant_config.py +12 -12
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/quant_scheme.py +6 -12
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/utils/helpers.py +13 -11
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/helpers.py +9 -18
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/match.py +20 -21
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/offload.py +3 -3
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/safetensors_load.py +12 -12
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/version.py +1 -1
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130/src/compressed_tensors.egg-info}/PKG-INFO +1 -1
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors.egg-info/SOURCES.txt +5 -0
- compressed_tensors-0.13.1a20260130/tests/test_modeling/test_deepseekv3_kvcache_quant.py +100 -0
- compressed_tensors-0.13.1a20260127/tests/test_offload/cache/test_cpu.py → compressed_tensors-0.13.1a20260130/tests/test_offload/cache/helpers.py +30 -49
- compressed_tensors-0.13.1a20260130/tests/test_offload/cache/test_cpu.py +80 -0
- compressed_tensors-0.13.1a20260130/tests/test_offload/cache/test_dist_cpu.py +139 -0
- compressed_tensors-0.13.1a20260130/tests/test_offload/conftest.py +76 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/testing_utils.py +30 -4
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/.github/.gitkeep +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/.github/actions/test/action.yml +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/.github/mergify.yml +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/.github/scripts/step-status +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/.github/workflows/quality-check.yaml +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/.github/workflows/stale.yml +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/.gitignore +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/LICENSE +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/README.md +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/examples/bit_packing/ex_quantize_and_pack.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/examples/bit_packing/int4_config.json +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/examples/bitmask_compression.ipynb +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/examples/llama_1.1b/ex_config_quantization.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/examples/llama_1.1b/ex_llmcompressor_quantization.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/examples/llama_1.1b/example_quant_config.json +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/examples/llama_1.1b/example_quant_recipe.yaml +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/examples/quantize_and_pack_int4.ipynb +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/pyproject.toml +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/setup.cfg +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/setup.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/README.md +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/base.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/base.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/helpers.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/model_compressors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/model_compressors/model_compressor.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/quantized_compressors/base.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/quantized_compressors/fp4_quantized.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/quantized_compressors/naive_quantized.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/sparse_compressors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/sparse_compressors/base.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/sparse_compressors/dense.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/sparse_compressors/sparse_bitmask.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/sparse_quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/sparse_quantized_compressors/marlin_24.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/config/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/config/base.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/config/dense.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/config/format.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/config/sparse_24_bitmask.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/config/sparse_bitmask.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/linear/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/linear/compressed_linear.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/logger.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/modeling/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/modeling/attention.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/modeling/kvcache.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/cache/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/cache/cpu.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/dispatch.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/module.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/utils.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/lifecycle/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/lifecycle/compressed.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/lifecycle/forward.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/lifecycle/helpers.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/quant_metadata.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/utils/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/utils/mxfp4_utils.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/registry/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/registry/registry.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/apply.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/factory/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/factory/base.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/factory/hadamard.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/factory/matrix_multiply.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/factory/random_hadamard.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/transform_args.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/transform_config.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/transform_scheme.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/utils/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/utils/hadamard.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/utils/hadamards.safetensors +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/utils/matrix.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/binary_search.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/internal.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/permutations_24.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/semi_structured_conversions.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/type.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors.egg-info/dependency_links.txt +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors.egg-info/requires.txt +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors.egg-info/top_level.txt +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/conftest.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/mock_observer.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/model_compressors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/model_compressors/test_model_compressor.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/quantized_compressors/test_fp4_quant.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/quantized_compressors/test_fp8_quant.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/quantized_compressors/test_int_quant.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/quantized_compressors/test_pack_quant.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/quantized_compressors/test_packed_asym_decompression.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/sparse_compressors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/sparse_compressors/test_bitmask.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/sparse_compressors/test_sparse_24_bitmask.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/sparse_quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/sparse_quantized_compressors/test_marlin_24.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_configs/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_configs/test_base.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_configs/test_infer_quant.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_examples/test_bitmask_compression_ipynb.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_linear/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_linear/test_compressed_linear.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_modeling/test_attention_and_cache.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_offload/test_dispatch.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_offload/test_interface.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_offload/test_module.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/conftest.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/test_apply.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/test_dynamic_lifecycle.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/test_enabled.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/test_forward.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/test_initialize.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/test_lifecycle.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/test_static_lifecycle.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/test_configs/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/test_configs/test_bit_depths.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/test_configs/test_strategies.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/test_quant_args.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/test_quant_config.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/test_quant_scheme.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/test_utils/test_helpers.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/test_utils/test_mxfp4_utils.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_registry.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_transform/conftest.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_transform/factory/test_correctness.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_transform/factory/test_memory.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_transform/factory/test_serialization.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_transform/test_transform_args.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_transform/test_transform_config.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_transform/test_transform_scheme.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_transform/utils/test_hadamard.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_utils/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_utils/test_helpers.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_utils/test_match.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_utils/test_safetensors_load.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_utils/test_type.py +0 -0
- {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/utils/copyright.py +0 -0
PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.13.1a20260127
+Version: 0.13.1a20260130
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/vllm-project/compressed-tensors
 Author: Neuralmagic, Inc.
```
src/compressed_tensors/offload/cache/base.py

```diff
@@ -67,6 +67,7 @@ class OffloadCache(MutableMapping, ABC):
         """
         from compressed_tensors.offload.cache.cpu import CPUCache
         from compressed_tensors.offload.cache.device import DeviceCache
+        from compressed_tensors.offload.cache.dist_cpu import DistributedCPUCache

         device_type = torch.device(device).type if device != "disk" else "disk"
         distributed = dist.is_available() and dist.is_initialized()
@@ -74,6 +75,8 @@ class OffloadCache(MutableMapping, ABC):
         match (device_type, distributed):
             case ("cpu", False):
                 return CPUCache
+            case ("cpu", True):
+                return DistributedCPUCache
             case ("cuda", False):
                 return DeviceCache
             case _:
```
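The new `("cpu", True)` arm only fires once `torch.distributed` is both available and initialized, so single-process runs keep using the plain `CPUCache`. A minimal standalone sketch of that gate, with string stand-ins for the cache classes (illustrative only, not the library's actual factory):

```python
import torch
import torch.distributed as dist

def select_cache(device: str) -> str:
    # same (device_type, distributed) dispatch as OffloadCache above
    device_type = torch.device(device).type if device != "disk" else "disk"
    distributed = dist.is_available() and dist.is_initialized()
    match (device_type, distributed):
        case ("cpu", False):
            return "CPUCache"
        case ("cpu", True):
            return "DistributedCPUCache"
        case ("cuda", False):
            return "DeviceCache"
        case _:
            raise NotImplementedError(f"No cache for {device_type=}, {distributed=}")

print(select_cache("cpu"))  # "CPUCache" unless dist.init_process_group() ran first
```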
src/compressed_tensors/offload/cache/device.py

```diff
@@ -35,8 +35,8 @@ class DeviceCache(OffloadCache):
         :param key: cpu tensor to onload
         :return: device tensor
         """
-
-        return offloaded
+        # move because onload_device might be modified after init
+        return send_tensors(offloaded, device=self.onload_device, copy=False)

     def offload(self, tensor: torch.Tensor | None) -> torch.Tensor:
         """
```
src/compressed_tensors/offload/cache/dist_cpu.py (new file)

```diff
@@ -0,0 +1,53 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.distributed as dist
+from compressed_tensors.offload.cache.cpu import CPUCache
+
+
+class DistributedCPUCache(CPUCache):
+    """
+    Handles offloading and onloading tensors from/to cpu memory shared across processes
+    """
+
+    offload_device = torch.device("cpu")
+
+    def offload(self, tensor: torch.Tensor | None) -> torch.Tensor:
+        if tensor is None:
+            return None
+
+        # slight runtime cost for views
+        tensor = tensor.contiguous()
+
+        if dist.get_rank() == 0:
+            # create shared memory cpu tensor
+            tensor = super().offload(tensor).share_memory_()
+            (handle, filename, nbytes) = tensor.untyped_storage()._share_filename_cpu_()
+            broadcast_obj = [handle, filename, nbytes]
+        else:
+            broadcast_obj = [None, None, None]
+
+        # receive shared memory file handle
+        dist.broadcast_object_list(broadcast_obj, src=0)
+
+        if dist.get_rank() != 0:
+            # reconstruct tensor from shared memory file handle
+            tensor = torch.empty_like(tensor, device=self.offload_device)
+            tensor.set_(torch.UntypedStorage._new_shared_filename_cpu(*broadcast_obj))
+
+        # ensure that rank 0 does not garbage collect before other ranks reconstruct
+        dist.barrier()
+
+        return tensor
```
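The new file leans on PyTorch's filename-based shared memory: rank 0 materializes the offloaded CPU copy, exports a `(handle, filename, nbytes)` triple for its storage, and every other rank attaches a tensor over the same memory, so offloaded weights exist once per node rather than once per rank (the new `tests/test_offload/cache/test_dist_cpu.py` exercises this). A standalone sketch of the pattern, mirroring the same private torch storage APIs the file above uses and assuming a process group (e.g. gloo) is already initialized on every rank:

```python
import torch
import torch.distributed as dist

def share_cpu_tensor(tensor: torch.Tensor) -> torch.Tensor:
    """Mirror of DistributedCPUCache.offload, minus the CPUCache bookkeeping."""
    tensor = tensor.contiguous()
    if dist.get_rank() == 0:
        # create shared-memory cpu tensor and export its storage handle
        tensor = tensor.share_memory_()
        obj = list(tensor.untyped_storage()._share_filename_cpu_())
    else:
        obj = [None, None, None]
    # every rank receives the (handle, filename, nbytes) triple from rank 0
    dist.broadcast_object_list(obj, src=0)
    if dist.get_rank() != 0:
        # reconstruct a tensor over the shared storage
        tensor = torch.empty_like(tensor, device="cpu")
        tensor.set_(torch.UntypedStorage._new_shared_filename_cpu(*obj))
    # keep rank 0's storage alive until all ranks have attached
    dist.barrier()
    return tensor
```

The trailing `dist.barrier()` matters: without it, rank 0 could drop its last reference (and the shared segment) before slower ranks call `set_`.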
src/compressed_tensors/quantization/lifecycle/apply.py

```diff
@@ -14,9 +14,6 @@

 from collections import OrderedDict
 from copy import deepcopy
-from typing import Dict, List, Optional
-from typing import OrderedDict as OrderedDictType
-from typing import Union

 import torch
 from compressed_tensors.config import CompressionFormat
@@ -60,8 +57,8 @@ from compressed_tensors.utils.safetensors_load import (

 def load_pretrained_quantization_parameters(
     model: Module,
-    model_name_or_path: Optional[str] = None,
-    load_weight_qparams: Optional[bool] = False,
+    model_name_or_path: str | None = None,
+    load_weight_qparams: bool = False,
 ):
     """
     Loads the quantization parameters (scale and zero point) from model_name_or_path to
@@ -110,7 +107,7 @@ def load_pretrained_quantization_parameters(


 def apply_quantization_config(
-    model: Module, config: Optional[QuantizationConfig], run_compressed: bool = False
+    model: Module, config: QuantizationConfig | None, run_compressed: bool = False
 ):
     """
     Initializes the model for quantization in-place based on the given config.
@@ -207,7 +204,7 @@ def _apply_kv_cache_scheme(


 def _load_quant_args_from_mapping(
-    base_name: str, module_name: str, module: Module, mapping: Dict
+    base_name: str, module_name: str, module: Module, mapping: dict
 ):
     # TODO: skip update and just register here, don't do it in initialize
     """
@@ -251,8 +248,8 @@ def _load_quant_args_from_mapping(


 def _scheme_from_targets(
-    target_to_scheme: OrderedDictType[str, QuantizationScheme],
-    targets: List[str],
+    target_to_scheme: OrderedDict[str, QuantizationScheme],
+    targets: list[str],
     name: str,
 ) -> QuantizationScheme:
     # return the first scheme (the prioritized one,
```
src/compressed_tensors/quantization/lifecycle/initialize.py

```diff
@@ -14,7 +14,6 @@


 import logging
-from typing import Optional, Tuple, Union

 import torch
 from compressed_tensors.modeling import (
@@ -60,7 +59,7 @@ _LOGGER = logging.getLogger(__name__)

 def initialize_module_for_quantization(
     module: Module,
-    scheme: Optional[QuantizationScheme] = None,
+    scheme: QuantizationScheme | None = None,
     force_zero_point: bool = True,
 ):
     """
@@ -148,6 +147,7 @@ def is_attention_module(module: Module):
         hasattr(module, "k_proj")
         or hasattr(module, "v_proj")
         or hasattr(module, "qkv_proj")
+        or hasattr(module, "kv_b_proj")
     )


@@ -155,7 +155,7 @@ def initialize_qparams(
     module: Module,
     base_name: str,
     quantization_args: QuantizationArgs,
-    observed_shape: Tuple[Optional[int], ...],
+    observed_shape: tuple[int | None, ...],
     observed_dtype: torch.dtype,
     force_zero_point: bool = True,
 ):
@@ -279,8 +279,8 @@ def initialize_attn_qparams(
 ):
     """Initlaize k_scale, v_scale for self_attn"""

-    impl: Optional[QuantizedAttentionImpl] = getattr(module, IMPL_ATTR, None)
-    kv_cache: Optional[QuantizedKVCache] = getattr(module, KV_CACHE_ATTR, None)
+    impl: QuantizedAttentionImpl | None = getattr(module, IMPL_ATTR, None)
+    kv_cache: QuantizedKVCache | None = getattr(module, KV_CACHE_ATTR, None)

     if impl is None and kv_cache is None:
         raise ValueError(
```
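The extra `hasattr(module, "kv_b_proj")` check extends attention-module detection to DeepSeek-V3-style MLA attention, which projects a compressed KV latent through `kv_b_proj` instead of exposing separate `k_proj`/`v_proj` modules (covered by the new `tests/test_modeling/test_deepseekv3_kvcache_quant.py`). A hypothetical module that only the updated predicate recognizes:

```python
import torch

# hypothetical stand-in for a DeepSeek-V3 MLA attention block
class MLAAttention(torch.nn.Module):
    def __init__(self, hidden: int = 64, kv_lora_rank: int = 16):
        super().__init__()
        self.q_proj = torch.nn.Linear(hidden, hidden)
        self.kv_b_proj = torch.nn.Linear(kv_lora_rank, hidden)  # no k_proj/v_proj

attn = MLAAttention()
projs = ("k_proj", "v_proj", "qkv_proj", "kv_b_proj")
assert any(hasattr(attn, name) for name in projs)  # True only with the new check
```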
src/compressed_tensors/quantization/quant_args.py

```diff
@@ -14,7 +14,7 @@

 import warnings
 from enum import Enum
-from typing import Any, Dict, List, Optional, Union
+from typing import Any

 import torch
 from compressed_tensors.utils import Aliasable
@@ -48,10 +48,10 @@ __all__ = [
 class FloatArgs:
     exponent: int
     mantissa: int
-    bits: Optional[int] = None
-    max: Optional[float] = None
-    min: Optional[float] = None
-    dtype: Optional[torch.dtype] = None
+    bits: int | None = None
+    max: float | None = None
+    min: float | None = None
+    dtype: torch.dtype | None = None


 class FP4_E2M1_DATA(FloatArgs):
@@ -147,7 +147,7 @@ class ActivationOrdering(Aliasable, str, Enum):
     STATIC = "static"

     @staticmethod
-    def get_aliases() -> Dict[str, str]:
+    def get_aliases() -> dict[str, str]:
         return {
             "dynamic": "group",
             "static": "weight",
@@ -178,21 +178,21 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
     num_bits: int = 8
     type: QuantizationType = QuantizationType.INT
     symmetric: bool = True
-    group_size: Optional[int] = None
-    strategy: Optional[QuantizationStrategy] = None
-    block_structure: Optional[List[int]] = None
-    dynamic: Union[DynamicType, bool] = False
-    actorder: Optional[Union[ActivationOrdering, bool]] = None
-    scale_dtype: Optional[TorchDtype] = None
-    zp_dtype: Optional[TorchDtype] = None
-    observer: Optional[str] = Field(
+    group_size: int | None = None
+    strategy: QuantizationStrategy | None = None
+    block_structure: list[int] | None = None
+    dynamic: DynamicType | bool = False
+    actorder: ActivationOrdering | bool | None = None
+    scale_dtype: TorchDtype | None = None
+    zp_dtype: TorchDtype | None = None
+    observer: str | None = Field(
         default=None,
         description=(
             "Determines the method of computing quantization parameters (scales and "
             "zero-points). Defaults to min-max when not using dynamic quantization"
         ),
     )
-    observer_kwargs: Dict[str, Any] = Field(
+    observer_kwargs: dict[str, Any] = Field(
         default_factory=dict,
         description=(
             "optional dict of kwargs to be passed directly to torch quantization "
@@ -214,7 +214,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         return value

     @field_validator("group_size", mode="before")
-    def validate_group(cls, value) -> Optional[int]:
+    def validate_group(cls, value) -> int | None:
         if value is None:
             return value

@@ -227,7 +227,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         return value

     @field_validator("block_structure", mode="before")
-    def validate_block_structure(cls, value) -> Optional[List[int]]:
+    def validate_block_structure(cls, value) -> list[int] | None:
         if value is None:
             return value
         # For backward compatibility, allow string format "2x4", "8x16", etc.
@@ -251,14 +251,14 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
     )

     @field_validator("strategy", mode="before")
-    def validate_strategy(cls, value) -> Optional[QuantizationStrategy]:
+    def validate_strategy(cls, value) -> QuantizationStrategy | None:
         if isinstance(value, str):
             return QuantizationStrategy(value.lower())

         return value

     @field_validator("actorder", mode="before")
-    def validate_actorder(cls, value) -> Optional[ActivationOrdering]:
+    def validate_actorder(cls, value) -> ActivationOrdering | None:
         if isinstance(value, bool):
             return ActivationOrdering.GROUP if value else None

@@ -268,7 +268,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         return value

     @field_validator("dynamic", mode="before")
-    def validate_dynamic(cls, value) -> Union[DynamicType, bool]:
+    def validate_dynamic(cls, value) -> DynamicType | bool:
         if isinstance(value, str):
             return DynamicType(value.lower())
         return value
@@ -329,10 +329,13 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
             raise ValueError(f"Block structure requires block strategy\n{model}")

         # validate activation ordering and strategy
-        if actorder is not None and strategy != QuantizationStrategy.GROUP:
+        if actorder is not None and strategy not in (
+            QuantizationStrategy.GROUP,
+            QuantizationStrategy.TENSOR_GROUP,
+        ):
             raise ValueError(
-                "Must use group quantization strategy in order to apply "
-                "activation ordering"
+                "Must use group or tensor_group quantization strategy in "
+                "order to apply activation ordering"
             )

         # infer observer w.r.t. dynamic
@@ -369,7 +372,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):

         elif observer is None:
             # default to minmax for non-dynamic cases
-            observer = "minmax"
+            observer = "memoryless_minmax"

         if zp_dtype is None:
             if model.num_bits == 4 and model.type == QuantizationType.FLOAT:
@@ -409,7 +412,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
 def round_to_quantized_type_dtype(
     tensor: torch.Tensor,
     dtype: torch.dtype,
-    cast_to_original_dtype: Optional[bool] = True,
+    cast_to_original_dtype: bool = True,
 ) -> torch.Tensor:
     """
     Rounds an input tensor to the nearest quantized representation given a dtype.
@@ -439,7 +442,7 @@ def round_to_quantized_type_args(
     args: QuantizationArgs,
     min: torch.Tensor,
     max: torch.Tensor,
-    cast_to_original_dtype: Optional[bool] = True,
+    cast_to_original_dtype: bool = True,
 ) -> torch.Tensor:
     """
     Rounds an input tensor to the nearest quantized representation given
```
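Beyond the typing cleanup, the behavioral changes in this file are the relaxed activation-ordering validator (now accepting the `tensor_group` strategy alongside `group`) and the new `memoryless_minmax` default observer for non-dynamic cases. A hedged sketch of what the validator now permits, assuming the package's public re-export of `QuantizationArgs`:

```python
from compressed_tensors.quantization import QuantizationArgs

# classic GPTQ-style setup: int4 group quantization with activation ordering
args = QuantizationArgs(num_bits=4, strategy="group", group_size=128, actorder=True)

# after this change, strategy="tensor_group" passes the same validator;
# combining actorder with any other strategy still raises ValueError
```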
src/compressed_tensors/quantization/quant_config.py

```diff
@@ -13,7 +13,7 @@
 # limitations under the License.
 from collections import defaultdict
 from enum import Enum
-from typing import Annotated, Any, Dict, List, Optional, Set, Union
+from typing import Annotated, Any

 from compressed_tensors.config import CompressionFormat
 from compressed_tensors.quantization.quant_args import DynamicType, QuantizationArgs
@@ -55,7 +55,7 @@ class QuantizationStatus(str, Enum):
     COMPRESSED = "compressed"

     @classmethod
-    def lifecycle_order(cls) -> List["QuantizationStatus"]:
+    def lifecycle_order(cls) -> list["QuantizationStatus"]:
         """
         :return: list of correct quantization lifecycle order
         """
@@ -131,13 +131,13 @@ class QuantizationConfig(BaseModel):
         are not quantized even if they match up with a target in config_groups
     """

-    config_groups: Dict[str, Union[QuantizationScheme, List[str]]]
+    config_groups: dict[str, QuantizationScheme | list[str]]
     quant_method: str = DEFAULT_QUANTIZATION_METHOD
-    kv_cache_scheme: Optional[QuantizationArgs] = None
+    kv_cache_scheme: QuantizationArgs | None = None
     format: str = DEFAULT_QUANTIZATION_FORMAT
     quantization_status: QuantizationStatus = QuantizationStatus.INITIALIZED
-    global_compression_ratio: Optional[float] = None
-    ignore: Optional[List[str]] = Field(default_factory=list)
+    global_compression_ratio: float | None = None
+    ignore: list[str] | None = Field(default_factory=list)
     # `run_compressed` is a dummy, unused arg for backwards compatibility
     # see: https://github.com/huggingface/transformers/pull/39324
     run_compressed: Annotated[Any, Field(exclude=True)] = None
@@ -161,8 +161,8 @@ class QuantizationConfig(BaseModel):

     @staticmethod
     def from_pretrained(
-        model: Module, format: Optional[Union[str, List]] = None
-    ) -> Optional["QuantizationConfig"]:
+        model: Module, format: str | list | None = None
+    ) -> "QuantizationConfig | None":
         """
         Converts a model into its associated QuantizationConfig based on the
         QuantizationScheme attached to each quantized module
@@ -177,21 +177,21 @@ class QuantizationConfig(BaseModel):

         # set of all quantization schemes
         # TODO: make quant config/scheme/args frozen/hashable and use a set
-        quantization_schemes: List[QuantizationScheme] = list()
+        quantization_schemes: list[QuantizationScheme] = list()

         # use any status from modules (in practice, use the last module)
         model_status = None

         # set of all quantized types
         # this is later used to create the ignore list
-        quantization_type_names: Set[str] = set()
+        quantization_type_names: set[str] = set()

         # maps types to names which are not quantized
         # this is later used to create the ignore list
-        ignore: Dict[str, List[str]] = defaultdict(list)
+        ignore: dict[str, list[str]] = defaultdict(list)

         # this keeps track of any kvcache schemes
-        kv_cache_scheme: Optional[QuantizationArgs] = None
+        kv_cache_scheme: QuantizationArgs | None = None

         for name, submodule in model.named_modules():
             layer_type: str = module_type(submodule)
```
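`from_pretrained` keeps its behavior under the modernized signature; per the updated return annotation it yields `None` when no quantization schemes are attached to the model. A hedged sketch:

```python
import torch
from compressed_tensors.quantization import QuantizationConfig

model = torch.nn.Sequential(torch.nn.Linear(8, 8))  # no schemes attached
config = QuantizationConfig.from_pretrained(model)
print(config)  # expected: None, since nothing in the model is quantized
```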
src/compressed_tensors/quantization/quant_scheme.py

```diff
@@ -13,7 +13,6 @@
 # limitations under the License.
 import warnings
 from copy import deepcopy
-from typing import List, Optional

 import torch
 from compressed_tensors.config import CompressionFormat
@@ -47,11 +46,11 @@ class QuantizationScheme(BaseModel):
     :param format: CompressionFormat for the layer
     """

-    targets: List[str]
-    weights: Optional[QuantizationArgs] = None
-    input_activations: Optional[QuantizationArgs] = None
-    output_activations: Optional[QuantizationArgs] = None
-    format: Optional[str] = None
+    targets: list[str]
+    weights: QuantizationArgs | None = None
+    input_activations: QuantizationArgs | None = None
+    output_activations: QuantizationArgs | None = None
+    format: str | None = None

     @model_validator(mode="after")
     def validate_model_after(model: "QuantizationScheme") -> "QuantizationScheme":
@@ -121,7 +120,7 @@ Pre-Set Quantization Scheme Args
 """


-def preset_name_to_scheme(name: str, targets: List[str]) -> QuantizationScheme:
+def preset_name_to_scheme(name: str, targets: list[str]) -> QuantizationScheme:
     """
     :param name: preset quantization settings name. must exist in upper case in
     PRESET_SCHEMES
@@ -175,7 +174,6 @@ NVFP4 = dict(
         symmetric=True,
         dynamic=False,
         group_size=16,
-        observer="static_minmax",
         scale_dtype=FP8_E4M3_DATA.dtype,
         zp_dtype=FP8_E4M3_DATA.dtype,
     ),
@@ -244,7 +242,6 @@ INT8_W8A8 = dict(
         strategy=QuantizationStrategy.TOKEN,
         symmetric=True,
         dynamic=True,
-        observer=None,
     ),
 )

@@ -299,7 +296,6 @@ INT8_W4A8 = dict(
         strategy=QuantizationStrategy.TOKEN,
         symmetric=True,
         dynamic=True,
-        observer=None,
     ),
 )

@@ -356,7 +352,6 @@ FP8_DYNAMIC = dict(
         strategy=QuantizationStrategy.TOKEN,
         symmetric=True,
         dynamic=True,
-        observer=None,
     ),
 )

@@ -378,7 +373,6 @@ FP8_BLOCK = dict(
         strategy=QuantizationStrategy.GROUP,
         symmetric=True,
         dynamic=True,
-        observer=None,
         group_size=128,
     ),
 )
```
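The preset edits drop explicit `observer=...` entries, deferring observer choice to `QuantizationArgs` validation (see the `memoryless_minmax` default above). Preset lookup itself is unchanged; a hedged usage sketch:

```python
from compressed_tensors.quantization import preset_name_to_scheme

# name must exist (upper case) in PRESET_SCHEMES, e.g. the W4A16 preset
scheme = preset_name_to_scheme("W4A16", targets=["Linear"])
print(scheme.weights.num_bits)  # 4 for a weight-only int4 preset
```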
src/compressed_tensors/quantization/utils/helpers.py

```diff
@@ -14,7 +14,7 @@

 import logging
 import math
-from typing import Generator, Optional, Tuple
+from collections.abc import Generator

 import torch
 from compressed_tensors.quantization.quant_args import (
@@ -66,8 +66,8 @@ def calculate_qparams(
     min_vals: Tensor,
     max_vals: Tensor,
     quantization_args: QuantizationArgs,
-    global_scale: Optional[Tensor] = None,
-) -> Tuple[FloatTensor, IntTensor]:
+    global_scale: Tensor | None = None,
+) -> tuple[FloatTensor, IntTensor]:
     """
     :param min_vals: tensor of min value(s) to calculate scale(s) and zero point(s)
     from
@@ -152,7 +152,7 @@ def compute_dynamic_scales_and_zp(
     value: Tensor,
     args: QuantizationArgs,
     module: torch.nn.Module,
-    global_scale: Optional[Tensor] = None,
+    global_scale: Tensor | None = None,
 ):
     """
     Returns the computed scales and zero points for dynamic activation
@@ -207,7 +207,9 @@ def compute_dynamic_scales_and_zp(
     return calculate_qparams(min_val, max_val, args, global_scale=global_scale)


-def calculate_range(quantization_args: QuantizationArgs, device: str) -> Tuple:
+def calculate_range(
+    quantization_args: QuantizationArgs, device: str
+) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Calculated the effective quantization range for the given Quantization Args

@@ -285,7 +287,7 @@ def module_type(module: Module) -> str:
         "Please use `model.named_modules()` and filter by "
         "compressed_tensors.InternalModule if neceessary"
     )
-def iter_named_leaf_modules(model: Module) -> Generator[Tuple[str, Module], None, None]:
+def iter_named_leaf_modules(model: Module) -> Generator[tuple[str, Module], None, None]:
     """
     Yields modules that do not have any submodules except observers. The observers
     themselves are not yielded
@@ -321,7 +323,7 @@ def iter_named_quantizable_modules(
     include_children: bool = True,
     include_attn: bool = False,
     include_mlp: bool = False,
-) -> Generator[Tuple[str, Module], None, None]:
+) -> Generator[tuple[str, Module], None, None]:
     """
     Yield name and submodule of
     - leaf modules, set by include_children
@@ -416,9 +418,9 @@ def is_kv_cache_quant_scheme(scheme: QuantizationScheme) -> bool:
 def generate_gparam(
     updated_min_val: torch.Tensor,
     updated_max_val: torch.Tensor,
-    scale_data: Optional[FloatArgs] = FP8_E4M3_DATA,
-    quant_data: Optional[FloatArgs] = FP4_E2M1_DATA,
-    dtype: Optional[torch.dtype] = torch.float32,
+    scale_data: FloatArgs | None = FP8_E4M3_DATA,
+    quant_data: FloatArgs | None = FP4_E2M1_DATA,
+    dtype: torch.dtype | None = torch.float32,
 ):
     """
     Generate a global scale for an entire tensor (input_tensor).
@@ -439,7 +441,7 @@ def generate_gparam(
 def strategy_cdiv(
     value: int,
     divisor: int,
-    strategy: Optional[QuantizationStrategy],
+    strategy: QuantizationStrategy | None,
     strict: bool = False,
 ) -> int:
     dividend = math.ceil(value / divisor)
```
src/compressed_tensors/utils/helpers.py

```diff
@@ -14,19 +14,10 @@

 import contextlib
 import warnings
+from collections.abc import Callable, Iterable, Mapping
 from functools import wraps
 from types import MappingProxyType
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    Dict,
-    Iterable,
-    List,
-    Mapping,
-    Optional,
-    TypeVar,
-)
+from typing import TYPE_CHECKING, Any, TypeVar

 import numpy
 import torch
@@ -66,7 +57,7 @@ FSDP_WRAPPER_NAME = "_fsdp_wrapped_module"

 def infer_compressor_from_model_config(
     pretrained_model_name_or_path: str,
-) -> Optional["ModelCompressor"]:  # noqa: F821
+) -> "ModelCompressor | None":  # noqa: F821
     """
     Given a path to a model config, extract a sparsity config if it exists and return
     the associated ModelCompressor
@@ -185,7 +176,7 @@ def getattr_chain(obj: Any, chain_str: str, *args, **kwargs) -> Any:


 def deprecated(
-    future_name: Optional[str] = None, message: Optional[str] = None
+    future_name: str | None = None, message: str | None = None
 ) -> Callable[[T], T]:
     """
     Decorator to mark functions as deprecated
@@ -224,7 +215,7 @@ class Aliasable:
     """

     @staticmethod
-    def get_aliases() -> Dict[str, str]:
+    def get_aliases() -> dict[str, str]:
         raise NotImplementedError()

     def __eq__(self, other):
@@ -246,8 +237,8 @@ class Aliasable:


 def shard_tensor(
-    tensor: torch.Tensor, shard_sizes: List[int], dim: int = 0
-) -> List[torch.Tensor]:
+    tensor: torch.Tensor, shard_sizes: list[int], dim: int = 0
+) -> list[torch.Tensor]:
     """
     Shards a tensor into a list of tensors along a given dimension.

@@ -277,7 +268,7 @@ def shard_tensor(
     return shards


-def combine_shards(shards, dim=0):
+def combine_shards(shards: list[torch.Tensor], dim: int = 0) -> torch.Tensor:
     """
     Combine decompressed shards along a given dimension using `narrow`.

@@ -325,7 +316,7 @@ def pack_bitmasks(bytemasks: torch.Tensor) -> torch.Tensor:


 def unpack_bitmasks(
-    packed_bitmasks: torch.Tensor, original_shape: List[int]
+    packed_bitmasks: torch.Tensor, original_shape: list[int]
 ) -> torch.Tensor:
     """
     Converts a bitmask tensor back to a bytemask tensor for use during decompression
```