compressed-tensors 0.13.1a20260123__tar.gz → 0.13.1a20260127__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/.github/actions/test/action.yml +1 -1
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/.github/workflows/test-check.yaml +1 -1
- {compressed_tensors-0.13.1a20260123/src/compressed_tensors.egg-info → compressed_tensors-0.13.1a20260127}/PKG-INFO +4 -2
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/setup.py +2 -2
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/linear/compressed_linear.py +0 -6
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/offload/__init__.py +7 -6
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/offload/dispatch.py +1 -1
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/quantization/lifecycle/forward.py +18 -19
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/quantization/lifecycle/initialize.py +2 -2
- compressed_tensors-0.13.1a20260127/src/compressed_tensors/transform/apply.py +36 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/transform/factory/base.py +3 -11
- compressed_tensors-0.13.1a20260127/src/compressed_tensors/utils/offload.py +195 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/version.py +1 -1
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127/src/compressed_tensors.egg-info}/PKG-INFO +4 -2
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors.egg-info/SOURCES.txt +0 -1
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors.egg-info/requires.txt +3 -1
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_quantization/lifecycle/test_apply.py +2 -12
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_quantization/lifecycle/test_initialize.py +4 -5
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_transform/factory/test_correctness.py +6 -15
- compressed_tensors-0.13.1a20260127/tests/test_transform/factory/test_memory.py +74 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_transform/factory/test_serialization.py +8 -16
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_utils/test_match.py +28 -34
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/testing_utils.py +0 -18
- compressed_tensors-0.13.1a20260123/src/compressed_tensors/transform/apply.py +0 -71
- compressed_tensors-0.13.1a20260123/src/compressed_tensors/utils/offload.py +0 -672
- compressed_tensors-0.13.1a20260123/tests/test_transform/factory/test_memory.py +0 -92
- compressed_tensors-0.13.1a20260123/tests/test_utils/test_offload.py +0 -540
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/.github/.gitkeep +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/.github/mergify.yml +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/.github/scripts/step-status +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/.github/workflows/quality-check.yaml +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/.github/workflows/stale.yml +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/.gitignore +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/LICENSE +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/Makefile +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/README.md +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/examples/bit_packing/ex_quantize_and_pack.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/examples/bit_packing/int4_config.json +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/examples/bitmask_compression.ipynb +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/examples/llama_1.1b/ex_config_quantization.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/examples/llama_1.1b/ex_llmcompressor_quantization.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/examples/llama_1.1b/example_quant_config.json +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/examples/llama_1.1b/example_quant_recipe.yaml +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/examples/quantize_and_pack_int4.ipynb +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/pyproject.toml +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/setup.cfg +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/README.md +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/base.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/compressors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/compressors/base.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/compressors/helpers.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/compressors/model_compressors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/compressors/model_compressors/model_compressor.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/compressors/quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/compressors/quantized_compressors/base.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/compressors/quantized_compressors/fp4_quantized.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/compressors/quantized_compressors/naive_quantized.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/compressors/sparse_compressors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/compressors/sparse_compressors/base.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/compressors/sparse_compressors/dense.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/compressors/sparse_compressors/sparse_bitmask.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/compressors/sparse_quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/compressors/sparse_quantized_compressors/marlin_24.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/config/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/config/base.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/config/dense.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/config/format.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/config/sparse_24_bitmask.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/config/sparse_bitmask.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/linear/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/logger.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/modeling/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/modeling/attention.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/modeling/kvcache.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/offload/cache/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/offload/cache/base.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/offload/cache/cpu.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/offload/cache/device.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/offload/module.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/offload/utils.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/quantization/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/quantization/lifecycle/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/quantization/lifecycle/apply.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/quantization/lifecycle/compressed.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/quantization/lifecycle/helpers.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/quantization/quant_args.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/quantization/quant_config.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/quantization/quant_metadata.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/quantization/quant_scheme.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/quantization/utils/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/quantization/utils/helpers.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/quantization/utils/mxfp4_utils.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/registry/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/registry/registry.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/transform/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/transform/factory/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/transform/factory/hadamard.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/transform/factory/matrix_multiply.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/transform/factory/random_hadamard.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/transform/transform_args.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/transform/transform_config.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/transform/transform_scheme.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/transform/utils/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/transform/utils/hadamard.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/transform/utils/hadamards.safetensors +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/transform/utils/matrix.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/utils/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/utils/binary_search.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/utils/helpers.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/utils/internal.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/utils/match.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/utils/permutations_24.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/utils/safetensors_load.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/utils/semi_structured_conversions.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors/utils/type.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors.egg-info/dependency_links.txt +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/src/compressed_tensors.egg-info/top_level.txt +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/conftest.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/mock_observer.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_compressors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_compressors/model_compressors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_compressors/model_compressors/test_model_compressor.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_compressors/quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_compressors/quantized_compressors/test_fp4_quant.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_compressors/quantized_compressors/test_fp8_quant.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_compressors/quantized_compressors/test_int_quant.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_compressors/quantized_compressors/test_pack_quant.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_compressors/quantized_compressors/test_packed_asym_decompression.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_compressors/sparse_compressors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_compressors/sparse_compressors/test_bitmask.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_compressors/sparse_compressors/test_sparse_24_bitmask.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_compressors/sparse_quantized_compressors/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_compressors/sparse_quantized_compressors/test_marlin_24.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_configs/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_configs/test_base.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_configs/test_infer_quant.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_examples/test_bitmask_compression_ipynb.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_linear/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_linear/test_compressed_linear.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_modeling/test_attention_and_cache.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_offload/cache/test_cpu.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_offload/test_dispatch.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_offload/test_interface.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_offload/test_module.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_quantization/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_quantization/lifecycle/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_quantization/lifecycle/conftest.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_quantization/lifecycle/test_dynamic_lifecycle.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_quantization/lifecycle/test_enabled.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_quantization/lifecycle/test_forward.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_quantization/lifecycle/test_lifecycle.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_quantization/lifecycle/test_static_lifecycle.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_quantization/test_configs/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_quantization/test_configs/test_bit_depths.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_quantization/test_configs/test_strategies.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_quantization/test_quant_args.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_quantization/test_quant_config.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_quantization/test_quant_scheme.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_quantization/test_utils/test_helpers.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_quantization/test_utils/test_mxfp4_utils.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_registry.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_transform/conftest.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_transform/test_transform_args.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_transform/test_transform_config.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_transform/test_transform_scheme.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_transform/utils/test_hadamard.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_utils/__init__.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_utils/test_helpers.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_utils/test_safetensors_load.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/tests/test_utils/test_type.py +0 -0
- {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260127}/utils/copyright.py +0 -0

.github/workflows/test-check.yaml
@@ -30,7 +30,7 @@ jobs:
       - name: Set Env
         run: pip3 install --upgrade pip setuptools
       - name: "⚙️ Install dependencies"
-        run: pip3 install .[dev
+        run: pip3 install .[dev]
       - name: clean up
         run: |
           echo "cleaning up disk space as GHA runner has limited disk size."

PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.13.1a20260123
+Version: 0.13.1a20260127
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/vllm-project/compressed-tensors
 Author: Neuralmagic, Inc.
@@ -9,7 +9,7 @@ License: Apache 2.0
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: torch<=2.9.1,>=1.7.0
-Requires-Dist: transformers
+Requires-Dist: transformers<5.0.0
 Requires-Dist: pydantic>=2.0
 Requires-Dist: loguru
 Provides-Extra: dev
@@ -19,6 +19,8 @@ Requires-Dist: wheel>=0.36.2; extra == "dev"
 Requires-Dist: flake8>=3.8.3; extra == "dev"
 Requires-Dist: pytest>=6.0.0; extra == "dev"
 Requires-Dist: nbconvert>=7.16.3; extra == "dev"
+Requires-Dist: transformers<5.0; extra == "dev"
+Requires-Dist: accelerate; extra == "dev"
 Provides-Extra: accelerate
 Requires-Dist: accelerate; extra == "accelerate"
 Dynamic: author

setup.py
@@ -88,11 +88,11 @@ def _setup_packages() -> List:
     )
 
 def _setup_install_requires() -> List:
-    return ["torch>=1.7.0,<=2.9.1", "transformers", "pydantic>=2.0", "loguru"]
+    return ["torch>=1.7.0,<=2.9.1", "transformers<5.0.0", "pydantic>=2.0", "loguru"]
 
 def _setup_extras() -> Dict:
     return {
-        "dev": ["black==22.12.0", "isort==5.8.0", "wheel>=0.36.2", "flake8>=3.8.3", "pytest>=6.0.0", "nbconvert>=7.16.3"],
+        "dev": ["black==22.12.0", "isort==5.8.0", "wheel>=0.36.2", "flake8>=3.8.3", "pytest>=6.0.0", "nbconvert>=7.16.3", "transformers<5.0", "accelerate"],
         "accelerate": ["accelerate"]
     }
 

src/compressed_tensors/linear/compressed_linear.py
@@ -87,12 +87,6 @@ class CompressedLinear(Linear):
         # mark module as compressed
         module.quantization_status = QuantizationStatus.COMPRESSED
 
-        # handles case where forward is wrapped in new_forward by accelerate hooks
-        if hasattr(module, "_old_forward"):
-            module._old_forward = CompressedLinear.forward.__get__(
-                module, CompressedLinear
-            )
-
         return module
 
     def forward(self, input: Tensor) -> Tensor:

src/compressed_tensors/offload/__init__.py
@@ -135,9 +135,7 @@ def register_offload_module(base: torch.nn.Module, name: str, module: torch.nn.M
     """
     cache = base._parameters
     if isinstance(cache, OffloadCache):
-        offload_module(
-            module, cache.onload_device, cache.offload_device, no_split=False
-        )
+        offload_module(module, cache.onload_device, cache.offload_device)
 
     base.register_module(name, module)
 
@@ -178,9 +176,12 @@ def align_module_device(
    if isinstance(module._parameters, OffloadCache):
        assert isinstance(module._buffers, OffloadCache)
        with module._parameters.disable_offloading():
-
-
-
+           if execution_device is not None:
+               with patch_attr(
+                   module._parameters, "onload_device", execution_device
+               ), patch_attr(module._buffers, "onload_device", execution_device):
+                   yield
+           else:
                yield
 
    else:
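
Note on the `align_module_device` change above: an `execution_device` argument now temporarily repoints the cache's `onload_device` via `patch_attr` rather than falling through to a bare `yield`. A minimal usage sketch, assuming the function remains a context manager with an optional `execution_device` keyword (the module here is illustrative and not necessarily offloaded):

```python
import torch
from compressed_tensors.offload import align_module_device

module = torch.nn.Linear(16, 16)  # illustrative; may or may not be offloaded

# while inside the context, parameters are onloaded; with the patch above,
# execution_device is now honored for OffloadCache-backed modules
with align_module_device(module, execution_device=torch.device("cpu")):
    out = module(torch.randn(1, 16))
```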

src/compressed_tensors/offload/dispatch.py
@@ -39,7 +39,7 @@ ModelType = TypeVar("ModelType", bound=torch.nn.Module)
 def offload_model(
     model: ModelType,
     onload_device: torch.device | str,
-    offload_device:
+    offload_device: torch.device | str | Literal["disk"] = torch.device("cpu"),
 ) -> ModelType:
     """
     Offload a model to the `offload_device`. During forward passes, model weights will
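
The net effect of the signature change is that `offload_device` gains an explicit CPU default. A short usage sketch, assuming a CUDA device is available (the toy model is illustrative):

```python
import torch
from compressed_tensors.offload import offload_model

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU())

# weights rest on the CPU (the new default) and are onloaded to
# "cuda:0" for the duration of each forward pass
model = offload_model(model, onload_device="cuda:0")

# equivalent call with the default spelled out:
# offload_model(model, "cuda:0", offload_device=torch.device("cpu"))
```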

src/compressed_tensors/quantization/lifecycle/forward.py
@@ -14,7 +14,6 @@
 
 from functools import wraps
 from math import ceil
-from typing import Optional
 
 import torch
 from compressed_tensors.quantization.quant_args import (
@@ -47,9 +46,9 @@ def quantize(
     scale: torch.Tensor,
     zero_point: torch.Tensor,
     args: QuantizationArgs,
-    dtype:
-    g_idx:
-    global_scale:
+    dtype: torch.dtype | None = None,
+    g_idx: torch.Tensor | None = None,
+    global_scale: torch.Tensor | None = None,
 ) -> torch.Tensor:
     """
     Quantize the input tensor x using the QuantizationStrategy specified in args.
@@ -85,11 +84,11 @@ def quantize(
 def dequantize(
     x_q: torch.Tensor,
     scale: torch.Tensor,
-    zero_point:
-    args:
-    dtype:
-    g_idx:
-    global_scale:
+    zero_point: torch.Tensor | None = None,
+    args: QuantizationArgs | None = None,
+    dtype: torch.dtype | None = None,
+    g_idx: torch.Tensor | None = None,
+    global_scale: torch.Tensor | None = None,
 ) -> torch.Tensor:
     """
     Dequantize a quantized input tensor x_q based on the strategy specified in args. If
@@ -159,8 +158,8 @@ def fake_quantize(
     scale: torch.Tensor,
     zero_point: torch.Tensor,
     args: QuantizationArgs,
-    g_idx:
-    global_scale:
+    g_idx: torch.Tensor | None = None,
+    global_scale: torch.Tensor | None = None,
 ) -> torch.Tensor:
     """
     Fake quantize the input tensor x by quantizing then dequantizing with
@@ -195,11 +194,11 @@ def _process_quantization(
     scale: torch.Tensor,
     zero_point: torch.Tensor,
     args: QuantizationArgs,
-    g_idx:
-    dtype:
+    g_idx: torch.Tensor | None = None,
+    dtype: torch.dtype | None = None,
     do_quantize: bool = True,
     do_dequantize: bool = True,
-    global_scale:
+    global_scale: torch.Tensor | None = None,
 ) -> torch.Tensor:
     q_min, q_max = calculate_range(args, x.device)
     group_size = args.group_size
@@ -457,8 +456,8 @@ def _quantize(
     q_min: torch.Tensor,
     q_max: torch.Tensor,
     args: QuantizationArgs,
-    dtype:
-    global_scale:
+    dtype: torch.dtype | None = None,
+    global_scale: torch.Tensor | None = None,
 ) -> torch.Tensor:
 
     # if a global scale is optionally provided, use it
@@ -486,9 +485,9 @@ def _quantize(
 def _dequantize(
     x_q: torch.Tensor,
     scale: torch.Tensor,
-    zero_point: torch.Tensor = None,
-    dtype:
-    global_scale:
+    zero_point: torch.Tensor | None = None,
+    dtype: torch.dtype | None = None,
+    global_scale: torch.Tensor | None = None,
 ) -> torch.Tensor:
 
     # if a global scale is optionally provided, use it
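
The `forward.py` hunks above replace `Optional`-style and untyped defaults with PEP 604 `X | None` unions (which require Python 3.10+) without changing behavior. A hedged round-trip sketch of the public `quantize`/`dequantize` pair; `num_bits=8` mirrors the tests below, while `symmetric=True` and the per-tensor scale shape are assumptions:

```python
import torch
from compressed_tensors.quantization import QuantizationArgs
from compressed_tensors.quantization.lifecycle.forward import dequantize, quantize

x = torch.randn(4, 8)
scale = torch.tensor(0.1)
zero_point = torch.tensor(0)
args = QuantizationArgs(num_bits=8, symmetric=True)  # symmetric is assumed

# dtype, g_idx, and global_scale are optional and can simply be omitted
x_q = quantize(x, scale, zero_point, args)
x_dq = dequantize(x_q, scale, zero_point, args)
```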

src/compressed_tensors/quantization/lifecycle/initialize.py
@@ -23,6 +23,7 @@ from compressed_tensors.modeling import (
     QuantizedAttentionImpl,
     QuantizedKVCache,
 )
+from compressed_tensors.offload import unwrap_offload_forward
 from compressed_tensors.quantization import (
     ActivationOrdering,
     DynamicType,
@@ -37,7 +38,6 @@ from compressed_tensors.quantization.lifecycle.forward import (
 )
 from compressed_tensors.quantization.utils import strategy_cdiv
 from compressed_tensors.utils import (
-    disable_hf_hook,
     get_execution_device,
     get_head_dim,
     get_num_attn_heads,
@@ -134,7 +134,7 @@ def initialize_module_for_quantization(
         force_zero_point=force_zero_point,
     )
 
-    with
+    with unwrap_offload_forward(module):
        # wrap forward call of module to perform
        # quantized actions based on calltime status
        wrap_module_forward_quantized(module, scheme)

src/compressed_tensors/transform/apply.py (new file)
@@ -0,0 +1,36 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from compressed_tensors import TRANSFORM_CONFIG_NAME
+from compressed_tensors.transform import TransformConfig, TransformFactory
+
+
+__all__ = ["apply_transform_config"]
+
+
+def apply_transform_config(model: torch.nn.Module, config: TransformConfig):
+    """
+    Apply a transform config to a model. Weight transforms are fused into weights, while
+    activation transforms are attached as submodules and trigger via pytorch hooks
+
+    :param model: model to apply config to
+    :param config: transform config to apply
+    """
+    for name, scheme in config.config_groups.items():
+        factory = TransformFactory.from_scheme(scheme, name=name)
+        factory.apply_to_model(model)
+
+    # attach config to model for compression/serialization
+    setattr(model, TRANSFORM_CONFIG_NAME, config)
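
A usage sketch for the re-added `apply_transform_config`, assuming it is re-exported from `compressed_tensors.transform` and that `TransformScheme`/`TransformArgs` accept the fields shown (the scheme type and location strings are assumptions; the checkpoint name mirrors the tests below):

```python
from compressed_tensors.transform import (
    TransformArgs,
    TransformConfig,
    TransformScheme,
    apply_transform_config,
)
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("nm-testing/llama2.c-stories15M")

# hypothetical config: one Hadamard scheme applied to Linear weight inputs
config = TransformConfig(
    config_groups={
        "u": TransformScheme(
            type="hadamard",
            apply=[TransformArgs(targets=["Linear"], location="weight_input")],
        )
    }
)

# weight transforms are fused into the weights; activation transforms
# are attached as submodules and fire via pytorch hooks
apply_transform_config(model, config)
```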

src/compressed_tensors/transform/factory/base.py
@@ -26,6 +26,7 @@ from compressed_tensors.modeling.kvcache import (
     initialize_hooked_kv_cache,
     register_key_hook,
 )
+from compressed_tensors.offload import OffloadCache
 from compressed_tensors.registry.registry import RegistryMixin, T
 from compressed_tensors.transform import (
     TransformArgs,
@@ -34,8 +35,6 @@ from compressed_tensors.transform import (
 )
 from compressed_tensors.utils import (
     align_module_device,
-    delete_offload_module,
-    has_offloaded_params,
     match_named_modules,
     patch_attr,
     register_offload_module,
@@ -116,13 +115,6 @@ class TransformFactory(RegistryMixin, ABC):
         :param module: target module to apply transforms to
         :param args: defines how the transform will be applied to the target module
         """
-        if has_offloaded_params(module):
-            if module._hf_hook.place_submodules:
-                raise NotImplementedError(
-                    "Applying transforms to offloaded submodules with "
-                    "`place_submodules=True` is not supported"
-                )
-
         # create transform as submodule
         transform_name = f"{self.name}_{args.location}"
         transform = self.create_transform(module, args)
@@ -150,13 +142,13 @@ class TransformFactory(RegistryMixin, ABC):
            if self.scheme.requires_grad:
                # for training, the weight changes with every forward pass
                # so we can leverage parametrization to propagate the gradient
-               if
+               if isinstance(module._parameters, OffloadCache):
                    raise ValueError("Offloaded training is not supported")
                P.register_parametrization(module, "weight", transform)
 
            else:
                # transform is no longer needed (unfusing is not supported)
-
+               delattr(module, transform_name)
 
        # register output transformation hook
        elif args.location == TransformLocation.OUTPUT:
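
For context on the `requires_grad` branch above: `P` here is `torch.nn.utils.parametrize` (per the `P.register_parametrization` call), so during training the transform is re-applied on every access to `weight` and gradients flow through it. A standalone sketch of that mechanism with a toy transform, unrelated to this package's transform classes:

```python
import torch
from torch.nn.utils import parametrize

class Scale(torch.nn.Module):
    """Toy parametrization: recomputed on every access to `weight`."""

    def forward(self, weight: torch.Tensor) -> torch.Tensor:
        return 2.0 * weight

linear = torch.nn.Linear(4, 4)
parametrize.register_parametrization(linear, "weight", Scale())

# linear.weight is now computed through Scale.forward, so the underlying
# parameter receives gradients while the transform stays applied
linear(torch.randn(2, 4)).sum().backward()
```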

src/compressed_tensors/utils/offload.py (new file)
@@ -0,0 +1,195 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Utilities associated with offloading functionality
+
+| ------------------------------------------------------------------------------------------------------ | # noqa: E501
+| Operation | Without offloading support | With offloading support | # noqa: E501
+| ---------- | -------------------------------------- | ------------------------------------------------ | # noqa: E501
+| Update | module.name.data.copy_(new_data) | update_offload_parameter(module, name, new_data) | # noqa: E501
+| ------------------------------------------------------------------------------------------------------ | # noqa: E501
+"""
+
+import contextlib
+from typing import Literal, Optional
+
+import torch
+from compressed_tensors.offload import (
+    align_module_device,
+    align_modules,
+    disable_offloading,
+    get_execution_device,
+    get_offloaded_device,
+    offload_model,
+    register_offload_module,
+    remove_dispatch,
+    update_offload_parameter,
+)
+from compressed_tensors.utils.helpers import deprecated
+
+
+__all__ = [
+    "get_execution_device",
+    "get_offloaded_device",
+    "update_parameter_data",
+    "register_offload_parameter",
+    "update_offload_parameter",
+    "delete_offload_parameter",
+    "has_offloaded_params",
+    "disable_hf_hook",
+    "disable_offload",
+    "align_modules",
+    "align_module_device",
+    "register_offload_module",
+    "delete_offload_module",
+    "offloaded_dispatch",
+    "disable_offloading",
+    "remove_dispatch",
+    "cast_to_device",
+    "offload_to_weights_map",
+    "delete_from_weights_map",
+]
+
+
+def update_parameter_data(
+    module: torch.nn.Module, new_param_data: torch.Tensor, param_name: str
+):
+    """
+    Update the data of an existing parameter and its offload dict. Supports both
+    parameters of offloaded modules and non-offloaded modules
+
+    :param module: module containing the parameter to update
+    :param new_param_data: tensor to update parameter with
+    :param param_name: name of module parameter to update
+    """
+    update_offload_parameter(module, param_name, new_param_data)
+
+
+""" Candidates for Upstreaming """
+
+
+@deprecated()
+def cast_to_device(device_spec: int | torch.device) -> torch.device:
+    """
+    Convert an integer device index or torch.device into a torch.device object.
+
+    :param device_spec: Device index (int) or torch.device object.
+        Negative integers map to CPU.
+    :return: torch.device corresponding to the given device specification.
+    """
+    if isinstance(device_spec, int):
+        return torch.device(f"cuda:{device_spec}" if device_spec >= 0 else "cpu")
+    return device_spec
+
+
+@deprecated("module.register_parameter(name, parameter)")
+def register_offload_parameter(
+    module: torch.nn.Module,
+    name: str,
+    parameter: torch.nn.Parameter,
+    offload_device: Optional[torch.device | Literal["disk"]] = None,
+):
+    """
+    Register a parameter to the given module which may be offloaded
+
+    :param module: maybe offloaded module
+    :param name: name of newly registered parameter
+    :param parameter: parameter being registered
+    :param offload_device: device on which weight will be offloaded to. If None is
+        provided, then infer device from parameters on module
+    """
+    if offload_device == "disk":
+        raise NotImplementedError("Disk offloading is not currently supported")
+
+    module.register_parameter(name, parameter)
+
+
+@deprecated("delattr(module, name)")
+def delete_offload_parameter(module: torch.nn.Module, name: str):
+    """
+    Delete a parameter from a module which may be offloaded,
+    including any metadata in _hf_hook
+
+    :param module: maybe offloaded module
+    :param name: name of parameter being deleted
+    """
+    delattr(module, name)
+
+
+@deprecated("compressed_tensors.offload::unwrap_offload")
+@contextlib.contextmanager
+def disable_hf_hook(module: torch.nn.Module):
+    raise ValueError()
+
+
+@deprecated("delattr(base, name)")
+def delete_offload_module(base: torch.nn.Module, name: str):
+    """
+    Delete a submodule from a model which may contain offloading
+    :param base: parent module to delete submodule from
+    :param name: name of submodule on parent
+    """
+    delattr(base, name)
+
+
+@deprecated("compressed_tensors.offload::offload_model")
+def offloaded_dispatch(
+    module: torch.nn.Module,
+    execution_device: torch.device,
+    offload_device: Optional[torch.device | Literal["disk"]] = None,
+) -> torch.nn.Module:
+    """
+    Dispatch a model, keeping device parameters offloaded on their current device
+
+    :param module: module containing parameters to offload
+    :param execution_device: device that modules will onload and execute on
+    :param offload_device: device that module parameters will offload to
+    :return: module with offloading device hooks
+    """
+    if offload_device is not None:
+        raise ValueError(
+            "Passing offload_device to offloaded_dispatch is no longer supported"
+        )
+    offload_model(module, execution_device)
+
+
+@deprecated("compressed_tensors.offload::align_module_device")
+def disable_offload(module: torch.nn.Module):
+    raise ValueError()
+
+
+@deprecated()
+def offload_to_weights_map(*args, **kwargs):
+    raise ValueError()
+
+
+@deprecated()
+def delete_from_weights_map(*args, **kwargs):
+    raise ValueError()
+
+
+@deprecated()
+def has_offloaded_params(module: torch.nn.Module) -> bool:
+    """
+    Checks if a module has offloaded parameters by checking if the given module has a
+    AlignDevicesHook attached with offloading enabled
+
+    Args:
+        module (`torch.nn.Module`): The module to check for an offload hook.
+
+    Returns:
+        bool: `True` if the module has an offload hook and offloading is enabled,
+            `False` otherwise.
+    """
+    return False
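
Taken together, the new `utils/offload.py` replaces the previous 672-line implementation with thin, mostly-deprecated shims that forward to `compressed_tensors.offload`. A migration sketch showing deprecated spellings next to their documented replacements (module and parameter names are illustrative):

```python
import torch
from compressed_tensors.utils.offload import (
    delete_offload_parameter,
    register_offload_parameter,
    update_parameter_data,
)

module = torch.nn.Linear(4, 4)
scale = torch.nn.Parameter(torch.ones(1), requires_grad=False)

# deprecated: equivalent to module.register_parameter(name, parameter)
register_offload_parameter(module, "weight_scale", scale)

# still supported: forwards to update_offload_parameter(module, name, data)
update_parameter_data(module, torch.full((1,), 0.5), "weight_scale")

# deprecated: equivalent to delattr(module, name)
delete_offload_parameter(module, "weight_scale")
```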

src/compressed_tensors.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.13.1a20260123
+Version: 0.13.1a20260127
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/vllm-project/compressed-tensors
 Author: Neuralmagic, Inc.
@@ -9,7 +9,7 @@ License: Apache 2.0
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: torch<=2.9.1,>=1.7.0
-Requires-Dist: transformers
+Requires-Dist: transformers<5.0.0
 Requires-Dist: pydantic>=2.0
 Requires-Dist: loguru
 Provides-Extra: dev
@@ -19,6 +19,8 @@ Requires-Dist: wheel>=0.36.2; extra == "dev"
 Requires-Dist: flake8>=3.8.3; extra == "dev"
 Requires-Dist: pytest>=6.0.0; extra == "dev"
 Requires-Dist: nbconvert>=7.16.3; extra == "dev"
+Requires-Dist: transformers<5.0; extra == "dev"
+Requires-Dist: accelerate; extra == "dev"
 Provides-Extra: accelerate
 Requires-Dist: accelerate; extra == "accelerate"
 Dynamic: author

src/compressed_tensors.egg-info/SOURCES.txt
@@ -166,7 +166,6 @@ tests/test_transform/utils/test_hadamard.py
 tests/test_utils/__init__.py
 tests/test_utils/test_helpers.py
 tests/test_utils/test_match.py
-tests/test_utils/test_offload.py
 tests/test_utils/test_safetensors_load.py
 tests/test_utils/test_type.py
 utils/copyright.py

tests/test_quantization/lifecycle/test_apply.py
@@ -32,7 +32,6 @@ from compressed_tensors.quantization import (
 )
 from compressed_tensors.quantization.lifecycle import apply_quantization_config
 from compressed_tensors.utils import is_match, match_named_modules
-from tests.testing_utils import requires_accelerate
 from transformers import AutoModelForCausalLM
 
 
@@ -322,7 +321,6 @@ def get_sample_tinyllama_quant_config(
     return QuantizationConfig.model_validate(config_dict)
 
 
-@requires_accelerate()
 @pytest.mark.parametrize(
     "target,should_raise_warning",
     [
@@ -462,12 +460,8 @@ def test_multi_apply_quantization_config():
     )
 
 
-@requires_accelerate()
 def test_apply_kv_cache():
-
-
-    with init_empty_weights():
-        model = AutoModelForCausalLM.from_pretrained("nm-testing/llama2.c-stories15M")
+    model = AutoModelForCausalLM.from_pretrained("nm-testing/llama2.c-stories15M")
 
     args = QuantizationArgs(
         num_bits=8,
@@ -486,12 +480,8 @@ def test_apply_kv_cache():
     assert hasattr(layer.self_attn, "v_scale")
 
 
-@requires_accelerate()
 def test_apply_attention():
-
-
-    with init_empty_weights():
-        model = AutoModelForCausalLM.from_pretrained("nm-testing/llama2.c-stories15M")
+    model = AutoModelForCausalLM.from_pretrained("nm-testing/llama2.c-stories15M")
 
     scheme = QuantizationScheme(
         targets=["LlamaAttention"],

tests/test_quantization/lifecycle/test_initialize.py
@@ -17,6 +17,7 @@ import math
 
 import pytest
 import torch
+from compressed_tensors.offload import offload_model
 from compressed_tensors.quantization import (
     FP8_E4M3_DATA,
     ActivationOrdering,
@@ -28,7 +29,7 @@ from compressed_tensors.quantization import (
 from compressed_tensors.quantization.lifecycle.initialize import (
     initialize_module_for_quantization,
 )
-from tests.testing_utils import
+from tests.testing_utils import requires_gpu
 from torch.nn import Linear
 
 
@@ -98,7 +99,7 @@ def test_initialize_module_for_quantization(
     assert layer.quantization_status == QuantizationStatus.INITIALIZED
 
 
-@
+@requires_gpu
 @pytest.mark.parametrize(
     "weights,input_activations",
     [
@@ -119,9 +120,7 @@ def test_initialize_module_for_quantization(
 def test_initialize_module_for_quantization_offloaded(
     create_quantization_scheme, weights, input_activations, layer
 ):
-
-
-    attach_align_device_hook(layer, offload=True)
+    offload_model(layer, "cuda:0")
 
     test_initialize_module_for_quantization(
         create_quantization_scheme,