compressed-tensors 0.13.1a20260127__tar.gz → 0.13.1a20260130__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177)
  1. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/.github/workflows/test-check.yaml +1 -1
  2. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/Makefile +1 -1
  3. {compressed_tensors-0.13.1a20260127/src/compressed_tensors.egg-info → compressed_tensors-0.13.1a20260130}/PKG-INFO +1 -1
  4. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/cache/base.py +3 -0
  5. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/cache/device.py +2 -2
  6. compressed_tensors-0.13.1a20260130/src/compressed_tensors/offload/cache/dist_cpu.py +53 -0
  7. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/lifecycle/apply.py +6 -9
  8. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/lifecycle/initialize.py +5 -5
  9. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/quant_args.py +29 -26
  10. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/quant_config.py +12 -12
  11. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/quant_scheme.py +6 -12
  12. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/utils/helpers.py +13 -11
  13. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/helpers.py +9 -18
  14. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/match.py +20 -21
  15. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/offload.py +3 -3
  16. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/safetensors_load.py +12 -12
  17. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/version.py +1 -1
  18. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130/src/compressed_tensors.egg-info}/PKG-INFO +1 -1
  19. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors.egg-info/SOURCES.txt +5 -0
  20. compressed_tensors-0.13.1a20260130/tests/test_modeling/test_deepseekv3_kvcache_quant.py +100 -0
  21. compressed_tensors-0.13.1a20260127/tests/test_offload/cache/test_cpu.py → compressed_tensors-0.13.1a20260130/tests/test_offload/cache/helpers.py +30 -49
  22. compressed_tensors-0.13.1a20260130/tests/test_offload/cache/test_cpu.py +80 -0
  23. compressed_tensors-0.13.1a20260130/tests/test_offload/cache/test_dist_cpu.py +139 -0
  24. compressed_tensors-0.13.1a20260130/tests/test_offload/conftest.py +76 -0
  25. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/testing_utils.py +30 -4
  26. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/.github/.gitkeep +0 -0
  27. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/.github/actions/test/action.yml +0 -0
  28. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/.github/mergify.yml +0 -0
  29. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/.github/scripts/step-status +0 -0
  30. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/.github/workflows/quality-check.yaml +0 -0
  31. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/.github/workflows/stale.yml +0 -0
  32. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/.gitignore +0 -0
  33. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/LICENSE +0 -0
  34. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/README.md +0 -0
  35. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/examples/bit_packing/ex_quantize_and_pack.py +0 -0
  36. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/examples/bit_packing/int4_config.json +0 -0
  37. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/examples/bitmask_compression.ipynb +0 -0
  38. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/examples/llama_1.1b/ex_config_quantization.py +0 -0
  39. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/examples/llama_1.1b/ex_llmcompressor_quantization.py +0 -0
  40. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/examples/llama_1.1b/example_quant_config.json +0 -0
  41. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/examples/llama_1.1b/example_quant_recipe.yaml +0 -0
  42. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/examples/quantize_and_pack_int4.ipynb +0 -0
  43. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/pyproject.toml +0 -0
  44. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/setup.cfg +0 -0
  45. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/setup.py +0 -0
  46. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/__init__.py +0 -0
  47. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/README.md +0 -0
  48. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/__init__.py +0 -0
  49. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/base.py +0 -0
  50. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/__init__.py +0 -0
  51. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/base.py +0 -0
  52. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/helpers.py +0 -0
  53. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/model_compressors/__init__.py +0 -0
  54. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/model_compressors/model_compressor.py +0 -0
  55. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/quantized_compressors/__init__.py +0 -0
  56. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/quantized_compressors/base.py +0 -0
  57. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/quantized_compressors/fp4_quantized.py +0 -0
  58. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/quantized_compressors/naive_quantized.py +0 -0
  59. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py +0 -0
  60. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/sparse_compressors/__init__.py +0 -0
  61. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/sparse_compressors/base.py +0 -0
  62. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/sparse_compressors/dense.py +0 -0
  63. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py +0 -0
  64. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/sparse_compressors/sparse_bitmask.py +0 -0
  65. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/sparse_quantized_compressors/__init__.py +0 -0
  66. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/sparse_quantized_compressors/marlin_24.py +0 -0
  67. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/config/__init__.py +0 -0
  68. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/config/base.py +0 -0
  69. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/config/dense.py +0 -0
  70. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/config/format.py +0 -0
  71. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/config/sparse_24_bitmask.py +0 -0
  72. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/config/sparse_bitmask.py +0 -0
  73. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/linear/__init__.py +0 -0
  74. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/linear/compressed_linear.py +0 -0
  75. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/logger.py +0 -0
  76. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/modeling/__init__.py +0 -0
  77. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/modeling/attention.py +0 -0
  78. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/modeling/kvcache.py +0 -0
  79. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/__init__.py +0 -0
  80. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/cache/__init__.py +0 -0
  81. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/cache/cpu.py +0 -0
  82. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/dispatch.py +0 -0
  83. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/module.py +0 -0
  84. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/utils.py +0 -0
  85. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/__init__.py +0 -0
  86. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/lifecycle/__init__.py +0 -0
  87. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/lifecycle/compressed.py +0 -0
  88. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/lifecycle/forward.py +0 -0
  89. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/lifecycle/helpers.py +0 -0
  90. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/quant_metadata.py +0 -0
  91. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/utils/__init__.py +0 -0
  92. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/utils/mxfp4_utils.py +0 -0
  93. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/registry/__init__.py +0 -0
  94. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/registry/registry.py +0 -0
  95. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/__init__.py +0 -0
  96. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/apply.py +0 -0
  97. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/factory/__init__.py +0 -0
  98. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/factory/base.py +0 -0
  99. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/factory/hadamard.py +0 -0
  100. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/factory/matrix_multiply.py +0 -0
  101. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/factory/random_hadamard.py +0 -0
  102. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/transform_args.py +0 -0
  103. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/transform_config.py +0 -0
  104. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/transform_scheme.py +0 -0
  105. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/utils/__init__.py +0 -0
  106. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/utils/hadamard.py +0 -0
  107. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/utils/hadamards.safetensors +0 -0
  108. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/utils/matrix.py +0 -0
  109. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/__init__.py +0 -0
  110. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/binary_search.py +0 -0
  111. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/internal.py +0 -0
  112. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/permutations_24.py +0 -0
  113. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/semi_structured_conversions.py +0 -0
  114. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/type.py +0 -0
  115. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors.egg-info/dependency_links.txt +0 -0
  116. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors.egg-info/requires.txt +0 -0
  117. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors.egg-info/top_level.txt +0 -0
  118. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/__init__.py +0 -0
  119. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/conftest.py +0 -0
  120. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/mock_observer.py +0 -0
  121. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/__init__.py +0 -0
  122. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/model_compressors/__init__.py +0 -0
  123. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/model_compressors/test_model_compressor.py +0 -0
  124. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/quantized_compressors/__init__.py +0 -0
  125. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/quantized_compressors/test_fp4_quant.py +0 -0
  126. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/quantized_compressors/test_fp8_quant.py +0 -0
  127. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/quantized_compressors/test_int_quant.py +0 -0
  128. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/quantized_compressors/test_pack_quant.py +0 -0
  129. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/quantized_compressors/test_packed_asym_decompression.py +0 -0
  130. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/sparse_compressors/__init__.py +0 -0
  131. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/sparse_compressors/test_bitmask.py +0 -0
  132. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/sparse_compressors/test_sparse_24_bitmask.py +0 -0
  133. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/sparse_quantized_compressors/__init__.py +0 -0
  134. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/sparse_quantized_compressors/test_marlin_24.py +0 -0
  135. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_configs/__init__.py +0 -0
  136. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_configs/test_base.py +0 -0
  137. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_configs/test_infer_quant.py +0 -0
  138. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_examples/test_bitmask_compression_ipynb.py +0 -0
  139. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_linear/__init__.py +0 -0
  140. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_linear/test_compressed_linear.py +0 -0
  141. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_modeling/test_attention_and_cache.py +0 -0
  142. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_offload/test_dispatch.py +0 -0
  143. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_offload/test_interface.py +0 -0
  144. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_offload/test_module.py +0 -0
  145. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/__init__.py +0 -0
  146. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/__init__.py +0 -0
  147. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/conftest.py +0 -0
  148. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/test_apply.py +0 -0
  149. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/test_dynamic_lifecycle.py +0 -0
  150. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/test_enabled.py +0 -0
  151. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/test_forward.py +0 -0
  152. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/test_initialize.py +0 -0
  153. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/test_lifecycle.py +0 -0
  154. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/test_static_lifecycle.py +0 -0
  155. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/test_configs/__init__.py +0 -0
  156. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/test_configs/test_bit_depths.py +0 -0
  157. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/test_configs/test_strategies.py +0 -0
  158. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/test_quant_args.py +0 -0
  159. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/test_quant_config.py +0 -0
  160. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/test_quant_scheme.py +0 -0
  161. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/test_utils/test_helpers.py +0 -0
  162. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/test_utils/test_mxfp4_utils.py +0 -0
  163. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_registry.py +0 -0
  164. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_transform/conftest.py +0 -0
  165. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_transform/factory/test_correctness.py +0 -0
  166. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_transform/factory/test_memory.py +0 -0
  167. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_transform/factory/test_serialization.py +0 -0
  168. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_transform/test_transform_args.py +0 -0
  169. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_transform/test_transform_config.py +0 -0
  170. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_transform/test_transform_scheme.py +0 -0
  171. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_transform/utils/test_hadamard.py +0 -0
  172. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_utils/__init__.py +0 -0
  173. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_utils/test_helpers.py +0 -0
  174. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_utils/test_match.py +0 -0
  175. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_utils/test_safetensors_load.py +0 -0
  176. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/tests/test_utils/test_type.py +0 -0
  177. {compressed_tensors-0.13.1a20260127 → compressed_tensors-0.13.1a20260130}/utils/copyright.py +0 -0
.github/workflows/test-check.yaml
@@ -12,7 +12,7 @@ on:
 
 jobs:
   python-tests:
-    runs-on: ibm-wdc-k8s-vllm-h100-solo
+    runs-on: gcp-k8s-vllm-l4-duo
     env:
       HF_TOKEN: ${{ secrets.HF_RED_HAT_READ_ONLY }}
     steps:
Makefile
@@ -23,7 +23,7 @@ style:
 # run tests for the repo
 test:
 	@echo "Running python tests";
-	pytest tests;
+	pytest -ra tests;
 
 # creates wheel file
 build:
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.13.1a20260127
+Version: 0.13.1a20260130
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/vllm-project/compressed-tensors
 Author: Neuralmagic, Inc.
src/compressed_tensors/offload/cache/base.py
@@ -67,6 +67,7 @@ class OffloadCache(MutableMapping, ABC):
         """
         from compressed_tensors.offload.cache.cpu import CPUCache
         from compressed_tensors.offload.cache.device import DeviceCache
+        from compressed_tensors.offload.cache.dist_cpu import DistributedCPUCache
 
         device_type = torch.device(device).type if device != "disk" else "disk"
         distributed = dist.is_available() and dist.is_initialized()
@@ -74,6 +75,8 @@ class OffloadCache(MutableMapping, ABC):
         match (device_type, distributed):
             case ("cpu", False):
                 return CPUCache
+            case ("cpu", True):
+                return DistributedCPUCache
             case ("cuda", False):
                 return DeviceCache
             case _:
src/compressed_tensors/offload/cache/device.py
@@ -35,8 +35,8 @@ class DeviceCache(OffloadCache):
         :param key: cpu tensor to onload
         :return: device tensor
         """
-        assert offloaded.device == self.onload_device
-        return offloaded
+        # move because onload_device might be modified after init
+        return send_tensors(offloaded, device=self.onload_device, copy=False)
 
     def offload(self, tensor: torch.Tensor | None) -> torch.Tensor:
         """
src/compressed_tensors/offload/cache/dist_cpu.py (new file)
@@ -0,0 +1,53 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.distributed as dist
+from compressed_tensors.offload.cache.cpu import CPUCache
+
+
+class DistributedCPUCache(CPUCache):
+    """
+    Handles offloading and onloading tensors from/to cpu memory shared across processes
+    """
+
+    offload_device = torch.device("cpu")
+
+    def offload(self, tensor: torch.Tensor | None) -> torch.Tensor:
+        if tensor is None:
+            return None
+
+        # slight runtime cost for views
+        tensor = tensor.contiguous()
+
+        if dist.get_rank() == 0:
+            # create shared memory cpu tensor
+            tensor = super().offload(tensor).share_memory_()
+            (handle, filename, nbytes) = tensor.untyped_storage()._share_filename_cpu_()
+            broadcast_obj = [handle, filename, nbytes]
+        else:
+            broadcast_obj = [None, None, None]
+
+        # receive shared memory file handle
+        dist.broadcast_object_list(broadcast_obj, src=0)
+
+        if dist.get_rank() != 0:
+            # reconstruct tensor from shared memory file handle
+            tensor = torch.empty_like(tensor, device=self.offload_device)
+            tensor.set_(torch.UntypedStorage._new_shared_filename_cpu(*broadcast_obj))
+
+        # ensure that rank 0 does not garbage collect before other ranks reconstruct
+        dist.barrier()
+
+        return tensor
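The new cache relies on PyTorch's file-backed shared memory: rank 0 offloads the tensor once, shares its storage, and broadcasts the storage handle so every other rank maps the same physical memory instead of holding a private copy. Below is a minimal standalone sketch of that mechanism outside the package; the worker function, port, and world size are illustrative assumptions, while the storage calls are the same internal PyTorch APIs used in offload() above.

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def worker(rank: int, world_size: int):
    dist.init_process_group(
        "gloo", init_method="tcp://127.0.0.1:29500", rank=rank, world_size=world_size
    )
    # use filename-based sharing so the broadcast handle refers to a shared file
    mp.set_sharing_strategy("file_system")

    if rank == 0:
        # rank 0 owns the data and moves it into shared memory
        tensor = torch.arange(4, dtype=torch.float32).share_memory_()
        handle, filename, nbytes = tensor.untyped_storage()._share_filename_cpu_()
        broadcast_obj = [handle, filename, nbytes]
    else:
        tensor = torch.empty(4, dtype=torch.float32)
        broadcast_obj = [None, None, None]

    # every rank receives the same shared-memory file handle
    dist.broadcast_object_list(broadcast_obj, src=0)

    if rank != 0:
        # map the shared storage rather than copying the data
        tensor.set_(torch.UntypedStorage._new_shared_filename_cpu(*broadcast_obj))

    # as in offload(): keep rank 0's reference alive until everyone has mapped it
    dist.barrier()
    assert torch.equal(tensor, torch.arange(4, dtype=torch.float32))
    dist.destroy_process_group()


if __name__ == "__main__":
    mp.spawn(worker, args=(2,), nprocs=2)

The final dist.barrier() mirrors the one in the new file: without it, rank 0 could release the shared storage before slower ranks finish reconstructing their view of it.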
src/compressed_tensors/quantization/lifecycle/apply.py
@@ -14,9 +14,6 @@
 
 from collections import OrderedDict
 from copy import deepcopy
-from typing import Dict, List, Optional
-from typing import OrderedDict as OrderedDictType
-from typing import Union
 
 import torch
 from compressed_tensors.config import CompressionFormat
@@ -60,8 +57,8 @@ from compressed_tensors.utils.safetensors_load import (
 
 def load_pretrained_quantization_parameters(
     model: Module,
-    model_name_or_path: Optional[str] = None,
-    load_weight_qparams: Optional[bool] = False,
+    model_name_or_path: str | None = None,
+    load_weight_qparams: bool = False,
 ):
     """
     Loads the quantization parameters (scale and zero point) from model_name_or_path to
@@ -110,7 +107,7 @@ def load_pretrained_quantization_parameters(
 
 
 def apply_quantization_config(
-    model: Module, config: Union[QuantizationConfig, None], run_compressed: bool = False
+    model: Module, config: QuantizationConfig | None, run_compressed: bool = False
 ):
     """
     Initializes the model for quantization in-place based on the given config.
@@ -207,7 +204,7 @@ def _apply_kv_cache_scheme(
 
 
 def _load_quant_args_from_mapping(
-    base_name: str, module_name: str, module: Module, mapping: Dict
+    base_name: str, module_name: str, module: Module, mapping: dict
 ):
     # TODO: skip update and just register here, don't do it in initialize
     """
@@ -251,8 +248,8 @@ def _load_quant_args_from_mapping(
 
 
 def _scheme_from_targets(
-    target_to_scheme: OrderedDictType[str, QuantizationScheme],
-    targets: List[str],
+    target_to_scheme: OrderedDict[str, QuantizationScheme],
+    targets: list[str],
     name: str,
 ) -> QuantizationScheme:
     # return the first scheme (the prioritized one,
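Aside from the dropped imports, most hunks in this file and the ones that follow are a mechanical typing cleanup: typing.Optional/Union/Dict/List/Tuple/Set annotations are rewritten to the builtin generics and | unions of PEP 585/604. A quick reference for the equivalences being applied (an illustrative sketch, not package code):

from typing import Dict, List, Optional, Tuple, Union

x: Optional[int]     # becomes  x: int | None
y: Union[str, bool]  # becomes  y: str | bool
d: Dict[str, int]    # becomes  d: dict[str, int]
s: List[str]         # becomes  s: list[str]
t: Tuple[int, ...]   # becomes  t: tuple[int, ...]

A few defaults tighten in passing as well, e.g. load_weight_qparams: Optional[bool] = False above becomes a plain bool, since None was never an accepted value.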
src/compressed_tensors/quantization/lifecycle/initialize.py
@@ -14,7 +14,6 @@
 
 
 import logging
-from typing import Optional, Tuple, Union
 
 import torch
 from compressed_tensors.modeling import (
@@ -60,7 +59,7 @@ _LOGGER = logging.getLogger(__name__)
 
 def initialize_module_for_quantization(
     module: Module,
-    scheme: Optional[QuantizationScheme] = None,
+    scheme: QuantizationScheme | None = None,
     force_zero_point: bool = True,
 ):
     """
@@ -148,6 +147,7 @@ def is_attention_module(module: Module):
         hasattr(module, "k_proj")
         or hasattr(module, "v_proj")
         or hasattr(module, "qkv_proj")
+        or hasattr(module, "kv_b_proj")
     )
 
 
@@ -155,7 +155,7 @@ def initialize_qparams(
     module: Module,
     base_name: str,
     quantization_args: QuantizationArgs,
-    observed_shape: Tuple[Union[int, None]],
+    observed_shape: tuple[int | None, ...],
     observed_dtype: torch.dtype,
     force_zero_point: bool = True,
 ):
@@ -279,8 +279,8 @@ def initialize_attn_qparams(
 ):
     """Initlaize k_scale, v_scale for self_attn"""
 
-    impl: Optional[QuantizedAttentionImpl] = getattr(module, IMPL_ATTR, None)
-    kv_cache: Optional[QuantizedKVCache] = getattr(module, KV_CACHE_ATTR, None)
+    impl: QuantizedAttentionImpl | None = getattr(module, IMPL_ATTR, None)
+    kv_cache: QuantizedKVCache | None = getattr(module, KV_CACHE_ATTR, None)
 
     if impl is None and kv_cache is None:
         raise ValueError(
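One functional change hides among the typing edits in this file: the added kv_b_proj check in is_attention_module lets DeepSeek-V3-style MLA attention, which up-projects a compressed KV latent through a single kv_b_proj instead of separate k_proj/v_proj layers, be recognized for KV-cache quantization (exercised by the new tests/test_modeling/test_deepseekv3_kvcache_quant.py). A hedged sketch of the effect, assuming the class-name check that surrounds the hasattr chain but is not shown in the hunk:

import torch
from compressed_tensors.quantization.lifecycle.initialize import is_attention_module


class DeepseekV3Attention(torch.nn.Module):  # illustrative stand-in, not the real HF class
    def __init__(self):
        super().__init__()
        # MLA up-projects the compressed KV latent with one linear layer
        self.kv_b_proj = torch.nn.Linear(512, 4096)


# previously False (no k_proj / v_proj / qkv_proj attribute); now True
assert is_attention_module(DeepseekV3Attention())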
src/compressed_tensors/quantization/quant_args.py
@@ -14,7 +14,7 @@
 
 import warnings
 from enum import Enum
-from typing import Any, Dict, List, Optional, Union
+from typing import Any
 
 import torch
 from compressed_tensors.utils import Aliasable
@@ -48,10 +48,10 @@ __all__ = [
 class FloatArgs:
     exponent: int
     mantissa: int
-    bits: Optional[int] = None
-    max: Optional[float] = None
-    min: Optional[float] = None
-    dtype: Optional[torch.dtype] = None
+    bits: int | None = None
+    max: float | None = None
+    min: float | None = None
+    dtype: torch.dtype | None = None
 
 
 class FP4_E2M1_DATA(FloatArgs):
@@ -147,7 +147,7 @@ class ActivationOrdering(Aliasable, str, Enum):
     STATIC = "static"
 
     @staticmethod
-    def get_aliases() -> Dict[str, str]:
+    def get_aliases() -> dict[str, str]:
         return {
             "dynamic": "group",
             "static": "weight",
@@ -178,21 +178,21 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
     num_bits: int = 8
     type: QuantizationType = QuantizationType.INT
     symmetric: bool = True
-    group_size: Optional[int] = None
-    strategy: Optional[QuantizationStrategy] = None
-    block_structure: Optional[List[int]] = None
-    dynamic: Union[DynamicType, bool] = False
-    actorder: Union[ActivationOrdering, bool, None] = None
-    scale_dtype: Optional[TorchDtype] = None
-    zp_dtype: Optional[TorchDtype] = None
-    observer: Optional[str] = Field(
+    group_size: int | None = None
+    strategy: QuantizationStrategy | None = None
+    block_structure: list[int] | None = None
+    dynamic: DynamicType | bool = False
+    actorder: ActivationOrdering | bool | None = None
+    scale_dtype: TorchDtype | None = None
+    zp_dtype: TorchDtype | None = None
+    observer: str | None = Field(
         default=None,
         description=(
             "Determines the method of computing quantization parameters (scales and "
             "zero-points). Defaults to min-max when not using dynamic quantization"
         ),
     )
-    observer_kwargs: Dict[str, Any] = Field(
+    observer_kwargs: dict[str, Any] = Field(
         default_factory=dict,
         description=(
             "optional dict of kwargs to be passed directly to torch quantization "
@@ -214,7 +214,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         return value
 
     @field_validator("group_size", mode="before")
-    def validate_group(cls, value) -> Union[int, None]:
+    def validate_group(cls, value) -> int | None:
         if value is None:
             return value
 
@@ -227,7 +227,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         return value
 
     @field_validator("block_structure", mode="before")
-    def validate_block_structure(cls, value) -> Optional[List[int]]:
+    def validate_block_structure(cls, value) -> list[int] | None:
         if value is None:
             return value
         # For backward compatibility, allow string format "2x4", "8x16", etc.
@@ -251,14 +251,14 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         )
 
     @field_validator("strategy", mode="before")
-    def validate_strategy(cls, value) -> Union[QuantizationStrategy, None]:
+    def validate_strategy(cls, value) -> QuantizationStrategy | None:
         if isinstance(value, str):
             return QuantizationStrategy(value.lower())
 
         return value
 
     @field_validator("actorder", mode="before")
-    def validate_actorder(cls, value) -> Optional[ActivationOrdering]:
+    def validate_actorder(cls, value) -> ActivationOrdering | None:
         if isinstance(value, bool):
             return ActivationOrdering.GROUP if value else None
 
@@ -268,7 +268,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         return value
 
     @field_validator("dynamic", mode="before")
-    def validate_dynamic(cls, value) -> Union[DynamicType, bool]:
+    def validate_dynamic(cls, value) -> DynamicType | bool:
         if isinstance(value, str):
             return DynamicType(value.lower())
         return value
@@ -329,10 +329,13 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
             raise ValueError(f"Block structure requires block strategy\n{model}")
 
         # validate activation ordering and strategy
-        if actorder is not None and strategy != QuantizationStrategy.GROUP:
+        if actorder is not None and strategy not in (
+            QuantizationStrategy.GROUP,
+            QuantizationStrategy.TENSOR_GROUP,
+        ):
             raise ValueError(
-                "Must use group quantization strategy in order to apply "
-                "activation ordering"
+                "Must use group or tensor_group quantization strategy in "
+                "order to apply activation ordering"
             )
 
         # infer observer w.r.t. dynamic
@@ -369,7 +372,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
 
         elif observer is None:
             # default to minmax for non-dynamic cases
-            observer = "minmax"
+            observer = "memoryless_minmax"
 
         if zp_dtype is None:
            if model.num_bits == 4 and model.type == QuantizationType.FLOAT:
@@ -409,7 +412,7 @@
 def round_to_quantized_type_dtype(
     tensor: torch.Tensor,
     dtype: torch.dtype,
-    cast_to_original_dtype: Optional[bool] = True,
+    cast_to_original_dtype: bool = True,
 ) -> torch.Tensor:
     """
     Rounds an input tensor to the nearest quantized representation given a dtype.
@@ -439,7 +442,7 @@ def round_to_quantized_type_args(
     args: QuantizationArgs,
     min: torch.Tensor,
     max: torch.Tensor,
-    cast_to_original_dtype: Optional[bool] = True,
+    cast_to_original_dtype: bool = True,
 ) -> torch.Tensor:
     """
     Rounds an input tensor to the nearest quantized representation given
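Two behavioral changes sit among the typing edits in this file: activation ordering is now accepted with the tensor_group strategy in addition to group, and the observer inferred for static (non-dynamic) quantization moves from "minmax" to "memoryless_minmax". A sketch of the new default, assuming the validator writes the inferred value back to the model; the preset schemes later in this diff drop their explicit observer=... arguments and rely on exactly this inference:

from compressed_tensors.quantization import QuantizationArgs

args = QuantizationArgs(num_bits=8, symmetric=True, dynamic=False)
# previously inferred as "minmax"
assert args.observer == "memoryless_minmax"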
src/compressed_tensors/quantization/quant_config.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 from collections import defaultdict
 from enum import Enum
-from typing import Annotated, Any, Dict, List, Optional, Set, Union
+from typing import Annotated, Any
 
 from compressed_tensors.config import CompressionFormat
 from compressed_tensors.quantization.quant_args import DynamicType, QuantizationArgs
@@ -55,7 +55,7 @@ class QuantizationStatus(str, Enum):
     COMPRESSED = "compressed"
 
     @classmethod
-    def lifecycle_order(cls) -> List["QuantizationStatus"]:
+    def lifecycle_order(cls) -> list["QuantizationStatus"]:
         """
         :return: list of correct quantization lifecycle order
         """
@@ -131,13 +131,13 @@ class QuantizationConfig(BaseModel):
     are not quantized even if they match up with a target in config_groups
     """
 
-    config_groups: Dict[str, Union[QuantizationScheme, List[str]]]
+    config_groups: dict[str, QuantizationScheme | list[str]]
     quant_method: str = DEFAULT_QUANTIZATION_METHOD
-    kv_cache_scheme: Optional[QuantizationArgs] = None
+    kv_cache_scheme: QuantizationArgs | None = None
     format: str = DEFAULT_QUANTIZATION_FORMAT
     quantization_status: QuantizationStatus = QuantizationStatus.INITIALIZED
-    global_compression_ratio: Optional[float] = None
-    ignore: Optional[List[str]] = Field(default_factory=list)
+    global_compression_ratio: float | None = None
+    ignore: list[str] | None = Field(default_factory=list)
     # `run_compressed` is a dummy, unused arg for backwards compatibility
     # see: https://github.com/huggingface/transformers/pull/39324
     run_compressed: Annotated[Any, Field(exclude=True)] = None
@@ -161,8 +161,8 @@ class QuantizationConfig(BaseModel):
 
     @staticmethod
     def from_pretrained(
-        model: Module, format: Optional[Union[str, list]] = None
-    ) -> Optional["QuantizationConfig"]:
+        model: Module, format: str | list | None = None
+    ) -> "QuantizationConfig | None":
         """
         Converts a model into its associated QuantizationConfig based on the
         QuantizationScheme attached to each quantized module
@@ -177,21 +177,21 @@ class QuantizationConfig(BaseModel):
 
         # set of all quantization schemes
         # TODO: make quant config/scheme/args frozen/hashable and use a set
-        quantization_schemes: List[QuantizationScheme] = list()
+        quantization_schemes: list[QuantizationScheme] = list()
 
         # use any status from modules (in practice, use the last module)
         model_status = None
 
         # set of all quantized types
         # this is later used to create the ignore list
-        quantization_type_names: Set[str] = set()
+        quantization_type_names: set[str] = set()
 
         # maps types to names which are not quantized
         # this is later used to create the ignore list
-        ignore: Dict[str, List[str]] = defaultdict(list)
+        ignore: dict[str, list[str]] = defaultdict(list)
 
         # this keeps track of any kvcache schemes
-        kv_cache_scheme: Optional[QuantizationArgs] = None
+        kv_cache_scheme: QuantizationArgs | None = None
 
         for name, submodule in model.named_modules():
             layer_type: str = module_type(submodule)
src/compressed_tensors/quantization/quant_scheme.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 import warnings
 from copy import deepcopy
-from typing import List, Optional
 
 import torch
 from compressed_tensors.config import CompressionFormat
@@ -47,11 +46,11 @@ class QuantizationScheme(BaseModel):
     :param format: CompressionFormat for the layer
     """
 
-    targets: List[str]
-    weights: Optional[QuantizationArgs] = None
-    input_activations: Optional[QuantizationArgs] = None
-    output_activations: Optional[QuantizationArgs] = None
-    format: Optional[str] = None
+    targets: list[str]
+    weights: QuantizationArgs | None = None
+    input_activations: QuantizationArgs | None = None
+    output_activations: QuantizationArgs | None = None
+    format: str | None = None
 
     @model_validator(mode="after")
     def validate_model_after(model: "QuantizationScheme") -> "QuantizationScheme":
@@ -121,7 +120,7 @@ Pre-Set Quantization Scheme Args
 """
 
 
-def preset_name_to_scheme(name: str, targets: List[str]) -> QuantizationScheme:
+def preset_name_to_scheme(name: str, targets: list[str]) -> QuantizationScheme:
     """
     :param name: preset quantization settings name. must exist in upper case in
     PRESET_SCHEMES
@@ -175,7 +174,6 @@ NVFP4 = dict(
         symmetric=True,
         dynamic=False,
         group_size=16,
-        observer="static_minmax",
         scale_dtype=FP8_E4M3_DATA.dtype,
         zp_dtype=FP8_E4M3_DATA.dtype,
     ),
@@ -244,7 +242,6 @@ INT8_W8A8 = dict(
         strategy=QuantizationStrategy.TOKEN,
         symmetric=True,
         dynamic=True,
-        observer=None,
     ),
 )
 
@@ -299,7 +296,6 @@ INT8_W4A8 = dict(
         strategy=QuantizationStrategy.TOKEN,
         symmetric=True,
         dynamic=True,
-        observer=None,
     ),
 )
 
@@ -356,7 +352,6 @@ FP8_DYNAMIC = dict(
         strategy=QuantizationStrategy.TOKEN,
         symmetric=True,
        dynamic=True,
-        observer=None,
     ),
 )
 
@@ -378,7 +373,6 @@ FP8_BLOCK = dict(
         strategy=QuantizationStrategy.GROUP,
         symmetric=True,
         dynamic=True,
-        observer=None,
         group_size=128,
     ),
 )
src/compressed_tensors/quantization/utils/helpers.py
@@ -14,7 +14,7 @@
 
 import logging
 import math
-from typing import Generator, Optional, Tuple
+from collections.abc import Generator
 
 import torch
 from compressed_tensors.quantization.quant_args import (
@@ -66,8 +66,8 @@ def calculate_qparams(
     min_vals: Tensor,
     max_vals: Tensor,
     quantization_args: QuantizationArgs,
-    global_scale: Optional[Tensor] = None,
-) -> Tuple[FloatTensor, IntTensor]:
+    global_scale: Tensor | None = None,
+) -> tuple[FloatTensor, IntTensor]:
     """
     :param min_vals: tensor of min value(s) to calculate scale(s) and zero point(s)
         from
@@ -152,7 +152,7 @@ def compute_dynamic_scales_and_zp(
     value: Tensor,
     args: QuantizationArgs,
     module: torch.nn.Module,
-    global_scale: Optional[Tensor] = None,
+    global_scale: Tensor | None = None,
 ):
     """
     Returns the computed scales and zero points for dynamic activation
@@ -207,7 +209,9 @@ def compute_dynamic_scales_and_zp(
     return calculate_qparams(min_val, max_val, args, global_scale=global_scale)
 
 
-def calculate_range(quantization_args: QuantizationArgs, device: str) -> Tuple:
+def calculate_range(
+    quantization_args: QuantizationArgs, device: str
+) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Calculated the effective quantization range for the given Quantization Args
 
@@ -285,7 +287,7 @@ def module_type(module: Module) -> str:
     "Please use `model.named_modules()` and filter by "
     "compressed_tensors.InternalModule if neceessary"
 )
-def iter_named_leaf_modules(model: Module) -> Generator[Tuple[str, Module], None, None]:
+def iter_named_leaf_modules(model: Module) -> Generator[tuple[str, Module], None, None]:
     """
     Yields modules that do not have any submodules except observers. The observers
     themselves are not yielded
@@ -321,7 +323,7 @@ def iter_named_quantizable_modules(
     include_children: bool = True,
     include_attn: bool = False,
     include_mlp: bool = False,
-) -> Generator[Tuple[str, Module], None, None]:
+) -> Generator[tuple[str, Module], None, None]:
     """
     Yield name and submodule of
     - leaf modules, set by include_children
@@ -416,9 +418,9 @@ def is_kv_cache_quant_scheme(scheme: QuantizationScheme) -> bool:
 def generate_gparam(
     updated_min_val: torch.Tensor,
     updated_max_val: torch.Tensor,
-    scale_data: Optional[FloatArgs] = FP8_E4M3_DATA,
-    quant_data: Optional[FloatArgs] = FP4_E2M1_DATA,
-    dtype: Optional[torch.dtype] = torch.float32,
+    scale_data: FloatArgs | None = FP8_E4M3_DATA,
+    quant_data: FloatArgs | None = FP4_E2M1_DATA,
+    dtype: torch.dtype | None = torch.float32,
 ):
     """
     Generate a global scale for an entire tensor (input_tensor).
@@ -439,7 +441,7 @@ def generate_gparam(
 def strategy_cdiv(
     value: int,
     divisor: int,
-    strategy: Optional[QuantizationStrategy],
+    strategy: QuantizationStrategy | None,
     strict: bool = False,
 ) -> int:
     dividend = math.ceil(value / divisor)
src/compressed_tensors/utils/helpers.py
@@ -14,19 +14,10 @@
 
 import contextlib
 import warnings
+from collections.abc import Callable, Iterable, Mapping
 from functools import wraps
 from types import MappingProxyType
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    Dict,
-    Iterable,
-    List,
-    Mapping,
-    Optional,
-    TypeVar,
-)
+from typing import TYPE_CHECKING, Any, TypeVar
 
 import numpy
 import torch
@@ -66,7 +57,7 @@ FSDP_WRAPPER_NAME = "_fsdp_wrapped_module"
 
 def infer_compressor_from_model_config(
     pretrained_model_name_or_path: str,
-) -> Optional["ModelCompressor"]:  # noqa: F821
+) -> "ModelCompressor | None":  # noqa: F821
     """
     Given a path to a model config, extract a sparsity config if it exists and return
     the associated ModelCompressor
@@ -185,7 +176,7 @@ def getattr_chain(obj: Any, chain_str: str, *args, **kwargs) -> Any:
 
 
 def deprecated(
-    future_name: Optional[str] = None, message: Optional[str] = None
+    future_name: str | None = None, message: str | None = None
 ) -> Callable[[T], T]:
     """
     Decorator to mark functions as deprecated
@@ -224,7 +215,7 @@ class Aliasable:
     """
 
     @staticmethod
-    def get_aliases() -> Dict[str, str]:
+    def get_aliases() -> dict[str, str]:
         raise NotImplementedError()
 
     def __eq__(self, other):
@@ -246,8 +237,8 @@ class Aliasable:
 
 
 def shard_tensor(
-    tensor: torch.Tensor, shard_sizes: List[int], dim: int = 0
-) -> List[torch.Tensor]:
+    tensor: torch.Tensor, shard_sizes: list[int], dim: int = 0
+) -> list[torch.Tensor]:
     """
     Shards a tensor into a list of tensors along a given dimension.
 
@@ -277,7 +268,7 @@ def shard_tensor(
     return shards
 
 
-def combine_shards(shards, dim=0):
+def combine_shards(shards: list[torch.Tensor], dim: int = 0) -> torch.Tensor:
     """
     Combine decompressed shards along a given dimension using `narrow`.
 
@@ -325,7 +316,7 @@ def pack_bitmasks(bytemasks: torch.Tensor) -> torch.Tensor:
 
 
 def unpack_bitmasks(
-    packed_bitmasks: torch.Tensor, original_shape: List[int]
+    packed_bitmasks: torch.Tensor, original_shape: list[int]
 ) -> torch.Tensor:
     """
     Converts a bitmask tensor back to a bytemask tensor for use during decompression
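For context on this final, truncated hunk: pack_bitmasks and unpack_bitmasks are inverses used by the sparse compressors, and only the annotation changes here. A small round-trip sketch, assuming both helpers are re-exported from compressed_tensors.utils:

import torch
from compressed_tensors.utils import pack_bitmasks, unpack_bitmasks

bytemask = torch.rand(4, 10) > 0.5           # boolean mask, one byte per element
packed = pack_bitmasks(bytemask)             # bit-packed along the final dimension
restored = unpack_bitmasks(packed, [4, 10])  # original shape recovers the tail bits
assert torch.equal(restored, bytemask)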