compressed-tensors 0.13.1a20260123__tar.gz → 0.13.1a20260130__tar.gz

This diff shows the changes between two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only.
Files changed (181)
  1. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/.github/actions/test/action.yml +1 -1
  2. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/.github/workflows/test-check.yaml +2 -2
  3. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/Makefile +1 -1
  4. {compressed_tensors-0.13.1a20260123/src/compressed_tensors.egg-info → compressed_tensors-0.13.1a20260130}/PKG-INFO +4 -2
  5. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/setup.py +2 -2
  6. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/linear/compressed_linear.py +0 -6
  7. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/__init__.py +7 -6
  8. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/cache/base.py +3 -0
  9. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/cache/device.py +2 -2
  10. compressed_tensors-0.13.1a20260130/src/compressed_tensors/offload/cache/dist_cpu.py +53 -0
  11. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/dispatch.py +1 -1
  12. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/lifecycle/apply.py +6 -9
  13. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/lifecycle/forward.py +18 -19
  14. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/lifecycle/initialize.py +7 -7
  15. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/quant_args.py +29 -26
  16. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/quant_config.py +12 -12
  17. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/quant_scheme.py +6 -12
  18. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/utils/helpers.py +13 -11
  19. compressed_tensors-0.13.1a20260130/src/compressed_tensors/transform/apply.py +36 -0
  20. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/factory/base.py +3 -11
  21. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/helpers.py +9 -18
  22. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/match.py +20 -21
  23. compressed_tensors-0.13.1a20260130/src/compressed_tensors/utils/offload.py +195 -0
  24. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/safetensors_load.py +12 -12
  25. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/version.py +1 -1
  26. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130/src/compressed_tensors.egg-info}/PKG-INFO +4 -2
  27. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors.egg-info/SOURCES.txt +5 -1
  28. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors.egg-info/requires.txt +3 -1
  29. compressed_tensors-0.13.1a20260130/tests/test_modeling/test_deepseekv3_kvcache_quant.py +100 -0
  30. compressed_tensors-0.13.1a20260123/tests/test_offload/cache/test_cpu.py → compressed_tensors-0.13.1a20260130/tests/test_offload/cache/helpers.py +30 -49
  31. compressed_tensors-0.13.1a20260130/tests/test_offload/cache/test_cpu.py +80 -0
  32. compressed_tensors-0.13.1a20260130/tests/test_offload/cache/test_dist_cpu.py +139 -0
  33. compressed_tensors-0.13.1a20260130/tests/test_offload/conftest.py +76 -0
  34. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/test_apply.py +2 -12
  35. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/test_initialize.py +4 -5
  36. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_transform/factory/test_correctness.py +6 -15
  37. compressed_tensors-0.13.1a20260130/tests/test_transform/factory/test_memory.py +74 -0
  38. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_transform/factory/test_serialization.py +8 -16
  39. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_utils/test_match.py +28 -34
  40. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/testing_utils.py +30 -22
  41. compressed_tensors-0.13.1a20260123/src/compressed_tensors/transform/apply.py +0 -71
  42. compressed_tensors-0.13.1a20260123/src/compressed_tensors/utils/offload.py +0 -672
  43. compressed_tensors-0.13.1a20260123/tests/test_transform/factory/test_memory.py +0 -92
  44. compressed_tensors-0.13.1a20260123/tests/test_utils/test_offload.py +0 -540
  45. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/.github/.gitkeep +0 -0
  46. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/.github/mergify.yml +0 -0
  47. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/.github/scripts/step-status +0 -0
  48. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/.github/workflows/quality-check.yaml +0 -0
  49. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/.github/workflows/stale.yml +0 -0
  50. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/.gitignore +0 -0
  51. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/LICENSE +0 -0
  52. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/README.md +0 -0
  53. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/examples/bit_packing/ex_quantize_and_pack.py +0 -0
  54. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/examples/bit_packing/int4_config.json +0 -0
  55. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/examples/bitmask_compression.ipynb +0 -0
  56. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/examples/llama_1.1b/ex_config_quantization.py +0 -0
  57. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/examples/llama_1.1b/ex_llmcompressor_quantization.py +0 -0
  58. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/examples/llama_1.1b/example_quant_config.json +0 -0
  59. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/examples/llama_1.1b/example_quant_recipe.yaml +0 -0
  60. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/examples/quantize_and_pack_int4.ipynb +0 -0
  61. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/pyproject.toml +0 -0
  62. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/setup.cfg +0 -0
  63. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/__init__.py +0 -0
  64. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/README.md +0 -0
  65. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/__init__.py +0 -0
  66. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/base.py +0 -0
  67. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/__init__.py +0 -0
  68. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/base.py +0 -0
  69. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/helpers.py +0 -0
  70. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/model_compressors/__init__.py +0 -0
  71. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/model_compressors/model_compressor.py +0 -0
  72. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/quantized_compressors/__init__.py +0 -0
  73. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/quantized_compressors/base.py +0 -0
  74. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/quantized_compressors/fp4_quantized.py +0 -0
  75. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/quantized_compressors/naive_quantized.py +0 -0
  76. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py +0 -0
  77. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/sparse_compressors/__init__.py +0 -0
  78. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/sparse_compressors/base.py +0 -0
  79. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/sparse_compressors/dense.py +0 -0
  80. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py +0 -0
  81. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/sparse_compressors/sparse_bitmask.py +0 -0
  82. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/sparse_quantized_compressors/__init__.py +0 -0
  83. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/compressors/sparse_quantized_compressors/marlin_24.py +0 -0
  84. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/config/__init__.py +0 -0
  85. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/config/base.py +0 -0
  86. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/config/dense.py +0 -0
  87. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/config/format.py +0 -0
  88. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/config/sparse_24_bitmask.py +0 -0
  89. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/config/sparse_bitmask.py +0 -0
  90. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/linear/__init__.py +0 -0
  91. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/logger.py +0 -0
  92. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/modeling/__init__.py +0 -0
  93. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/modeling/attention.py +0 -0
  94. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/modeling/kvcache.py +0 -0
  95. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/cache/__init__.py +0 -0
  96. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/cache/cpu.py +0 -0
  97. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/module.py +0 -0
  98. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/offload/utils.py +0 -0
  99. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/__init__.py +0 -0
  100. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/lifecycle/__init__.py +0 -0
  101. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/lifecycle/compressed.py +0 -0
  102. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/lifecycle/helpers.py +0 -0
  103. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/quant_metadata.py +0 -0
  104. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/utils/__init__.py +0 -0
  105. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/quantization/utils/mxfp4_utils.py +0 -0
  106. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/registry/__init__.py +0 -0
  107. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/registry/registry.py +0 -0
  108. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/__init__.py +0 -0
  109. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/factory/__init__.py +0 -0
  110. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/factory/hadamard.py +0 -0
  111. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/factory/matrix_multiply.py +0 -0
  112. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/factory/random_hadamard.py +0 -0
  113. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/transform_args.py +0 -0
  114. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/transform_config.py +0 -0
  115. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/transform_scheme.py +0 -0
  116. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/utils/__init__.py +0 -0
  117. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/utils/hadamard.py +0 -0
  118. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/utils/hadamards.safetensors +0 -0
  119. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/transform/utils/matrix.py +0 -0
  120. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/__init__.py +0 -0
  121. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/binary_search.py +0 -0
  122. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/internal.py +0 -0
  123. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/permutations_24.py +0 -0
  124. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/semi_structured_conversions.py +0 -0
  125. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors/utils/type.py +0 -0
  126. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors.egg-info/dependency_links.txt +0 -0
  127. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/src/compressed_tensors.egg-info/top_level.txt +0 -0
  128. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/__init__.py +0 -0
  129. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/conftest.py +0 -0
  130. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/mock_observer.py +0 -0
  131. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/__init__.py +0 -0
  132. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/model_compressors/__init__.py +0 -0
  133. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/model_compressors/test_model_compressor.py +0 -0
  134. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/quantized_compressors/__init__.py +0 -0
  135. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/quantized_compressors/test_fp4_quant.py +0 -0
  136. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/quantized_compressors/test_fp8_quant.py +0 -0
  137. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/quantized_compressors/test_int_quant.py +0 -0
  138. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/quantized_compressors/test_pack_quant.py +0 -0
  139. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/quantized_compressors/test_packed_asym_decompression.py +0 -0
  140. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/sparse_compressors/__init__.py +0 -0
  141. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/sparse_compressors/test_bitmask.py +0 -0
  142. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/sparse_compressors/test_sparse_24_bitmask.py +0 -0
  143. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/sparse_quantized_compressors/__init__.py +0 -0
  144. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_compressors/sparse_quantized_compressors/test_marlin_24.py +0 -0
  145. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_configs/__init__.py +0 -0
  146. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_configs/test_base.py +0 -0
  147. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_configs/test_infer_quant.py +0 -0
  148. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_examples/test_bitmask_compression_ipynb.py +0 -0
  149. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_linear/__init__.py +0 -0
  150. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_linear/test_compressed_linear.py +0 -0
  151. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_modeling/test_attention_and_cache.py +0 -0
  152. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_offload/test_dispatch.py +0 -0
  153. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_offload/test_interface.py +0 -0
  154. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_offload/test_module.py +0 -0
  155. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/__init__.py +0 -0
  156. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/__init__.py +0 -0
  157. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/conftest.py +0 -0
  158. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/test_dynamic_lifecycle.py +0 -0
  159. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/test_enabled.py +0 -0
  160. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/test_forward.py +0 -0
  161. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/test_lifecycle.py +0 -0
  162. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/lifecycle/test_static_lifecycle.py +0 -0
  163. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/test_configs/__init__.py +0 -0
  164. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/test_configs/test_bit_depths.py +0 -0
  165. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/test_configs/test_strategies.py +0 -0
  166. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/test_quant_args.py +0 -0
  167. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/test_quant_config.py +0 -0
  168. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/test_quant_scheme.py +0 -0
  169. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/test_utils/test_helpers.py +0 -0
  170. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_quantization/test_utils/test_mxfp4_utils.py +0 -0
  171. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_registry.py +0 -0
  172. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_transform/conftest.py +0 -0
  173. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_transform/test_transform_args.py +0 -0
  174. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_transform/test_transform_config.py +0 -0
  175. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_transform/test_transform_scheme.py +0 -0
  176. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_transform/utils/test_hadamard.py +0 -0
  177. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_utils/__init__.py +0 -0
  178. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_utils/test_helpers.py +0 -0
  179. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_utils/test_safetensors_load.py +0 -0
  180. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/tests/test_utils/test_type.py +0 -0
  181. {compressed_tensors-0.13.1a20260123 → compressed_tensors-0.13.1a20260130}/utils/copyright.py +0 -0

.github/actions/test/action.yml
@@ -23,7 +23,7 @@ runs:
       with:
         venv: ${{ inputs.venv }}
         name: compressed
-        extra: "[dev,accelerate]"
+        extra: "[dev]"
 
   - name: clean up
     run: |

.github/workflows/test-check.yaml
@@ -12,7 +12,7 @@ on:
 
 jobs:
   python-tests:
-    runs-on: ibm-wdc-k8s-vllm-h100-solo
+    runs-on: gcp-k8s-vllm-l4-duo
     env:
       HF_TOKEN: ${{ secrets.HF_RED_HAT_READ_ONLY }}
     steps:
@@ -30,7 +30,7 @@ jobs:
       - name: Set Env
        run: pip3 install --upgrade pip setuptools
       - name: "⚙️ Install dependencies"
-        run: pip3 install .[dev,accelerate]
+        run: pip3 install .[dev]
       - name: clean up
        run: |
          echo "cleaning up disk space as GHA runner has limited disk size."

Makefile
@@ -23,7 +23,7 @@ style:
 
 # run tests for the repo
 test:
 	@echo "Running python tests";
-	pytest tests;
+	pytest -ra tests;
 
 # creates wheel file
 build:

PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.13.1a20260123
+Version: 0.13.1a20260130
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/vllm-project/compressed-tensors
 Author: Neuralmagic, Inc.
@@ -9,7 +9,7 @@ License: Apache 2.0
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: torch<=2.9.1,>=1.7.0
-Requires-Dist: transformers
+Requires-Dist: transformers<5.0.0
 Requires-Dist: pydantic>=2.0
 Requires-Dist: loguru
 Provides-Extra: dev
@@ -19,6 +19,8 @@ Requires-Dist: wheel>=0.36.2; extra == "dev"
 Requires-Dist: flake8>=3.8.3; extra == "dev"
 Requires-Dist: pytest>=6.0.0; extra == "dev"
 Requires-Dist: nbconvert>=7.16.3; extra == "dev"
+Requires-Dist: transformers<5.0; extra == "dev"
+Requires-Dist: accelerate; extra == "dev"
 Provides-Extra: accelerate
 Requires-Dist: accelerate; extra == "accelerate"
 Dynamic: author

setup.py
@@ -88,11 +88,11 @@ def _setup_packages() -> List:
     )
 
 def _setup_install_requires() -> List:
-    return ["torch>=1.7.0,<=2.9.1", "transformers", "pydantic>=2.0", "loguru"]
+    return ["torch>=1.7.0,<=2.9.1", "transformers<5.0.0", "pydantic>=2.0", "loguru"]
 
 def _setup_extras() -> Dict:
     return {
-        "dev": ["black==22.12.0", "isort==5.8.0", "wheel>=0.36.2", "flake8>=3.8.3", "pytest>=6.0.0", "nbconvert>=7.16.3"],
+        "dev": ["black==22.12.0", "isort==5.8.0", "wheel>=0.36.2", "flake8>=3.8.3", "pytest>=6.0.0", "nbconvert>=7.16.3", "transformers<5.0", "accelerate"],
         "accelerate": ["accelerate"]
     }
 

src/compressed_tensors/linear/compressed_linear.py
@@ -87,12 +87,6 @@ class CompressedLinear(Linear):
         # mark module as compressed
         module.quantization_status = QuantizationStatus.COMPRESSED
 
-        # handles case where forward is wrapped in new_forward by accelerate hooks
-        if hasattr(module, "_old_forward"):
-            module._old_forward = CompressedLinear.forward.__get__(
-                module, CompressedLinear
-            )
-
         return module
 
     def forward(self, input: Tensor) -> Tensor:
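
The block removed above leaned on Python's descriptor protocol: calling CompressedLinear.forward.__get__(module, CompressedLinear) turns the class's plain function into a method bound to a specific instance, which accelerate's _old_forward slot then invokes. A minimal, self-contained sketch of that binding mechanism (illustrative only, not package code):

class Greeter:
    def greet(self):
        # resolves the class name from the instance it is bound to
        return f"hello from {type(self).__name__}"

class Plain:
    pass

obj = Plain()
# function.__get__(instance, owner) returns a bound method for that instance,
# the same trick the removed accelerate-hook workaround used on forward()
bound = Greeter.greet.__get__(obj, Plain)
assert bound() == "hello from Plain"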

src/compressed_tensors/offload/__init__.py
@@ -135,9 +135,7 @@ def register_offload_module(base: torch.nn.Module, name: str, module: torch.nn.M
     """
     cache = base._parameters
     if isinstance(cache, OffloadCache):
-        offload_module(
-            module, cache.onload_device, cache.offload_device, no_split=False
-        )
+        offload_module(module, cache.onload_device, cache.offload_device)
 
     base.register_module(name, module)
 
@@ -178,9 +176,12 @@ def align_module_device(
     if isinstance(module._parameters, OffloadCache):
         assert isinstance(module._buffers, OffloadCache)
         with module._parameters.disable_offloading():
-            with patch_attr(
-                module._parameters, "onload_device", execution_device
-            ), patch_attr(module._buffers, "onload_device", execution_device):
+            if execution_device is not None:
+                with patch_attr(
+                    module._parameters, "onload_device", execution_device
+                ), patch_attr(module._buffers, "onload_device", execution_device):
+                    yield
+            else:
                 yield
 
     else:
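
With this change, passing execution_device=None keeps each cache's own onload device instead of patching it, while still disabling offloading for the duration of the context. A hedged usage sketch, assuming align_module_device and offload_model are both exported from compressed_tensors.offload and that align_module_device takes (module, execution_device=None):

import torch
from compressed_tensors.offload import align_module_device, offload_model

model = offload_model(torch.nn.Linear(8, 8), onload_device="cpu")
x = torch.randn(2, 8)

# patch the onload device for the duration of the context
with align_module_device(model, execution_device=torch.device("cpu")):
    out = model(x)

# None: offloading is paused, but caches keep their configured onload device
with align_module_device(model, execution_device=None):
    out = model(x)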

src/compressed_tensors/offload/cache/base.py
@@ -67,6 +67,7 @@ class OffloadCache(MutableMapping, ABC):
     """
     from compressed_tensors.offload.cache.cpu import CPUCache
     from compressed_tensors.offload.cache.device import DeviceCache
+    from compressed_tensors.offload.cache.dist_cpu import DistributedCPUCache
 
     device_type = torch.device(device).type if device != "disk" else "disk"
     distributed = dist.is_available() and dist.is_initialized()
@@ -74,6 +75,8 @@ class OffloadCache(MutableMapping, ABC):
     match (device_type, distributed):
         case ("cpu", False):
             return CPUCache
+        case ("cpu", True):
+            return DistributedCPUCache
         case ("cuda", False):
             return DeviceCache
         case _:
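
Restated outside the class, the selection added above gives distributed CPU offloading its own branch instead of falling through to the unsupported case. A sketch of the dispatch logic, returning names rather than the actual classes (not the library's code verbatim):

import torch
import torch.distributed as dist

def pick_cache_backend(device: torch.device | str) -> str:
    device_type = torch.device(device).type if device != "disk" else "disk"
    distributed = dist.is_available() and dist.is_initialized()
    match (device_type, distributed):
        case ("cpu", False):
            return "CPUCache"
        case ("cpu", True):
            return "DistributedCPUCache"  # new in this release
        case ("cuda", False):
            return "DeviceCache"
        case _:
            raise ValueError(f"unsupported combination {(device_type, distributed)}")

print(pick_cache_backend("cpu"))  # "CPUCache" outside a torch.distributed process group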

src/compressed_tensors/offload/cache/device.py
@@ -35,8 +35,8 @@ class DeviceCache(OffloadCache):
         :param key: cpu tensor to onload
         :return: device tensor
         """
-        assert offloaded.device == self.onload_device
-        return offloaded
+        # move because onload_device might be modified after init
+        return send_tensors(offloaded, device=self.onload_device, copy=False)
 
     def offload(self, tensor: torch.Tensor | None) -> torch.Tensor:
         """

src/compressed_tensors/offload/cache/dist_cpu.py (new file)
@@ -0,0 +1,53 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.distributed as dist
+from compressed_tensors.offload.cache.cpu import CPUCache
+
+
+class DistributedCPUCache(CPUCache):
+    """
+    Handles offloading and onloading tensors from/to cpu memory shared across processes
+    """
+
+    offload_device = torch.device("cpu")
+
+    def offload(self, tensor: torch.Tensor | None) -> torch.Tensor:
+        if tensor is None:
+            return None
+
+        # slight runtime cost for views
+        tensor = tensor.contiguous()
+
+        if dist.get_rank() == 0:
+            # create shared memory cpu tensor
+            tensor = super().offload(tensor).share_memory_()
+            (handle, filename, nbytes) = tensor.untyped_storage()._share_filename_cpu_()
+            broadcast_obj = [handle, filename, nbytes]
+        else:
+            broadcast_obj = [None, None, None]
+
+        # receive shared memory file handle
+        dist.broadcast_object_list(broadcast_obj, src=0)
+
+        if dist.get_rank() != 0:
+            # reconstruct tensor from shared memory file handle
+            tensor = torch.empty_like(tensor, device=self.offload_device)
+            tensor.set_(torch.UntypedStorage._new_shared_filename_cpu(*broadcast_obj))
+
+        # ensure that rank 0 does not garbage collect before other ranks reconstruct
+        dist.barrier()
+
+        return tensor
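
The handoff above relies on PyTorch's file-backed shared memory: rank 0 exports a (handle, filename, nbytes) triple for its storage, broadcasts it with dist.broadcast_object_list, and every other rank maps the same file. A single-process sketch of that round trip using the same private storage APIs (exact behavior can depend on the process's sharing strategy):

import torch

# "rank 0" side: place the storage in shared memory and export its file handle
src = torch.arange(16, dtype=torch.float32).reshape(4, 4).share_memory_()
handle, filename, nbytes = src.untyped_storage()._share_filename_cpu_()

# "other rank" side: map the shared file into a fresh tensor of the same shape
dst = torch.empty_like(src)
dst.set_(torch.UntypedStorage._new_shared_filename_cpu(handle, filename, nbytes), 0, src.shape)

# both tensors now alias the same shared storage
assert torch.equal(src, dst)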

src/compressed_tensors/offload/dispatch.py
@@ -39,7 +39,7 @@ ModelType = TypeVar("ModelType", bound=torch.nn.Module)
 def offload_model(
     model: ModelType,
     onload_device: torch.device | str,
-    offload_device: Optional[torch.device | str | Literal["disk"]] = None,
+    offload_device: torch.device | str | Literal["disk"] = torch.device("cpu"),
 ) -> ModelType:
     """
     Offload a model to the `offload_device`. During forward passes, model weights will

src/compressed_tensors/quantization/lifecycle/apply.py
@@ -14,9 +14,6 @@
 
 from collections import OrderedDict
 from copy import deepcopy
-from typing import Dict, List, Optional
-from typing import OrderedDict as OrderedDictType
-from typing import Union
 
 import torch
 from compressed_tensors.config import CompressionFormat
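
This hunk and most of those that follow apply one mechanical modernization: typing.Dict/List/Optional/Union annotations are replaced by builtin generics (PEP 585, Python 3.9+) and X | Y union syntax (PEP 604, Python 3.10+). For example:

# before: typing-module generics
from typing import Dict, List, Optional

def lookup_legacy(key: Optional[str] = None) -> Dict[str, List[int]]:
    return {}

# after: builtin generics and | unions, no typing imports required
def lookup_modern(key: str | None = None) -> dict[str, list[int]]:
    return {}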
@@ -60,8 +57,8 @@ from compressed_tensors.utils.safetensors_load import (
 
 def load_pretrained_quantization_parameters(
     model: Module,
-    model_name_or_path: Optional[str] = None,
-    load_weight_qparams: Optional[bool] = False,
+    model_name_or_path: str | None = None,
+    load_weight_qparams: bool = False,
 ):
     """
     Loads the quantization parameters (scale and zero point) from model_name_or_path to
@@ -110,7 +107,7 @@ def load_pretrained_quantization_parameters(
 
 
 def apply_quantization_config(
-    model: Module, config: Union[QuantizationConfig, None], run_compressed: bool = False
+    model: Module, config: QuantizationConfig | None, run_compressed: bool = False
 ):
     """
     Initializes the model for quantization in-place based on the given config.
@@ -207,7 +204,7 @@ def _apply_kv_cache_scheme(
 
 
 def _load_quant_args_from_mapping(
-    base_name: str, module_name: str, module: Module, mapping: Dict
+    base_name: str, module_name: str, module: Module, mapping: dict
 ):
     # TODO: skip update and just register here, don't do it in initialize
     """
@@ -251,8 +248,8 @@ def _load_quant_args_from_mapping(
 
 
 def _scheme_from_targets(
-    target_to_scheme: OrderedDictType[str, QuantizationScheme],
-    targets: List[str],
+    target_to_scheme: OrderedDict[str, QuantizationScheme],
+    targets: list[str],
     name: str,
 ) -> QuantizationScheme:
     # return the first scheme (the prioritized one,

src/compressed_tensors/quantization/lifecycle/forward.py
@@ -14,7 +14,6 @@
 
 from functools import wraps
 from math import ceil
-from typing import Optional
 
 import torch
 from compressed_tensors.quantization.quant_args import (
@@ -47,9 +46,9 @@ def quantize(
     scale: torch.Tensor,
     zero_point: torch.Tensor,
     args: QuantizationArgs,
-    dtype: Optional[torch.dtype] = None,
-    g_idx: Optional[torch.Tensor] = None,
-    global_scale: Optional[torch.Tensor] = None,
+    dtype: torch.dtype | None = None,
+    g_idx: torch.Tensor | None = None,
+    global_scale: torch.Tensor | None = None,
 ) -> torch.Tensor:
     """
     Quantize the input tensor x using the QuantizationStrategy specified in args.
@@ -85,11 +84,11 @@ def quantize(
 def dequantize(
     x_q: torch.Tensor,
     scale: torch.Tensor,
-    zero_point: Optional[torch.Tensor] = None,
-    args: Optional[QuantizationArgs] = None,
-    dtype: Optional[torch.dtype] = None,
-    g_idx: Optional[torch.Tensor] = None,
-    global_scale: Optional[torch.Tensor] = None,
+    zero_point: torch.Tensor | None = None,
+    args: QuantizationArgs | None = None,
+    dtype: torch.dtype | None = None,
+    g_idx: torch.Tensor | None = None,
+    global_scale: torch.Tensor | None = None,
 ) -> torch.Tensor:
     """
     Dequantize a quantized input tensor x_q based on the strategy specified in args. If
@@ -159,8 +158,8 @@ def fake_quantize(
     scale: torch.Tensor,
     zero_point: torch.Tensor,
     args: QuantizationArgs,
-    g_idx: Optional[torch.Tensor] = None,
-    global_scale: Optional[torch.Tensor] = None,
+    g_idx: torch.Tensor | None = None,
+    global_scale: torch.Tensor | None = None,
 ) -> torch.Tensor:
     """
     Fake quantize the input tensor x by quantizing then dequantizing with
@@ -195,11 +194,11 @@ def _process_quantization(
     scale: torch.Tensor,
     zero_point: torch.Tensor,
     args: QuantizationArgs,
-    g_idx: Optional[torch.Tensor] = None,
-    dtype: Optional[torch.dtype] = None,
+    g_idx: torch.Tensor | None = None,
+    dtype: torch.dtype | None = None,
     do_quantize: bool = True,
     do_dequantize: bool = True,
-    global_scale: Optional[torch.Tensor] = None,
+    global_scale: torch.Tensor | None = None,
 ) -> torch.Tensor:
     q_min, q_max = calculate_range(args, x.device)
     group_size = args.group_size
@@ -457,8 +456,8 @@ def _quantize(
     q_min: torch.Tensor,
     q_max: torch.Tensor,
     args: QuantizationArgs,
-    dtype: Optional[torch.dtype] = None,
-    global_scale: Optional[torch.Tensor] = None,
+    dtype: torch.dtype | None = None,
+    global_scale: torch.Tensor | None = None,
 ) -> torch.Tensor:
 
     # if a global scale is optionally provided, use it
@@ -486,9 +485,9 @@ def _quantize(
 def _dequantize(
     x_q: torch.Tensor,
     scale: torch.Tensor,
-    zero_point: torch.Tensor = None,
-    dtype: Optional[torch.dtype] = None,
-    global_scale: Optional[torch.Tensor] = None,
+    zero_point: torch.Tensor | None = None,
+    dtype: torch.dtype | None = None,
+    global_scale: torch.Tensor | None = None,
 ) -> torch.Tensor:
 
     # if a global scale is optionally provided, use it

src/compressed_tensors/quantization/lifecycle/initialize.py
@@ -14,7 +14,6 @@
 
 
 import logging
-from typing import Optional, Tuple, Union
 
 import torch
 from compressed_tensors.modeling import (
@@ -23,6 +22,7 @@ from compressed_tensors.modeling import (
     QuantizedAttentionImpl,
     QuantizedKVCache,
 )
+from compressed_tensors.offload import unwrap_offload_forward
 from compressed_tensors.quantization import (
     ActivationOrdering,
     DynamicType,
@@ -37,7 +37,6 @@ from compressed_tensors.quantization.lifecycle.forward import (
 )
 from compressed_tensors.quantization.utils import strategy_cdiv
 from compressed_tensors.utils import (
-    disable_hf_hook,
     get_execution_device,
     get_head_dim,
     get_num_attn_heads,
@@ -60,7 +59,7 @@ _LOGGER = logging.getLogger(__name__)
 
 def initialize_module_for_quantization(
     module: Module,
-    scheme: Optional[QuantizationScheme] = None,
+    scheme: QuantizationScheme | None = None,
     force_zero_point: bool = True,
 ):
     """
@@ -134,7 +133,7 @@ def initialize_module_for_quantization(
             force_zero_point=force_zero_point,
         )
 
-    with disable_hf_hook(module):
+    with unwrap_offload_forward(module):
         # wrap forward call of module to perform
         # quantized actions based on calltime status
         wrap_module_forward_quantized(module, scheme)
@@ -148,6 +147,7 @@ def is_attention_module(module: Module):
         hasattr(module, "k_proj")
         or hasattr(module, "v_proj")
         or hasattr(module, "qkv_proj")
+        or hasattr(module, "kv_b_proj")
     )
 
 
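The new kv_b_proj clause covers multi-head latent attention (MLA) modules such as DeepSeek-V3's, which project a compressed kv latent through kv_b_proj instead of exposing separate k_proj/v_proj layers (exercised by the new tests/test_modeling/test_deepseekv3_kvcache_quant.py). A hypothetical stand-in module that the check now recognizes:

import torch

class MLAAttention(torch.nn.Module):
    # hypothetical stand-in: attribute names mirror DeepSeek-V3's MLA layers
    def __init__(self, hidden: int = 8):
        super().__init__()
        self.kv_a_proj_with_mqa = torch.nn.Linear(hidden, hidden)
        self.kv_b_proj = torch.nn.Linear(hidden, hidden)

# the added hasattr check means is_attention_module() now matches this module
assert hasattr(MLAAttention(), "kv_b_proj")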
@@ -155,7 +155,7 @@ def initialize_qparams(
     module: Module,
     base_name: str,
     quantization_args: QuantizationArgs,
-    observed_shape: Tuple[Union[int, None]],
+    observed_shape: tuple[int | None, ...],
     observed_dtype: torch.dtype,
     force_zero_point: bool = True,
 ):
@@ -279,8 +279,8 @@ def initialize_attn_qparams(
 ):
     """Initlaize k_scale, v_scale for self_attn"""
 
-    impl: Optional[QuantizedAttentionImpl] = getattr(module, IMPL_ATTR, None)
-    kv_cache: Optional[QuantizedKVCache] = getattr(module, KV_CACHE_ATTR, None)
+    impl: QuantizedAttentionImpl | None = getattr(module, IMPL_ATTR, None)
+    kv_cache: QuantizedKVCache | None = getattr(module, KV_CACHE_ATTR, None)
 
     if impl is None and kv_cache is None:
         raise ValueError(

src/compressed_tensors/quantization/quant_args.py
@@ -14,7 +14,7 @@
 
 import warnings
 from enum import Enum
-from typing import Any, Dict, List, Optional, Union
+from typing import Any
 
 import torch
 from compressed_tensors.utils import Aliasable
@@ -48,10 +48,10 @@ __all__ = [
 class FloatArgs:
     exponent: int
     mantissa: int
-    bits: Optional[int] = None
-    max: Optional[float] = None
-    min: Optional[float] = None
-    dtype: Optional[torch.dtype] = None
+    bits: int | None = None
+    max: float | None = None
+    min: float | None = None
+    dtype: torch.dtype | None = None
 
 
 class FP4_E2M1_DATA(FloatArgs):
@@ -147,7 +147,7 @@ class ActivationOrdering(Aliasable, str, Enum):
     STATIC = "static"
 
     @staticmethod
-    def get_aliases() -> Dict[str, str]:
+    def get_aliases() -> dict[str, str]:
         return {
             "dynamic": "group",
             "static": "weight",
@@ -178,21 +178,21 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
     num_bits: int = 8
     type: QuantizationType = QuantizationType.INT
     symmetric: bool = True
-    group_size: Optional[int] = None
-    strategy: Optional[QuantizationStrategy] = None
-    block_structure: Optional[List[int]] = None
-    dynamic: Union[DynamicType, bool] = False
-    actorder: Union[ActivationOrdering, bool, None] = None
-    scale_dtype: Optional[TorchDtype] = None
-    zp_dtype: Optional[TorchDtype] = None
-    observer: Optional[str] = Field(
+    group_size: int | None = None
+    strategy: QuantizationStrategy | None = None
+    block_structure: list[int] | None = None
+    dynamic: DynamicType | bool = False
+    actorder: ActivationOrdering | bool | None = None
+    scale_dtype: TorchDtype | None = None
+    zp_dtype: TorchDtype | None = None
+    observer: str | None = Field(
         default=None,
         description=(
             "Determines the method of computing quantization parameters (scales and "
             "zero-points). Defaults to min-max when not using dynamic quantization"
         ),
     )
-    observer_kwargs: Dict[str, Any] = Field(
+    observer_kwargs: dict[str, Any] = Field(
         default_factory=dict,
         description=(
             "optional dict of kwargs to be passed directly to torch quantization "
@@ -214,7 +214,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         return value
 
     @field_validator("group_size", mode="before")
-    def validate_group(cls, value) -> Union[int, None]:
+    def validate_group(cls, value) -> int | None:
         if value is None:
             return value
 
@@ -227,7 +227,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         return value
 
     @field_validator("block_structure", mode="before")
-    def validate_block_structure(cls, value) -> Optional[List[int]]:
+    def validate_block_structure(cls, value) -> list[int] | None:
         if value is None:
             return value
         # For backward compatibility, allow string format "2x4", "8x16", etc.
@@ -251,14 +251,14 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
     )
 
     @field_validator("strategy", mode="before")
-    def validate_strategy(cls, value) -> Union[QuantizationStrategy, None]:
+    def validate_strategy(cls, value) -> QuantizationStrategy | None:
         if isinstance(value, str):
             return QuantizationStrategy(value.lower())
 
         return value
 
     @field_validator("actorder", mode="before")
-    def validate_actorder(cls, value) -> Optional[ActivationOrdering]:
+    def validate_actorder(cls, value) -> ActivationOrdering | None:
         if isinstance(value, bool):
             return ActivationOrdering.GROUP if value else None
 
@@ -268,7 +268,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         return value
 
     @field_validator("dynamic", mode="before")
-    def validate_dynamic(cls, value) -> Union[DynamicType, bool]:
+    def validate_dynamic(cls, value) -> DynamicType | bool:
         if isinstance(value, str):
             return DynamicType(value.lower())
         return value
@@ -329,10 +329,13 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
             raise ValueError(f"Block structure requires block strategy\n{model}")
 
         # validate activation ordering and strategy
-        if actorder is not None and strategy != QuantizationStrategy.GROUP:
+        if actorder is not None and strategy not in (
+            QuantizationStrategy.GROUP,
+            QuantizationStrategy.TENSOR_GROUP,
+        ):
             raise ValueError(
-                "Must use group quantization strategy in order to apply "
-                "activation ordering"
+                "Must use group or tensor_group quantization strategy in "
+                "order to apply activation ordering"
             )
 
         # infer observer w.r.t. dynamic
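
Restated in isolation with a stand-in enum (not the library's code verbatim), the relaxed check now admits both group-like strategies:

from enum import Enum

class Strategy(str, Enum):  # stand-in for QuantizationStrategy
    GROUP = "group"
    TENSOR_GROUP = "tensor_group"
    CHANNEL = "channel"

def validate_actorder(actorder, strategy: Strategy) -> None:
    if actorder is not None and strategy not in (Strategy.GROUP, Strategy.TENSOR_GROUP):
        raise ValueError(
            "Must use group or tensor_group quantization strategy in "
            "order to apply activation ordering"
        )

validate_actorder("group", Strategy.TENSOR_GROUP)  # previously raised; now passes
validate_actorder("group", Strategy.GROUP)         # unchanged: still allowed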
@@ -369,7 +372,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
 
         elif observer is None:
             # default to minmax for non-dynamic cases
-            observer = "minmax"
+            observer = "memoryless_minmax"
 
         if zp_dtype is None:
             if model.num_bits == 4 and model.type == QuantizationType.FLOAT:
@@ -409,7 +412,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
 def round_to_quantized_type_dtype(
     tensor: torch.Tensor,
     dtype: torch.dtype,
-    cast_to_original_dtype: Optional[bool] = True,
+    cast_to_original_dtype: bool = True,
 ) -> torch.Tensor:
     """
     Rounds an input tensor to the nearest quantized representation given a dtype.
@@ -439,7 +442,7 @@ def round_to_quantized_type_args(
     args: QuantizationArgs,
     min: torch.Tensor,
     max: torch.Tensor,
-    cast_to_original_dtype: Optional[bool] = True,
+    cast_to_original_dtype: bool = True,
 ) -> torch.Tensor:
     """
     Rounds an input tensor to the nearest quantized representation given

src/compressed_tensors/quantization/quant_config.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 from collections import defaultdict
 from enum import Enum
-from typing import Annotated, Any, Dict, List, Optional, Set, Union
+from typing import Annotated, Any
 
 from compressed_tensors.config import CompressionFormat
 from compressed_tensors.quantization.quant_args import DynamicType, QuantizationArgs
@@ -55,7 +55,7 @@ class QuantizationStatus(str, Enum):
     COMPRESSED = "compressed"
 
     @classmethod
-    def lifecycle_order(cls) -> List["QuantizationStatus"]:
+    def lifecycle_order(cls) -> list["QuantizationStatus"]:
         """
         :return: list of correct quantization lifecycle order
         """
@@ -131,13 +131,13 @@ class QuantizationConfig(BaseModel):
     are not quantized even if they match up with a target in config_groups
     """
 
-    config_groups: Dict[str, Union[QuantizationScheme, List[str]]]
+    config_groups: dict[str, QuantizationScheme | list[str]]
     quant_method: str = DEFAULT_QUANTIZATION_METHOD
-    kv_cache_scheme: Optional[QuantizationArgs] = None
+    kv_cache_scheme: QuantizationArgs | None = None
     format: str = DEFAULT_QUANTIZATION_FORMAT
     quantization_status: QuantizationStatus = QuantizationStatus.INITIALIZED
-    global_compression_ratio: Optional[float] = None
-    ignore: Optional[List[str]] = Field(default_factory=list)
+    global_compression_ratio: float | None = None
+    ignore: list[str] | None = Field(default_factory=list)
     # `run_compressed` is a dummy, unused arg for backwards compatibility
     # see: https://github.com/huggingface/transformers/pull/39324
     run_compressed: Annotated[Any, Field(exclude=True)] = None
@@ -161,8 +161,8 @@ class QuantizationConfig(BaseModel):
 
     @staticmethod
     def from_pretrained(
-        model: Module, format: Optional[Union[str, list]] = None
-    ) -> Optional["QuantizationConfig"]:
+        model: Module, format: str | list | None = None
+    ) -> "QuantizationConfig | None":
         """
         Converts a model into its associated QuantizationConfig based on the
         QuantizationScheme attached to each quantized module
@@ -177,21 +177,21 @@ class QuantizationConfig(BaseModel):
 
         # set of all quantization schemes
         # TODO: make quant config/scheme/args frozen/hashable and use a set
-        quantization_schemes: List[QuantizationScheme] = list()
+        quantization_schemes: list[QuantizationScheme] = list()
 
         # use any status from modules (in practice, use the last module)
         model_status = None
 
         # set of all quantized types
         # this is later used to create the ignore list
-        quantization_type_names: Set[str] = set()
+        quantization_type_names: set[str] = set()
 
         # maps types to names which are not quantized
         # this is later used to create the ignore list
-        ignore: Dict[str, List[str]] = defaultdict(list)
+        ignore: dict[str, list[str]] = defaultdict(list)
 
         # this keeps track of any kvcache schemes
-        kv_cache_scheme: Optional[QuantizationArgs] = None
+        kv_cache_scheme: QuantizationArgs | None = None
 
         for name, submodule in model.named_modules():
             layer_type: str = module_type(submodule)