compressed-tensors 0.9.4a20250414__tar.gz → 0.9.5a20250424__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/.github/workflows/build.yml +1 -1
  2. {compressed_tensors-0.9.4a20250414/src/compressed_tensors.egg-info → compressed_tensors-0.9.5a20250424}/PKG-INFO +1 -1
  3. compressed_tensors-0.9.5a20250424/pyproject.toml +3 -0
  4. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/setup.py +1 -0
  5. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/compressors/base.py +6 -1
  6. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/compressors/model_compressors/model_compressor.py +90 -7
  7. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/compressors/quantized_compressors/base.py +21 -6
  8. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py +88 -21
  9. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/compressors/sparse_compressors/base.py +21 -4
  10. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/quantization/lifecycle/apply.py +65 -30
  11. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/quantization/lifecycle/initialize.py +13 -2
  12. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/utils/offload.py +20 -17
  13. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/utils/safetensors_load.py +10 -8
  14. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/version.py +2 -2
  15. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424/src/compressed_tensors.egg-info}/PKG-INFO +1 -1
  16. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_compressors/model_compressors/test_model_compressor.py +2 -0
  17. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_compressors/quantized_compressors/test_fp8_quant.py +1 -1
  18. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_compressors/quantized_compressors/test_int_quant.py +2 -2
  19. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_compressors/quantized_compressors/test_pack_quant.py +62 -4
  20. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_utils/test_offload.py +46 -1
  21. compressed_tensors-0.9.4a20250414/pyproject.toml +0 -10
  22. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/.github/.gitkeep +0 -0
  23. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/.github/actions/test/action.yml +0 -0
  24. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/.github/scripts/step-status +0 -0
  25. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/.github/workflows/build-test.yml +0 -0
  26. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/.github/workflows/report.yml +0 -0
  27. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/.github/workflows/test-check.yaml +0 -0
  28. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/.github/workflows/test.yml +0 -0
  29. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/.github/workflows/trigger-all.yml +0 -0
  30. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/.github/workflows/upload.yml +0 -0
  31. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/.gitignore +0 -0
  32. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/LICENSE +0 -0
  33. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/Makefile +0 -0
  34. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/README.md +0 -0
  35. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/examples/bit_packing/ex_quantize_and_pack.py +0 -0
  36. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/examples/bit_packing/int4_config.json +0 -0
  37. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/examples/bitmask_compression.ipynb +0 -0
  38. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/examples/llama_1.1b/ex_config_quantization.py +0 -0
  39. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/examples/llama_1.1b/ex_llmcompressor_quantization.py +0 -0
  40. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/examples/llama_1.1b/example_quant_config.json +0 -0
  41. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/examples/llama_1.1b/example_quant_recipe.yaml +0 -0
  42. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/examples/quantize_and_pack_int4.ipynb +0 -0
  43. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/setup.cfg +0 -0
  44. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/__init__.py +0 -0
  45. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/README.md +0 -0
  46. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/__init__.py +0 -0
  47. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/base.py +0 -0
  48. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/compressors/__init__.py +0 -0
  49. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/compressors/helpers.py +0 -0
  50. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/compressors/model_compressors/__init__.py +0 -0
  51. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/compressors/quantized_compressors/__init__.py +0 -0
  52. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/compressors/quantized_compressors/naive_quantized.py +0 -0
  53. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/compressors/sparse_compressors/__init__.py +0 -0
  54. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/compressors/sparse_compressors/dense.py +0 -0
  55. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py +0 -0
  56. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/compressors/sparse_compressors/sparse_bitmask.py +0 -0
  57. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/compressors/sparse_quantized_compressors/__init__.py +0 -0
  58. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/compressors/sparse_quantized_compressors/marlin_24.py +0 -0
  59. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/config/__init__.py +0 -0
  60. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/config/base.py +0 -0
  61. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/config/dense.py +0 -0
  62. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/config/sparse_24_bitmask.py +0 -0
  63. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/config/sparse_bitmask.py +0 -0
  64. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/linear/__init__.py +0 -0
  65. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/linear/compressed_linear.py +0 -0
  66. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/quantization/__init__.py +0 -0
  67. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/quantization/lifecycle/__init__.py +0 -0
  68. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/quantization/lifecycle/compressed.py +0 -0
  69. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/quantization/lifecycle/forward.py +0 -0
  70. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/quantization/lifecycle/helpers.py +0 -0
  71. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/quantization/quant_args.py +0 -0
  72. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/quantization/quant_config.py +0 -0
  73. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/quantization/quant_scheme.py +0 -0
  74. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/quantization/utils/__init__.py +0 -0
  75. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/quantization/utils/helpers.py +0 -0
  76. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/registry/__init__.py +0 -0
  77. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/registry/registry.py +0 -0
  78. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/utils/__init__.py +0 -0
  79. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/utils/helpers.py +0 -0
  80. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/utils/permutations_24.py +0 -0
  81. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/utils/permute.py +0 -0
  82. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors/utils/semi_structured_conversions.py +0 -0
  83. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors.egg-info/SOURCES.txt +0 -0
  84. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors.egg-info/dependency_links.txt +0 -0
  85. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors.egg-info/requires.txt +0 -0
  86. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/src/compressed_tensors.egg-info/top_level.txt +0 -0
  87. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/__init__.py +0 -0
  88. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/conftest.py +0 -0
  89. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_compressors/__init__.py +0 -0
  90. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_compressors/model_compressors/__init__.py +0 -0
  91. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_compressors/quantized_compressors/__init__.py +0 -0
  92. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_compressors/sparse_compressors/__init__.py +0 -0
  93. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_compressors/sparse_compressors/test_bitmask.py +0 -0
  94. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_compressors/sparse_compressors/test_sparse_24_bitmask.py +0 -0
  95. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_compressors/sparse_quantized_compressors/__init__.py +0 -0
  96. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_compressors/sparse_quantized_compressors/test_marlin_24.py +0 -0
  97. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_configs/__init__.py +0 -0
  98. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_configs/test_base.py +0 -0
  99. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_examples/test_bitmask_compression_ipynb.py +0 -0
  100. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_linear/__init__.py +0 -0
  101. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_linear/test_compressed_linear.py +0 -0
  102. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/__init__.py +0 -0
  103. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/lifecycle/__init__.py +0 -0
  104. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/lifecycle/conftest.py +0 -0
  105. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/lifecycle/test_apply.py +0 -0
  106. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/lifecycle/test_dynamic_lifecycle.py +0 -0
  107. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/lifecycle/test_enabled.py +0 -0
  108. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/lifecycle/test_forward.py +0 -0
  109. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/lifecycle/test_helpers.py +0 -0
  110. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/lifecycle/test_initialize.py +0 -0
  111. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/lifecycle/test_lifecycle.py +0 -0
  112. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/test_configs/__init__.py +0 -0
  113. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/test_configs/test_bit_depths.py +0 -0
  114. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/test_configs/test_strategies.py +0 -0
  115. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/test_quant_args.py +0 -0
  116. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/test_quant_config.py +0 -0
  117. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/test_quant_scheme.py +0 -0
  118. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_quantization/test_utils/test_helpers.py +0 -0
  119. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_registry.py +0 -0
  120. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_utils/__init__.py +0 -0
  121. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_utils/test_helpers.py +0 -0
  122. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/test_utils/test_safetensors_load.py +0 -0
  123. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/tests/testing_utils.py +0 -0
  124. {compressed_tensors-0.9.4a20250414 → compressed_tensors-0.9.5a20250424}/utils/copyright.py +0 -0
.github/workflows/build.yml
@@ -76,7 +76,7 @@ jobs:
 
       - name: build
        id: build
-        uses: neuralmagic/nm-actions/actions/build-ml-whl@v1.18.0
+        uses: neuralmagic/nm-actions/actions/build-ml-whl@fix-whl-checks
        with:
          dev: false
          release: ${{ inputs.wf_category == 'RELEASE' }}
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.9.4a20250414
+Version: 0.9.5a20250424
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
pyproject.toml (new file)
@@ -0,0 +1,3 @@
+[tool.black]
+line-length = 88
+target-version = ['py36']
setup.py
@@ -101,6 +101,7 @@ setup(
     use_scm_version={
         "version_scheme": version_func,
         "local_scheme": localversion_func,
+        "version_file": "src/compressed_tensors/version.py",
     },
     author="Neuralmagic, Inc.",
     author_email="support@neuralmagic.com",
src/compressed_tensors/compressors/base.py
@@ -19,6 +19,7 @@ import torch
 from compressed_tensors.config import SparsityCompressionConfig
 from compressed_tensors.quantization import QuantizationArgs, QuantizationConfig
 from compressed_tensors.registry import RegistryMixin
+from compressed_tensors.utils import has_offloaded_params
 from torch import Tensor
 from torch.nn import Module
 
@@ -169,6 +170,10 @@ class BaseCompressor(RegistryMixin, ABC):
         :param module: PyTorch module to decompress
         :return: tensor of the decompressed weight, or None if module is not quantized
         """
+
+        params_device = next(module.parameters()).device
+        device = "cpu" if has_offloaded_params(module) else params_device
+
         if not hasattr(module, "quantization_scheme"):
             return None  # module is not quantized
         quantization_scheme = module.quantization_scheme
@@ -182,7 +187,7 @@ class BaseCompressor(RegistryMixin, ABC):
 
         return self.decompress_weight(
             compressed_data=compressed_data, quantization_args=quantization_args
-        )
+        ).to(device)
 
     def decompress_weight(
         self, compressed_data: Dict[str, Tensor], **kwargs
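
Taken together, these hunks make the module decompression path offload-aware: when a module's parameters are managed by an offloading hook, the decompressed weight is materialized on CPU rather than on the parameters' apparent device. A minimal sketch of the rule, reusing the has_offloaded_params utility imported in the first hunk (the standalone helper name here is illustrative):

    import torch
    from compressed_tensors.utils import has_offloaded_params

    def decompression_device(module: torch.nn.Module) -> torch.device:
        # Offloaded modules hold placeholder tensors, so decompressed
        # data must land on CPU for the offload machinery to manage it.
        if has_offloaded_params(module):
            return torch.device("cpu")
        # Otherwise follow wherever the module's parameters actually live.
        return next(module.parameters()).device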
src/compressed_tensors/compressors/model_compressors/model_compressor.py
@@ -31,13 +31,14 @@ from compressed_tensors.base import (
     SPARSITY_CONFIG_NAME,
 )
 from compressed_tensors.compressors.base import BaseCompressor
+from compressed_tensors.compressors.sparse_compressors import DenseCompressor
 from compressed_tensors.config import CompressionFormat, SparsityCompressionConfig
 from compressed_tensors.quantization import (
     DEFAULT_QUANTIZATION_METHOD,
     QuantizationConfig,
     QuantizationStatus,
     apply_quantization_config,
-    load_pretrained_quantization,
+    load_pretrained_quantization_parameters,
 )
 from compressed_tensors.quantization.lifecycle import expand_target_names
 from compressed_tensors.quantization.quant_args import QuantizationArgs
@@ -47,7 +48,9 @@ from compressed_tensors.quantization.utils import (
 )
 from compressed_tensors.utils import (
     get_safetensors_folder,
+    has_offloaded_params,
     merge_names,
+    register_offload_parameter,
     update_parameter_data,
 )
 from compressed_tensors.utils.helpers import (
@@ -382,6 +385,7 @@ class ModelCompressor:
         compressed_state_dict = self.quantization_compressor.compress(
             state_dict, names_to_scheme=quantized_modules_to_args
         )
+
         if self.quantization_config.format != CompressionFormat.dense.value:
             self.quantization_config.quantization_status = (
                 QuantizationStatus.COMPRESSED
@@ -411,6 +415,13 @@
 
         :param model_path: path to compressed weights
         :param model: pytorch model to load decompressed weights into
+
+        Note: decompress makes use of both _replace_sparsity_weights and _replace_weights.
+        The variations in these methods result from subtle differences between the sparsity
+        and quantization compressors. Specifically, quantization compressors return not just the
+        decompressed weight but also the quantization parameters (e.g. scales, zero_point), whereas
+        sparsity compressors only return the decompressed weight.
+
         """
         model_path = get_safetensors_folder(model_path)
         sparse_decompressed = False
@@ -419,9 +430,16 @@
             self.sparsity_compressor is not None
             and self.sparsity_config.format != CompressionFormat.dense.value
         ):
+            params_to_ignore = None
+            if self.quantization_compressor is not None:
+                params_to_ignore = self.quantization_compressor.compression_param_names
             # Sparse decompression is applied on the model_path
-            dense_gen = self.sparsity_compressor.decompress(model_path)
-            self._replace_weights(dense_gen, model)
+            # The compressor will try and load any quantization parameters as well
+            # params_to_skip_load will skip over quantization params from being loaded
+            dense_gen = self.sparsity_compressor.decompress(
+                model_path, params_to_skip_load=params_to_ignore
+            )
+            self._replace_sparsity_weights(dense_gen, model)
             setattr(model, SPARSITY_CONFIG_NAME, self.sparsity_compressor.config)
             sparse_decompressed = True
 
@@ -430,13 +448,27 @@
             # quantization during apply_quantization_config. This ensures
             # that the dtypes of the weights are not unintentionally updated.
             # The status is restored after quantization params are loaded.
+
             with override_quantization_status(
                 self.quantization_config, QuantizationStatus.FROZEN
             ):
+
                 names_to_scheme = apply_quantization_config(
                     model, self.quantization_config
                 )
-                load_pretrained_quantization(model, model_path)
+                # Load activation scales/zp or any other quantization parameters
+                # Conditionally load the weight quantization parameters if we have a dense compressor
+                # Or if a sparsity compressor has already been applied
+                load_pretrained_quantization_parameters(
+                    model,
+                    model_path,
+                    # TODO: all weight quantization params will be moved to the compressor in a follow-up
+                    # including initialization
+                    load_weight_quantization=(
+                        sparse_decompressed
+                        or isinstance(self.quantization_compressor, DenseCompressor)
+                    ),
+                )
 
             model_path_or_state_dict = (
                 model.state_dict() if sparse_decompressed else model_path
@@ -445,6 +477,8 @@
             dense_gen = self.quantization_compressor.decompress(
                 model_path_or_state_dict, names_to_scheme=names_to_scheme
             )
+            # TODO: all weight quantization params will be moved to the compressor
+            # to prevent duplicate parameter updates in update_parameter_data
             self._replace_weights(dense_gen, model)
 
         def freeze_quantization_status(module):
@@ -500,7 +534,7 @@
         with open(config_file_path, "w") as config_file:
             json.dump(config_data, config_file, indent=2, sort_keys=True)
 
-    def _replace_weights(self, dense_weight_generator, model: Module):
+    def _replace_sparsity_weights(self, dense_weight_generator, model: Module):
         """
         Replace the weights of the model with the
         provided dense weights.
@@ -515,11 +549,60 @@
         :param model: The model whose weights are to be updated.
         """
         for name, data in tqdm(dense_weight_generator, desc="Decompressing model"):
+
             split_name = name.split(".")
             prefix, param_name = ".".join(split_name[:-1]), split_name[-1]
             module = operator.attrgetter(prefix)(model)
-            if hasattr(module, param_name):
-                update_parameter_data(module, data, param_name)
+
+            params_device = next(module.parameters()).device
+            device = "cpu" if has_offloaded_params(module) else params_device
+            delattr(module, param_name)
+            requires_grad = data.dtype in (torch.float16, torch.float32, torch.bfloat16)
+            param = torch.nn.Parameter(data.to(device), requires_grad=requires_grad)
+            register_offload_parameter(module, param_name, param)
+
+    def _replace_weights(self, dense_weight_generator, model: Module):
+        """
+        Replace the weights of the model with the
+        provided dense weights.
+
+        This method iterates over the dense_weight_generator and
+        updates the corresponding weights in the model. If a parameter
+        name does not exist in the model, it will be skipped.
+
+        :param dense_weight_generator (generator): A generator that yields
+            tuples of (name, data), where 'name' is the parameter name and
+            'data' is the updated param data
+        :param model: The model whose weights are to be updated.
+        """
+
+        for name, data in tqdm(dense_weight_generator, desc="Decompressing model"):
+            module = operator.attrgetter(name)(model)
+
+            params_device = next(module.parameters()).device
+            device = "cpu" if has_offloaded_params(module) else params_device
+
+            for param_name, param_data in data.items():
+                if hasattr(module, param_name):
+                    # If compressed, will have an incorrect dtype for transformers >4.49
+                    # TODO: we can also just skip initialization of scales/zp if in decompression in init
+                    # to be consistent with loading which happens later as well
+                    # however, update_data does a good shape check - should be moved to the compressor
+                    if param_name == "weight":
+                        delattr(module, param_name)
+                        requires_grad = param_data.dtype in (
+                            torch.float16,
+                            torch.float32,
+                            torch.bfloat16,
+                        )
+                        param = torch.nn.Parameter(
+                            param_data.to(device), requires_grad=requires_grad
+                        )
+                        register_offload_parameter(module, param_name, param)
+                    else:
+                        # Scales/zero-points should already be registered
+                        # to the correct device
+                        update_parameter_data(module, param_data, param_name)
 
 
 def map_modules_to_quant_args(
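
The docstring note added to decompress is the crux of this file's changes: the sparsity and quantization decompressors now yield differently shaped items, hence the split into _replace_sparsity_weights and _replace_weights. A schematic of the two contracts (the example names are illustrative, not taken from the library):

    # Sparsity decompressor: one (param_name, tensor) pair at a time, e.g.
    #   ("model.layers.0.mlp.down_proj.weight", dense_weight)
    for name, data in sparsity_generator:
        module_path, param_name = name.rsplit(".", 1)
        ...  # re-register the single dense tensor on the module

    # Quantization decompressor: one (module_name, param_dict) pair, e.g.
    #   ("model.layers.0.mlp.down_proj",
    #    {"weight": w, "weight_scale": s, "weight_zero_point": zp})
    for module_path, params in quantization_generator:
        for param_name, tensor in params.items():
            ...  # re-register "weight", update scales/zp in place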
src/compressed_tensors/compressors/quantized_compressors/base.py
@@ -14,11 +14,11 @@
 
 import logging
 from pathlib import Path
-from typing import Any, Dict, Generator, Tuple, Union
+from typing import Any, Dict, Generator, Optional, Tuple, Union
 
 import torch
 from compressed_tensors.compressors.base import BaseCompressor
-from compressed_tensors.quantization import QuantizationArgs
+from compressed_tensors.quantization import QuantizationArgs, QuantizationStrategy
 from compressed_tensors.utils import (
     get_nested_mappings_from_state_dict,
     get_nested_weight_mappings,
@@ -132,8 +132,10 @@ class BaseQuantizationCompressor(BaseCompressor):
                     compressed_dict[merge_names(prefix, key)] = value
                 else:
                     compressed_dict[name] = value.to("cpu")
-            # only save if asym
-            elif is_weight_zp and quant_args_zp.symmetric:
+            # only save zp if asym and not packed zp
+            elif is_weight_zp and (
+                quant_args_zp.symmetric or self._check_if_zp_pack_quantized(quant_args)
+            ):
                 continue
             # only save if asym
             elif is_input_zp and input_args_zp.symmetric:
@@ -145,6 +147,17 @@ class BaseQuantizationCompressor(BaseCompressor):
 
         return compressed_dict
 
+    def _check_if_zp_pack_quantized(self, quant_args):
+        from compressed_tensors.compressors import PackedQuantizationCompressor
+
+        if isinstance(self, PackedQuantizationCompressor):
+            if not quant_args.symmetric and quant_args.strategy in [
+                QuantizationStrategy.GROUP.value,
+                QuantizationStrategy.CHANNEL.value,
+            ]:
+                return True
+        return False
+
     def decompress(
         self,
         path_to_model_or_tensors: Union[str, Path, Dict[str, Any]],
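
In practice the new helper returns True only for the packed compressor when the scheme is asymmetric with a group or channel strategy; otherwise the zero-point is either implicit (symmetric) or stored unpacked. A hypothetical check, with QuantizationArgs fields taken from the condition above (constructor usage assumed):

    asym = QuantizationArgs(num_bits=4, symmetric=False, strategy="group", group_size=128)
    compressor._check_if_zp_pack_quantized(asym)  # True: zp is packed rather than saved raw

    sym = QuantizationArgs(num_bits=4, symmetric=True, strategy="group", group_size=128)
    compressor._check_if_zp_pack_quantized(sym)   # False: symmetric, no zp is stored at all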
@@ -186,7 +199,8 @@ class BaseQuantizationCompressor(BaseCompressor):
             decompressed = self.decompress_weight(
                 compressed_data=weight_data, quantization_args=quant_args
             )
-            yield merge_names(weight_name, "weight"), decompressed
+            weight_data["weight"] = decompressed
+            yield weight_name, weight_data
 
     def _decompress_from_state_dict(self, state_dict, names_to_scheme):
         weight_mappings = get_nested_mappings_from_state_dict(
@@ -202,4 +216,5 @@ class BaseQuantizationCompressor(BaseCompressor):
             decompressed = self.decompress_weight(
                 compressed_data=weight_data, quantization_args=quant_args
             )
-            yield merge_names(weight_name, "weight"), decompressed
+            weight_data["weight"] = decompressed
+            yield weight_name, weight_data
src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import math
-from typing import Dict, Optional, Tuple
+from typing import Dict, Literal, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -21,7 +21,7 @@ from compressed_tensors.compressors.quantized_compressors.base import (
     BaseQuantizationCompressor,
 )
 from compressed_tensors.config import CompressionFormat
-from compressed_tensors.quantization import QuantizationArgs
+from compressed_tensors.quantization import QuantizationArgs, QuantizationStrategy
 from compressed_tensors.quantization.lifecycle.forward import dequantize, quantize
 from compressed_tensors.quantization.utils import can_quantize
 from torch import Tensor
@@ -65,10 +65,26 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
         """
         pack_factor = 32 // quantization_args.num_bits
         packed_size = math.ceil(weight_shape[1] / pack_factor)
-        return {
+        packed_size_zp = math.ceil(weight_shape[0] / pack_factor)
+        output = {
             "weight_packed": (torch.Size((weight_shape[0], packed_size)), torch.int32),
             "weight_shape": (torch.Size((2,)), torch.int32),
         }
+        if not quantization_args.symmetric and quantization_args.strategy in [
+            QuantizationStrategy.GROUP.value,
+            QuantizationStrategy.CHANNEL.value,
+        ]:
+            zp_factor = (
+                quantization_args.group_size
+                if quantization_args.strategy == QuantizationStrategy.GROUP.value
+                else weight_shape[-1]
+            )
+
+            output["weight_zero_point"] = (
+                torch.Size((packed_size_zp, weight_shape[-1] // zp_factor)),
+                torch.int32,
+            )
+        return output
 
     def compress_weight(
         self,
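
Working the new shape math through a concrete case: for an asymmetric 4-bit weight of shape (4096, 4096) with group_size 128, pack_factor = 32 // 4 = 8, so weight_packed has shape (4096, ceil(4096 / 8)) = (4096, 512). The zero-point carries one column per group, 4096 // 128 = 32, and is packed down dim 0 by the same factor, giving weight_zero_point shape (ceil(4096 / 8), 32) = (512, 32).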
@@ -104,6 +120,7 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
             quantized_weight = weight
 
         packed_weight = pack_to_int32(quantized_weight, quantization_args.num_bits)
+
         weight_shape = torch.tensor(weight.shape)
         if device is not None:
             packed_weight = packed_weight.to(device)
@@ -112,6 +129,15 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
         compressed_dict["weight_shape"] = weight_shape
         compressed_dict["weight_packed"] = packed_weight
 
+        # zp is typically not compressed, except when the packed compressor stores group/channel zp
+        if not quantization_args.symmetric and quantization_args.strategy in [
+            QuantizationStrategy.GROUP.value,
+            QuantizationStrategy.CHANNEL.value,
+        ]:
+            packed_zp = pack_to_int32(
+                zero_point, quantization_args.num_bits, packed_dim=0
+            )
+            compressed_dict["weight_zero_point"] = packed_zp
         return compressed_dict
 
     def decompress_weight(
@@ -133,6 +159,21 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
         original_shape = torch.Size(compressed_data["weight_shape"])
         num_bits = quantization_args.num_bits
         unpacked = unpack_from_int32(weight, num_bits, original_shape)
+
+        # NOTE: this will fail decompression as we don't currently handle packed zp on decompression
+        if not quantization_args.symmetric and quantization_args.strategy in [
+            QuantizationStrategy.GROUP.value,
+            QuantizationStrategy.CHANNEL.value,
+        ]:
+            raise ValueError(
+                "Decompression of packed zero points is currently not supported"
+            )
+            assert zero_point is not None
+            original_zp_shape = (original_shape[0], scale.shape[-1])
+            zero_point = unpack_from_int32(
+                zero_point, num_bits, original_zp_shape, packed_dim=0
+            )
+
         decompressed_weight = dequantize(
             x_q=unpacked, scale=scale, zero_point=zero_point, g_idx=g_idx
         )
@@ -140,7 +181,11 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
         return decompressed_weight
 
 
-def pack_to_int32(value: torch.Tensor, num_bits: int) -> torch.Tensor:
+def pack_to_int32(
+    value: torch.Tensor,
+    num_bits: int,
+    packed_dim: Union[Literal[0], Literal[1]] = 1,
+) -> torch.Tensor:
     """
     Packs a tensor of quantized weights stored in int8 into int32s with padding
 
@@ -176,14 +221,19 @@ def pack_to_int32(value: torch.Tensor, num_bits: int) -> torch.Tensor:
     pack_factor = 32 // num_bits
 
     # pad input tensor and initialize packed output
-    packed_size = math.ceil(value.shape[1] / pack_factor)
-    padding = packed_size * pack_factor - value.shape[1]
+    packed_size = math.ceil(value.shape[packed_dim] / pack_factor)
+    padding = packed_size * pack_factor - value.shape[packed_dim]
     value = np.pad(value, pad_width=[(0, 0), (0, padding)], constant_values=0)
 
     # pack values
-    packed = np.zeros((value.shape[0], packed_size), dtype=np.uint32)
-    for i in range(pack_factor):
-        packed |= value[:, i::pack_factor] << num_bits * i
+    if packed_dim == 1:
+        packed = np.zeros((value.shape[0], packed_size), dtype=np.uint32)
+        for i in range(pack_factor):
+            packed |= value[:, i::pack_factor] << num_bits * i
+    else:
+        packed = np.zeros((packed_size, value.shape[1]), dtype=np.uint32)
+        for i in range(pack_factor):
+            packed |= value[i::pack_factor, :] << num_bits * i
 
     # convert back to signed and torch
     packed = np.ascontiguousarray(packed).view(np.int32)
@@ -191,7 +241,10 @@ def pack_to_int32(value: torch.Tensor, num_bits: int) -> torch.Tensor:
 
 
 def unpack_from_int32(
-    value: torch.Tensor, num_bits: int, shape: torch.Size
+    value: torch.Tensor,
+    num_bits: int,
+    shape: torch.Size,
+    packed_dim: Union[Literal[0], Literal[1]] = 1,
 ) -> torch.Tensor:
     """
     Unpacks a tensor of packed int32 weights into individual int8s, maintaining the
@@ -216,17 +269,31 @@ def unpack_from_int32(
 
     # unpack
     mask = (1 << num_bits) - 1
-    unpacked = torch.zeros(
-        (value.shape[0], value.shape[1] * pack_factor),
-        device=value.device,
-        dtype=torch.int32,
-    )
-    for i in range(pack_factor):
-        unpacked[:, i::pack_factor] = (value >> (num_bits * i)) & mask
-
-    # remove padding
-    original_row_size = int(shape[1])
-    unpacked = unpacked[:, :original_row_size]
+
+    if packed_dim == 1:
+        unpacked = torch.zeros(
+            (value.shape[0], value.shape[1] * pack_factor),
+            device=value.device,
+            dtype=torch.int32,
+        )
+        for i in range(pack_factor):
+            unpacked[:, i::pack_factor] = (value >> (num_bits * i)) & mask
+
+        # remove padding
+        original_row_size = int(shape[1])
+        unpacked = unpacked[:, :original_row_size]
+    else:
+        unpacked = torch.zeros(
+            (value.shape[0] * pack_factor, value.shape[1]),
+            device=value.device,
+            dtype=torch.int32,
+        )
+        for i in range(pack_factor):
+            unpacked[i::pack_factor, :] = (value >> (num_bits * i)) & mask
+
+        # remove padding
+        original_row_size = int(shape[0])
+        unpacked = unpacked[:original_row_size, :]
 
     # bits are packed in unsigned format, reformat to signed
     # update the value range from unsigned to signed
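
To make the bit layout concrete, here is a self-contained round-trip of the same interleaved scheme for the default packed_dim=1 case (a simplified reimplementation for illustration, not the library functions):

    import numpy as np

    def pack(values: np.ndarray, num_bits: int) -> np.ndarray:
        pack_factor = 32 // num_bits                       # e.g. 8 nibbles per int32
        packed_size = -(-values.shape[1] // pack_factor)   # ceil division
        pad = packed_size * pack_factor - values.shape[1]
        values = np.pad(values, [(0, 0), (0, pad)]).astype(np.uint32)
        packed = np.zeros((values.shape[0], packed_size), dtype=np.uint32)
        for i in range(pack_factor):
            # packed column j holds original column j * pack_factor + i
            # in bits [num_bits * i, num_bits * (i + 1))
            packed |= values[:, i::pack_factor] << (num_bits * i)
        return packed.view(np.int32)

    def unpack(packed: np.ndarray, num_bits: int, cols: int) -> np.ndarray:
        pack_factor = 32 // num_bits
        mask = (1 << num_bits) - 1
        u = packed.view(np.uint32)
        out = np.zeros((u.shape[0], u.shape[1] * pack_factor), dtype=np.uint32)
        for i in range(pack_factor):
            out[:, i::pack_factor] = (u >> (num_bits * i)) & mask
        return out[:, :cols]                               # strip the padding

    vals = np.random.randint(0, 16, size=(3, 10)).astype(np.uint32)  # 4-bit range
    assert np.array_equal(unpack(pack(vals, 4), 4, 10), vals)
    # packed_dim=0 in the library applies the same scheme down the rows instead.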
src/compressed_tensors/compressors/sparse_compressors/base.py
@@ -98,7 +98,11 @@ class BaseSparseCompressor(BaseCompressor):
         return compressed_dict
 
     def decompress(
-        self, path_to_model_or_tensors: str, device: str = "cpu", **kwargs
+        self,
+        path_to_model_or_tensors: str,
+        device: str = "cpu",
+        params_to_skip_load: Optional[Tuple] = None,
+        **kwargs,
     ) -> Generator[Tuple[str, Tensor], None, None]:
         """
         Reads a bitmask compressed state dict located
@@ -108,6 +112,11 @@ class BaseSparseCompressor(BaseCompressor):
         :param model_path: path to compressed safetensors model (directory with
             one or more safetensors files) or compressed tensors file
         :param device: device to load decompressed weights onto
+        :param params_to_skip_load: a list of non-sparsity parameters (e.g. quantization
+            parameters) that we want to skip loading. As the sparsity compressor does
+            not handle quantized decompression, this should contain any quantization
+            parameters when decompressing stacked compressors. We want these parameters
+            to be handled by the quantization decompressor
         :return: iterator for generating decompressed weights
         """
         weight_mappings, ignored_params = get_nested_weight_mappings(
@@ -121,13 +130,21 @@ class BaseSparseCompressor(BaseCompressor):
                 full_name = merge_names(weight_name, param_name)
                 with safe_open(safe_path, framework="pt", device=device) as f:
                     weight_data[param_name] = f.get_tensor(full_name)
+
             decompressed = self.decompress_weight(weight_data)
             yield merge_names(weight_name, "weight"), decompressed
 
         for ignored_param_name, safe_path in ignored_params.items():
-            with safe_open(safe_path, framework="pt", device=device) as f:
-                value = f.get_tensor(ignored_param_name)
-            yield ignored_param_name, value
+            should_skip = False
+            if params_to_skip_load is not None:
+                for param_to_skip in params_to_skip_load:
+                    if param_to_skip in ignored_param_name:
+                        should_skip = True
+
+            if not should_skip:
+                with safe_open(safe_path, framework="pt", device=device) as f:
+                    value = f.get_tensor(ignored_param_name)
+                yield ignored_param_name, value
 
     @staticmethod
     def should_compress(name: str, expanded_targets: Optional[Set[str]] = None) -> bool:
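
The skip check above is a plain substring match against the full parameter name, so passing the quantization compressor's compression_param_names filters its entries out of the sparsity pass. A toy run of the same filtering logic (the parameter names are invented for illustration):

    params_to_skip_load = ("weight_scale", "weight_zero_point")
    ignored_params = [
        "model.layers.0.self_attn.q_proj.weight_scale",
        "model.layers.0.self_attn.q_proj.weight_zero_point",
        "lm_head.weight",
    ]
    loaded = [
        name for name in ignored_params
        if not any(skip in name for skip in params_to_skip_load)
    ]
    assert loaded == ["lm_head.weight"]  # scales/zp are left for the quantization pass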