compressed-tensors 0.11.1a20250821__py3-none-any.whl → 0.11.1a20250902__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -703,9 +703,12 @@ class ModelCompressor:
703
703
  with override_quantization_status(
704
704
  self.quantization_config, QuantizationStatus.FROZEN
705
705
  ):
706
- names_to_scheme = apply_quantization_config(
707
- model, self.quantization_config
708
- )
706
+ apply_quantization_config(model, self.quantization_config)
707
+ names_to_scheme: Set[QuantizationScheme] = {
708
+ name: getattr(module, "quantization_scheme")
709
+ for name, module in model.named_modules()
710
+ if getattr(module, "quantization_scheme", None) is not None
711
+ }
709
712
  # Load activation scales/zp or any other quantization parameters
710
713
  # Conditionally load the weight quantization parameters if we have a
711
714
  # dense compressor or if a sparsity compressor has already been applied
@@ -123,6 +123,7 @@ class NVFP4PackedCompressor(BaseQuantizationCompressor):
123
123
  return decompressed_weight
124
124
 
125
125
 
126
+ @torch.compile(fullgraph=True, dynamic=True)
126
127
  def pack_fp4_to_uint8(x: torch.Tensor) -> torch.Tensor:
127
128
  """
128
129
  Packs a tensor with values in the fp4 range into uint8.
@@ -145,12 +146,11 @@ def pack_fp4_to_uint8(x: torch.Tensor) -> torch.Tensor:
145
146
 
146
147
  # Find closest valid FP4 value index for each element
147
148
  abs_x = torch.abs(x)
148
- abs_indices = torch.zeros_like(abs_x, dtype=torch.long)
149
- for i, val in enumerate(kE2M1):
150
- abs_indices = torch.where(torch.isclose(abs_x, val), i, abs_indices)
149
+ abs_diff_x = torch.abs(abs_x.unsqueeze(-1) - kE2M1) # [m, n, 8]
150
+ abs_indices = torch.argmin(abs_diff_x, dim=-1) # [m, n]
151
151
 
152
152
  # Apply sign bit (bit 3) to get final 4-bit representation
153
- indices = abs_indices + (torch.signbit(x) << 3).to(torch.long)
153
+ indices = abs_indices + (torch.signbit(x).to(torch.long) << 3)
154
154
 
155
155
  # Reshape to prepare for packing pairs of values
156
156
  indices = indices.reshape(-1)
@@ -174,6 +174,7 @@ kE2M1ToFloat = torch.tensor(
174
174
 
175
175
 
176
176
  # reference: https://github.com/vllm-project/vllm/pull/16362
177
+ @torch.compile(fullgraph=True, dynamic=True)
177
178
  def unpack_fp4_from_uint8(
178
179
  a: torch.Tensor, m: int, n: int, dtype: Optional[torch.dtype] = torch.bfloat16
179
180
  ) -> torch.Tensor:
@@ -115,7 +115,7 @@ def load_pretrained_quantization_parameters(
115
115
 
116
116
  def apply_quantization_config(
117
117
  model: Module, config: Union[QuantizationConfig, None], run_compressed: bool = False
118
- ) -> Dict[str, QuantizationScheme]:
118
+ ):
119
119
  """
120
120
  Initializes the model for quantization in-place based on the given config.
121
121
  Optionally converts quantizable modules to compressed_linear modules
@@ -125,26 +125,22 @@ def apply_quantization_config(
125
125
  :param run_compressed: Whether the model will be run in compressed mode or
126
126
  decompressed fully on load
127
127
  """
128
- # Workaround for when HF Quantizer passes None, see PR #180
129
- if config is None:
130
- return dict()
128
+ from compressed_tensors.linear.compressed_linear import CompressedLinear
131
129
 
132
- # remove reference to the original `config`
133
- # argument. This function can mutate it, and we'd
134
- # like to keep the original `config` as it is.
135
130
  config = deepcopy(config)
131
+ if config is None: # see PR #180
132
+ return dict()
133
+
134
+ # preprocess to support kv cache scheme
135
+ config = process_quantization_config(config)
136
+
136
137
  # build mapping of targets to schemes for easier matching
137
138
  # use ordered dict to preserve target ordering in config
138
139
  target_to_scheme = OrderedDict()
139
- config = process_quantization_config(config)
140
- names_to_scheme = dict()
141
140
  for scheme in config.config_groups.values():
142
141
  for target in scheme.targets:
143
142
  target_to_scheme[target] = scheme
144
143
 
145
- if run_compressed:
146
- from compressed_tensors.linear.compressed_linear import CompressedLinear
147
-
148
144
  # mark appropriate layers for quantization by setting their quantization schemes
149
145
  for name, submodule in match_named_modules(
150
146
  model, target_to_scheme, config.ignore, warn_on_fail=True
@@ -153,7 +149,12 @@ def apply_quantization_config(
153
149
  # quant scheme to the matching layers
154
150
  matched_targets = match_targets(name, submodule, target_to_scheme)
155
151
  scheme = _scheme_from_targets(target_to_scheme, matched_targets, name)
156
- if run_compressed:
152
+ # target matched - add layer and scheme to target list
153
+ submodule.quantization_scheme = scheme
154
+
155
+ # replace with run compressed if applicable
156
+ # FUTURE: move this to model compressor
157
+ if isinstance(submodule, torch.nn.Linear) and run_compressed:
157
158
  format = config.format
158
159
  if format != CompressionFormat.dense.value:
159
160
  if isinstance(submodule, torch.nn.Linear):
@@ -165,14 +166,8 @@ def apply_quantization_config(
165
166
  )
166
167
  replace_module(model, name, compressed_linear)
167
168
 
168
- # target matched - add layer and scheme to target list
169
- submodule.quantization_scheme = scheme
170
-
171
- names_to_scheme[name] = submodule.quantization_scheme
172
-
173
169
  # apply current quantization status across all targeted layers
174
170
  apply_quantization_status(model, config.quantization_status)
175
- return names_to_scheme
176
171
 
177
172
 
178
173
  def process_quantization_config(config: QuantizationConfig) -> QuantizationConfig:
@@ -17,5 +17,5 @@ __version__: str
17
17
  __version_tuple__: VERSION_TUPLE
18
18
  version_tuple: VERSION_TUPLE
19
19
 
20
- __version__ = version = '0.11.1.a20250821'
20
+ __version__ = version = '0.11.1.a20250902'
21
21
  __version_tuple__ = version_tuple = (0, 11, 1)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: compressed-tensors
3
- Version: 0.11.1a20250821
3
+ Version: 0.11.1a20250902
4
4
  Summary: Library for utilization of compressed safetensors of neural network models
5
5
  Home-page: https://github.com/neuralmagic/compressed-tensors
6
6
  Author: Neuralmagic, Inc.
@@ -1,15 +1,15 @@
1
1
  compressed_tensors/__init__.py,sha256=UtKmifNeBCSE2TZSAfduVNNzHY-3V7bLjZ7n7RuXLOE,812
2
2
  compressed_tensors/base.py,sha256=-gxWvDF4LCkyeDP8YlGzvBBKxo4Dk9h4NINPD61drFU,921
3
- compressed_tensors/version.py,sha256=QiPWK4b5m-LXWHE8_W5EK7VPtKZvorPc5Opz7BYczvA,523
3
+ compressed_tensors/version.py,sha256=f2tYhW3Wm283FrozC12eaBYB44is8v3s2XwTXmhs8kI,523
4
4
  compressed_tensors/compressors/__init__.py,sha256=smSygTSfcfuujRrAXDc6uZm4L_ccV1tWZewqVnOb4lM,825
5
5
  compressed_tensors/compressors/base.py,sha256=nvWsv4xEw1Tkxkxth6TmHplDYXfBeP22xWxOsZERyDY,7204
6
6
  compressed_tensors/compressors/helpers.py,sha256=OK6qxX9j3bHwF9JfIYSGMgBJe2PWjlTA3byXKCJaTIQ,5431
7
7
  compressed_tensors/compressors/model_compressors/__init__.py,sha256=5RGGPFu4YqEt_aOdFSQYFYFDjcZFJN0CsMqRtDZz3Js,666
8
- compressed_tensors/compressors/model_compressors/model_compressor.py,sha256=x2AS1NAPQx51O8uxyLf3wItnp2-_0qU2fI6eQVFBBfY,37388
8
+ compressed_tensors/compressors/model_compressors/model_compressor.py,sha256=mZqpBS5znPHedlVVkKsUsVCs52zK5bAmEiI8cqMBKnY,37618
9
9
  compressed_tensors/compressors/quantized_compressors/__init__.py,sha256=KvaFBL_Q84LxRGJOV035M8OBoCkAx8kOkfphswgkKWk,745
10
10
  compressed_tensors/compressors/quantized_compressors/base.py,sha256=_mqTG_HjAIbHqDGucA3ZR_01OXU3CMFxtrDjfM-kY0g,10301
11
11
  compressed_tensors/compressors/quantized_compressors/naive_quantized.py,sha256=0ANDcuD8aXPqTYNPY6GnX9iS6eXJw6P0TzNV_rYS2l8,5369
12
- compressed_tensors/compressors/quantized_compressors/nvfp4_quantized.py,sha256=Z8k2gi5a1F_36DiI0GJsXGc03Gh0qwBRMwMxuKIWkj8,7136
12
+ compressed_tensors/compressors/quantized_compressors/nvfp4_quantized.py,sha256=Qq790d5VQQccq6Dj8YhBwhr7S3DqMJNoYPI5S6M1FNo,7183
13
13
  compressed_tensors/compressors/quantized_compressors/pack_quantized.py,sha256=D8h9ltxSIYi1XEKYgbYu1ebbXzCibhPi-eZsBUi0NOg,11245
14
14
  compressed_tensors/compressors/sparse_compressors/__init__.py,sha256=Atuz-OdEgn8OCUhx7Ovd6gXdyImAI186uCR-uR0t_Nk,737
15
15
  compressed_tensors/compressors/sparse_compressors/base.py,sha256=YNZWcHjDleAlqbgRZQ6oJf44MQb_UDNvJGOqhl26uFA,8098
@@ -30,7 +30,7 @@ compressed_tensors/quantization/quant_args.py,sha256=5AxYKqCSlg7CDgz2N8G4ZRVIiSU
30
30
  compressed_tensors/quantization/quant_config.py,sha256=2NgDwKuQn0f-ojiHC8c6tXtYX_zQlk26Rj-bU71QKvA,10598
31
31
  compressed_tensors/quantization/quant_scheme.py,sha256=X5Z7oXMLPXnX8g-UvWXlRjn4YnD_qTk5mXfGzu20k9o,8903
32
32
  compressed_tensors/quantization/lifecycle/__init__.py,sha256=_uItzFWusyV74Zco_pHLOTdE9a83cL-R-ZdyQrBkIyw,772
33
- compressed_tensors/quantization/lifecycle/apply.py,sha256=yc9xCuQIcdhy-MGFh8OmBrB45dzJ8TzZju4mBa3AONg,14909
33
+ compressed_tensors/quantization/lifecycle/apply.py,sha256=TuSjKomSk4N0My-UY9PWk2Nyuze6TilEGPsZELgotzk,14716
34
34
  compressed_tensors/quantization/lifecycle/compressed.py,sha256=Fj9n66IN0EWsOAkBHg3O0GlOQpxstqjCcs0ttzMXrJ0,2296
35
35
  compressed_tensors/quantization/lifecycle/forward.py,sha256=xcLTgaff1wYUWzvQqYKmhWYkshWVI-PhLPtBOyyZro0,17576
36
36
  compressed_tensors/quantization/lifecycle/helpers.py,sha256=C0mhy2vJ0fCjVeN4kFNhw8Eq1wkteBGHiZ36RVLThRY,944
@@ -63,8 +63,8 @@ compressed_tensors/utils/permute.py,sha256=V6tJLKo3Syccj-viv4F7ZKZgJeCB-hl-dK8RK
63
63
  compressed_tensors/utils/safetensors_load.py,sha256=Vql34aCTDHwmTZXJHzCyBISJo7iA7EQ78LdTlMjdpZo,12023
64
64
  compressed_tensors/utils/semi_structured_conversions.py,sha256=XKNffPum54kPASgqKzgKvyeqWPAkair2XEQXjkp7ho8,13489
65
65
  compressed_tensors/utils/type.py,sha256=bNwoo_FWlvLuDpYAGGzZJITRg0JA_Ngk9LGPo-kvjeU,2554
66
- compressed_tensors-0.11.1a20250821.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
67
- compressed_tensors-0.11.1a20250821.dist-info/METADATA,sha256=jpkjjAiWJwPLa19Ej2tIJm5MEHJ9gwYsPPfvkhF6YYg,7031
68
- compressed_tensors-0.11.1a20250821.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
69
- compressed_tensors-0.11.1a20250821.dist-info/top_level.txt,sha256=w2i-GyPs2s1UwVxvutSvN_lM22SXC2hQFBmoMcPnV7Y,19
70
- compressed_tensors-0.11.1a20250821.dist-info/RECORD,,
66
+ compressed_tensors-0.11.1a20250902.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
67
+ compressed_tensors-0.11.1a20250902.dist-info/METADATA,sha256=wMAq5uQ2J5ohzgluAlXip2hoGUdDPpKrfbKtXVLFZz8,7031
68
+ compressed_tensors-0.11.1a20250902.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
69
+ compressed_tensors-0.11.1a20250902.dist-info/top_level.txt,sha256=w2i-GyPs2s1UwVxvutSvN_lM22SXC2hQFBmoMcPnV7Y,19
70
+ compressed_tensors-0.11.1a20250902.dist-info/RECORD,,