compressed-tensors-nightly 0.7.1.20241018__tar.gz → 0.7.1.20241020__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. {compressed-tensors-nightly-0.7.1.20241018/src/compressed_tensors_nightly.egg-info → compressed-tensors-nightly-0.7.1.20241020}/PKG-INFO +1 -1
  2. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/cache.py +0 -1
  3. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/lifecycle/calibration.py +12 -0
  4. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/lifecycle/forward.py +57 -26
  5. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/lifecycle/frozen.py +3 -7
  6. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/lifecycle/initialize.py +14 -14
  7. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/observers/mse.py +3 -3
  8. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020/src/compressed_tensors_nightly.egg-info}/PKG-INFO +1 -1
  9. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/LICENSE +0 -0
  10. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/README.md +0 -0
  11. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/pyproject.toml +0 -0
  12. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/setup.cfg +0 -0
  13. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/setup.py +0 -0
  14. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/__init__.py +0 -0
  15. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/base.py +0 -0
  16. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/compressors/__init__.py +0 -0
  17. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/compressors/base.py +0 -0
  18. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/compressors/helpers.py +0 -0
  19. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/compressors/model_compressors/__init__.py +0 -0
  20. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/compressors/model_compressors/model_compressor.py +0 -0
  21. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/compressors/quantized_compressors/__init__.py +0 -0
  22. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/compressors/quantized_compressors/base.py +0 -0
  23. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/compressors/quantized_compressors/naive_quantized.py +0 -0
  24. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py +0 -0
  25. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/compressors/sparse_compressors/__init__.py +0 -0
  26. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/compressors/sparse_compressors/base.py +0 -0
  27. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/compressors/sparse_compressors/dense.py +0 -0
  28. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/compressors/sparse_compressors/sparse_bitmask.py +0 -0
  29. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/compressors/sparse_quantized_compressors/__init__.py +0 -0
  30. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/compressors/sparse_quantized_compressors/marlin_24.py +0 -0
  31. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/config/__init__.py +0 -0
  32. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/config/base.py +0 -0
  33. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/config/dense.py +0 -0
  34. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/config/sparse_bitmask.py +0 -0
  35. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/linear/__init__.py +0 -0
  36. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/linear/compressed_linear.py +0 -0
  37. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/__init__.py +0 -0
  38. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/lifecycle/__init__.py +0 -0
  39. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/lifecycle/apply.py +0 -0
  40. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/lifecycle/compressed.py +0 -0
  41. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/lifecycle/helpers.py +0 -0
  42. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/observers/__init__.py +0 -0
  43. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/observers/base.py +0 -0
  44. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/observers/helpers.py +0 -0
  45. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/observers/min_max.py +0 -0
  46. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/quant_args.py +0 -0
  47. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/quant_config.py +0 -0
  48. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/quant_scheme.py +0 -0
  49. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/utils/__init__.py +0 -0
  50. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/quantization/utils/helpers.py +0 -0
  51. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/registry/__init__.py +0 -0
  52. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/registry/registry.py +0 -0
  53. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/utils/__init__.py +0 -0
  54. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/utils/helpers.py +0 -0
  55. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/utils/offload.py +0 -0
  56. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/utils/permutations_24.py +0 -0
  57. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/utils/permute.py +0 -0
  58. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/utils/safetensors_load.py +0 -0
  59. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/utils/semi_structured_conversions.py +0 -0
  60. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors/version.py +0 -0
  61. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors_nightly.egg-info/SOURCES.txt +0 -0
  62. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors_nightly.egg-info/dependency_links.txt +0 -0
  63. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors_nightly.egg-info/requires.txt +0 -0
  64. {compressed-tensors-nightly-0.7.1.20241018 → compressed-tensors-nightly-0.7.1.20241020}/src/compressed_tensors_nightly.egg-info/top_level.txt +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: compressed-tensors-nightly
-Version: 0.7.1.20241018
+Version: 0.7.1.20241020
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
src/compressed_tensors/quantization/cache.py
@@ -28,7 +28,6 @@ class KVCacheScaleType(Enum):
 
 
 class QuantizedKVParameterCache(HFDyanmicCache):
-
     """
     Quantized KV cache used in the forward call based on HF's dynamic cache.
     Quantization strategy (tensor, group, channel) set from Quantization arg's strategy
src/compressed_tensors/quantization/lifecycle/calibration.py
@@ -53,7 +53,19 @@ def set_module_for_calibration(module: Module, quantize_weights_upfront: bool =
 
     if quantize_weights_upfront and module.quantization_scheme.weights is not None:
         # set weight scale and zero_point up front, calibration data doesn't affect it
+        if not hasattr(module, "weight_observer"):
+            from compressed_tensors.quantization.lifecycle.initialize import (
+                initialize_observers,
+            )
+
+            initialize_observers(
+                module=module,
+                base_name="weight",
+                quantization_args=module.quantization_scheme.weights,
+            )
+
         observer = module.weight_observer
+
         g_idx = getattr(module, "weight_g_idx", None)
 
         offloaded = is_module_offloaded(module)
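
Note: with this change, set_module_for_calibration no longer assumes the weight observer was registered during initialization; it creates one on demand. A minimal usage sketch, assuming a module whose quantization scheme includes weights ("layer" below is a hypothetical quantized torch.nn.Linear):

    # lazy-observer sketch: before this diff, a missing "weight_observer"
    # attribute here raised AttributeError; now it is initialized on demand
    set_module_for_calibration(layer, quantize_weights_upfront=True)
    # layer.weight_scale / layer.weight_zero_point are now populated from
    # the freshly attached weight observer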
src/compressed_tensors/quantization/lifecycle/forward.py
@@ -38,7 +38,8 @@ __all__ = [
     "dequantize",
     "fake_quantize",
     "wrap_module_forward_quantized",
-    "maybe_calibrate_or_quantize",
+    "forward_quantize",
+    "calibrate_activations",
 ]
 
 
@@ -276,14 +277,24 @@ def wrap_module_forward_quantized(module: Module, scheme: QuantizationScheme):
 
         if scheme.input_activations is not None:
             # calibrate and (fake) quantize input activations when applicable
-            input_ = maybe_calibrate_or_quantize(
-                module, input_, "input", scheme.input_activations
-            )
+            # NOTE: will be moved out of compressed-tensors
+            if (
+                module.quantization_status == QuantizationStatus.CALIBRATION
+                and not scheme.input_activations.dynamic
+            ):
+                calibrate_activations(
+                    module=module,
+                    value=input_,
+                    base_name="input",
+                    quantization_args=scheme.input_activations,
+                )
+
+            input_ = forward_quantize(module, input_, "input", scheme.input_activations)
 
         if scheme.weights is not None and not compressed:
             # calibrate and (fake) quantize weights when applicable
             unquantized_weight = self.weight.data.clone()
-            self.weight.data = maybe_calibrate_or_quantize(
+            self.weight.data = forward_quantize(
                 module, self.weight, "weight", scheme.weights
             )
 
@@ -296,7 +307,19 @@ def wrap_module_forward_quantized(module: Module, scheme: QuantizationScheme):
             # calibrate and (fake) quantize output activations when applicable
             # kv_cache scales updated on model self_attn forward call in
             # wrap_module_forward_quantized_attn
-            output = maybe_calibrate_or_quantize(
+
+            if (
+                module.quantization_status == QuantizationStatus.CALIBRATION
+                and not scheme.output_activations.dynamic
+            ):
+                calibrate_activations(
+                    module=module,
+                    value=output,
+                    base_name="output",
+                    quantization_args=scheme.output_activations,
+                )
+
+            output = forward_quantize(
                 module, output, "output", scheme.output_activations
             )
 
@@ -356,12 +379,36 @@ def wrap_module_forward_quantized_attn(module: Module, scheme: QuantizationSchem
     setattr(module, "forward", bound_wrapped_forward)
 
 
-def maybe_calibrate_or_quantize(
+def calibrate_activations(
+    module: Module,
+    value: torch.Tensor,
+    base_name: str,
+    quantization_args: QuantizationArgs,
+):
+    # If empty tensor, can't update zp/scale
+    # Case for MoEs
+    if value.numel() == 0:
+        return
+    # calibration mode - get new quant params from observer
+    if not hasattr(module, f"{base_name}_observer"):
+        from compressed_tensors.quantization.lifecycle import initialize_observers
+
+        initialize_observers(
+            module=module, base_name=base_name, quantization_args=quantization_args
+        )
+
+    observer = getattr(module, f"{base_name}_observer")
+
+    updated_scale, updated_zero_point = observer(value)
+
+    # update scale and zero point
+    update_parameter_data(module, updated_scale, f"{base_name}_scale")
+    update_parameter_data(module, updated_zero_point, f"{base_name}_zero_point")
+
+
+def forward_quantize(
     module: Module, value: torch.Tensor, base_name: str, args: "QuantizationArgs"
 ) -> torch.Tensor:
-    # don't run quantization if we haven't entered calibration mode
-    if module.quantization_status == QuantizationStatus.INITIALIZED:
-        return value
 
     # in compressed mode, the weight is already compressed and quantized so we don't
     # need to run fake quantization
@@ -386,22 +433,6 @@ def maybe_calibrate_or_quantize(
     scale = getattr(module, f"{base_name}_scale")
     zero_point = getattr(module, f"{base_name}_zero_point", None)
 
-    if (
-        module.quantization_status == QuantizationStatus.CALIBRATION
-        and base_name != "weight"
-    ):
-        # calibration mode - get new quant params from observer
-        observer = getattr(module, f"{base_name}_observer")
-
-        updated_scale, updated_zero_point = observer(value, g_idx=g_idx)
-
-        # update scale and zero point
-        update_parameter_data(module, updated_scale, f"{base_name}_scale")
-        update_parameter_data(module, updated_zero_point, f"{base_name}_zero_point")
-
-        scale = updated_scale
-        zero_point = updated_zero_point
-
     return fake_quantize(
         x=value,
         scale=scale,
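
Note: taken together, the forward.py hunks split the old maybe_calibrate_or_quantize into two single-purpose functions: calibrate_activations updates observer statistics and the module's scale/zero_point parameters, while forward_quantize only applies fake quantization with whatever parameters are already present. A hedged usage sketch ("layer" and "calibration_batches" are assumed, not part of the library):

    # hypothetical calibration loop over a quantized layer "layer"
    layer.quantization_status = QuantizationStatus.CALIBRATION
    for batch in calibration_batches:  # assumed iterable of input tensors
        calibrate_activations(
            module=layer,
            value=batch,
            base_name="input",
            quantization_args=layer.quantization_scheme.input_activations,
        )
    # forward_quantize then fake-quantizes using the scale / zero_point
    # the observer just wrote onto the module
    q = forward_quantize(
        layer, batch, "input", layer.quantization_scheme.input_activations
    )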
src/compressed_tensors/quantization/lifecycle/frozen.py
@@ -41,15 +41,11 @@ def freeze_module_quantization(module: Module):
         return
 
     # delete observers from module if not dynamic
-    if scheme.input_activations and not scheme.input_activations.dynamic:
+    if hasattr(module, "input_observer") and not scheme.input_activations.dynamic:
         delattr(module, "input_observer")
-    if scheme.weights and not scheme.weights.dynamic:
+    if hasattr(module, "weight_observer") and not scheme.weights.dynamic:
         delattr(module, "weight_observer")
-    if (
-        scheme.output_activations
-        and not is_kv_cache_quant_scheme(scheme)
-        and not scheme.output_activations.dynamic
-    ):
+    if hasattr(module, "output_observer") and not scheme.output_activations.dynamic:
         delattr(module, "output_observer")
 
     module.quantization_status = QuantizationStatus.FROZEN
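
Note: freezing now keys observer cleanup off the attributes actually present on the module rather than re-deriving eligibility from the scheme, so observers that were never registered (e.g. kv-cache output schemes) no longer need special-casing. A simplified sketch of the guard pattern, which omits the per-scheme dynamic check shown in the diff:

    # generic hasattr-guarded teardown; "module" is any quantized submodule
    for base_name in ("input", "weight", "output"):
        if hasattr(module, f"{base_name}_observer"):
            delattr(module, f"{base_name}_observer")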
src/compressed_tensors/quantization/lifecycle/initialize.py
@@ -34,9 +34,7 @@ from compressed_tensors.utils import get_execution_device, is_module_offloaded
 from torch.nn import Module, Parameter
 
 
-__all__ = [
-    "initialize_module_for_quantization",
-]
+__all__ = ["initialize_module_for_quantization", "initialize_observers"]
 
 
 _LOGGER = logging.getLogger(__name__)
@@ -74,7 +72,7 @@ def initialize_module_for_quantization(
     else:
 
         if scheme.input_activations is not None:
-            _initialize_scale_zero_point_observer(
+            _initialize_scale_zero_point(
                 module,
                 "input",
                 scheme.input_activations,
@@ -85,7 +83,7 @@ def initialize_module_for_quantization(
             weight_shape = None
             if isinstance(module, torch.nn.Linear):
                 weight_shape = module.weight.shape
-            _initialize_scale_zero_point_observer(
+            _initialize_scale_zero_point(
                 module,
                 "weight",
                 scheme.weights,
@@ -101,7 +99,7 @@ def initialize_module_for_quantization(
 
         if scheme.output_activations is not None:
             if not is_kv_cache_quant_scheme(scheme):
-                _initialize_scale_zero_point_observer(
+                _initialize_scale_zero_point(
                     module, "output", scheme.output_activations
                 )
 
@@ -146,21 +144,23 @@ def initialize_module_for_quantization(
         module._hf_hook.weights_map = new_prefix_dict
 
 
-def _initialize_scale_zero_point_observer(
+def initialize_observers(
     module: Module,
     base_name: str,
     quantization_args: QuantizationArgs,
-    weight_shape: Optional[torch.Size] = None,
-    force_zero_point: bool = True,
 ):
-
     # initialize observer module and attach as submodule
     observer = quantization_args.get_observer()
-    # no need to register an observer for dynamic quantization
-    if observer:
-        module.register_module(f"{base_name}_observer", observer)
+    module.register_module(f"{base_name}_observer", observer)
 
-    # no need to register a scale and zero point for a dynamic quantization
+
+def _initialize_scale_zero_point(
+    module: Module,
+    base_name: str,
+    quantization_args: QuantizationArgs,
+    weight_shape: Optional[torch.Size] = None,
+    force_zero_point: bool = True,
+):
     if quantization_args.dynamic:
         return
 
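
Note: the old _initialize_scale_zero_point_observer has been split in two: initialize_observers (now public, used by the lazy paths above) only attaches the observer submodule, and _initialize_scale_zero_point creates the quantization parameters. A sketch of calling the public half on its own ("linear_layer" and "weight_args" are assumed to exist):

    from compressed_tensors.quantization.lifecycle import initialize_observers

    initialize_observers(
        module=linear_layer,            # hypothetical torch.nn.Linear
        base_name="weight",
        quantization_args=weight_args,  # a QuantizationArgs instance
    )
    assert hasattr(linear_layer, "weight_observer")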
src/compressed_tensors/quantization/observers/mse.py
@@ -70,9 +70,9 @@ class MovingAverageMSEObserver(Observer):
         absolute_min_val = torch.amin(observed, dim=reduce_dims, keepdims=True)
         absolute_max_val = torch.amax(observed, dim=reduce_dims, keepdims=True)
 
-        best = torch.full(absolute_min_val.shape, float("inf"))
-        min_val = torch.ones(absolute_min_val.shape)
-        max_val = torch.zeros(absolute_max_val.shape)
+        best = torch.full_like(absolute_min_val, torch.finfo(absolute_min_val.dtype).max)
+        min_val = torch.ones_like(absolute_min_val)
+        max_val = torch.zeros_like(absolute_max_val)
         for i in range(int(self.maxshrink * self.grid)):
             p = 1 - i / self.grid
             shrinked_min_val = p * absolute_min_val
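
Note: the *_like constructors inherit both dtype and device from the observed tensor, so the MSE grid search no longer allocates CPU float32 scratch tensors when calibrating a CUDA or half-precision model, and torch.finfo(...).max replaces the float("inf") sentinel. A small illustration (assumes a CUDA device is available):

    import torch

    observed = torch.randn(4, 1, dtype=torch.float16, device="cuda")
    best_old = torch.full(observed.shape, float("inf"))  # CPU float32
    best_new = torch.full_like(observed, torch.finfo(observed.dtype).max)
    assert best_new.device == observed.device
    assert best_new.dtype == observed.dtype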
src/compressed_tensors_nightly.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: compressed-tensors-nightly
-Version: 0.7.1.20241018
+Version: 0.7.1.20241020
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.