compressed-tensors-nightly 0.7.1.20241017__py3-none-any.whl → 0.7.1.20241020__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- compressed_tensors/quantization/cache.py +0 -1
- compressed_tensors/quantization/lifecycle/calibration.py +12 -0
- compressed_tensors/quantization/lifecycle/forward.py +57 -26
- compressed_tensors/quantization/lifecycle/frozen.py +3 -7
- compressed_tensors/quantization/lifecycle/initialize.py +14 -14
- compressed_tensors/quantization/observers/mse.py +3 -3
- {compressed_tensors_nightly-0.7.1.20241017.dist-info → compressed_tensors_nightly-0.7.1.20241020.dist-info}/METADATA +1 -1
- {compressed_tensors_nightly-0.7.1.20241017.dist-info → compressed_tensors_nightly-0.7.1.20241020.dist-info}/RECORD +11 -11
- {compressed_tensors_nightly-0.7.1.20241017.dist-info → compressed_tensors_nightly-0.7.1.20241020.dist-info}/LICENSE +0 -0
- {compressed_tensors_nightly-0.7.1.20241017.dist-info → compressed_tensors_nightly-0.7.1.20241020.dist-info}/WHEEL +0 -0
- {compressed_tensors_nightly-0.7.1.20241017.dist-info → compressed_tensors_nightly-0.7.1.20241020.dist-info}/top_level.txt +0 -0
compressed_tensors/quantization/lifecycle/calibration.py

@@ -53,7 +53,19 @@ def set_module_for_calibration(module: Module, quantize_weights_upfront: bool =

     if quantize_weights_upfront and module.quantization_scheme.weights is not None:
         # set weight scale and zero_point up front, calibration data doesn't affect it
+        if not hasattr(module, "weight_observer"):
+            from compressed_tensors.quantization.lifecycle.initialize import (
+                initialize_observers,
+            )
+
+            initialize_observers(
+                module=module,
+                base_name="weight",
+                quantization_args=module.quantization_scheme.weights,
+            )
+
         observer = module.weight_observer
+
         g_idx = getattr(module, "weight_g_idx", None)

         offloaded = is_module_offloaded(module)
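In short, set_module_for_calibration no longer assumes a weight observer was attached during initialization; it creates one on demand through the new initialize_observers helper. A minimal, self-contained sketch of that lazy-attach pattern (MinMaxObserver and ensure_weight_observer are illustrative stand-ins, not library APIs):

```python
import torch
from torch.nn import Linear, Module


class MinMaxObserver(Module):
    """Toy stand-in for an observer: derive a symmetric int8 scale/zero-point."""

    def forward(self, value: torch.Tensor):
        max_abs = value.abs().amax()
        scale = (max_abs / 127.0).clamp(min=1e-8)
        zero_point = torch.zeros_like(scale, dtype=torch.int8)
        return scale, zero_point


def ensure_weight_observer(module: Module) -> Module:
    # Lazily attach the observer the first time calibration needs it,
    # mirroring the hasattr() guard added in the hunk above.
    if not hasattr(module, "weight_observer"):
        module.register_module("weight_observer", MinMaxObserver())
    return module.weight_observer


layer = Linear(16, 8)
observer = ensure_weight_observer(layer)
scale, zero_point = observer(layer.weight)
print(scale.item(), zero_point.item())
```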
compressed_tensors/quantization/lifecycle/forward.py

@@ -38,7 +38,8 @@ __all__ = [
     "dequantize",
     "fake_quantize",
     "wrap_module_forward_quantized",
-    "maybe_calibrate_or_quantize",
+    "forward_quantize",
+    "calibrate_activations",
 ]

@@ -276,14 +277,24 @@ def wrap_module_forward_quantized(module: Module, scheme: QuantizationScheme):

         if scheme.input_activations is not None:
             # calibrate and (fake) quantize input activations when applicable
-            input_ = maybe_calibrate_or_quantize(
-                module, input_, "input", scheme.input_activations
-            )
+            # NOTE: will be moved out of compressed-tensors
+            if (
+                module.quantization_status == QuantizationStatus.CALIBRATION
+                and not scheme.input_activations.dynamic
+            ):
+                calibrate_activations(
+                    module=module,
+                    value=input_,
+                    base_name="input",
+                    quantization_args=scheme.input_activations,
+                )
+
+            input_ = forward_quantize(module, input_, "input", scheme.input_activations)

         if scheme.weights is not None and not compressed:
             # calibrate and (fake) quantize weights when applicable
             unquantized_weight = self.weight.data.clone()
-            self.weight.data = maybe_calibrate_or_quantize(
+            self.weight.data = forward_quantize(
                 module, self.weight, "weight", scheme.weights
             )

@@ -296,7 +307,19 @@ def wrap_module_forward_quantized(module: Module, scheme: QuantizationScheme):
             # calibrate and (fake) quantize output activations when applicable
             # kv_cache scales updated on model self_attn forward call in
             # wrap_module_forward_quantized_attn
-            output = maybe_calibrate_or_quantize(
+
+            if (
+                module.quantization_status == QuantizationStatus.CALIBRATION
+                and not scheme.output_activations.dynamic
+            ):
+                calibrate_activations(
+                    module=module,
+                    value=output,
+                    base_name="output",
+                    quantization_args=scheme.ouput_activations,
+                )
+
+            output = forward_quantize(
                 module, output, "output", scheme.output_activations
             )

@@ -356,12 +379,36 @@ def wrap_module_forward_quantized_attn(module: Module, scheme: QuantizationScheme):
     setattr(module, "forward", bound_wrapped_forward)


-def maybe_calibrate_or_quantize(
+def calibrate_activations(
+    module: Module,
+    value: torch.Tensor,
+    base_name: str,
+    quantization_args: QuantizationArgs,
+):
+    # If empty tensor, can't update zp/scale
+    # Case for MoEs
+    if value.numel() == 0:
+        return
+    # calibration mode - get new quant params from observer
+    if not hasattr(module, f"{base_name}_observer"):
+        from compressed_tensors.quantization.lifecycle import initialize_observers
+
+        initialize_observers(
+            module=module, base_name=base_name, quantization_args=quantization_args
+        )
+
+    observer = getattr(module, f"{base_name}_observer")
+
+    updated_scale, updated_zero_point = observer(value)
+
+    # update scale and zero point
+    update_parameter_data(module, updated_scale, f"{base_name}_scale")
+    update_parameter_data(module, updated_zero_point, f"{base_name}_zero_point")
+
+
+def forward_quantize(
     module: Module, value: torch.Tensor, base_name: str, args: "QuantizationArgs"
 ) -> torch.Tensor:
-    # don't run quantization if we haven't entered calibration mode
-    if module.quantization_status == QuantizationStatus.INITIALIZED:
-        return value

     # in compressed mode, the weight is already compressed and quantized so we don't
     # need to run fake quantization
@@ -386,22 +433,6 @@ def maybe_calibrate_or_quantize(
     scale = getattr(module, f"{base_name}_scale")
     zero_point = getattr(module, f"{base_name}_zero_point", None)

-    if (
-        module.quantization_status == QuantizationStatus.CALIBRATION
-        and base_name != "weight"
-    ):
-        # calibration mode - get new quant params from observer
-        observer = getattr(module, f"{base_name}_observer")
-
-        updated_scale, updated_zero_point = observer(value, g_idx=g_idx)
-
-        # update scale and zero point
-        update_parameter_data(module, updated_scale, f"{base_name}_scale")
-        update_parameter_data(module, updated_zero_point, f"{base_name}_zero_point")
-
-        scale = updated_scale
-        zero_point = updated_zero_point
-
     return fake_quantize(
         x=value,
         scale=scale,
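The net effect in forward.py: during calibration, a non-dynamic activation first passes through its observer to refresh the stored scale and zero-point (calibrate_activations), and the value is then fake-quantized with those parameters (forward_quantize). A self-contained sketch of that two-step flow, using a toy min-max observer and plain tensors rather than the library's modules:

```python
import torch


def observe_minmax(value: torch.Tensor, num_bits: int = 8):
    """Toy observer: derive an asymmetric scale/zero-point from the batch."""
    qmin, qmax = 0, 2**num_bits - 1
    min_val, max_val = value.amin(), value.amax()
    scale = ((max_val - min_val) / (qmax - qmin)).clamp(min=1e-8)
    zero_point = torch.round(qmin - min_val / scale).clamp(qmin, qmax)
    return scale, zero_point


def fake_quantize(value: torch.Tensor, scale, zero_point, num_bits: int = 8):
    """Quantize then immediately dequantize (simulated quantization error)."""
    qmin, qmax = 0, 2**num_bits - 1
    q = torch.clamp(torch.round(value / scale + zero_point), qmin, qmax)
    return (q - zero_point) * scale


calibrating = True  # analogous to QuantizationStatus.CALIBRATION
x = torch.randn(4, 16)

if calibrating:
    scale, zero_point = observe_minmax(x)   # step 1: refresh quant params
x_q = fake_quantize(x, scale, zero_point)   # step 2: always fake-quantize
print((x - x_q).abs().max())
```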
compressed_tensors/quantization/lifecycle/frozen.py

@@ -41,15 +41,11 @@ def freeze_module_quantization(module: Module):
         return

     # delete observers from module if not dynamic
-    if scheme.input_activations and not scheme.input_activations.dynamic:
+    if hasattr(module, "input_observer") and not scheme.input_activations.dynamic:
         delattr(module, "input_observer")
-    if scheme.weights and not scheme.weights.dynamic:
+    if hasattr(module, "weight_observer") and not scheme.weights.dynamic:
         delattr(module, "weight_observer")
-    if (
-        scheme.output_activations
-        and not is_kv_cache_quant_scheme(scheme)
-        and not scheme.output_activations.dynamic
-    ):
+    if hasattr(module, "output_observer") and not scheme.output_activations.dynamic:
         delattr(module, "output_observer")

     module.quantization_status = QuantizationStatus.FROZEN
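Freezing now only deletes observers that actually exist on the module (the hasattr guard), which matters because observers may never have been attached for some tensors. A tiny illustration of the guard on a plain torch module (names are illustrative only):

```python
import torch
from torch.nn import Linear, Module

layer = Linear(8, 4)
layer.register_module("weight_observer", Module())  # pretend calibration attached one

# Only remove observers that are actually present; "input_observer" was never created.
for name in ("input_observer", "weight_observer", "output_observer"):
    if hasattr(layer, name):
        delattr(layer, name)

print(hasattr(layer, "weight_observer"))  # False
```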
compressed_tensors/quantization/lifecycle/initialize.py

@@ -34,9 +34,7 @@ from compressed_tensors.utils import get_execution_device, is_module_offloaded
 from torch.nn import Module, Parameter


-__all__ = [
-    "initialize_module_for_quantization",
-]
+__all__ = ["initialize_module_for_quantization", "initialize_observers"]


 _LOGGER = logging.getLogger(__name__)
@@ -74,7 +72,7 @@ def initialize_module_for_quantization(
     else:

         if scheme.input_activations is not None:
-            _initialize_scale_zero_point_observer(
+            _initialize_scale_zero_point(
                 module,
                 "input",
                 scheme.input_activations,
@@ -85,7 +83,7 @@ def initialize_module_for_quantization(
             weight_shape = None
             if isinstance(module, torch.nn.Linear):
                 weight_shape = module.weight.shape
-            _initialize_scale_zero_point_observer(
+            _initialize_scale_zero_point(
                 module,
                 "weight",
                 scheme.weights,
@@ -101,7 +99,7 @@ def initialize_module_for_quantization(

         if scheme.output_activations is not None:
             if not is_kv_cache_quant_scheme(scheme):
-                _initialize_scale_zero_point_observer(
+                _initialize_scale_zero_point(
                     module, "output", scheme.output_activations
                 )

@@ -146,21 +144,23 @@ def initialize_module_for_quantization(
         module._hf_hook.weights_map = new_prefix_dict


-def _initialize_scale_zero_point_observer(
+def initialize_observers(
     module: Module,
     base_name: str,
     quantization_args: QuantizationArgs,
-    weight_shape: Optional[torch.Size] = None,
-    force_zero_point: bool = True,
 ):
-
     # initialize observer module and attach as submodule
     observer = quantization_args.get_observer()
-
-    if observer:
-        module.register_module(f"{base_name}_observer", observer)
+    module.register_module(f"{base_name}_observer", observer)

-
+
+def _initialize_scale_zero_point(
+    module: Module,
+    base_name: str,
+    quantization_args: QuantizationArgs,
+    weight_shape: Optional[torch.Size] = None,
+    force_zero_point: bool = True,
+):
     if quantization_args.dynamic:
         return

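initialize.py now separates the two concerns: initialize_observers attaches the observer as a named submodule, while _initialize_scale_zero_point registers the scale and zero-point state (and still does nothing for dynamic schemes). A rough, self-contained illustration of that split on a plain torch module (the helper names below are stand-ins, not the library's):

```python
import torch
from torch.nn import Linear, Module, Parameter


class ToyObserver(Module):
    """Placeholder observer; the real library builds one via quantization_args.get_observer()."""

    def forward(self, value: torch.Tensor):
        return value.abs().amax() / 127.0, torch.zeros((), dtype=torch.int8)


def attach_observer(module: Module, base_name: str) -> None:
    # analogous to initialize_observers: the observer lives as a named submodule
    module.register_module(f"{base_name}_observer", ToyObserver())


def attach_scale_zero_point(module: Module, base_name: str, dynamic: bool = False) -> None:
    # analogous to _initialize_scale_zero_point: dynamic schemes keep no stored params
    if dynamic:
        return
    module.register_parameter(f"{base_name}_scale", Parameter(torch.ones(1), requires_grad=False))
    module.register_parameter(f"{base_name}_zero_point", Parameter(torch.zeros(1), requires_grad=False))


layer = Linear(16, 8)
attach_observer(layer, "weight")
attach_scale_zero_point(layer, "weight")
print(list(dict(layer.named_parameters()).keys()))
```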
compressed_tensors/quantization/observers/mse.py

@@ -70,9 +70,9 @@ class MovingAverageMSEObserver(Observer):
         absolute_min_val = torch.amin(observed, dim=reduce_dims, keepdims=True)
         absolute_max_val = torch.amax(observed, dim=reduce_dims, keepdims=True)

-        best = torch.
-        min_val = torch.
-        max_val = torch.
+        best = torch.full_like(absolute_min_val, torch.finfo(absolute_min_val.dtype).max)
+        min_val = torch.ones_like(absolute_min_val)
+        max_val = torch.zeros_like(absolute_max_val)
         for i in range(int(self.maxshrink * self.grid)):
             p = 1 - i / self.grid
             shrinked_min_val = p * absolute_min_val
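The new lines seed the MSE search: `best` starts at the dtype's maximum so any candidate error beats it, and the loop then shrinks the observed range by a factor p = 1 - i/grid, keeping the min/max pair with the lowest quantization error. A condensed, self-contained sketch of that shrink-and-score loop (plain tensors, simple round-to-nearest quantization; parameter names are illustrative):

```python
import torch


def mse_range_search(observed: torch.Tensor, grid: int = 100, maxshrink: float = 0.8, num_bits: int = 8):
    """Search over shrunken [min, max] ranges and keep the pair with the lowest MSE."""
    qmin, qmax = 0, 2**num_bits - 1
    absolute_min_val, absolute_max_val = observed.amin(), observed.amax()

    best = torch.full((), torch.finfo(observed.dtype).max)  # start "infinitely" bad
    min_val, max_val = absolute_min_val, absolute_max_val

    for i in range(int(maxshrink * grid)):
        p = 1 - i / grid  # shrink factor, as in the observer loop above
        shrinked_min_val = p * absolute_min_val
        shrinked_max_val = p * absolute_max_val

        # fake-quantize with the candidate range and score the reconstruction error
        scale = ((shrinked_max_val - shrinked_min_val) / (qmax - qmin)).clamp(min=1e-8)
        zero_point = torch.round(qmin - shrinked_min_val / scale)
        q = torch.clamp(torch.round(observed / scale + zero_point), qmin, qmax)
        err = ((q - zero_point) * scale - observed).pow(2).mean()

        if err < best:
            best, min_val, max_val = err, shrinked_min_val, shrinked_max_val

    return min_val, max_val


lo, hi = mse_range_search(torch.randn(1024))
print(lo.item(), hi.item())
```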
{compressed_tensors_nightly-0.7.1.20241017.dist-info → compressed_tensors_nightly-0.7.1.20241020.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: compressed-tensors-nightly
-Version: 0.7.1.20241017
+Version: 0.7.1.20241020
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
{compressed_tensors_nightly-0.7.1.20241017.dist-info → compressed_tensors_nightly-0.7.1.20241020.dist-info}/RECORD

@@ -23,23 +23,23 @@ compressed_tensors/config/sparse_bitmask.py,sha256=pZUboRNZTu6NajGOQEFExoPknak5y
 compressed_tensors/linear/__init__.py,sha256=fH6rjBYAxuwrTzBTlTjTgCYNyh6TCvCqajCz4Im4YrA,617
 compressed_tensors/linear/compressed_linear.py,sha256=0jTTf6XxOAjAYs3tvFtgiNMAO4W10sSeR-pdH2M413g,3218
 compressed_tensors/quantization/__init__.py,sha256=nWP_fsl6Nn0ksEgZPzerGiETdvF-ZfNwPnwGlRiR5pY,805
-compressed_tensors/quantization/cache.py,sha256=
+compressed_tensors/quantization/cache.py,sha256=Sf_9Nfe3RpX04V84iUJMgLN9pWNMFIYvZW02LXcPUQw,6830
 compressed_tensors/quantization/quant_args.py,sha256=k7NuZn8OqjgzmAVaN2-jHPQ1bgDkMuUoLJtLnhkvIOI,9085
 compressed_tensors/quantization/quant_config.py,sha256=NCiMvUMnnz5kTyAkDylxjtEGQnjgsIYIeNR2zyHEdTQ,10371
 compressed_tensors/quantization/quant_scheme.py,sha256=5ggPz5sqEfTUgvJJeiPIINA74QtO-08hb3szsm7UHGE,6000
 compressed_tensors/quantization/lifecycle/__init__.py,sha256=MXE2E7GfIfRRfhrdGy2Og3AZOz5N59B0ZGFcsD89y6c,821
 compressed_tensors/quantization/lifecycle/apply.py,sha256=czaayvpeUYyWRJhO_klffw6esptOgA9sBKL5TWQcRdw,15805
-compressed_tensors/quantization/lifecycle/calibration.py,sha256=
+compressed_tensors/quantization/lifecycle/calibration.py,sha256=gPSD3kiH4VuU6nq-OLbOmhBGaMXsebEwLm4PkEnUhf0,3043
 compressed_tensors/quantization/lifecycle/compressed.py,sha256=Fj9n66IN0EWsOAkBHg3O0GlOQpxstqjCcs0ttzMXrJ0,2296
-compressed_tensors/quantization/lifecycle/forward.py,sha256=
-compressed_tensors/quantization/lifecycle/frozen.py,sha256=
+compressed_tensors/quantization/lifecycle/forward.py,sha256=8GjOnx4rwOZZqSDTdnejNOY2DVTjNDzH0DfY_rQam6k,16575
+compressed_tensors/quantization/lifecycle/frozen.py,sha256=8myzxsz5h5Odh5cIB2lDHb7xLRYBYnAhA1PO8YGuCtM,1839
 compressed_tensors/quantization/lifecycle/helpers.py,sha256=C0mhy2vJ0fCjVeN4kFNhw8Eq1wkteBGHiZ36RVLThRY,944
-compressed_tensors/quantization/lifecycle/initialize.py,sha256=
+compressed_tensors/quantization/lifecycle/initialize.py,sha256=lKoFy18PjbSklyum7f4hoLuWtHShBKax7JDTBzPlCqM,8839
 compressed_tensors/quantization/observers/__init__.py,sha256=DYrttzq-8MHLZUzpX-xzzm4hrw6HcXkMkux82KBKb1M,738
 compressed_tensors/quantization/observers/base.py,sha256=5ovQicWPYHjIxr6-EkQ4lgOX0PpI9g23iSzKpxjM1Zg,8420
 compressed_tensors/quantization/observers/helpers.py,sha256=o9hg4E9b5cCb5PaEAj6jHiUWkNrKtYtv0b1pGg-T9B4,5516
 compressed_tensors/quantization/observers/min_max.py,sha256=sQXqU3z-voxIDfR_9mQzwQUflZj2sASm_G8CYaXntFw,3865
-compressed_tensors/quantization/observers/mse.py,sha256=
+compressed_tensors/quantization/observers/mse.py,sha256=9JRbvXo0VKLrgsTNuVlQ7AV87wwjRUuQludG0v7IJbI,6058
 compressed_tensors/quantization/utils/__init__.py,sha256=VdtEmP0bvuND_IGQnyqUPc5lnFp-1_yD7StKSX4x80w,656
 compressed_tensors/quantization/utils/helpers.py,sha256=y4LEyC2oUd876ZMdALWKGH3Ct5EgBJZV4id_NUjTGH8,9531
 compressed_tensors/registry/__init__.py,sha256=FwLSNYqfIrb5JD_6OK_MT4_svvKTN_nEhpgQlQvGbjI,658
@@ -51,8 +51,8 @@ compressed_tensors/utils/permutations_24.py,sha256=kx6fsfDHebx94zsSzhXGyCyuC9sVy
 compressed_tensors/utils/permute.py,sha256=V6tJLKo3Syccj-viv4F7ZKZgJeCB-hl-dK8RKI_kBwI,2355
 compressed_tensors/utils/safetensors_load.py,sha256=m08ANVuTBxQdoa6LufDgcNJ7wCLDJolyZljB8VEybAU,8578
 compressed_tensors/utils/semi_structured_conversions.py,sha256=XKNffPum54kPASgqKzgKvyeqWPAkair2XEQXjkp7ho8,13489
-compressed_tensors_nightly-0.7.1.
-compressed_tensors_nightly-0.7.1.
-compressed_tensors_nightly-0.7.1.
-compressed_tensors_nightly-0.7.1.
-compressed_tensors_nightly-0.7.1.
+compressed_tensors_nightly-0.7.1.20241020.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+compressed_tensors_nightly-0.7.1.20241020.dist-info/METADATA,sha256=Oir-JDg1u_Tfz5K2YbciN27lTbthuV4fU0UmsvDRu9M,6799
+compressed_tensors_nightly-0.7.1.20241020.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
+compressed_tensors_nightly-0.7.1.20241020.dist-info/top_level.txt,sha256=w2i-GyPs2s1UwVxvutSvN_lM22SXC2hQFBmoMcPnV7Y,19
+compressed_tensors_nightly-0.7.1.20241020.dist-info/RECORD,,