compressed-tensors 0.9.2__py3-none-any.whl → 0.9.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- compressed_tensors/compressors/model_compressors/model_compressor.py +12 -3
- compressed_tensors/compressors/quantized_compressors/base.py +31 -2
- compressed_tensors/linear/compressed_linear.py +25 -6
- compressed_tensors/quantization/lifecycle/initialize.py +2 -3
- compressed_tensors/version.py +1 -1
- {compressed_tensors-0.9.2.dist-info → compressed_tensors-0.9.3.dist-info}/METADATA +3 -2
- {compressed_tensors-0.9.2.dist-info → compressed_tensors-0.9.3.dist-info}/RECORD +10 -10
- {compressed_tensors-0.9.2.dist-info → compressed_tensors-0.9.3.dist-info}/WHEEL +1 -1
- {compressed_tensors-0.9.2.dist-info → compressed_tensors-0.9.3.dist-info/licenses}/LICENSE +0 -0
- {compressed_tensors-0.9.2.dist-info → compressed_tensors-0.9.3.dist-info}/top_level.txt +0 -0
@@ -19,7 +19,7 @@ import os
|
|
19
19
|
import re
|
20
20
|
from contextlib import contextmanager
|
21
21
|
from copy import deepcopy
|
22
|
-
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, TypeVar, Union
|
22
|
+
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, TypeVar, Union
|
23
23
|
|
24
24
|
import compressed_tensors
|
25
25
|
import torch
|
@@ -522,10 +522,13 @@ class ModelCompressor:
|
|
522
522
|
update_parameter_data(module, data, param_name)
|
523
523
|
|
524
524
|
|
525
|
-
def map_modules_to_quant_args(model: Module) -> Dict[str, QuantizationArgs]:
|
525
|
+
def map_modules_to_quant_args(
|
526
|
+
model: Module,
|
527
|
+
) -> Dict[str, Union[QuantizationArgs, Tuple[QuantizationArgs, QuantizationArgs]]]:
|
526
528
|
"""
|
527
529
|
Given a pytorch model, map out the submodule name (usually linear layers)
|
528
|
-
|
530
|
+
to the weight QuantizationArgs. If running input activation quantization, will also
|
531
|
+
map to the input QuantizationArgs in a tuple.
|
529
532
|
|
530
533
|
:param model: pytorch model
|
531
534
|
"""
|
@@ -535,6 +538,12 @@ def map_modules_to_quant_args(model: Module) -> Dict[str, QuantizationArgs]:
|
|
535
538
|
if submodule.quantization_scheme.weights is not None:
|
536
539
|
name = fix_fsdp_module_name(name)
|
537
540
|
quantized_modules_to_args[name] = submodule.quantization_scheme.weights
|
541
|
+
if submodule.quantization_scheme.input_activations is not None:
|
542
|
+
weight_args = quantized_modules_to_args.get(name)
|
543
|
+
quantized_modules_to_args[name] = (
|
544
|
+
weight_args,
|
545
|
+
submodule.quantization_scheme.input_activations,
|
546
|
+
)
|
538
547
|
|
539
548
|
return quantized_modules_to_args
|
540
549
|
|
@@ -82,11 +82,32 @@ class BaseQuantizationCompressor(BaseCompressor):
|
|
82
82
|
"""
|
83
83
|
compressed_dict = {}
|
84
84
|
weight_suffix = ".weight"
|
85
|
+
input_zp_suffix = ".input_zero_point"
|
86
|
+
weight_zp_suffix = ".weight_zero_point"
|
85
87
|
_LOGGER.debug(
|
86
88
|
f"Compressing model with {len(model_state)} parameterized layers..."
|
87
89
|
)
|
88
90
|
|
89
91
|
for name, value in tqdm(model_state.items(), desc="Quantized Compression"):
|
92
|
+
# check if the parameter we're compressing is the weight zp
|
93
|
+
# or the input zp
|
94
|
+
is_weight_zp = name.endswith(weight_zp_suffix)
|
95
|
+
is_input_zp = name.endswith(input_zp_suffix)
|
96
|
+
|
97
|
+
# if we're saving the weight zp, fetch weight quant args
|
98
|
+
if is_weight_zp:
|
99
|
+
quant_args_zp = names_to_scheme.get(name[: -(len(weight_zp_suffix))])
|
100
|
+
if isinstance(quant_args_zp, tuple):
|
101
|
+
# If tuple, first value is weight args, second is input args
|
102
|
+
quant_args_zp = quant_args_zp[0]
|
103
|
+
|
104
|
+
# if we're saving the input zp, fetch input quant args
|
105
|
+
if is_input_zp:
|
106
|
+
input_args_zp = names_to_scheme.get(name[: -(len(input_zp_suffix))])
|
107
|
+
if isinstance(input_args_zp, tuple):
|
108
|
+
# If tuple, first value is weight args, second is input args
|
109
|
+
input_args_zp = input_args_zp[-1]
|
110
|
+
|
90
111
|
if name.endswith(weight_suffix):
|
91
112
|
prefix = name[: -(len(weight_suffix))]
|
92
113
|
scale = model_state.get(merge_names(prefix, "weight_scale"), None)
|
@@ -94,7 +115,11 @@ class BaseQuantizationCompressor(BaseCompressor):
|
|
94
115
|
g_idx = model_state.get(merge_names(prefix, "weight_g_idx"), None)
|
95
116
|
if scale is not None:
|
96
117
|
# weight is quantized, compress it
|
97
|
-
|
118
|
+
if isinstance(names_to_scheme[prefix], tuple):
|
119
|
+
quant_args = names_to_scheme[prefix][0]
|
120
|
+
else:
|
121
|
+
quant_args = names_to_scheme[prefix]
|
122
|
+
|
98
123
|
compressed_data = self.compress_weight(
|
99
124
|
weight=value,
|
100
125
|
scale=scale,
|
@@ -107,7 +132,11 @@ class BaseQuantizationCompressor(BaseCompressor):
|
|
107
132
|
compressed_dict[merge_names(prefix, key)] = value
|
108
133
|
else:
|
109
134
|
compressed_dict[name] = value.to("cpu")
|
110
|
-
|
135
|
+
# only save if asym
|
136
|
+
elif is_weight_zp and quant_args_zp.symmetric:
|
137
|
+
continue
|
138
|
+
# only save if asym
|
139
|
+
elif is_input_zp and input_args_zp.symmetric:
|
111
140
|
continue
|
112
141
|
elif name.endswith("g_idx") and torch.any(value <= -1):
|
113
142
|
continue
|
@@ -12,6 +12,7 @@
|
|
12
12
|
# See the License for the specific language governing permissions and
|
13
13
|
# limitations under the License.
|
14
14
|
|
15
|
+
import warnings
|
15
16
|
from typing import Dict, Tuple
|
16
17
|
|
17
18
|
import torch
|
@@ -21,6 +22,7 @@ from compressed_tensors.quantization import (
|
|
21
22
|
QuantizationStatus,
|
22
23
|
initialize_module_for_quantization,
|
23
24
|
)
|
25
|
+
from compressed_tensors.utils import register_offload_parameter
|
24
26
|
from torch import Tensor
|
25
27
|
from torch.nn import Parameter
|
26
28
|
from torch.nn.functional import linear
|
@@ -32,11 +34,16 @@ class CompressedLinear(Linear):
|
|
32
34
|
Wrapper module for running a compressed forward pass of a quantized Linear module.
|
33
35
|
The wrapped layer will decompressed on each forward call.
|
34
36
|
|
35
|
-
:param module: dense linear module to replace
|
36
|
-
:param quantization_scheme: quantization config for the module to wrap
|
37
|
-
:param quantization_format: compression format module is stored as
|
38
37
|
"""
|
39
38
|
|
39
|
+
def __init__(self, *args, **kwargs) -> None:
|
40
|
+
super().__init__(*args, **kwargs)
|
41
|
+
warnings.warn(
|
42
|
+
"CompressedLinear should not be initialized directly. "
|
43
|
+
"Use the from_linear method instead.",
|
44
|
+
UserWarning,
|
45
|
+
)
|
46
|
+
|
40
47
|
@classmethod
|
41
48
|
@torch.no_grad()
|
42
49
|
def from_linear(
|
@@ -45,6 +52,12 @@ class CompressedLinear(Linear):
|
|
45
52
|
quantization_scheme: QuantizationScheme,
|
46
53
|
quantization_format: str,
|
47
54
|
):
|
55
|
+
"""
|
56
|
+
:param module: dense linear module to replace
|
57
|
+
:param quantization_scheme: quantization config for the module to wrap
|
58
|
+
:param quantization_format: compression format module is stored as
|
59
|
+
:return: CompressedLinear module wrapping the input module
|
60
|
+
"""
|
48
61
|
module.__class__ = CompressedLinear
|
49
62
|
module.compressor = BaseCompressor.load_from_registry(quantization_format)
|
50
63
|
device = next(module.parameters()).device
|
@@ -68,7 +81,7 @@ class CompressedLinear(Linear):
|
|
68
81
|
param = Parameter(
|
69
82
|
torch.empty(shape, device=device, dtype=dtype), requires_grad=False
|
70
83
|
)
|
71
|
-
module.register_parameter(name, param)
|
84
|
+
register_offload_parameter(module, name, param)
|
72
85
|
|
73
86
|
# mark module as compressed
|
74
87
|
module.quantization_status = QuantizationStatus.COMPRESSED
|
@@ -85,5 +98,11 @@ class CompressedLinear(Linear):
|
|
85
98
|
"""
|
86
99
|
Decompresses the weight, then runs the wrapped forward pass
|
87
100
|
"""
|
88
|
-
|
89
|
-
|
101
|
+
if self.quantization_status == QuantizationStatus.COMPRESSED:
|
102
|
+
weight_data = self.compressor.decompress_module(self)
|
103
|
+
param = Parameter(weight_data, requires_grad=False)
|
104
|
+
register_offload_parameter(self, "weight", param)
|
105
|
+
|
106
|
+
self.quantization_status = QuantizationStatus.FROZEN
|
107
|
+
|
108
|
+
return linear(input, self.weight, self.bias)
|
@@ -203,11 +203,10 @@ def _initialize_attn_scales(module: Module) -> None:
|
|
203
203
|
torch.empty(expected_shape, dtype=scale_dtype, device=device),
|
204
204
|
requires_grad=False,
|
205
205
|
)
|
206
|
-
|
207
|
-
module.register_parameter(KVCacheScaleType.KEY.value, init_scale)
|
206
|
+
register_offload_parameter(module, KVCacheScaleType.KEY.value, init_scale)
|
208
207
|
|
209
208
|
init_scale = Parameter(
|
210
209
|
torch.empty(expected_shape, dtype=scale_dtype, device=device),
|
211
210
|
requires_grad=False,
|
212
211
|
)
|
213
|
-
module.register_parameter(KVCacheScaleType.VALUE.value, init_scale)
|
212
|
+
register_offload_parameter(module, KVCacheScaleType.VALUE.value, init_scale)
|
compressed_tensors/version.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.4
|
2
2
|
Name: compressed-tensors
|
3
|
-
Version: 0.9.2
|
3
|
+
Version: 0.9.3
|
4
4
|
Summary: Library for utilization of compressed safetensors of neural network models
|
5
5
|
Home-page: https://github.com/neuralmagic/compressed-tensors
|
6
6
|
Author: Neuralmagic, Inc.
|
@@ -26,6 +26,7 @@ Dynamic: description
|
|
26
26
|
Dynamic: description-content-type
|
27
27
|
Dynamic: home-page
|
28
28
|
Dynamic: license
|
29
|
+
Dynamic: license-file
|
29
30
|
Dynamic: provides-extra
|
30
31
|
Dynamic: requires-dist
|
31
32
|
Dynamic: summary
|
@@ -1,13 +1,13 @@
|
|
1
1
|
compressed_tensors/__init__.py,sha256=UtKmifNeBCSE2TZSAfduVNNzHY-3V7bLjZ7n7RuXLOE,812
|
2
2
|
compressed_tensors/base.py,sha256=73HYH7HY7O2roC89yG_piPFnZwrBfn_i7HmKl90SKc0,875
|
3
|
-
compressed_tensors/version.py,sha256=
|
3
|
+
compressed_tensors/version.py,sha256=X4y5lqlF1QFUgl25iumzagpg3dzyVoLP6i82HZEhCJA,1585
|
4
4
|
compressed_tensors/compressors/__init__.py,sha256=smSygTSfcfuujRrAXDc6uZm4L_ccV1tWZewqVnOb4lM,825
|
5
5
|
compressed_tensors/compressors/base.py,sha256=x8dQrWVEurynXw03yHJZTaAmrRTOsdZJoHjmvs0IKwk,7002
|
6
6
|
compressed_tensors/compressors/helpers.py,sha256=OK6qxX9j3bHwF9JfIYSGMgBJe2PWjlTA3byXKCJaTIQ,5431
|
7
7
|
compressed_tensors/compressors/model_compressors/__init__.py,sha256=5RGGPFu4YqEt_aOdFSQYFYFDjcZFJN0CsMqRtDZz3Js,666
|
8
|
-
compressed_tensors/compressors/model_compressors/model_compressor.py,sha256=
|
8
|
+
compressed_tensors/compressors/model_compressors/model_compressor.py,sha256=n0gcrKwefJuO6b4LNjCynJQf7NNqNHDcoLlzZgTCPGc,23080
|
9
9
|
compressed_tensors/compressors/quantized_compressors/__init__.py,sha256=09UJq68Pht6Bf-4iP9xYl3tetKsncNPHD8IAGbePsr4,714
|
10
|
-
compressed_tensors/compressors/quantized_compressors/base.py,sha256=
|
10
|
+
compressed_tensors/compressors/quantized_compressors/base.py,sha256=GXTSWgFAhksbno94Ulpth9-YM4a7NsDlx4oQGGB0swQ,8567
|
11
11
|
compressed_tensors/compressors/quantized_compressors/naive_quantized.py,sha256=fd0KlkSx6bvZ3xwIkK3jEUdPSUPs56Eua4dEDOtzKW0,5150
|
12
12
|
compressed_tensors/compressors/quantized_compressors/pack_quantized.py,sha256=zH2PocRe_T5yt1-3kLdZH9AUQWQyaVOi4U9nEJiYaWA,8509
|
13
13
|
compressed_tensors/compressors/sparse_compressors/__init__.py,sha256=Atuz-OdEgn8OCUhx7Ovd6gXdyImAI186uCR-uR0t_Nk,737
|
@@ -23,7 +23,7 @@ compressed_tensors/config/dense.py,sha256=NgSxnFCnckU9-iunxEaqiFwqgdO7YYxlWKR74j
|
|
23
23
|
compressed_tensors/config/sparse_24_bitmask.py,sha256=Lhj39zT2V1hxftprvxvneyhv45ShlXOKd75DBbDTyTE,1401
|
24
24
|
compressed_tensors/config/sparse_bitmask.py,sha256=pZUboRNZTu6NajGOQEFExoPknak5ynVAUeiiYpS1Gt8,1308
|
25
25
|
compressed_tensors/linear/__init__.py,sha256=fH6rjBYAxuwrTzBTlTjTgCYNyh6TCvCqajCz4Im4YrA,617
|
26
|
-
compressed_tensors/linear/compressed_linear.py,sha256=
|
26
|
+
compressed_tensors/linear/compressed_linear.py,sha256=_m6XpNcI53eeSHO8VdiuAM6UBTdpDhn5Ivd8iRMwEKc,3980
|
27
27
|
compressed_tensors/quantization/__init__.py,sha256=83J5bPB7PavN2TfCoW7_vEDhfYpm4TDrqYO9vdSQ5bk,760
|
28
28
|
compressed_tensors/quantization/quant_args.py,sha256=sKpb8DcNObidjXjNol1Tn_Iih3ZXBycSp-fyz68TGhY,9117
|
29
29
|
compressed_tensors/quantization/quant_config.py,sha256=vx06wBo91p4LCb3Vzd-2eCTUeIf_Sz2ZXRP263eQyjQ,10385
|
@@ -33,7 +33,7 @@ compressed_tensors/quantization/lifecycle/apply.py,sha256=lZmCCSm1_o79iUAy460w6B
|
|
33
33
|
compressed_tensors/quantization/lifecycle/compressed.py,sha256=Fj9n66IN0EWsOAkBHg3O0GlOQpxstqjCcs0ttzMXrJ0,2296
|
34
34
|
compressed_tensors/quantization/lifecycle/forward.py,sha256=DOWouUqfaLA4Qhg-ojVVBdhhSAlgZqFC26vZARxE0ko,12961
|
35
35
|
compressed_tensors/quantization/lifecycle/helpers.py,sha256=C0mhy2vJ0fCjVeN4kFNhw8Eq1wkteBGHiZ36RVLThRY,944
|
36
|
-
compressed_tensors/quantization/lifecycle/initialize.py,sha256=
|
36
|
+
compressed_tensors/quantization/lifecycle/initialize.py,sha256=sK3PLm69N91QepBuq-83Qd2Br6XcOmRDpD5qo_WWNJo,7469
|
37
37
|
compressed_tensors/quantization/utils/__init__.py,sha256=VdtEmP0bvuND_IGQnyqUPc5lnFp-1_yD7StKSX4x80w,656
|
38
38
|
compressed_tensors/quantization/utils/helpers.py,sha256=DBP-sGRpGAY01K0LFE7qqonNj4hkTYL_mXrMs2LtAD8,14100
|
39
39
|
compressed_tensors/registry/__init__.py,sha256=FwLSNYqfIrb5JD_6OK_MT4_svvKTN_nEhpgQlQvGbjI,658
|
@@ -45,8 +45,8 @@ compressed_tensors/utils/permutations_24.py,sha256=kx6fsfDHebx94zsSzhXGyCyuC9sVy
|
|
45
45
|
compressed_tensors/utils/permute.py,sha256=V6tJLKo3Syccj-viv4F7ZKZgJeCB-hl-dK8RKI_kBwI,2355
|
46
46
|
compressed_tensors/utils/safetensors_load.py,sha256=5SeM2hzLh77Ne8Vk7qR6-km7cf8bhov41ExpWITqX3A,11470
|
47
47
|
compressed_tensors/utils/semi_structured_conversions.py,sha256=XKNffPum54kPASgqKzgKvyeqWPAkair2XEQXjkp7ho8,13489
|
48
|
-
compressed_tensors-0.9.
|
49
|
-
compressed_tensors-0.9.
|
50
|
-
compressed_tensors-0.9.
|
51
|
-
compressed_tensors-0.9.
|
52
|
-
compressed_tensors-0.9.
|
48
|
+
compressed_tensors-0.9.3.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
49
|
+
compressed_tensors-0.9.3.dist-info/METADATA,sha256=zs3aFaG-BGV9hqJbW9Zwzex0TVcM5sPZhiaeVx2qjR0,6997
|
50
|
+
compressed_tensors-0.9.3.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
51
|
+
compressed_tensors-0.9.3.dist-info/top_level.txt,sha256=w2i-GyPs2s1UwVxvutSvN_lM22SXC2hQFBmoMcPnV7Y,19
|
52
|
+
compressed_tensors-0.9.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|