compressed-tensors-nightly 0.5.0.20240902__py3-none-any.whl → 0.5.0.20240903__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- compressed_tensors/compressors/base.py +2 -0
- compressed_tensors/compressors/model_compressor.py +3 -0
- compressed_tensors/quantization/lifecycle/forward.py +10 -1
- compressed_tensors/quantization/lifecycle/initialize.py +3 -2
- compressed_tensors/quantization/quant_args.py +54 -19
- compressed_tensors/quantization/quant_scheme.py +3 -0
- compressed_tensors/quantization/utils/helpers.py +1 -1
- {compressed_tensors_nightly-0.5.0.20240902.dist-info → compressed_tensors_nightly-0.5.0.20240903.dist-info}/METADATA +1 -1
- {compressed_tensors_nightly-0.5.0.20240902.dist-info → compressed_tensors_nightly-0.5.0.20240903.dist-info}/RECORD +12 -12
- {compressed_tensors_nightly-0.5.0.20240902.dist-info → compressed_tensors_nightly-0.5.0.20240903.dist-info}/LICENSE +0 -0
- {compressed_tensors_nightly-0.5.0.20240902.dist-info → compressed_tensors_nightly-0.5.0.20240903.dist-info}/WHEEL +0 -0
- {compressed_tensors_nightly-0.5.0.20240902.dist-info → compressed_tensors_nightly-0.5.0.20240903.dist-info}/top_level.txt +0 -0
@@ -108,6 +108,7 @@ class Compressor(RegistryMixin):
|
|
108
108
|
prefix = name[: -(len(weight_suffix))]
|
109
109
|
scale = model_state.get(merge_names(prefix, "weight_scale"), None)
|
110
110
|
zp = model_state.get(merge_names(prefix, "weight_zero_point"), None)
|
111
|
+
g_idx = model_state.get(merge_names(prefix, "weight_g_idx"), None)
|
111
112
|
if scale is not None:
|
112
113
|
# weight is quantized, compress it
|
113
114
|
quant_args = names_to_scheme[prefix]
|
@@ -115,6 +116,7 @@ class Compressor(RegistryMixin):
|
|
115
116
|
weight=value,
|
116
117
|
scale=scale,
|
117
118
|
zero_point=zp,
|
119
|
+
g_idx=g_idx,
|
118
120
|
quantization_args=quant_args,
|
119
121
|
device="cpu",
|
120
122
|
)
|
@@ -271,6 +271,9 @@ class ModelCompressor:
|
|
271
271
|
v_proj_has_quant_output = 0
|
272
272
|
for name, module in model.named_modules():
|
273
273
|
if not hasattr(module, "quantization_scheme"):
|
274
|
+
# We still want to count non-quantized q_proj
|
275
|
+
if name.endswith(".q_proj"):
|
276
|
+
q_proj_has_no_quant_output += 1
|
274
277
|
continue
|
275
278
|
out_act = module.quantization_scheme.output_activations
|
276
279
|
if name.endswith(".q_proj") and out_act is None:
|
@@ -348,7 +348,16 @@ def maybe_calibrate_or_quantize(
|
|
348
348
|
update_parameter_data(module, updated_scale, f"{base_name}_scale")
|
349
349
|
update_parameter_data(module, updated_zero_point, f"{base_name}_zero_point")
|
350
350
|
|
351
|
-
|
351
|
+
scale = updated_scale
|
352
|
+
zero_point = updated_zero_point
|
353
|
+
|
354
|
+
return fake_quantize(
|
355
|
+
x=value,
|
356
|
+
scale=scale,
|
357
|
+
zero_point=zero_point,
|
358
|
+
args=args,
|
359
|
+
g_idx=g_idx,
|
360
|
+
)
|
352
361
|
|
353
362
|
|
354
363
|
@torch.no_grad()
|
@@ -21,6 +21,7 @@ from compressed_tensors.quantization.lifecycle.forward import (
|
|
21
21
|
wrap_module_forward_quantized,
|
22
22
|
)
|
23
23
|
from compressed_tensors.quantization.quant_args import (
|
24
|
+
ActivationOrdering,
|
24
25
|
QuantizationArgs,
|
25
26
|
QuantizationStrategy,
|
26
27
|
)
|
@@ -179,8 +180,8 @@ def _initialize_scale_zero_point_observer(
|
|
179
180
|
)
|
180
181
|
module.register_parameter(f"{base_name}_zero_point", init_zero_point)
|
181
182
|
|
182
|
-
#
|
183
|
-
if quantization_args.actorder:
|
183
|
+
# only grouped activation ordering has g_idx
|
184
|
+
if quantization_args.actorder == ActivationOrdering.GROUP:
|
184
185
|
g_idx_shape = (weight_shape[1],)
|
185
186
|
g_idx_dtype = torch.int
|
186
187
|
init_g_idx = Parameter(
|
@@ -13,7 +13,7 @@
|
|
13
13
|
# limitations under the License.
|
14
14
|
|
15
15
|
from enum import Enum
|
16
|
-
from typing import Any, Dict, Optional
|
16
|
+
from typing import Any, Dict, Optional, Union
|
17
17
|
|
18
18
|
import torch
|
19
19
|
from pydantic import BaseModel, Field, field_validator, model_validator
|
@@ -25,6 +25,7 @@ __all__ = [
|
|
25
25
|
"QuantizationStrategy",
|
26
26
|
"QuantizationArgs",
|
27
27
|
"round_to_quantized_type",
|
28
|
+
"ActivationOrdering",
|
28
29
|
]
|
29
30
|
|
30
31
|
FP8_DTYPE = torch.float8_e4m3fn
|
@@ -51,6 +52,19 @@ class QuantizationStrategy(str, Enum):
|
|
51
52
|
TOKEN = "token"
|
52
53
|
|
53
54
|
|
55
|
+
class ActivationOrdering(str, Enum):
|
56
|
+
"""
|
57
|
+
Enum storing strategies for activation ordering
|
58
|
+
|
59
|
+
Group: reorder groups and weight\n
|
60
|
+
Weight: only reorder weight, not groups. Slightly lower latency and
|
61
|
+
accuracy compared to group actorder\n
|
62
|
+
"""
|
63
|
+
|
64
|
+
GROUP = "group"
|
65
|
+
WEIGHT = "weight"
|
66
|
+
|
67
|
+
|
54
68
|
class QuantizationArgs(BaseModel, use_enum_values=True):
|
55
69
|
"""
|
56
70
|
User facing arguments used to define a quantization config for weights or
|
@@ -69,17 +83,17 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
|
|
69
83
|
quantization. Note that enabling dynamic quantization will change the default
|
70
84
|
observer to a memoryless one
|
71
85
|
:param actorder: whether to apply group quantization in decreasing order of
|
72
|
-
activation. Defaults to
|
86
|
+
activation. Defaults to None for arbitrary ordering
|
73
87
|
"""
|
74
88
|
|
75
89
|
num_bits: int = 8
|
76
|
-
type: QuantizationType = QuantizationType.INT
|
90
|
+
type: QuantizationType = QuantizationType.INT
|
77
91
|
symmetric: bool = True
|
78
92
|
group_size: Optional[int] = None
|
79
93
|
strategy: Optional[QuantizationStrategy] = None
|
80
94
|
block_structure: Optional[str] = None
|
81
95
|
dynamic: bool = False
|
82
|
-
actorder:
|
96
|
+
actorder: Optional[ActivationOrdering] = None
|
83
97
|
observer: str = Field(
|
84
98
|
default="minmax",
|
85
99
|
description=(
|
@@ -108,8 +122,15 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
|
|
108
122
|
|
109
123
|
return Observer.load_from_registry(self.observer, quantization_args=self)
|
110
124
|
|
125
|
+
@field_validator("type", mode="before")
|
126
|
+
def validate_type(cls, value) -> QuantizationType:
|
127
|
+
if isinstance(value, str):
|
128
|
+
return QuantizationType(value.lower())
|
129
|
+
|
130
|
+
return value
|
131
|
+
|
111
132
|
@field_validator("group_size", mode="before")
|
112
|
-
def validate_group(cls, value) -> int:
|
133
|
+
def validate_group(cls, value) -> Union[int, None]:
|
113
134
|
if value is None:
|
114
135
|
return value
|
115
136
|
|
@@ -121,18 +142,29 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
|
|
121
142
|
|
122
143
|
return value
|
123
144
|
|
124
|
-
@
|
125
|
-
def validate_strategy(
|
126
|
-
|
127
|
-
|
128
|
-
group_size = values.get("group_size", model_fields["group_size"].default)
|
129
|
-
actorder = values.get("actorder", model_fields["actorder"].default)
|
145
|
+
@field_validator("strategy", mode="before")
|
146
|
+
def validate_strategy(cls, value) -> Union[QuantizationStrategy, None]:
|
147
|
+
if isinstance(value, str):
|
148
|
+
return QuantizationStrategy(value.lower())
|
130
149
|
|
131
|
-
|
132
|
-
strategy = QuantizationStrategy(strategy.lower())
|
150
|
+
return value
|
133
151
|
|
134
|
-
|
135
|
-
|
152
|
+
@field_validator("actorder", mode="before")
|
153
|
+
def validate_actorder(cls, value) -> Optional[ActivationOrdering]:
|
154
|
+
if isinstance(value, str):
|
155
|
+
return ActivationOrdering(value.lower())
|
156
|
+
|
157
|
+
return value
|
158
|
+
|
159
|
+
@model_validator(mode="after")
|
160
|
+
def validate_model_after(model: "QuantizationArgs") -> Dict[str, Any]:
|
161
|
+
# extract user-passed values from dictionary
|
162
|
+
strategy = model.strategy
|
163
|
+
group_size = model.group_size
|
164
|
+
actorder = model.actorder
|
165
|
+
|
166
|
+
# infer strategy
|
167
|
+
if strategy is None:
|
136
168
|
if group_size is None:
|
137
169
|
strategy = QuantizationStrategy.TENSOR
|
138
170
|
elif group_size > 0:
|
@@ -145,6 +177,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
|
|
145
177
|
"strategy='group' and group_size = -1 for 'channel'"
|
146
178
|
)
|
147
179
|
|
180
|
+
# validate strategy and group
|
148
181
|
if strategy == QuantizationStrategy.GROUP:
|
149
182
|
if group_size is None or group_size <= 0:
|
150
183
|
raise ValueError(
|
@@ -152,14 +185,16 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
|
|
152
185
|
"set to a positive value"
|
153
186
|
)
|
154
187
|
|
155
|
-
|
188
|
+
# validate activation ordering and strategy
|
189
|
+
if actorder is not None and strategy != QuantizationStrategy.GROUP:
|
156
190
|
raise ValueError(
|
157
|
-
"
|
191
|
+
"Must use group quantization strategy in order to apply "
|
158
192
|
"activation ordering"
|
159
193
|
)
|
160
194
|
|
161
|
-
|
162
|
-
|
195
|
+
# write back modified values
|
196
|
+
model.strategy = strategy
|
197
|
+
return model
|
163
198
|
|
164
199
|
def pytorch_dtype(self) -> torch.dtype:
|
165
200
|
if self.type == QuantizationType.FLOAT:
|
@@ -110,6 +110,7 @@ def is_preset_scheme(name: str) -> bool:
|
|
110
110
|
"""
|
111
111
|
return name.upper() in PRESET_SCHEMES
|
112
112
|
|
113
|
+
UNQUANTIZED = dict()
|
113
114
|
|
114
115
|
# 8 bit integer weights and 8 bit activations quantization
|
115
116
|
W8A8 = dict(
|
@@ -208,6 +209,8 @@ FP8_DYNAMIC = dict(
|
|
208
209
|
)
|
209
210
|
|
210
211
|
PRESET_SCHEMES = {
|
212
|
+
# Unquantized (no-op)
|
213
|
+
"UNQUANTIZED": UNQUANTIZED,
|
211
214
|
# Integer weight only schemes
|
212
215
|
"W8A16": W8A16,
|
213
216
|
"W4A16": W4A16,
|
@@ -181,7 +181,7 @@ def calculate_compression_ratio(model: Module) -> float:
|
|
181
181
|
for parameter in model.parameters():
|
182
182
|
uncompressed_bits = get_torch_bit_depth(parameter)
|
183
183
|
compressed_bits = uncompressed_bits
|
184
|
-
if is_module_quantized(submodule):
|
184
|
+
if is_module_quantized(submodule) and submodule.quantization_scheme.weights:
|
185
185
|
compressed_bits = submodule.quantization_scheme.weights.num_bits
|
186
186
|
|
187
187
|
num_weights = parameter.numel()
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: compressed-tensors-nightly
|
3
|
-
Version: 0.5.0.20240902
|
3
|
+
Version: 0.5.0.20240903
|
4
4
|
Summary: Library for utilization of compressed safetensors of neural network models
|
5
5
|
Home-page: https://github.com/neuralmagic/compressed-tensors
|
6
6
|
Author: Neuralmagic, Inc.
|
@@ -2,11 +2,11 @@ compressed_tensors/__init__.py,sha256=SV1csvHUVCd8kHXz6UDZim1HZ_fAVG3vfk-j_4Bb6h
|
|
2
2
|
compressed_tensors/base.py,sha256=Mq4mfVQcJhNpha-BXzpOfpmFIdl01o09BJE7D2oQ_00,796
|
3
3
|
compressed_tensors/version.py,sha256=DdMT4o5D6_t26gTuvhF1Q9HPeXY6vV5g7XMprWuHLdI,1586
|
4
4
|
compressed_tensors/compressors/__init__.py,sha256=wmX4VnkUTS63xBwK5-6w8FP78bNZpcdcqvf2KOEC5E4,1133
|
5
|
-
compressed_tensors/compressors/base.py,sha256=
|
5
|
+
compressed_tensors/compressors/base.py,sha256=sJB3QhvNHxwBmpoLy_obkJBuIZ2hY__Jd-Mf2-MAty8,9966
|
6
6
|
compressed_tensors/compressors/dense.py,sha256=xcWECjcRY4INN6jC7vHx5wvUX3NmnKlxA9SVE1A6m2Q,1267
|
7
7
|
compressed_tensors/compressors/helpers.py,sha256=k9avlkmeYj6vkOAvl-MgcixtP7ib24SCfhzZ-RusXfw,5403
|
8
8
|
compressed_tensors/compressors/marlin_24.py,sha256=e7fGUyZbjUpA5VUMCPxqcYPGNiwoDKupHJaXWCoVKRw,9410
|
9
|
-
compressed_tensors/compressors/model_compressor.py,sha256=
|
9
|
+
compressed_tensors/compressors/model_compressor.py,sha256=gI6KKtH3eeWi2540Ayx-4bg9o8qjrvxlF4Gd_sqltGA,16678
|
10
10
|
compressed_tensors/compressors/naive_quantized.py,sha256=z3h3ca5xKCN69mahutxcbzdv-OysiaxaM8P-Qum6zUQ,4823
|
11
11
|
compressed_tensors/compressors/pack_quantized.py,sha256=27RVmJ2wg2dvCoawj407HSmKT3VPGJ6ujAMHlT26WlI,7571
|
12
12
|
compressed_tensors/compressors/sparse_bitmask.py,sha256=kiDwBlFV0sJGLcIdDYxIiuF64ccgwDfqq1hWRQThYDc,8647
|
@@ -17,17 +17,17 @@ compressed_tensors/config/sparse_bitmask.py,sha256=pZUboRNZTu6NajGOQEFExoPknak5y
|
|
17
17
|
compressed_tensors/linear/__init__.py,sha256=fH6rjBYAxuwrTzBTlTjTgCYNyh6TCvCqajCz4Im4YrA,617
|
18
18
|
compressed_tensors/linear/compressed_linear.py,sha256=G0gEFfxLAUsgRcnfSV-PKz1ZBNTVokOauOoup7SE1mw,3210
|
19
19
|
compressed_tensors/quantization/__init__.py,sha256=83J5bPB7PavN2TfCoW7_vEDhfYpm4TDrqYO9vdSQ5bk,760
|
20
|
-
compressed_tensors/quantization/quant_args.py,sha256=
|
20
|
+
compressed_tensors/quantization/quant_args.py,sha256=Td71ap7oYxcrjAvRVafQ3hZv3BbmCL50Elyyv7EG0Rw,7733
|
21
21
|
compressed_tensors/quantization/quant_config.py,sha256=NpVu8YJ4Xw2pIQW_PGaNaml8kx1bUnxkvb0jBYWbKdE,9971
|
22
|
-
compressed_tensors/quantization/quant_scheme.py,sha256=
|
22
|
+
compressed_tensors/quantization/quant_scheme.py,sha256=VRvWweqwlhjYMrKf62fXKQTeoJGhjJa3tXnE-TuFdFA,6093
|
23
23
|
compressed_tensors/quantization/lifecycle/__init__.py,sha256=MXE2E7GfIfRRfhrdGy2Og3AZOz5N59B0ZGFcsD89y6c,821
|
24
24
|
compressed_tensors/quantization/lifecycle/apply.py,sha256=uftWFunr_CpCZM_qWfo2O1USXKB2qSYD1pBJsO8BuCU,15285
|
25
25
|
compressed_tensors/quantization/lifecycle/calibration.py,sha256=PlS_EqCOPqJD3QKuLPXO9AOtDzXtQWvEBTynFv-FFVw,2698
|
26
26
|
compressed_tensors/quantization/lifecycle/compressed.py,sha256=laNDwvhk4S925qWTPHCufo4uDdMo24NDV1qhsAkf5Iw,2225
|
27
|
-
compressed_tensors/quantization/lifecycle/forward.py,sha256=
|
27
|
+
compressed_tensors/quantization/lifecycle/forward.py,sha256=PljD9pzATILEOiC3ZdHUTsfSbZdAa6iSIxWmvAHLG9I,13688
|
28
28
|
compressed_tensors/quantization/lifecycle/frozen.py,sha256=h1XYt89MouBTf3jTYLG_6OdFxIu5q2N8tPjsy6J4E6Y,1726
|
29
29
|
compressed_tensors/quantization/lifecycle/helpers.py,sha256=TmLY_G5VP_Fg2Ywio_dxoHRTxOKZdT7_aG5S9WtD4zI,2424
|
30
|
-
compressed_tensors/quantization/lifecycle/initialize.py,sha256=
|
30
|
+
compressed_tensors/quantization/lifecycle/initialize.py,sha256=S5Kwy16Da8WUIIpa1xVKc72MijJ5C_rqM6JjanZ7MGk,7133
|
31
31
|
compressed_tensors/quantization/observers/__init__.py,sha256=4Sa7rqi5RB_S5bPO8KmncETiqDsoMBhwP37arlQym8s,764
|
32
32
|
compressed_tensors/quantization/observers/base.py,sha256=5ovQicWPYHjIxr6-EkQ4lgOX0PpI9g23iSzKpxjM1Zg,8420
|
33
33
|
compressed_tensors/quantization/observers/helpers.py,sha256=s_A23Qa_BLfOdHJCN5bm-qPWkhjjj_RIVrhSp1Y9Dtk,4211
|
@@ -35,7 +35,7 @@ compressed_tensors/quantization/observers/memoryless.py,sha256=jH_c6K3gxf4W3VNXQ
|
|
35
35
|
compressed_tensors/quantization/observers/min_max.py,sha256=sQXqU3z-voxIDfR_9mQzwQUflZj2sASm_G8CYaXntFw,3865
|
36
36
|
compressed_tensors/quantization/observers/mse.py,sha256=Aeh-253Vbab1F8cYuBiGNn4OXWJ67wXQ_JVfl3mu2a8,6034
|
37
37
|
compressed_tensors/quantization/utils/__init__.py,sha256=VdtEmP0bvuND_IGQnyqUPc5lnFp-1_yD7StKSX4x80w,656
|
38
|
-
compressed_tensors/quantization/utils/helpers.py,sha256=
|
38
|
+
compressed_tensors/quantization/utils/helpers.py,sha256=pwvU613XRvMDtI5b39II5jukBl5OUCqoX0ofVRpOFRY,8633
|
39
39
|
compressed_tensors/registry/__init__.py,sha256=FwLSNYqfIrb5JD_6OK_MT4_svvKTN_nEhpgQlQvGbjI,658
|
40
40
|
compressed_tensors/registry/registry.py,sha256=fxjOjh2wklCvJhQxwofdy-zV8q7MkQ85SLG77nml2iA,11890
|
41
41
|
compressed_tensors/utils/__init__.py,sha256=gS4gSU2pwcAbsKj-6YMaqhm25udFy6ISYaWBf-myRSM,808
|
@@ -45,8 +45,8 @@ compressed_tensors/utils/permutations_24.py,sha256=kx6fsfDHebx94zsSzhXGyCyuC9sVy
|
|
45
45
|
compressed_tensors/utils/permute.py,sha256=V6tJLKo3Syccj-viv4F7ZKZgJeCB-hl-dK8RKI_kBwI,2355
|
46
46
|
compressed_tensors/utils/safetensors_load.py,sha256=m08ANVuTBxQdoa6LufDgcNJ7wCLDJolyZljB8VEybAU,8578
|
47
47
|
compressed_tensors/utils/semi_structured_conversions.py,sha256=XKNffPum54kPASgqKzgKvyeqWPAkair2XEQXjkp7ho8,13489
|
48
|
-
compressed_tensors_nightly-0.5.0.20240902.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
49
|
-
compressed_tensors_nightly-0.5.0.
|
50
|
-
compressed_tensors_nightly-0.5.0.20240902.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
|
51
|
-
compressed_tensors_nightly-0.5.0.20240902.dist-info/top_level.txt,sha256=w2i-GyPs2s1UwVxvutSvN_lM22SXC2hQFBmoMcPnV7Y,19
|
52
|
-
compressed_tensors_nightly-0.5.0.20240902.dist-info/RECORD,,
|
48
|
+
compressed_tensors_nightly-0.5.0.20240903.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
49
|
+
compressed_tensors_nightly-0.5.0.20240903.dist-info/METADATA,sha256=plHC3Fg0bs-UlLdWYSOLl7RoMbum05Vg-JLDaje0YrY,6799
|
50
|
+
compressed_tensors_nightly-0.5.0.20240903.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
|
51
|
+
compressed_tensors_nightly-0.5.0.20240903.dist-info/top_level.txt,sha256=w2i-GyPs2s1UwVxvutSvN_lM22SXC2hQFBmoMcPnV7Y,19
|
52
|
+
compressed_tensors_nightly-0.5.0.20240903.dist-info/RECORD,,
|
File without changes
|
File without changes
|