compressed-tensors 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- compressed_tensors/compressors/base.py +200 -8
- compressed_tensors/compressors/model_compressor.py +68 -1
- compressed_tensors/compressors/naive_quantized.py +71 -75
- compressed_tensors/compressors/pack_quantized.py +83 -94
- compressed_tensors/config/base.py +6 -1
- compressed_tensors/linear/__init__.py +13 -0
- compressed_tensors/linear/compressed_linear.py +87 -0
- compressed_tensors/quantization/lifecycle/apply.py +46 -8
- compressed_tensors/quantization/lifecycle/calibration.py +5 -4
- compressed_tensors/quantization/lifecycle/compressed.py +3 -1
- compressed_tensors/quantization/lifecycle/forward.py +76 -43
- compressed_tensors/quantization/lifecycle/helpers.py +29 -2
- compressed_tensors/quantization/lifecycle/initialize.py +51 -16
- compressed_tensors/quantization/observers/__init__.py +1 -0
- compressed_tensors/quantization/observers/base.py +54 -14
- compressed_tensors/quantization/observers/min_max.py +8 -0
- compressed_tensors/quantization/observers/mse.py +162 -0
- compressed_tensors/quantization/quant_args.py +96 -24
- compressed_tensors/quantization/quant_scheme.py +7 -9
- compressed_tensors/quantization/utils/helpers.py +1 -1
- compressed_tensors/utils/__init__.py +1 -0
- compressed_tensors/utils/helpers.py +13 -0
- compressed_tensors/utils/offload.py +14 -2
- compressed_tensors/utils/permute.py +70 -0
- compressed_tensors/utils/safetensors_load.py +2 -0
- compressed_tensors/utils/semi_structured_conversions.py +1 -0
- compressed_tensors/version.py +1 -1
- {compressed_tensors-0.5.0.dist-info → compressed_tensors-0.6.0.dist-info}/METADATA +35 -23
- compressed_tensors-0.6.0.dist-info/RECORD +52 -0
- {compressed_tensors-0.5.0.dist-info → compressed_tensors-0.6.0.dist-info}/WHEEL +1 -1
- compressed_tensors-0.5.0.dist-info/RECORD +0 -48
- {compressed_tensors-0.5.0.dist-info → compressed_tensors-0.6.0.dist-info}/LICENSE +0 -0
- {compressed_tensors-0.5.0.dist-info → compressed_tensors-0.6.0.dist-info}/top_level.txt +0 -0
compressed_tensors/quantization/observers/mse.py
ADDED
@@ -0,0 +1,162 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Optional, Tuple
+
+import torch
+from compressed_tensors.quantization.observers.base import Observer
+from compressed_tensors.quantization.observers.helpers import calculate_qparams
+from compressed_tensors.quantization.quant_args import QuantizationArgs
+from torch import FloatTensor, IntTensor, Tensor
+
+
+__all__ = ["MovingAverageMSEObserver"]
+
+
+@Observer.register("mse")
+class MovingAverageMSEObserver(Observer):
+    """
+    Implements a dynamic quantization observer that sets the scale and
+    zero point based on a moving average of the mse-clipped min and max observed values
+    """
+
+    def __init__(
+        self,
+        quantization_args: QuantizationArgs,
+        averaging_constant: float = 0.01,
+        grid: float = 100.0,
+        maxshrink: float = 0.80,
+        norm: float = 2.4,
+    ):
+        super().__init__(quantization_args=quantization_args)
+
+        self.min_val = {}
+        self.max_val = {}
+        self.averaging_constant = averaging_constant
+        self.grid = grid
+        self.maxshrink = maxshrink
+        self.norm = norm
+
+    def calculate_mse_min_max(
+        self,
+        observed: Tensor,
+        reduce_dims: Optional[Tuple[int]] = None,
+    ):
+        """
+        Computes the mse-clipped min and max values of the observed tensor by
+        optimizing for quantization error
+
+        :param observed: observed tensor to calculate quantization parameters for
+        :param reduce_dims: optional tuple of dimensions to reduce along,
+            returned values will be shaped (1,) along the reduced dimensions
+        :return: tuple of min and max values derived from the observed tensor
+        """
+        from compressed_tensors.quantization.lifecycle import fake_quantize
+
+        if not reduce_dims:
+            absolute_min_val, absolute_max_val = torch.aminmax(observed)
+        else:
+            absolute_min_val = torch.amin(observed, dim=reduce_dims, keepdims=True)
+            absolute_max_val = torch.amax(observed, dim=reduce_dims, keepdims=True)
+
+        best = torch.full(absolute_min_val.shape, float("inf"))
+        min_val = torch.ones(absolute_min_val.shape)
+        max_val = torch.zeros(absolute_max_val.shape)
+        for i in range(int(self.maxshrink * self.grid)):
+            p = 1 - i / self.grid
+            shrinked_min_val = p * absolute_min_val
+            shrinked_max_val = p * absolute_max_val
+
+            candidate_scales, candidate_zero_points = calculate_qparams(
+                shrinked_min_val, shrinked_max_val, self.quantization_args
+            )
+            q = fake_quantize(
+                observed,
+                candidate_scales,
+                candidate_zero_points,
+                self.quantization_args,
+            )
+
+            q -= observed
+            q.abs_()
+            q.pow_(self.norm)
+            if not reduce_dims:
+                err = torch.sum(q)
+            else:
+                err = torch.sum(q, reduce_dims, keepdims=True)
+
+            tmp = err < best
+            if torch.any(tmp):
+                best[tmp] = err[tmp]
+                min_val[tmp] = shrinked_min_val[tmp]
+                max_val[tmp] = shrinked_max_val[tmp]
+        return min_val, max_val
+
+    def calculate_qparams(
+        self,
+        observed: Tensor,
+        reduce_dims: Optional[Tuple[int]] = None,
+        tensor_id: Optional[Any] = None,
+    ) -> Tuple[FloatTensor, IntTensor]:
+        """
+        Updates the mse-clipped min and max values of the observed tensor using
+        a moving average smoothed by the averaging_constant
+
+        :param observed: observed tensor to calculate quantization parameters for
+        :param reduce_dims: optional tuple of dimensions to reduce along,
+            returned scale and zero point will be shaped (1,) along the
+            reduced dimensions
+        :param tensor_id: Optional id if different ranges of observed tensors are
+            passed, useful for sharding tensors by group_size
+        :return: tuple of scale and zero point derived from the observed tensor
+        """
+        min_val, max_val = self.calculate_mse_min_max(observed, reduce_dims)
+
+        running_min_val = self.min_val.get(tensor_id, None)
+        running_max_val = self.max_val.get(tensor_id, None)
+
+        if running_min_val is None or running_max_val is None:
+            updated_min_val = min_val
+            updated_max_val = max_val
+        else:
+            updated_min_val = running_min_val + self.averaging_constant * (
+                min_val - running_min_val
+            )
+            updated_max_val = running_max_val + self.averaging_constant * (
+                max_val - running_max_val
+            )
+
+        tensor_id = tensor_id or "default"
+        self.min_val[tensor_id] = updated_min_val
+        self.max_val[tensor_id] = updated_max_val
+
+        return calculate_qparams(
+            updated_min_val, updated_max_val, self.quantization_args
+        )
+
+    def get_qparams_along_dim(
+        self, observed, dim: int, tensor_id: Optional[Any] = None
+    ):
+        reduce_dims = tuple(idx for idx in range(observed.ndim) if idx != dim)
+        return self.calculate_qparams(
+            observed, reduce_dims=reduce_dims, tensor_id=tensor_id
+        )
+
+    def reset(self):
+        """
+        Reset the state of the observer, including min and maximum values
+        """
+        super().reset()
+        self.min_val = {}
+        self.max_val = {}
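A minimal usage sketch of the new observer (not part of the diff); it relies only on the registry call shown above, and the tensor shape and bit width are illustrative:

```python
import torch

from compressed_tensors.quantization.observers.base import Observer
from compressed_tensors.quantization.quant_args import QuantizationArgs

# "mse" resolves to MovingAverageMSEObserver through the Observer registry
args = QuantizationArgs(num_bits=4, symmetric=True, observer="mse")
observer = Observer.load_from_registry("mse", quantization_args=args)

weight = torch.randn(256, 512)  # illustrative weight tensor
scale, zero_point = observer.calculate_qparams(weight)
```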
compressed_tensors/quantization/quant_args.py
CHANGED
@@ -13,10 +13,10 @@
 # limitations under the License.
 
 from enum import Enum
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional, Union
 
 import torch
-from pydantic import BaseModel, Field,
+from pydantic import BaseModel, Field, field_validator, model_validator
 
 
 __all__ = [
@@ -25,6 +25,7 @@ __all__ = [
     "QuantizationStrategy",
     "QuantizationArgs",
     "round_to_quantized_type",
+    "ActivationOrdering",
 ]
 
 FP8_DTYPE = torch.float8_e4m3fn
@@ -51,6 +52,19 @@ class QuantizationStrategy(str, Enum):
     TOKEN = "token"
 
 
+class ActivationOrdering(str, Enum):
+    """
+    Enum storing strategies for activation ordering
+
+    Group: reorder groups and weight\n
+    Weight: only reorder weight, not groups. Slightly lower latency and
+        accuracy compared to group actorder\n
+    """
+
+    GROUP = "group"
+    WEIGHT = "weight"
+
+
 class QuantizationArgs(BaseModel, use_enum_values=True):
     """
     User facing arguments used to define a quantization config for weights or
@@ -68,15 +82,18 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         ranges will be observed with every sample. Defaults to False for static
         quantization. Note that enabling dynamic quantization will change the default
         observer to a memoryless one
+    :param actorder: whether to apply group quantization in decreasing order of
+        activation. Defaults to None for arbitrary ordering
     """
 
     num_bits: int = 8
-    type: QuantizationType = QuantizationType.INT
+    type: QuantizationType = QuantizationType.INT
     symmetric: bool = True
     group_size: Optional[int] = None
     strategy: Optional[QuantizationStrategy] = None
     block_structure: Optional[str] = None
     dynamic: bool = False
+    actorder: Union[ActivationOrdering, bool, None] = None
     observer: str = Field(
         default="minmax",
         description=(
@@ -98,41 +115,96 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         """
         from compressed_tensors.quantization.observers.base import Observer
 
-        if self.
+        if self.dynamic:
             # override defualt observer for dynamic, you never want minmax which
             # keeps state across samples for dynamic
             self.observer = "memoryless"
 
         return Observer.load_from_registry(self.observer, quantization_args=self)
 
-    @
-    def
-
+    @field_validator("type", mode="before")
+    def validate_type(cls, value) -> QuantizationType:
+        if isinstance(value, str):
+            return QuantizationType(value.lower())
 
-
-        if group_size is not None and value is None:
-            if group_size > 0:
-                return QuantizationStrategy.GROUP
+        return value
 
-
-
+    @field_validator("group_size", mode="before")
+    def validate_group(cls, value) -> Union[int, None]:
+        if value is None:
+            return value
 
-
-
-
-
-
-            )
+        if value < -1:
+            raise ValueError(
+                f"Invalid group size {value}. Use group_size > 0 for "
+                "strategy='group' and group_size = -1 for 'channel'"
+            )
 
-
-        if group_size is None:
-            raise ValueError(f"strategy {value} requires group_size to be set.")
+        return value
 
-
-
+    @field_validator("strategy", mode="before")
+    def validate_strategy(cls, value) -> Union[QuantizationStrategy, None]:
+        if isinstance(value, str):
+            return QuantizationStrategy(value.lower())
+
+        return value
+
+    @field_validator("actorder", mode="before")
+    def validate_actorder(cls, value) -> Optional[ActivationOrdering]:
+        if isinstance(value, bool):
+            return ActivationOrdering.GROUP if value else None
+
+        if isinstance(value, str):
+            return ActivationOrdering(value.lower())
 
         return value
 
+    @model_validator(mode="after")
+    def validate_model_after(model: "QuantizationArgs") -> Dict[str, Any]:
+        # extract user-passed values from dictionary
+        strategy = model.strategy
+        group_size = model.group_size
+        actorder = model.actorder
+
+        # infer strategy
+        if strategy is None:
+            if group_size is None:
+                strategy = QuantizationStrategy.TENSOR
+            elif group_size > 0:
+                strategy = QuantizationStrategy.GROUP
+            elif group_size == -1:
+                strategy = QuantizationStrategy.CHANNEL
+            else:
+                raise ValueError(
+                    f"Invalid group size {group_size}. Use group_size > 0 for "
+                    "strategy='group' and group_size = -1 for 'channel'"
+                )
+
+        # validate strategy and group
+        if strategy == QuantizationStrategy.GROUP:
+            if group_size is None or group_size <= 0:
+                raise ValueError(
+                    f"strategy {strategy} requires group_size to be "
+                    "set to a positive value"
+                )
+        if (
+            group_size is not None
+            and group_size > 0
+            and strategy != QuantizationStrategy.GROUP
+        ):
+            raise ValueError("group_size requires strategy to be set to 'group'")
+
+        # validate activation ordering and strategy
+        if actorder is not None and strategy != QuantizationStrategy.GROUP:
+            raise ValueError(
+                "Must use group quantization strategy in order to apply "
+                "activation ordering"
+            )
+
+        # write back modified values
+        model.strategy = strategy
+        return model
+
     def pytorch_dtype(self) -> torch.dtype:
         if self.type == QuantizationType.FLOAT:
            return FP8_DTYPE
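A sketch of how the new validators behave (not part of the diff); the module path matches the file above and the concrete argument values are illustrative:

```python
from compressed_tensors.quantization.quant_args import (
    ActivationOrdering,
    QuantizationArgs,
    QuantizationStrategy,
)

# strategy is inferred from group_size, and actorder=True normalizes to GROUP
args = QuantizationArgs(num_bits=4, group_size=128, actorder=True)
assert args.strategy == QuantizationStrategy.GROUP
assert args.actorder == ActivationOrdering.GROUP

# mismatched strategy/group_size combinations are rejected by validate_model_after
try:
    QuantizationArgs(num_bits=4, group_size=128, strategy="tensor")
except ValueError:
    pass  # "group_size requires strategy to be set to 'group'"
```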
compressed_tensors/quantization/quant_scheme.py
CHANGED
@@ -57,15 +57,9 @@ class QuantizationScheme(BaseModel):
         # default to quantizing all Linear layers
         targets = ["Linear"]
 
-        # default
-
-
-
-        # default to 8 bit integer asymmetric quantization
-        input_activations = QuantizationArgs(num_bits=8, symmetric=True)
-
-        # Do not quantize the output activations
-        # by default
+        # by default, activations and weights are left unquantized
+        weights = None
+        input_activations = None
         output_activations = None
 
         return cls(
@@ -111,6 +105,8 @@ def is_preset_scheme(name: str) -> bool:
     return name.upper() in PRESET_SCHEMES
 
 
+UNQUANTIZED = dict()
+
 # 8 bit integer weights and 8 bit activations quantization
 W8A8 = dict(
     weights=QuantizationArgs(
@@ -208,6 +204,8 @@ FP8_DYNAMIC = dict(
 )
 
 PRESET_SCHEMES = {
+    # Unquantized (no-op)
+    "UNQUANTIZED": UNQUANTIZED,
     # Integer weight only schemes
     "W8A16": W8A16,
     "W4A16": W4A16,
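The new preset can be looked up like any other; a small sketch (not part of the diff) assuming the module path above:

```python
from compressed_tensors.quantization.quant_scheme import PRESET_SCHEMES, is_preset_scheme

assert is_preset_scheme("unquantized")      # lookup is case-insensitive
assert PRESET_SCHEMES["UNQUANTIZED"] == {}  # no weight or activation args attached
```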
compressed_tensors/quantization/utils/helpers.py
CHANGED
@@ -181,7 +181,7 @@ def calculate_compression_ratio(model: Module) -> float:
     for parameter in model.parameters():
         uncompressed_bits = get_torch_bit_depth(parameter)
         compressed_bits = uncompressed_bits
-        if is_module_quantized(submodule):
+        if is_module_quantized(submodule) and submodule.quantization_scheme.weights:
             compressed_bits = submodule.quantization_scheme.weights.num_bits
 
         num_weights = parameter.numel()
compressed_tensors/utils/helpers.py
CHANGED
@@ -22,6 +22,7 @@ __all__ = [
     "infer_compressor_from_model_config",
     "fix_fsdp_module_name",
     "tensor_follows_mask_structure",
+    "replace_module",
 ]
 
 FSDP_WRAPPER_NAME = "_fsdp_wrapped_module"
@@ -90,3 +91,15 @@ def tensor_follows_mask_structure(tensor, mask: str = "2:4") -> bool:
         raise ValueError()
 
     return True
+
+
+def replace_module(model: torch.nn.Module, name: str, new_module: torch.nn.Module):
+    if "." in name:
+        parent_name = name.rsplit(".", 1)[0]
+        child_name = name[len(parent_name) + 1 :]
+        parent = model.get_submodule(parent_name)
+    else:
+        parent_name = ""
+        parent = model
+        child_name = name
+    setattr(parent, child_name, new_module)
compressed_tensors/utils/offload.py
CHANGED
@@ -40,7 +40,13 @@ def get_execution_device(module: Module) -> torch.device:
     """
     if is_module_offloaded(module):
         return module._hf_hook.execution_device
-
+    device = next(module.parameters()).device
+
+    # offload only gets set for leaf modules, fallback to checking for device type
+    if device.type == "meta":
+        return module._hf_hook.execution_device
+
+    return device
 
 
 def get_offloaded_device(module: Module) -> torch.device:
@@ -83,8 +89,11 @@ update_parameter_data(
 
     :param module: layer containing the parameter to update
     :param new_param_data: tensor to update parameter with
-    :param param_name:
+    :param param_name: name of layer parameter to update
     """
+    if not hasattr(module, param_name):
+        return
+
     device = next(module.parameters()).device
 
     offloaded = False
@@ -93,6 +102,9 @@
         offloaded = True
 
     parameter = getattr(module, param_name, None)
+    if parameter is None:
+        raise ValueError("Attempted to update uninitialized parameter")
+
     dtype = parameter.dtype
     parameter.data = new_param_data.to(device).to(dtype)
 
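A sketch of the guarded update path (not part of the diff), assuming the (module, new_param_data, param_name) argument order implied by the docstring:

```python
import torch

from compressed_tensors.utils.offload import update_parameter_data

linear = torch.nn.Linear(8, 8)
update_parameter_data(linear, torch.zeros(8, 8), "weight")     # updates in place
update_parameter_data(linear, torch.zeros(1), "weight_scale")  # missing attribute, returns early
```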
compressed_tensors/utils/permute.py
ADDED
@@ -0,0 +1,70 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Set, Tuple
+
+import torch
+
+
+__all__ = ["safe_permute"]
+
+
+# these datatypes are missing implementations required for standard permutation
+_EXPERIMENTAL_DTYPES: Set[Tuple[torch.dtype, torch.device]] = set()
+
+
+def safe_permute(value: torch.Tensor, perm: torch.Tensor, dim: int = 0) -> torch.Tensor:
+    """
+    Perform out-of-place permutation without using torch.Tensor.index_put_,
+    whose implementation is missing for datatypes such as `torch.float8_e4m3fn`
+
+    :param value: tensor to permute
+    :param perm: permutation map
+    :param dim: dimension along which to apply permutation
+    :return: permuted value
+    """
+    dtype_tuple = (value.dtype, value.device)
+
+    if dtype_tuple in _EXPERIMENTAL_DTYPES:
+        return _fallback_permute(value, perm, dim)
+
+    try:
+        return value[tuple([slice(None)] * dim + [perm])]
+    except RuntimeError:
+        # Mark dtype as experimental if advanced indexing fails
+        _EXPERIMENTAL_DTYPES.add(dtype_tuple)
+        return _fallback_permute(value, perm, dim)
+
+
+def _fallback_permute(
+    value: torch.Tensor, perm: torch.Tensor, dim: int
+) -> torch.Tensor:
+    """
+    Fallback permutation method for experimental dtypes.
+
+    :param value: tensor to permute
+    :param perm: permutation map
+    :param dim: dimension along which to apply permutation
+    :return: permuted value
+    """
+    value_ret = value.clone()  # cannot use zeros_like b/c of missing impl.
+    orig_slices = [slice(None)] * (dim + 1)
+    perm_slices = [slice(None)] * (dim + 1)
+
+    for index, perm_index in enumerate(perm):
+        orig_slices[dim] = index
+        perm_slices[dim] = perm_index
+        value_ret[tuple(orig_slices)] = value[tuple(perm_slices)]
+
+    return value_ret
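A usage sketch for safe_permute (not part of the diff) on a dtype without full advanced-indexing support; the shapes are illustrative:

```python
import torch

from compressed_tensors.utils.permute import safe_permute

weight = torch.randn(4, 8).to(torch.float8_e4m3fn)
perm = torch.randperm(8)

# equivalent to weight[:, perm]; falls back to a per-index copy if indexing
# is unimplemented for this dtype/device combination
permuted = safe_permute(weight, perm, dim=1)
```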
compressed_tensors/version.py
CHANGED

{compressed_tensors-0.5.0.dist-info → compressed_tensors-0.6.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: compressed-tensors
-Version: 0.5.0
+Version: 0.6.0
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
@@ -8,44 +8,56 @@ Author-email: support@neuralmagic.com
 License: Apache 2.0
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: torch
+Requires-Dist: torch>=1.7.0
 Requires-Dist: transformers
-Requires-Dist:
-
+Requires-Dist: pydantic>=2.0
+Provides-Extra: accelerate
+Requires-Dist: accelerate; extra == "accelerate"
 Provides-Extra: dev
-Requires-Dist: black
-Requires-Dist: isort
-Requires-Dist: wheel
-Requires-Dist: flake8
-Requires-Dist: pytest
-Requires-Dist: nbconvert
+Requires-Dist: black==22.12.0; extra == "dev"
+Requires-Dist: isort==5.8.0; extra == "dev"
+Requires-Dist: wheel>=0.36.2; extra == "dev"
+Requires-Dist: flake8>=3.8.3; extra == "dev"
+Requires-Dist: pytest>=6.0.0; extra == "dev"
+Requires-Dist: nbconvert>=7.16.3; extra == "dev"
 
-#
+# compressed-tensors
 
-
+The `compressed-tensors` library extends the [safetensors](https://github.com/huggingface/safetensors) format, providing a versatile and efficient way to store and manage compressed tensor data. This library supports various quantization and sparsity schemes, making it a unified format for handling different model optimizations like GPTQ, AWQ, SmoothQuant, INT8, FP8, SparseGPT, and more.
 
-##
+## Why `compressed-tensors`?
 
-
+As model compression becomes increasingly important for efficient deployment of LLMs, the landscape of quantization and compression techniques has become increasingly fragmented.
+Each method often comes with its own storage format and loading procedures, making it challenging to work with multiple techniques or switch between them.
+`compressed-tensors` addresses this by providing a single, extensible format that can represent a wide variety of compression schemes.
 
-
+* **Unified Checkpoint Format**: Supports various compression schemes in a single, consistent format.
+* **Wide Compatibility**: Works with popular quantization methods like GPTQ, SmoothQuant, and FP8. See [llm-compressor](https://github.com/vllm-project/llm-compressor)
+* **Flexible Quantization Support**:
+  * Weight-only quantization (e.g., W4A16, W8A16, WnA16)
+  * Activation quantization (e.g., W8A8)
+  * KV cache quantization
+  * Non-uniform schemes (different layers can be quantized in different ways!)
+* **Sparsity Support**: Handles both unstructured and semi-structured (e.g., 2:4) sparsity patterns.
+* **Open-Source Integration**: Designed to work seamlessly with Hugging Face models and PyTorch.
 
-
-- Quantized -> due to their low precision representation.
-
-### Introduce an elegant interface to save/load compressed tensors
-
-The library provides the user with the ability to compress/decompress tensors. The properties of tensors are defined by human-readable configs, allowing the users to understand the compression format at a quick glance.
+This allows developers and researchers to easily experiment with composing different quantization methods, simplify model deployment pipelines, and reduce the overhead of supporting multiple compression formats in inference engines.
 
 ## Installation
 
-###
+### From [PyPI](https://pypi.org/project/compressed-tensors)
 
+Stable release:
 ```bash
 pip install compressed-tensors
 ```
 
-
+Nightly release:
+```bash
+pip install compressed-tensors-nightly
+```
+
+### From Source
 
 ```bash
 git clone https://github.com/neuralmagic/compressed-tensors
|