compressed-tensors 0.9.4a20250414__py3-none-any.whl → 0.9.5a20250424__py3-none-any.whl
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- compressed_tensors/compressors/base.py +6 -1
- compressed_tensors/compressors/model_compressors/model_compressor.py +90 -7
- compressed_tensors/compressors/quantized_compressors/base.py +21 -6
- compressed_tensors/compressors/quantized_compressors/pack_quantized.py +88 -21
- compressed_tensors/compressors/sparse_compressors/base.py +21 -4
- compressed_tensors/quantization/lifecycle/apply.py +65 -30
- compressed_tensors/quantization/lifecycle/initialize.py +13 -2
- compressed_tensors/utils/offload.py +20 -17
- compressed_tensors/utils/safetensors_load.py +10 -8
- compressed_tensors/version.py +2 -2
- {compressed_tensors-0.9.4a20250414.dist-info → compressed_tensors-0.9.5a20250424.dist-info}/METADATA +1 -1
- {compressed_tensors-0.9.4a20250414.dist-info → compressed_tensors-0.9.5a20250424.dist-info}/RECORD +15 -15
- {compressed_tensors-0.9.4a20250414.dist-info → compressed_tensors-0.9.5a20250424.dist-info}/WHEEL +1 -1
- {compressed_tensors-0.9.4a20250414.dist-info → compressed_tensors-0.9.5a20250424.dist-info}/licenses/LICENSE +0 -0
- {compressed_tensors-0.9.4a20250414.dist-info → compressed_tensors-0.9.5a20250424.dist-info}/top_level.txt +0 -0
compressed_tensors/compressors/base.py
CHANGED
@@ -19,6 +19,7 @@ import torch
 from compressed_tensors.config import SparsityCompressionConfig
 from compressed_tensors.quantization import QuantizationArgs, QuantizationConfig
 from compressed_tensors.registry import RegistryMixin
+from compressed_tensors.utils import has_offloaded_params
 from torch import Tensor
 from torch.nn import Module

@@ -169,6 +170,10 @@ class BaseCompressor(RegistryMixin, ABC):
         :param module: PyTorch module to decompress
         :return: tensor of the decompressed weight, or None if module is not quantized
         """
+
+        params_device = next(module.parameters()).device
+        device = "cpu" if has_offloaded_params(module) else params_device
+
         if not hasattr(module, "quantization_scheme"):
             return None  # module is not quantized
         quantization_scheme = module.quantization_scheme
@@ -182,7 +187,7 @@ class BaseCompressor(RegistryMixin, ABC):

         return self.decompress_weight(
             compressed_data=compressed_data, quantization_args=quantization_args
-        )
+        ).to(device)

     def decompress_weight(
         self, compressed_data: Dict[str, Tensor], **kwargs
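The change above makes `decompress_module` pick a target device before decompressing: CPU when the module's parameters are offloaded, otherwise the device of the module's own parameters. A minimal, dependency-free sketch of that device-selection idea, using a hypothetical `_has_offload_hook` check in place of the library's `has_offloaded_params` (which inspects the accelerate hook):

```python
import torch


def _has_offload_hook(module: torch.nn.Module) -> bool:
    # stand-in for compressed_tensors.utils.has_offloaded_params, which checks
    # whether accelerate has attached an offloading hook to the module
    return getattr(module, "_hf_hook", None) is not None


def pick_decompression_device(module: torch.nn.Module) -> torch.device:
    # offloaded modules keep their real weights off-device, so decompressed
    # tensors are materialized on CPU; otherwise reuse the parameters' device
    params_device = next(module.parameters()).device
    return torch.device("cpu") if _has_offload_hook(module) else params_device


linear = torch.nn.Linear(8, 8)
print(pick_decompression_device(linear))  # cpu (or the module's device if it was moved)
```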
compressed_tensors/compressors/model_compressors/model_compressor.py
CHANGED
@@ -31,13 +31,14 @@ from compressed_tensors.base import (
     SPARSITY_CONFIG_NAME,
 )
 from compressed_tensors.compressors.base import BaseCompressor
+from compressed_tensors.compressors.sparse_compressors import DenseCompressor
 from compressed_tensors.config import CompressionFormat, SparsityCompressionConfig
 from compressed_tensors.quantization import (
     DEFAULT_QUANTIZATION_METHOD,
     QuantizationConfig,
     QuantizationStatus,
     apply_quantization_config,
-    load_pretrained_quantization,
+    load_pretrained_quantization_parameters,
 )
 from compressed_tensors.quantization.lifecycle import expand_target_names
 from compressed_tensors.quantization.quant_args import QuantizationArgs
@@ -47,7 +48,9 @@ from compressed_tensors.quantization.utils import (
 )
 from compressed_tensors.utils import (
     get_safetensors_folder,
+    has_offloaded_params,
     merge_names,
+    register_offload_parameter,
     update_parameter_data,
 )
 from compressed_tensors.utils.helpers import (
@@ -382,6 +385,7 @@ class ModelCompressor:
             compressed_state_dict = self.quantization_compressor.compress(
                 state_dict, names_to_scheme=quantized_modules_to_args
             )
+
             if self.quantization_config.format != CompressionFormat.dense.value:
                 self.quantization_config.quantization_status = (
                     QuantizationStatus.COMPRESSED
@@ -411,6 +415,13 @@ class ModelCompressor:

         :param model_path: path to compressed weights
         :param model: pytorch model to load decompressed weights into
+
+        Note: decompress makes use of both _replace_sparsity_weights and _replace_weights
+        The variations in these methods are a result of the subtle variations between the sparsity
+        and quantization compressors. Specifically, quantization compressors return not just the
+        decompressed weight, but the quantization parameters (e.g scales, zero_point) whereas sparsity
+        compressors only return the decompressed weight.
+
         """
         model_path = get_safetensors_folder(model_path)
         sparse_decompressed = False
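The docstring note added above describes the key asymmetry in the decompression path: a sparsity compressor's generator yields one decompressed weight tensor per parameter name, while a quantization compressor's generator yields, per module, a dict that also carries the quantization parameters. A rough illustration of the two item shapes the two `_replace_*` helpers expect (module and parameter names are invented for the example):

```python
import torch

# what a sparsity decompressor yields: ("<module>.<param>", tensor)
sparse_item = ("model.layers.0.mlp.down_proj.weight", torch.zeros(16, 16))

# what a quantization decompressor yields: ("<module>", {param_name: tensor, ...})
quant_item = (
    "model.layers.0.mlp.down_proj",
    {
        "weight": torch.zeros(16, 16, dtype=torch.bfloat16),
        "weight_scale": torch.ones(16, 1, dtype=torch.bfloat16),
        "weight_zero_point": torch.zeros(16, 1, dtype=torch.int8),
    },
)

name, data = sparse_item
print(name.rsplit(".", 1))          # module prefix and parameter name are split apart
module_name, params = quant_item
print(module_name, sorted(params))  # one entry per decompressed/quantization parameter
```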
@@ -419,9 +430,16 @@
             self.sparsity_compressor is not None
             and self.sparsity_config.format != CompressionFormat.dense.value
         ):
+            params_to_ignore = None
+            if self.quantization_compressor is not None:
+                params_to_ignore = self.quantization_compressor.compression_param_names
             # Sparse decompression is applied on the model_path
-            dense_gen = self.sparsity_compressor.decompress(model_path)
-            self._replace_weights(dense_gen, model)
+            # The compressor will try and load any quantization parameters as well
+            # params_to_skip_load will skip over quantization params from being loaded
+            dense_gen = self.sparsity_compressor.decompress(
+                model_path, params_to_skip_load=params_to_ignore
+            )
+            self._replace_sparsity_weights(dense_gen, model)
             setattr(model, SPARSITY_CONFIG_NAME, self.sparsity_compressor.config)
             sparse_decompressed = True

@@ -430,13 +448,27 @@ class ModelCompressor:
             # quantization during apply_quantization_config. This ensures
             # that the dtypes of the weights are not unintentionally updated.
             # The status is restored after quantization params are loaded.
+
             with override_quantization_status(
                 self.quantization_config, QuantizationStatus.FROZEN
             ):
+
                 names_to_scheme = apply_quantization_config(
                     model, self.quantization_config
                 )
-                load_pretrained_quantization(model, model_path)
+                # Load activation scales/zp or any other quantization parameters
+                # Conditionally load the weight quantization parameters if we have a dense compressor
+                # Or if a sparsity compressor has already been applied
+                load_pretrained_quantization_parameters(
+                    model,
+                    model_path,
+                    # TODO: all weight quantization params will be moved to the compressor in a follow-up
+                    # including initialization
+                    load_weight_quantization=(
+                        sparse_decompressed
+                        or isinstance(self.quantization_compressor, DenseCompressor)
+                    ),
+                )

             model_path_or_state_dict = (
                 model.state_dict() if sparse_decompressed else model_path
@@ -445,6 +477,8 @@ class ModelCompressor:
             dense_gen = self.quantization_compressor.decompress(
                 model_path_or_state_dict, names_to_scheme=names_to_scheme
             )
+            # TODO: all weight quantization params will be moved to the compressor
+            # to prevent duplicate parameter updates in update_parameter_data
            self._replace_weights(dense_gen, model)

        def freeze_quantization_status(module):
@@ -500,7 +534,7 @@ class ModelCompressor:
         with open(config_file_path, "w") as config_file:
             json.dump(config_data, config_file, indent=2, sort_keys=True)

-    def _replace_weights(self, dense_weight_generator, model: Module):
+    def _replace_sparsity_weights(self, dense_weight_generator, model: Module):
         """
         Replace the weights of the model with the
         provided dense weights.
@@ -515,11 +549,60 @@ class ModelCompressor:
         :param model: The model whose weights are to be updated.
         """
         for name, data in tqdm(dense_weight_generator, desc="Decompressing model"):
+
             split_name = name.split(".")
             prefix, param_name = ".".join(split_name[:-1]), split_name[-1]
             module = operator.attrgetter(prefix)(model)
-
-
+
+            params_device = next(module.parameters()).device
+            device = "cpu" if has_offloaded_params(module) else params_device
+            delattr(module, param_name)
+            requires_grad = data.dtype in (torch.float16, torch.float32, torch.bfloat16)
+            param = torch.nn.Parameter(data.to(device), requires_grad=requires_grad)
+            register_offload_parameter(module, param_name, param)
+
+    def _replace_weights(self, dense_weight_generator, model: Module):
+        """
+        Replace the weights of the model with the
+        provided dense weights.
+
+        This method iterates over the dense_weight_generator and
+        updates the corresponding weights in the model. If a parameter
+        name does not exist in the model, it will be skipped.
+
+        :param dense_weight_generator (generator): A generator that yields
+            tuples of (name, data), where 'name' is the parameter name and
+            'data' is the updated param data
+        :param model: The model whose weights are to be updated.
+        """
+
+        for name, data in tqdm(dense_weight_generator, desc="Decompressing model"):
+            module = operator.attrgetter(name)(model)
+
+            params_device = next(module.parameters()).device
+            device = "cpu" if has_offloaded_params(module) else params_device
+
+            for param_name, param_data in data.items():
+                if hasattr(module, param_name):
+                    # If compressed, will have an incorrect dtype for transformers >4.49
+                    # TODO: we can also just skip initialization of scales/zp if in decompression in init
+                    # to be consistent with loading which happens later as well
+                    # however, update_data does a good shape check - should be moved to the compressor
+                    if param_name == "weight":
+                        delattr(module, param_name)
+                        requires_grad = param_data.dtype in (
+                            torch.float16,
+                            torch.float32,
+                            torch.bfloat16,
+                        )
+                        param = torch.nn.Parameter(
+                            param_data.to(device), requires_grad=requires_grad
+                        )
+                        register_offload_parameter(module, param_name, param)
+                    else:
+                        # Should already be registered to the correct device for
+                        # for scales/zero-points
+                        update_parameter_data(module, param_data, param_name)


 def map_modules_to_quant_args(
compressed_tensors/compressors/quantized_compressors/base.py
CHANGED
@@ -14,11 +14,11 @@

 import logging
 from pathlib import Path
-from typing import Any, Dict, Generator, Tuple, Union
+from typing import Any, Dict, Generator, Optional, Tuple, Union

 import torch
 from compressed_tensors.compressors.base import BaseCompressor
-from compressed_tensors.quantization import QuantizationArgs
+from compressed_tensors.quantization import QuantizationArgs, QuantizationStrategy
 from compressed_tensors.utils import (
     get_nested_mappings_from_state_dict,
     get_nested_weight_mappings,
@@ -132,8 +132,10 @@ class BaseQuantizationCompressor(BaseCompressor):
                     compressed_dict[merge_names(prefix, key)] = value
                 else:
                     compressed_dict[name] = value.to("cpu")
-            # only save if asym
-            elif is_weight_zp and quant_args_zp.symmetric:
+            # only save zp if asym and not packed zp
+            elif is_weight_zp and (
+                quant_args_zp.symmetric or self._check_if_zp_pack_quantized(quant_args)
+            ):
                 continue
             # only save if asym
             elif is_input_zp and input_args_zp.symmetric:
@@ -145,6 +147,17 @@ class BaseQuantizationCompressor(BaseCompressor):

         return compressed_dict

+    def _check_if_zp_pack_quantized(self, quant_args):
+        from compressed_tensors.compressors import PackedQuantizationCompressor
+
+        if isinstance(self, PackedQuantizationCompressor):
+            if not quant_args.symmetric and quant_args.strategy in [
+                QuantizationStrategy.GROUP.value,
+                QuantizationStrategy.CHANNEL.value,
+            ]:
+                return True
+        return False
+
     def decompress(
         self,
         path_to_model_or_tensors: Union[str, Path, Dict[str, Any]],
@@ -186,7 +199,8 @@ class BaseQuantizationCompressor(BaseCompressor):
                 decompressed = self.decompress_weight(
                     compressed_data=weight_data, quantization_args=quant_args
                 )
-                yield merge_names(weight_name, "weight"), decompressed
+                weight_data["weight"] = decompressed
+                yield weight_name, weight_data

     def _decompress_from_state_dict(self, state_dict, names_to_scheme):
         weight_mappings = get_nested_mappings_from_state_dict(
@@ -202,4 +216,5 @@ class BaseQuantizationCompressor(BaseCompressor):
             decompressed = self.decompress_weight(
                 compressed_data=weight_data, quantization_args=quant_args
             )
-            yield merge_names(weight_name, "weight"), decompressed
+            weight_data["weight"] = decompressed
+            yield weight_name, weight_data
compressed_tensors/compressors/quantized_compressors/pack_quantized.py
CHANGED
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import math
-from typing import Dict, Optional, Tuple
+from typing import Dict, Literal, Optional, Tuple, Union

 import numpy as np
 import torch
@@ -21,7 +21,7 @@ from compressed_tensors.compressors.quantized_compressors.base import (
     BaseQuantizationCompressor,
 )
 from compressed_tensors.config import CompressionFormat
-from compressed_tensors.quantization import QuantizationArgs
+from compressed_tensors.quantization import QuantizationArgs, QuantizationStrategy
 from compressed_tensors.quantization.lifecycle.forward import dequantize, quantize
 from compressed_tensors.quantization.utils import can_quantize
 from torch import Tensor
@@ -65,10 +65,26 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
         """
         pack_factor = 32 // quantization_args.num_bits
         packed_size = math.ceil(weight_shape[1] / pack_factor)
-        return {
+        packed_size_zp = math.ceil(weight_shape[0] / pack_factor)
+        output = {
             "weight_packed": (torch.Size((weight_shape[0], packed_size)), torch.int32),
             "weight_shape": (torch.Size((2,)), torch.int32),
         }
+        if not quantization_args.symmetric and quantization_args.strategy in [
+            QuantizationStrategy.GROUP.value,
+            QuantizationStrategy.CHANNEL.value,
+        ]:
+            zp_factor = (
+                quantization_args.group_size
+                if quantization_args.strategy == QuantizationStrategy.GROUP.value
+                else weight_shape[-1]
+            )
+
+            output["weight_zero_point"] = (
+                torch.Size((packed_size_zp, weight_shape[-1] // zp_factor)),
+                torch.int32,
+            )
+        return output

     def compress_weight(
         self,
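As a concrete check of the shapes above: the packed zero point has one row per `32 // num_bits` original output rows and one column per quantization group. A small worked example with assumed dimensions (not taken from the diff):

```python
import math

# assumed example dimensions: a [4096, 11008] weight, 4-bit, group_size=128
num_bits, group_size = 4, 128
rows, cols = 4096, 11008

pack_factor = 32 // num_bits                    # 8 int4 values per int32
packed_size = math.ceil(cols / pack_factor)     # packed weight columns -> 1376
packed_size_zp = math.ceil(rows / pack_factor)  # packed zero-point rows -> 512
num_groups = cols // group_size                 # zero-point columns -> 86

print((rows, packed_size))           # weight_packed shape: (4096, 1376)
print((packed_size_zp, num_groups))  # weight_zero_point shape: (512, 86)
```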
@@ -104,6 +120,7 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
             quantized_weight = weight

         packed_weight = pack_to_int32(quantized_weight, quantization_args.num_bits)
+
         weight_shape = torch.tensor(weight.shape)
         if device is not None:
             packed_weight = packed_weight.to(device)
@@ -112,6 +129,15 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
         compressed_dict["weight_shape"] = weight_shape
         compressed_dict["weight_packed"] = packed_weight

+        # We typically don't compress zp; apart from when using the packed_compressor and when storing group/channel zp
+        if not quantization_args.symmetric and quantization_args.strategy in [
+            QuantizationStrategy.GROUP.value,
+            QuantizationStrategy.CHANNEL.value,
+        ]:
+            packed_zp = pack_to_int32(
+                zero_point, quantization_args.num_bits, packed_dim=0
+            )
+            compressed_dict["weight_zero_point"] = packed_zp
         return compressed_dict

     def decompress_weight(
@@ -133,6 +159,21 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
         original_shape = torch.Size(compressed_data["weight_shape"])
         num_bits = quantization_args.num_bits
         unpacked = unpack_from_int32(weight, num_bits, original_shape)
+
+        # NOTE: this will fail decompression as we don't currently handle packed zp on decompression
+        if not quantization_args.symmetric and quantization_args.strategy in [
+            QuantizationStrategy.GROUP.value,
+            QuantizationStrategy.CHANNEL.value,
+        ]:
+            raise ValueError(
+                "Decompression of packed zero points is currently not supported"
+            )
+            assert zero_point is not None
+            original_zp_shape = (original_shape[0], scale.shape[-1])
+            zero_point = unpack_from_int32(
+                zero_point, num_bits, original_zp_shape, packed_dim=0
+            )
+
         decompressed_weight = dequantize(
             x_q=unpacked, scale=scale, zero_point=zero_point, g_idx=g_idx
         )
@@ -140,7 +181,11 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
         return decompressed_weight


-def pack_to_int32(value: torch.Tensor, num_bits: int) -> torch.Tensor:
+def pack_to_int32(
+    value: torch.Tensor,
+    num_bits: int,
+    packed_dim: Union[Literal[0], Literal[1]] = 1,
+) -> torch.Tensor:
     """
     Packs a tensor of quantized weights stored in int8 into int32s with padding

@@ -176,14 +221,19 @@ def pack_to_int32(value: torch.Tensor, num_bits: int) -> torch.Tensor:
     pack_factor = 32 // num_bits

     # pad input tensor and initialize packed output
-    packed_size = math.ceil(value.shape[1] / pack_factor)
-    padding = packed_size * pack_factor - value.shape[1]
+    packed_size = math.ceil(value.shape[packed_dim] / pack_factor)
+    padding = packed_size * pack_factor - value.shape[packed_dim]
     value = np.pad(value, pad_width=[(0, 0), (0, padding)], constant_values=0)

     # pack values
-    packed = np.zeros((value.shape[0], packed_size), dtype=np.uint32)
-    for i in range(pack_factor):
-        packed |= value[:, i::pack_factor] << num_bits * i
+    if packed_dim == 1:
+        packed = np.zeros((value.shape[0], packed_size), dtype=np.uint32)
+        for i in range(pack_factor):
+            packed |= value[:, i::pack_factor] << num_bits * i
+    else:
+        packed = np.zeros((packed_size, value.shape[1]), dtype=np.uint32)
+        for i in range(pack_factor):
+            packed |= value[i::pack_factor, :] << num_bits * i

     # convert back to signed and torch
     packed = np.ascontiguousarray(packed).view(np.int32)
@@ -191,7 +241,10 @@ def unpack_from_int32(


 def unpack_from_int32(
-    value: torch.Tensor, num_bits: int, shape: torch.Size
+    value: torch.Tensor,
+    num_bits: int,
+    shape: torch.Size,
+    packed_dim: Union[Literal[0], Literal[1]] = 1,
 ) -> torch.Tensor:
     """
     Unpacks a tensor of packed int32 weights into individual int8s, maintaining the
@@ -216,17 +269,31 @@ def unpack_from_int32(

     # unpack
     mask = (1 << num_bits) - 1
-    unpacked = torch.zeros(
-        (value.shape[0], value.shape[1] * pack_factor),
-        device=value.device,
-        dtype=torch.int32,
-    )
-    for i in range(pack_factor):
-        unpacked[:, i::pack_factor] = (value >> (num_bits * i)) & mask
-
-    # remove padding
-    original_row_size = int(shape[1])
-    unpacked = unpacked[:, :original_row_size]
+
+    if packed_dim == 1:
+        unpacked = torch.zeros(
+            (value.shape[0], value.shape[1] * pack_factor),
+            device=value.device,
+            dtype=torch.int32,
+        )
+        for i in range(pack_factor):
+            unpacked[:, i::pack_factor] = (value >> (num_bits * i)) & mask
+
+        # remove padding
+        original_row_size = int(shape[1])
+        unpacked = unpacked[:, :original_row_size]
+    else:
+        unpacked = torch.zeros(
+            (value.shape[0] * pack_factor, value.shape[1]),
+            device=value.device,
+            dtype=torch.int32,
+        )
+        for i in range(pack_factor):
+            unpacked[i::pack_factor, :] = (value >> (num_bits * i)) & mask
+
+        # remove padding
+        original_row_size = int(shape[0])
+        unpacked = unpacked[:original_row_size, :]

     # bits are packed in unsigned format, reformat to signed
     # update the value range from unsigned to signed
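The `packed_dim` switch above is the core of the change: low-bit values are bit-shifted into 32-bit words either along rows (`packed_dim=1`, used for weights) or along columns (`packed_dim=0`, now used for packed zero points). A standalone round-trip sketch of the same idea for 4-bit values along dim 1, simplified to unsigned values with no padding or sign handling:

```python
import numpy as np

num_bits = 4
pack_factor = 32 // num_bits  # 8 values per int32

# 2 rows x 8 columns of 4-bit values (kept unsigned here for simplicity)
values = np.arange(16, dtype=np.uint32).reshape(2, 8)

# pack along dim 1: fold each group of 8 columns into one uint32 word
packed = np.zeros((2, 1), dtype=np.uint32)
for i in range(pack_factor):
    packed |= values[:, i::pack_factor] << (num_bits * i)

# unpack by shifting back out and masking to 4 bits
mask = (1 << num_bits) - 1
unpacked = np.zeros_like(values)
for i in range(pack_factor):
    unpacked[:, i::pack_factor] = (packed >> (num_bits * i)) & mask

assert np.array_equal(values, unpacked)
print([hex(int(w)) for w in packed[:, 0]])  # ['0x76543210', '0xfedcba98']
```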
compressed_tensors/compressors/sparse_compressors/base.py
CHANGED
@@ -98,7 +98,11 @@ class BaseSparseCompressor(BaseCompressor):
         return compressed_dict

     def decompress(
-        self, path_to_model_or_tensors: str, device: str = "cpu", **kwargs
+        self,
+        path_to_model_or_tensors: str,
+        device: str = "cpu",
+        params_to_skip_load: Optional[Tuple] = None,
+        **kwargs,
     ) -> Generator[Tuple[str, Tensor], None, None]:
         """
         Reads a bitmask compressed state dict located
@@ -108,6 +112,11 @@ class BaseSparseCompressor(BaseCompressor):
         :param model_path: path to compressed safetensors model (directory with
             one or more safetensors files) or compressed tensors file
         :param device: device to load decompressed weights onto
+        :param params_to_skip_load: a list of non-sparsity parameters (e.g quantization
+            parameters) that we want to skip loading. As the sparsity compresssor does
+            not handle quantized decompression, this should contain any quantization
+            parameters when decompressing stacked compressors. We want these parameters
+            to be handled by the quantization decompressor
         :return: iterator for generating decompressed weights
         """
         weight_mappings, ignored_params = get_nested_weight_mappings(
@@ -121,13 +130,21 @@ class BaseSparseCompressor(BaseCompressor):
             full_name = merge_names(weight_name, param_name)
             with safe_open(safe_path, framework="pt", device=device) as f:
                 weight_data[param_name] = f.get_tensor(full_name)
+
             decompressed = self.decompress_weight(weight_data)
             yield merge_names(weight_name, "weight"), decompressed

         for ignored_param_name, safe_path in ignored_params.items():
-            with safe_open(safe_path, framework="pt", device=device) as f:
-                value = f.get_tensor(ignored_param_name)
-            yield ignored_param_name, value
+            should_skip = False
+            if params_to_skip_load is not None:
+                for param_to_skip in params_to_skip_load:
+                    if param_to_skip in ignored_param_name:
+                        should_skip = True
+
+            if not should_skip:
+                with safe_open(safe_path, framework="pt", device=device) as f:
+                    value = f.get_tensor(ignored_param_name)
+                yield ignored_param_name, value

     @staticmethod
     def should_compress(name: str, expanded_targets: Optional[Set[str]] = None) -> bool:
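The skip logic above is a plain substring match: a parameter is left for the quantization compressor to load if any entry of `params_to_skip_load` appears in its full name. A self-contained sketch of that filtering (parameter names are invented for the example):

```python
# assumed inputs: ignored (non-sparsity) parameter names found on disk, and the
# compression parameter names reported by a quantization compressor
ignored_params = [
    "model.layers.0.self_attn.q_proj.weight_scale",
    "model.layers.0.self_attn.q_proj.weight_zero_point",
    "model.layers.0.self_attn.q_proj.bias",
]
params_to_skip_load = ("weight_scale", "weight_zero_point", "weight_packed")

loaded_by_sparsity_pass = [
    name
    for name in ignored_params
    if not any(skip in name for skip in params_to_skip_load)
]
print(loaded_by_sparsity_pass)  # ['model.layers.0.self_attn.q_proj.bias']
```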
compressed_tensors/quantization/lifecycle/apply.py
CHANGED
@@ -44,11 +44,12 @@ from compressed_tensors.quantization.utils import (
 from compressed_tensors.utils.helpers import fix_fsdp_module_name, replace_module
 from compressed_tensors.utils.offload import update_parameter_data
 from compressed_tensors.utils.safetensors_load import get_safetensors_folder
+from safetensors import safe_open
 from torch.nn import Module


 __all__ = [
-    "load_pretrained_quantization",
+    "load_pretrained_quantization_parameters",
     "apply_quantization_config",
     "apply_quantization_status",
     "find_name_or_class_matches",
@@ -57,50 +58,62 @@ __all__ = [
 ]

 from compressed_tensors.quantization.utils.helpers import is_module_quantized
-from compressed_tensors.utils.safetensors_load import get_quantization_state_dict
+from compressed_tensors.utils.safetensors_load import (
+    get_quantization_parameter_to_path_mapping,
+)


 _LOGGER = logging.getLogger(__name__)


-def load_pretrained_quantization(model: Module, model_name_or_path: Optional[str] = None):
+def load_pretrained_quantization_parameters(
+    model: Module,
+    model_name_or_path: Optional[str] = None,
+    load_weight_quantization: Optional[bool] = False,
+):
     """
     Loads the quantization parameters (scale and zero point) from model_name_or_path to
-    a model that has already been initialized with a quantization config
+    a model that has already been initialized with a quantization config.
+
+    NOTE: Will always load inputs/output parameters.
+    Will conditioanlly load weight parameters, if load_weight_quantization is set to True.

     :param model: model to load pretrained quantization parameters to
     :param model_name_or_path: Hugging Face stub or local folder containing a quantized
-        model, which is used to load quantization parameters from
+        model, which is used to load quantization parameters
+    :param load_weight_quantization: whether or not the weight quantization parameters shoud
+        be laoded
     """
     model_path = get_safetensors_folder(model_name_or_path)
-    state_dict = get_quantization_state_dict(model_path)
+    mapping = get_quantization_parameter_to_path_mapping(model_path)

     for name, submodule in iter_named_leaf_modules(model):
         if not is_module_quantized(submodule):
             continue
-        if submodule.quantization_scheme.weights is not None:
-            base_name = "weight"
-            _load_quant_args_from_state_dict(
-                base_name=base_name,
-                module_name=name,
-                module=submodule,
-                state_dict=state_dict,
-            )
         if submodule.quantization_scheme.input_activations is not None:
             base_name = "input"
-            _load_quant_args_from_state_dict(
+            _load_quant_args_from_mapping(
                 base_name=base_name,
                 module_name=name,
                 module=submodule,
-                state_dict=state_dict,
+                mapping=mapping,
             )
         if submodule.quantization_scheme.output_activations is not None:
             base_name = "output"
-            _load_quant_args_from_state_dict(
+            _load_quant_args_from_mapping(
                 base_name=base_name,
                 module_name=name,
                 module=submodule,
-                state_dict=state_dict,
+                mapping=mapping,
+            )
+
+        if load_weight_quantization and submodule.quantization_scheme.weights:
+            base_name = "weight"
+            _load_quant_args_from_mapping(
+                base_name=base_name,
+                module_name=name,
+                module=submodule,
+                mapping=mapping,
             )

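Instead of materializing a full quantization state dict, the new loader resolves each parameter to the safetensors shard that contains it and reads only that tensor. The per-tensor read uses the standard safetensors API; a minimal sketch, where the shard path and tensor name are placeholders:

```python
from safetensors import safe_open

# hypothetical shard path and fully qualified parameter name
shard_path = "model-00001-of-00002.safetensors"
tensor_name = "model.layers.0.self_attn.q_proj.weight_scale"

with safe_open(shard_path, framework="pt", device="cpu") as f:
    if tensor_name in f.keys():
        scale = f.get_tensor(tensor_name)  # loads just this tensor from the shard
        print(scale.shape, scale.dtype)
```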
@@ -237,9 +250,19 @@ def apply_quantization_status(model: Module, status: QuantizationStatus):

     if status >= QuantizationStatus.INITIALIZED > current_status:
         force_zero_point_init = status != QuantizationStatus.COMPRESSED
+
+        # When decompressing, we set the scale_dtype as the model's dtype
+        # This is because the normal workflow of using the weight's dtype
+        # will be incorrect as the model weight will be compressed
+        # Therfore, use the dtype set by the user using the PretrainedModel
+        scale_dtype = None
+        if status == QuantizationStatus.FROZEN:
+            if hasattr(model, "dtype"):
+                scale_dtype = model.dtype
+
         model.apply(
             lambda module: initialize_module_for_quantization(
-                module, force_zero_point=force_zero_point_init
+                module, force_zero_point=force_zero_point_init, scale_dtype=scale_dtype
             )
         )

@@ -344,9 +367,10 @@ def _infer_status(model: Module) -> Optional[QuantizationStatus]:
         return None


-def _load_quant_args_from_state_dict(
-    base_name: str, module_name: str, module: Module, state_dict: Dict
+def _load_quant_args_from_mapping(
+    base_name: str, module_name: str, module: Module, mapping: Dict
 ):
+    # TODO: skip update and just register here, don't do it in initialize
     """
     Loads scale and zero point from a state_dict into the specified module

@@ -354,26 +378,37 @@ def _load_quant_args_from_state_dict(
         output_activations
     :param module_name: pytorch module name to look up in state_dict
     :module: pytorch module associated with module_name
-    :state_dict: state_dict to search for matching quantization parameters
+    :mapping: mapping to search fetch paths on disk for a given parameter
     """
     scale_name = f"{base_name}_scale"
     zp_name = f"{base_name}_zero_point"
     g_idx_name = f"{base_name}_g_idx"

-    state_dict_scale = state_dict.get(f"{module_name}.{scale_name}", None)
-    state_dict_zp = state_dict.get(f"{module_name}.{zp_name}", None)
-    state_dict_g_idx = state_dict.get(f"{module_name}.{g_idx_name}", None)
+    state_dict_scale_path = mapping.get(f"{module_name}.{scale_name}", None)
+    state_dict_zp_path = mapping.get(f"{module_name}.{zp_name}", None)
+    state_dict_g_idx_path = mapping.get(f"{module_name}.{g_idx_name}", None)
+
+    if state_dict_g_idx_path is not None:
+        with safe_open(state_dict_g_idx_path, framework="pt", device="cpu") as f:
+            state_dict_g_idx = f.get_tensor(f"{module_name}.{g_idx_name}")
+
+        update_parameter_data(module, state_dict_g_idx, g_idx_name)

-    if state_dict_scale is not None:
+    if state_dict_scale_path is not None:
         # module is quantized
+        with safe_open(state_dict_scale_path, framework="pt", device="cpu") as f:
+            state_dict_scale = f.get_tensor(f"{module_name}.{scale_name}")
+
         update_parameter_data(module, state_dict_scale, scale_name)
-        if state_dict_zp is None:
+
+        if state_dict_zp_path is None:
             # fill in zero point for symmetric quantization
             state_dict_zp = torch.zeros_like(state_dict_scale, device="cpu")
-        update_parameter_data(module, state_dict_zp, zp_name)
+        else:
+            with safe_open(state_dict_zp_path, framework="pt", device="cpu") as f:
+                state_dict_zp = f.get_tensor(f"{module_name}.{zp_name}")

-    if state_dict_g_idx is not None:
-        update_parameter_data(module, state_dict_g_idx, g_idx_name)
+        update_parameter_data(module, state_dict_zp, zp_name)


 def _scheme_from_targets(
compressed_tensors/quantization/lifecycle/initialize.py
CHANGED
@@ -56,6 +56,7 @@ def initialize_module_for_quantization(
     module: Module,
     scheme: Optional[QuantizationScheme] = None,
     force_zero_point: bool = True,
+    scale_dtype: Optional[torch.dtype] = None,
 ):
     """
     attaches appropriate scales, zero points, and observers to a layer
@@ -69,7 +70,10 @@
         if not provided, the layer will be skipped
     :param force_zero_point: whether to force initialization of a zero point for
         symmetric quantization
+    :param scale_dtype: dtype to used for the scales, if overriding the
+        weight dtype as the scale dtype
     """
+    # TODO: don't initialize parameters when running decompression
     scheme = scheme or getattr(module, "quantization_scheme", None)
     if scheme is None:
         # no scheme passed and layer not targeted for quantization - skip
@@ -87,7 +91,9 @@
             "input",
             scheme.input_activations,
             force_zero_point=force_zero_point,
+            scale_dtype=scale_dtype,
         )
+
     if scheme.weights is not None:
         if hasattr(module, "weight"):
             weight_shape = None
@@ -99,6 +105,7 @@
             scheme.weights,
             weight_shape=weight_shape,
             force_zero_point=force_zero_point,
+            scale_dtype=scale_dtype,
         )
     else:
         _LOGGER.warning(
@@ -110,7 +117,7 @@
     if scheme.output_activations is not None:
         if not is_kv_cache_quant_scheme(scheme):
             _initialize_scale_zero_point(
-                module, "output", scheme.output_activations
+                module, "output", scheme.output_activations, scale_dtype=scale_dtype
             )

     module.quantization_scheme = scheme
@@ -136,6 +143,7 @@ def _initialize_scale_zero_point(
     quantization_args: QuantizationArgs,
     weight_shape: Optional[torch.Size] = None,
     force_zero_point: bool = True,
+    scale_dtype: Optional[torch.dtype] = None,
 ):
     if quantization_args.dynamic:
         return
@@ -160,7 +168,10 @@
             num_groups = weight_shape[1] // quantization_args.group_size
             expected_shape = (weight_shape[0], max(num_groups, 1))

-    scale_dtype = module.weight.dtype
+    scale_dtype = scale_dtype if scale_dtype is not None else module.weight.dtype
+    # TODO: consider erroring out in the future as if the dtype if not one fo these,
+    # there is likely bug
+
     if scale_dtype not in [torch.float16, torch.bfloat16, torch.float32]:
         scale_dtype = torch.float16

compressed_tensors/utils/offload.py
CHANGED
@@ -94,22 +94,6 @@ def is_module_offloaded(module: torch.nn.Module) -> bool:
     return has_offloaded_params(module)


-def get_execution_device(module: torch.nn.Module) -> torch.device:
-    """
-    :param module: module to check
-    :return: device module is loaded onto during forward pass
-    """
-    if has_offloaded_params(module):
-        return module._hf_hook.execution_device
-    device = next(module.parameters()).device
-
-    # offload only gets set for leaf modules, fallback to checking for device type
-    if device.type == "meta":
-        return module._hf_hook.execution_device
-
-    return device
-
-
 def get_offloaded_device(module: torch.nn.Module) -> torch.device:
     """
     :param module: module to check
@@ -158,6 +142,26 @@ def update_parameter_data(
 """ Candidates for Upstreaming """


+def get_execution_device(module: torch.nn.Module) -> torch.device:
+    """
+    Get the device which inputs should be moved to before module execution
+
+    :param module: module to check, may be offloaded
+    :return: onload device of module
+    """
+    if has_offloaded_params(module):
+        return module._hf_hook.execution_device
+
+    first_param = next(module.parameters(), None)
+    if first_param is None:
+        warnings.warn(
+            f"Unable able to infer execution device of {module}, falling back to CPU"
+        )
+        return torch.device("cpu")
+
+    return first_param.device
+
+
 def register_offload_parameter(
     module: torch.nn.Module,
     name: str,
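Relocated to the "Candidates for Upstreaming" section, `get_execution_device` now falls back to CPU (with a warning) for parameterless modules instead of relying on the meta-device check. A dependency-free sketch of the same lookup order, using a hypothetical `_hf_hook` attribute to stand in for the accelerate offload hook:

```python
import warnings

import torch


def execution_device(module: torch.nn.Module) -> torch.device:
    # 1) offloaded module: trust the (assumed) accelerate hook's execution device
    hook = getattr(module, "_hf_hook", None)
    if hook is not None and getattr(hook, "execution_device", None) is not None:
        return torch.device(hook.execution_device)

    # 2) otherwise use the device of the first parameter, if any
    first_param = next(module.parameters(), None)
    if first_param is not None:
        return first_param.device

    # 3) no parameters at all: warn and default to CPU
    warnings.warn(f"Unable to infer execution device of {module}, falling back to CPU")
    return torch.device("cpu")


print(execution_device(torch.nn.Linear(4, 4)))  # cpu
print(execution_device(torch.nn.ReLU()))        # cpu, with a warning
```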
@@ -200,7 +204,6 @@ def update_offload_parameter(
         provided, then infer device from parameters on module
     """
     param = getattr(module, name)
-    data = data.to(param.dtype)
     if param.data.shape != data.shape:
         warnings.warn(
             f"Shape of parameter being updated {param.data.shape} does not match shape "
compressed_tensors/utils/safetensors_load.py
CHANGED
@@ -31,7 +31,7 @@ __all__ = [
     "get_weight_mappings",
     "get_nested_weight_mappings",
     "get_nested_mappings_from_state_dict",
-    "get_quantization_state_dict",
+    "get_quantization_parameter_to_path_mapping",
     "is_quantization_param",
 ]

@@ -279,16 +279,18 @@ def get_nested_mappings_from_state_dict(
     return nested_weight_mappings


-def get_quantization_state_dict(model_path: str) -> Dict[str, Tensor]:
+def get_quantization_parameter_to_path_mapping(model_path: str) -> Dict[str, str]:
+    """
+    Given a model path, return a mapping between a parameter and its path
+    on disk
+    """
     weight_mappings = get_weight_mappings(model_path)
-    state_dict = {}
+    mapping = {}
     for weight_name, safe_path in weight_mappings.items():
-        if not is_quantization_param(weight_name):
+        if is_quantization_param(weight_name):
+            mapping[weight_name] = safe_path
             continue
-        with safe_open(safe_path, framework="pt", device="cpu") as f:
-            state_dict[weight_name] = f.get_tensor(weight_name)
-
-    return state_dict
+    return mapping


 def is_quantization_param(name: str) -> bool:
compressed_tensors/version.py
CHANGED
@@ -17,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE

-__version__ = version = '0.9.4.a20250414'
-__version_tuple__ = version_tuple = (0, 9, 4)
+__version__ = version = '0.9.5.a20250424'
+__version_tuple__ = version_tuple = (0, 9, 5)
{compressed_tensors-0.9.4a20250414.dist-info → compressed_tensors-0.9.5a20250424.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.9.4a20250414
+Version: 0.9.5a20250424
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
{compressed_tensors-0.9.4a20250414.dist-info → compressed_tensors-0.9.5a20250424.dist-info}/RECORD
RENAMED
@@ -1,17 +1,17 @@
 compressed_tensors/__init__.py,sha256=UtKmifNeBCSE2TZSAfduVNNzHY-3V7bLjZ7n7RuXLOE,812
 compressed_tensors/base.py,sha256=73HYH7HY7O2roC89yG_piPFnZwrBfn_i7HmKl90SKc0,875
-compressed_tensors/version.py,sha256=
+compressed_tensors/version.py,sha256=fMpLfUNedNFTmTmQeHxGZnMaXAKOKiqpI9xyx46F2gI,521
 compressed_tensors/compressors/__init__.py,sha256=smSygTSfcfuujRrAXDc6uZm4L_ccV1tWZewqVnOb4lM,825
-compressed_tensors/compressors/base.py,sha256=
+compressed_tensors/compressors/base.py,sha256=nvWsv4xEw1Tkxkxth6TmHplDYXfBeP22xWxOsZERyDY,7204
 compressed_tensors/compressors/helpers.py,sha256=OK6qxX9j3bHwF9JfIYSGMgBJe2PWjlTA3byXKCJaTIQ,5431
 compressed_tensors/compressors/model_compressors/__init__.py,sha256=5RGGPFu4YqEt_aOdFSQYFYFDjcZFJN0CsMqRtDZz3Js,666
-compressed_tensors/compressors/model_compressors/model_compressor.py,sha256=
+compressed_tensors/compressors/model_compressors/model_compressor.py,sha256=gZvhGSMYIWvLiH0Xl2dmh7PxfyLHAX5nFBvIUUDE6Qc,27451
 compressed_tensors/compressors/quantized_compressors/__init__.py,sha256=09UJq68Pht6Bf-4iP9xYl3tetKsncNPHD8IAGbePsr4,714
-compressed_tensors/compressors/quantized_compressors/base.py,sha256=
+compressed_tensors/compressors/quantized_compressors/base.py,sha256=PWSPLQ7zBBjHfQyHUqr9D-mGYLe5WczJHMSRZWCOxOI,9189
 compressed_tensors/compressors/quantized_compressors/naive_quantized.py,sha256=fd0KlkSx6bvZ3xwIkK3jEUdPSUPs56Eua4dEDOtzKW0,5150
-compressed_tensors/compressors/quantized_compressors/pack_quantized.py,sha256=
+compressed_tensors/compressors/quantized_compressors/pack_quantized.py,sha256=SPIHlk8ewip2LcjgkCw02K21EkfUSFSd9qQqL0Pt5eM,11162
 compressed_tensors/compressors/sparse_compressors/__init__.py,sha256=Atuz-OdEgn8OCUhx7Ovd6gXdyImAI186uCR-uR0t_Nk,737
-compressed_tensors/compressors/sparse_compressors/base.py,sha256=
+compressed_tensors/compressors/sparse_compressors/base.py,sha256=PMiWIaW2XSF_esYJlQ12RVW7opeAzavdbkRFtelMFX0,6655
 compressed_tensors/compressors/sparse_compressors/dense.py,sha256=_uW_HISeDNz4yboSZWoh6GwrkUE6HFibzPQSKrHOCkg,1505
 compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py,sha256=mEKSSgpXookqYSJw3mlyP6cYYKD-eaIvpQMvi4JO6TY,8807
 compressed_tensors/compressors/sparse_compressors/sparse_bitmask.py,sha256=S8vW0FI9ep_XtUQOxj0P5utJt3vKEYOHjWEPp-Xd9aY,5820
@@ -29,24 +29,24 @@ compressed_tensors/quantization/quant_args.py,sha256=sKpb8DcNObidjXjNol1Tn_Iih3Z
 compressed_tensors/quantization/quant_config.py,sha256=MxSUcb5dOqMN6LFyD5K2h8X0TvEtcWIAoiUJqD2dHGE,10159
 compressed_tensors/quantization/quant_scheme.py,sha256=yz0oMbbwp7QZXXd2k5KIJu-Q6aTqg2929VdUzZ7vysM,6324
 compressed_tensors/quantization/lifecycle/__init__.py,sha256=_uItzFWusyV74Zco_pHLOTdE9a83cL-R-ZdyQrBkIyw,772
-compressed_tensors/quantization/lifecycle/apply.py,sha256=
+compressed_tensors/quantization/lifecycle/apply.py,sha256=OR-6QmN9pFRGteYMBAatu2T5qHutQt7Iw3jH4DILvEk,18071
 compressed_tensors/quantization/lifecycle/compressed.py,sha256=Fj9n66IN0EWsOAkBHg3O0GlOQpxstqjCcs0ttzMXrJ0,2296
 compressed_tensors/quantization/lifecycle/forward.py,sha256=DOWouUqfaLA4Qhg-ojVVBdhhSAlgZqFC26vZARxE0ko,12961
 compressed_tensors/quantization/lifecycle/helpers.py,sha256=C0mhy2vJ0fCjVeN4kFNhw8Eq1wkteBGHiZ36RVLThRY,944
-compressed_tensors/quantization/lifecycle/initialize.py,sha256=
+compressed_tensors/quantization/lifecycle/initialize.py,sha256=SY4-FJWpVSupQjuvy7rrIc0pFYU9cRL5Lo1KyfUSvoU,8010
 compressed_tensors/quantization/utils/__init__.py,sha256=VdtEmP0bvuND_IGQnyqUPc5lnFp-1_yD7StKSX4x80w,656
 compressed_tensors/quantization/utils/helpers.py,sha256=-wX0H7zVysJ67jRRCGbx6BfxbMU_1sqffTf5YUIpPiU,14391
 compressed_tensors/registry/__init__.py,sha256=FwLSNYqfIrb5JD_6OK_MT4_svvKTN_nEhpgQlQvGbjI,658
 compressed_tensors/registry/registry.py,sha256=vRcjVB1ITfSbfYUaGndBBmqhip_5vsS62weorVg0iXo,11896
 compressed_tensors/utils/__init__.py,sha256=gS4gSU2pwcAbsKj-6YMaqhm25udFy6ISYaWBf-myRSM,808
 compressed_tensors/utils/helpers.py,sha256=RrNvzD08naEjEiXdU-FdZjQVda1nQywu1hA_GCDj0vg,10415
-compressed_tensors/utils/offload.py,sha256=
+compressed_tensors/utils/offload.py,sha256=Fmb4jBJhH5OdSQFaecFSHK_UreSyZdynEkadZ_oKcvM,14153
 compressed_tensors/utils/permutations_24.py,sha256=kx6fsfDHebx94zsSzhXGyCyuC9sVyah6BUUir_StT28,2530
 compressed_tensors/utils/permute.py,sha256=V6tJLKo3Syccj-viv4F7ZKZgJeCB-hl-dK8RKI_kBwI,2355
-compressed_tensors/utils/safetensors_load.py,sha256=
+compressed_tensors/utils/safetensors_load.py,sha256=rwj0ufU5561ScWDoCG7tzLBRDtiykNno2Iq4PM_JA7E,11499
 compressed_tensors/utils/semi_structured_conversions.py,sha256=XKNffPum54kPASgqKzgKvyeqWPAkair2XEQXjkp7ho8,13489
-compressed_tensors-0.9.
-compressed_tensors-0.9.
-compressed_tensors-0.9.
-compressed_tensors-0.9.
-compressed_tensors-0.9.
+compressed_tensors-0.9.5a20250424.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+compressed_tensors-0.9.5a20250424.dist-info/METADATA,sha256=P0oAhrS28ZU90nUEi9yjIu3CE-968yZTsTLTx1Uj1nM,7004
+compressed_tensors-0.9.5a20250424.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
+compressed_tensors-0.9.5a20250424.dist-info/top_level.txt,sha256=w2i-GyPs2s1UwVxvutSvN_lM22SXC2hQFBmoMcPnV7Y,19
+compressed_tensors-0.9.5a20250424.dist-info/RECORD,,
File without changes
|
File without changes
|