compressed-tensors 0.9.1__tar.gz → 0.9.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/PKG-INFO +23 -3
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/compressors/base.py +9 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/compressors/model_compressors/model_compressor.py +120 -6
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/compressors/quantized_compressors/base.py +33 -4
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/compressors/quantized_compressors/naive_quantized.py +12 -6
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py +36 -13
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/compressors/sparse_compressors/base.py +2 -3
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/compressors/sparse_compressors/dense.py +8 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py +11 -5
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/compressors/sparse_compressors/sparse_bitmask.py +7 -1
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/compressors/sparse_quantized_compressors/marlin_24.py +8 -2
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/linear/compressed_linear.py +25 -6
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/quantization/lifecycle/apply.py +17 -12
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/quantization/lifecycle/initialize.py +2 -3
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/quantization/quant_args.py +8 -9
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/utils/helpers.py +7 -3
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/utils/offload.py +7 -1
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/utils/safetensors_load.py +7 -5
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/version.py +1 -1
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors.egg-info/PKG-INFO +23 -3
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors.egg-info/SOURCES.txt +3 -1
- compressed_tensors-0.9.3/tests/test_registry.py +53 -0
- compressed_tensors-0.9.3/tests/testing_utils.py +144 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/LICENSE +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/README.md +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/pyproject.toml +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/setup.cfg +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/setup.py +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/__init__.py +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/base.py +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/compressors/__init__.py +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/compressors/helpers.py +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/compressors/model_compressors/__init__.py +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/compressors/quantized_compressors/__init__.py +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/compressors/sparse_compressors/__init__.py +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/compressors/sparse_quantized_compressors/__init__.py +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/config/__init__.py +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/config/base.py +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/config/dense.py +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/config/sparse_24_bitmask.py +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/config/sparse_bitmask.py +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/linear/__init__.py +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/quantization/__init__.py +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/quantization/lifecycle/__init__.py +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/quantization/lifecycle/compressed.py +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/quantization/lifecycle/forward.py +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/quantization/lifecycle/helpers.py +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/quantization/quant_config.py +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/quantization/quant_scheme.py +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/quantization/utils/__init__.py +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/quantization/utils/helpers.py +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/registry/__init__.py +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/registry/registry.py +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/utils/__init__.py +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/utils/permutations_24.py +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/utils/permute.py +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/utils/semi_structured_conversions.py +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors.egg-info/dependency_links.txt +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors.egg-info/requires.txt +0 -0
- {compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors.egg-info/top_level.txt +0 -0
{compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/PKG-INFO
RENAMED
@@ -1,15 +1,35 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.9.1
+Version: 0.9.3
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
 Author-email: support@neuralmagic.com
 License: Apache 2.0
 Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: torch>=1.7.0
+Requires-Dist: transformers
+Requires-Dist: pydantic>=2.0
 Provides-Extra: dev
+Requires-Dist: black==22.12.0; extra == "dev"
+Requires-Dist: isort==5.8.0; extra == "dev"
+Requires-Dist: wheel>=0.36.2; extra == "dev"
+Requires-Dist: flake8>=3.8.3; extra == "dev"
+Requires-Dist: pytest>=6.0.0; extra == "dev"
+Requires-Dist: nbconvert>=7.16.3; extra == "dev"
 Provides-Extra: accelerate
-
+Requires-Dist: accelerate; extra == "accelerate"
+Dynamic: author
+Dynamic: author-email
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: license
+Dynamic: license-file
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: summary
 
 # compressed-tensors
 
{compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/compressors/base.py
RENAMED
@@ -77,6 +77,15 @@ class BaseCompressor(RegistryMixin, ABC):
         """
         raise NotImplementedError()
 
+    @property
+    @abstractmethod
+    def compression_param_names(self) -> Tuple[str]:
+        """
+        Returns a tuple of compression parameter names introduced by
+        the compressor during compression
+        """
+        raise NotImplementedError()
+
     @abstractmethod
     def compress(
         self,
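The `compression_param_names` property replaces the old `COMPRESSION_PARAM_NAMES` class attribute across every compressor in this release. A minimal sketch of the new contract (toy subclass, not taken from the diff):

```python
from abc import ABC, abstractmethod
from typing import Tuple


class BaseCompressor(ABC):
    """Trimmed stand-in for compressed_tensors' BaseCompressor."""

    @property
    @abstractmethod
    def compression_param_names(self) -> Tuple[str]:
        """Names of the parameters this compressor writes to the checkpoint."""
        raise NotImplementedError()


class ToyBitmaskCompressor(BaseCompressor):
    # mirrors the tuple the real sparse bitmask compressor returns below
    @property
    def compression_param_names(self) -> Tuple[str]:
        return ("shape", "compressed", "bitmask", "row_offsets")


print(ToyBitmaskCompressor().compression_param_names)
# ('shape', 'compressed', 'bitmask', 'row_offsets')
```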
{compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/compressors/model_compressors/model_compressor.py
RENAMED
@@ -19,7 +19,7 @@ import os
 import re
 from contextlib import contextmanager
 from copy import deepcopy
-from typing import TYPE_CHECKING, Any, Dict, Optional, Set, TypeVar, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, TypeVar, Union
 
 import compressed_tensors
 import torch
@@ -39,13 +39,17 @@ from compressed_tensors.quantization import (
     apply_quantization_config,
     load_pretrained_quantization,
 )
-from compressed_tensors.quantization.lifecycle import expand_sparse_target_names
+from compressed_tensors.quantization.lifecycle import expand_target_names
 from compressed_tensors.quantization.quant_args import QuantizationArgs
 from compressed_tensors.quantization.utils import (
     is_module_quantized,
     iter_named_leaf_modules,
 )
-from compressed_tensors.utils import get_safetensors_folder, update_parameter_data
+from compressed_tensors.utils import (
+    get_safetensors_folder,
+    merge_names,
+    update_parameter_data,
+)
 from compressed_tensors.utils.helpers import (
     fix_fsdp_module_name,
     is_compressed_tensors_config,
@@ -254,6 +258,107 @@ class ModelCompressor:
             quantization_config.format, config=quantization_config
         )
 
+    def get_missing_module_keys(self, model: Module) -> List[str]:
+        """
+        Identifies the expected missing weight keys in the compressed state_dict.
+
+        When a model undergoes sparsity or quantization compression, certain
+        weight tensors may be absent from the checkpoint by virtue of compression.
+        This function determines which weight keys are missing based on the
+        applied compression techniques.
+
+
+        :param model: The PyTorch model to check for missing keys.
+        :return: A list of missing keys expected in the compressed state_dict.
+        """
+        missing_keys = set()
+
+        # Determine missing keys due to sparsity compression
+        if (
+            self.sparsity_compressor
+            and self.sparsity_config.format != CompressionFormat.dense.value
+        ):
+            sparse_targets = expand_target_names(
+                model=model,
+                targets=self.sparsity_config.targets,
+                ignore=self.sparsity_config.ignore,
+            )
+            missing_keys.update(
+                merge_names(target, "weight") for target in sparse_targets
+            )
+
+        # Determine missing keys due to pack quantization
+        if (
+            self.quantization_compressor
+            and self.quantization_config.format
+            == CompressionFormat.pack_quantized.value
+        ):
+            for scheme in self.quantization_config.config_groups.values():
+                quant_targets = expand_target_names(
+                    model=model,
+                    targets=scheme.targets,
+                    ignore=self.quantization_config.ignore,
+                )
+                missing_keys.update(
+                    merge_names(target, "weight") for target in quant_targets
+                )
+
+        return list(missing_keys)
+
+    def get_unexpected_file_keys(self, model: Module) -> List[str]:
+        """
+        Identifies extra keys introduced by the compression process in the
+        compressed state_dict that are not expected by the model graph.
+
+        During sparsity or quantization compression, additional metadata or
+        auxiliary parameters may be stored in the checkpoint, which do not
+        correspond to any parameter in the original model. These keys are
+        typically introduced to support the reconstruction of compressed weights.
+
+        For example, Sparse24Bitmask compression may introduce keys such as
+        'compressed', 'bitmask', and 'shape' in the checkpoint, which are
+        not part of the original model parameters.
+
+        :param model: The PyTorch model to check for unexpected keys.
+        :return: A list of extra keys introduced by the compression process
+            that are not expected by the model.
+        """
+
+        unexpected_keys = set()
+
+        # Identify unexpected keys from sparsity compression
+        if (
+            self.sparsity_compressor
+            and self.sparsity_config.format != CompressionFormat.dense.value
+        ):
+            sparse_targets: Set[str] = expand_target_names(
+                model=model,
+                targets=self.sparsity_config.targets,
+                ignore=self.sparsity_config.ignore,
+            )
+            unexpected_keys.update(
+                merge_names(target, param)
+                for target in sparse_targets
+                for param in self.sparsity_compressor.compression_param_names
+            )
+
+        # Identify unexpected keys from quantization compression
+        if self.quantization_compressor:
+            for scheme in self.quantization_config.config_groups.values():
+                quant_targets: Set[str] = expand_target_names(
+                    model=model,
+                    targets=scheme.targets,
+                    ignore=self.quantization_config.ignore,
+                )
+                unexpected_keys.update(
+                    merge_names(target, param)
+                    for target in quant_targets
+                    for param in self.quantization_compressor.compression_param_names
+                    if param != "weight"
+                )
+
+        return list(unexpected_keys)
+
     def compress(
         self, model: Module, state_dict: Optional[Dict[str, Tensor]] = None
     ) -> Dict[str, Tensor]:
@@ -283,7 +388,7 @@ class ModelCompressor:
         )
 
         if self.sparsity_compressor is not None:
-            sparse_compression_targets: Set[str] = expand_sparse_target_names(
+            sparse_compression_targets: Set[str] = expand_target_names(
                 model=model,
                 targets=self.sparsity_config.targets,
                 ignore=self.sparsity_config.ignore,
@@ -417,10 +522,13 @@ class ModelCompressor:
             update_parameter_data(module, data, param_name)
 
 
-def map_modules_to_quant_args(model: Module) -> Dict[str, QuantizationArgs]:
+def map_modules_to_quant_args(
+    model: Module,
+) -> Dict[str, Union[QuantizationArgs, Tuple[QuantizationArgs, QuantizationArgs]]]:
     """
     Given a pytorch model, map out the submodule name (usually linear layers)
-    to the QuantizationArgs
+    to the weight QuantizationArgs. If running input activation quantization, will also
+    map to the input QuantizationArgs in a tuple.
 
     :param model: pytorch model
     """
@@ -430,6 +538,12 @@ def map_modules_to_quant_args(model: Module) -> Dict[str, QuantizationArgs]:
         if submodule.quantization_scheme.weights is not None:
             name = fix_fsdp_module_name(name)
             quantized_modules_to_args[name] = submodule.quantization_scheme.weights
+            if submodule.quantization_scheme.input_activations is not None:
+                weight_args = quantized_modules_to_args.get(name)
+                quantized_modules_to_args[name] = (
+                    weight_args,
+                    submodule.quantization_scheme.input_activations,
+                )
 
     return quantized_modules_to_args
 
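Both new helpers reduce to expanding target names and joining them with parameter names. A toy sketch of that key arithmetic (module names here are hypothetical; `merge_names` in compressed-tensors joins a module path and a parameter name with a dot):

```python
# Toy illustration, not from the diff: how target expansion plus merge_names
# yields the checkpoint keys reported as missing/unexpected.
def merge_names(parent: str, child: str) -> str:
    return f"{parent}.{child}"

sparse_targets = {"model.layers.0.self_attn.q_proj", "model.layers.0.mlp.gate_proj"}
compression_param_names = ("shape", "compressed", "bitmask")  # e.g. Sparse24Bitmask

# keys the dense graph expects but the compressed checkpoint omits
missing = {merge_names(t, "weight") for t in sparse_targets}

# keys the compressed checkpoint adds that the dense graph does not expect
unexpected = {
    merge_names(t, p) for t in sparse_targets for p in compression_param_names
}
print(sorted(missing))
print(sorted(unexpected))
```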
{compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/compressors/quantized_compressors/base.py
RENAMED
@@ -82,11 +82,32 @@ class BaseQuantizationCompressor(BaseCompressor):
         """
         compressed_dict = {}
         weight_suffix = ".weight"
+        input_zp_suffix = ".input_zero_point"
+        weight_zp_suffix = ".weight_zero_point"
         _LOGGER.debug(
             f"Compressing model with {len(model_state)} parameterized layers..."
         )
 
         for name, value in tqdm(model_state.items(), desc="Quantized Compression"):
+            # check if the parameter we're compressing is the weight zp
+            # or the input zp
+            is_weight_zp = name.endswith(weight_zp_suffix)
+            is_input_zp = name.endswith(input_zp_suffix)
+
+            # if we're saving the weight zp, fetch weight quant args
+            if is_weight_zp:
+                quant_args_zp = names_to_scheme.get(name[: -(len(weight_zp_suffix))])
+                if isinstance(quant_args_zp, tuple):
+                    # If tuple, first value is weight args, second is input args
+                    quant_args_zp = quant_args_zp[0]
+
+            # if we're saving the input zp, fetch input quant args
+            if is_input_zp:
+                input_args_zp = names_to_scheme.get(name[: -(len(input_zp_suffix))])
+                if isinstance(input_args_zp, tuple):
+                    # If tuple, first value is weight args, second is input args
+                    input_args_zp = input_args_zp[-1]
+
             if name.endswith(weight_suffix):
                 prefix = name[: -(len(weight_suffix))]
                 scale = model_state.get(merge_names(prefix, "weight_scale"), None)
@@ -94,7 +115,11 @@ class BaseQuantizationCompressor(BaseCompressor):
                 g_idx = model_state.get(merge_names(prefix, "weight_g_idx"), None)
                 if scale is not None:
                     # weight is quantized, compress it
-                    quant_args = names_to_scheme[prefix]
+                    if isinstance(names_to_scheme[prefix], tuple):
+                        quant_args = names_to_scheme[prefix][0]
+                    else:
+                        quant_args = names_to_scheme[prefix]
+
                     compressed_data = self.compress_weight(
                         weight=value,
                         scale=scale,
@@ -107,7 +132,11 @@ class BaseQuantizationCompressor(BaseCompressor):
                         compressed_dict[merge_names(prefix, key)] = value
                 else:
                     compressed_dict[name] = value.to("cpu")
-            elif name.endswith("zero_point") and torch.all(value == 0):
+            # only save if asym
+            elif is_weight_zp and quant_args_zp.symmetric:
+                continue
+            # only save if asym
+            elif is_input_zp and input_args_zp.symmetric:
                 continue
             elif name.endswith("g_idx") and torch.any(value <= -1):
                 continue
@@ -144,7 +173,7 @@ class BaseQuantizationCompressor(BaseCompressor):
 
     def _decompress_from_path(self, path_to_model, names_to_scheme, device):
         weight_mappings = get_nested_weight_mappings(
-            path_to_model, self.COMPRESSION_PARAM_NAMES
+            path_to_model, self.compression_param_names
         )
         for weight_name in weight_mappings.keys():
             weight_data = {}
@@ -161,7 +190,7 @@ class BaseQuantizationCompressor(BaseCompressor):
 
     def _decompress_from_state_dict(self, state_dict, names_to_scheme):
         weight_mappings = get_nested_mappings_from_state_dict(
-            state_dict, self.COMPRESSION_PARAM_NAMES
+            state_dict, self.compression_param_names
        )
         for weight_name in weight_mappings.keys():
             weight_data = {}
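With input-activation quantization, `names_to_scheme` values may now be either a bare weight `QuantizationArgs` or a `(weight_args, input_args)` tuple, which is why the compressor unpacks tuples before reading `symmetric`. A runnable sketch with stand-in args objects:

```python
# Sketch of the two shapes the updated compressor now accepts; `Args` is a
# hypothetical stand-in for QuantizationArgs so this stays self-contained.
class Args:
    def __init__(self, symmetric: bool):
        self.symmetric = symmetric

weight_args, input_args = Args(symmetric=True), Args(symmetric=False)
names_to_scheme = {
    "model.layers.0.q_proj": weight_args,                # weight-only
    "model.layers.0.k_proj": (weight_args, input_args),  # weight + input acts
}

entry = names_to_scheme["model.layers.0.k_proj"]
quant_args = entry[0] if isinstance(entry, tuple) else entry      # weight zp path
input_zp_args = entry[-1] if isinstance(entry, tuple) else entry  # input zp path
# symmetric zero points are all-zero, so the compressor skips saving them
print(quant_args.symmetric, input_zp_args.symmetric)  # True False
```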
{compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/compressors/quantized_compressors/naive_quantized.py
RENAMED
@@ -41,12 +41,18 @@ class NaiveQuantizationCompressor(BaseQuantizationCompressor):
     type to the type specified by the layer's QuantizationArgs.
     """
 
-    COMPRESSION_PARAM_NAMES = [
-        "weight",
-        "weight_scale",
-        "weight_zero_point",
-        "weight_g_idx",
-    ]
+    @property
+    def compression_param_names(self) -> Tuple[str]:
+        """
+        Returns a tuple of compression parameter names introduced by
+        the compressor during compression
+        """
+        return (
+            "weight",
+            "weight_scale",
+            "weight_zero_point",
+            "weight_g_idx",
+        )
 
     def compression_param_info(
         self,
{compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py
RENAMED
@@ -36,13 +36,19 @@ class PackedQuantizationCompressor(BaseQuantizationCompressor):
     Compresses a quantized model by packing every eight 4-bit weights into an int32
     """
 
-    COMPRESSION_PARAM_NAMES = [
-        "weight_packed",
-        "weight_scale",
-        "weight_zero_point",
-        "weight_g_idx",
-        "weight_shape",
-    ]
+    @property
+    def compression_param_names(self) -> Tuple[str]:
+        """
+        Returns a tuple of compression parameter names introduced by
+        the compressor during compression
+        """
+        return (
+            "weight_packed",
+            "weight_scale",
+            "weight_zero_point",
+            "weight_g_idx",
+            "weight_shape",
+        )
 
     def compression_param_info(
         self,
@@ -138,8 +144,20 @@ def pack_to_int32(value: torch.Tensor, num_bits: int) -> torch.Tensor:
     """
     Packs a tensor of quantized weights stored in int8 into int32s with padding
 
+    Pseudocode:
+     1. Shift wrt num_bits to convert to unsigned. num_bits=8
+        [1,2] -> [129, 130]
+     2. Pad to fill in 32 bits
+        [129, 130] -> [129, 130, 0, 0]
+     3. convert to binary align in order
+        [129, 130, 0, 0] -> 00000000 00000000 10000010 10000001
+     4. convert aligned binary to number
+        00000000000000001000001010000001 -> 33409
+     5. covert back to uint32
+        33409 -> 33409
+
     :param value: tensor to pack
-    :param num_bits: number of bits used to store underlying data
+    :param num_bits: number of bits used to store underlying data, must be at least 1
     :returns: packed int32 tensor
     """
     if value.dtype is not torch.int8:
@@ -148,19 +166,22 @@ def pack_to_int32(value: torch.Tensor, num_bits: int) -> torch.Tensor:
     if num_bits > 8:
         raise ValueError("Packing is only supported for less than 8 bits")
 
+    if num_bits < 1:
+        raise ValueError(f"num_bits must be at least 1, got {num_bits}")
+
     # convert to unsigned for packing
-    offset = pow(2, num_bits) // 2
+    offset = 1 << (num_bits - 1)
     value = (value + offset).to(torch.uint8)
     value = value.cpu().numpy().astype(np.uint32)
     pack_factor = 32 // num_bits
 
     # pad input tensor and initialize packed output
     packed_size = math.ceil(value.shape[1] / pack_factor)
-    packed = np.zeros((value.shape[0], packed_size), dtype=np.uint32)
-    padding = packed.shape[1] * pack_factor - value.shape[1]
+    padding = packed_size * pack_factor - value.shape[1]
     value = np.pad(value, pad_width=[(0, 0), (0, padding)], constant_values=0)
 
     # pack values
+    packed = np.zeros((value.shape[0], packed_size), dtype=np.uint32)
     for i in range(pack_factor):
         packed |= value[:, i::pack_factor] << num_bits * i
 
@@ -174,7 +195,9 @@ def unpack_from_int32(
 ) -> torch.Tensor:
     """
     Unpacks a tensor of packed int32 weights into individual int8s, maintaining the
-    original bit range
+    original bit range.
+
+    Return tensors in int8
 
     :param value: tensor to upack
     :param num_bits: number of bits to unpack each data point into
@@ -192,7 +215,7 @@ def unpack_from_int32(
     pack_factor = 32 // num_bits
 
     # unpack
-    mask = pow(2, num_bits) - 1
+    mask = (1 << num_bits) - 1
     unpacked = torch.zeros(
         (value.shape[0], value.shape[1] * pack_factor),
         device=value.device,
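A round-trip sketch of the packing scheme documented in the pseudocode above, reimplemented standalone in NumPy (it mirrors the shift/pad/interleave steps but is not the library code itself):

```python
import math
import numpy as np

def pack_rows(value: np.ndarray, num_bits: int) -> np.ndarray:
    """Pack signed num_bits integers (row-wise) into uint32 words."""
    offset = 1 << (num_bits - 1)             # shift to unsigned: -8..7 -> 0..15
    unsigned = (value + offset).astype(np.uint32)
    pack_factor = 32 // num_bits             # 8 values per word for num_bits=4
    packed_size = math.ceil(value.shape[1] / pack_factor)
    padding = packed_size * pack_factor - value.shape[1]
    unsigned = np.pad(unsigned, [(0, 0), (0, padding)], constant_values=0)
    packed = np.zeros((value.shape[0], packed_size), dtype=np.uint32)
    for i in range(pack_factor):             # interleave values into each word
        packed |= unsigned[:, i::pack_factor] << (num_bits * i)
    return packed

def unpack_rows(packed: np.ndarray, num_bits: int, original_cols: int) -> np.ndarray:
    mask = (1 << num_bits) - 1
    pack_factor = 32 // num_bits
    out = np.zeros((packed.shape[0], packed.shape[1] * pack_factor), dtype=np.uint32)
    for i in range(pack_factor):
        out[:, i::pack_factor] = (packed >> (num_bits * i)) & mask
    offset = 1 << (num_bits - 1)
    return out[:, :original_cols].astype(np.int32) - offset  # back to signed

vals = np.array([[-8, -1, 0, 7, 3]], dtype=np.int8)
assert (unpack_rows(pack_rows(vals, 4), 4, 5) == vals).all()
```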
{compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/compressors/sparse_compressors/base.py
RENAMED
@@ -30,8 +30,7 @@ _LOGGER: logging.Logger = logging.getLogger(__name__)
 class BaseSparseCompressor(BaseCompressor):
     """
     Base class representing a sparse compression algorithm. Each child class should
-    implement compress_weight and decompress_weight; additionally, child
-    classes should also define COMPRESSION_PARAM_NAMES.
+    implement compression_param_names, compress_weight and decompress_weight;
 
     Compressors support compressing/decompressing a full module state dict or a single
     quantized PyTorch leaf module.
@@ -113,7 +112,7 @@ class BaseSparseCompressor(BaseCompressor):
         """
         weight_mappings, ignored_params = get_nested_weight_mappings(
             path_to_model_or_tensors,
-            self.COMPRESSION_PARAM_NAMES,
+            self.compression_param_names,
             return_unmatched_params=True,
         )
         for weight_name in weight_mappings.keys():
{compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/compressors/sparse_compressors/dense.py
RENAMED
@@ -25,6 +25,14 @@ class DenseCompressor(BaseCompressor):
     Identity compressor for dense models, returns the original state_dict
     """
 
+    @property
+    def compression_param_names(self) -> Tuple[str]:
+        """
+        Returns a tuple of compression parameter names introduced by
+        the compressor during compression
+        """
+        return ()
+
     def compress(self, model_state: Dict[str, Tensor], **kwargs) -> Dict[str, Tensor]:
         return model_state
 
{compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py
RENAMED
@@ -40,11 +40,17 @@ class Sparse24BitMaskCompressor(BaseSparseCompressor):
     values tensor, with their locations stored in a 2d bitmask
     """
 
-    COMPRESSION_PARAM_NAMES = [
-        "shape",
-        "compressed",
-        "bitmask",
-    ]
+    @property
+    def compression_param_names(self) -> Tuple[str]:
+        """
+        Returns a tuple of compression parameter names introduced by
+        the compressor during compression
+        """
+        return (
+            "shape",
+            "compressed",
+            "bitmask",
+        )
 
     def compress_weight(self, name, value):
         bitmask_tensor = Sparse24BitMaskTensor.from_dense(
{compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/compressors/sparse_compressors/sparse_bitmask.py
RENAMED
@@ -38,7 +38,13 @@ class BitmaskCompressor(BaseSparseCompressor):
     values tensor, with their locations stored in a 2d bitmask
     """
 
-    COMPRESSION_PARAM_NAMES = ["shape", "compressed", "bitmask", "row_offsets"]
+    @property
+    def compression_param_names(self) -> Tuple[str]:
+        """
+        Returns a tuple of compression parameter names introduced by
+        the compressor during compression
+        """
+        return ("shape", "compressed", "bitmask", "row_offsets")
 
     def compress_weight(self, name, value):
         bitmask_tensor = BitmaskTensor.from_dense(value)
{compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/compressors/sparse_quantized_compressors/marlin_24.py
RENAMED
@@ -42,8 +42,6 @@ class Marlin24Compressor(BaseCompressor):
     Marlin24 kernel. Decompression is not implemented for this compressor.
     """
 
-    COMPRESSION_PARAM_NAMES = ["weight_packed", "scale_packed", "meta"]
-
     @staticmethod
     def validate_quant_compatability(
         model_quant_args: Dict[str, QuantizationArgs]
@@ -105,6 +103,14 @@ class Marlin24Compressor(BaseCompressor):
 
         return True
 
+    @property
+    def compression_param_names(self) -> Tuple[str]:
+        """
+        Returns a tuple of compression parameter names introduced by
+        the compressor during compression
+        """
+        return ("weight_packed", "scale_packed", "meta")
+
     def compress(
         self,
         model_state: Dict[str, Tensor],
{compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/linear/compressed_linear.py
RENAMED
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import warnings
 from typing import Dict, Tuple
 
 import torch
@@ -21,6 +22,7 @@ from compressed_tensors.quantization import (
     QuantizationStatus,
     initialize_module_for_quantization,
 )
+from compressed_tensors.utils import register_offload_parameter
 from torch import Tensor
 from torch.nn import Parameter
 from torch.nn.functional import linear
@@ -32,11 +34,16 @@ class CompressedLinear(Linear):
     Wrapper module for running a compressed forward pass of a quantized Linear module.
     The wrapped layer will decompressed on each forward call.
 
-    :param module: dense linear module to replace
-    :param quantization_scheme: quantization config for the module to wrap
-    :param quantization_format: compression format module is stored as
     """
 
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        warnings.warn(
+            "CompressedLinear should not be initialized directly. "
+            "Use the from_linear method instead.",
+            UserWarning,
+        )
+
     @classmethod
     @torch.no_grad()
     def from_linear(
@@ -45,6 +52,12 @@ class CompressedLinear(Linear):
         quantization_scheme: QuantizationScheme,
         quantization_format: str,
     ):
+        """
+        :param module: dense linear module to replace
+        :param quantization_scheme: quantization config for the module to wrap
+        :param quantization_format: compression format module is stored as
+        :return: CompressedLinear module wrapping the input module
+        """
         module.__class__ = CompressedLinear
         module.compressor = BaseCompressor.load_from_registry(quantization_format)
         device = next(module.parameters()).device
@@ -68,7 +81,7 @@ class CompressedLinear(Linear):
             param = Parameter(
                 torch.empty(shape, device=device, dtype=dtype), requires_grad=False
             )
-            module.register_parameter(name, param)
+            register_offload_parameter(module, name, param)
 
         # mark module as compressed
         module.quantization_status = QuantizationStatus.COMPRESSED
@@ -85,5 +98,11 @@ class CompressedLinear(Linear):
         """
         Decompresses the weight, then runs the wrapped forward pass
         """
-        weight_data = self.compressor.decompress_module(self)
-        return linear(input, weight_data, self.bias)
+        if self.quantization_status == QuantizationStatus.COMPRESSED:
+            weight_data = self.compressor.decompress_module(self)
+            param = Parameter(weight_data, requires_grad=False)
+            register_offload_parameter(self, "weight", param)
+
+            self.quantization_status = QuantizationStatus.FROZEN
+
+        return linear(input, self.weight, self.bias)
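The reworked `forward` decompresses once, registers the result as a real `weight` parameter, and flips the status to FROZEN so later calls skip decompression. The caching pattern in isolation (toy module with a made-up int8-times-scale "decompression"; not the library class):

```python
import torch
from torch import nn
from torch.nn.functional import linear

class DecompressOnFirstForward(nn.Module):
    """Toy version of the pattern: decompress once, cache, then reuse."""
    def __init__(self, packed_weight: torch.Tensor, scale: torch.Tensor):
        super().__init__()
        self.register_buffer("packed_weight", packed_weight)  # e.g. int8 payload
        self.register_buffer("scale", scale)
        self.weight = None          # filled on first call, like FROZEN status
        self.bias = None

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.weight is None:     # mirrors the COMPRESSED -> FROZEN switch
            self.weight = self.packed_weight.float() * self.scale
        return linear(x, self.weight, self.bias)

m = DecompressOnFirstForward(
    torch.randint(-8, 8, (4, 3), dtype=torch.int8), torch.tensor(0.1)
)
print(m(torch.randn(2, 3)).shape)  # torch.Size([2, 4])
```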
{compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/quantization/lifecycle/apply.py
RENAMED
@@ -52,8 +52,8 @@ __all__ = [
     "apply_quantization_config",
     "apply_quantization_status",
     "find_name_or_class_matches",
-    "expand_sparse_target_names",
-    "is_sparse_target",
+    "expand_target_names",
+    "is_target",
 ]
 
 from compressed_tensors.quantization.utils.helpers import is_module_quantized
@@ -247,8 +247,10 @@ def apply_quantization_status(model: Module, status: QuantizationStatus):
         model.apply(compress_quantized_weights)
 
 
-def expand_sparse_target_names(
-    model: Module, targets: Iterable[str], ignore: Iterable[str]
+def expand_target_names(
+    model: Module,
+    targets: Optional[Iterable[str]] = None,
+    ignore: Optional[Iterable[str]] = None,
 ) -> Set[str]:
     """
     Finds all unique module names in the model that match the given
@@ -257,20 +259,23 @@ def expand_sparse_target_names(
     Note: Targets must be regexes, layer types, or full layer names.
 
     :param model: model to search for targets in
-    :param targets:
-    :param ignore:
+    :param targets: Iterable of targets to search for
+    :param ignore: Iterable of targets to ignore
     :return: set of all targets that match the given targets and should
         not be ignored
     """
     return {
         name
        for name, module in iter_named_leaf_modules(model)
-        if is_sparse_target(name, module, targets, ignore)
+        if is_target(name, module, targets, ignore)
     }
 
 
-def is_sparse_target(
-    name: str, module: Module, targets: Iterable[str], ignore: Iterable[str]
+def is_target(
+    name: str,
+    module: Module,
+    targets: Optional[Iterable[str]] = None,
+    ignore: Optional[Iterable[str]] = None,
 ) -> bool:
     """
     Determines if a module should be included in the targets based on the
@@ -280,12 +285,12 @@ def is_sparse_target(
 
     :param name: name of the module
     :param module: the module itself
-    :param targets:
-    :param ignore:
+    :param targets: Iterable of targets to search for
+    :param ignore: Iterable of targets to ignore
     :return: True if the module is a target and not ignored, False otherwise
     """
     return bool(
-        find_name_or_class_matches(name, module, targets)
+        find_name_or_class_matches(name, module, targets or [])
         and not find_name_or_class_matches(name, module, ignore or [])
     )
 
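The renamed helpers also gained `None`-tolerant defaults (`targets or []`). A simplified sketch of the matching semantics, with plain name/regex matching standing in for `find_name_or_class_matches` (which additionally matches layer types):

```python
import re
from typing import Iterable, Optional

def matches(name: str, patterns: Iterable[str]) -> bool:
    # compressed-tensors targets use a "re:" prefix for regexes
    return any(
        re.fullmatch(p.removeprefix("re:"), name) if p.startswith("re:")
        else p == name
        for p in patterns
    )

def is_target(name: str, targets: Optional[Iterable[str]] = None,
              ignore: Optional[Iterable[str]] = None) -> bool:
    return matches(name, targets or []) and not matches(name, ignore or [])

print(is_target("model.layers.0.q_proj", targets=["re:.*q_proj"]))  # True
print(is_target("lm_head", targets=["re:.*"], ignore=["lm_head"]))  # False
print(is_target("model.layers.0.q_proj"))  # False: no targets, no crash
```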
{compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/quantization/lifecycle/initialize.py
RENAMED
@@ -203,11 +203,10 @@ def _initialize_attn_scales(module: Module) -> None:
         torch.empty(expected_shape, dtype=scale_dtype, device=device),
         requires_grad=False,
     )
-
-    module.register_parameter(KVCacheScaleType.KEY.value, init_scale)
+    register_offload_parameter(module, KVCacheScaleType.KEY.value, init_scale)
 
     init_scale = Parameter(
         torch.empty(expected_shape, dtype=scale_dtype, device=device),
         requires_grad=False,
     )
-    module.register_parameter(KVCacheScaleType.VALUE.value, init_scale)
+    register_offload_parameter(module, KVCacheScaleType.VALUE.value, init_scale)
{compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/quantization/quant_args.py
RENAMED
@@ -18,6 +18,7 @@ from typing import Any, Dict, Optional, Union
 
 import torch
 from compressed_tensors.utils import Aliasable
+from compressed_tensors.utils.helpers import deprecated
 from pydantic import BaseModel, Field, field_validator, model_validator
 
 
@@ -109,10 +110,10 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
     dynamic: bool = False
     actorder: Union[ActivationOrdering, bool, None] = None
     observer: Optional[str] = Field(
-        default="minmax",
+        default=None,
         description=(
-            "The class to use to compute the quantization param - "
-            "scale and zero-point'"
+            "Determines the method of computing quantization parameters (scales and "
+            "zero-points). Defaults to min-max when not using dynamic quantization"
         ),
     )
     observer_kwargs: Dict[str, Any] = Field(
@@ -123,12 +124,6 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         ),
     )
 
-    def get_observer(self):
-        """
-        :return: torch quantization FakeQuantize built based on these QuantizationArgs
-        """
-        return self.observer
-
     @field_validator("type", mode="before")
     def validate_type(cls, value) -> QuantizationType:
         if isinstance(value, str):
@@ -250,6 +245,10 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         else:
             raise ValueError(f"Invalid quantization type {self.type}")
 
+    @deprecated("QuantizationArgs.observer")
+    def get_observer(self) -> str:
+        return self.observer
+
 
 def round_to_quantized_type(
     tensor: torch.Tensor, args: QuantizationArgs
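`get_observer` survives only as a deprecated alias for the `observer` field. A self-contained sketch of what the `@deprecated` wrapper plausibly does (the real decorator lives in `utils/helpers.py`; names here are simplified):

```python
import warnings
from functools import wraps

def deprecated(future_name=None):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            warnings.warn(
                f"{func.__name__} is deprecated"
                + (f", use {future_name} instead" if future_name else ""),
                DeprecationWarning,
            )
            return func(*args, **kwargs)
        return wrapper
    return decorator

class QuantArgs:  # stand-in for QuantizationArgs
    observer = "minmax"

    @deprecated("QuantizationArgs.observer")
    def get_observer(self):
        return self.observer

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    assert QuantArgs().get_observer() == "minmax"
    print(caught[0].message)  # get_observer is deprecated, use ...
```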
{compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/utils/helpers.py
RENAMED
@@ -14,13 +14,17 @@
 
 import warnings
 from functools import wraps
-from typing import Any, Callable, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
 
 import numpy
 import torch
 from transformers import AutoConfig
 
 
+if TYPE_CHECKING:
+    from compressed_tensors.compressors import ModelCompressor
+
+
 __all__ = [
     "infer_compressor_from_model_config",
     "fix_fsdp_module_name",
@@ -166,8 +170,8 @@ def deprecated(future_name: Optional[str] = None, message: Optional[str] = None)
     """
     Decorator to mark functions as deprecated
 
-    :param new_function: Function called in place of
-    :param message:
+    :param new_function: Function called in place of deprecated function
+    :param message: Deprecation message, replaces default deprecation message
     """
 
     def decorator(func: Callable[[Any], Any]):
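The new `TYPE_CHECKING` block is the standard trick for type-only imports that would otherwise create an import cycle, since `utils.helpers` is itself imported by the compressors package. The pattern in isolation:

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # evaluated by static type checkers only, never at runtime
    from compressed_tensors.compressors import ModelCompressor


def describe(compressor: "ModelCompressor") -> str:
    # the string annotation keeps runtime free of the circular import
    return type(compressor).__name__
```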
{compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/utils/offload.py
RENAMED
@@ -26,6 +26,7 @@ Utilities associated with offloading functionality provided by `accelerate`.
 """
 
 import contextlib
+import warnings
 from functools import wraps
 from typing import Any, Callable, Dict, Literal, Optional, Union
 
@@ -200,9 +201,14 @@ def update_offload_parameter(
     """
     param = getattr(module, name)
     data = data.to(param.dtype)
+    if param.data.shape != data.shape:
+        warnings.warn(
+            f"Shape of parameter being updated {param.data.shape} does not match shape "
+            f"of update data {data.shape}"
+        )
 
     # copy data into onloaded parameter if applicable
-    if param.device != "meta":
+    if param.device != torch.device("meta"):
         param.data.copy_(data)
 
     # update offload dict
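The device-check fix matters because comparing a `torch.device` against the string `"meta"` has not behaved consistently across torch versions; comparing against `torch.device("meta")` is unambiguous. A quick check:

```python
import torch

p = torch.nn.Parameter(torch.empty(2, 2, device="meta"))
# unambiguous device-to-device comparison, as in the fixed code
print(p.device != torch.device("meta"))  # False: the parameter is on meta
```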
{compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors/utils/safetensors_load.py
RENAMED
@@ -16,7 +16,7 @@ import json
 import os
 import re
 import struct
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, Iterable, Optional, Tuple, Union
 
 from safetensors import safe_open
 from torch import Tensor
@@ -180,7 +180,9 @@ def get_weight_mappings(path_to_model_or_tensors: str) -> Dict[str, str]:
 
 
 def get_nested_weight_mappings(
-    model_path: str, params_to_nest: List[str], return_unmatched_params: bool = False
+    model_path: str,
+    params_to_nest: Iterable[str],
+    return_unmatched_params: bool = False,
 ) -> Union[NestedWeightMappingType, Tuple[NestedWeightMappingType, WeightMappingType]]:
     """
     Takes a path to a state dict saved in safetensors format and returns a nested
@@ -211,7 +213,7 @@ def get_nested_weight_mappings(
 
     :param model_path: Path to the safetensors state dict, must contain either a
         single safetensors file or multiple files with an index.
-    :param params_to_nest:
+    :param params_to_nest: Iterable of parameter names to nest.
     :param return_unmatched_params: If True, return a second dictionary containing
         the remaining parameters that were not matched to the params_to_nest.
     :return:
@@ -247,7 +249,7 @@ def get_nested_weight_mappings(
 
 
 def get_nested_mappings_from_state_dict(
-    state_dict, params_to_nest
+    state_dict, params_to_nest: Iterable[str]
 ) -> NestedWeightMappingType:
     """
     Takes a state dict and returns a nested mapping from uncompressed
@@ -262,7 +264,7 @@ def get_nested_mappings_from_state_dict(
     }
 
     :param state_dict: state dict of the model
-    :param params_to_nest:
+    :param params_to_nest: Iterable of parameter names to nest.
     :return: Nested mapping of parameterized layer names to the value of
         each layer's compression parameters.
     """
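For reference, the shape of the structures these helpers return (layer and file names below are made up; the path-based variant maps to safetensors file names, the state-dict variant maps to tensors):

```python
# Illustrative output shapes only; all names are hypothetical.
nested_weight_mappings = {
    "model.layers.0.q_proj": {
        "weight_packed": "model-00001-of-00002.safetensors",
        "weight_scale": "model-00001-of-00002.safetensors",
        "weight_shape": "model-00001-of-00002.safetensors",
    },
}
unmatched_params = {  # second return value when return_unmatched_params=True
    "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
}
```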
{compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors.egg-info/PKG-INFO
RENAMED
@@ -1,15 +1,35 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: compressed-tensors
-Version: 0.9.1
+Version: 0.9.3
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
 Author-email: support@neuralmagic.com
 License: Apache 2.0
 Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: torch>=1.7.0
+Requires-Dist: transformers
+Requires-Dist: pydantic>=2.0
 Provides-Extra: dev
+Requires-Dist: black==22.12.0; extra == "dev"
+Requires-Dist: isort==5.8.0; extra == "dev"
+Requires-Dist: wheel>=0.36.2; extra == "dev"
+Requires-Dist: flake8>=3.8.3; extra == "dev"
+Requires-Dist: pytest>=6.0.0; extra == "dev"
+Requires-Dist: nbconvert>=7.16.3; extra == "dev"
 Provides-Extra: accelerate
-
+Requires-Dist: accelerate; extra == "accelerate"
+Dynamic: author
+Dynamic: author-email
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: license
+Dynamic: license-file
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: summary
 
 # compressed-tensors
 
{compressed-tensors-0.9.1 → compressed_tensors-0.9.3}/src/compressed_tensors.egg-info/SOURCES.txt
RENAMED
@@ -54,4 +54,6 @@ src/compressed_tensors/utils/offload.py
 src/compressed_tensors/utils/permutations_24.py
 src/compressed_tensors/utils/permute.py
 src/compressed_tensors/utils/safetensors_load.py
-src/compressed_tensors/utils/semi_structured_conversions.py
+src/compressed_tensors/utils/semi_structured_conversions.py
+tests/test_registry.py
+tests/testing_utils.py
compressed_tensors-0.9.3/tests/test_registry.py
ADDED
@@ -0,0 +1,53 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from compressed_tensors import (
+    BaseCompressor,
+    BitmaskCompressor,
+    BitmaskConfig,
+    CompressionFormat,
+    DenseCompressor,
+    DenseSparsityConfig,
+    SparsityCompressionConfig,
+)
+
+
+@pytest.mark.parametrize(
+    "name,type",
+    [
+        [CompressionFormat.sparse_bitmask.value, BitmaskConfig],
+        [CompressionFormat.dense.value, DenseSparsityConfig],
+    ],
+)
+def test_configs(name, type):
+    config = SparsityCompressionConfig.load_from_registry(name)
+    assert isinstance(config, type)
+    assert config.format == name
+
+
+@pytest.mark.parametrize(
+    "name,type",
+    [
+        [CompressionFormat.sparse_bitmask.value, BitmaskCompressor],
+        [CompressionFormat.dense.value, DenseCompressor],
+    ],
+)
+def test_compressors(name, type):
+    compressor = BaseCompressor.load_from_registry(
+        name, config=SparsityCompressionConfig(format="none")
+    )
+    assert isinstance(compressor, type)
+    assert isinstance(compressor.config, SparsityCompressionConfig)
+    assert compressor.config.format == "none"
compressed_tensors-0.9.3/tests/testing_utils.py
ADDED
@@ -0,0 +1,144 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# flake8: noqa
+import unittest
+
+import pytest
+
+
+def compressed_tensors_config_available():
+    try:
+        from transformers.utils.quantization_config import (  # noqa: F401
+            CompressedTensorsConfig,
+        )
+
+        return True
+    except ImportError:
+        return False
+
+
+def accelerate_availabe():
+    try:
+        import accelerate  # noqa: F401
+
+        return True
+
+    except ImportError:
+        return False
+
+
+_is_compressed_tensors_config_available = compressed_tensors_config_available()
+_is_accelerate_available = accelerate_availabe()
+
+
+def requires_hf_quantizer():
+    return pytest.mark.skipif(
+        not _is_compressed_tensors_config_available,
+        reason="requires transformers>=4.45 to support CompressedTensorsHfQuantizer",
+    )
+
+
+def requires_accelerate():
+    return pytest.mark.skipif(
+        not _is_accelerate_available,
+        reason="requires accelerate",
+    )
+
+
+def get_random_mat(M, K, dtype) -> "torch.Tensor":
+    """
+    :param M: number of rows
+    :param K: number of columns
+    :param dtype: data type of the matrix
+    :return: random matrix of shape (M, K) with non-zero values
+    """
+    import torch
+    from compressed_tensors.quantization import FP8_DTYPE
+
+    rand_tensor_dtype = dtype
+    if dtype in [torch.int8, FP8_DTYPE]:
+        rand_tensor_dtype = torch.float16
+    mat = torch.rand(M, K, dtype=rand_tensor_dtype).cuda()
+    mat = mat.masked_fill_(mat == 0, 1)
+    return mat.to(dtype)
+
+
+def generate_pruned_semi_structured_mat(M, K, dtype) -> "torch.Tensor":
+    """
+    :param M: number of rows
+    :param K: number of columns
+    :param dtype: data type of the matrix
+    :return: random matrix of shape (M, K) with 2:4 sparsity pattern
+    """
+    import torch
+    from compressed_tensors.quantization import FP8_DTYPE
+
+    mask = torch.Tensor([0, 0, 1, 1]).tile((M, K // 4)).bool()
+    rand_tensor_dtype = dtype
+    if dtype in [torch.int8, FP8_DTYPE]:
+        rand_tensor_dtype = torch.float16
+    mat = torch.rand(M, K, dtype=rand_tensor_dtype)
+    mat = mat.masked_fill_(mat == 0, 1)
+    if dtype == FP8_DTYPE:
+        # some float8_e4m3fn operations are not supported on CPU
+        mat = mat.cuda()
+        mask = mask.cuda()
+    mat = mat * mask
+    return mat.to(dtype)
+
+
+def induce_sparsity(tensor, sparsity_ratio) -> "torch.Tensor":
+    """
+    Makes a tensor sparse by zeroing out a given fraction
+    of its smallest absolute values.
+
+    :param: weight_tensor (torch.Tensor): The input weight tensor.
+    :param: sparsity_ratio (float): Fraction of weights to be zeroed
+        (0 <= sparsity_ratio <= 1).
+    :returns: torch.Tensor: Sparse version of the input tensor.
+    """
+    import torch
+
+    if not (0 <= sparsity_ratio <= 1):
+        raise ValueError("Sparsity ratio must be between 0 and 1.")
+
+    # Flatten the tensor and compute the threshold for sparsity
+    flattened = tensor.view(-1)
+    k = int(sparsity_ratio * flattened.numel())
+
+    if k > 0:
+        threshold = torch.topk(flattened.abs(), k, largest=False).values.max()
+        sparse_tensor = torch.where(
+            tensor.abs() > threshold, tensor, torch.zeros_like(tensor)
+        )
+    else:
+        sparse_tensor = tensor
+
+    return sparse_tensor
+
+
+def is_gpu_available():
+    """
+    :return: True if a GPU is available, False otherwise
+    """
+    try:
+        import torch  # noqa: F401
+
+        return torch.cuda.device_count() > 0
+    except ImportError:
+        return False
+
+
+def requires_gpu(test_case):
+    return unittest.skipUnless(is_gpu_available(), "test requires GPU")(test_case)
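The thresholding logic in `induce_sparsity` in a quick standalone run:

```python
import torch

t = torch.tensor([[0.05, -2.0, 0.3, 1.5]])
k = int(0.5 * t.numel())  # zero out the 2 smallest |values|
thr = torch.topk(t.abs().view(-1), k, largest=False).values.max()
sparse = torch.where(t.abs() > thr, t, torch.zeros_like(t))
print(sparse)  # tensor([[ 0.0000, -2.0000,  0.0000,  1.5000]])
```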