compressed-tensors 0.9.2__py3-none-any.whl → 0.9.4a20250408__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -19,7 +19,7 @@ import os
19
19
  import re
20
20
  from contextlib import contextmanager
21
21
  from copy import deepcopy
22
- from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, TypeVar, Union
22
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, TypeVar, Union
23
23
 
24
24
  import compressed_tensors
25
25
  import torch
@@ -522,10 +522,13 @@ class ModelCompressor:
522
522
  update_parameter_data(module, data, param_name)
523
523
 
524
524
 
525
- def map_modules_to_quant_args(model: Module) -> Dict[str, QuantizationArgs]:
525
+ def map_modules_to_quant_args(
526
+ model: Module,
527
+ ) -> Dict[str, Union[QuantizationArgs, Tuple[QuantizationArgs, QuantizationArgs]]]:
526
528
  """
527
529
  Given a pytorch model, map out the submodule name (usually linear layers)
528
- to the QuantizationArgs
530
+ to the weight QuantizationArgs. If running input activation quantization, will also
531
+ map to the input QuantizationArgs in a tuple.
529
532
 
530
533
  :param model: pytorch model
531
534
  """
@@ -535,6 +538,12 @@ def map_modules_to_quant_args(model: Module) -> Dict[str, QuantizationArgs]:
535
538
  if submodule.quantization_scheme.weights is not None:
536
539
  name = fix_fsdp_module_name(name)
537
540
  quantized_modules_to_args[name] = submodule.quantization_scheme.weights
541
+ if submodule.quantization_scheme.input_activations is not None:
542
+ weight_args = quantized_modules_to_args.get(name)
543
+ quantized_modules_to_args[name] = (
544
+ weight_args,
545
+ submodule.quantization_scheme.input_activations,
546
+ )
538
547
 
539
548
  return quantized_modules_to_args
540
549
 
@@ -82,11 +82,32 @@ class BaseQuantizationCompressor(BaseCompressor):
82
82
  """
83
83
  compressed_dict = {}
84
84
  weight_suffix = ".weight"
85
+ input_zp_suffix = ".input_zero_point"
86
+ weight_zp_suffix = ".weight_zero_point"
85
87
  _LOGGER.debug(
86
88
  f"Compressing model with {len(model_state)} parameterized layers..."
87
89
  )
88
90
 
89
91
  for name, value in tqdm(model_state.items(), desc="Quantized Compression"):
92
+ # check if the parameter we're compressing is the weight zp
93
+ # or the input zp
94
+ is_weight_zp = name.endswith(weight_zp_suffix)
95
+ is_input_zp = name.endswith(input_zp_suffix)
96
+
97
+ # if we're saving the weight zp, fetch weight quant args
98
+ if is_weight_zp:
99
+ quant_args_zp = names_to_scheme.get(name[: -(len(weight_zp_suffix))])
100
+ if isinstance(quant_args_zp, tuple):
101
+ # If tuple, first value is weight args, second is input args
102
+ quant_args_zp = quant_args_zp[0]
103
+
104
+ # if we're saving the input zp, fetch input quant args
105
+ if is_input_zp:
106
+ input_args_zp = names_to_scheme.get(name[: -(len(input_zp_suffix))])
107
+ if isinstance(input_args_zp, tuple):
108
+ # If tuple, first value is weight args, second is input args
109
+ input_args_zp = input_args_zp[-1]
110
+
90
111
  if name.endswith(weight_suffix):
91
112
  prefix = name[: -(len(weight_suffix))]
92
113
  scale = model_state.get(merge_names(prefix, "weight_scale"), None)
@@ -94,7 +115,11 @@ class BaseQuantizationCompressor(BaseCompressor):
94
115
  g_idx = model_state.get(merge_names(prefix, "weight_g_idx"), None)
95
116
  if scale is not None:
96
117
  # weight is quantized, compress it
97
- quant_args = names_to_scheme[prefix]
118
+ if isinstance(names_to_scheme[prefix], tuple):
119
+ quant_args = names_to_scheme[prefix][0]
120
+ else:
121
+ quant_args = names_to_scheme[prefix]
122
+
98
123
  compressed_data = self.compress_weight(
99
124
  weight=value,
100
125
  scale=scale,
@@ -107,7 +132,11 @@ class BaseQuantizationCompressor(BaseCompressor):
107
132
  compressed_dict[merge_names(prefix, key)] = value
108
133
  else:
109
134
  compressed_dict[name] = value.to("cpu")
110
- elif name.endswith("zero_point") and torch.all(value == 0):
135
+ # only save if asym
136
+ elif is_weight_zp and quant_args_zp.symmetric:
137
+ continue
138
+ # only save if asym
139
+ elif is_input_zp and input_args_zp.symmetric:
111
140
  continue
112
141
  elif name.endswith("g_idx") and torch.any(value <= -1):
113
142
  continue
@@ -12,6 +12,7 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
+ import warnings
15
16
  from typing import Dict, Tuple
16
17
 
17
18
  import torch
@@ -21,6 +22,7 @@ from compressed_tensors.quantization import (
21
22
  QuantizationStatus,
22
23
  initialize_module_for_quantization,
23
24
  )
25
+ from compressed_tensors.utils import register_offload_parameter
24
26
  from torch import Tensor
25
27
  from torch.nn import Parameter
26
28
  from torch.nn.functional import linear
@@ -32,11 +34,16 @@ class CompressedLinear(Linear):
32
34
  Wrapper module for running a compressed forward pass of a quantized Linear module.
33
35
  The wrapped layer will decompressed on each forward call.
34
36
 
35
- :param module: dense linear module to replace
36
- :param quantization_scheme: quantization config for the module to wrap
37
- :param quantization_format: compression format module is stored as
38
37
  """
39
38
 
39
+ def __init__(self, *args, **kwargs) -> None:
40
+ super().__init__(*args, **kwargs)
41
+ warnings.warn(
42
+ "CompressedLinear should not be initialized directly. "
43
+ "Use the from_linear method instead.",
44
+ UserWarning,
45
+ )
46
+
40
47
  @classmethod
41
48
  @torch.no_grad()
42
49
  def from_linear(
@@ -45,6 +52,12 @@ class CompressedLinear(Linear):
45
52
  quantization_scheme: QuantizationScheme,
46
53
  quantization_format: str,
47
54
  ):
55
+ """
56
+ :param module: dense linear module to replace
57
+ :param quantization_scheme: quantization config for the module to wrap
58
+ :param quantization_format: compression format module is stored as
59
+ :return: CompressedLinear module wrapping the input module
60
+ """
48
61
  module.__class__ = CompressedLinear
49
62
  module.compressor = BaseCompressor.load_from_registry(quantization_format)
50
63
  device = next(module.parameters()).device
@@ -68,7 +81,7 @@ class CompressedLinear(Linear):
68
81
  param = Parameter(
69
82
  torch.empty(shape, device=device, dtype=dtype), requires_grad=False
70
83
  )
71
- module.register_parameter(name, param)
84
+ register_offload_parameter(module, name, param)
72
85
 
73
86
  # mark module as compressed
74
87
  module.quantization_status = QuantizationStatus.COMPRESSED
@@ -85,5 +98,11 @@ class CompressedLinear(Linear):
85
98
  """
86
99
  Decompresses the weight, then runs the wrapped forward pass
87
100
  """
88
- uncompressed_weight = self.compressor.decompress_module(self)
89
- return linear(input, uncompressed_weight, self.bias)
101
+ if self.quantization_status == QuantizationStatus.COMPRESSED:
102
+ weight_data = self.compressor.decompress_module(self)
103
+ param = Parameter(weight_data, requires_grad=False)
104
+ register_offload_parameter(self, "weight", param)
105
+
106
+ self.quantization_status = QuantizationStatus.FROZEN
107
+
108
+ return linear(input, self.weight, self.bias)
@@ -203,11 +203,10 @@ def _initialize_attn_scales(module: Module) -> None:
203
203
  torch.empty(expected_shape, dtype=scale_dtype, device=device),
204
204
  requires_grad=False,
205
205
  )
206
-
207
- module.register_parameter(KVCacheScaleType.KEY.value, init_scale)
206
+ register_offload_parameter(module, KVCacheScaleType.KEY.value, init_scale)
208
207
 
209
208
  init_scale = Parameter(
210
209
  torch.empty(expected_shape, dtype=scale_dtype, device=device),
211
210
  requires_grad=False,
212
211
  )
213
- module.register_parameter(KVCacheScaleType.VALUE.value, init_scale)
212
+ register_offload_parameter(module, KVCacheScaleType.VALUE.value, init_scale)
@@ -230,10 +230,6 @@ class QuantizationConfig(BaseModel):
230
230
  group_name = "group_" + str(idx)
231
231
  config_groups[group_name] = scheme
232
232
 
233
- # TODO: this is incorrect in compressed mode, since we are overwriting the
234
- # original weight we lose the uncompressed bit_depth indo
235
- compression_ratio = calculate_compression_ratio(model)
236
-
237
233
  if format is None:
238
234
  if quantization_status == QuantizationStatus.COMPRESSED:
239
235
  format = CompressionFormat.int_quantized.value
@@ -244,7 +240,7 @@ class QuantizationConfig(BaseModel):
244
240
  config_groups=config_groups,
245
241
  quantization_status=quantization_status,
246
242
  kv_cache_scheme=kv_cache_scheme,
247
- global_compression_ratio=compression_ratio,
243
+ global_compression_ratio=None,
248
244
  format=format,
249
245
  ignore=consolidated_ignore,
250
246
  )
@@ -1,53 +1,21 @@
1
- # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing,
10
- # software distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
1
+ # file generated by setuptools-scm
2
+ # don't change, don't track in version control
14
3
 
15
- """
16
- Functionality for storing and setting the version info for SparseML
17
- """
4
+ __all__ = ["__version__", "__version_tuple__", "version", "version_tuple"]
18
5
 
6
+ TYPE_CHECKING = False
7
+ if TYPE_CHECKING:
8
+ from typing import Tuple
9
+ from typing import Union
19
10
 
20
- version_base = "0.9.2"
21
- is_release = True # change to True to set the generated version as a release version
11
+ VERSION_TUPLE = Tuple[Union[int, str], ...]
12
+ else:
13
+ VERSION_TUPLE = object
22
14
 
15
+ version: str
16
+ __version__: str
17
+ __version_tuple__: VERSION_TUPLE
18
+ version_tuple: VERSION_TUPLE
23
19
 
24
- def _generate_version(
25
- is_release: bool,
26
- version_base: str,
27
- ):
28
- from datetime import date
29
-
30
- if is_release:
31
- return version_base
32
- else:
33
- return f"{version_base}.{date.today().strftime('%Y%m%d')}"
34
-
35
-
36
- __all__ = [
37
- "__version__",
38
- "version_base",
39
- "is_release",
40
- "version",
41
- "version_major",
42
- "version_minor",
43
- "version_patch",
44
- "version_build",
45
- "version_major_minor",
46
- ]
47
- __version__ = _generate_version(is_release, version_base)
48
-
49
- version = __version__
50
- version_major, version_minor, version_patch, version_build = version.split(".") + (
51
- [None] if len(version.split(".")) < 4 else []
52
- ) # handle conditional for version being 3 parts or 4 (4 containing build date)
53
- version_major_minor = f"{version_major}.{version_minor}"
20
+ __version__ = version = '0.9.4a20250408'
21
+ __version_tuple__ = version_tuple = (0, 9, 4)
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: compressed-tensors
3
- Version: 0.9.2
3
+ Version: 0.9.4a20250408
4
4
  Summary: Library for utilization of compressed safetensors of neural network models
5
5
  Home-page: https://github.com/neuralmagic/compressed-tensors
6
6
  Author: Neuralmagic, Inc.
@@ -26,6 +26,7 @@ Dynamic: description
26
26
  Dynamic: description-content-type
27
27
  Dynamic: home-page
28
28
  Dynamic: license
29
+ Dynamic: license-file
29
30
  Dynamic: provides-extra
30
31
  Dynamic: requires-dist
31
32
  Dynamic: summary
@@ -63,7 +64,7 @@ pip install compressed-tensors
63
64
 
64
65
  Nightly release:
65
66
  ```bash
66
- pip install compressed-tensors-nightly
67
+ pip install --pre compressed-tensors
67
68
  ```
68
69
 
69
70
  ### From Source
@@ -1,13 +1,13 @@
1
1
  compressed_tensors/__init__.py,sha256=UtKmifNeBCSE2TZSAfduVNNzHY-3V7bLjZ7n7RuXLOE,812
2
2
  compressed_tensors/base.py,sha256=73HYH7HY7O2roC89yG_piPFnZwrBfn_i7HmKl90SKc0,875
3
- compressed_tensors/version.py,sha256=BaFkY2H2ed1_O6ZxJgT-GUlT7oI0xJIeXGPO8yXqBE0,1585
3
+ compressed_tensors/version.py,sha256=dU0WxLg_un23vp7nx7GfLU01yj3Z9Aru2yP4cp8c0-c,520
4
4
  compressed_tensors/compressors/__init__.py,sha256=smSygTSfcfuujRrAXDc6uZm4L_ccV1tWZewqVnOb4lM,825
5
5
  compressed_tensors/compressors/base.py,sha256=x8dQrWVEurynXw03yHJZTaAmrRTOsdZJoHjmvs0IKwk,7002
6
6
  compressed_tensors/compressors/helpers.py,sha256=OK6qxX9j3bHwF9JfIYSGMgBJe2PWjlTA3byXKCJaTIQ,5431
7
7
  compressed_tensors/compressors/model_compressors/__init__.py,sha256=5RGGPFu4YqEt_aOdFSQYFYFDjcZFJN0CsMqRtDZz3Js,666
8
- compressed_tensors/compressors/model_compressors/model_compressor.py,sha256=AmIE1SoNRH1fNgQALfNkdQo8y5tePVpdWUgLIOtf5rg,22569
8
+ compressed_tensors/compressors/model_compressors/model_compressor.py,sha256=n0gcrKwefJuO6b4LNjCynJQf7NNqNHDcoLlzZgTCPGc,23080
9
9
  compressed_tensors/compressors/quantized_compressors/__init__.py,sha256=09UJq68Pht6Bf-4iP9xYl3tetKsncNPHD8IAGbePsr4,714
10
- compressed_tensors/compressors/quantized_compressors/base.py,sha256=cp8S1Kr3HhlMHIz7k4vGo-qxxdknEC3qP1QLIhNnwRA,7217
10
+ compressed_tensors/compressors/quantized_compressors/base.py,sha256=GXTSWgFAhksbno94Ulpth9-YM4a7NsDlx4oQGGB0swQ,8567
11
11
  compressed_tensors/compressors/quantized_compressors/naive_quantized.py,sha256=fd0KlkSx6bvZ3xwIkK3jEUdPSUPs56Eua4dEDOtzKW0,5150
12
12
  compressed_tensors/compressors/quantized_compressors/pack_quantized.py,sha256=zH2PocRe_T5yt1-3kLdZH9AUQWQyaVOi4U9nEJiYaWA,8509
13
13
  compressed_tensors/compressors/sparse_compressors/__init__.py,sha256=Atuz-OdEgn8OCUhx7Ovd6gXdyImAI186uCR-uR0t_Nk,737
@@ -23,17 +23,17 @@ compressed_tensors/config/dense.py,sha256=NgSxnFCnckU9-iunxEaqiFwqgdO7YYxlWKR74j
23
23
  compressed_tensors/config/sparse_24_bitmask.py,sha256=Lhj39zT2V1hxftprvxvneyhv45ShlXOKd75DBbDTyTE,1401
24
24
  compressed_tensors/config/sparse_bitmask.py,sha256=pZUboRNZTu6NajGOQEFExoPknak5ynVAUeiiYpS1Gt8,1308
25
25
  compressed_tensors/linear/__init__.py,sha256=fH6rjBYAxuwrTzBTlTjTgCYNyh6TCvCqajCz4Im4YrA,617
26
- compressed_tensors/linear/compressed_linear.py,sha256=MJa-UfoKhIkdUWRD1shrXXri2cOwR5GK0a4t4bNYosM,3268
26
+ compressed_tensors/linear/compressed_linear.py,sha256=_m6XpNcI53eeSHO8VdiuAM6UBTdpDhn5Ivd8iRMwEKc,3980
27
27
  compressed_tensors/quantization/__init__.py,sha256=83J5bPB7PavN2TfCoW7_vEDhfYpm4TDrqYO9vdSQ5bk,760
28
28
  compressed_tensors/quantization/quant_args.py,sha256=sKpb8DcNObidjXjNol1Tn_Iih3ZXBycSp-fyz68TGhY,9117
29
- compressed_tensors/quantization/quant_config.py,sha256=vx06wBo91p4LCb3Vzd-2eCTUeIf_Sz2ZXRP263eQyjQ,10385
29
+ compressed_tensors/quantization/quant_config.py,sha256=MxSUcb5dOqMN6LFyD5K2h8X0TvEtcWIAoiUJqD2dHGE,10159
30
30
  compressed_tensors/quantization/quant_scheme.py,sha256=eQ0JrRZ80GX69fpwW87VzPzzhajhk4mUaJScjk82OY4,6010
31
31
  compressed_tensors/quantization/lifecycle/__init__.py,sha256=_uItzFWusyV74Zco_pHLOTdE9a83cL-R-ZdyQrBkIyw,772
32
32
  compressed_tensors/quantization/lifecycle/apply.py,sha256=lZmCCSm1_o79iUAy460w6Bv9FaOvntVisMdS-dN9fnk,16594
33
33
  compressed_tensors/quantization/lifecycle/compressed.py,sha256=Fj9n66IN0EWsOAkBHg3O0GlOQpxstqjCcs0ttzMXrJ0,2296
34
34
  compressed_tensors/quantization/lifecycle/forward.py,sha256=DOWouUqfaLA4Qhg-ojVVBdhhSAlgZqFC26vZARxE0ko,12961
35
35
  compressed_tensors/quantization/lifecycle/helpers.py,sha256=C0mhy2vJ0fCjVeN4kFNhw8Eq1wkteBGHiZ36RVLThRY,944
36
- compressed_tensors/quantization/lifecycle/initialize.py,sha256=hymYtayTSumm8KCYAYPY267aWmlsJpt8oQFiRblk8qE,7452
36
+ compressed_tensors/quantization/lifecycle/initialize.py,sha256=sK3PLm69N91QepBuq-83Qd2Br6XcOmRDpD5qo_WWNJo,7469
37
37
  compressed_tensors/quantization/utils/__init__.py,sha256=VdtEmP0bvuND_IGQnyqUPc5lnFp-1_yD7StKSX4x80w,656
38
38
  compressed_tensors/quantization/utils/helpers.py,sha256=DBP-sGRpGAY01K0LFE7qqonNj4hkTYL_mXrMs2LtAD8,14100
39
39
  compressed_tensors/registry/__init__.py,sha256=FwLSNYqfIrb5JD_6OK_MT4_svvKTN_nEhpgQlQvGbjI,658
@@ -45,8 +45,8 @@ compressed_tensors/utils/permutations_24.py,sha256=kx6fsfDHebx94zsSzhXGyCyuC9sVy
45
45
  compressed_tensors/utils/permute.py,sha256=V6tJLKo3Syccj-viv4F7ZKZgJeCB-hl-dK8RKI_kBwI,2355
46
46
  compressed_tensors/utils/safetensors_load.py,sha256=5SeM2hzLh77Ne8Vk7qR6-km7cf8bhov41ExpWITqX3A,11470
47
47
  compressed_tensors/utils/semi_structured_conversions.py,sha256=XKNffPum54kPASgqKzgKvyeqWPAkair2XEQXjkp7ho8,13489
48
- compressed_tensors-0.9.2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
49
- compressed_tensors-0.9.2.dist-info/METADATA,sha256=BmuThcrHjHQVScbgaWfyCRGuLo-cB6OnujZJPOKe1bQ,6975
50
- compressed_tensors-0.9.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
51
- compressed_tensors-0.9.2.dist-info/top_level.txt,sha256=w2i-GyPs2s1UwVxvutSvN_lM22SXC2hQFBmoMcPnV7Y,19
52
- compressed_tensors-0.9.2.dist-info/RECORD,,
48
+ compressed_tensors-0.9.4a20250408.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
49
+ compressed_tensors-0.9.4a20250408.dist-info/METADATA,sha256=CXl80o7QymLek4-pwpCHF9L3-OgIowJ2KDmfi8r-YBs,7004
50
+ compressed_tensors-0.9.4a20250408.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
51
+ compressed_tensors-0.9.4a20250408.dist-info/top_level.txt,sha256=w2i-GyPs2s1UwVxvutSvN_lM22SXC2hQFBmoMcPnV7Y,19
52
+ compressed_tensors-0.9.4a20250408.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.8.0)
2
+ Generator: setuptools (78.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5