compressed-tensors 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (33)
  1. compressed_tensors/compressors/base.py +200 -8
  2. compressed_tensors/compressors/model_compressor.py +68 -1
  3. compressed_tensors/compressors/naive_quantized.py +71 -75
  4. compressed_tensors/compressors/pack_quantized.py +83 -94
  5. compressed_tensors/config/base.py +6 -1
  6. compressed_tensors/linear/__init__.py +13 -0
  7. compressed_tensors/linear/compressed_linear.py +87 -0
  8. compressed_tensors/quantization/lifecycle/apply.py +46 -8
  9. compressed_tensors/quantization/lifecycle/calibration.py +5 -4
  10. compressed_tensors/quantization/lifecycle/compressed.py +3 -1
  11. compressed_tensors/quantization/lifecycle/forward.py +76 -43
  12. compressed_tensors/quantization/lifecycle/helpers.py +29 -2
  13. compressed_tensors/quantization/lifecycle/initialize.py +51 -16
  14. compressed_tensors/quantization/observers/__init__.py +1 -0
  15. compressed_tensors/quantization/observers/base.py +54 -14
  16. compressed_tensors/quantization/observers/min_max.py +8 -0
  17. compressed_tensors/quantization/observers/mse.py +162 -0
  18. compressed_tensors/quantization/quant_args.py +96 -24
  19. compressed_tensors/quantization/quant_scheme.py +7 -9
  20. compressed_tensors/quantization/utils/helpers.py +1 -1
  21. compressed_tensors/utils/__init__.py +1 -0
  22. compressed_tensors/utils/helpers.py +13 -0
  23. compressed_tensors/utils/offload.py +14 -2
  24. compressed_tensors/utils/permute.py +70 -0
  25. compressed_tensors/utils/safetensors_load.py +2 -0
  26. compressed_tensors/utils/semi_structured_conversions.py +1 -0
  27. compressed_tensors/version.py +1 -1
  28. {compressed_tensors-0.5.0.dist-info → compressed_tensors-0.6.0.dist-info}/METADATA +35 -23
  29. compressed_tensors-0.6.0.dist-info/RECORD +52 -0
  30. {compressed_tensors-0.5.0.dist-info → compressed_tensors-0.6.0.dist-info}/WHEEL +1 -1
  31. compressed_tensors-0.5.0.dist-info/RECORD +0 -48
  32. {compressed_tensors-0.5.0.dist-info → compressed_tensors-0.6.0.dist-info}/LICENSE +0 -0
  33. {compressed_tensors-0.5.0.dist-info → compressed_tensors-0.6.0.dist-info}/top_level.txt +0 -0
compressed_tensors/quantization/observers/mse.py
@@ -0,0 +1,162 @@
+ # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing,
+ # software distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from typing import Any, Optional, Tuple
+
+ import torch
+ from compressed_tensors.quantization.observers.base import Observer
+ from compressed_tensors.quantization.observers.helpers import calculate_qparams
+ from compressed_tensors.quantization.quant_args import QuantizationArgs
+ from torch import FloatTensor, IntTensor, Tensor
+
+
+ __all__ = ["MovingAverageMSEObserver"]
+
+
+ @Observer.register("mse")
+ class MovingAverageMSEObserver(Observer):
+     """
+     Implements a dynamic quantization observer that sets the scale and
+     zero point based on a moving average of the mse-clipped min and max observed values
+     """
+
+     def __init__(
+         self,
+         quantization_args: QuantizationArgs,
+         averaging_constant: float = 0.01,
+         grid: float = 100.0,
+         maxshrink: float = 0.80,
+         norm: float = 2.4,
+     ):
+         super().__init__(quantization_args=quantization_args)
+
+         self.min_val = {}
+         self.max_val = {}
+         self.averaging_constant = averaging_constant
+         self.grid = grid
+         self.maxshrink = maxshrink
+         self.norm = norm
+
+     def calculate_mse_min_max(
+         self,
+         observed: Tensor,
+         reduce_dims: Optional[Tuple[int]] = None,
+     ):
+         """
+         Computes the mse-clipped min and max values of the observed tensor by
+         optimizing for quantization error
+
+         :param observed: observed tensor to calculate quantization parameters for
+         :param reduce_dims: optional tuple of dimensions to reduce along,
+             returned values will be shaped (1,) along the reduced dimensions
+         :return: tuple of min and max values derived from the observed tensor
+         """
+         from compressed_tensors.quantization.lifecycle import fake_quantize
+
+         if not reduce_dims:
+             absolute_min_val, absolute_max_val = torch.aminmax(observed)
+         else:
+             absolute_min_val = torch.amin(observed, dim=reduce_dims, keepdims=True)
+             absolute_max_val = torch.amax(observed, dim=reduce_dims, keepdims=True)
+
+         best = torch.full(absolute_min_val.shape, float("inf"))
+         min_val = torch.ones(absolute_min_val.shape)
+         max_val = torch.zeros(absolute_max_val.shape)
+         for i in range(int(self.maxshrink * self.grid)):
+             p = 1 - i / self.grid
+             shrinked_min_val = p * absolute_min_val
+             shrinked_max_val = p * absolute_max_val
+
+             candidate_scales, candidate_zero_points = calculate_qparams(
+                 shrinked_min_val, shrinked_max_val, self.quantization_args
+             )
+             q = fake_quantize(
+                 observed,
+                 candidate_scales,
+                 candidate_zero_points,
+                 self.quantization_args,
+             )
+
+             q -= observed
+             q.abs_()
+             q.pow_(self.norm)
+             if not reduce_dims:
+                 err = torch.sum(q)
+             else:
+                 err = torch.sum(q, reduce_dims, keepdims=True)
+
+             tmp = err < best
+             if torch.any(tmp):
+                 best[tmp] = err[tmp]
+                 min_val[tmp] = shrinked_min_val[tmp]
+                 max_val[tmp] = shrinked_max_val[tmp]
+         return min_val, max_val
+
+     def calculate_qparams(
+         self,
+         observed: Tensor,
+         reduce_dims: Optional[Tuple[int]] = None,
+         tensor_id: Optional[Any] = None,
+     ) -> Tuple[FloatTensor, IntTensor]:
+         """
+         Updates the mse-clipped min and max values of the observed tensor using
+         a moving average smoothed by the averaging_constant
+
+         :param observed: observed tensor to calculate quantization parameters for
+         :param reduce_dims: optional tuple of dimensions to reduce along,
+             returned scale and zero point will be shaped (1,) along the
+             reduced dimensions
+         :param tensor_id: Optional id if different ranges of observed tensors are
+             passed, useful for sharding tensors by group_size
+         :return: tuple of scale and zero point derived from the observed tensor
+         """
+         min_val, max_val = self.calculate_mse_min_max(observed, reduce_dims)
+
+         running_min_val = self.min_val.get(tensor_id, None)
+         running_max_val = self.max_val.get(tensor_id, None)
+
+         if running_min_val is None or running_max_val is None:
+             updated_min_val = min_val
+             updated_max_val = max_val
+         else:
+             updated_min_val = running_min_val + self.averaging_constant * (
+                 min_val - running_min_val
+             )
+             updated_max_val = running_max_val + self.averaging_constant * (
+                 max_val - running_max_val
+             )
+
+         tensor_id = tensor_id or "default"
+         self.min_val[tensor_id] = updated_min_val
+         self.max_val[tensor_id] = updated_max_val
+
+         return calculate_qparams(
+             updated_min_val, updated_max_val, self.quantization_args
+         )
+
+     def get_qparams_along_dim(
+         self, observed, dim: int, tensor_id: Optional[Any] = None
+     ):
+         reduce_dims = tuple(idx for idx in range(observed.ndim) if idx != dim)
+         return self.calculate_qparams(
+             observed, reduce_dims=reduce_dims, tensor_id=tensor_id
+         )
+
+     def reset(self):
+         """
+         Reset the state of the observer, including min and maximum values
+         """
+         super().reset()
+         self.min_val = {}
+         self.max_val = {}
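For reference, a minimal sketch of how this new observer could be selected by name. It assumes the `Observer` registry shown above and the `QuantizationArgs.get_observer()` helper that wraps the registry lookup shown in the `quant_args.py` diff below; the tensor and settings are illustrative, and observers are assumed to return a `(scale, zero_point)` pair when called:

```python
import torch
from compressed_tensors.quantization.quant_args import QuantizationArgs

# "mse" resolves to MovingAverageMSEObserver through the Observer registry
args = QuantizationArgs(num_bits=8, symmetric=True, observer="mse")
observer = args.get_observer()

weight = torch.randn(256, 512)
scale, zero_point = observer(weight)  # mse-clipped qparams for the observed tensor
```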
compressed_tensors/quantization/quant_args.py
@@ -13,10 +13,10 @@
  # limitations under the License.
 
  from enum import Enum
- from typing import Any, Dict, Optional
+ from typing import Any, Dict, Optional, Union
 
  import torch
- from pydantic import BaseModel, Field, validator
+ from pydantic import BaseModel, Field, field_validator, model_validator
 
 
  __all__ = [
@@ -25,6 +25,7 @@ __all__ = [
      "QuantizationStrategy",
      "QuantizationArgs",
      "round_to_quantized_type",
+     "ActivationOrdering",
  ]
 
  FP8_DTYPE = torch.float8_e4m3fn
@@ -51,6 +52,19 @@ class QuantizationStrategy(str, Enum):
      TOKEN = "token"
 
 
+ class ActivationOrdering(str, Enum):
+     """
+     Enum storing strategies for activation ordering
+
+     Group: reorder groups and weight\n
+     Weight: only reorder weight, not groups. Slightly lower latency and
+         accuracy compared to group actorder\n
+     """
+
+     GROUP = "group"
+     WEIGHT = "weight"
+
+
  class QuantizationArgs(BaseModel, use_enum_values=True):
      """
      User facing arguments used to define a quantization config for weights or
@@ -68,15 +82,18 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
          ranges will be observed with every sample. Defaults to False for static
          quantization. Note that enabling dynamic quantization will change the default
          observer to a memoryless one
+     :param actorder: whether to apply group quantization in decreasing order of
+         activation. Defaults to None for arbitrary ordering
      """
 
      num_bits: int = 8
-     type: QuantizationType = QuantizationType.INT.value
+     type: QuantizationType = QuantizationType.INT
      symmetric: bool = True
      group_size: Optional[int] = None
      strategy: Optional[QuantizationStrategy] = None
      block_structure: Optional[str] = None
      dynamic: bool = False
+     actorder: Union[ActivationOrdering, bool, None] = None
      observer: str = Field(
          default="minmax",
          description=(
@@ -98,41 +115,96 @@
          """
          from compressed_tensors.quantization.observers.base import Observer
 
-         if self.observer == "minmax" and self.dynamic:
+         if self.dynamic:
              # override defualt observer for dynamic, you never want minmax which
              # keeps state across samples for dynamic
              self.observer = "memoryless"
 
          return Observer.load_from_registry(self.observer, quantization_args=self)
 
-     @validator("strategy", pre=True, always=True)
-     def validate_strategy(cls, value, values):
-         group_size = values.get("group_size")
+     @field_validator("type", mode="before")
+     def validate_type(cls, value) -> QuantizationType:
+         if isinstance(value, str):
+             return QuantizationType(value.lower())
 
-         # use group_size to determinine strategy if not given explicity
-         if group_size is not None and value is None:
-             if group_size > 0:
-                 return QuantizationStrategy.GROUP
+         return value
 
-             elif group_size == -1:
-                 return QuantizationStrategy.CHANNEL
+     @field_validator("group_size", mode="before")
+     def validate_group(cls, value) -> Union[int, None]:
+         if value is None:
+             return value
 
-             else:
-                 raise ValueError(
-                     f"group_size={group_size} with strategy {value} is invald. "
-                     "group_size > 0 for strategy='group' and "
-                     "group_size = -1 for 'channel'"
-                 )
+         if value < -1:
+             raise ValueError(
+                 f"Invalid group size {value}. Use group_size > 0 for "
+                 "strategy='group' and group_size = -1 for 'channel'"
+             )
 
-         if value == QuantizationStrategy.GROUP:
-             if group_size is None:
-                 raise ValueError(f"strategy {value} requires group_size to be set.")
+         return value
 
-         if value is None:
-             return QuantizationStrategy.TENSOR
+     @field_validator("strategy", mode="before")
+     def validate_strategy(cls, value) -> Union[QuantizationStrategy, None]:
+         if isinstance(value, str):
+             return QuantizationStrategy(value.lower())
+
+         return value
+
+     @field_validator("actorder", mode="before")
+     def validate_actorder(cls, value) -> Optional[ActivationOrdering]:
+         if isinstance(value, bool):
+             return ActivationOrdering.GROUP if value else None
+
+         if isinstance(value, str):
+             return ActivationOrdering(value.lower())
 
          return value
 
+     @model_validator(mode="after")
+     def validate_model_after(model: "QuantizationArgs") -> Dict[str, Any]:
+         # extract user-passed values from dictionary
+         strategy = model.strategy
+         group_size = model.group_size
+         actorder = model.actorder
+
+         # infer strategy
+         if strategy is None:
+             if group_size is None:
+                 strategy = QuantizationStrategy.TENSOR
+             elif group_size > 0:
+                 strategy = QuantizationStrategy.GROUP
+             elif group_size == -1:
+                 strategy = QuantizationStrategy.CHANNEL
+             else:
+                 raise ValueError(
+                     f"Invalid group size {group_size}. Use group_size > 0 for "
+                     "strategy='group' and group_size = -1 for 'channel'"
+                 )
+
+         # validate strategy and group
+         if strategy == QuantizationStrategy.GROUP:
+             if group_size is None or group_size <= 0:
+                 raise ValueError(
+                     f"strategy {strategy} requires group_size to be "
+                     "set to a positive value"
+                 )
+         if (
+             group_size is not None
+             and group_size > 0
+             and strategy != QuantizationStrategy.GROUP
+         ):
+             raise ValueError("group_size requires strategy to be set to 'group'")
+
+         # validate activation ordering and strategy
+         if actorder is not None and strategy != QuantizationStrategy.GROUP:
+             raise ValueError(
+                 "Must use group quantization strategy in order to apply "
+                 "activation ordering"
+             )
+
+         # write back modified values
+         model.strategy = strategy
+         return model
+
      def pytorch_dtype(self) -> torch.dtype:
          if self.type == QuantizationType.FLOAT:
              return FP8_DTYPE
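A short, hypothetical illustration of the new validation flow above: `group_size` infers the strategy when none is given, booleans passed to `actorder` are normalized to `ActivationOrdering`, and mismatched combinations are rejected (pydantic surfaces the raised errors as `ValidationError`, a `ValueError` subclass):

```python
from compressed_tensors.quantization.quant_args import (
    ActivationOrdering,
    QuantizationArgs,
    QuantizationStrategy,
)

# group_size > 0 with no explicit strategy infers strategy="group";
# actorder=True is normalized to ActivationOrdering.GROUP
args = QuantizationArgs(num_bits=4, group_size=128, actorder=True)
assert args.strategy == QuantizationStrategy.GROUP
assert args.actorder == ActivationOrdering.GROUP

# activation ordering without group quantization is rejected
try:
    QuantizationArgs(num_bits=8, actorder="weight")
except ValueError as err:
    print(err)
```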
compressed_tensors/quantization/quant_scheme.py
@@ -57,15 +57,9 @@ class QuantizationScheme(BaseModel):
          # default to quantizing all Linear layers
          targets = ["Linear"]
 
-         # default to 8 bit integer symmetric quantization
-         # for weights
-         weights = QuantizationArgs(num_bits=8, symmetric=True)
-
-         # default to 8 bit integer asymmetric quantization
-         input_activations = QuantizationArgs(num_bits=8, symmetric=True)
-
-         # Do not quantize the output activations
-         # by default
+         # by default, activations and weights are left unquantized
+         weights = None
+         input_activations = None
          output_activations = None
 
          return cls(
@@ -111,6 +105,8 @@ def is_preset_scheme(name: str) -> bool:
      return name.upper() in PRESET_SCHEMES
 
 
+ UNQUANTIZED = dict()
+
  # 8 bit integer weights and 8 bit activations quantization
  W8A8 = dict(
      weights=QuantizationArgs(
@@ -208,6 +204,8 @@ FP8_DYNAMIC = dict(
  )
 
  PRESET_SCHEMES = {
+     # Unquantized (no-op)
+     "UNQUANTIZED": UNQUANTIZED,
      # Integer weight only schemes
      "W8A16": W8A16,
      "W4A16": W4A16,
compressed_tensors/quantization/utils/helpers.py
@@ -181,7 +181,7 @@ def calculate_compression_ratio(model: Module) -> float:
          for parameter in model.parameters():
              uncompressed_bits = get_torch_bit_depth(parameter)
              compressed_bits = uncompressed_bits
-             if is_module_quantized(submodule):
+             if is_module_quantized(submodule) and submodule.quantization_scheme.weights:
                  compressed_bits = submodule.quantization_scheme.weights.num_bits
 
              num_weights = parameter.numel()
compressed_tensors/utils/__init__.py
@@ -16,5 +16,6 @@
  from .helpers import *
  from .offload import *
  from .permutations_24 import *
+ from .permute import *
  from .safetensors_load import *
  from .semi_structured_conversions import *
compressed_tensors/utils/helpers.py
@@ -22,6 +22,7 @@ __all__ = [
      "infer_compressor_from_model_config",
      "fix_fsdp_module_name",
      "tensor_follows_mask_structure",
+     "replace_module",
  ]
 
  FSDP_WRAPPER_NAME = "_fsdp_wrapped_module"
@@ -90,3 +91,15 @@ def tensor_follows_mask_structure(tensor, mask: str = "2:4") -> bool:
          raise ValueError()
 
      return True
+
+
+ def replace_module(model: torch.nn.Module, name: str, new_module: torch.nn.Module):
+     if "." in name:
+         parent_name = name.rsplit(".", 1)[0]
+         child_name = name[len(parent_name) + 1 :]
+         parent = model.get_submodule(parent_name)
+     else:
+         parent_name = ""
+         parent = model
+         child_name = name
+     setattr(parent, child_name, new_module)
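A minimal, self-contained sketch of the new `replace_module` helper; the layer names here are made up, but this mirrors how a quantized `Linear` can be swapped for the new `CompressedLinear` (file 7 in the listing above):

```python
import torch
from compressed_tensors.utils.helpers import replace_module

model = torch.nn.Sequential()
model.add_module("decoder", torch.nn.Sequential())
model.decoder.add_module("proj", torch.nn.Linear(16, 16))

# the parent is resolved from the dotted name via get_submodule, then the
# child attribute is overwritten with the new module
replace_module(model, "decoder.proj", torch.nn.Linear(16, 32))
assert model.decoder.proj.out_features == 32
```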
compressed_tensors/utils/offload.py
@@ -40,7 +40,13 @@ def get_execution_device(module: Module) -> torch.device:
      """
      if is_module_offloaded(module):
          return module._hf_hook.execution_device
-     return next(module.parameters()).device
+     device = next(module.parameters()).device
+
+     # offload only gets set for leaf modules, fallback to checking for device type
+     if device.type == "meta":
+         return module._hf_hook.execution_device
+
+     return device
 
 
  def get_offloaded_device(module: Module) -> torch.device:
@@ -83,8 +89,11 @@ def update_parameter_data(
 
      :param module: layer containing the parameter to update
      :param new_param_data: tensor to update parameter with
-     :param param_name:
+     :param param_name: name of layer parameter to update
      """
+     if not hasattr(module, param_name):
+         return
+
      device = next(module.parameters()).device
 
      offloaded = False
@@ -93,6 +102,9 @@
          offloaded = True
 
      parameter = getattr(module, param_name, None)
+     if parameter is None:
+         raise ValueError("Attempted to update uninitialized parameter")
+
      dtype = parameter.dtype
      parameter.data = new_param_data.to(device).to(dtype)
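A hypothetical sketch of the guarded behavior added above, using a plain `Linear` with a dummy quantization parameter (real quantized layers gain parameters such as `weight_scale` during initialization):

```python
import torch
from compressed_tensors.utils.offload import update_parameter_data

layer = torch.nn.Linear(8, 8)
layer.register_parameter("weight_scale", torch.nn.Parameter(torch.ones(1)))

update_parameter_data(layer, torch.tensor([0.05]), "weight_scale")  # updated in place
update_parameter_data(layer, torch.tensor([0.05]), "input_scale")   # missing -> silent no-op
```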
compressed_tensors/utils/permute.py
@@ -0,0 +1,70 @@
+ # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing,
+ # software distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from typing import Set, Tuple
+
+ import torch
+
+
+ __all__ = ["safe_permute"]
+
+
+ # these datatypes are missing implementations required for standard permutation
+ _EXPERIMENTAL_DTYPES: Set[Tuple[torch.dtype, torch.device]] = set()
+
+
+ def safe_permute(value: torch.Tensor, perm: torch.Tensor, dim: int = 0) -> torch.Tensor:
+     """
+     Perform out-of-place permutation without using torch.Tensor.index_put_,
+     whose implementation is missing for datatypes such as `torch.float8_e4m3fn`
+
+     :param value: tensor to permute
+     :param perm: permutation map
+     :param dim: dimension along which to apply permutation
+     :return: permuted value
+     """
+     dtype_tuple = (value.dtype, value.device)
+
+     if dtype_tuple in _EXPERIMENTAL_DTYPES:
+         return _fallback_permute(value, perm, dim)
+
+     try:
+         return value[tuple([slice(None)] * dim + [perm])]
+     except RuntimeError:
+         # Mark dtype as experimental if advanced indexing fails
+         _EXPERIMENTAL_DTYPES.add(dtype_tuple)
+         return _fallback_permute(value, perm, dim)
+
+
+ def _fallback_permute(
+     value: torch.Tensor, perm: torch.Tensor, dim: int
+ ) -> torch.Tensor:
+     """
+     Fallback permutation method for experimental dtypes.
+
+     :param value: tensor to permute
+     :param perm: permutation map
+     :param dim: dimension along which to apply permutation
+     :return: permuted value
+     """
+     value_ret = value.clone()  # cannot use zeros_like b/c of missing impl.
+     orig_slices = [slice(None)] * (dim + 1)
+     perm_slices = [slice(None)] * (dim + 1)
+
+     for index, perm_index in enumerate(perm):
+         orig_slices[dim] = index
+         perm_slices[dim] = perm_index
+         value_ret[tuple(orig_slices)] = value[tuple(perm_slices)]
+
+     return value_ret
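A small usage sketch for `safe_permute`; the float8 path is the motivating case (e.g. reordering weight rows under activation ordering), assuming a torch build that provides `float8_e4m3fn`:

```python
import torch
from compressed_tensors.utils.permute import safe_permute

weight = torch.randn(4, 8)
perm = torch.randperm(4)

reordered = safe_permute(weight, perm, dim=0)  # equivalent to weight[perm]
assert torch.equal(reordered, weight[perm])

fp8_weight = weight.to(torch.float8_e4m3fn)    # dtype with limited indexing support
reordered_fp8 = safe_permute(fp8_weight, perm, dim=0)
```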
compressed_tensors/utils/safetensors_load.py
@@ -234,5 +234,7 @@ def is_quantization_param(name: str) -> bool:
          return True
      if name.endswith("zero_point"):
          return True
+     if name.endswith("g_idx"):
+         return True
 
      return False
compressed_tensors/utils/semi_structured_conversions.py
@@ -28,6 +28,7 @@ __all__ = [
      "mask_creator",
  ]
 
+
  # This is PyTorch implementation of main part of reorder_meta()
  # function, from tools/util/include/cutlass/util/host_reorder.h file
  # of CUTLASS source tree. Furthermore, CUTLASS template for sparse
compressed_tensors/version.py
@@ -17,7 +17,7 @@ Functionality for storing and setting the version info for SparseML
  """
 
 
- version_base = "0.5.0"
+ version_base = "0.6.0"
  is_release = True  # change to True to set the generated version as a release version
{compressed_tensors-0.5.0.dist-info → compressed_tensors-0.6.0.dist-info}/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: compressed-tensors
- Version: 0.5.0
+ Version: 0.6.0
  Summary: Library for utilization of compressed safetensors of neural network models
  Home-page: https://github.com/neuralmagic/compressed-tensors
  Author: Neuralmagic, Inc.
@@ -8,44 +8,56 @@ Author-email: support@neuralmagic.com
  License: Apache 2.0
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Requires-Dist: torch >=1.7.0
+ Requires-Dist: torch>=1.7.0
  Requires-Dist: transformers
- Requires-Dist: accelerate
- Requires-Dist: pydantic >=2.0
+ Requires-Dist: pydantic>=2.0
+ Provides-Extra: accelerate
+ Requires-Dist: accelerate; extra == "accelerate"
  Provides-Extra: dev
- Requires-Dist: black ==22.12.0 ; extra == 'dev'
- Requires-Dist: isort ==5.8.0 ; extra == 'dev'
- Requires-Dist: wheel >=0.36.2 ; extra == 'dev'
- Requires-Dist: flake8 >=3.8.3 ; extra == 'dev'
- Requires-Dist: pytest >=6.0.0 ; extra == 'dev'
- Requires-Dist: nbconvert >=7.16.3 ; extra == 'dev'
+ Requires-Dist: black==22.12.0; extra == "dev"
+ Requires-Dist: isort==5.8.0; extra == "dev"
+ Requires-Dist: wheel>=0.36.2; extra == "dev"
+ Requires-Dist: flake8>=3.8.3; extra == "dev"
+ Requires-Dist: pytest>=6.0.0; extra == "dev"
+ Requires-Dist: nbconvert>=7.16.3; extra == "dev"
 
- # compressed_tensors
+ # compressed-tensors
 
- This repository extends a [safetensors](https://github.com/huggingface/safetensors) format to efficiently store sparse and/or quantized tensors on disk. `compressed-tensors` format supports multiple compression types to minimize the disk space and facilitate the tensor manipulation.
+ The `compressed-tensors` library extends the [safetensors](https://github.com/huggingface/safetensors) format, providing a versatile and efficient way to store and manage compressed tensor data. This library supports various quantization and sparsity schemes, making it a unified format for handling different model optimizations like GPTQ, AWQ, SmoothQuant, INT8, FP8, SparseGPT, and more.
 
- ## Motivation
+ ## Why `compressed-tensors`?
 
- ### Reduce disk space by saving sparse tensors in a compressed format
+ As model compression becomes increasingly important for efficient deployment of LLMs, the landscape of quantization and compression techniques has become increasingly fragmented.
+ Each method often comes with its own storage format and loading procedures, making it challenging to work with multiple techniques or switch between them.
+ `compressed-tensors` addresses this by providing a single, extensible format that can represent a wide variety of compression schemes.
 
- The compressed format stores the data much more efficiently by taking advantage of two properties of tensors:
+ * **Unified Checkpoint Format**: Supports various compression schemes in a single, consistent format.
+ * **Wide Compatibility**: Works with popular quantization methods like GPTQ, SmoothQuant, and FP8. See [llm-compressor](https://github.com/vllm-project/llm-compressor)
+ * **Flexible Quantization Support**:
+     * Weight-only quantization (e.g., W4A16, W8A16, WnA16)
+     * Activation quantization (e.g., W8A8)
+     * KV cache quantization
+     * Non-uniform schemes (different layers can be quantized in different ways!)
+ * **Sparsity Support**: Handles both unstructured and semi-structured (e.g., 2:4) sparsity patterns.
+ * **Open-Source Integration**: Designed to work seamlessly with Hugging Face models and PyTorch.
 
- - Sparse tensors -> due to a large number of entries that are equal to zero.
- - Quantized -> due to their low precision representation.
-
- ### Introduce an elegant interface to save/load compressed tensors
-
- The library provides the user with the ability to compress/decompress tensors. The properties of tensors are defined by human-readable configs, allowing the users to understand the compression format at a quick glance.
+ This allows developers and researchers to easily experiment with composing different quantization methods, simplify model deployment pipelines, and reduce the overhead of supporting multiple compression formats in inference engines.
 
  ## Installation
 
- ### Pip
+ ### From [PyPI](https://pypi.org/project/compressed-tensors)
 
+ Stable release:
  ```bash
  pip install compressed-tensors
  ```
 
- ### From source
+ Nightly release:
+ ```bash
+ pip install compressed-tensors-nightly
+ ```
+
+ ### From Source
 
  ```bash
  git clone https://github.com/neuralmagic/compressed-tensors