compressed-tensors-nightly 0.7.0.20241011__py3-none-any.whl → 0.7.0.20241012__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -18,7 +18,10 @@ from typing import Callable, Optional
18
18
 
19
19
  import torch
20
20
  from compressed_tensors.quantization.cache import QuantizedKVParameterCache
21
- from compressed_tensors.quantization.observers.helpers import calculate_range
21
+ from compressed_tensors.quantization.observers.helpers import (
22
+ calculate_range,
23
+ compute_dynamic_scales_and_zp,
24
+ )
22
25
  from compressed_tensors.quantization.quant_args import (
23
26
  QuantizationArgs,
24
27
  QuantizationStrategy,
@@ -376,9 +379,8 @@ def maybe_calibrate_or_quantize(
376
379
  g_idx = getattr(module, "weight_g_idx", None)
377
380
 
378
381
  if args.dynamic:
379
- # dynamic quantization - get scale and zero point directly from observer
380
- observer = getattr(module, f"{base_name}_observer")
381
- scale, zero_point = observer(value, g_idx=g_idx)
382
+ # dynamic quantization - no need to invoke observer
383
+ scale, zero_point = compute_dynamic_scales_and_zp(value=value, args=args)
382
384
  else:
383
385
  # static quantization - get previous scale and zero point from layer
384
386
  scale = getattr(module, f"{base_name}_scale")
@@ -153,12 +153,16 @@ def _initialize_scale_zero_point_observer(
153
153
  weight_shape: Optional[torch.Size] = None,
154
154
  force_zero_point: bool = True,
155
155
  ):
156
+
156
157
  # initialize observer module and attach as submodule
157
158
  observer = quantization_args.get_observer()
158
- module.register_module(f"{base_name}_observer", observer)
159
+ # no need to register an observer for dynamic quantization
160
+ if observer:
161
+ module.register_module(f"{base_name}_observer", observer)
159
162
 
163
+ # no need to register a scale and zero point for dynamic quantization
160
164
  if quantization_args.dynamic:
161
- return # no need to register a scale and zero point for a dynamic observer
165
+ return
162
166
 
163
167
  device = next(module.parameters()).device
164
168
  if is_module_offloaded(module):
@@ -173,10 +177,7 @@ def _initialize_scale_zero_point_observer(
173
177
  expected_shape = (weight_shape[0], 1)
174
178
  elif quantization_args.strategy == QuantizationStrategy.GROUP:
175
179
  num_groups = weight_shape[1] // quantization_args.group_size
176
- expected_shape = (
177
- weight_shape[0],
178
- max(num_groups, 1)
179
- )
180
+ expected_shape = (weight_shape[0], max(num_groups, 1))
180
181
 
181
182
  scale_dtype = module.weight.dtype
182
183
  if scale_dtype not in [torch.float16, torch.bfloat16, torch.float32]:
@@ -17,6 +17,5 @@
17
17
 
18
18
  from .helpers import *
19
19
  from .base import *
20
- from .memoryless import *
21
20
  from .min_max import *
22
21
  from .mse import *
@@ -13,18 +13,56 @@
13
13
  # limitations under the License.
14
14
 
15
15
  from collections import Counter
16
- from typing import Tuple
16
+ from typing import Optional, Tuple
17
17
 
18
18
  import torch
19
19
  from compressed_tensors.quantization.quant_args import (
20
20
  FP8_DTYPE,
21
21
  QuantizationArgs,
22
+ QuantizationStrategy,
22
23
  QuantizationType,
23
24
  )
24
25
  from torch import FloatTensor, IntTensor, Tensor
25
26
 
26
27
 
27
- __all__ = ["calculate_qparams", "get_observer_token_count", "calculate_range"]
28
+ __all__ = [
29
+ "calculate_qparams",
30
+ "get_observer_token_count",
31
+ "calculate_range",
32
+ "compute_dynamic_scales_and_zp",
33
+ ]
34
+
35
+
36
+ def compute_dynamic_scales_and_zp(value: Tensor, args: QuantizationArgs):
37
+ """
38
+ Returns the computed scales and zero points for dynamic activation
39
+ quantization.
40
+
41
+ :param value: tensor to calculate quantization parameters for
42
+ :param args: quantization args; ``args.strategy`` selects which dimensions
43
+ are reduced (TOKEN: every dim whose index is not 1 or 2; TENSOR: the
44
+ whole tensor)
45
+ :raises ValueError: if ``args.strategy`` is neither TOKEN nor TENSOR
46
+ :return: tuple of scale and zero point derived from the observed tensor
47
+ """
48
+ if args.strategy == QuantizationStrategy.TOKEN:
49
+ dim = {1, 2}
50
+ reduce_dims = tuple(idx for idx in range(value.ndim) if idx not in dim)
51
+ elif args.strategy == QuantizationStrategy.TENSOR:
52
+ reduce_dims = None
53
+ else:
54
+ raise ValueError(
55
+ f"One of {QuantizationStrategy.TOKEN} or {QuantizationStrategy.TENSOR} "
56
+ "must be used for dynamic quantization",
57
+ )
58
+
59
+ if not reduce_dims:
60
+ min_val, max_val = torch.aminmax(value)
61
+ else:
62
+ min_val = torch.amin(value, dim=reduce_dims, keepdims=True)
63
+ max_val = torch.amax(value, dim=reduce_dims, keepdims=True)
64
+
65
+ return calculate_qparams(min_val, max_val, args)
28
66
 
29
67
 
30
68
  def get_observer_token_count(module: torch.nn.Module) -> Counter:
@@ -12,6 +12,7 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
+ import warnings
15
16
  from enum import Enum
16
17
  from typing import Any, Dict, Optional, Union
17
18
 
@@ -94,7 +95,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
94
95
  block_structure: Optional[str] = None
95
96
  dynamic: bool = False
96
97
  actorder: Union[ActivationOrdering, bool, None] = None
97
- observer: str = Field(
98
+ observer: Optional[str] = Field(
98
99
  default="minmax",
99
100
  description=(
100
101
  "The class to use to compute the quantization param - "
@@ -115,10 +116,10 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
115
116
  """
116
117
  from compressed_tensors.quantization.observers.base import Observer
117
118
 
119
+ # No observer required for the dynamic case
118
120
  if self.dynamic:
119
- # override defualt observer for dynamic, you never want minmax which
120
- # keeps state across samples for dynamic
121
- self.observer = "memoryless"
121
+ self.observer = None
122
+ return self.observer
122
123
 
123
124
  return Observer.load_from_registry(self.observer, quantization_args=self)
124
125
 
@@ -171,6 +172,8 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
171
172
  strategy = model.strategy
172
173
  group_size = model.group_size
173
174
  actorder = model.actorder
175
+ dynamic = model.dynamic
176
+ observer = model.observer
174
177
 
175
178
  # infer strategy
176
179
  if strategy is None:
@@ -207,6 +210,27 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
207
210
  "activation ordering"
208
211
  )
209
212
 
213
+ if dynamic:
214
+ if strategy not in (
215
+ QuantizationStrategy.TOKEN,
216
+ QuantizationStrategy.TENSOR,
217
+ ):
218
+ raise ValueError(
219
+ f"One of {QuantizationStrategy.TOKEN} or "
220
+ f"{QuantizationStrategy.TENSOR} must be used for dynamic ",
221
+ "quantization",
222
+ )
223
+ if observer is not None:
224
+ warnings.warn(
225
+ "No observer is used for dynamic quantization, setting to None"
226
+ )
227
+ model.observer = None
228
+
229
+ # if we have not set an observer and we
230
+ # are running static quantization, use minmax
231
+ if not observer and not dynamic:
232
+ model.observer = "minmax"
233
+
210
234
  # write back modified values
211
235
  model.strategy = strategy
212
236
  return model
@@ -122,6 +122,7 @@ INT8_W8A8 = dict(
122
122
  strategy=QuantizationStrategy.TOKEN,
123
123
  symmetric=True,
124
124
  dynamic=True,
125
+ observer=None,
125
126
  ),
126
127
  )
127
128
 
@@ -164,6 +165,7 @@ INT8_W4A8 = dict(
164
165
  strategy=QuantizationStrategy.TOKEN,
165
166
  symmetric=True,
166
167
  dynamic=True,
168
+ observer=None,
167
169
  ),
168
170
  )
169
171
 
@@ -200,6 +202,7 @@ FP8_DYNAMIC = dict(
200
202
  strategy=QuantizationStrategy.TOKEN,
201
203
  symmetric=True,
202
204
  dynamic=True,
205
+ observer=None,
203
206
  ),
204
207
  )
205
208
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: compressed-tensors-nightly
3
- Version: 0.7.0.20241011
3
+ Version: 0.7.0.20241012
4
4
  Summary: Library for utilization of compressed safetensors of neural network models
5
5
  Home-page: https://github.com/neuralmagic/compressed-tensors
6
6
  Author: Neuralmagic, Inc.
@@ -24,21 +24,20 @@ compressed_tensors/linear/__init__.py,sha256=fH6rjBYAxuwrTzBTlTjTgCYNyh6TCvCqajC
24
24
  compressed_tensors/linear/compressed_linear.py,sha256=0jTTf6XxOAjAYs3tvFtgiNMAO4W10sSeR-pdH2M413g,3218
25
25
  compressed_tensors/quantization/__init__.py,sha256=nWP_fsl6Nn0ksEgZPzerGiETdvF-ZfNwPnwGlRiR5pY,805
26
26
  compressed_tensors/quantization/cache.py,sha256=vnBB5zasO_XpHomZvzUPVVbzyCz2VgebsHePm0kANzY,6831
27
- compressed_tensors/quantization/quant_args.py,sha256=73KevZXHyrkMCT_3CxbYHz70fI3i-wcF8NvN0wsBPK4,8271
27
+ compressed_tensors/quantization/quant_args.py,sha256=k7NuZn8OqjgzmAVaN2-jHPQ1bgDkMuUoLJtLnhkvIOI,9085
28
28
  compressed_tensors/quantization/quant_config.py,sha256=NCiMvUMnnz5kTyAkDylxjtEGQnjgsIYIeNR2zyHEdTQ,10371
29
- compressed_tensors/quantization/quant_scheme.py,sha256=uFgp6ECU6ZkHWkeKlAVAzZTLDbrTrzPSPrY23eJluaw,5931
29
+ compressed_tensors/quantization/quant_scheme.py,sha256=5ggPz5sqEfTUgvJJeiPIINA74QtO-08hb3szsm7UHGE,6000
30
30
  compressed_tensors/quantization/lifecycle/__init__.py,sha256=MXE2E7GfIfRRfhrdGy2Og3AZOz5N59B0ZGFcsD89y6c,821
31
31
  compressed_tensors/quantization/lifecycle/apply.py,sha256=czaayvpeUYyWRJhO_klffw6esptOgA9sBKL5TWQcRdw,15805
32
32
  compressed_tensors/quantization/lifecycle/calibration.py,sha256=IuLeRkVQPrMxkMcIjr4OMFlIUMHkqjH4qAxC2KiUBGw,2673
33
33
  compressed_tensors/quantization/lifecycle/compressed.py,sha256=Fj9n66IN0EWsOAkBHg3O0GlOQpxstqjCcs0ttzMXrJ0,2296
34
- compressed_tensors/quantization/lifecycle/forward.py,sha256=eLup6QDRUUp_Ozcas7RDRLIXBWjFbxn5gWbcAIJEGlw,15715
34
+ compressed_tensors/quantization/lifecycle/forward.py,sha256=qy6_3z5YWDIffiAjQxgmBRggZifA7z93F9vk2GajIIU,15703
35
35
  compressed_tensors/quantization/lifecycle/frozen.py,sha256=NiJw7NP7pcT6idWFa8vksgiLoT8oQ975e57S4QfD2QQ,1874
36
36
  compressed_tensors/quantization/lifecycle/helpers.py,sha256=C0mhy2vJ0fCjVeN4kFNhw8Eq1wkteBGHiZ36RVLThRY,944
37
- compressed_tensors/quantization/lifecycle/initialize.py,sha256=4_YG7jKl7d2-Cy58pOkMtInFRhvYahxYchesWMPxPVM,8862
38
- compressed_tensors/quantization/observers/__init__.py,sha256=4Sa7rqi5RB_S5bPO8KmncETiqDsoMBhwP37arlQym8s,764
37
+ compressed_tensors/quantization/lifecycle/initialize.py,sha256=2n309DPxeV_nrM5H_yfQOhF5kteu428qBd4CBzocscw,8908
38
+ compressed_tensors/quantization/observers/__init__.py,sha256=DYrttzq-8MHLZUzpX-xzzm4hrw6HcXkMkux82KBKb1M,738
39
39
  compressed_tensors/quantization/observers/base.py,sha256=5ovQicWPYHjIxr6-EkQ4lgOX0PpI9g23iSzKpxjM1Zg,8420
40
- compressed_tensors/quantization/observers/helpers.py,sha256=s_A23Qa_BLfOdHJCN5bm-qPWkhjjj_RIVrhSp1Y9Dtk,4211
41
- compressed_tensors/quantization/observers/memoryless.py,sha256=jH_c6K3gxf4W3VNXQ7tbnP-J_86QTrEfjBn6Kh1C-H8,2165
40
+ compressed_tensors/quantization/observers/helpers.py,sha256=o9hg4E9b5cCb5PaEAj6jHiUWkNrKtYtv0b1pGg-T9B4,5516
42
41
  compressed_tensors/quantization/observers/min_max.py,sha256=sQXqU3z-voxIDfR_9mQzwQUflZj2sASm_G8CYaXntFw,3865
43
42
  compressed_tensors/quantization/observers/mse.py,sha256=Aeh-253Vbab1F8cYuBiGNn4OXWJ67wXQ_JVfl3mu2a8,6034
44
43
  compressed_tensors/quantization/utils/__init__.py,sha256=VdtEmP0bvuND_IGQnyqUPc5lnFp-1_yD7StKSX4x80w,656
@@ -52,8 +51,8 @@ compressed_tensors/utils/permutations_24.py,sha256=kx6fsfDHebx94zsSzhXGyCyuC9sVy
52
51
  compressed_tensors/utils/permute.py,sha256=V6tJLKo3Syccj-viv4F7ZKZgJeCB-hl-dK8RKI_kBwI,2355
53
52
  compressed_tensors/utils/safetensors_load.py,sha256=m08ANVuTBxQdoa6LufDgcNJ7wCLDJolyZljB8VEybAU,8578
54
53
  compressed_tensors/utils/semi_structured_conversions.py,sha256=XKNffPum54kPASgqKzgKvyeqWPAkair2XEQXjkp7ho8,13489
55
- compressed_tensors_nightly-0.7.0.20241011.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
56
- compressed_tensors_nightly-0.7.0.20241011.dist-info/METADATA,sha256=bjfPZnEc6zDDypW9ZQDqg9cGTTk3SrChPKfF2GR-0VY,6799
57
- compressed_tensors_nightly-0.7.0.20241011.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
58
- compressed_tensors_nightly-0.7.0.20241011.dist-info/top_level.txt,sha256=w2i-GyPs2s1UwVxvutSvN_lM22SXC2hQFBmoMcPnV7Y,19
59
- compressed_tensors_nightly-0.7.0.20241011.dist-info/RECORD,,
54
+ compressed_tensors_nightly-0.7.0.20241012.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
55
+ compressed_tensors_nightly-0.7.0.20241012.dist-info/METADATA,sha256=abBDZNma7TjQd4jZ_l754n5pcy3yUNVC5DFv2xMXOZs,6799
56
+ compressed_tensors_nightly-0.7.0.20241012.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
57
+ compressed_tensors_nightly-0.7.0.20241012.dist-info/top_level.txt,sha256=w2i-GyPs2s1UwVxvutSvN_lM22SXC2hQFBmoMcPnV7Y,19
58
+ compressed_tensors_nightly-0.7.0.20241012.dist-info/RECORD,,
@@ -1,56 +0,0 @@
1
- # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing,
10
- # software distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- from typing import Any, Optional, Tuple
16
-
17
- import torch
18
- from compressed_tensors.quantization.observers.base import Observer
19
- from compressed_tensors.quantization.observers.helpers import calculate_qparams
20
- from torch import FloatTensor, IntTensor, Tensor
21
-
22
-
23
- __all__ = ["MemorylessObserver"]
24
-
25
-
26
- @Observer.register("memoryless", alias=["dynamic"])
27
- class MemorylessObserver(Observer):
28
- """
29
- Implements a quantization observer that sets the scale and
30
- zero point based on the latest observed value without tracking state
31
- """
32
-
33
- def calculate_qparams(
34
- self,
35
- observed: Tensor,
36
- tensor_id: Optional[Any] = None,
37
- reduce_dims: Optional[Tuple[int]] = None,
38
- ) -> Tuple[FloatTensor, IntTensor]:
39
- """
40
- Returns the min and max values of observed tensor
41
-
42
- :param observed: observed tensor to calculate quantization parameters for
43
- :param tensor_id: optional id for tensor; not used for memoryless
44
- :param reduce_dims: optional tuple of dimensions to reduce along,
45
- returned scale and zero point will be shaped (1,) along the
46
- reduced dimensions
47
- :return: tuple of scale and zero point derived from the observed tensor
48
- """
49
-
50
- if not reduce_dims:
51
- min_val, max_val = torch.aminmax(observed)
52
- else:
53
- min_val = torch.amin(observed, dim=reduce_dims, keepdims=True)
54
- max_val = torch.amax(observed, dim=reduce_dims, keepdims=True)
55
-
56
- return calculate_qparams(min_val, max_val, self.quantization_args)