compressed-tensors-nightly 0.5.0.20240902__py3-none-any.whl → 0.5.0.20240903__py3-none-any.whl

This diff shows the changes between two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
@@ -108,6 +108,7 @@ class Compressor(RegistryMixin):
                 prefix = name[: -(len(weight_suffix))]
                 scale = model_state.get(merge_names(prefix, "weight_scale"), None)
                 zp = model_state.get(merge_names(prefix, "weight_zero_point"), None)
+                g_idx = model_state.get(merge_names(prefix, "weight_g_idx"), None)
                 if scale is not None:
                     # weight is quantized, compress it
                     quant_args = names_to_scheme[prefix]
@@ -115,6 +116,7 @@ class Compressor(RegistryMixin):
                         weight=value,
                         scale=scale,
                         zero_point=zp,
+                        g_idx=g_idx,
                         quantization_args=quant_args,
                         device="cpu",
                     )
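
The new g_idx lookup follows the same pattern as the existing scale and zero-point lookups. A minimal sketch of that key-resolution pattern, assuming merge_names simply joins name segments with a dot (the state-dict keys shown are hypothetical):

    # Hypothetical flat state dict; keys follow the pattern read above.
    state = {
        "model.layers.0.self_attn.q_proj.weight": None,
        "model.layers.0.self_attn.q_proj.weight_scale": None,
        "model.layers.0.self_attn.q_proj.weight_g_idx": None,  # new in this release
    }

    def merge_names(parent: str, child: str) -> str:
        # assumed behavior of the library's merge_names helper
        return f"{parent}.{child}"

    name = "model.layers.0.self_attn.q_proj.weight"
    prefix = name[: -len(".weight")]
    g_idx = state.get(merge_names(prefix, "weight_g_idx"), None)  # None if absent
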
@@ -271,6 +271,9 @@ class ModelCompressor:
        v_proj_has_quant_output = 0
        for name, module in model.named_modules():
            if not hasattr(module, "quantization_scheme"):
+               # We still want to count non-quantized q_proj
+               if name.endswith(".q_proj"):
+                   q_proj_has_no_quant_output += 1
                continue
            out_act = module.quantization_scheme.output_activations
            if name.endswith(".q_proj") and out_act is None:
@@ -348,7 +348,16 @@ def maybe_calibrate_or_quantize(
        update_parameter_data(module, updated_scale, f"{base_name}_scale")
        update_parameter_data(module, updated_zero_point, f"{base_name}_zero_point")

-   return fake_quantize(value, scale, zero_point, args, g_idx=g_idx)
+       scale = updated_scale
+       zero_point = updated_zero_point
+
+   return fake_quantize(
+       x=value,
+       scale=scale,
+       zero_point=zero_point,
+       args=args,
+       g_idx=g_idx,
+   )


 @torch.no_grad()
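
The call site now passes every argument by keyword and uses the freshly calibrated scale and zero point. Ignoring grouping and g_idx handling, fake quantization is the standard quantize/dequantize round trip; the following is a simplified per-tensor sketch, not the library's implementation:

    import torch

    def fake_quantize_sketch(
        x: torch.Tensor,
        scale: torch.Tensor,
        zero_point: torch.Tensor,
        q_min: int = -128,
        q_max: int = 127,
    ) -> torch.Tensor:
        # quantize: scale, shift by the zero point, round, clamp to the int grid
        q = torch.clamp(torch.round(x / scale) + zero_point, q_min, q_max)
        # dequantize: map back to float so downstream ops see the quantization error
        return (q - zero_point) * scale
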
@@ -21,6 +21,7 @@ from compressed_tensors.quantization.lifecycle.forward import (
     wrap_module_forward_quantized,
 )
 from compressed_tensors.quantization.quant_args import (
+    ActivationOrdering,
     QuantizationArgs,
     QuantizationStrategy,
 )
@@ -179,8 +180,8 @@ def _initialize_scale_zero_point_observer(
         )
         module.register_parameter(f"{base_name}_zero_point", init_zero_point)

-        # initialize with empty for actorder, to be populated by GPTQ or state_dict
-        if quantization_args.actorder:
+        # only grouped activation ordering has g_idx
+        if quantization_args.actorder == ActivationOrdering.GROUP:
             g_idx_shape = (weight_shape[1],)
             g_idx_dtype = torch.int
             init_g_idx = Parameter(
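
With this change, the g_idx parameter is registered only when actorder is the grouped variant. The Parameter(...) call is truncated in the diff; a sketch of what plausibly gets registered (the -1 fill value is an assumption):

    import torch
    from torch.nn import Parameter

    weight_shape = (4096, 11008)  # hypothetical (out_features, in_features)

    # one group index per input column, populated later by GPTQ or a state_dict
    init_g_idx = Parameter(
        torch.full((weight_shape[1],), -1, dtype=torch.int),
        requires_grad=False,
    )
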
@@ -13,7 +13,7 @@
 # limitations under the License.

 from enum import Enum
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional, Union

 import torch
 from pydantic import BaseModel, Field, field_validator, model_validator
@@ -25,6 +25,7 @@ __all__ = [
     "QuantizationStrategy",
     "QuantizationArgs",
     "round_to_quantized_type",
+    "ActivationOrdering",
 ]

 FP8_DTYPE = torch.float8_e4m3fn
@@ -51,6 +52,19 @@ class QuantizationStrategy(str, Enum):
     TOKEN = "token"


+class ActivationOrdering(str, Enum):
+    """
+    Enum storing strategies for activation ordering
+
+    Group: reorder groups and weight\n
+    Weight: only reorder weight, not groups. Slightly lower latency and
+    accuracy compared to group actorder\n
+    """
+
+    GROUP = "group"
+    WEIGHT = "weight"
+
+
 class QuantizationArgs(BaseModel, use_enum_values=True):
     """
     User facing arguments used to define a quantization config for weights or
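
Because ActivationOrdering mixes in str, its members compare equal to their lowercase string values; that is what makes the quantization_args.actorder == ActivationOrdering.GROUP check in initialize.py work even though use_enum_values=True stores the field as a plain string. A self-contained illustration:

    from enum import Enum

    class ActivationOrdering(str, Enum):
        GROUP = "group"
        WEIGHT = "weight"

    assert ActivationOrdering.GROUP == "group"  # str-enum equality
    assert ActivationOrdering("weight") is ActivationOrdering.WEIGHT
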
@@ -69,17 +83,17 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
         quantization. Note that enabling dynamic quantization will change the default
         observer to a memoryless one
     :param actorder: whether to apply group quantization in decreasing order of
-        activation. Defaults to False for arbitrary ordering
+        activation. Defaults to None for arbitrary ordering
     """

     num_bits: int = 8
-    type: QuantizationType = QuantizationType.INT.value
+    type: QuantizationType = QuantizationType.INT
     symmetric: bool = True
     group_size: Optional[int] = None
     strategy: Optional[QuantizationStrategy] = None
     block_structure: Optional[str] = None
     dynamic: bool = False
-    actorder: bool = False
+    actorder: Optional[ActivationOrdering] = None
     observer: str = Field(
         default="minmax",
         description=(
@@ -108,8 +122,15 @@ class QuantizationArgs(BaseModel, use_enum_values=True):

         return Observer.load_from_registry(self.observer, quantization_args=self)

+    @field_validator("type", mode="before")
+    def validate_type(cls, value) -> QuantizationType:
+        if isinstance(value, str):
+            return QuantizationType(value.lower())
+
+        return value
+
     @field_validator("group_size", mode="before")
-    def validate_group(cls, value) -> int:
+    def validate_group(cls, value) -> Union[int, None]:
         if value is None:
             return value
@@ -121,18 +142,29 @@ class QuantizationArgs(BaseModel, use_enum_values=True):

         return value

-    @model_validator(mode="before")
-    def validate_strategy(values) -> Dict[str, Any]:
-        model_fields = QuantizationArgs.model_fields
-        strategy = values.get("strategy", model_fields["strategy"].default)
-        group_size = values.get("group_size", model_fields["group_size"].default)
-        actorder = values.get("actorder", model_fields["actorder"].default)
+    @field_validator("strategy", mode="before")
+    def validate_strategy(cls, value) -> Union[QuantizationStrategy, None]:
+        if isinstance(value, str):
+            return QuantizationStrategy(value.lower())

-        if strategy is not None:
-            strategy = QuantizationStrategy(strategy.lower())
+        return value

-        else:
-            # use group_size to determinine strategy if not given explicity
+    @field_validator("actorder", mode="before")
+    def validate_actorder(cls, value) -> Optional[ActivationOrdering]:
+        if isinstance(value, str):
+            return ActivationOrdering(value.lower())
+
+        return value
+
+    @model_validator(mode="after")
+    def validate_model_after(model: "QuantizationArgs") -> Dict[str, Any]:
+        # extract user-passed values from dictionary
+        strategy = model.strategy
+        group_size = model.group_size
+        actorder = model.actorder
+
+        # infer strategy
+        if strategy is None:
             if group_size is None:
                 strategy = QuantizationStrategy.TENSOR
             elif group_size > 0:
@@ -145,6 +177,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
                     "strategy='group' and group_size = -1 for 'channel'"
                 )

+        # validate strategy and group
         if strategy == QuantizationStrategy.GROUP:
             if group_size is None or group_size <= 0:
                 raise ValueError(
@@ -152,14 +185,16 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
                 "set to a positive value"
             )

-        if actorder and strategy != QuantizationStrategy.GROUP:
+        # validate activation ordering and strategy
+        if actorder is not None and strategy != QuantizationStrategy.GROUP:
             raise ValueError(
-                "Group quantization must be specified in order to apply "
+                "Must use group quantization strategy in order to apply "
                 "activation ordering"
             )

-        values["strategy"] = strategy
-        return values
+        # write back modified values
+        model.strategy = strategy
+        return model

     def pytorch_dtype(self) -> torch.dtype:
         if self.type == QuantizationType.FLOAT:
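
Taken together, the new validators coerce case-insensitive strings into enums, infer the strategy from group_size, and reject activation ordering without group quantization. A usage sketch, assuming QuantizationArgs is importable from compressed_tensors.quantization as in this release:

    from compressed_tensors.quantization import QuantizationArgs

    # strings are lower-cased into enum members by the "before" validators;
    # strategy is inferred as group quantization because group_size > 0
    args = QuantizationArgs(num_bits=4, group_size=128, actorder="GROUP")
    assert args.actorder == "group"  # use_enum_values stores the string value
    assert args.strategy == "group"  # inferred by validate_model_after

    # actorder without the group strategy is rejected after model validation
    try:
        QuantizationArgs(strategy="tensor", actorder="weight")
    except ValueError:  # pydantic's ValidationError subclasses ValueError
        pass
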
@@ -110,6 +110,7 @@ def is_preset_scheme(name: str) -> bool:
     """
     return name.upper() in PRESET_SCHEMES

+UNQUANTIZED = dict()

 # 8 bit integer weights and 8 bit activations quantization
 W8A8 = dict(
@@ -208,6 +209,8 @@ FP8_DYNAMIC = dict(
 )

 PRESET_SCHEMES = {
+    # Unquantized (no-op)
+    "UNQUANTIZED": UNQUANTIZED,
     # Integer weight only schemes
     "W8A16": W8A16,
     "W4A16": W4A16,
@@ -181,7 +181,7 @@ def calculate_compression_ratio(model: Module) -> float:
        for parameter in model.parameters():
            uncompressed_bits = get_torch_bit_depth(parameter)
            compressed_bits = uncompressed_bits
-           if is_module_quantized(submodule):
+           if is_module_quantized(submodule) and submodule.quantization_scheme.weights:
                compressed_bits = submodule.quantization_scheme.weights.num_bits

            num_weights = parameter.numel()
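
For context, calculate_compression_ratio weighs each parameter's bit depth by its element count, and the added guard skips modules whose scheme has no weight quantization. A simplified sketch of the ratio being computed (hypothetical helper, not the library's code):

    def compression_ratio_sketch(params: list[tuple[int, int, int]]) -> float:
        # params: (uncompressed_bits, compressed_bits, num_elements) per tensor
        uncompressed = sum(bits * n for bits, _, n in params)
        compressed = sum(bits * n for _, bits, n in params)
        return uncompressed / compressed

    # e.g. fp16 weights quantized to 4 bits -> ratio 4.0
    assert compression_ratio_sketch([(16, 4, 1000)]) == 4.0
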
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: compressed-tensors-nightly
-Version: 0.5.0.20240902
+Version: 0.5.0.20240903
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
@@ -2,11 +2,11 @@ compressed_tensors/__init__.py,sha256=SV1csvHUVCd8kHXz6UDZim1HZ_fAVG3vfk-j_4Bb6h
 compressed_tensors/base.py,sha256=Mq4mfVQcJhNpha-BXzpOfpmFIdl01o09BJE7D2oQ_00,796
 compressed_tensors/version.py,sha256=DdMT4o5D6_t26gTuvhF1Q9HPeXY6vV5g7XMprWuHLdI,1586
 compressed_tensors/compressors/__init__.py,sha256=wmX4VnkUTS63xBwK5-6w8FP78bNZpcdcqvf2KOEC5E4,1133
-compressed_tensors/compressors/base.py,sha256=4BO07h28Epbl2ED43lORnPGmBZ3pMdaoLYym_LJTpPQ,9846
+compressed_tensors/compressors/base.py,sha256=sJB3QhvNHxwBmpoLy_obkJBuIZ2hY__Jd-Mf2-MAty8,9966
 compressed_tensors/compressors/dense.py,sha256=xcWECjcRY4INN6jC7vHx5wvUX3NmnKlxA9SVE1A6m2Q,1267
 compressed_tensors/compressors/helpers.py,sha256=k9avlkmeYj6vkOAvl-MgcixtP7ib24SCfhzZ-RusXfw,5403
 compressed_tensors/compressors/marlin_24.py,sha256=e7fGUyZbjUpA5VUMCPxqcYPGNiwoDKupHJaXWCoVKRw,9410
-compressed_tensors/compressors/model_compressor.py,sha256=Yv2V8Ey6AFDg2Tmvwc7-E_AnMFkeIy_HVu62ct650AI,16507
+compressed_tensors/compressors/model_compressor.py,sha256=gI6KKtH3eeWi2540Ayx-4bg9o8qjrvxlF4Gd_sqltGA,16678
 compressed_tensors/compressors/naive_quantized.py,sha256=z3h3ca5xKCN69mahutxcbzdv-OysiaxaM8P-Qum6zUQ,4823
 compressed_tensors/compressors/pack_quantized.py,sha256=27RVmJ2wg2dvCoawj407HSmKT3VPGJ6ujAMHlT26WlI,7571
 compressed_tensors/compressors/sparse_bitmask.py,sha256=kiDwBlFV0sJGLcIdDYxIiuF64ccgwDfqq1hWRQThYDc,8647
@@ -17,17 +17,17 @@ compressed_tensors/config/sparse_bitmask.py,sha256=pZUboRNZTu6NajGOQEFExoPknak5y
 compressed_tensors/linear/__init__.py,sha256=fH6rjBYAxuwrTzBTlTjTgCYNyh6TCvCqajCz4Im4YrA,617
 compressed_tensors/linear/compressed_linear.py,sha256=G0gEFfxLAUsgRcnfSV-PKz1ZBNTVokOauOoup7SE1mw,3210
 compressed_tensors/quantization/__init__.py,sha256=83J5bPB7PavN2TfCoW7_vEDhfYpm4TDrqYO9vdSQ5bk,760
-compressed_tensors/quantization/quant_args.py,sha256=wSC2ve1P-XRwZUpqEaqvQpj1Xe0EGgmmPEjPk9YEnyg,6797
+compressed_tensors/quantization/quant_args.py,sha256=Td71ap7oYxcrjAvRVafQ3hZv3BbmCL50Elyyv7EG0Rw,7733
 compressed_tensors/quantization/quant_config.py,sha256=NpVu8YJ4Xw2pIQW_PGaNaml8kx1bUnxkvb0jBYWbKdE,9971
-compressed_tensors/quantization/quant_scheme.py,sha256=_RKOFJI0T5xJVBLX63UeYkSY4EFAecsBnqzUIVBjeU0,6014
+compressed_tensors/quantization/quant_scheme.py,sha256=VRvWweqwlhjYMrKf62fXKQTeoJGhjJa3tXnE-TuFdFA,6093
 compressed_tensors/quantization/lifecycle/__init__.py,sha256=MXE2E7GfIfRRfhrdGy2Og3AZOz5N59B0ZGFcsD89y6c,821
 compressed_tensors/quantization/lifecycle/apply.py,sha256=uftWFunr_CpCZM_qWfo2O1USXKB2qSYD1pBJsO8BuCU,15285
 compressed_tensors/quantization/lifecycle/calibration.py,sha256=PlS_EqCOPqJD3QKuLPXO9AOtDzXtQWvEBTynFv-FFVw,2698
 compressed_tensors/quantization/lifecycle/compressed.py,sha256=laNDwvhk4S925qWTPHCufo4uDdMo24NDV1qhsAkf5Iw,2225
-compressed_tensors/quantization/lifecycle/forward.py,sha256=fZMSrUXX2NnkQiappEpT5SO-6JxbX5wiw9hyjfKNIZo,13538
+compressed_tensors/quantization/lifecycle/forward.py,sha256=PljD9pzATILEOiC3ZdHUTsfSbZdAa6iSIxWmvAHLG9I,13688
 compressed_tensors/quantization/lifecycle/frozen.py,sha256=h1XYt89MouBTf3jTYLG_6OdFxIu5q2N8tPjsy6J4E6Y,1726
 compressed_tensors/quantization/lifecycle/helpers.py,sha256=TmLY_G5VP_Fg2Ywio_dxoHRTxOKZdT7_aG5S9WtD4zI,2424
-compressed_tensors/quantization/lifecycle/initialize.py,sha256=r8GNYIUYVHJ-539mHKnhhGysCluaOG6VieH6CQD4eeo,7112
+compressed_tensors/quantization/lifecycle/initialize.py,sha256=S5Kwy16Da8WUIIpa1xVKc72MijJ5C_rqM6JjanZ7MGk,7133
 compressed_tensors/quantization/observers/__init__.py,sha256=4Sa7rqi5RB_S5bPO8KmncETiqDsoMBhwP37arlQym8s,764
 compressed_tensors/quantization/observers/base.py,sha256=5ovQicWPYHjIxr6-EkQ4lgOX0PpI9g23iSzKpxjM1Zg,8420
 compressed_tensors/quantization/observers/helpers.py,sha256=s_A23Qa_BLfOdHJCN5bm-qPWkhjjj_RIVrhSp1Y9Dtk,4211
@@ -35,7 +35,7 @@ compressed_tensors/quantization/observers/memoryless.py,sha256=jH_c6K3gxf4W3VNXQ
 compressed_tensors/quantization/observers/min_max.py,sha256=sQXqU3z-voxIDfR_9mQzwQUflZj2sASm_G8CYaXntFw,3865
 compressed_tensors/quantization/observers/mse.py,sha256=Aeh-253Vbab1F8cYuBiGNn4OXWJ67wXQ_JVfl3mu2a8,6034
 compressed_tensors/quantization/utils/__init__.py,sha256=VdtEmP0bvuND_IGQnyqUPc5lnFp-1_yD7StKSX4x80w,656
-compressed_tensors/quantization/utils/helpers.py,sha256=YjXABJQUnelof-z7qcwck6fnrFLh4uMSrOmPiqNp_RY,8591
+compressed_tensors/quantization/utils/helpers.py,sha256=pwvU613XRvMDtI5b39II5jukBl5OUCqoX0ofVRpOFRY,8633
 compressed_tensors/registry/__init__.py,sha256=FwLSNYqfIrb5JD_6OK_MT4_svvKTN_nEhpgQlQvGbjI,658
 compressed_tensors/registry/registry.py,sha256=fxjOjh2wklCvJhQxwofdy-zV8q7MkQ85SLG77nml2iA,11890
 compressed_tensors/utils/__init__.py,sha256=gS4gSU2pwcAbsKj-6YMaqhm25udFy6ISYaWBf-myRSM,808
@@ -45,8 +45,8 @@ compressed_tensors/utils/permutations_24.py,sha256=kx6fsfDHebx94zsSzhXGyCyuC9sVy
 compressed_tensors/utils/permute.py,sha256=V6tJLKo3Syccj-viv4F7ZKZgJeCB-hl-dK8RKI_kBwI,2355
 compressed_tensors/utils/safetensors_load.py,sha256=m08ANVuTBxQdoa6LufDgcNJ7wCLDJolyZljB8VEybAU,8578
 compressed_tensors/utils/semi_structured_conversions.py,sha256=XKNffPum54kPASgqKzgKvyeqWPAkair2XEQXjkp7ho8,13489
-compressed_tensors_nightly-0.5.0.20240902.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-compressed_tensors_nightly-0.5.0.20240902.dist-info/METADATA,sha256=C5qh78nBJycno_oq2ML1puURNBO0pKRLCNY2YrV5SMg,6799
-compressed_tensors_nightly-0.5.0.20240902.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
-compressed_tensors_nightly-0.5.0.20240902.dist-info/top_level.txt,sha256=w2i-GyPs2s1UwVxvutSvN_lM22SXC2hQFBmoMcPnV7Y,19
-compressed_tensors_nightly-0.5.0.20240902.dist-info/RECORD,,
+compressed_tensors_nightly-0.5.0.20240903.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+compressed_tensors_nightly-0.5.0.20240903.dist-info/METADATA,sha256=plHC3Fg0bs-UlLdWYSOLl7RoMbum05Vg-JLDaje0YrY,6799
+compressed_tensors_nightly-0.5.0.20240903.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
+compressed_tensors_nightly-0.5.0.20240903.dist-info/top_level.txt,sha256=w2i-GyPs2s1UwVxvutSvN_lM22SXC2hQFBmoMcPnV7Y,19
+compressed_tensors_nightly-0.5.0.20240903.dist-info/RECORD,,