PyPI - compressed-tensors-nightly - Versions diffs - 0.9.2.20250325__tar.gz → 0.9.2.20250328__tar.gz - Mend

@@ -82,11 +82,32 @@ class BaseQuantizationCompressor(BaseCompressor):
         """
         compressed_dict = {}
         weight_suffix = ".weight"
+        input_zp_suffix = ".input_zero_point"
+        weight_zp_suffix = ".weight_zero_point"
         _LOGGER.debug(
             f"Compressing model with {len(model_state)} parameterized layers..."
         )
         for name, value in tqdm(model_state.items(), desc="Quantized Compression"):
+            # check if the parameter we're compressing is the weight zp
+            # or the input zp
+            is_weight_zp = name.endswith(weight_zp_suffix)
+            is_input_zp = name.endswith(input_zp_suffix)
+            # if we're saving the weight zp, fetch weight quant args
+            if is_weight_zp:
+                quant_args_zp = names_to_scheme.get(name[: -(len(weight_zp_suffix))])
+                if isinstance(quant_args_zp, tuple):
+                    # If tuple, first value is weight args, second is input args
+                    quant_args_zp = quant_args_zp[0]
+            # if we're saving the input zp, fetch input quant args
+            if is_input_zp:
+                input_args_zp = names_to_scheme.get(name[: -(len(input_zp_suffix))])
+                if isinstance(input_args_zp, tuple):
+                    # If tuple, first value is weight args, second is input args
+                    input_args_zp = input_args_zp[-1]
             if name.endswith(weight_suffix):
                 prefix = name[: -(len(weight_suffix))]
                 scale = model_state.get(merge_names(prefix, "weight_scale"), None)
@@ -94,7 +115,11 @@ class BaseQuantizationCompressor(BaseCompressor):
                 g_idx = model_state.get(merge_names(prefix, "weight_g_idx"), None)
                 if scale is not None:
                     # weight is quantized, compress it
-                    quant_args = names_to_scheme[prefix]
+                    if isinstance(names_to_scheme[prefix], tuple):
+                        quant_args = names_to_scheme[prefix][0]
+                    else:
+                        quant_args = names_to_scheme[prefix]
                     compressed_data = self.compress_weight(
                         weight=value,
                         scale=scale,
@@ -107,7 +132,11 @@ class BaseQuantizationCompressor(BaseCompressor):
                         compressed_dict[merge_names(prefix, key)] = value
                 else:
                     compressed_dict[name] = value.to("cpu")
-            elif name.endswith("zero_point") and torch.all(value == 0):
+            # only save if asym
+            elif is_weight_zp and quant_args_zp.symmetric:
+                continue
+            # only save if asym
+            elif is_input_zp and input_args_zp.symmetric:
                 continue
             elif name.endswith("g_idx") and torch.any(value <= -1):
                 continue

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors-nightly
-Version: 0.9.2.20250325
+Version: 0.9.2.20250328
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.

@@ -19,7 +19,7 @@ import os
 import re
 from contextlib import contextmanager
 from copy import deepcopy
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, TypeVar, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, TypeVar, Union
 import compressed_tensors
 import torch
@@ -522,10 +522,13 @@ class ModelCompressor:
                 update_parameter_data(module, data, param_name)
-def map_modules_to_quant_args(model: Module) -> Dict[str, QuantizationArgs]:
+def map_modules_to_quant_args(
+    model: Module,
+) -> Dict[str, Union[QuantizationArgs, Tuple[QuantizationArgs, QuantizationArgs]]]:
     """
     Given a pytorch model, map out the submodule name (usually linear layers)
-     to the QuantizationArgs
+    to the weight QuantizationArgs. If running input activation quantization, will also
+    map to the input QuantizationArgs in a tuple.
     :param model: pytorch model
     """
@@ -535,6 +538,12 @@ def map_modules_to_quant_args(model: Module) -> Dict[str, QuantizationArgs]:
             if submodule.quantization_scheme.weights is not None:
                 name = fix_fsdp_module_name(name)
                 quantized_modules_to_args[name] = submodule.quantization_scheme.weights
+                if submodule.quantization_scheme.input_activations is not None:
+                    weight_args = quantized_modules_to_args.get(name)
+                    quantized_modules_to_args[name] = (
+                        weight_args,
+                        submodule.quantization_scheme.input_activations,
+                    )
     return quantized_modules_to_args

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: compressed-tensors-nightly
-Version: 0.9.2.20250325
+Version: 0.9.2.20250328
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.

compressed-tensors-nightly 0.9.2.20250325__tar.gz → 0.9.2.20250328__tar.gz

compressed-tensors-nightly 0.9.2.20250325__tar.gz → 0.9.2.20250328__tar.gz