transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff represents the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
- transformers/__init__.py +20 -1
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +68 -5
- transformers/core_model_loading.py +201 -35
- transformers/dependency_versions_table.py +1 -1
- transformers/feature_extraction_utils.py +54 -22
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +162 -122
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +101 -64
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +2 -12
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +12 -0
- transformers/integrations/accelerate.py +44 -111
- transformers/integrations/aqlm.py +3 -5
- transformers/integrations/awq.py +2 -5
- transformers/integrations/bitnet.py +5 -8
- transformers/integrations/bitsandbytes.py +16 -15
- transformers/integrations/deepspeed.py +18 -3
- transformers/integrations/eetq.py +3 -5
- transformers/integrations/fbgemm_fp8.py +1 -1
- transformers/integrations/finegrained_fp8.py +6 -16
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/higgs.py +2 -5
- transformers/integrations/hub_kernels.py +23 -5
- transformers/integrations/integration_utils.py +35 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +4 -10
- transformers/integrations/peft.py +5 -0
- transformers/integrations/quanto.py +5 -2
- transformers/integrations/spqr.py +3 -5
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/vptq.py +3 -5
- transformers/modeling_gguf_pytorch_utils.py +66 -19
- transformers/modeling_rope_utils.py +78 -81
- transformers/modeling_utils.py +583 -503
- transformers/models/__init__.py +19 -0
- transformers/models/afmoe/modeling_afmoe.py +7 -16
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/align/modeling_align.py +12 -6
- transformers/models/altclip/modeling_altclip.py +7 -3
- transformers/models/apertus/modeling_apertus.py +4 -2
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +1 -1
- transformers/models/aria/modeling_aria.py +8 -4
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +27 -0
- transformers/models/auto/feature_extraction_auto.py +7 -3
- transformers/models/auto/image_processing_auto.py +4 -2
- transformers/models/auto/modeling_auto.py +31 -0
- transformers/models/auto/processing_auto.py +4 -0
- transformers/models/auto/tokenization_auto.py +132 -153
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +18 -19
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +9 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +3 -0
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
- transformers/models/bit/modeling_bit.py +5 -1
- transformers/models/bitnet/modeling_bitnet.py +1 -1
- transformers/models/blenderbot/modeling_blenderbot.py +7 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +8 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -0
- transformers/models/bloom/modeling_bloom.py +13 -44
- transformers/models/blt/modeling_blt.py +162 -2
- transformers/models/blt/modular_blt.py +168 -3
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +6 -0
- transformers/models/bros/modeling_bros.py +8 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/canine/modeling_canine.py +6 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +9 -4
- transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +25 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clipseg/modeling_clipseg.py +4 -0
- transformers/models/clvp/modeling_clvp.py +14 -3
- transformers/models/code_llama/tokenization_code_llama.py +1 -1
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/cohere/modeling_cohere.py +1 -1
- transformers/models/cohere2/modeling_cohere2.py +1 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
- transformers/models/convbert/modeling_convbert.py +3 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +3 -1
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +14 -2
- transformers/models/cvt/modeling_cvt.py +5 -1
- transformers/models/cwm/modeling_cwm.py +1 -1
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +46 -39
- transformers/models/d_fine/modular_d_fine.py +15 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +1 -1
- transformers/models/dac/modeling_dac.py +4 -4
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +1 -1
- transformers/models/deberta/modeling_deberta.py +2 -0
- transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
- transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
- transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +8 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +12 -1
- transformers/models/dia/modular_dia.py +11 -0
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +3 -3
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
- transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/doge/modeling_doge.py +1 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +16 -12
- transformers/models/dots1/modeling_dots1.py +14 -5
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +5 -2
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +5 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +8 -2
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt_fast.py +46 -14
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +6 -1
- transformers/models/evolla/modeling_evolla.py +9 -1
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +1 -1
- transformers/models/falcon/modeling_falcon.py +3 -3
- transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
- transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
- transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +14 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +4 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
- transformers/models/florence2/modeling_florence2.py +20 -3
- transformers/models/florence2/modular_florence2.py +13 -0
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +16 -0
- transformers/models/gemma/modeling_gemma.py +10 -12
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma2/modeling_gemma2.py +1 -1
- transformers/models/gemma2/modular_gemma2.py +1 -1
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +28 -7
- transformers/models/gemma3/modular_gemma3.py +26 -6
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +47 -9
- transformers/models/gemma3n/modular_gemma3n.py +51 -9
- transformers/models/git/modeling_git.py +181 -126
- transformers/models/glm/modeling_glm.py +1 -1
- transformers/models/glm4/modeling_glm4.py +1 -1
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +15 -5
- transformers/models/glm4v/modular_glm4v.py +11 -3
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
- transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +8 -5
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
- transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
- transformers/models/gptj/modeling_gptj.py +15 -6
- transformers/models/granite/modeling_granite.py +1 -1
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +2 -3
- transformers/models/granitemoe/modular_granitemoe.py +1 -2
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
- transformers/models/groupvit/modeling_groupvit.py +6 -1
- transformers/models/helium/modeling_helium.py +1 -1
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
- transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
- transformers/models/hubert/modeling_hubert.py +4 -0
- transformers/models/hubert/modular_hubert.py +4 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +16 -0
- transformers/models/idefics/modeling_idefics.py +10 -0
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +9 -2
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +11 -8
- transformers/models/internvl/modular_internvl.py +5 -9
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +24 -19
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +15 -7
- transformers/models/janus/modular_janus.py +16 -7
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +14 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/configuration_lasr.py +4 -0
- transformers/models/lasr/modeling_lasr.py +3 -2
- transformers/models/lasr/modular_lasr.py +8 -1
- transformers/models/lasr/processing_lasr.py +0 -2
- transformers/models/layoutlm/modeling_layoutlm.py +5 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +18 -0
- transformers/models/lfm2/modeling_lfm2.py +1 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lilt/modeling_lilt.py +19 -15
- transformers/models/llama/modeling_llama.py +1 -1
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +8 -4
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
- transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
- transformers/models/longt5/modeling_longt5.py +0 -4
- transformers/models/m2m_100/modeling_m2m_100.py +10 -0
- transformers/models/mamba/modeling_mamba.py +2 -1
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +3 -0
- transformers/models/markuplm/modeling_markuplm.py +5 -8
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +9 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +9 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mimi/modeling_mimi.py +25 -4
- transformers/models/minimax/modeling_minimax.py +16 -3
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +1 -1
- transformers/models/mistral/modeling_mistral.py +1 -1
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +12 -4
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +13 -2
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +4 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
- transformers/models/modernbert/modeling_modernbert.py +12 -1
- transformers/models/modernbert/modular_modernbert.py +12 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
- transformers/models/moonshine/modeling_moonshine.py +1 -1
- transformers/models/moshi/modeling_moshi.py +21 -51
- transformers/models/mpnet/modeling_mpnet.py +2 -0
- transformers/models/mra/modeling_mra.py +4 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +0 -10
- transformers/models/musicgen/modeling_musicgen.py +5 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +1 -1
- transformers/models/nemotron/modeling_nemotron.py +3 -3
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +11 -16
- transformers/models/nystromformer/modeling_nystromformer.py +7 -0
- transformers/models/olmo/modeling_olmo.py +1 -1
- transformers/models/olmo2/modeling_olmo2.py +1 -1
- transformers/models/olmo3/modeling_olmo3.py +1 -1
- transformers/models/olmoe/modeling_olmoe.py +12 -4
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +7 -38
- transformers/models/openai/modeling_openai.py +12 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +7 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +7 -3
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/modeling_parakeet.py +5 -0
- transformers/models/parakeet/modular_parakeet.py +5 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
- transformers/models/patchtst/modeling_patchtst.py +5 -4
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/models/pe_audio/processing_pe_audio.py +24 -0
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +3 -0
- transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +5 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +1 -1
- transformers/models/phi/modeling_phi.py +1 -1
- transformers/models/phi3/modeling_phi3.py +1 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +12 -4
- transformers/models/phimoe/modular_phimoe.py +1 -1
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +1 -1
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +7 -0
- transformers/models/plbart/modular_plbart.py +6 -0
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +11 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prophetnet/modeling_prophetnet.py +2 -1
- transformers/models/qwen2/modeling_qwen2.py +1 -1
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
- transformers/models/qwen3/modeling_qwen3.py +1 -1
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
- transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +7 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
- transformers/models/reformer/modeling_reformer.py +9 -1
- transformers/models/regnet/modeling_regnet.py +4 -0
- transformers/models/rembert/modeling_rembert.py +7 -1
- transformers/models/resnet/modeling_resnet.py +8 -3
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +4 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +1 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +5 -1
- transformers/models/sam2/modular_sam2.py +5 -1
- transformers/models/sam2_video/modeling_sam2_video.py +51 -43
- transformers/models/sam2_video/modular_sam2_video.py +31 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +23 -0
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +3 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
- transformers/models/seed_oss/modeling_seed_oss.py +1 -1
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +2 -2
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +63 -41
- transformers/models/smollm3/modeling_smollm3.py +1 -1
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
- transformers/models/speecht5/modeling_speecht5.py +28 -0
- transformers/models/splinter/modeling_splinter.py +9 -3
- transformers/models/squeezebert/modeling_squeezebert.py +2 -0
- transformers/models/stablelm/modeling_stablelm.py +1 -1
- transformers/models/starcoder2/modeling_starcoder2.py +1 -1
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/swiftformer/modeling_swiftformer.py +4 -0
- transformers/models/swin/modeling_swin.py +16 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +49 -33
- transformers/models/swinv2/modeling_swinv2.py +41 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +1 -7
- transformers/models/t5gemma/modeling_t5gemma.py +1 -1
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +1 -1
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/timesfm/modeling_timesfm.py +12 -0
- transformers/models/timesfm/modular_timesfm.py +12 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
- transformers/models/trocr/modeling_trocr.py +1 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +4 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +3 -7
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +0 -6
- transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +7 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/visual_bert/modeling_visual_bert.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +4 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +5 -3
- transformers/models/x_clip/modeling_x_clip.py +2 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +10 -0
- transformers/models/xlm/modeling_xlm.py +13 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +4 -1
- transformers/models/zamba/modeling_zamba.py +2 -1
- transformers/models/zamba2/modeling_zamba2.py +3 -2
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +7 -0
- transformers/pipelines/__init__.py +9 -6
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/document_question_answering.py +1 -1
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +127 -56
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +9 -64
- transformers/quantizers/quantizer_aqlm.py +1 -18
- transformers/quantizers/quantizer_auto_round.py +1 -10
- transformers/quantizers/quantizer_awq.py +3 -8
- transformers/quantizers/quantizer_bitnet.py +1 -6
- transformers/quantizers/quantizer_bnb_4bit.py +9 -49
- transformers/quantizers/quantizer_bnb_8bit.py +9 -19
- transformers/quantizers/quantizer_compressed_tensors.py +1 -4
- transformers/quantizers/quantizer_eetq.py +2 -12
- transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
- transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
- transformers/quantizers/quantizer_fp_quant.py +4 -4
- transformers/quantizers/quantizer_gptq.py +1 -4
- transformers/quantizers/quantizer_higgs.py +2 -6
- transformers/quantizers/quantizer_mxfp4.py +2 -28
- transformers/quantizers/quantizer_quanto.py +14 -14
- transformers/quantizers/quantizer_spqr.py +3 -8
- transformers/quantizers/quantizer_torchao.py +28 -124
- transformers/quantizers/quantizer_vptq.py +1 -10
- transformers/testing_utils.py +28 -12
- transformers/tokenization_mistral_common.py +3 -2
- transformers/tokenization_utils_base.py +3 -2
- transformers/tokenization_utils_tokenizers.py +25 -2
- transformers/trainer.py +24 -2
- transformers/trainer_callback.py +8 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/training_args.py +8 -10
- transformers/utils/__init__.py +4 -0
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +34 -25
- transformers/utils/generic.py +20 -0
- transformers/utils/import_utils.py +51 -9
- transformers/utils/kernel_config.py +71 -18
- transformers/utils/quantization_config.py +8 -8
- transformers/video_processing_utils.py +16 -12
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
|
@@ -68,13 +68,12 @@ class BitNetHfQuantizer(HfQuantizer):
|
|
|
68
68
|
def _process_model_before_weight_loading(
|
|
69
69
|
self,
|
|
70
70
|
model: "PreTrainedModel",
|
|
71
|
-
keep_in_fp32_modules: list[str] | None = None,
|
|
72
71
|
**kwargs,
|
|
73
72
|
):
|
|
74
73
|
from ..integrations import replace_with_bitnet_linear
|
|
75
74
|
|
|
76
75
|
self.modules_to_not_convert = self.get_modules_to_not_convert(
|
|
77
|
-
model, self.quantization_config.modules_to_not_convert,
|
|
76
|
+
model, self.quantization_config.modules_to_not_convert, model._keep_in_fp32_modules
|
|
78
77
|
)
|
|
79
78
|
|
|
80
79
|
model = replace_with_bitnet_linear(
|
|
@@ -87,10 +86,6 @@ class BitNetHfQuantizer(HfQuantizer):
|
|
|
87
86
|
max_memory = {key: val * 0.90 for key, val in max_memory.items()}
|
|
88
87
|
return max_memory
|
|
89
88
|
|
|
90
|
-
def adjust_target_dtype(self, target_dtype: "torch.dtype") -> "torch.dtype":
|
|
91
|
-
target_dtype = torch.int8
|
|
92
|
-
return target_dtype
|
|
93
|
-
|
|
94
89
|
def is_serializable(self):
|
|
95
90
|
return True
|
|
96
91
|
|
|
@@ -51,15 +51,6 @@ class Bnb4BitHfQuantizer(HfQuantizer):
|
|
|
51
51
|
def __init__(self, quantization_config, **kwargs):
|
|
52
52
|
super().__init__(quantization_config, **kwargs)
|
|
53
53
|
|
|
54
|
-
# This describes the additional items that are saved on the state dict (on the params themselves)
|
|
55
|
-
self.bnb_keys = [
|
|
56
|
-
f"quant_state.bitsandbytes__{self.quantization_config.bnb_4bit_quant_type}",
|
|
57
|
-
"absmax",
|
|
58
|
-
"quant_map",
|
|
59
|
-
]
|
|
60
|
-
if self.quantization_config.bnb_4bit_use_double_quant:
|
|
61
|
-
self.bnb_keys.extend(["nested_absmax", "nested_quant_map"])
|
|
62
|
-
|
|
63
54
|
def validate_environment(self, *args, **kwargs):
|
|
64
55
|
if not is_accelerate_available():
|
|
65
56
|
raise ImportError(
|
|
@@ -87,55 +78,25 @@ class Bnb4BitHfQuantizer(HfQuantizer):
|
|
|
87
78
|
"for more details. "
|
|
88
79
|
)
|
|
89
80
|
|
|
90
|
-
def
|
|
91
|
-
|
|
81
|
+
def param_element_size(self, model: "PreTrainedModel", param_name: str, param: "torch.Tensor") -> float:
|
|
82
|
+
"Return the element size (in bytes) for `param_name`."
|
|
83
|
+
if self.param_needs_quantization(model, param_name):
|
|
84
|
+
# 4 bit
|
|
85
|
+
return 0.5
|
|
92
86
|
|
|
93
|
-
|
|
94
|
-
logger.info("target_dtype {target_dtype} is replaced by `CustomDtype.INT4` for 4-bit BnB quantization")
|
|
95
|
-
return CustomDtype.INT4
|
|
87
|
+
return super().param_element_size(model, param_name, param)
|
|
96
88
|
|
|
97
89
|
def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool:
|
|
98
90
|
import bitsandbytes as bnb
|
|
99
91
|
|
|
100
|
-
# TODO: maybe remove
|
|
101
|
-
# # They are on the params themselves, so we cannot easily extract the module from the name
|
|
102
|
-
if any(param_name.endswith(x) for x in self.bnb_keys):
|
|
103
|
-
return True
|
|
104
92
|
module, name = get_module_from_name(model, param_name)
|
|
105
93
|
return isinstance(module, bnb.nn.Linear4bit) and name != "bias"
|
|
106
94
|
|
|
107
|
-
def get_param_name(self, param_name: str) -> str:
|
|
108
|
-
"""
|
|
109
|
-
Get the right param_name in order to get the module associated with the param.
|
|
110
|
-
This is useful for quantized stats lile absmax or quant_map as we need to update the param_name to get the module as they are stored in ...weight.absmax.
|
|
111
|
-
"""
|
|
112
|
-
if self.pre_quantized:
|
|
113
|
-
# We need to get the param name of quantized weights and not its components. Otherwise, we won't be able to get the nn.Module associated.
|
|
114
|
-
if any(param_name.endswith(x) for x in self.bnb_keys):
|
|
115
|
-
param_name = (
|
|
116
|
-
param_name.rsplit(".", 1)[0] if "quant_state." not in param_name else param_name.rsplit(".", 2)[0]
|
|
117
|
-
)
|
|
118
|
-
return param_name
|
|
119
|
-
|
|
120
95
|
def adjust_max_memory(self, max_memory: dict[str, int | str]) -> dict[str, int | str]:
|
|
121
96
|
# need more space for buffers that are created during quantization
|
|
122
97
|
max_memory = {key: val * 0.90 for key, val in max_memory.items()}
|
|
123
98
|
return max_memory
|
|
124
99
|
|
|
125
|
-
def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
|
|
126
|
-
# TODO: remove ? is it still true ? we will move to dtype = "auto" so it will likely be either fp16 or bf16
|
|
127
|
-
if dtype is None:
|
|
128
|
-
# We force the `dtype` to be float16, this is a requirement from `bitsandbytes`
|
|
129
|
-
logger.info(
|
|
130
|
-
"Overriding dtype=%s with `dtype=torch.float16` due to "
|
|
131
|
-
"requirements of `bitsandbytes` to enable model loading in 8-bit or 4-bit. "
|
|
132
|
-
"Pass your own dtype to specify the dtype of the remaining non-linear layers or pass"
|
|
133
|
-
" dtype=torch.float16 to remove this warning.",
|
|
134
|
-
dtype,
|
|
135
|
-
)
|
|
136
|
-
dtype = torch.float16
|
|
137
|
-
return dtype
|
|
138
|
-
|
|
139
100
|
def update_device_map(self, device_map):
|
|
140
101
|
if device_map is None:
|
|
141
102
|
if torch.cuda.is_available():
|
|
@@ -159,13 +120,12 @@ class Bnb4BitHfQuantizer(HfQuantizer):
|
|
|
159
120
|
self,
|
|
160
121
|
model: "PreTrainedModel",
|
|
161
122
|
device_map,
|
|
162
|
-
keep_in_fp32_modules: list[str] | None = None,
|
|
163
123
|
**kwargs,
|
|
164
124
|
):
|
|
165
125
|
from ..integrations import replace_with_bnb_linear
|
|
166
126
|
|
|
167
127
|
self.modules_to_not_convert = self.get_modules_to_not_convert(
|
|
168
|
-
model, self.quantization_config.llm_int8_skip_modules,
|
|
128
|
+
model, self.quantization_config.llm_int8_skip_modules, model._keep_in_fp32_modules
|
|
169
129
|
)
|
|
170
130
|
|
|
171
131
|
if self.quantization_config.llm_int8_enable_fp32_cpu_offload:
|
|
@@ -192,10 +152,10 @@ class Bnb4BitHfQuantizer(HfQuantizer):
|
|
|
192
152
|
def is_trainable(self) -> bool:
|
|
193
153
|
return True
|
|
194
154
|
|
|
195
|
-
def _dequantize(self, model):
|
|
155
|
+
def _dequantize(self, model, dtype=None):
|
|
196
156
|
from ..integrations import dequantize_and_replace
|
|
197
157
|
|
|
198
|
-
model = dequantize_and_replace(model, quantization_config=self.quantization_config)
|
|
158
|
+
model = dequantize_and_replace(model, quantization_config=self.quantization_config, dtype=dtype)
|
|
199
159
|
return model
|
|
200
160
|
|
|
201
161
|
def get_quantize_ops(self):
|
|
@@ -83,19 +83,6 @@ class Bnb8BitHfQuantizer(HfQuantizer):
|
|
|
83
83
|
max_memory = {key: val * 0.90 for key, val in max_memory.items()}
|
|
84
84
|
return max_memory
|
|
85
85
|
|
|
86
|
-
def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
|
|
87
|
-
if dtype is None:
|
|
88
|
-
# We force the `dtype` to be float16, this is a requirement from `bitsandbytes`
|
|
89
|
-
logger.info(
|
|
90
|
-
"Overriding dtype=%s with `dtype=torch.float16` due to "
|
|
91
|
-
"requirements of `bitsandbytes` to enable model loading in 8-bit or 4-bit. "
|
|
92
|
-
"Pass your own dtype to specify the dtype of the remaining non-linear layers or pass"
|
|
93
|
-
" dtype=torch.float16 to remove this warning.",
|
|
94
|
-
dtype,
|
|
95
|
-
)
|
|
96
|
-
dtype = torch.float16
|
|
97
|
-
return dtype
|
|
98
|
-
|
|
99
86
|
def update_device_map(self, device_map):
|
|
100
87
|
if device_map is None:
|
|
101
88
|
if torch.cuda.is_available():
|
|
@@ -115,8 +102,12 @@ class Bnb8BitHfQuantizer(HfQuantizer):
|
|
|
115
102
|
)
|
|
116
103
|
return device_map
|
|
117
104
|
|
|
118
|
-
def
|
|
119
|
-
|
|
105
|
+
def param_element_size(self, model: "PreTrainedModel", param_name: str, param: "torch.Tensor") -> float:
|
|
106
|
+
"Return the element size (in bytes) for `param_name`."
|
|
107
|
+
if self.param_needs_quantization(model, param_name):
|
|
108
|
+
# 8-bit
|
|
109
|
+
return 1
|
|
110
|
+
return super().param_element_size(model, param_name, param)
|
|
120
111
|
|
|
121
112
|
def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool:
|
|
122
113
|
import bitsandbytes as bnb
|
|
@@ -133,13 +124,12 @@ class Bnb8BitHfQuantizer(HfQuantizer):
|
|
|
133
124
|
self,
|
|
134
125
|
model: "PreTrainedModel",
|
|
135
126
|
device_map,
|
|
136
|
-
keep_in_fp32_modules: list[str] | None = None,
|
|
137
127
|
**kwargs,
|
|
138
128
|
):
|
|
139
129
|
from ..integrations import replace_with_bnb_linear
|
|
140
130
|
|
|
141
131
|
self.modules_to_not_convert = self.get_modules_to_not_convert(
|
|
142
|
-
model, self.quantization_config.llm_int8_skip_modules,
|
|
132
|
+
model, self.quantization_config.llm_int8_skip_modules, model._keep_in_fp32_modules
|
|
143
133
|
)
|
|
144
134
|
|
|
145
135
|
if self.quantization_config.llm_int8_enable_fp32_cpu_offload:
|
|
@@ -161,10 +151,10 @@ class Bnb8BitHfQuantizer(HfQuantizer):
|
|
|
161
151
|
def is_trainable(self) -> bool:
|
|
162
152
|
return True
|
|
163
153
|
|
|
164
|
-
def _dequantize(self, model):
|
|
154
|
+
def _dequantize(self, model, dtype=None):
|
|
165
155
|
from ..integrations import dequantize_and_replace
|
|
166
156
|
|
|
167
|
-
model = dequantize_and_replace(model, quantization_config=self.quantization_config)
|
|
157
|
+
model = dequantize_and_replace(model, quantization_config=self.quantization_config, dtype=dtype)
|
|
168
158
|
return model
|
|
169
159
|
|
|
170
160
|
def get_quantize_ops(self):
|
|
@@ -59,10 +59,7 @@ class CompressedTensorsHfQuantizer(HfQuantizer):
|
|
|
59
59
|
)
|
|
60
60
|
|
|
61
61
|
def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
|
|
62
|
-
if dtype
|
|
63
|
-
logger.info("Loading model using torch.float16 for compressed-tensors quantization")
|
|
64
|
-
dtype = torch.float16
|
|
65
|
-
elif dtype != torch.float16:
|
|
62
|
+
if dtype != torch.float16:
|
|
66
63
|
logger.info("We suggest you to set `dtype=torch.float16` for better efficiency with compressed_tensors.")
|
|
67
64
|
return dtype
|
|
68
65
|
|
|
@@ -64,16 +64,7 @@ class EetqHfQuantizer(HfQuantizer):
|
|
|
64
64
|
)
|
|
65
65
|
|
|
66
66
|
def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
|
|
67
|
-
if dtype
|
|
68
|
-
dtype = torch.float16
|
|
69
|
-
logger.info(
|
|
70
|
-
"Overriding dtype=%s with `dtype=torch.float16` due to "
|
|
71
|
-
"requirements of `eetq` to enable model loading in 8-bit. "
|
|
72
|
-
"Pass your own dtype to specify the dtype of the remaining non-linear layers or pass"
|
|
73
|
-
" dtype=torch.float16 to remove this warning.",
|
|
74
|
-
dtype,
|
|
75
|
-
)
|
|
76
|
-
elif dtype != torch.float16:
|
|
67
|
+
if dtype != torch.float16:
|
|
77
68
|
logger.info("We suggest you to set `dtype=torch.float16` for better efficiency with EETQ.")
|
|
78
69
|
return dtype
|
|
79
70
|
|
|
@@ -92,13 +83,12 @@ class EetqHfQuantizer(HfQuantizer):
|
|
|
92
83
|
def _process_model_before_weight_loading(
|
|
93
84
|
self,
|
|
94
85
|
model: "PreTrainedModel",
|
|
95
|
-
keep_in_fp32_modules: list[str] | None = None,
|
|
96
86
|
**kwargs,
|
|
97
87
|
):
|
|
98
88
|
from ..integrations import replace_with_eetq_linear
|
|
99
89
|
|
|
100
90
|
self.modules_to_not_convert = self.get_modules_to_not_convert(
|
|
101
|
-
model, self.quantization_config.modules_to_not_convert,
|
|
91
|
+
model, self.quantization_config.modules_to_not_convert, model._keep_in_fp32_modules
|
|
102
92
|
)
|
|
103
93
|
|
|
104
94
|
model = replace_with_eetq_linear(
|
|
@@ -84,19 +84,11 @@ class FbgemmFp8HfQuantizer(HfQuantizer):
|
|
|
84
84
|
)
|
|
85
85
|
|
|
86
86
|
def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
|
|
87
|
-
if dtype
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
"Overriding dtype=%s with `dtype=torch.bloat16` due to "
|
|
91
|
-
"requirements of `fbgemm-gpu` to enable model loading in fp8. "
|
|
92
|
-
"Pass your own dtype to specify the dtype of the remaining non-linear layers or pass"
|
|
93
|
-
" dtype=torch.bfloat16 to remove this warning.",
|
|
94
|
-
dtype,
|
|
95
|
-
)
|
|
96
|
-
elif dtype == torch.float16:
|
|
97
|
-
raise ValueError(
|
|
98
|
-
"You cannot use FP8 with dtype=torch.float16. We recommend you passing dtype=torch.bfloat16"
|
|
87
|
+
if dtype != torch.bfloat16:
|
|
88
|
+
logger.warning_once(
|
|
89
|
+
f"Setting dtype to {dtype}, but only bfloat16 is supported right now. Overwriting torch_dtype to bfloat16."
|
|
99
90
|
)
|
|
91
|
+
dtype = torch.bfloat16
|
|
100
92
|
return dtype
|
|
101
93
|
|
|
102
94
|
def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool:
|
|
@@ -119,13 +111,12 @@ class FbgemmFp8HfQuantizer(HfQuantizer):
|
|
|
119
111
|
def _process_model_before_weight_loading(
|
|
120
112
|
self,
|
|
121
113
|
model: "PreTrainedModel",
|
|
122
|
-
keep_in_fp32_modules: list[str] | None = None,
|
|
123
114
|
**kwargs,
|
|
124
115
|
):
|
|
125
116
|
from ..integrations import replace_with_fbgemm_fp8_linear
|
|
126
117
|
|
|
127
118
|
self.modules_to_not_convert = self.get_modules_to_not_convert(
|
|
128
|
-
model, self.quantization_config.modules_to_not_convert,
|
|
119
|
+
model, self.quantization_config.modules_to_not_convert, model._keep_in_fp32_modules
|
|
129
120
|
)
|
|
130
121
|
|
|
131
122
|
model = replace_with_fbgemm_fp8_linear(
|
|
@@ -33,7 +33,7 @@ class FineGrainedFP8HfQuantizer(HfQuantizer):
|
|
|
33
33
|
return
|
|
34
34
|
|
|
35
35
|
if not torch.cuda.is_available() and not is_torch_xpu_available():
|
|
36
|
-
if self.pre_quantized
|
|
36
|
+
if self.pre_quantized:
|
|
37
37
|
logger.warning_once(
|
|
38
38
|
"Using FP8 quantized models requires a GPU or XPU, we will default to dequantizing the model to bf16 since no GPU or XPU is available"
|
|
39
39
|
)
|
|
@@ -46,10 +46,13 @@ class FineGrainedFP8HfQuantizer(HfQuantizer):
|
|
|
46
46
|
compute_capability = torch.cuda.get_device_capability()
|
|
47
47
|
major, minor = compute_capability
|
|
48
48
|
if (major < 8) or (major == 8 and minor < 9):
|
|
49
|
-
|
|
49
|
+
logger.warning_once(
|
|
50
50
|
"FP8 quantized models is only supported on GPUs with compute capability >= 8.9 (e.g 4090/H100)"
|
|
51
|
-
f", actual = `{major}.{minor}
|
|
51
|
+
f", actual = `{major}.{minor}`. We will default to dequantizing the model to bf16. Feel free "
|
|
52
|
+
f"to use a different quantization method like bitsandbytes or torchao"
|
|
52
53
|
)
|
|
54
|
+
self.quantization_config.dequantize = True
|
|
55
|
+
return
|
|
53
56
|
|
|
54
57
|
device_map = kwargs.get("device_map")
|
|
55
58
|
if device_map is None:
|
|
@@ -82,16 +85,22 @@ class FineGrainedFP8HfQuantizer(HfQuantizer):
|
|
|
82
85
|
return True
|
|
83
86
|
return False
|
|
84
87
|
|
|
88
|
+
def param_element_size(self, model: "PreTrainedModel", param_name: str, param: "torch.Tensor") -> float:
|
|
89
|
+
"Return the element size (in bytes) for `param_name`."
|
|
90
|
+
if self.param_needs_quantization(model, param_name):
|
|
91
|
+
# 8 bit, this is neeed as when `pre_quantized`` is False, we don't set the dtype of the FP8Linear in order to correctly load the weights
|
|
92
|
+
return 1
|
|
93
|
+
return super().param_element_size(model, param_name, param)
|
|
94
|
+
|
|
85
95
|
def _process_model_before_weight_loading(
|
|
86
96
|
self,
|
|
87
97
|
model: "PreTrainedModel",
|
|
88
|
-
keep_in_fp32_modules: list[str] | None = None,
|
|
89
98
|
**kwargs,
|
|
90
99
|
):
|
|
91
100
|
from ..integrations.finegrained_fp8 import replace_with_fp8_linear
|
|
92
101
|
|
|
93
102
|
self.modules_to_not_convert = self.get_modules_to_not_convert(
|
|
94
|
-
model, self.quantization_config.modules_to_not_convert,
|
|
103
|
+
model, self.quantization_config.modules_to_not_convert, model._keep_in_fp32_modules
|
|
95
104
|
)
|
|
96
105
|
|
|
97
106
|
model = replace_with_fp8_linear(
|
|
@@ -103,7 +112,7 @@ class FineGrainedFP8HfQuantizer(HfQuantizer):
|
|
|
103
112
|
|
|
104
113
|
# NOTE: TP is applied before quantization so this is only to add hooks.
|
|
105
114
|
# Quantization is incompatible with DTensors, so we have to anyway have
|
|
106
|
-
# gathers! But it should be model
|
|
115
|
+
# gathers! But it should be model independent -> figure out where to put
|
|
107
116
|
# the gather and that's it.
|
|
108
117
|
def update_tp_plan(self, config):
|
|
109
118
|
if "Qwen3" in config.__class__.__name__:
|
|
@@ -137,10 +146,6 @@ class FineGrainedFP8HfQuantizer(HfQuantizer):
|
|
|
137
146
|
def is_trainable(self) -> bool:
|
|
138
147
|
return False
|
|
139
148
|
|
|
140
|
-
def get_accelerator_warm_up_factor(self):
|
|
141
|
-
# Pre-processing is done cleanly, so we can allocate everything here
|
|
142
|
-
return 2
|
|
143
|
-
|
|
144
149
|
def get_quantize_ops(self):
|
|
145
150
|
from ..integrations.finegrained_fp8 import Fp8Quantize
|
|
146
151
|
|
|
@@ -78,11 +78,11 @@ class FPQuantHfQuantizer(HfQuantizer):
|
|
|
78
78
|
)
|
|
79
79
|
|
|
80
80
|
def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
|
|
81
|
-
if dtype
|
|
82
|
-
logger.
|
|
81
|
+
if dtype != torch.bfloat16:
|
|
82
|
+
logger.warning_once(
|
|
83
|
+
f"Setting dtype to {dtype}, but only bfloat16 is supported right now. Overwriting torch_dtype to bfloat16."
|
|
84
|
+
)
|
|
83
85
|
dtype = torch.bfloat16
|
|
84
|
-
elif dtype != torch.bfloat16:
|
|
85
|
-
raise ValueError(f"Invalid `dtype` {dtype}. fp_quant quantization only supports `dtype=torch.bfloat16`.")
|
|
86
86
|
return dtype
|
|
87
87
|
|
|
88
88
|
def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool:
|
|
@@ -66,10 +66,7 @@ class GptqHfQuantizer(HfQuantizer):
|
|
|
66
66
|
raise ImportError("The gptqmodel version should be >= 1.4.3, optimum version should >= 1.24.0")
|
|
67
67
|
|
|
68
68
|
def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
|
|
69
|
-
if dtype
|
|
70
|
-
dtype = torch.float16
|
|
71
|
-
logger.info("Loading the model in `torch.float16`. To overwrite it, set `dtype` manually.")
|
|
72
|
-
elif dtype != torch.float16:
|
|
69
|
+
if dtype != torch.float16:
|
|
73
70
|
logger.info("We suggest you to set `dtype=torch.float16` for better efficiency with GPTQ.")
|
|
74
71
|
return dtype
|
|
75
72
|
|
|
@@ -69,10 +69,7 @@ class HiggsHfQuantizer(HfQuantizer):
|
|
|
69
69
|
)
|
|
70
70
|
|
|
71
71
|
def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
|
|
72
|
-
if dtype
|
|
73
|
-
logger.info("`dtype` is None. Setting `dtype=torch.float16` for FLUTE compatibility.")
|
|
74
|
-
dtype = torch.float16
|
|
75
|
-
elif dtype != torch.float16 and dtype != torch.bfloat16:
|
|
72
|
+
if dtype != torch.float16 and dtype != torch.bfloat16:
|
|
76
73
|
raise ValueError(
|
|
77
74
|
f"Invalid `dtype` {dtype}. HIGGS quantization only supports `dtype=torch.float16` or `dtype=torch.bfloat16`."
|
|
78
75
|
)
|
|
@@ -116,13 +113,12 @@ class HiggsHfQuantizer(HfQuantizer):
|
|
|
116
113
|
def _process_model_before_weight_loading(
|
|
117
114
|
self,
|
|
118
115
|
model: "PreTrainedModel",
|
|
119
|
-
keep_in_fp32_modules: list[str] | None = None,
|
|
120
116
|
**kwargs,
|
|
121
117
|
):
|
|
122
118
|
from ..integrations import replace_with_higgs_linear
|
|
123
119
|
|
|
124
120
|
self.modules_to_not_convert = self.get_modules_to_not_convert(
|
|
125
|
-
model, self.quantization_config.modules_to_not_convert,
|
|
121
|
+
model, self.quantization_config.modules_to_not_convert, model._keep_in_fp32_modules
|
|
126
122
|
)
|
|
127
123
|
|
|
128
124
|
replace_with_higgs_linear(
|
|
@@ -53,7 +53,7 @@ class Mxfp4HfQuantizer(HfQuantizer):
|
|
|
53
53
|
"""Lazy import and initialize kernels only when needed"""
|
|
54
54
|
if self.triton_kernels_hub is None:
|
|
55
55
|
try:
|
|
56
|
-
from
|
|
56
|
+
from ..integrations.hub_kernels import get_kernel
|
|
57
57
|
|
|
58
58
|
self.triton_kernels_hub = get_kernel("kernels-community/triton_kernels")
|
|
59
59
|
except ImportError:
|
|
@@ -135,18 +135,6 @@ class Mxfp4HfQuantizer(HfQuantizer):
|
|
|
135
135
|
"Please use a quantized checkpoint or remove the CPU or disk device from the device_map."
|
|
136
136
|
)
|
|
137
137
|
|
|
138
|
-
def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
|
|
139
|
-
if dtype is None:
|
|
140
|
-
dtype = torch.bfloat16
|
|
141
|
-
logger.info(
|
|
142
|
-
"Overriding dtype=%s with `dtype=torch.bfloat16` due to "
|
|
143
|
-
"requirements of `fbgemm-gpu` to enable model loading in fp4. "
|
|
144
|
-
"Pass your own dtype to specify the dtype of the remaining non-linear layers or pass"
|
|
145
|
-
" dtype=torch.bfloat16 to remove this warning.",
|
|
146
|
-
dtype,
|
|
147
|
-
)
|
|
148
|
-
return dtype
|
|
149
|
-
|
|
150
138
|
def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool:
|
|
151
139
|
from ..integrations import Mxfp4GptOssExperts
|
|
152
140
|
|
|
@@ -167,7 +155,6 @@ class Mxfp4HfQuantizer(HfQuantizer):
|
|
|
167
155
|
def _process_model_before_weight_loading(
|
|
168
156
|
self,
|
|
169
157
|
model: "PreTrainedModel",
|
|
170
|
-
keep_in_fp32_modules: list[str] | None = None,
|
|
171
158
|
use_kernels: bool = False,
|
|
172
159
|
**kwargs,
|
|
173
160
|
):
|
|
@@ -182,7 +169,7 @@ class Mxfp4HfQuantizer(HfQuantizer):
|
|
|
182
169
|
self.quantization_config.dequantize = True
|
|
183
170
|
|
|
184
171
|
self.modules_to_not_convert = self.get_modules_to_not_convert(
|
|
185
|
-
model, self.quantization_config.modules_to_not_convert,
|
|
172
|
+
model, self.quantization_config.modules_to_not_convert, model._keep_in_fp32_modules
|
|
186
173
|
)
|
|
187
174
|
|
|
188
175
|
model = replace_with_mxfp4_linear(
|
|
@@ -215,19 +202,6 @@ class Mxfp4HfQuantizer(HfQuantizer):
|
|
|
215
202
|
)
|
|
216
203
|
return config
|
|
217
204
|
|
|
218
|
-
def get_param_name(self, param_name: str) -> str:
|
|
219
|
-
if self.quantization_config.dequantize:
|
|
220
|
-
if "_blocks" in param_name:
|
|
221
|
-
return param_name.replace("_blocks", "")
|
|
222
|
-
elif "_scales" in param_name:
|
|
223
|
-
return param_name.replace("_scales", "")
|
|
224
|
-
elif not self.pre_quantized:
|
|
225
|
-
if param_name.endswith("gate_up_proj"):
|
|
226
|
-
return param_name.replace("gate_up_proj", "gate_up_proj_blocks")
|
|
227
|
-
if param_name.endswith("down_proj"):
|
|
228
|
-
return param_name.replace("down_proj", "down_proj_blocks")
|
|
229
|
-
return param_name
|
|
230
|
-
|
|
231
205
|
def get_state_dict_and_metadata(self, model):
|
|
232
206
|
from ..integrations import Mxfp4GptOssExperts
|
|
233
207
|
|
|
@@ -44,6 +44,13 @@ class QuantoHfQuantizer(HfQuantizer):
|
|
|
44
44
|
|
|
45
45
|
def __init__(self, quantization_config: QuantoConfig, **kwargs):
|
|
46
46
|
super().__init__(quantization_config, **kwargs)
|
|
47
|
+
map_to_param_size = {
|
|
48
|
+
"int8": 1,
|
|
49
|
+
"float8": 1,
|
|
50
|
+
"int4": 0.5,
|
|
51
|
+
"int2": 0.25,
|
|
52
|
+
}
|
|
53
|
+
self.quantized_param_size = map_to_param_size.get(self.quantization_config.weights, None)
|
|
47
54
|
|
|
48
55
|
def validate_environment(self, *args, **kwargs):
|
|
49
56
|
if not is_optimum_quanto_available():
|
|
@@ -83,25 +90,18 @@ class QuantoHfQuantizer(HfQuantizer):
|
|
|
83
90
|
max_memory = {key: val * 0.90 for key, val in max_memory.items()}
|
|
84
91
|
return max_memory
|
|
85
92
|
|
|
86
|
-
def
|
|
87
|
-
|
|
93
|
+
def param_element_size(self, model: "PreTrainedModel", param_name: str, param: "torch.Tensor") -> float:
|
|
94
|
+
"Return the element size (in bytes) for `param_name`."
|
|
95
|
+
if self.param_needs_quantization(model, param_name) and self.quantized_param_size is not None:
|
|
96
|
+
return self.quantized_param_size
|
|
88
97
|
|
|
89
|
-
|
|
90
|
-
"int8": torch.int8,
|
|
91
|
-
"float8": CustomDtype.FP8,
|
|
92
|
-
"int4": CustomDtype.INT4,
|
|
93
|
-
"int2": CustomDtype.INT2,
|
|
94
|
-
}
|
|
95
|
-
target_dtype = mapping[self.quantization_config.weights]
|
|
96
|
-
return target_dtype
|
|
98
|
+
return super().param_element_size(model, param_name, param)
|
|
97
99
|
|
|
98
|
-
def _process_model_before_weight_loading(
|
|
99
|
-
self, model: "PreTrainedModel", keep_in_fp32_modules: list[str] | None = None, **kwargs
|
|
100
|
-
):
|
|
100
|
+
def _process_model_before_weight_loading(self, model: "PreTrainedModel", **kwargs):
|
|
101
101
|
from ..integrations import replace_with_quanto_layers
|
|
102
102
|
|
|
103
103
|
self.modules_to_not_convert = self.get_modules_to_not_convert(
|
|
104
|
-
model, self.quantization_config.modules_to_not_convert,
|
|
104
|
+
model, self.quantization_config.modules_to_not_convert, model._keep_in_fp32_modules
|
|
105
105
|
)
|
|
106
106
|
|
|
107
107
|
model = replace_with_quanto_layers(
|
|
@@ -51,24 +51,19 @@ class SpQRHfQuantizer(HfQuantizer):
|
|
|
51
51
|
raise ImportError("Using `spqr` quantization requires SpQR: `pip install spqr_quant[gpu]`")
|
|
52
52
|
|
|
53
53
|
def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
|
|
54
|
-
if dtype
|
|
55
|
-
dtype = torch.float16
|
|
56
|
-
logger.info("Assuming SpQR inference on GPU and loading the model in `torch.float16`.")
|
|
57
|
-
elif dtype != torch.float16:
|
|
54
|
+
if dtype != torch.float16:
|
|
58
55
|
raise ValueError(
|
|
59
|
-
"You cannot use any type other than torch.float16 for SpQR. Please
|
|
60
|
-
"torch.float16 explicitly."
|
|
56
|
+
"You cannot use any type other than torch.float16 for SpQR. Please set it totorch.float16 explicitly."
|
|
61
57
|
)
|
|
62
58
|
return dtype
|
|
63
59
|
|
|
64
60
|
def _process_model_before_weight_loading(
|
|
65
61
|
self,
|
|
66
62
|
model: "PreTrainedModel",
|
|
67
|
-
keep_in_fp32_modules: list[str] | None = None,
|
|
68
63
|
**kwargs,
|
|
69
64
|
):
|
|
70
65
|
self.modules_to_not_convert = self.get_modules_to_not_convert(
|
|
71
|
-
model, self.quantization_config.modules_to_not_convert,
|
|
66
|
+
model, self.quantization_config.modules_to_not_convert, model._keep_in_fp32_modules
|
|
72
67
|
)
|
|
73
68
|
replace_with_spqr_linear(
|
|
74
69
|
model,
|