transformers-5.0.0rc1-py3-none-any.whl → transformers-5.0.0rc2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +20 -1
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +68 -5
- transformers/core_model_loading.py +201 -35
- transformers/dependency_versions_table.py +1 -1
- transformers/feature_extraction_utils.py +54 -22
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +162 -122
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +101 -64
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +2 -12
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +12 -0
- transformers/integrations/accelerate.py +44 -111
- transformers/integrations/aqlm.py +3 -5
- transformers/integrations/awq.py +2 -5
- transformers/integrations/bitnet.py +5 -8
- transformers/integrations/bitsandbytes.py +16 -15
- transformers/integrations/deepspeed.py +18 -3
- transformers/integrations/eetq.py +3 -5
- transformers/integrations/fbgemm_fp8.py +1 -1
- transformers/integrations/finegrained_fp8.py +6 -16
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/higgs.py +2 -5
- transformers/integrations/hub_kernels.py +23 -5
- transformers/integrations/integration_utils.py +35 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +4 -10
- transformers/integrations/peft.py +5 -0
- transformers/integrations/quanto.py +5 -2
- transformers/integrations/spqr.py +3 -5
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/vptq.py +3 -5
- transformers/modeling_gguf_pytorch_utils.py +66 -19
- transformers/modeling_rope_utils.py +78 -81
- transformers/modeling_utils.py +583 -503
- transformers/models/__init__.py +19 -0
- transformers/models/afmoe/modeling_afmoe.py +7 -16
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/align/modeling_align.py +12 -6
- transformers/models/altclip/modeling_altclip.py +7 -3
- transformers/models/apertus/modeling_apertus.py +4 -2
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +1 -1
- transformers/models/aria/modeling_aria.py +8 -4
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +27 -0
- transformers/models/auto/feature_extraction_auto.py +7 -3
- transformers/models/auto/image_processing_auto.py +4 -2
- transformers/models/auto/modeling_auto.py +31 -0
- transformers/models/auto/processing_auto.py +4 -0
- transformers/models/auto/tokenization_auto.py +132 -153
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +18 -19
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +9 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +3 -0
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
- transformers/models/bit/modeling_bit.py +5 -1
- transformers/models/bitnet/modeling_bitnet.py +1 -1
- transformers/models/blenderbot/modeling_blenderbot.py +7 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +8 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -0
- transformers/models/bloom/modeling_bloom.py +13 -44
- transformers/models/blt/modeling_blt.py +162 -2
- transformers/models/blt/modular_blt.py +168 -3
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +6 -0
- transformers/models/bros/modeling_bros.py +8 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/canine/modeling_canine.py +6 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +9 -4
- transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +25 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clipseg/modeling_clipseg.py +4 -0
- transformers/models/clvp/modeling_clvp.py +14 -3
- transformers/models/code_llama/tokenization_code_llama.py +1 -1
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/cohere/modeling_cohere.py +1 -1
- transformers/models/cohere2/modeling_cohere2.py +1 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
- transformers/models/convbert/modeling_convbert.py +3 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +3 -1
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +14 -2
- transformers/models/cvt/modeling_cvt.py +5 -1
- transformers/models/cwm/modeling_cwm.py +1 -1
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +46 -39
- transformers/models/d_fine/modular_d_fine.py +15 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +1 -1
- transformers/models/dac/modeling_dac.py +4 -4
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +1 -1
- transformers/models/deberta/modeling_deberta.py +2 -0
- transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
- transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
- transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +8 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +12 -1
- transformers/models/dia/modular_dia.py +11 -0
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +3 -3
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
- transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/doge/modeling_doge.py +1 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +16 -12
- transformers/models/dots1/modeling_dots1.py +14 -5
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +5 -2
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +5 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +8 -2
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt_fast.py +46 -14
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +6 -1
- transformers/models/evolla/modeling_evolla.py +9 -1
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +1 -1
- transformers/models/falcon/modeling_falcon.py +3 -3
- transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
- transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
- transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +14 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +4 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
- transformers/models/florence2/modeling_florence2.py +20 -3
- transformers/models/florence2/modular_florence2.py +13 -0
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +16 -0
- transformers/models/gemma/modeling_gemma.py +10 -12
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma2/modeling_gemma2.py +1 -1
- transformers/models/gemma2/modular_gemma2.py +1 -1
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +28 -7
- transformers/models/gemma3/modular_gemma3.py +26 -6
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +47 -9
- transformers/models/gemma3n/modular_gemma3n.py +51 -9
- transformers/models/git/modeling_git.py +181 -126
- transformers/models/glm/modeling_glm.py +1 -1
- transformers/models/glm4/modeling_glm4.py +1 -1
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +15 -5
- transformers/models/glm4v/modular_glm4v.py +11 -3
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
- transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +8 -5
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
- transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
- transformers/models/gptj/modeling_gptj.py +15 -6
- transformers/models/granite/modeling_granite.py +1 -1
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +2 -3
- transformers/models/granitemoe/modular_granitemoe.py +1 -2
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
- transformers/models/groupvit/modeling_groupvit.py +6 -1
- transformers/models/helium/modeling_helium.py +1 -1
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
- transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
- transformers/models/hubert/modeling_hubert.py +4 -0
- transformers/models/hubert/modular_hubert.py +4 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +16 -0
- transformers/models/idefics/modeling_idefics.py +10 -0
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +9 -2
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +11 -8
- transformers/models/internvl/modular_internvl.py +5 -9
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +24 -19
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +15 -7
- transformers/models/janus/modular_janus.py +16 -7
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +14 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/configuration_lasr.py +4 -0
- transformers/models/lasr/modeling_lasr.py +3 -2
- transformers/models/lasr/modular_lasr.py +8 -1
- transformers/models/lasr/processing_lasr.py +0 -2
- transformers/models/layoutlm/modeling_layoutlm.py +5 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +18 -0
- transformers/models/lfm2/modeling_lfm2.py +1 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lilt/modeling_lilt.py +19 -15
- transformers/models/llama/modeling_llama.py +1 -1
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +8 -4
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
- transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
- transformers/models/longt5/modeling_longt5.py +0 -4
- transformers/models/m2m_100/modeling_m2m_100.py +10 -0
- transformers/models/mamba/modeling_mamba.py +2 -1
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +3 -0
- transformers/models/markuplm/modeling_markuplm.py +5 -8
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +9 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +9 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mimi/modeling_mimi.py +25 -4
- transformers/models/minimax/modeling_minimax.py +16 -3
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +1 -1
- transformers/models/mistral/modeling_mistral.py +1 -1
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +12 -4
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +13 -2
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +4 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
- transformers/models/modernbert/modeling_modernbert.py +12 -1
- transformers/models/modernbert/modular_modernbert.py +12 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
- transformers/models/moonshine/modeling_moonshine.py +1 -1
- transformers/models/moshi/modeling_moshi.py +21 -51
- transformers/models/mpnet/modeling_mpnet.py +2 -0
- transformers/models/mra/modeling_mra.py +4 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +0 -10
- transformers/models/musicgen/modeling_musicgen.py +5 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +1 -1
- transformers/models/nemotron/modeling_nemotron.py +3 -3
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +11 -16
- transformers/models/nystromformer/modeling_nystromformer.py +7 -0
- transformers/models/olmo/modeling_olmo.py +1 -1
- transformers/models/olmo2/modeling_olmo2.py +1 -1
- transformers/models/olmo3/modeling_olmo3.py +1 -1
- transformers/models/olmoe/modeling_olmoe.py +12 -4
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +7 -38
- transformers/models/openai/modeling_openai.py +12 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +7 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +7 -3
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/modeling_parakeet.py +5 -0
- transformers/models/parakeet/modular_parakeet.py +5 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
- transformers/models/patchtst/modeling_patchtst.py +5 -4
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/models/pe_audio/processing_pe_audio.py +24 -0
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +3 -0
- transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +5 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +1 -1
- transformers/models/phi/modeling_phi.py +1 -1
- transformers/models/phi3/modeling_phi3.py +1 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +12 -4
- transformers/models/phimoe/modular_phimoe.py +1 -1
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +1 -1
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +7 -0
- transformers/models/plbart/modular_plbart.py +6 -0
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +11 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prophetnet/modeling_prophetnet.py +2 -1
- transformers/models/qwen2/modeling_qwen2.py +1 -1
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
- transformers/models/qwen3/modeling_qwen3.py +1 -1
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
- transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +7 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
- transformers/models/reformer/modeling_reformer.py +9 -1
- transformers/models/regnet/modeling_regnet.py +4 -0
- transformers/models/rembert/modeling_rembert.py +7 -1
- transformers/models/resnet/modeling_resnet.py +8 -3
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +4 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +1 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +5 -1
- transformers/models/sam2/modular_sam2.py +5 -1
- transformers/models/sam2_video/modeling_sam2_video.py +51 -43
- transformers/models/sam2_video/modular_sam2_video.py +31 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +23 -0
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +3 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
- transformers/models/seed_oss/modeling_seed_oss.py +1 -1
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +2 -2
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +63 -41
- transformers/models/smollm3/modeling_smollm3.py +1 -1
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
- transformers/models/speecht5/modeling_speecht5.py +28 -0
- transformers/models/splinter/modeling_splinter.py +9 -3
- transformers/models/squeezebert/modeling_squeezebert.py +2 -0
- transformers/models/stablelm/modeling_stablelm.py +1 -1
- transformers/models/starcoder2/modeling_starcoder2.py +1 -1
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/swiftformer/modeling_swiftformer.py +4 -0
- transformers/models/swin/modeling_swin.py +16 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +49 -33
- transformers/models/swinv2/modeling_swinv2.py +41 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +1 -7
- transformers/models/t5gemma/modeling_t5gemma.py +1 -1
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +1 -1
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/timesfm/modeling_timesfm.py +12 -0
- transformers/models/timesfm/modular_timesfm.py +12 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
- transformers/models/trocr/modeling_trocr.py +1 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +4 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +3 -7
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +0 -6
- transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +7 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/visual_bert/modeling_visual_bert.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +4 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +5 -3
- transformers/models/x_clip/modeling_x_clip.py +2 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +10 -0
- transformers/models/xlm/modeling_xlm.py +13 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +4 -1
- transformers/models/zamba/modeling_zamba.py +2 -1
- transformers/models/zamba2/modeling_zamba2.py +3 -2
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +7 -0
- transformers/pipelines/__init__.py +9 -6
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/document_question_answering.py +1 -1
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +127 -56
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +9 -64
- transformers/quantizers/quantizer_aqlm.py +1 -18
- transformers/quantizers/quantizer_auto_round.py +1 -10
- transformers/quantizers/quantizer_awq.py +3 -8
- transformers/quantizers/quantizer_bitnet.py +1 -6
- transformers/quantizers/quantizer_bnb_4bit.py +9 -49
- transformers/quantizers/quantizer_bnb_8bit.py +9 -19
- transformers/quantizers/quantizer_compressed_tensors.py +1 -4
- transformers/quantizers/quantizer_eetq.py +2 -12
- transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
- transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
- transformers/quantizers/quantizer_fp_quant.py +4 -4
- transformers/quantizers/quantizer_gptq.py +1 -4
- transformers/quantizers/quantizer_higgs.py +2 -6
- transformers/quantizers/quantizer_mxfp4.py +2 -28
- transformers/quantizers/quantizer_quanto.py +14 -14
- transformers/quantizers/quantizer_spqr.py +3 -8
- transformers/quantizers/quantizer_torchao.py +28 -124
- transformers/quantizers/quantizer_vptq.py +1 -10
- transformers/testing_utils.py +28 -12
- transformers/tokenization_mistral_common.py +3 -2
- transformers/tokenization_utils_base.py +3 -2
- transformers/tokenization_utils_tokenizers.py +25 -2
- transformers/trainer.py +24 -2
- transformers/trainer_callback.py +8 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/training_args.py +8 -10
- transformers/utils/__init__.py +4 -0
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +34 -25
- transformers/utils/generic.py +20 -0
- transformers/utils/import_utils.py +51 -9
- transformers/utils/kernel_config.py +71 -18
- transformers/utils/quantization_config.py +8 -8
- transformers/video_processing_utils.py +16 -12
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/quantizers/quantizer_torchao.py
CHANGED

@@ -18,7 +18,7 @@ from typing import TYPE_CHECKING
 from packaging import version

 from .base import HfQuantizer
-from .quantizers_utils import get_module_from_name
+from .quantizers_utils import get_module_from_name, should_convert_module


 if TYPE_CHECKING:
@@ -94,19 +94,19 @@ class TorchAoHfQuantizer(HfQuantizer):
     def __init__(self, quantization_config, **kwargs):
         super().__init__(quantization_config, **kwargs)

-        … (9 removed lines not shown)
+        self.quantized_param_size = None
+        quant_type = self.quantization_config.quant_type
+        if isinstance(quant_type, str):
+            map_to_param_size = {
+                "int4_weight_only": 0.5,
+                "int8_weight_only": 1,
+                "int8_dynamic_activation_int8_weight": 1,
+            }
+            if quant_type in map_to_param_size:
+                self.quantized_param_size = map_to_param_size[quant_type]
         else:
-            … (2 removed lines not shown)
-            self.full_ao_keys = self.weight_ao_keys + ["_data"]
+            size_digit = fuzzy_match_size(quant_type.__class__.__name__)
+            self.quantized_param_size = 0.5 if size_digit == "4" else 1

     def validate_environment(self, *args, **kwargs):
         if not is_torchao_available():
@@ -134,22 +134,11 @@ class TorchAoHfQuantizer(HfQuantizer):

     def update_dtype(self, dtype):
         if self.quantization_config.quant_type == "int4_weight_only":
-            if dtype …
+            if dtype != torch.bfloat16:
                 logger.warning_once(
-                    f"Setting dtype to {dtype} for int4_weight_only quantization, but only bfloat16 is supported right now. …
-                )
-            if dtype is None:
-                logger.warning_once(
-                    "Setting dtype to torch.bfloat16 for int4_weight_only quantization since only bfloat16 is supported right now. Please set dtype=torch.bfloat16 to remove this warning."
+                    f"Setting dtype to {dtype} for int4_weight_only quantization, but only bfloat16 is supported right now. Overwriting torch_dtype to bfloat16."
                 )
             dtype = torch.bfloat16
-        if self.quantization_config.quant_type == "int8_dynamic_activation_int8_weight":
-            if dtype is None:
-                logger.info(
-                    "Setting dtype to torch.float32 for int8_dynamic_activation_int8_weight quantization as no dtype was specified in from_pretrained"
-                )
-                # we need to set the dtype, otherwise we have dtype mismatch when performing the quantized linear op
-                dtype = torch.float32
         return dtype

     def get_state_dict_and_metadata(self, model):
@@ -157,57 +146,27 @@ class TorchAoHfQuantizer(HfQuantizer):
         We flatten the state dict of tensor subclasses so that it is compatible with the safetensors format.
         """
         if TORCHAO_VERSION >= version.parse("0.15.0"):
-            return flatten_tensor_state_dict(model.state_dict())
+            return flatten_tensor_state_dict(model.state_dict())
         else:
             raise RuntimeError(
                 f"In order to use safetensors with torchao, please use torchao version >= 0.15.0. Current version: {TORCHAO_VERSION}"
             )

-    def …
-        … (6 removed lines not shown)
-            quant_type = self.quantization_config.quant_type
-            if isinstance(quant_type, AOBaseConfig):
-                # Extract size digit using fuzzy match on the class name
-                config_name = quant_type.__class__.__name__
-                size_digit = fuzzy_match_size(config_name)
-
-                # Map the extracted digit to appropriate dtype
-                if size_digit == "4":
-                    return CustomDtype.INT4
-                else:
-                    # Default to int8
-                    return torch.int8
-
-            # Original mapping for non-AOBaseConfig types
-            map_to_target_dtype = {
-                "int4_weight_only": CustomDtype.INT4,
-                "int8_weight_only": torch.int8,
-                "int8_dynamic_activation_int8_weight": torch.int8,
-                "autoquant": None,
-            }
-            return map_to_target_dtype[self.quantization_config.quant_type]
-        else:
-            raise ValueError(
-                "You are using `device_map='auto'` on a torchao quantized model. To automatically compute"
-                " the appropriate device map, you should upgrade your `accelerate` library with "
-                "`pip install --upgrade accelerate`"
-            )
+    def param_element_size(self, model: "PreTrainedModel", param_name: str, param: "torch.Tensor") -> float:
+        "Return the element size (in bytes) for `param_name`."
+        if self.param_needs_quantization(model, param_name) and self.quantized_param_size is not None:
+            return self.quantized_param_size
+
+        return super().param_element_size(model, param_name, param)

     def adjust_max_memory(self, max_memory: dict[str, int | str]) -> dict[str, int | str]:
         # need more space for the quantization parameters (e.g. scale). Tested with int4 wo and group size = 128
         max_memory = {key: val * 0.9 for key, val in max_memory.items()}
         return max_memory

-    def _process_model_before_weight_loading(
-        self, model: "PreTrainedModel", keep_in_fp32_modules: list[str] | None = None, **kwargs
-    ):
+    def _process_model_before_weight_loading(self, model: "PreTrainedModel", checkpoint_files=None, **kwargs):
         self.modules_to_not_convert = self.get_modules_to_not_convert(
-            model, self.quantization_config.modules_to_not_convert, …
+            model, self.quantization_config.modules_to_not_convert, model._keep_in_fp32_modules
         )
         if self.quantization_config.include_input_output_embeddings:
             input_emb = model.get_input_embeddings()
@@ -217,16 +176,16 @@ class TorchAoHfQuantizer(HfQuantizer):
             self.modules_to_not_convert = [
                 x for x in self.modules_to_not_convert if x not in input_emb_names + output_emb_names
             ]
-        … (1 removed line not shown)
+        if checkpoint_files is not None:
+            # Torchao needs access to all metadata later
+            self.set_metadata(checkpoint_files)

     def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool:
-        if self.pre_quantized:
-            return False
         if self.quantization_config.quant_type == "autoquant":
             return False

         # check if the param_name is not in self.modules_to_not_convert
-        if …
+        if not should_convert_module(param_name, self.modules_to_not_convert):
             return False

         # we only quantize the weight of nn.Linear and nn.Embedding
@@ -253,22 +212,6 @@ class TorchAoHfQuantizer(HfQuantizer):

         return isinstance(module, tuple(_QUANTIZABLE)) and tensor_name == "weight"

-    def preprocess_model(self, model: "PreTrainedModel", config, dtype=None, checkpoint_files=None, **kwargs):
-        """
-        Setting model attributes and/or converting model before weights loading. At this point
-        the model should be initialized on the meta device so you can freely manipulate the skeleton
-        of the model in order to replace modules in-place. Make sure to override the abstract method `_process_model_before_weight_loading`.
-
-        Args:
-            model (`~transformers.PreTrainedModel`):
-                The model to quantize
-            kwargs (`dict`, *optional*):
-                The keyword arguments that are passed along `_process_model_before_weight_loading`.
-        """
-        super().preprocess_model(model, config, dtype, checkpoint_files, **kwargs)
-        # Torchao needs access to all metadata later
-        self.set_metadata(checkpoint_files)
-
     def _process_model_after_weight_loading(self, model, **kwargs):
         """No process required for torchao quantized model"""
         if self.quantization_config.quant_type == "autoquant":
@@ -294,45 +237,6 @@ class TorchAoHfQuantizer(HfQuantizer):
         )
         return _is_torchao_serializable

-    def get_accelerator_warm_up_factor(self):
-        """
-        This factor is used in caching_allocator_warmup to determine how many bytes to pre-allocate for accelerator warmup.
-        - A factor of 2 means we pre-allocate the full memory footprint of the model.
-        - A factor of 4 means we pre-allocate half of that, and so on
-
-        However, when using TorchAO, calculating memory usage with param.numel() * param.element_size() doesn't give the correct size for quantized weights (like int4 or int8)
-        That's because TorchAO internally represents quantized tensors using subtensors and metadata, and the reported element_size() still corresponds to the dtype
-        not the actual bit-width of the quantized data.
-
-        To correct for this:
-        - Use a division factor of 8 for int4 weights
-        - Use a division factor of 4 for int8 weights
-        """
-        if self.quantization_config._get_ao_version() > version.Version("0.9.0"):
-            from torchao.core.config import AOBaseConfig
-
-            quant_type = self.quantization_config.quant_type
-            # For autoquant case, it will be treated in the string implementation below in map_to_target_dtype
-            if isinstance(quant_type, AOBaseConfig):
-                # Extract size digit using fuzzy match on the class name
-                config_name = quant_type.__class__.__name__
-                size_digit = fuzzy_match_size(config_name)
-
-                if size_digit == "4":
-                    return 8
-                else:
-                    return 4
-
-        # Original mapping for non-AOBaseConfig types
-        map_to_target_dtype = {
-            "int4_weight_only": 8,
-            "int8_weight_only": 4,
-            "int8_dynamic_activation_int8_weight": 4,
-            "autoquant": 4,
-        }
-
-        return map_to_target_dtype[self.quantization_config.quant_type]
-
     @property
     def is_trainable(self) -> bool:
         supported_quant_types_for_training = [
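The removed warm-up-factor and target-dtype heuristics are replaced by the single `quantized_param_size` / `param_element_size` path above (0.5 bytes per element for int4 quant types, 1 for int8). As a rough, hedged illustration of the code path this feeds, loading a torchao-quantized model looks roughly like the sketch below; the checkpoint id and kwargs are placeholders, and torchao plus a CUDA-capable device are assumed.

# Illustrative sketch only; not taken from this diff.
import torch
from transformers import AutoModelForCausalLM, TorchAoConfig

# A string quant type maps to 0.5 (int4) or 1 (int8) byte per element in the
# new quantized_param_size bookkeeping shown above.
quantization_config = TorchAoConfig("int8_weight_only")

model = AutoModelForCausalLM.from_pretrained(
    "your-org/your-model",     # placeholder checkpoint
    dtype=torch.bfloat16,      # int4_weight_only would force bfloat16 anyway (see update_dtype above)
    device_map="auto",
    quantization_config=quantization_config,
)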
transformers/quantizers/quantizer_vptq.py
CHANGED

@@ -49,24 +49,15 @@ class VptqHfQuantizer(HfQuantizer):
         if not torch.cuda.is_available():
             raise RuntimeError("GPU is required to run VTPQ quantized model.")

-    def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
-        if dtype is None:
-            dtype = torch.float16
-            logger.info(
-                "Assuming VPTQ inference on GPU and loading the model in `torch.float16`. To overwrite it, set `dtype` manually."
-            )
-        return dtype
-
     def _process_model_before_weight_loading(
         self,
         model: "PreTrainedModel",
-        keep_in_fp32_modules: list[str] | None = None,
         **kwargs,
     ):
         from ..integrations import replace_with_vptq_linear

         self.modules_to_not_convert = self.get_modules_to_not_convert(
-            model, self.quantization_config.modules_to_not_convert, …
+            model, self.quantization_config.modules_to_not_convert, model._keep_in_fp32_modules
         )
         replace_with_vptq_linear(
             model,
transformers/testing_utils.py
CHANGED
@@ -118,6 +118,7 @@ from .utils import (
     is_mistral_common_available,
     is_natten_available,
     is_nltk_available,
+    is_numba_available,
     is_onnx_available,
     is_openai_available,
     is_optimum_available,
@@ -130,6 +131,7 @@ from .utils import (
     is_pyctcdecode_available,
     is_pytesseract_available,
     is_pytest_available,
+    is_pytest_order_available,
     is_pytorch_quantization_available,
     is_quark_available,
     is_qutlass_available,
@@ -221,7 +223,7 @@ if is_torch_available():
     import torch
     from safetensors.torch import load_file

-    from .modeling_utils import PreTrainedModel
+    from .modeling_utils import FLASH_ATTN_KERNEL_FALLBACK, PreTrainedModel

     IS_ROCM_SYSTEM = torch.version.hip is not None
     IS_CUDA_SYSTEM = torch.version.cuda is not None
@@ -620,7 +622,7 @@ def require_flash_attn(test_case):
     try:
         from kernels import get_kernel

-        get_kernel("…
+        get_kernel(FLASH_ATTN_KERNEL_FALLBACK["flash_attention_2"])
     except Exception as _:
         kernels_available = False

@@ -1091,17 +1093,20 @@ def require_torch_large_gpu(test_case, memory: float = 20):
     )(test_case)


-def require_torch_large_accelerator(test_case, memory: float = 20):
+def require_torch_large_accelerator(test_case=None, *, memory: float = 20):
     """Decorator marking a test that requires an accelerator with more than `memory` GiB of memory."""
-    if torch_device != "cuda" and torch_device != "xpu":
-        return unittest.skip(reason=f"test requires a GPU or XPU with more than {memory} GiB of memory")(test_case)

-    … (1 removed line not shown)
+    def memory_decorator(tc):
+        if torch_device not in ("cuda", "xpu"):
+            return unittest.skip(f"test requires a GPU or XPU with more than {memory} GiB of memory")(tc)

-    … (4 removed lines not shown)
+        torch_accel = getattr(torch, torch_device)
+        return unittest.skipUnless(
+            torch_accel.get_device_properties(0).total_memory / 1024**3 > memory,
+            f"test requires a GPU or XPU with more than {memory} GiB of memory",
+        )(tc)
+
+    return memory_decorator if test_case is None else memory_decorator(test_case)


 def require_torch_accelerator(test_case):
@@ -1381,6 +1386,13 @@ def require_pyctcdecode(test_case):
     return unittest.skipUnless(is_pyctcdecode_available(), "test requires pyctcdecode")(test_case)


+def require_numba(test_case):
+    """
+    Decorator marking a test that requires numba
+    """
+    return unittest.skipUnless(is_numba_available(), "test requires numba")(test_case)
+
+
 def require_librosa(test_case):
     """
     Decorator marking a test that requires librosa
@@ -2659,9 +2671,13 @@ def run_first(test_case):
     single process at a time. So we make sure all tests that run in a subprocess are launched first, to avoid device
     allocation conflicts.
     """
-    … (1 removed line not shown)
+    # Without this check, we get unwanted warnings when it's not installed
+    if is_pytest_order_available():
+        import pytest

-    … (1 removed line not shown)
+        return pytest.mark.order(1)(test_case)
+    else:
+        return test_case


 def run_test_in_subprocess(test_case, target_func, inputs=None, timeout=None):
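The reworked `require_torch_large_accelerator` now supports both the bare and the parametrized form (its `memory` argument became keyword-only), and `require_numba` is new. A minimal sketch of how a test module might use them; the test names and bodies are placeholders.

# Sketch only: test bodies are placeholders.
from transformers.testing_utils import require_numba, require_torch_large_accelerator


@require_torch_large_accelerator  # bare form, default 20 GiB threshold
def test_fits_on_default_accelerator():
    ...


@require_torch_large_accelerator(memory=40)  # parametrized form, keyword-only memory
def test_needs_40_gib():
    ...


@require_numba
def test_uses_numba():
    ...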
@@ -1114,7 +1114,7 @@ class MistralCommonBackend(PushToHubMixin):
         max_length = self.model_max_length

         # Test if we have a padding token
-        if padding_strategy != PaddingStrategy.DO_NOT_PAD and (self.
+        if padding_strategy != PaddingStrategy.DO_NOT_PAD and (self.pad_token_id is None or self.pad_token_id < 0):
             raise ValueError(
                 "Asking to pad but the tokenizer does not have a padding token. "
                 "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
@@ -1851,8 +1851,9 @@ class MistralCommonBackend(PushToHubMixin):
             raise ValueError("`init_inputs` are not supported by `MistralCommonBackend.from_pretrained`.")

         # Handle kwargs and AutoTokenizer/AutoProcessor case
+        # These kwargs are passed by AutoTokenizer/AutoProcessor but are not used by MistralCommonBackend
         if kwargs and not set(kwargs.keys()).issubset(
-            {"trust_remote_code", "_from_pipeline", "_commit_hash", "dtype", "_from_auto"}
+            {"trust_remote_code", "_from_pipeline", "_commit_hash", "dtype", "_from_auto", "subfolder"}
         ):
             raise ValueError(f"Some kwargs in {kwargs} are not supported by `MistralCommonBackend.from_pretrained`.")

@@ -972,7 +972,7 @@ class PreTrainedTokenizerBase(PushToHubMixin):

     # first name has to correspond to main model input name
     # to make sure `tokenizer.pad(...)` works correctly
-    model_input_names: list[str] = ["input_ids", "
+    model_input_names: list[str] = ["input_ids", "attention_mask"]
     padding_side: str = "right"
     truncation_side: str = "right"
     slow_tokenizer_class = None
@@ -2152,9 +2152,10 @@ class PreTrainedTokenizerBase(PushToHubMixin):
         # Add tokenizer class to the tokenizer config to be able to reload it with from_pretrained
         tokenizer_class = self.__class__.__name__

-        # tokenizers backend don't need to save added_tokens_decoder
+        # tokenizers backend don't need to save added_tokens_decoder and additional_special_tokens
         if any(base.__name__ == "TokenizersBackend" for base in self.__class__.__mro__):
             tokenizer_config.pop("added_tokens_decoder", None)
+            tokenizer_config.pop("additional_special_tokens", None)

         # Remove the Fast at the end if we can save the slow tokenizer
         if tokenizer_class.endswith("Fast") and getattr(self, "can_save_slow_tokenizer", False):
@@ -30,6 +30,7 @@ from tokenizers import AddedToken, processors
 from tokenizers import Encoding as EncodingFast
 from tokenizers import Tokenizer as TokenizerFast
 from tokenizers.decoders import Decoder as DecoderFast
+from tokenizers.models import BPE, Unigram
 from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer

 from .integrations.ggml import convert_gguf_tokenizer
@@ -121,7 +122,8 @@ class TokenizersBackend(PreTrainedTokenizerBase):
         if isinstance(vocab, list):
             vocab = list(map(tuple, vocab)) # TODO just for now
         elif cls.model.__name__ == "Unigram":
-            vocab
+            if vocab and isinstance(vocab[0], (list, tuple)):
+                vocab = [tuple(item) for item in vocab]
         elif cls.model.__name__ == "WordLevel":
             vocab = {token: i for i, token in enumerate(vocab)}
         elif cls.model.__name__ == "BPE" or cls.model.__name__ == "WordPiece":
@@ -182,6 +184,7 @@ class TokenizersBackend(PreTrainedTokenizerBase):
             local_kwargs["vocab"], local_kwargs["merges"] = TikTokenConverter(
                 vocab_file=vocab_file, extra_special_tokens=local_kwargs.get("extra_special_tokens")
             ).extract_vocab_merges_from_model(vocab_file)
+
             return local_kwargs

         # Fallback to standard vocab/merges files if they existed!
@@ -236,6 +239,9 @@ class TokenizersBackend(PreTrainedTokenizerBase):
         add_prefix_space = kwargs.get("add_prefix_space", False)
         vocab_file = kwargs.get("vocab_file")

+        vocab = kwargs.get("vocab")
+        merges = kwargs.get("merges")
+
         fast_tokenizer = None
         if tokenizer_object is not None:
             fast_tokenizer = copy.deepcopy(tokenizer_object)
@@ -252,6 +258,15 @@ class TokenizersBackend(PreTrainedTokenizerBase):
             kwargs.update(tokenizer_config)
             if len(additional_kwargs) > 0:
                 kwargs.update(additional_kwargs)
+        elif self._tokenizer is None and vocab is not None:
+            # Build from vocab/merges extracted by convert_to_native_format
+            if merges is not None:
+                vocab_dict = vocab if isinstance(vocab, dict) else {w: i for i, (w, _) in enumerate(vocab)}
+                fast_tokenizer = TokenizerFast(BPE(vocab=vocab_dict, merges=merges, fuse_unk=True, dropout=None))
+            elif isinstance(vocab, dict):
+                fast_tokenizer = TokenizerFast(BPE(vocab=vocab, merges=[], fuse_unk=True, dropout=None))
+            elif isinstance(vocab, list) and vocab and isinstance(vocab[0], (tuple, list)):
+                fast_tokenizer = TokenizerFast(Unigram(vocab=vocab, unk_id=kwargs.get("unk_id", 0)))
         elif self._tokenizer is None:
             raise ValueError(
                 "Couldn't instantiate the backend tokenizer from one of: \n"
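The new `elif` branch builds the backend straight from raw `vocab`/`merges` kwargs. A standalone sketch of the same construction using the `tokenizers` library directly, with toy data (the vocab and merges below are made up):

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE, Unigram

# Toy BPE vocab/merges, analogous to what the fallback above receives.
bpe_vocab = {"l": 0, "o": 1, "w": 2, "lo": 3, "low": 4}
bpe_merges = [("l", "o"), ("lo", "w")]
bpe_tokenizer = Tokenizer(BPE(vocab=bpe_vocab, merges=bpe_merges, fuse_unk=True, dropout=None))

# Unigram takes (token, score) pairs plus the index of the unknown token.
unigram_vocab = [("<unk>", 0.0), ("low", -1.0), ("lower", -2.0)]
unigram_tokenizer = Tokenizer(Unigram(vocab=unigram_vocab, unk_id=0))
```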
@@ -260,6 +275,11 @@ class TokenizersBackend(PreTrainedTokenizerBase):
                 "(3) an equivalent slow tokenizer class to instantiate and convert. \n"
                 "You need to have sentencepiece or tiktoken installed to convert a slow tokenizer to a fast one."
             )
+        # Only set defaults when creating TokenizersBackend from scratch
+        if fast_tokenizer_file is None and tokenizer_object is None and self._tokenizer is None:
+            kwargs.setdefault("bos_token", "<s>")
+            kwargs.setdefault("eos_token", "</s>")
+
         if fast_tokenizer is not None:
             self._tokenizer = fast_tokenizer

@@ -289,6 +309,7 @@ class TokenizersBackend(PreTrainedTokenizerBase):
         # Set backend to "tokenizers" if not already set
         if "backend" not in kwargs:
             kwargs["backend"] = "tokenizers"
+
         explicit_bos_eos_in_kwargs = "add_bos_token" in kwargs or "add_eos_token" in kwargs
         self._add_bos_token = kwargs.get("add_bos_token", False)
         self._add_eos_token = kwargs.get("add_eos_token", False)
@@ -339,7 +360,7 @@ class TokenizersBackend(PreTrainedTokenizerBase):
                 tokens.append(token)
             if tokens:
                 # These tokens are from the special tokens map
-                self.add_tokens(tokens
+                self.add_tokens(tokens)

         try:
             vocab_size = self._tokenizer.get_vocab_size()
@@ -900,6 +921,8 @@ class TokenizersBackend(PreTrainedTokenizerBase):

         if isinstance(token_ids, int):
             token_ids = [token_ids]
+        if isinstance(token_ids, dict):
+            token_ids = token_ids["input_ids"]
         return self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

     def _save_pretrained(
transformers/trainer.py
CHANGED
@@ -1671,6 +1671,12 @@ class Trainer:
                 optimizer_cls = AdamW8bit
             else:
                 raise ValueError("Invalid optimizer")
+            optimizer_kwargs.update(
+                {
+                    "block_size": optim_args.get("block_size", 256),
+                    "bf16_stochastic_round": strtobool(optim_args.get("bf16_stochastic_round", "False")),
+                }
+            )
             optimizer_kwargs.update(adam_kwargs)
         elif args.optim in [
             OptimizerNames.SCHEDULE_FREE_RADAM,
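The two extra kwargs are read from the free-form `optim_args` string on `TrainingArguments`. A hedged sketch of how a user would exercise them, assuming `adamw_torch_8bit` is the optimizer name handled by the `AdamW8bit` branch above:

```python
from transformers import TrainingArguments

# Sketch only: the optim value is an assumption about which OptimizerNames entry
# maps to AdamW8bit; block_size / bf16_stochastic_round are parsed from optim_args.
args = TrainingArguments(
    output_dir="out",
    optim="adamw_torch_8bit",
    optim_args="block_size=128,bf16_stochastic_round=True",
)
```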
@@ -2349,7 +2355,8 @@
         if self.is_fsdp_enabled:
             self.model = self.model_wrapped = model
             # Fix `got mixed torch.Tensor and DTensor` error in model.generate() for FSDP2 with LoRA
-
+            if hasattr(self.model, "generate"):
+                dist.fsdp.register_fsdp_forward_method(self.model, "generate")

         # for the rest of this function `model` is the outside model, whether it was wrapped or not
         if model is not self.model:
@@ -3943,6 +3950,9 @@
             # Both standard transformer models and Liger-patched models handle shift_labels correctly,
             # so we can directly use the computed loss from the model output.
             # See: https://huggingface.co/docs/accelerate/en/concept_guides/sequence_parallelism
+            if "labels" not in inputs and "shift_labels" in inputs:
+                # DeepSpeed SP Dataloader removes "labels" but we need it, otherwise, we won't compute the loss.
+                inputs["labels"] = inputs["shift_labels"]
         outputs = model(**inputs)
         loss = outputs.loss

@@ -4018,7 +4028,16 @@
             self._save(output_dir, state_dict=state_dict)
         elif self.is_deepspeed_enabled:
             try:
-
+                accept_exclude_frozen_parameters = "exclude_frozen_parameters" in set(
+                    inspect.signature(self.model_wrapped.save_checkpoint).parameters.keys()
+                )
+                zero3_sharding = self.deepspeed.config.get("zero_optimization", {}).get("stage", None) == 3
+                if accept_exclude_frozen_parameters and _is_peft_model(self.model) and zero3_sharding:
+                    # When using PEFT with DeepSpeed ZeRO Stage 3,
+                    # we do not need to load the frozen parameters
+                    state_dict = self.deepspeed._zero3_consolidated_16bit_state_dict(exclude_frozen_parameters=True)
+                else:
+                    state_dict = self.accelerator.get_state_dict(self.deepspeed)
                 if self.args.should_save:
                     self._save(output_dir, state_dict=state_dict)
             except ValueError:
@@ -4824,6 +4843,7 @@
         if not self.args.hub_always_push and self.push_in_progress is not None and not self.push_in_progress.is_done():
             return

+        self.callback_handler.on_push_begin(self.args, self.state, self.control)
         output_dir = self.args.output_dir
         # To avoid a new synchronization of all model weights, we just copy the file from the checkpoint folder
         modeling_files = [CONFIG_NAME, GENERATION_CONFIG_NAME, WEIGHTS_NAME, SAFE_WEIGHTS_NAME]
@@ -4918,6 +4938,8 @@
         The URL of the repository where the model was pushed if `blocking=False`, or a `Future` object tracking the
         progress of the commit if `blocking=True`.
         """
+        self.callback_handler.on_push_begin(self.args, self.state, self.control)
+
         model_name = kwargs.pop("model_name", None)
         if model_name is None and self.args.should_save:
             if self.args.hub_model_id is None:
transformers/trainer_callback.py
CHANGED
@@ -420,6 +420,11 @@ class TrainerCallback:
         Event called after a prediction step.
         """

+    def on_push_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        """
+        Event called before pushing the model to the hub, at the beginning of Trainer.push_to_hub and Trainer._push_from_checkpoint.
+        """
+

 class CallbackHandler(TrainerCallback):
     """Internal class that just calls the list of callbacks in order."""
@@ -532,6 +537,9 @@ class CallbackHandler(TrainerCallback):
     def on_prediction_step(self, args: TrainingArguments, state: TrainerState, control: TrainerControl):
         return self.call_event("on_prediction_step", args, state, control)

+    def on_push_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        return self.call_event("on_push_begin", args, state, control, **kwargs)
+
     def call_event(self, event, args, state, control, **kwargs):
         for callback in self.callbacks:
             result = getattr(callback, event)(
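Together with the `Trainer` call sites above, callbacks can now react when a Hub push is about to start. A minimal custom callback (the class name and log message are illustrative):

```python
from transformers import TrainerCallback


class AnnouncePushCallback(TrainerCallback):
    """Illustrative callback hooking the new on_push_begin event."""

    def on_push_begin(self, args, state, control, **kwargs):
        # Runs at the start of Trainer.push_to_hub and Trainer._push_from_checkpoint
        print(f"Starting Hub push at global step {state.global_step}")


# trainer.add_callback(AnnouncePushCallback())  # attach to an existing Trainer instance
```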
transformers/trainer_seq2seq.py
CHANGED
@@ -333,7 +333,11 @@ class Seq2SeqTrainer(Trainer):
             self.model.generation_config._from_model_config = False

         # Retrieves GenerationConfig from model.generation_config
+        # Update with defaults because earlier the generation config used ot be init
+        # with default values. Now we init it with `None` and keep defaults for BC
         gen_config = self.model.generation_config
+        default_gen_config = gen_config._get_default_generation_params()
+        gen_config.update(**default_gen_config, defaults_only=True)
         # in case the batch is shorter than max length, the output should be padded
         if generated_tokens.shape[-1] < gen_config.max_length:
             generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_config.max_length)
transformers/training_args.py
CHANGED
@@ -1530,16 +1530,14 @@ class TrainingArguments:
             self.greater_is_better = not self.metric_for_best_model.endswith("loss")
         if is_torch_available():
             if self.bf16 or self.bf16_full_eval:
-                if
-
-
-
-                if
-                    error_message
-
-
-                    # gpu
-                    raise ValueError(error_message)
+                if (
+                    not self.use_cpu and not is_torch_bf16_gpu_available() and not is_torch_xla_available()
+                ): # added for tpu support
+                    error_message = "Your setup doesn't support bf16/gpu. You need to assign use_cpu if you want to train the model on CPU"
+                    if is_torch_cuda_available():
+                        error_message += " You need Ampere+ GPU with cuda>=11.0"
+                    # gpu
+                    raise ValueError(error_message)

             if self.fp16 and self.bf16:
                 raise ValueError("At most one of fp16 and bf16 can be True, but not both")
transformers/utils/__init__.py
CHANGED
@@ -49,6 +49,7 @@ from .generic import (
     PaddingStrategy,
     TensorType,
     TransformersKwargs,
+    _is_tensor_or_array_like,
     can_return_loss,
     can_return_tuple,
     expand_dims,
@@ -144,6 +145,7 @@ from .import_utils import (
     is_gguf_available,
     is_gptqmodel_available,
     is_grokadamw_available,
+    is_grouped_mm_available,
     is_habana_gaudi1,
     is_hadamard_available,
     is_hqq_available,
@@ -168,6 +170,7 @@ from .import_utils import (
     is_ninja_available,
     is_nltk_available,
     is_num2words_available,
+    is_numba_available,
     is_onnx_available,
     is_openai_available,
     is_optimum_available,
@@ -182,6 +185,7 @@ from .import_utils import (
     is_pyctcdecode_available,
     is_pytesseract_available,
     is_pytest_available,
+    is_pytest_order_available,
     is_pytorch_quantization_available,
     is_quanto_greater,
     is_quark_available,
@@ -21,7 +21,7 @@ from ..models.auto.auto_factory import _get_model_class
 from ..models.auto.configuration_auto import AutoConfig
 from ..models.auto.modeling_auto import MODEL_FOR_PRETRAINING_MAPPING, MODEL_MAPPING
 from ..models.auto.processing_auto import PROCESSOR_MAPPING_NAMES, AutoProcessor
-from ..models.auto.tokenization_auto import
+from ..models.auto.tokenization_auto import AutoTokenizer
 from .import_utils import is_torch_available


@@ -199,12 +199,12 @@ class AttentionMaskVisualizer:
             if "token_type_ids" in inputs: # TODO inspect signature of update causal mask
                 kwargs["token_type_ids"] = inputs["token_type_ids"]
             tokens = processor.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
-
+        else:
             tokenizer = AutoTokenizer.from_pretrained(self.repo_id)
             tokens = tokenizer.tokenize(input_sentence)
             attention_mask = tokenizer(input_sentence, return_tensors="pt")["attention_mask"]
-
-
+        if attention_mask is None:
+            raise ValueError(f"Model type {self.config.model_type} does not support attention visualization")

         model.config._attn_implementation = "eager"
         model.train()
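For context, the visualizer touched by this fix is typically driven as below; unsupported checkpoints now fail with the explicit `ValueError` added above rather than an opaque error. The repo id is only an example:

```python
from transformers.utils.attention_visualizer import AttentionMaskVisualizer

visualizer = AttentionMaskVisualizer("meta-llama/Llama-3.2-1B")  # example checkpoint
visualizer("The quick brown fox jumps over the lazy dog")
```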