transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +20 -1
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +68 -5
- transformers/core_model_loading.py +201 -35
- transformers/dependency_versions_table.py +1 -1
- transformers/feature_extraction_utils.py +54 -22
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +162 -122
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +101 -64
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +2 -12
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +12 -0
- transformers/integrations/accelerate.py +44 -111
- transformers/integrations/aqlm.py +3 -5
- transformers/integrations/awq.py +2 -5
- transformers/integrations/bitnet.py +5 -8
- transformers/integrations/bitsandbytes.py +16 -15
- transformers/integrations/deepspeed.py +18 -3
- transformers/integrations/eetq.py +3 -5
- transformers/integrations/fbgemm_fp8.py +1 -1
- transformers/integrations/finegrained_fp8.py +6 -16
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/higgs.py +2 -5
- transformers/integrations/hub_kernels.py +23 -5
- transformers/integrations/integration_utils.py +35 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +4 -10
- transformers/integrations/peft.py +5 -0
- transformers/integrations/quanto.py +5 -2
- transformers/integrations/spqr.py +3 -5
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/vptq.py +3 -5
- transformers/modeling_gguf_pytorch_utils.py +66 -19
- transformers/modeling_rope_utils.py +78 -81
- transformers/modeling_utils.py +583 -503
- transformers/models/__init__.py +19 -0
- transformers/models/afmoe/modeling_afmoe.py +7 -16
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/align/modeling_align.py +12 -6
- transformers/models/altclip/modeling_altclip.py +7 -3
- transformers/models/apertus/modeling_apertus.py +4 -2
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +1 -1
- transformers/models/aria/modeling_aria.py +8 -4
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +27 -0
- transformers/models/auto/feature_extraction_auto.py +7 -3
- transformers/models/auto/image_processing_auto.py +4 -2
- transformers/models/auto/modeling_auto.py +31 -0
- transformers/models/auto/processing_auto.py +4 -0
- transformers/models/auto/tokenization_auto.py +132 -153
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +18 -19
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +9 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +3 -0
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
- transformers/models/bit/modeling_bit.py +5 -1
- transformers/models/bitnet/modeling_bitnet.py +1 -1
- transformers/models/blenderbot/modeling_blenderbot.py +7 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +8 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -0
- transformers/models/bloom/modeling_bloom.py +13 -44
- transformers/models/blt/modeling_blt.py +162 -2
- transformers/models/blt/modular_blt.py +168 -3
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +6 -0
- transformers/models/bros/modeling_bros.py +8 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/canine/modeling_canine.py +6 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +9 -4
- transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +25 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clipseg/modeling_clipseg.py +4 -0
- transformers/models/clvp/modeling_clvp.py +14 -3
- transformers/models/code_llama/tokenization_code_llama.py +1 -1
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/cohere/modeling_cohere.py +1 -1
- transformers/models/cohere2/modeling_cohere2.py +1 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
- transformers/models/convbert/modeling_convbert.py +3 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +3 -1
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +14 -2
- transformers/models/cvt/modeling_cvt.py +5 -1
- transformers/models/cwm/modeling_cwm.py +1 -1
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +46 -39
- transformers/models/d_fine/modular_d_fine.py +15 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +1 -1
- transformers/models/dac/modeling_dac.py +4 -4
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +1 -1
- transformers/models/deberta/modeling_deberta.py +2 -0
- transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
- transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
- transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +8 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +12 -1
- transformers/models/dia/modular_dia.py +11 -0
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +3 -3
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
- transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/doge/modeling_doge.py +1 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +16 -12
- transformers/models/dots1/modeling_dots1.py +14 -5
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +5 -2
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +5 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +8 -2
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt_fast.py +46 -14
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +6 -1
- transformers/models/evolla/modeling_evolla.py +9 -1
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +1 -1
- transformers/models/falcon/modeling_falcon.py +3 -3
- transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
- transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
- transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +14 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +4 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
- transformers/models/florence2/modeling_florence2.py +20 -3
- transformers/models/florence2/modular_florence2.py +13 -0
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +16 -0
- transformers/models/gemma/modeling_gemma.py +10 -12
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma2/modeling_gemma2.py +1 -1
- transformers/models/gemma2/modular_gemma2.py +1 -1
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +28 -7
- transformers/models/gemma3/modular_gemma3.py +26 -6
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +47 -9
- transformers/models/gemma3n/modular_gemma3n.py +51 -9
- transformers/models/git/modeling_git.py +181 -126
- transformers/models/glm/modeling_glm.py +1 -1
- transformers/models/glm4/modeling_glm4.py +1 -1
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +15 -5
- transformers/models/glm4v/modular_glm4v.py +11 -3
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
- transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +8 -5
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
- transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
- transformers/models/gptj/modeling_gptj.py +15 -6
- transformers/models/granite/modeling_granite.py +1 -1
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +2 -3
- transformers/models/granitemoe/modular_granitemoe.py +1 -2
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
- transformers/models/groupvit/modeling_groupvit.py +6 -1
- transformers/models/helium/modeling_helium.py +1 -1
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
- transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
- transformers/models/hubert/modeling_hubert.py +4 -0
- transformers/models/hubert/modular_hubert.py +4 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +16 -0
- transformers/models/idefics/modeling_idefics.py +10 -0
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +9 -2
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +11 -8
- transformers/models/internvl/modular_internvl.py +5 -9
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +24 -19
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +15 -7
- transformers/models/janus/modular_janus.py +16 -7
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +14 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/configuration_lasr.py +4 -0
- transformers/models/lasr/modeling_lasr.py +3 -2
- transformers/models/lasr/modular_lasr.py +8 -1
- transformers/models/lasr/processing_lasr.py +0 -2
- transformers/models/layoutlm/modeling_layoutlm.py +5 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +18 -0
- transformers/models/lfm2/modeling_lfm2.py +1 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lilt/modeling_lilt.py +19 -15
- transformers/models/llama/modeling_llama.py +1 -1
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +8 -4
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
- transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
- transformers/models/longt5/modeling_longt5.py +0 -4
- transformers/models/m2m_100/modeling_m2m_100.py +10 -0
- transformers/models/mamba/modeling_mamba.py +2 -1
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +3 -0
- transformers/models/markuplm/modeling_markuplm.py +5 -8
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +9 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +9 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mimi/modeling_mimi.py +25 -4
- transformers/models/minimax/modeling_minimax.py +16 -3
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +1 -1
- transformers/models/mistral/modeling_mistral.py +1 -1
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +12 -4
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +13 -2
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +4 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
- transformers/models/modernbert/modeling_modernbert.py +12 -1
- transformers/models/modernbert/modular_modernbert.py +12 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
- transformers/models/moonshine/modeling_moonshine.py +1 -1
- transformers/models/moshi/modeling_moshi.py +21 -51
- transformers/models/mpnet/modeling_mpnet.py +2 -0
- transformers/models/mra/modeling_mra.py +4 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +0 -10
- transformers/models/musicgen/modeling_musicgen.py +5 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +1 -1
- transformers/models/nemotron/modeling_nemotron.py +3 -3
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +11 -16
- transformers/models/nystromformer/modeling_nystromformer.py +7 -0
- transformers/models/olmo/modeling_olmo.py +1 -1
- transformers/models/olmo2/modeling_olmo2.py +1 -1
- transformers/models/olmo3/modeling_olmo3.py +1 -1
- transformers/models/olmoe/modeling_olmoe.py +12 -4
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +7 -38
- transformers/models/openai/modeling_openai.py +12 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +7 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +7 -3
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/modeling_parakeet.py +5 -0
- transformers/models/parakeet/modular_parakeet.py +5 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
- transformers/models/patchtst/modeling_patchtst.py +5 -4
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/models/pe_audio/processing_pe_audio.py +24 -0
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +3 -0
- transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +5 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +1 -1
- transformers/models/phi/modeling_phi.py +1 -1
- transformers/models/phi3/modeling_phi3.py +1 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +12 -4
- transformers/models/phimoe/modular_phimoe.py +1 -1
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +1 -1
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +7 -0
- transformers/models/plbart/modular_plbart.py +6 -0
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +11 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prophetnet/modeling_prophetnet.py +2 -1
- transformers/models/qwen2/modeling_qwen2.py +1 -1
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
- transformers/models/qwen3/modeling_qwen3.py +1 -1
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
- transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +7 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
- transformers/models/reformer/modeling_reformer.py +9 -1
- transformers/models/regnet/modeling_regnet.py +4 -0
- transformers/models/rembert/modeling_rembert.py +7 -1
- transformers/models/resnet/modeling_resnet.py +8 -3
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +4 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +1 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +5 -1
- transformers/models/sam2/modular_sam2.py +5 -1
- transformers/models/sam2_video/modeling_sam2_video.py +51 -43
- transformers/models/sam2_video/modular_sam2_video.py +31 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +23 -0
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +3 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
- transformers/models/seed_oss/modeling_seed_oss.py +1 -1
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +2 -2
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +63 -41
- transformers/models/smollm3/modeling_smollm3.py +1 -1
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
- transformers/models/speecht5/modeling_speecht5.py +28 -0
- transformers/models/splinter/modeling_splinter.py +9 -3
- transformers/models/squeezebert/modeling_squeezebert.py +2 -0
- transformers/models/stablelm/modeling_stablelm.py +1 -1
- transformers/models/starcoder2/modeling_starcoder2.py +1 -1
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/swiftformer/modeling_swiftformer.py +4 -0
- transformers/models/swin/modeling_swin.py +16 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +49 -33
- transformers/models/swinv2/modeling_swinv2.py +41 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +1 -7
- transformers/models/t5gemma/modeling_t5gemma.py +1 -1
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +1 -1
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/timesfm/modeling_timesfm.py +12 -0
- transformers/models/timesfm/modular_timesfm.py +12 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
- transformers/models/trocr/modeling_trocr.py +1 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +4 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +3 -7
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +0 -6
- transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +7 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/visual_bert/modeling_visual_bert.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +4 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +5 -3
- transformers/models/x_clip/modeling_x_clip.py +2 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +10 -0
- transformers/models/xlm/modeling_xlm.py +13 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +4 -1
- transformers/models/zamba/modeling_zamba.py +2 -1
- transformers/models/zamba2/modeling_zamba2.py +3 -2
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +7 -0
- transformers/pipelines/__init__.py +9 -6
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/document_question_answering.py +1 -1
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +127 -56
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +9 -64
- transformers/quantizers/quantizer_aqlm.py +1 -18
- transformers/quantizers/quantizer_auto_round.py +1 -10
- transformers/quantizers/quantizer_awq.py +3 -8
- transformers/quantizers/quantizer_bitnet.py +1 -6
- transformers/quantizers/quantizer_bnb_4bit.py +9 -49
- transformers/quantizers/quantizer_bnb_8bit.py +9 -19
- transformers/quantizers/quantizer_compressed_tensors.py +1 -4
- transformers/quantizers/quantizer_eetq.py +2 -12
- transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
- transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
- transformers/quantizers/quantizer_fp_quant.py +4 -4
- transformers/quantizers/quantizer_gptq.py +1 -4
- transformers/quantizers/quantizer_higgs.py +2 -6
- transformers/quantizers/quantizer_mxfp4.py +2 -28
- transformers/quantizers/quantizer_quanto.py +14 -14
- transformers/quantizers/quantizer_spqr.py +3 -8
- transformers/quantizers/quantizer_torchao.py +28 -124
- transformers/quantizers/quantizer_vptq.py +1 -10
- transformers/testing_utils.py +28 -12
- transformers/tokenization_mistral_common.py +3 -2
- transformers/tokenization_utils_base.py +3 -2
- transformers/tokenization_utils_tokenizers.py +25 -2
- transformers/trainer.py +24 -2
- transformers/trainer_callback.py +8 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/training_args.py +8 -10
- transformers/utils/__init__.py +4 -0
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +34 -25
- transformers/utils/generic.py +20 -0
- transformers/utils/import_utils.py +51 -9
- transformers/utils/kernel_config.py +71 -18
- transformers/utils/quantization_config.py +8 -8
- transformers/video_processing_utils.py +16 -12
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
|
@@ -20,12 +20,11 @@ import os
|
|
|
20
20
|
from abc import ABC, abstractmethod
|
|
21
21
|
from collections.abc import Callable
|
|
22
22
|
from dataclasses import dataclass, is_dataclass
|
|
23
|
-
from typing import TYPE_CHECKING, Any, Optional
|
|
23
|
+
from typing import TYPE_CHECKING, Any, Optional, Union
|
|
24
24
|
|
|
25
25
|
from huggingface_hub import create_repo
|
|
26
26
|
|
|
27
27
|
from .. import __version__
|
|
28
|
-
from ..configuration_utils import PreTrainedConfig
|
|
29
28
|
from ..utils import (
|
|
30
29
|
GENERATION_CONFIG_NAME,
|
|
31
30
|
ExplicitEnum,
|
|
@@ -38,6 +37,7 @@ from ..utils import (
|
|
|
38
37
|
|
|
39
38
|
|
|
40
39
|
if TYPE_CHECKING:
|
|
40
|
+
from ..configuration_utils import PreTrainedConfig
|
|
41
41
|
from ..modeling_utils import PreTrainedModel
|
|
42
42
|
|
|
43
43
|
|
|
@@ -104,18 +104,18 @@ class GenerationConfig(PushToHubMixin):
|
|
|
104
104
|
Arg:
|
|
105
105
|
> Parameters that control the length of the output
|
|
106
106
|
|
|
107
|
-
max_length (`int`, *optional
|
|
107
|
+
max_length (`int`, *optional*):
|
|
108
108
|
`max_new_tokens` is recommended for controlling how many tokens the model generates.
|
|
109
109
|
`max_length` remains for backward compatibility.
|
|
110
110
|
|
|
111
111
|
max_new_tokens (`int`, *optional*):
|
|
112
112
|
The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt.
|
|
113
|
-
min_length (`int`, *optional
|
|
113
|
+
min_length (`int`, *optional*):
|
|
114
114
|
The minimum length of the sequence to be generated. Corresponds to the length of the input prompt +
|
|
115
115
|
`min_new_tokens`. Its effect is overridden by `min_new_tokens`, if also set.
|
|
116
116
|
min_new_tokens (`int`, *optional*):
|
|
117
117
|
The minimum numbers of tokens to generate, ignoring the number of tokens in the prompt.
|
|
118
|
-
early_stopping (`bool` or `str`, *optional
|
|
118
|
+
early_stopping (`bool` or `str`, *optional*):
|
|
119
119
|
Controls the stopping condition for beam-based methods, like beam-search. It accepts the following values:
|
|
120
120
|
`True`, where the generation stops as soon as there are `num_beams` complete candidates; `False`, where an
|
|
121
121
|
heuristic is applied and the generation stops when is it very unlikely to find better candidates;
|
|
@@ -129,17 +129,17 @@ class GenerationConfig(PushToHubMixin):
|
|
|
129
129
|
|
|
130
130
|
> Parameters that control the generation strategy used
|
|
131
131
|
|
|
132
|
-
do_sample (`bool`,
|
|
132
|
+
do_sample (`bool`, defaults to `False`):
|
|
133
133
|
Whether or not to use sampling ; use greedy decoding otherwise.
|
|
134
|
-
num_beams (`int`, *optional
|
|
134
|
+
num_beams (`int`, *optional*):
|
|
135
135
|
Number of beams for beam search. 1 means no beam search.
|
|
136
136
|
|
|
137
137
|
> Parameters that control the cache
|
|
138
138
|
|
|
139
|
-
use_cache (`bool`,
|
|
139
|
+
use_cache (`bool`, defaults to `True`):
|
|
140
140
|
Whether or not the model should use the past last key/values attentions (if applicable to the model) to
|
|
141
141
|
speed up decoding.
|
|
142
|
-
cache_implementation (`str`, *optional
|
|
142
|
+
cache_implementation (`str`, *optional*):
|
|
143
143
|
Name of the cache class that will be instantiated in `generate`, for faster decoding. Possible values are:
|
|
144
144
|
|
|
145
145
|
- `"dynamic"`: [`DynamicCache`]
|
|
@@ -155,11 +155,11 @@ class GenerationConfig(PushToHubMixin):
|
|
|
155
155
|
|
|
156
156
|
> Parameters for manipulation of the model output logits
|
|
157
157
|
|
|
158
|
-
temperature (`float`, *optional
|
|
158
|
+
temperature (`float`, *optional*):
|
|
159
159
|
The value used to module the next token probabilities. This value is set in a model's `generation_config.json` file. If it isn't set, the default value is 1.0
|
|
160
|
-
top_k (`int`, *optional
|
|
160
|
+
top_k (`int`, *optional*):
|
|
161
161
|
The number of highest probability vocabulary tokens to keep for top-k-filtering. This value is set in a model's `generation_config.json` file. If it isn't set, the default value is 50.
|
|
162
|
-
top_p (`float`, *optional
|
|
162
|
+
top_p (`float`, *optional*):
|
|
163
163
|
If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to
|
|
164
164
|
`top_p` or higher are kept for generation. This value is set in a model's `generation_config.json` file. If it isn't set, the default value is 1.0
|
|
165
165
|
min_p (`float`, *optional*):
|
|
@@ -172,41 +172,41 @@ class GenerationConfig(PushToHubMixin):
|
|
|
172
172
|
is kept whose *renormalized* entropy is less than or equal to `top_h` times the entropy of the full distribution.
|
|
173
173
|
Smaller values (e.g., 0.2–0.5) lead to more focused, deterministic outputs, while values closer to 1.0 allow more
|
|
174
174
|
randomness and diversity. Typical values are in the 0.3–0.6 range.
|
|
175
|
-
typical_p (`float`, *optional
|
|
175
|
+
typical_p (`float`, *optional*):
|
|
176
176
|
Local typicality measures how similar the conditional probability of predicting a target token next is to
|
|
177
177
|
the expected conditional probability of predicting a random token next, given the partial text already
|
|
178
178
|
generated. If set to float < 1, the smallest set of the most locally typical tokens with probabilities that
|
|
179
179
|
add up to `typical_p` or higher are kept for generation. See [this
|
|
180
180
|
paper](https://huggingface.co/papers/2202.00666) for more details.
|
|
181
|
-
epsilon_cutoff (`float`, *optional
|
|
181
|
+
epsilon_cutoff (`float`, *optional*):
|
|
182
182
|
If set to float strictly between 0 and 1, only tokens with a conditional probability greater than
|
|
183
183
|
`epsilon_cutoff` will be sampled. In the paper, suggested values range from 3e-4 to 9e-4, depending on the
|
|
184
184
|
size of the model. See [Truncation Sampling as Language Model
|
|
185
185
|
Desmoothing](https://huggingface.co/papers/2210.15191) for more details.
|
|
186
|
-
eta_cutoff (`float`, *optional
|
|
186
|
+
eta_cutoff (`float`, *optional*):
|
|
187
187
|
Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly between
|
|
188
188
|
0 and 1, a token is only considered if it is greater than either `eta_cutoff` or `sqrt(eta_cutoff) *
|
|
189
189
|
exp(-entropy(softmax(next_token_logits)))`. The latter term is intuitively the expected next token
|
|
190
190
|
probability, scaled by `sqrt(eta_cutoff)`. In the paper, suggested values range from 3e-4 to 2e-3,
|
|
191
191
|
depending on the size of the model. See [Truncation Sampling as Language Model
|
|
192
192
|
Desmoothing](https://huggingface.co/papers/2210.15191) for more details.
|
|
193
|
-
repetition_penalty (`float`, *optional
|
|
193
|
+
repetition_penalty (`float`, *optional*):
|
|
194
194
|
The parameter for repetition penalty. 1.0 means no penalty. See [this
|
|
195
195
|
paper](https://huggingface.co/papers/1909.05858) for more details.
|
|
196
|
-
encoder_repetition_penalty (`float`, *optional
|
|
196
|
+
encoder_repetition_penalty (`float`, *optional*):
|
|
197
197
|
The parameter for encoder_repetition_penalty. An exponential penalty on sequences that are not in the
|
|
198
198
|
original input. 1.0 means no penalty.
|
|
199
|
-
length_penalty (`float`, *optional
|
|
199
|
+
length_penalty (`float`, *optional*):
|
|
200
200
|
Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
|
|
201
201
|
the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
|
|
202
202
|
likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
|
|
203
203
|
`length_penalty` < 0.0 encourages shorter sequences.
|
|
204
|
-
no_repeat_ngram_size (`int`, *optional
|
|
204
|
+
no_repeat_ngram_size (`int`, *optional*):
|
|
205
205
|
If set to int > 0, all ngrams of that size can only occur once.
|
|
206
206
|
bad_words_ids (`list[list[int]]`, *optional*):
|
|
207
207
|
List of list of token ids that are not allowed to be generated. Check
|
|
208
208
|
[`~generation.NoBadWordsLogitsProcessor`] for further documentation and examples.
|
|
209
|
-
renormalize_logits (`bool`,
|
|
209
|
+
renormalize_logits (`bool`, defaults to `False`):
|
|
210
210
|
Whether to renormalize the logits after applying all the logits processors (including the custom
|
|
211
211
|
ones). It's highly recommended to set this flag to `True` as the search algorithms suppose the score logits
|
|
212
212
|
are normalized but some logit processors break the normalization.
|
|
@@ -217,7 +217,7 @@ class GenerationConfig(PushToHubMixin):
|
|
|
217
217
|
forced_eos_token_id (`int` or list[int]`, *optional*, defaults to `model.config.forced_eos_token_id`):
|
|
218
218
|
The id of the token to force as the last generated token when `max_length` is reached. Optionally, use a
|
|
219
219
|
list to set multiple *end-of-sequence* tokens.
|
|
220
|
-
remove_invalid_values (`bool`,
|
|
220
|
+
remove_invalid_values (`bool`, defaults to `model.config.remove_invalid_values`):
|
|
221
221
|
Whether to remove possible *nan* and *inf* outputs of the model to prevent the generation method to crash.
|
|
222
222
|
Note that using `remove_invalid_values` can slow down generation.
|
|
223
223
|
exponential_decay_length_penalty (`tuple(int, float)`, *optional*):
|
|
@@ -234,7 +234,7 @@ class GenerationConfig(PushToHubMixin):
|
|
|
234
234
|
Dictionary that maps a sequence of tokens to its bias term. Positive biases increase the odds of the
|
|
235
235
|
sequence being selected, while negative biases do the opposite. Check
|
|
236
236
|
[`~generation.SequenceBiasLogitsProcessor`] for further documentation and examples.
|
|
237
|
-
token_healing (`bool`,
|
|
237
|
+
token_healing (`bool`, defaults to `False`):
|
|
238
238
|
Heal tail tokens of prompts by replacing them with their appropriate extensions.
|
|
239
239
|
This enhances the quality of completions for prompts affected by greedy tokenization bias.
|
|
240
240
|
guidance_scale (`float`, *optional*):
|
|
@@ -250,18 +250,18 @@ class GenerationConfig(PushToHubMixin):
|
|
|
250
250
|
|
|
251
251
|
num_return_sequences (`int`, *optional*, defaults to 1):
|
|
252
252
|
The number of independently computed returned sequences for each element in the batch.
|
|
253
|
-
output_attentions (`bool`,
|
|
253
|
+
output_attentions (`bool`, defaults to `False`):
|
|
254
254
|
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
|
255
255
|
tensors for more details.
|
|
256
|
-
output_hidden_states (`bool`,
|
|
256
|
+
output_hidden_states (`bool`, defaults to `False`):
|
|
257
257
|
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
|
|
258
258
|
more details.
|
|
259
|
-
output_scores (`bool`,
|
|
259
|
+
output_scores (`bool`, defaults to `False`):
|
|
260
260
|
Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
|
|
261
|
-
output_logits (`bool`,
|
|
261
|
+
output_logits (`bool`, defaults to `False`):
|
|
262
262
|
Whether or not to return the unprocessed prediction logit scores. See `logits` under returned tensors for
|
|
263
263
|
more details.
|
|
264
|
-
return_dict_in_generate (`bool`,
|
|
264
|
+
return_dict_in_generate (`bool`, defaults to `False`):
|
|
265
265
|
Whether or not to return a [`~utils.ModelOutput`], as opposed to returning exclusively the generated
|
|
266
266
|
sequence. This flag must be set to `True` to return the generation cache (when `use_cache` is `True`)
|
|
267
267
|
or optional outputs (see flags starting with `output_`)
|
|
@@ -277,7 +277,7 @@ class GenerationConfig(PushToHubMixin):
|
|
|
277
277
|
|
|
278
278
|
> Generation parameters exclusive to encoder-decoder models
|
|
279
279
|
|
|
280
|
-
encoder_no_repeat_ngram_size (`int`, *optional
|
|
280
|
+
encoder_no_repeat_ngram_size (`int`, *optional*):
|
|
281
281
|
If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the
|
|
282
282
|
`decoder_input_ids`.
|
|
283
283
|
decoder_start_token_id (`int` or `list[int]`, *optional*):
|
|
@@ -286,20 +286,20 @@ class GenerationConfig(PushToHubMixin):
|
|
|
286
286
|
(e.g. multilingual models with different target languages in one batch)
|
|
287
287
|
|
|
288
288
|
> Generation parameters exclusive to assistant generation
|
|
289
|
-
is_assistant (`bool`,
|
|
289
|
+
is_assistant (`bool`, defaults to `False`):
|
|
290
290
|
Whether the model is an assistant (draft) model.
|
|
291
|
-
num_assistant_tokens (`int`, *optional
|
|
291
|
+
num_assistant_tokens (`int`, *optional*):
|
|
292
292
|
Defines the number of _speculative tokens_ that shall be generated by the assistant model before being
|
|
293
293
|
checked by the target model at each iteration. Higher values for `num_assistant_tokens` make the generation
|
|
294
294
|
more _speculative_ : If the assistant model is performant larger speed-ups can be reached, if the assistant
|
|
295
295
|
model requires lots of corrections, lower speed-ups are reached.
|
|
296
|
-
num_assistant_tokens_schedule (`str`, *optional
|
|
296
|
+
num_assistant_tokens_schedule (`str`, *optional*):
|
|
297
297
|
Defines the schedule at which max assistant tokens shall be changed during inference.
|
|
298
298
|
- `"heuristic"`: When all speculative tokens are correct, increase `num_assistant_tokens` by 2 else
|
|
299
299
|
reduce by 1. `num_assistant_tokens` value is persistent over multiple generation calls with the same assistant model.
|
|
300
300
|
- `"heuristic_transient"`: Same as `"heuristic"` but `num_assistant_tokens` is reset to its initial value after each generation call.
|
|
301
301
|
- `"constant"`: `num_assistant_tokens` stays unchanged during generation
|
|
302
|
-
assistant_confidence_threshold (`float`, *optional
|
|
302
|
+
assistant_confidence_threshold (`float`, *optional*):
|
|
303
303
|
The confidence threshold for the assistant model. If the assistant model's confidence in its prediction for the current token is lower
|
|
304
304
|
than this threshold, the assistant model stops the current token generation iteration, even if the number of _speculative tokens_
|
|
305
305
|
(defined by `num_assistant_tokens`) is not yet reached. The assistant's confidence threshold is adjusted throughout the speculative iterations to reduce the number of unnecessary draft and target forward passes, biased towards avoiding false negatives.
|
|
@@ -313,11 +313,11 @@ class GenerationConfig(PushToHubMixin):
|
|
|
313
313
|
assistant_early_exit(`int`, *optional*):
|
|
314
314
|
If set to a positive integer, early exit of the model will be used as an assistant. Can only be used with
|
|
315
315
|
models that support early exit (i.e. models where logits from intermediate layers can be interpreted by the LM head).
|
|
316
|
-
assistant_lookbehind(`int`, *optional
|
|
316
|
+
assistant_lookbehind(`int`, *optional*):
|
|
317
317
|
If set to a positive integer, the re-encodeing process will additionally consider the last `assistant_lookbehind` assistant tokens
|
|
318
318
|
to correctly align tokens. Can only be used with different tokenizers in speculative decoding.
|
|
319
319
|
See this [blog](https://huggingface.co/blog/universal_assisted_generation) for more details.
|
|
320
|
-
target_lookbehind(`int`, *optional
|
|
320
|
+
target_lookbehind(`int`, *optional*):
|
|
321
321
|
If set to a positive integer, the re-encodeing process will additionally consider the last `target_lookbehind` target tokens
|
|
322
322
|
to correctly align tokens. Can only be used with different tokenizers in speculative decoding.
|
|
323
323
|
See this [blog](https://huggingface.co/blog/universal_assisted_generation) for more details.
|
|
@@ -327,7 +327,7 @@ class GenerationConfig(PushToHubMixin):
         compile_config (CompileConfig, *optional*):
             If using a compilable cache, this controls how `generate` will `compile` the forward pass for faster
             inference.
-        disable_compile (`bool`,
+        disable_compile (`bool`, defaults to `False`):
             Whether to disable the automatic compilation of the forward pass. Automatic compilation happens when
             specific criteria are met, including using a compilable cache. Please open an issue if you find the
             need to use this flag.
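In practice `disable_compile` is only consulted when automatic compilation would otherwise trigger, for instance when a compilable (static-style) cache is requested. A minimal, hedged sketch of opting out:

```python
from transformers import GenerationConfig

# A compilable cache is one of the criteria that enables automatic compilation of the
# forward pass; `disable_compile=True` opts out. Values are illustrative.
gen_config = GenerationConfig(
    cache_implementation="static",
    disable_compile=True,
)
```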
@@ -337,38 +337,36 @@ class GenerationConfig(PushToHubMixin):

     def __init__(self, **kwargs):
         # Parameters that control the length of the output
-        self.max_length = kwargs.pop("max_length", 20)
+        self.max_length = kwargs.pop("max_length", None)
         self.max_new_tokens = kwargs.pop("max_new_tokens", None)
-        self.min_length = kwargs.pop("min_length", 0)
+        self.min_length = kwargs.pop("min_length", None)
         self.min_new_tokens = kwargs.pop("min_new_tokens", None)
-        self.early_stopping = kwargs.pop("early_stopping", False)
+        self.early_stopping = kwargs.pop("early_stopping", None)
         self.max_time = kwargs.pop("max_time", None)
         self.stop_strings = kwargs.pop("stop_strings", None)

         # Parameters that control the generation strategy used
         self.do_sample = kwargs.pop("do_sample", False)
-        self.num_beams = kwargs.pop("num_beams", 1)
+        self.num_beams = kwargs.pop("num_beams", None)

         # Parameters that control the cache
         self.use_cache = kwargs.pop("use_cache", True)
         self.cache_implementation = kwargs.pop("cache_implementation", None)
         self.cache_config = kwargs.pop("cache_config", None)

-        self.prefill_chunk_size = kwargs.pop("prefill_chunk_size", None)
-
         # Parameters for manipulation of the model output logits
-        self.temperature = kwargs.pop("temperature", 1.0)
-        self.top_k = kwargs.pop("top_k", 50)
-        self.top_p = kwargs.pop("top_p", 1.0)
+        self.temperature = kwargs.pop("temperature", None)
+        self.top_k = kwargs.pop("top_k", None)
+        self.top_p = kwargs.pop("top_p", None)
         self.min_p = kwargs.pop("min_p", None)
         self.top_h = kwargs.pop("top_h", None)
-        self.typical_p = kwargs.pop("typical_p", 1.0)
-        self.epsilon_cutoff = kwargs.pop("epsilon_cutoff", 0.0)
-        self.eta_cutoff = kwargs.pop("eta_cutoff", 0.0)
-        self.repetition_penalty = kwargs.pop("repetition_penalty", 1.0)
-        self.encoder_repetition_penalty = kwargs.pop("encoder_repetition_penalty", 1.0)
-        self.length_penalty = kwargs.pop("length_penalty", 1.0)
-        self.no_repeat_ngram_size = kwargs.pop("no_repeat_ngram_size", 0)
+        self.typical_p = kwargs.pop("typical_p", None)
+        self.epsilon_cutoff = kwargs.pop("epsilon_cutoff", None)
+        self.eta_cutoff = kwargs.pop("eta_cutoff", None)
+        self.repetition_penalty = kwargs.pop("repetition_penalty", None)
+        self.encoder_repetition_penalty = kwargs.pop("encoder_repetition_penalty", None)
+        self.length_penalty = kwargs.pop("length_penalty", None)
+        self.no_repeat_ngram_size = kwargs.pop("no_repeat_ngram_size", None)
         self.bad_words_ids = kwargs.pop("bad_words_ids", None)
         self.renormalize_logits = kwargs.pop("renormalize_logits", False)
         self.forced_bos_token_id = kwargs.pop("forced_bos_token_id", None)
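This block is the core behavioural change of the hunk: length, beam and logit-manipulation knobs are now stored as `None` ("unset") instead of hard-coded defaults such as `max_length=20` or `temperature=1.0`. A small sketch of what that means for code reading these attributes, assuming the rc2 constructor shown above:

```python
from transformers import GenerationConfig

cfg = GenerationConfig()
# With the rc2 constructor above these are unset rather than 20 / 1.0 / 50 / 1.0:
print(cfg.max_length, cfg.temperature, cfg.top_k, cfg.top_p)

# Downstream code should therefore treat None as "fall back to a default", e.g.:
effective_temperature = cfg.temperature if cfg.temperature is not None else 1.0
```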
@@ -381,20 +379,16 @@ class GenerationConfig(PushToHubMixin):
         self.token_healing = kwargs.pop("token_healing", False)
         self.guidance_scale = kwargs.pop("guidance_scale", None)

-        watermarking_config = kwargs.pop("watermarking_config", None)
-        if watermarking_config is None:
-            self.watermarking_config = None
-        elif isinstance(watermarking_config, BaseWatermarkingConfig):
-            self.watermarking_config = watermarking_config
-        else:
-            self.watermarking_config = WatermarkingConfig.from_dict(watermarking_config)
+        self.watermarking_config = kwargs.pop("watermarking_config", None)
+        if isinstance(self.watermarking_config, dict):
+            self.watermarking_config = WatermarkingConfig.from_dict(self.watermarking_config)

         # Parameters that define the output variables of `generate`
         self.num_return_sequences = kwargs.pop("num_return_sequences", 1)
         self.output_attentions = kwargs.pop("output_attentions", False)
         self.output_hidden_states = kwargs.pop("output_hidden_states", False)
         self.output_scores = kwargs.pop("output_scores", False)
-        self.output_logits = kwargs.pop("output_logits", None)
+        self.output_logits = kwargs.pop("output_logits", False)
         self.return_dict_in_generate = kwargs.pop("return_dict_in_generate", False)

         # Special tokens that can be used at generation time
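The watermarking branch above is simpler than before: a plain dict is converted through `WatermarkingConfig.from_dict`, while an already-built config object (or `None`) is stored as is. A hedged example — the field values are illustrative, not recommendations:

```python
from transformers import GenerationConfig, WatermarkingConfig

# Passing a plain dict: converted via WatermarkingConfig.from_dict per the hunk above.
cfg = GenerationConfig(watermarking_config={"greenlist_ratio": 0.25, "bias": 2.0})

# Passing a config object directly: stored unchanged.
cfg = GenerationConfig(watermarking_config=WatermarkingConfig(greenlist_ratio=0.25, bias=2.0))
```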
@@ -403,57 +397,57 @@ class GenerationConfig(PushToHubMixin):
         self.eos_token_id = kwargs.pop("eos_token_id", None)

         # Generation parameters exclusive to encoder-decoder models
-        self.encoder_no_repeat_ngram_size = kwargs.pop("encoder_no_repeat_ngram_size", 0)
+        self.encoder_no_repeat_ngram_size = kwargs.pop("encoder_no_repeat_ngram_size", None)
         self.decoder_start_token_id = kwargs.pop("decoder_start_token_id", None)

         # Assistant generation
-        self.is_assistant = False
-        self.num_assistant_tokens = kwargs.pop("num_assistant_tokens", 20)
-        self.num_assistant_tokens_schedule = kwargs.pop("num_assistant_tokens_schedule", "constant")
-        self.assistant_confidence_threshold = kwargs.pop("assistant_confidence_threshold", 0.4)
+        self.is_assistant = kwargs.pop("is_assistant", False)
+        self.num_assistant_tokens = kwargs.pop("num_assistant_tokens", None)
+        self.num_assistant_tokens_schedule = kwargs.pop("num_assistant_tokens_schedule", None)
+        self.assistant_confidence_threshold = kwargs.pop("assistant_confidence_threshold", None)
         self.prompt_lookup_num_tokens = kwargs.pop("prompt_lookup_num_tokens", None)
         self.max_matching_ngram_size = kwargs.pop("max_matching_ngram_size", None)
         self.assistant_early_exit = kwargs.pop("assistant_early_exit", None)
-
-        self.assistant_lookbehind = kwargs.pop("assistant_lookbehind", 10)
-        self.target_lookbehind = kwargs.pop("target_lookbehind", 10)
+        self.assistant_lookbehind = kwargs.pop("assistant_lookbehind", None)
+        self.target_lookbehind = kwargs.pop("target_lookbehind", None)

         # Performance
         self.compile_config = kwargs.pop("compile_config", None)
         self.disable_compile = kwargs.pop("disable_compile", False)

-        # Deprecated (moved to the Hub). TODO
+        # Deprecated (moved to the Hub). TODO remove for v5
         self.low_memory = kwargs.pop("low_memory", None)
         self.penalty_alpha = kwargs.pop("penalty_alpha", None)
         self.dola_layers = kwargs.pop("dola_layers", None)
-        self.diversity_penalty = kwargs.pop("diversity_penalty", 0.0)
-        self.num_beam_groups = kwargs.pop("num_beam_groups", 1)
+        self.diversity_penalty = kwargs.pop("diversity_penalty", None)
+        self.num_beam_groups = kwargs.pop("num_beam_groups", None)
         self.constraints = kwargs.pop("constraints", None)
         self.force_words_ids = kwargs.pop("force_words_ids", None)

-
-        # interface.
-        self._from_model_config = kwargs.pop("_from_model_config", False)
-        self._commit_hash = kwargs.pop("_commit_hash", None)
-        self.transformers_version = kwargs.pop("transformers_version", __version__)
+        self.prefill_chunk_size = kwargs.pop("prefill_chunk_size", None)

-        #
-
-
-
-            f"Please make sure the generation config includes `forced_bos_token_id={self.bos_token_id}`. "
-            )
+        # Common attributes
+        self._commit_hash = kwargs.pop("_commit_hash", None)
+        self._from_model_config = kwargs.pop("_from_model_config", None)
+        self.transformers_version = kwargs.pop("transformers_version", None)

         # Additional attributes without default values
         if not self._from_model_config:
-            # we don't want to copy values from the model config if we're initializing
-            # model's default configuration file
+            # we don't want to copy values from the model config if we're initializing
+            # a `GenerationConfig` from a model's default configuration file
             for key, value in kwargs.items():
                 try:
                     setattr(self, key, value)
                 except AttributeError as err:
                     logger.error(f"Can't set {key} with value {value} for {self}")
                     raise err
+        else:
+            # Ensure backward compatibility for models that use `forced_bos_token_id` within their config
+            if kwargs.get("force_bos_token_to_be_generated", False):
+                self.forced_bos_token_id = self.bos_token_id
+                logger.warning_once(
+                    f"Please make sure the generation config includes `forced_bos_token_id={self.bos_token_id}`. "
+                )

         # Validate the values of the attributes
         self.validate()
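Two details from the tail of this hunk are easy to miss: when the config is not being built from a model config, any unknown keyword argument is simply attached as an attribute, and the legacy `force_bos_token_to_be_generated` flag is now only honoured in the `_from_model_config` branch, where it maps to `forced_bos_token_id`. A hedged sketch of the first behaviour, assuming the rc2 constructor above:

```python
from transformers import GenerationConfig

# Unknown keyword arguments are attached as plain attributes by the loop shown above.
cfg = GenerationConfig(max_new_tokens=32, my_experiment_tag="run-42")
print(cfg.my_experiment_tag)  # "run-42" (the attribute name is purely illustrative)
```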
@@ -488,8 +482,8 @@ class GenerationConfig(PushToHubMixin):
         # property and part of the `__repr__`
         if self.constraints is not None or self.force_words_ids is not None:
             generation_mode = GenerationMode.CONSTRAINED_BEAM_SEARCH
-        elif self.num_beams == 1:
-            if self.do_sample is False:
+        elif self.num_beams is None or self.num_beams == 1:
+            if not self.do_sample:
                 if (
                     self.top_k is not None
                     and self.top_k > 1
@@ -502,9 +496,9 @@ class GenerationConfig(PushToHubMixin):
             else:
                 generation_mode = GenerationMode.SAMPLE
         else:
-            if self.num_beam_groups > 1:
+            if self.num_beam_groups is not None and self.num_beam_groups > 1:
                 generation_mode = GenerationMode.GROUP_BEAM_SEARCH
-            elif self.do_sample is True:
+            elif self.do_sample:
                 generation_mode = GenerationMode.BEAM_SAMPLE
             else:
                 generation_mode = GenerationMode.BEAM_SEARCH
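With `num_beams` now allowed to be `None`, mode selection treats "unset" the same as a single beam. A hedged check of the resulting decision tree via the public `get_generation_mode()` helper, assuming the rc2 logic above:

```python
from transformers import GenerationConfig
from transformers.generation.configuration_utils import GenerationMode

assert GenerationConfig().get_generation_mode() == GenerationMode.GREEDY_SEARCH
assert GenerationConfig(do_sample=True).get_generation_mode() == GenerationMode.SAMPLE
assert GenerationConfig(num_beams=4).get_generation_mode() == GenerationMode.BEAM_SEARCH
assert GenerationConfig(num_beams=4, do_sample=True).get_generation_mode() == GenerationMode.BEAM_SAMPLE
```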
@@ -537,6 +531,45 @@ class GenerationConfig(PushToHubMixin):
             )
         return generation_mode

+    @staticmethod
+    def _get_default_generation_params() -> dict[str, Any]:
+        return {
+            "max_length": 20,
+            "min_length": 0,
+            "do_sample": False,
+            "early_stopping": False,
+            "num_beams": 1,
+            "temperature": 1.0,
+            "top_k": 50,
+            "top_p": 1.0,
+            "typical_p": 1.0,
+            "repetition_penalty": 1.0,
+            "length_penalty": 1.0,
+            "no_repeat_ngram_size": 0,
+            "encoder_no_repeat_ngram_size": 0,
+            "bad_words_ids": None,
+            "num_return_sequences": 1,
+            "output_scores": False,
+            "return_dict_in_generate": False,
+            "forced_bos_token_id": None,
+            "forced_eos_token_id": None,
+            "remove_invalid_values": False,
+            "exponential_decay_length_penalty": None,
+            "suppress_tokens": None,
+            "begin_suppress_tokens": None,
+            "epsilon_cutoff": 0.0,
+            "eta_cutoff": 0.0,
+            "encoder_repetition_penalty": 1.0,
+            "num_assistant_tokens": 20,
+            "num_assistant_tokens_schedule": "constant",
+            "assistant_confidence_threshold": 0.4,
+            "assistant_lookbehind": 10,
+            "target_lookbehind": 10,
+            # Deprecated arguments (moved to the Hub). TODO joao, manuel: remove in v4.62.0
+            "num_beam_groups": 1,
+            "diversity_penalty": 0.0,
+        }
+
     def validate(self, strict=False):
         """
         Validates the values of the attributes of the [`GenerationConfig`] instance. Raises exceptions in the presence
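The new `_get_default_generation_params()` table pairs naturally with the `defaults_only` switch added to `update()` further down in this file: attributes that are still `None` can be backfilled with the historical defaults without clobbering user-set values. A hedged sketch of that pattern (note it calls a private helper, so it may change without notice):

```python
from transformers import GenerationConfig

cfg = GenerationConfig(do_sample=True, temperature=0.2)       # one knob set, the rest left as None
defaults = GenerationConfig._get_default_generation_params()  # the table added above
cfg.update(defaults_only=True, **defaults)                    # fills only attributes still set to None

print(cfg.temperature)  # 0.2 -- user value preserved
print(cfg.top_k)        # 50  -- backfilled from the defaults table
```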
@@ -552,7 +585,7 @@ class GenerationConfig(PushToHubMixin):

         # 1. Validation of individual attributes
         # 1.1. Decoding attributes
-        if self.early_stopping not in {True, False, "never"}:
+        if self.early_stopping not in {None, True, False, "never"}:
             raise ValueError(f"`early_stopping` must be a boolean or 'never', but is {self.early_stopping}.")
         if self.max_new_tokens is not None and self.max_new_tokens <= 0:
             raise ValueError(f"`max_new_tokens` must be greater than 0, but is {self.max_new_tokens}.")
@@ -583,9 +616,9 @@ class GenerationConfig(PushToHubMixin):

         # 2. Validation of attribute combinations
         # 2.1. detect sampling-only parameterization when not in sampling mode
-        if self.do_sample is False:
+        if not self.do_sample:
             greedy_wrong_parameter_msg = (
-                "`do_sample` is set to `False`. However, `{flag_name}` is set to `{flag_value}` -- this flag is only "
+                "`do_sample` is set not to set `True`. However, `{flag_name}` is set to `{flag_value}` -- this flag is only "
                 "used in sample-based generation modes. You should set `do_sample=True` or unset `{flag_name}`."
             )
             if self.temperature is not None and self.temperature != 1.0:
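The relaxed condition above means sampling-only flags are now flagged whenever `do_sample` is falsy (unset or `False`), not only when it is literally `False`. For example, under the rc2 validation shown here:

```python
from transformers import GenerationConfig

# `temperature` is a sampling-only flag: with `do_sample` left unset, validate() records
# a minor issue and warns that the flag will have no effect.
GenerationConfig(temperature=0.7)

# The intended, warning-free configuration:
GenerationConfig(do_sample=True, temperature=0.7)
```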
@@ -614,42 +647,42 @@ class GenerationConfig(PushToHubMixin):
                 )

         # 2.2. detect beam-only parameterization when not in beam mode
-        if self.num_beams == 1:
+        if self.num_beams is None or self.num_beams == 1:
             single_beam_wrong_parameter_msg = (
-                "`num_beams` is set to 1. However, `{flag_name}` is set to `{flag_value}` -- this flag is only used "
+                "`num_beams` is set to {num_beams}. However, `{flag_name}` is set to `{flag_value}` -- this flag is only used "
                 "in beam-based generation modes. You should set `num_beams>1` or unset `{flag_name}`."
             )
-            if self.early_stopping is not False:
+            if self.early_stopping is not None and self.early_stopping is not False:
                 minor_issues["early_stopping"] = single_beam_wrong_parameter_msg.format(
-                    flag_name="early_stopping", flag_value=self.early_stopping
+                    num_beams=self.num_beams, flag_name="early_stopping", flag_value=self.early_stopping
                 )
             if self.length_penalty is not None and self.length_penalty != 1.0:
                 minor_issues["length_penalty"] = single_beam_wrong_parameter_msg.format(
-                    flag_name="length_penalty", flag_value=self.length_penalty
+                    num_beams=self.num_beams, flag_name="length_penalty", flag_value=self.length_penalty
                 )

         # 2.4. check `num_return_sequences`
-        if self.num_return_sequences != 1:
-            if self.num_beams == 1:
-                if self.do_sample is False:
+        if self.num_return_sequences > 1:
+            if self.num_beams is None or self.num_beams == 1:
+                if not self.do_sample:
                     raise ValueError(
-                        "Greedy methods without beam search do not support `num_return_sequences` different than 1 "
-                        f"(got {self.num_return_sequences})."
+                        "Greedy methods (do_sample != True) without beam search do not support "
+                        f"`num_return_sequences` different than 1 (got {self.num_return_sequences})."
                     )
-            elif self.num_return_sequences > self.num_beams:
+            elif self.num_beams is not None and self.num_return_sequences > self.num_beams:
                 raise ValueError(
                     f"`num_return_sequences` ({self.num_return_sequences}) has to be smaller or equal to `num_beams` "
                     f"({self.num_beams})."
                 )

         # 2.5. check cache-related arguments
-        if self.use_cache is False:
+        if not self.use_cache:
             # In this case, all cache-related arguments should be unset. However, since `use_cache=False` is often used
             # passed to `generate` directly to hot-fix cache issues, let's raise a warning instead of an error
             # (otherwise a user might need to overwrite several parameters).
             no_cache_warning = (
-                "You have set `use_cache` to `False`, but {cache_arg} is set to {cache_arg_value}. {cache_arg} will "
-                "have no effect."
+                "You have not set `use_cache` to `True`, but {cache_arg} is set to {cache_arg_value}."
+                "{cache_arg} will have no effect."
             )
             for arg_name in ("cache_implementation", "cache_config"):
                 if getattr(self, arg_name) is not None:
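The same pattern applies to the beam, `num_return_sequences` and cache checks: `num_beams=None` counts as single-beam, returning more than one sequence still requires sampling or beams, and the cache warning now also fires when `use_cache` is merely left unset. A few hedged examples of inputs this validation reacts to:

```python
from transformers import GenerationConfig

# Beam-only flag without beam search (num_beams unset counts as a single beam): warned.
GenerationConfig(length_penalty=2.0)

# More than one returned sequence without sampling or beams: raises per the hunk above.
try:
    GenerationConfig(num_return_sequences=3)
except ValueError as err:
    print(err)

# Cache arguments while caching is disabled: warned, the arguments have no effect.
GenerationConfig(use_cache=False, cache_implementation="static")
```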
@@ -658,9 +691,9 @@ class GenerationConfig(PushToHubMixin):
                     )

         # 2.6. other incorrect combinations
-        if self.return_dict_in_generate is not True:
+        if not self.return_dict_in_generate:
             for extra_output_flag in self.extra_output_flags:
-                if getattr(self, extra_output_flag) is True:
+                if getattr(self, extra_output_flag):
                     minor_issues[extra_output_flag] = (
                         f"`return_dict_in_generate` is NOT set to `True`, but `{extra_output_flag}` is. When "
                         f"`return_dict_in_generate` is not `True`, `{extra_output_flag}` is ignored."
@@ -676,7 +709,6 @@ class GenerationConfig(PushToHubMixin):
             "streamer",
             "negative_prompt_ids",
             "negative_prompt_attention_mask",
-            "use_model_defaults",
         )
         for arg in generate_arguments:
             if hasattr(self, arg):
@@ -1101,7 +1133,7 @@ class GenerationConfig(PushToHubMixin):
             writer.write(self.to_json_string(use_diff=use_diff, keys_to_pop=keys_to_pop))

     @classmethod
-    def from_model_config(cls, model_config: PreTrainedConfig) -> "GenerationConfig":
+    def from_model_config(cls, model_config: Union["PreTrainedConfig", dict]) -> "GenerationConfig":
         """
         Instantiates a [`GenerationConfig`] from a [`PreTrainedConfig`]. This function is useful to convert legacy
         [`PreTrainedConfig`] objects, which may contain generation parameters, into a stand-alone [`GenerationConfig`].
@@ -1118,23 +1150,28 @@ class GenerationConfig(PushToHubMixin):

         # Removes all `None` from the model config dict -- this lets the generation config defaults to take hold
         config_dict = {key: value for key, value in config_dict.items() if value is not None}
-
         generation_config = cls.from_dict(config_dict, return_unused_kwargs=False, _from_model_config=True)

         # Special case: some models have generation attributes set in the decoder. Use them if still unset in the
         # generation config (which in turn is defined from the outer attributes of model config).
-        if
-
-
-
-
-
-
-
-
+        if isinstance(model_config, dict):
+            decoder_possible_text_config_names = ("decoder", "generator", "text_config")
+            for text_config_name in decoder_possible_text_config_names:
+                if text_config := model_config.get(text_config_name):
+                    model_config = text_config
+                    break
+        else:
+            model_config = model_config.get_text_config(decoder=True)
+            model_config = model_config.to_dict()
+
+        default_generation_config = GenerationConfig()
+        for attr in generation_config.to_dict():
+            is_unset = getattr(generation_config, attr) == getattr(default_generation_config, attr)
+            if attr in model_config and is_unset:
+                setattr(generation_config, attr, model_config[attr])

         # If any `output_...` flag is set to `True`, we ensure `return_dict_in_generate` is set to `True`.
-        if generation_config.return_dict_in_generate is False:
+        if not generation_config.return_dict_in_generate:
             if any(
                 getattr(generation_config, extra_output_flag, False)
                 for extra_output_flag in generation_config.extra_output_flags
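`from_model_config` now also accepts a plain config dict and digs into a nested text config (`decoder`, `generator` or `text_config`) before copying still-unset generation attributes. The long-standing object path is unchanged; a hedged example:

```python
from transformers import AutoConfig, GenerationConfig

model_config = AutoConfig.from_pretrained("gpt2")
gen_config = GenerationConfig.from_model_config(model_config)
print(gen_config.bos_token_id, gen_config.eos_token_id)  # 50256 50256 for gpt2

# Per the hunk above, rc2 additionally accepts the dict form, e.g. model_config.to_dict().
```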
@@ -1145,12 +1182,14 @@ class GenerationConfig(PushToHubMixin):
         generation_config._original_object_hash = hash(generation_config)
         return generation_config

-    def update(self, **kwargs):
+    def update(self, defaults_only=False, **kwargs):
         """
         Updates attributes of this class instance with attributes from `kwargs` if they match existing attributes,
         returning all the unused kwargs.

         Args:
+            defaults_only (`bool`, *optional*, defaults to `False`):
+                Whether to update all keys in config with `kwargs` or only those that are set to `None` (i.e. default value).
             kwargs (`dict[str, Any]`):
                 Dictionary of attributes to tentatively update this class.

@@ -1160,8 +1199,9 @@ class GenerationConfig(PushToHubMixin):
         to_remove = []
         for key, value in kwargs.items():
             if hasattr(self, key):
-                setattr(self, key, value)
-                to_remove.append(key)
+                if not defaults_only or getattr(self, key) is None:
+                    setattr(self, key, value)
+                    to_remove.append(key)

         # Confirm that the updated instance is still valid
         self.validate()
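Finally, the new `defaults_only` switch makes `update()` a backfilling tool: when it is `True`, only attributes that are still `None` are overwritten, and everything skipped (or unknown) comes back in the returned dict of unused kwargs, assuming the unchanged tail of `update()` still returns the kwargs it did not consume. A hedged sketch against the rc2 body above:

```python
from transformers import GenerationConfig

cfg = GenerationConfig(do_sample=True, temperature=0.2)

unused = cfg.update(defaults_only=True, temperature=1.0, top_p=0.9, not_a_real_flag=1)
print(cfg.temperature)  # 0.2 -- already set, left untouched
print(cfg.top_p)        # 0.9 -- was None, so it is filled in
print(unused)           # {'temperature': 1.0, 'not_a_real_flag': 1}
```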