PyPI - transformers - Versions diffs - 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl - Mend

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1021) hide show

transformers/generation/continuous_batching/scheduler.py CHANGED Viewed

@@ -55,7 +55,7 @@ class Scheduler(ABC):
         self.waiting_requests_order.append(state.request_id)
     @abstractmethod
-    def schedule_batch(self, token_budget: int, cache_budget: int) -> list[RequestState]:
+    def schedule_batch(self, token_budget: int, cache_budget: int) -> list[RequestState] | None:
         """Schedules requests for the next batch based on available token and cache budgets. This method selects which
         requests should be processed in the current batch, considering the budgets and the scheduler's prioritization
         rules. The token_budget is the maximum number of tokens that can be processed in a batch, and the cache_budget
@@ -64,7 +64,7 @@ class Scheduler(ABC):
     @traced
     def has_pending_requests(self) -> bool:
         """Checks if there are requests ready to be processed."""
-        return len(self.active_requests) or len(self.waiting_requests)
+        return bool(len(self.active_requests) or len(self.waiting_requests))
     @traced
     def finish_request(self, request_id: str, evict_from_cache: bool = True) -> None:
@@ -160,9 +160,11 @@ class Scheduler(ABC):
         request_ids_to_remove_from_waiting: set[str],
     ) -> None:
         """Schedules a request for the current batch, updating the request's status according to the token budget left.
+        After a request is scheduled, it is part of the next batch unless there is an error.
         If the request has children (for parallel decoding), it ensures at least one token remains before the request is
         forked."""
         # If the request has one or more children we make sure not to prefill it entirely
+        # This does not check the request state, but DECODING request already have children set to 0.
         if state.num_children > 0 and token_budget >= len(request_tokens) - 1:
             token_budget = len(request_tokens) - 1
             self._requests_to_fork.append(state)
@@ -189,48 +191,27 @@ class Scheduler(ABC):
             state.remaining_prefill_tokens = request_tokens[token_budget:]
             state.tokens_to_process = request_tokens[:token_budget]
+    def _process_candidates(
+        self,
+        candidates: list[RequestState],
+        token_budget: int,
+        cache_budget: int,
+        request_ids_to_remove_from_waiting: set[str],
+        safety_margin: float = 0.0,
+    ) -> tuple[list[RequestState], bool]:
+        """Schedules candidate requests for the current batch.
-# TODO: further common-ize the two classes
-@attach_tracer()
-class FIFOScheduler(Scheduler):
-    """This scheduler processes requests in the order they arrive, meaning decoding requests has priority over
-    prefilling requests. Additionally, it includes a safety margin mechanism to prevent cache exhaustion. By default,
-    when 80% of the cache is full, new requests will not be scheduled to prioritize decoding active requests."""
-    def __init__(self, cache: PagedAttentionCache, retain_cache_on_finish: bool = False, safety_margin: float = 0.2):
-        """Initializes the FIFO scheduler. The safety margin is the percentage of free blocks under which we stop
-        scheduling new prefill requests, so safety_margin = 0.1 means that when there is less than 10% of free blocks,
-        or equivalently when more than 90% of blocks are already allocated, we stop scheduling new prefill requests.
+        This method contains the common logic shared by all schedulers: it checks token and cache budgets, allocates
+        cache blocks if needed, updates request states, and tracks which waiting requests should be removed from the
+        waiting queue.
         """
-        super().__init__(cache, retain_cache_on_finish)
-        self.safety_margin = safety_margin
-    @traced
-    def schedule_batch(self, token_budget: int, cache_budget: int) -> list[RequestState] | None:
-        priority_states: list[RequestState] = []
-        second_priority_states: list[RequestState] = []
         scheduled_requests = []
-        for state in self.active_requests.values():
-            if state.status == RequestStatus.DECODING:
-                priority_states.append(state)
-            if state.status in [RequestStatus.SPLIT_PENDING_REMAINDER, RequestStatus.PREFILLING_SPLIT]:
-                second_priority_states.append(state)
-        # Add waiting requests to second priority
-        if not self.block_new_requests:
-            for req_id in self.waiting_requests_order:
-                second_priority_states.append(self.waiting_requests[req_id])
-        candidates = priority_states + second_priority_states
-        request_ids_to_remove_from_waiting = set()
-        safety_margins = self.safety_margin * self.cache.num_blocks
         one_allocation_failed = False
+        safety_margins = safety_margin * self.cache.num_blocks
         for state in candidates:
-            # If we are out the safety margin, we only accept decoding requests or the first prefill request
             num_free_blocks = self.cache.get_num_free_blocks()
+            # If we are out the safety margin, we only accept decoding requests or the first prefill request
             outside_safety_margin = num_free_blocks < safety_margins
             if outside_safety_margin and scheduled_requests and state.status != RequestStatus.DECODING:
                 logger.info(
@@ -256,8 +237,8 @@ class FIFOScheduler(Scheduler):
             # If the allocation would not be successful, we move on to the next request
             if not allocation_successful:
                 one_allocation_failed = True
-                # If we have reached a request that was waiting, all subsequent requests are also waiting, and will need
-                # allocation as well. So if there is no more free blocks, we can safely break out of the loop.
+                # If we reached a waiting request and the cache is full, all subsequent waiting requests will need
+                # allocation as well, so we can safely break out of the scheduling loop.
                 if num_free_blocks == 0 and state.request_id in self.waiting_requests:
                     logger.info(f"Breaking mid-loop for request {state.request_id} because the cache is full")
                     break
@@ -289,11 +270,59 @@ class FIFOScheduler(Scheduler):
             if token_budget == 0 or cache_budget == 0:
                 break
-        # We remove waiting requests before checking requests were scheduled, because there might have been prefill matches
+        return scheduled_requests, one_allocation_failed
+    def _cleanup_waiting_queue(self, request_ids_to_remove_from_waiting: set[str]) -> None:
+        """Removes processed requests from the waiting queue order."""
         self.waiting_requests_order = deque(
             [req_id for req_id in self.waiting_requests_order if req_id not in request_ids_to_remove_from_waiting]
         )
+# TODO: further common-ize the two classes
+@attach_tracer()
+class FIFOScheduler(Scheduler):
+    """This scheduler processes requests in the order they arrive, meaning decoding requests has priority over
+    prefilling requests. Additionally, it includes a safety margin mechanism to prevent cache exhaustion. By default,
+    when 80% of the cache is full, new requests will not be scheduled to prioritize decoding active requests."""
+    def __init__(self, cache: PagedAttentionCache, retain_cache_on_finish: bool = False, safety_margin: float = 0.2):
+        """Initializes the FIFO scheduler. The safety margin is the percentage of free blocks under which we stop
+        scheduling new prefill requests, so safety_margin = 0.1 means that when there is less than 10% of free blocks,
+        or equivalently when more than 90% of blocks are already allocated, we stop scheduling new prefill requests.
+        """
+        super().__init__(cache, retain_cache_on_finish)
+        self.safety_margin = safety_margin
+    @traced
+    def schedule_batch(self, token_budget: int, cache_budget: int) -> list[RequestState] | None:
+        priority_states: list[RequestState] = []
+        second_priority_states: list[RequestState] = []
+        for state in self.active_requests.values():
+            if state.status == RequestStatus.DECODING:
+                priority_states.append(state)
+            if state.status in [RequestStatus.SPLIT_PENDING_REMAINDER, RequestStatus.PREFILLING_SPLIT]:
+                second_priority_states.append(state)
+        # Add waiting requests to second priority
+        if not self.block_new_requests:
+            for req_id in self.waiting_requests_order:
+                second_priority_states.append(self.waiting_requests[req_id])
+        candidates = priority_states + second_priority_states
+        request_ids_to_remove_from_waiting = set()
+        scheduled_requests, one_allocation_failed = self._process_candidates(
+            candidates,
+            token_budget,
+            cache_budget,
+            request_ids_to_remove_from_waiting,
+            safety_margin=self.safety_margin,
+        )
+        # We remove waiting requests before checking requests were scheduled, because there might have been prefill matches
+        self._cleanup_waiting_queue(request_ids_to_remove_from_waiting)
         # If no requests were scheduled and the cache is full, we signal it by returning None
         if not scheduled_requests and one_allocation_failed:
             return None
@@ -313,7 +342,6 @@ class PrefillFirstScheduler(Scheduler):
     def schedule_batch(self, token_budget: int, cache_budget: int) -> list[RequestState] | None:
         priority_states: list[RequestState] = []
         second_priority_states: list[RequestState] = []
-        scheduled_requests = []
         for state in self.active_requests.values():
             # XXX: when cache is full, state can stay on `PREFILLING_SPLIT` so we need to take those into account
@@ -329,62 +357,16 @@ class PrefillFirstScheduler(Scheduler):
         candidates = priority_states + second_priority_states
         request_ids_to_remove_from_waiting = set()
-        one_allocation_failed = False
-        for state in candidates:
-            # Check cache budget
-            cache_needed = state.current_len()
-            cache_needed = (
-                cache_needed if self.cache_budget_module is None else cache_needed % self.cache_budget_module
-            )
-            if cache_budget < cache_needed:
-                continue
-            # Infer the tokens that will be present in the batch if token budget is enough
-            request_tokens = self._infer_request_tokens(state, request_ids_to_remove_from_waiting)
-            # Account for token budget
-            request_len = min(len(request_tokens), token_budget)
-            # Check there will be enough cache for the new tokens
-            allocation_successful = self._allocate_blocks_if_needed(state, request_len)
-            # If the allocation would not be successful, we move on to the next request
-            if not allocation_successful:
-                one_allocation_failed = True
-                # If the request was waiting, all requests afterwards will need allocation, so we break if the cache is full
-                if state.request_id in self.waiting_requests and self.cache.get_num_free_blocks() == 0:
-                    break
-                continue
-            # If this point is reached, it means we can safely schedule the request
-            self._schedule_request(state, request_tokens, token_budget, request_ids_to_remove_from_waiting)
-            request_len = len(state.tokens_to_process)  # it may change after scheduling
-            scheduled_requests.append(state)
-            # Update the token and cache budgets
-            token_budget -= request_len
-            cache_budget -= cache_needed
-            # If using prefix sharing, we make note of the blocks that will be computed in the forward pass
-            if self.cache.allow_block_sharing:
-                tokens_in_current_block = state.current_len() % self.cache.block_size
-                tokens_after_forward = tokens_in_current_block + request_len
-                complete_blocks = tokens_after_forward // self.cache.block_size
-                self.cache.blocks_to_complete[state.request_id] = complete_blocks
-            # Remove the request from the waiting queue and mark it as removed
-            req_id = state.request_id
-            was_waiting = self.waiting_requests.pop(req_id, None) is not None
-            if was_waiting:
-                request_ids_to_remove_from_waiting.add(req_id)
-            # Early exit of the loop if we have no budget left
-            if token_budget == 0 or cache_budget == 0:
-                break
+        scheduled_requests, one_allocation_failed = self._process_candidates(
+            candidates,
+            token_budget,
+            cache_budget,
+            request_ids_to_remove_from_waiting,
+            safety_margin=0.0,
+        )
         # We remove waiting requests before checking requests were scheduled, because there might have been prefill matches
-        self.waiting_requests_order = deque(
-            [req_id for req_id in self.waiting_requests_order if req_id not in request_ids_to_remove_from_waiting]
-        )
+        self._cleanup_waiting_queue(request_ids_to_remove_from_waiting)
         # If no requests were scheduled and the cache is full, we signal it by returning None
         if not scheduled_requests and one_allocation_failed:

transformers/generation/logits_process.py CHANGED Viewed

@@ -20,7 +20,6 @@ from typing import TYPE_CHECKING
 import numpy as np
 import torch
-from ..pytorch_utils import isin_mps_friendly
 from ..utils import add_start_docstrings
 from ..utils.logging import get_logger
@@ -93,6 +92,12 @@ class LogitsProcessorList(list):
         return scores
+    def set_continuous_batching_context(self, logits_indices: torch.Tensor, cu_seq_lens_q: torch.Tensor) -> None:
+        """Forwards the continuous batching metadata to all logit processors that need it."""
+        for processor in self:
+            if hasattr(processor, "set_continuous_batching_context"):
+                processor.set_continuous_batching_context(logits_indices, cu_seq_lens_q)
 class MinLengthLogitsProcessor(LogitsProcessor):
     r"""
@@ -148,7 +153,7 @@ class MinLengthLogitsProcessor(LogitsProcessor):
     @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
         vocab_tensor = torch.arange(scores.shape[-1], device=scores.device)
-        eos_token_mask = isin_mps_friendly(vocab_tensor, self.eos_token_id)
+        eos_token_mask = torch.isin(vocab_tensor, self.eos_token_id)
         scores_processed = scores.clone()
         if input_ids.shape[-1] < self.min_length:
             scores_processed = torch.where(eos_token_mask, -math.inf, scores)
@@ -220,7 +225,7 @@ class MinNewTokensLengthLogitsProcessor(LogitsProcessor):
         new_tokens_length = input_ids.shape[-1] - self.prompt_length_to_skip
         scores_processed = scores.clone()
         vocab_tensor = torch.arange(scores.shape[-1], device=scores.device)
-        eos_token_mask = isin_mps_friendly(vocab_tensor, self.eos_token_id)
+        eos_token_mask = torch.isin(vocab_tensor, self.eos_token_id)
         if new_tokens_length < self.min_new_tokens:
             scores_processed = torch.where(eos_token_mask, -math.inf, scores)
@@ -1847,7 +1852,7 @@ class SuppressTokensAtBeginLogitsProcessor(LogitsProcessor):
     @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
         vocab_tensor = torch.arange(scores.shape[-1], device=scores.device)
-        suppress_token_mask = isin_mps_friendly(vocab_tensor, self.begin_suppress_tokens)
+        suppress_token_mask = torch.isin(vocab_tensor, self.begin_suppress_tokens)
         scores_processed = scores
         if input_ids.shape[-1] == self.begin_index:
             scores_processed = torch.where(suppress_token_mask, -float("inf"), scores)
@@ -1890,7 +1895,7 @@ class SuppressTokensLogitsProcessor(LogitsProcessor):
     @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
         vocab_tensor = torch.arange(scores.shape[-1], device=scores.device)
-        suppress_token_mask = isin_mps_friendly(vocab_tensor, self.suppress_tokens.to(scores.device))
+        suppress_token_mask = torch.isin(vocab_tensor, self.suppress_tokens.to(scores.device))
         scores = torch.where(suppress_token_mask, -float("inf"), scores)
         return scores

transformers/generation/stopping_criteria.py CHANGED Viewed

@@ -8,7 +8,6 @@ import numpy as np
 import torch
 from torch.nn import functional as F
-from ..pytorch_utils import isin_mps_friendly
 from ..tokenization_utils_base import PreTrainedTokenizerBase
 from ..utils import add_start_docstrings, logging
@@ -468,7 +467,7 @@ class EosTokenCriteria(StoppingCriteria):
     @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING)
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> torch.BoolTensor:
         self.eos_token_id = self.eos_token_id.to(input_ids.device)
-        is_done = isin_mps_friendly(input_ids[:, -1], self.eos_token_id)
+        is_done = torch.isin(input_ids[:, -1], self.eos_token_id)
         return is_done

transformers/generation/utils.py CHANGED Viewed

@@ -42,17 +42,15 @@ from ..dynamic_module_utils import (
 from ..integrations.deepspeed import is_deepspeed_zero3_enabled
 from ..integrations.fsdp import is_fsdp_managed_module
 from ..masking_utils import create_masks_for_generate
-from ..pytorch_utils import isin_mps_friendly
 from ..tokenization_python import ExtensionsTrie
 from ..utils import (
     ModelOutput,
     TransformersKwargs,
     is_accelerate_available,
-    is_hqq_available,
-    is_optimum_quanto_available,
     is_torchdynamo_exporting,
     logging,
 )
+from ..utils.generic import is_flash_attention_requested
 from .candidate_generator import (
     AssistantVocabTranslatorCache,
     AssistedCandidateGenerator,
@@ -861,11 +859,9 @@ class GenerationMixin(ContinuousMixin):
         if not is_input_ids:
             return default_attention_mask
-        is_pad_token_in_inputs = (pad_token_id is not None) and (
-            isin_mps_friendly(elements=inputs_tensor, test_elements=pad_token_id).any()
-        )
+        is_pad_token_in_inputs = (pad_token_id is not None) and (torch.isin(inputs_tensor, pad_token_id).any())
         is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ~(
-            isin_mps_friendly(elements=eos_token_id, test_elements=pad_token_id).any()
+            torch.isin(eos_token_id, pad_token_id).any()
         )
         can_infer_attention_mask = is_pad_token_in_inputs * is_pad_token_not_equal_to_eos_token_id
         attention_mask_from_padding = inputs_tensor.ne(pad_token_id).long()
@@ -1772,9 +1768,9 @@ class GenerationMixin(ContinuousMixin):
         """
         # parameterization priority:
         # user-defined kwargs or `generation_config` > `self.generation_config` > global default values
-        # TODO: (raushan) doesn't make sense to allow kwargs and `generation_config`. Should be mutually exclusive!
         # TODO (joao): per-model generation config classes.
+        generation_config_provided = generation_config is not None
         if generation_config is None:
             # Users may modify `model.config` to control generation. This is a legacy behavior and is not supported anymore
             if len(self.config._get_generation_parameters()) > 0:
@@ -1810,6 +1806,16 @@ class GenerationMixin(ContinuousMixin):
         if generation_config.cache_implementation == "hybrid":
             generation_config.cache_implementation = None
+        # It doesn't make sense to allow kwargs and `generation_config`, that should be mutually exclusive
+        if generation_config_provided and set(kwargs.keys()) - set(model_kwargs.keys()):
+            generation_kwargs = set(kwargs.keys()) - set(model_kwargs.keys())
+            logger.warning_once(
+                f"Passing `generation_config` together with generation-related "
+                f"arguments=({generation_kwargs}) is deprecated and will be removed in future versions. "
+                "Please pass either a `generation_config` object OR all generation "
+                "parameters explicitly, but not both.",
+            )
         # Finally keep output_xxx args in `model_kwargs` so it can be passed to `forward`
         output_attentions = generation_config.output_attentions
         output_hidden_states = generation_config.output_hidden_states
@@ -1847,20 +1853,19 @@ class GenerationMixin(ContinuousMixin):
         model_kwargs["cache_position"] = cache_position
         return model_kwargs
-    def _get_cache(self, cache_implementation: str, batch_size: int, max_cache_len: int, model_kwargs) -> Cache:
+    def _prepare_static_cache(
+        self, cache_implementation: str, batch_size: int, max_cache_len: int, model_kwargs
+    ) -> Cache:
         """
         Sets a cache for `generate`, that will persist across calls. A new cache will only be initialized a
         new `generate` call requires a larger cache or uses a different batch size.
         Returns the resulting cache object.
         """
-        requires_cross_attention_cache = (
-            self.config.is_encoder_decoder or model_kwargs.get("encoder_outputs") is not None
-        )
         offload_cache = "offloaded" in cache_implementation
         if hasattr(self, "_cache"):
-            cache_to_check = self._cache.self_attention_cache if requires_cross_attention_cache else self._cache
+            cache_to_check = self._cache.self_attention_cache if self.config.is_encoder_decoder else self._cache
         need_new_cache = (
             not hasattr(self, "_cache")
@@ -1869,7 +1874,7 @@ class GenerationMixin(ContinuousMixin):
             or cache_to_check.max_cache_len < max_cache_len
         )
-        if requires_cross_attention_cache and hasattr(self, "_cache"):
+        if self.config.is_encoder_decoder and hasattr(self, "_cache"):
             need_new_cache = (
                 need_new_cache
                 or self._cache.cross_attention_cache.max_cache_len != model_kwargs["encoder_outputs"][0].shape[1]
@@ -1882,7 +1887,7 @@ class GenerationMixin(ContinuousMixin):
                 "offloading": offload_cache,
             }
             self._cache = StaticCache(**self_attention_cache_kwargs)
-            if requires_cross_attention_cache:
+            if self.config.is_encoder_decoder:
                 cross_attention_cache_kwargs = {
                     "config": self.config.get_text_config(decoder=True),
                     "max_cache_len": model_kwargs["encoder_outputs"][0].shape[1],
@@ -1925,12 +1930,9 @@ class GenerationMixin(ContinuousMixin):
         instantiated, writes it to `model_kwargs`, under the name expected by the model.
         """
-        is_hybrid_cache = any(class_name in self.__class__.__name__.lower() for class_name in ["mamba", "falconh1"])
-        cache_name = "past_key_values" if not is_hybrid_cache else "cache_params"
-        requires_cross_attention_cache = (
-            self.config.is_encoder_decoder or model_kwargs.get("encoder_outputs") is not None
-        )
+        # TODO @raushan, unify cache arg naming for all models
+        is_linear_attn_cache = "mamba" in self.__class__.__name__.lower()
+        cache_name = "past_key_values" if not is_linear_attn_cache else "cache_params"
         # Quick escape route 1: if the user specifies a cache, we only need to check for conflicting `generate` arguments
         user_defined_cache = model_kwargs.get(cache_name)
@@ -1962,76 +1964,55 @@ class GenerationMixin(ContinuousMixin):
         # Otherwise we NEED to prepare a cache, based on `generation_config.cache_implementation`
-        # TODO(joao): support static caches in assisted generation. assisted generation needs to roll back caches,
-        # which is only supported in dynamic caches atm
-        if (
-            generation_mode == GenerationMode.ASSISTED_GENERATION
-            and generation_config.cache_implementation is not None
-        ):
-            logger.warning_once(
-                "An assistant model is provided, using a dynamic cache instead of a cache of type="
-                f"'{generation_config.cache_implementation}'."
-            )
-            generation_config.cache_implementation = None
         # Assisted decoding and contrastive search require cache rollback, which is incompatible with sliding layers.
         # To handle this, we skip passing the model config to DynamicCache (forcing a full-layer cache).
         # The "dynamic_full" option is a shortcut for generate() users to avoid sliding layers on their own.
-        if (
-            generation_mode in (GenerationMode.ASSISTED_GENERATION, GenerationMode.CONTRASTIVE_SEARCH)
-            or generation_config.cache_implementation == "dynamic_full"
-        ):
-            dynamic_cache_kwargs = {}
-        else:
-            dynamic_cache_kwargs = {"config": self.config.get_text_config(decoder=True)}
-        if generation_config.cache_implementation is not None:
-            if generation_config.cache_implementation in ALL_STATIC_CACHE_IMPLEMENTATIONS:
-                if generation_config.cache_implementation in DEPRECATED_STATIC_CACHE_IMPLEMENTATIONS:
-                    logger.warning_once(
-                        f"Using `cache_implementation='{generation_config.cache_implementation}' is deprecated. "
-                        f"Please only use one of {STATIC_CACHE_IMPLEMENTATIONS}, and the layer structure will be "
-                        "inferred automatically."
-                    )
-                model_kwargs[cache_name] = self._get_cache(
-                    cache_implementation=generation_config.cache_implementation,
-                    batch_size=max(generation_config.num_beams, generation_config.num_return_sequences) * batch_size,
-                    max_cache_len=max_cache_length,
-                    model_kwargs=model_kwargs,
+        if generation_mode in (GenerationMode.ASSISTED_GENERATION, GenerationMode.CONTRASTIVE_SEARCH):
+            if generation_config.cache_implementation is not None:
+                logger.warning_once(
+                    "An assistant model is provided, using a dynamic cache instead of a cache of type="
+                    f"'{generation_config.cache_implementation}'."
                 )
-            elif generation_config.cache_implementation == "quantized":
-                if self.config.is_encoder_decoder or not self._supports_default_dynamic_cache():
-                    raise ValueError(
-                        "This model does not support the quantized cache. If you want your model to support quantized "
-                        "cache, please open an issue and tag @zucchini-nlp."
-                    )
+            generation_config.cache_implementation = "dynamic_full"
+        dynamic_cache_kwargs = {}
+        if generation_config.cache_implementation != "dynamic_full":
+            dynamic_cache_kwargs["config"] = self.config.get_text_config(decoder=True)
+        if generation_config.cache_implementation == "offloaded":
+            dynamic_cache_kwargs["offloading"] = True
+        if generation_config.cache_implementation in ALL_STATIC_CACHE_IMPLEMENTATIONS:
+            if generation_config.cache_implementation in DEPRECATED_STATIC_CACHE_IMPLEMENTATIONS:
+                logger.warning_once(
+                    f"Using `cache_implementation='{generation_config.cache_implementation}' is deprecated "
+                    f"and will be removed in v5.13. Please only use one of {STATIC_CACHE_IMPLEMENTATIONS}, "
+                    "and the layer structure will be inferred automatically."
+                )
+            model_kwargs["past_key_values"] = self._prepare_static_cache(
+                cache_implementation=generation_config.cache_implementation,
+                batch_size=max(generation_config.num_beams, generation_config.num_return_sequences) * batch_size,
+                max_cache_len=max_cache_length,
+                model_kwargs=model_kwargs,
+            )
+        elif generation_config.cache_implementation == "quantized":
+            if self.config.is_encoder_decoder or not self._supports_default_dynamic_cache():
+                raise ValueError(
+                    "This model does not support the quantized cache. If you want your model to support quantized "
+                    "cache, please open an issue and tag @zucchini-nlp."
+                )
+            cache_config = generation_config.cache_config if generation_config.cache_config is not None else {}
+            cache_config.setdefault("config", self.config.get_text_config(decoder=True))
+            backend = cache_config.pop("backend", "quanto")
+            model_kwargs["past_key_values"] = QuantizedCache(backend=backend, **cache_config)
+        # i.e. `cache_implementation` in [None, "dynamic", "offloaded", "dynamic_full"]
+        # TODO: prepare linear cache from a single API, instead of creating in modeling code
+        else:
+            model_kwargs["past_key_values"] = DynamicCache(**dynamic_cache_kwargs)
-                cache_config = generation_config.cache_config if generation_config.cache_config is not None else {}
-                # Add the config if it was not provided, as it's a required argument
-                if "config" not in cache_config:
-                    cache_config["config"] = self.config.get_text_config()
-                # Pop the backend from the config (defaults to quanto if not defined)
-                backend = cache_config.pop("backend", "quanto")
-                if backend == "quanto" and not is_optimum_quanto_available():
-                    raise ImportError(
-                        "You need to install optimum-quanto in order to use KV cache quantization with optimum-quanto "
-                        "backend. Please install it via  with `pip install optimum-quanto`"
-                    )
-                elif backend == "HQQ" and not is_hqq_available():
-                    raise ImportError(
-                        "You need to install `HQQ` in order to use KV cache quantization with HQQ backend. "
-                        "Please install it via  with `pip install hqq`"
-                    )
-                model_kwargs[cache_name] = QuantizedCache(backend=backend, **cache_config)
-            elif generation_config.cache_implementation == "offloaded":
-                model_kwargs[cache_name] = DynamicCache(**dynamic_cache_kwargs, offloading=True)
-            elif "dynamic" in generation_config.cache_implementation:
-                model_kwargs[cache_name] = DynamicCache(**dynamic_cache_kwargs)
-        # TODO (joao): this logic is incomplete, e.g. `offloaded` should apply to both caches. Refactor this function
-        # to correctly pass parameterization to both caches.
         if (
-            requires_cross_attention_cache
+            self.config.is_encoder_decoder
             and "past_key_values" in model_kwargs
             and not isinstance(model_kwargs["past_key_values"], EncoderDecoderCache)
         ):
@@ -2102,10 +2083,7 @@ class GenerationMixin(ContinuousMixin):
             raise ValueError(
                 "`decoder_start_token_id` or `bos_token_id` has to be defined for encoder-decoder generation."
             )
-        if (
-            eos_token_tensor is not None
-            and isin_mps_friendly(elements=eos_token_tensor, test_elements=pad_token_tensor).any()
-        ):
+        if eos_token_tensor is not None and torch.isin(eos_token_tensor, pad_token_tensor).any():
             if kwargs_has_attention_mask is not None and not kwargs_has_attention_mask:
                 logger.warning_once(
                     "The attention mask is not set and cannot be inferred from input because pad token is same as "
@@ -2172,13 +2150,13 @@ class GenerationMixin(ContinuousMixin):
             # Finally: if we can compile, disable tokenizers parallelism
             os.environ["TOKENIZERS_PARALLELISM"] = "0"
-            # If we use FA2 and a static cache, we cannot compile with fullgraph
-            if self.config._attn_implementation == "flash_attention_2":
+            # If we use FA and a static cache, we cannot compile with fullgraph
+            if is_flash_attention_requested(self.config):
                 # only raise warning if the user passed an explicit compile-config
                 if generation_config.compile_config is not None and generation_config.compile_config.fullgraph:
                     logger.warning_once(
-                        "When using Flash Attention 2 and a static cache, you cannot use the option `CompileConfig(fullgraph=True)` as "
-                        "FA2 introduces graph breaks. We overrode the option with `fullgraph=False`."
+                        "When using Flash Attention and a static cache, you cannot use the option `CompileConfig(fullgraph=True)` as "
+                        "FA introduces graph breaks. We overrode the option with `fullgraph=False`."
                     )
                     generation_config.compile_config.fullgraph = False
@@ -2187,7 +2165,9 @@ class GenerationMixin(ContinuousMixin):
     @contextmanager
     def _optimize_model_for_decode(self):
         original_experts_implementation = self.config._experts_implementation
-        if original_experts_implementation == "grouped_mm":
+        # On non-CPU devices, 'batched_mm' can trade off a bit of memory (by duplicating selected experts weights)
+        # for much better speed during decoding, especially for smaller inputs. On CPU, grouped_mm is usually better.
+        if original_experts_implementation == "grouped_mm" and self.device.type != "cpu":
             logger.info_once(
                 "We will be switching to 'batched_mm' for the decoding stage as it is much more performant than 'grouped_mm' on smaller inputs. "
                 "If you experience any issues with this, please open an issue on the Hugging Face Transformers GitHub repository.",
@@ -2197,7 +2177,7 @@ class GenerationMixin(ContinuousMixin):
         try:
             yield
         finally:
-            if original_experts_implementation == "grouped_mm":
+            if original_experts_implementation == "grouped_mm" and self.device.type != "cpu":
                 self.set_experts_implementation(original_experts_implementation)
     def _get_deprecated_gen_repo(

transformers/image_processing_utils.py CHANGED Viewed

@@ -38,9 +38,6 @@ INIT_SERVICE_KWARGS = [
 class BaseImageProcessor(ImageProcessingMixin):
     valid_kwargs = ImagesKwargs
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
     @property
     def is_fast(self) -> bool:
         """

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl