PyPI - transformers - Versions diffs - 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl - Mend

transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (671) hide show

transformers/generation/candidate_generator.py CHANGED Viewed

@@ -41,7 +41,9 @@ if TYPE_CHECKING:
 class CandidateGenerator:
     """Abstract base class for all candidate generators that can be applied during assisted generation."""
-    def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, torch.FloatTensor | None]:
+    def get_candidates(
+        self, input_ids: torch.LongTensor, is_first_iteration: bool
+    ) -> tuple[torch.LongTensor, torch.FloatTensor]:
         """
         Fetches the candidates to be tried for the current input.
@@ -117,11 +119,16 @@ class AssistedCandidateGenerator(CandidateGenerator):
         # Prepare the assistant and the starting number of candidate tokens
         self.assistant_model = assistant_model
-        self.num_assistant_tokens = assistant_model.generation_config.num_assistant_tokens
-        self.assistant_confidence_threshold = assistant_model.generation_config.assistant_confidence_threshold
+        # Prepare the generation config by updating with default values if not already set by users
+        self.assistant_generation_config = copy.deepcopy(assistant_model.generation_config)
+        global_defaults = self.assistant_generation_config._get_default_generation_params()
+        self.assistant_generation_config.update(**global_defaults, defaults_only=True)
+        self.num_assistant_tokens = self.assistant_generation_config.num_assistant_tokens
+        self.assistant_confidence_threshold = self.assistant_generation_config.assistant_confidence_threshold
         # Set eos in assistant same as in target model
-        self.assistant_model.generation_config.eos_token_id = generation_config.eos_token_id
+        self.assistant_generation_config.eos_token_id = generation_config.eos_token_id
         # Prepare the kwargs for the assistant model
         assistant_kwargs = {}
@@ -138,10 +145,10 @@ class AssistedCandidateGenerator(CandidateGenerator):
         # If the assistant is an encoder-decoder model, assume the encoder is different on the assistant.
         if assistant_model.config.is_encoder_decoder:
             inputs_tensor, model_input_name, assistant_kwargs = assistant_model._prepare_model_inputs(
-                inputs_tensor, assistant_model.generation_config.bos_token_id, assistant_kwargs
+                inputs_tensor, self.assistant_generation_config.bos_token_id, assistant_kwargs
             )
             assistant_kwargs = assistant_model._prepare_encoder_decoder_kwargs_for_generation(
-                inputs_tensor, assistant_kwargs, model_input_name, assistant_model.generation_config
+                inputs_tensor, assistant_kwargs, model_input_name, self.assistant_generation_config
             )
         elif "encoder_outputs" in model_kwargs:
             assistant_kwargs["encoder_outputs"] = model_kwargs["encoder_outputs"]
@@ -189,13 +196,15 @@ class AssistedCandidateGenerator(CandidateGenerator):
         if (
             is_sklearn_available()
-            and self.assistant_model.generation_config.assistant_confidence_threshold
+            and self.assistant_generation_config.assistant_confidence_threshold
             and type(self) is AssistedCandidateGenerator
         ):
             self.probs = []
             self.matches = []
-    def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, torch.FloatTensor | None]:
+    def get_candidates(
+        self, input_ids: torch.LongTensor, is_first_iteration: bool
+    ) -> tuple[torch.LongTensor, torch.FloatTensor]:
         """
         Fetches the candidates to be tried for the current input.
@@ -216,7 +225,7 @@ class AssistedCandidateGenerator(CandidateGenerator):
         # Update past key values and masks
         self._update_past_and_masks(input_ids)
         # Generate candidates
-        generation_args = self._prepare_generation_args(input_ids, min_new_tokens, max_new_tokens)
+        generation_args = self._prepare_generation_args(input_ids, min_new_tokens, max_new_tokens, is_first_iteration)
         candidate_ids, candidate_logits = self._generate_candidates(generation_args)
         return candidate_ids, candidate_logits
@@ -236,7 +245,7 @@ class AssistedCandidateGenerator(CandidateGenerator):
         # Adjust the max number of assistant tokens to use in the next iteration. This is a simple heuristic,
         # probably can be improved -- we want to balance the benefits of getting assistant tokens correct with the
         # cost of forecasting incorrect assistant tokens.
-        if self.assistant_model.generation_config.num_assistant_tokens_schedule in {
+        if self.assistant_generation_config.num_assistant_tokens_schedule in {
             "heuristic",
             "heuristic_transient",
         }:
@@ -250,7 +259,7 @@ class AssistedCandidateGenerator(CandidateGenerator):
         # This adaptation is not compatible with UAG, as it relies on the number of matched tokens based on the draft vocabulary, which is unavailable in UAG.
         if (
             is_sklearn_available()
-            and self.assistant_model.generation_config.assistant_confidence_threshold
+            and self.assistant_generation_config.assistant_confidence_threshold
             and type(self) is AssistedCandidateGenerator
         ):
             # update self.matches
@@ -276,7 +285,7 @@ class AssistedCandidateGenerator(CandidateGenerator):
                 optimal_threshold_index = np.argmin(costs)
                 best_threshold = thresholds[optimal_threshold_index]
-                self.assistant_model.generation_config.assistant_confidence_threshold = best_threshold
+                self.assistant_generation_config.assistant_confidence_threshold = best_threshold
     def _calculate_new_tokens(self, input_ids: torch.LongTensor) -> tuple[int, int]:
         """Calculate the minimum and maximum number of new tokens to generate."""
@@ -304,23 +313,50 @@ class AssistedCandidateGenerator(CandidateGenerator):
         return has_past_key_values
-    def _prepare_generation_args(self, input_ids: torch.LongTensor, min_new_tokens: int, max_new_tokens: int) -> dict:
+    def _prepare_generation_args(
+        self, input_ids: torch.LongTensor, min_new_tokens: int, max_new_tokens: int, is_first_iteration: bool
+    ) -> dict:
         """Prepare arguments for the generation call."""
-        return {
-            self.input_ids_key: input_ids,
-            "min_new_tokens": min_new_tokens,
-            "max_new_tokens": max_new_tokens,
-            "generation_config": self.generation_config,
-            "logits_processor": self.logits_processor,
-        }
+        # Generate candidates. Run prefill-specific logic in first generation and prepare model kwargs.
+        # Some models prepare inputs differently depending on first vs subsequent iterations.(e.g. VLMs)
+        # Assisted generation however calls internally `self.generate()` many times and technically will
+        # lead to many `is_first_iteration's`. This way we can call prefill only once per assistant model
+        if is_first_iteration:
+            generation_args = self.assistant_model._get_initial_cache_position(
+                input_ids.shape[1], input_ids.device, self.assistant_kwargs.copy()
+            )
+            generation_args = self.assistant_model.prepare_inputs_for_generation(
+                input_ids, is_first_iteration=True, **generation_args
+            )
+            # NOTE: `prepare_inputs_for_generation` creates inputs that can't be used when continuing generation with past-cache
+            # therefore we manually re-assign full input ids and other args. It is a known issue, due to legacy reasons we
+            # have to pass whole input ids to `generate()` including past tokens which are in encoded in cache
+            generation_args[self.input_ids_key] = input_ids
+            for model_input_name in ["position_ids", "token_type_ids", "decoder_position_ids", "cache_position"]:
+                generation_args.pop(model_input_name, None)
+        else:
+            generation_args = {self.input_ids_key: input_ids}
+        generation_args.update(
+            {
+                "min_new_tokens": min_new_tokens,
+                "max_new_tokens": max_new_tokens,
+                "generation_config": self.generation_config,
+                "logits_processor": self.logits_processor,
+            }
+        )
+        generation_args.update(
+            {k: self.assistant_kwargs[k] for k in self.assistant_kwargs if k not in generation_args}
+        )
+        return generation_args
     def _generate_candidates(self, generation_args: dict) -> tuple[torch.LongTensor, torch.FloatTensor | None]:
         """Generate candidate sequences using the assistant model."""
-        assistant_output = self.assistant_model.generate(**generation_args, **self.assistant_kwargs)
+        assistant_output = self.assistant_model.generate(**generation_args)
         self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values
         if (
             is_sklearn_available()
-            and self.assistant_model.generation_config.assistant_confidence_threshold
+            and self.assistant_generation_config.assistant_confidence_threshold
             and type(self) is AssistedCandidateGenerator
         ):
             scores_tensor = torch.cat(assistant_output.scores, dim=0)
@@ -383,8 +419,8 @@ class AssistedCandidateGeneratorDifferentTokenizers(AssistedCandidateGenerator):
         self.assistant_tokenizer = assistant_tokenizer
         self.prev_target_ids_len: int | None = None
         self.prev_assistant_ids = None
-        self.target_lookbehind = assistant_model.generation_config.target_lookbehind
-        self.assistant_lookbehind = assistant_model.generation_config.assistant_lookbehind
+        self.target_lookbehind = self.assistant_generation_config.target_lookbehind
+        self.assistant_lookbehind = self.assistant_generation_config.assistant_lookbehind
     @staticmethod
     def _get_longest_diag_dict(input_matrix, nonzero_idx):
@@ -494,7 +530,9 @@ class AssistedCandidateGeneratorDifferentTokenizers(AssistedCandidateGenerator):
         dest_ids = destination_tokenizer(text, add_special_tokens=True, return_tensors="pt")["input_ids"]
         return dest_ids.to(input_ids.device)
-    def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, torch.FloatTensor | None]:
+    def get_candidates(
+        self, input_ids: torch.LongTensor, is_first_iteration: bool
+    ) -> tuple[torch.LongTensor, torch.FloatTensor]:
         """
         Fetches the candidates to be tried for the current input.
@@ -520,10 +558,12 @@ class AssistedCandidateGeneratorDifferentTokenizers(AssistedCandidateGenerator):
         min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - assistant_input_ids.shape[-1]), 0)
         self._update_past_and_masks(assistant_input_ids, remove_from_pkv)
-        generation_args = self._prepare_generation_args(assistant_input_ids, min_new_tokens, max_new_tokens)
+        generation_args = self._prepare_generation_args(
+            assistant_input_ids, min_new_tokens, max_new_tokens, is_first_iteration
+        )
         self.assistant_kwargs.pop("attention_mask", None)
-        assistant_output = self.assistant_model.generate(**generation_args, **self.assistant_kwargs)
+        assistant_output = self.assistant_model.generate(**generation_args)
         new_target_ids = self._process_assistant_outputs(input_ids, assistant_output.sequences)
         # Update state
@@ -919,7 +959,9 @@ class UniversalSpeculativeDecodingGenerator(AssistedCandidateGeneratorDifferentT
         self._target_seq_len_with_candidates: int = 0
         self._prev_assistant_ids: torch.LongTensor | None = None
-    def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, torch.FloatTensor | None]:
+    def get_candidates(
+        self, input_ids: torch.LongTensor, is_first_iteration: bool
+    ) -> tuple[torch.LongTensor, torch.FloatTensor]:
         """
         Simplified version of get_candidates that uses the translator cache for token conversion.
         """
@@ -931,7 +973,9 @@ class UniversalSpeculativeDecodingGenerator(AssistedCandidateGeneratorDifferentT
             return input_ids, None
         self._update_past_and_masks(assistant_input_ids, num_added_tokens=num_added_tokens)
-        generation_args = self._prepare_generation_args(assistant_input_ids, min_new_tokens, max_new_tokens)
+        generation_args = self._prepare_generation_args(
+            assistant_input_ids, min_new_tokens, max_new_tokens, is_first_iteration
+        )
         # Ensure scores are returned
         generation_args["generation_config"].output_scores = True
@@ -1045,7 +1089,9 @@ class PromptLookupCandidateGenerator(CandidateGenerator):
         if self.max_matching_ngram_size <= 0 or self.num_output_tokens <= 0:
             raise ValueError("Invalid max_matching_ngram_size or num_output_tokens")
-    def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, torch.FloatTensor | None]:
+    def get_candidates(
+        self, input_ids: torch.LongTensor, is_first_iteration: bool
+    ) -> tuple[torch.LongTensor, torch.FloatTensor]:
         """
         Fetches the candidates to be tried for the current input.
@@ -1202,7 +1248,9 @@ class EarlyExitCandidateGenerator(AssistedCandidateGenerator):
         self.assistant_early_exit = self.generation_config.assistant_early_exit
         self.generation_config.assistant_early_exit = None
-    def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, torch.FloatTensor | None]:
+    def get_candidates(
+        self, input_ids: torch.LongTensor, is_first_iteration: bool
+    ) -> tuple[torch.LongTensor, torch.FloatTensor]:
         # Temporarily sets the number of hidden layers to the early exit value
         base_model = getattr(self.assistant_model, self.assistant_model.base_model_prefix)
         original_num_hidden_layers = base_model.config.num_hidden_layers

transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl

transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl