PyPI - transformers - Versions diffs - 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl - Mend

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1021)

transformers/generation/continuous_batching/continuous_api.py (changed)

@@ -14,11 +14,9 @@
 # limitations under the License.
 import queue
 import threading
+from abc import abstractmethod
 from collections.abc import Generator
 from contextlib import contextmanager
-from dataclasses import dataclass
-from functools import partial
-from itertools import count
 from math import ceil
 from time import perf_counter
@@ -29,10 +27,11 @@ from tqdm.contrib.logging import logging_redirect_tqdm
 from ...configuration_utils import PretrainedConfig
 from ...generation.configuration_utils import CompileConfig, GenerationConfig
-from ...generation.logits_process import LogitsProcessor
+from ...generation.logits_process import LogitsProcessorList
 from ...utils.logging import logging
 from ...utils.metrics import ContinuousBatchProcessorMetrics, attach_tracer, traced
 from .cache import PagedAttentionCache
+from .input_ouputs import ContinuousBatchingIOs, attn_mask_is_needed
 from .requests import GenerationOutput, RequestState, RequestStatus, logger
 from .scheduler import SCHEDULER_MAPPING, FIFOScheduler, Scheduler
@@ -69,109 +68,19 @@ def pad_by_intervals(size: int, max_value: int, nb_intervals: int) -> int:
     return min(padded, max_value)
-def attn_mask_is_needed(config: PretrainedConfig) -> bool:
-    """Checks if attention mask is needed for the given (config)."""
-    return config._attn_implementation in ["paged|eager", "paged|sdpa"]
+# We cannot use `PreTrainedModel` for circular import reasons, so this helps keep track of the basic types
+class ProtoPretrainedModel(nn.Module):
+    config: PretrainedConfig
+    dtype: torch.dtype
+    device: torch.device
+    @abstractmethod
+    def set_attn_implementation(self, attn_implementation: str) -> None:
+        pass
-def build_attention_mask(
-    attention_mask: torch.Tensor,
-    cumulative_seqlens_q: list[int],
-    cumulative_seqlens_k: list[int],
-    sliding_window: int = 1,
-) -> None:
-    """Builds an attention mask inplace using the cumulative seqlens of the query and key. If given a sliding window, it
-    will also apply a sliding window mask on top. The attention mask is not boolean, it uses zeroes and -inf (or its
-    equivalent) so it's more of an attention score bias tensor.
-    The attention mask is a block-diagonal matrix, with each block an attention mask for a single query-key pair.
-    Each of those block is built from a causal mask and, if there is a sliding window, a sliding window mask.
-    An example is represented below, with seqlen_k = 8, seqlen_q = 4 and sliding_window = 6:
-    CAUSAL MASK:
-           █ █ █ █ █ ░ ░ ░
-           █ █ █ █ █ █ ░ ░
-           █ █ █ █ █ █ █ ░
-           █ █ █ █ █ █ █ █
-    SLIDING WINDOW MASK:
-         ┌──────────────────────── seqlen_k - seqlen_q - sliding_window = 8 - 4 - 6 = -2 offset to the left
-       <─┴─>
-     ░ █ | █ █ █ █ █ █ █ █
-     ░ ░ | █ █ █ █ █ █ █ █
-     ░ ░ | ░ █ █ █ █ █ █ █
-     ░ ░ | ░ ░ █ █ █ █ █ █
-    ATTENTION MASK (sum of causal and sliding window masks):
-           █ █ █ █ █ ░ ░ ░
-           █ █ █ █ █ █ ░ ░
-           ░ █ █ █ █ █ █ ░
-           ░ ░ █ █ █ █ █ █
-    Another example with seqlen_k = 5, seqlen_q = 3 and sliding_window = 2:
-    CAUSAL MASK:
-           █ █ █ ░ ░
-           █ █ █ █ ░
-           █ █ █ █ █
-    SLIDING WINDOW MASK:
-         ┌──────────────────────── seqlen_k - seqlen_q - sliding_window = 5 - 3 - 2 = 0 offset to the left
-        <┴>
-         | ░ █ █ █ █
-         | ░ ░ █ █ █
-         | ░ ░ ░ █ █
-    ATTENTION MASK (sum of causal and sliding window masks):
-           ░ █ █ ░ ░
-           ░ ░ █ █ ░
-           ░ ░ ░ █ █
-    """
-    min_value = torch.finfo(attention_mask.dtype).min
-    for i in range(len(cumulative_seqlens_q) - 1):
-        seqlen_q = cumulative_seqlens_q[i + 1] - cumulative_seqlens_q[i]
-        seqlen_k = cumulative_seqlens_k[i + 1] - cumulative_seqlens_k[i]
-        if seqlen_q < seqlen_k and seqlen_q >= 1:
-            causal_diagonal = seqlen_k - seqlen_q + 1
-        else:
-            causal_diagonal = 1
-        query_range = slice(cumulative_seqlens_q[i], cumulative_seqlens_q[i + 1])
-        key_range = slice(cumulative_seqlens_k[i], cumulative_seqlens_k[i + 1])
-        # Apply causal mask
-        minus_inf = torch.full(
-            attention_mask[..., query_range, key_range].shape,
-            min_value,
-            dtype=attention_mask.dtype,
-            device=attention_mask.device,
-        )
-        masked = torch.triu(minus_inf, diagonal=causal_diagonal)
-        # Apply sliding window mask if needed
-        if sliding_window > 1:
-            sliding_diagonal = seqlen_k - seqlen_q - sliding_window
-            masked += torch.tril(minus_inf, diagonal=sliding_diagonal)
-        # Replace in attention mask
-        attention_mask[..., query_range, key_range] = masked
-@dataclass
-class PagedAttentionArgs:
-    input_ids: torch.Tensor
-    attention_mask: torch.Tensor | None
-    position_ids: torch.Tensor
-    cumulative_seqlens_q: torch.Tensor
-    cumulative_seqlens_k: torch.Tensor
-    max_seqlen_q: int
-    max_seqlen_k: int
-    write_index: list[torch.Tensor]
-    read_index: list[torch.Tensor]
-    logits_indices: torch.Tensor
-    cache: PagedAttentionCache
-    use_cache: bool = False
+    @abstractmethod
+    def _get_logits_processor(self, generation_config: GenerationConfig) -> LogitsProcessorList:
+        pass
 # Continuous Batch Processor (Internal Logic)
@@ -238,160 +147,14 @@ class ContinuousBatchProcessor:
         self.max_batch_tokens = cache.max_batch_tokens
         self.metrics = ContinuousBatchProcessorMetrics(cache.max_batch_tokens)
-        # Setup static tensors
-        self.actual_query_length = 0  # This is the actual number of queries tokens in the batch
-        self.actual_key_length = 0  # This is the actual number of keys/values tokens in the batch
-        self.actual_batch_size = 0  # This is the actual number of requests in the batch
-        self.actual_index_sizes = [(0, 0) for _ in range(cache.num_groups)]
-        self.setup_static_tensors(cache.num_groups)
-    @traced(standalone=True)
-    def setup_static_tensors(self, num_groups: int) -> None:
-        """Setup the static tensors that are used for storage during the generation step. No other tensor will be
-        allowed for the inputs or the outputs of the generation step."""
-        self.num_pages = self.cache.num_blocks * self.cache.block_size
-        self.tensor_metadata = {"dtype": torch.int32, "device": self.model_device}
-        # Some tensors always have the same shape regardless of the model
-        self.input_ids = torch.empty((1, self.max_batch_tokens), **self.tensor_metadata)
-        self.position_ids = torch.empty((1, self.max_batch_tokens), **self.tensor_metadata)
-        self.cumulative_seqlens_q = torch.empty((self.max_batch_tokens + 1,), **self.tensor_metadata)
-        self.max_seqlen_q = 0
-        self.logits_indices = torch.empty((self.max_batch_tokens,), **self.tensor_metadata)
-        self.output_ids = torch.empty((self.max_batch_tokens,), **self.tensor_metadata)
-        # For some kwargs, we have a dict of tensors with as many items as there are attention types
-        layer_types = getattr(self.config, "layer_types", None)
-        if layer_types is None:
-            sliding_window = getattr(self.config, "sliding_window", 1)
-            layer_types = ["full_attention"] if sliding_window in [1, None] else ["sliding_attention"]
-        layer_types = list(set(layer_types))
-        self.cumulative_seqlens_k = {
-            l_type: torch.empty((self.max_batch_tokens + 1), **self.tensor_metadata) for l_type in layer_types
-        }
-        self.max_seqlen_k = dict.fromkeys(layer_types, 0)
-        if attn_mask_is_needed(self.config):
-            attn_mask_kwargs = {
-                "size": (1, 1, self.max_batch_tokens, self.num_pages + self.max_batch_tokens),
-                "dtype": self.model_dtype,
-                "device": self.model_device,
-            }
-            self.attention_mask = {layer_type: torch.empty(**attn_mask_kwargs) for layer_type in layer_types}
-        else:
-            self.attention_mask = None
-        # For other kwargs, we need a list of tensors with as many tensors as there are groups
-        self.write_index_storage = [
-            torch.empty((self.max_batch_tokens,), **self.tensor_metadata) for _ in range(num_groups)
-        ]
-        self.read_index_storage = [
-            torch.empty((self.num_pages + self.max_batch_tokens), **self.tensor_metadata) for _ in range(num_groups)
-        ]
-        # For read index, the +T is because there are -1 for seqlen_q when model uses a sliding window
-        # After allocating empty tensors, we reset them to the right value
-        self.reset_static_tensors(full_reset=True)
-    @traced
-    @torch.no_grad()
-    def reset_static_tensors(self, full_reset: bool = False) -> None:
-        """Reset static tensors for the next batch. In between batches, reset only the parts that were used in the last
-        batch, but for initialisation, we can reset everything using the (full_reset) flag."""
-        # Compute the slice to reset
-        q_len = self.write_index_storage[0].size(-1) if full_reset else self.actual_query_length
-        k_len = self.read_index_storage[0].size(-1) if full_reset else self.actual_key_length
-        b_size = self.write_index_storage[0].size(0) if full_reset else self.actual_batch_size
-        # Reset the attributes that always have the same shape
-        self.input_ids[:, :q_len].zero_()
-        self.position_ids[:, :q_len].zero_()
-        self.cumulative_seqlens_q[: b_size + 1].zero_()
-        self.max_seqlen_q = 0
-        self.logits_indices[:q_len].fill_(-1)
-        self.output_ids[:q_len].fill_(-1)
-        # Reset the attributes that are either tensors or dict of tensors
-        for layer_type in self.cumulative_seqlens_k:
-            self.cumulative_seqlens_k[layer_type][: b_size + 1].zero_()
-            self.max_seqlen_k[layer_type] = 0
-            if self.attention_mask is not None:
-                self.attention_mask[layer_type][:, :, :q_len, :k_len].fill_(torch.finfo(self.model_dtype).min)
-        # Reset the attributes that are lists of tensors
-        for i in range(self.cache.num_groups):
-            self.write_index_storage[i][:q_len].fill_(-2)  # -1 is used to let the cache where new states go
-            self.read_index_storage[i][: q_len + k_len].fill_(-2)  # same
-    def get_model_kwargs(self, padded_q_size: int = 0, padded_kv_cache_size: int = 0) -> PagedAttentionArgs:
-        """Get model keyword arguments for the current batch, eventually padding the query dimension to (padded_q_size)
-        and the keys/values dimension to (padded_kv_cache_size). The padding is only useful if we want static shapes,
-        like when using cuda graphs AND only activated if both Q and KV are padded."""
-        # Compute the slice to return, with the given padding if we are using cuda graphs
-        use_padding = padded_q_size > 0 and padded_kv_cache_size > 0
-        q_len = padded_q_size if use_padding else self.actual_query_length
-        b_size = padded_q_size if use_padding else self.actual_batch_size
-        # If there is padding, the size of the KV is the nb of padded Q tokens + the size padded of the padded KV cache
-        padded_kv_size = padded_q_size + padded_kv_cache_size
-        # Prepare the kwargs, the attributes that are either tensors or dict of tensors are initialized to empty dicts
-        kwargs = {
-            "input_ids": self.input_ids[:, :q_len],
-            "position_ids": self.position_ids[:, :q_len],
-            "cu_seq_lens_q": self.cumulative_seqlens_q[: b_size + 1],
-            "max_seqlen_q": self.max_seqlen_q,
-            "logits_indices": self.logits_indices[:q_len],
-            "cu_seq_lens_k": {},
-            "max_seqlen_k": {},
-            "attention_mask": {},
-            "read_index": [],
-            "write_index": [],
-            "cache": self.cache,
-            "use_cache": False,
-        }
-        # If we use constant-sized slicing, there are some "padding" queries tokens which FA has some issues with. In
-        # some models like Qwen3-4B-Instruct-2507, if we don't include these tokens in cumulative_seqlens_q, there are
-        # some NaNs in the output logits even for non-padded tokens.
-        if use_padding:
-            self.max_seqlen_q = max(self.max_seqlen_q, q_len - self.total_seqlen_q)
-            self.cumulative_seqlens_q[self.actual_batch_size + 1 :] = q_len
-            # FIXME: is there another way to avoid this? It has a very slight impact on performance (~5 tok/s)
-        # For the attributes that are lists of tensors, we construct list of tensor references
-        for i, (read_index_size, write_index_size) in enumerate(self.actual_index_sizes):
-            read_index_size = padded_kv_size if use_padding else read_index_size
-            write_index_size = padded_q_size if use_padding else write_index_size
-            kwargs["read_index"].append(self.read_index_storage[i][:read_index_size])
-            kwargs["write_index"].append(self.write_index_storage[i][:write_index_size])
-        # For the attributes that are dict of tensors, we replace the dict with a tensor if there is only one entry
-        layer_types = list(self.cumulative_seqlens_k.keys())
-        if len(layer_types) > 1:
-            for layer_type, seqlens_k in self.cumulative_seqlens_k.items():
-                kwargs["cu_seq_lens_k"][layer_type] = seqlens_k[: b_size + 1]
-                kwargs["max_seqlen_k"][layer_type] = self.max_seqlen_k[layer_type]
-                if self.attention_mask is not None:
-                    k_len = padded_kv_size if use_padding else seqlens_k[b_size]
-                    kwargs["attention_mask"][layer_type] = self.attention_mask[layer_type][..., :q_len, :k_len]
-        else:
-            layer_type = layer_types[0]
-            kwargs["cu_seq_lens_k"] = self.cumulative_seqlens_k[layer_type][: b_size + 1]
-            kwargs["max_seqlen_k"] = self.max_seqlen_k[layer_type]
-            if self.attention_mask is not None:
-                k_len = padded_kv_size if use_padding else self.cumulative_seqlens_k[layer_type][b_size]
-                kwargs["attention_mask"] = self.attention_mask[layer_type][..., :q_len, :k_len]
-        if self.attention_mask is None:
-            kwargs["attention_mask"] = None
-        return kwargs
+        # Setup inputs and outputs
+        self.inputs_and_outputs = ContinuousBatchingIOs(cache, config, model_device, model_dtype)
     def __repr__(self) -> str:
         return (
             f"ContinuousBatchProcessor(input_queue={self.input_queue}, output_queue={self.output_queue}, "
             f"active_requests={self.scheduler.active_requests}, waiting_requests={self.scheduler.waiting_requests})"
-            + self.get_model_kwargs().__repr__()
+            + self.inputs_and_outputs.get_model_kwargs().__repr__()
         )
     @traced
@@ -408,7 +171,7 @@ class ContinuousBatchProcessor:
                 break
             except Exception as e:
                 logger.error(f"Error processing new request: {e}", exc_info=True)
-                state: RequestState = locals().get("state")
+                state: RequestState = locals().get("state")  # type:ignore
                 if state is not None:
                     self._handle_request_error(e, state)
@@ -467,83 +230,30 @@ class ContinuousBatchProcessor:
         self.metrics.record_queue_metrics(len(self.scheduler.active_requests), len(self.scheduler.waiting_requests))
         # Schedule the next batch of requests, stop if there are no requests in the batch
-        self.requests_in_batch = self.scheduler.schedule_batch(self.max_batch_tokens, self.num_pages)
+        requests_in_batch = self.scheduler.schedule_batch(self.max_batch_tokens, self.cache.num_pages)
         # If requests_in_batch is None, it means we need to offload some requests if possible
-        if self.requests_in_batch is None:
+        if requests_in_batch is None:
             if len(self.scheduler.active_requests) > 1:
                 self.soft_reset_one_request()
+                return False
             else:
                 raise RuntimeError("No requests can be scheduled and no request can be offloaded.")
         # If it's an empty list, it means we have no requests to process
+        self.requests_in_batch = requests_in_batch
         if not self.requests_in_batch:
             return False
         # Otherwise, we can continue with the non-empty batch
         self.metrics.record_batch_metrics(self.requests_in_batch)
+        self.inputs_and_outputs.prepare_batch_tensors(requests_in_batch)
-        # Reset the static tensors used for storage
-        self.reset_static_tensors()  # FIXME: why does this make the generation faster?
-        # Prepare accumulators
-        self.actual_query_length = 0
-        self.actual_key_length = 0
-        self.actual_batch_size = 0
-        input_ids = []
-        position_ids = []
-        cumulative_seqlens_q = [0]
-        logits_indices = []
-        cumulative_seqlens_k = {layer_type: [0] for layer_type in self.cumulative_seqlens_k}
-        read_index = [[] for _ in range(self.cache.num_groups)]
-        write_index = [[] for _ in range(self.cache.num_groups)]
-        # Go through all the requests in the batch
-        for state in self.requests_in_batch:
-            # First we retrieve the lengths related to the request
-            past_length = state.position_offset
-            query_length = len(state.tokens_to_process)
-            seqlens_k = self.cache.get_seqlens_k(state.request_id, past_length, query_length)
-            # Then we update the total lengths that are used for slicing
-            self.actual_query_length += query_length
-            # total_key_length is used to slice the keys so we need to take the max of all the key lengths
-            self.actual_key_length += max(seqlens_k.values())
-            self.actual_batch_size += 1
-            # And the attribute tracking the position in the request object
-            state.position_offset += query_length
-            # Then we accumulate for the object used in the kwargs
-            input_ids.extend(state.tokens_to_process)
-            position_ids.extend(range(past_length, past_length + query_length))
-            cumulative_seqlens_q.append(cumulative_seqlens_q[-1] + query_length)
-            self.max_seqlen_q = max(self.max_seqlen_q, query_length)
-            if not state.remaining_prefill_tokens:
-                logits_indices.append(cumulative_seqlens_q[-1] - 1)
-            for layer_type, layer_type_seqlen_k in seqlens_k.items():
-                cumulative_seqlens_k[layer_type].append(cumulative_seqlens_k[layer_type][-1] + layer_type_seqlen_k)
-                self.max_seqlen_k[layer_type] = max(self.max_seqlen_k[layer_type], layer_type_seqlen_k)
-            self.cache.extend_read_indices(state.request_id, past_length, query_length, read_index)
-            self.cache.extend_write_indices(state.request_id, past_length, query_length, write_index)
-        # When looping over request is done, we can build the actual tensors
-        self._build_tensors(
-            input_ids,
-            position_ids,
-            read_index,
-            write_index,
-            cumulative_seqlens_q,
-            cumulative_seqlens_k,
-            logits_indices,
-        )
+        # Record the memory metrics of the KV cache
         self.metrics.record_kv_cache_memory_metrics(self.cache)
         if logger.isEnabledFor(logging.DEBUG):
-            ck = max(cumulative_seqlens_k[layer_type][-1] for layer_type in self.cumulative_seqlens_k)
+            cumulative_seqlens_q = self.inputs_and_outputs.cumulative_seqlens_q
+            cumulative_seqlens_k = self.inputs_and_outputs.cumulative_seqlens_k
+            ck = max(cumulative_seqlens_k[layer_type][-1] for layer_type in cumulative_seqlens_k)
             logger.debug(
                 f"Scheduled: {len(self.requests_in_batch)}, Waiting: {len(self.scheduler.waiting_requests)}, "
                 f"Active: {len(self.scheduler.active_requests)}. cum Q: {cumulative_seqlens_q[-1]}. "
@@ -551,52 +261,6 @@ class ContinuousBatchProcessor:
             )
         return True
-    @traced
-    def _build_tensors(
-        self,
-        input_ids: list[int],
-        position_ids: list[int],
-        read_index: list[list[int]],
-        write_index: list[list[int]],
-        cumulative_seqlens_q: list[int],
-        cumulative_seqlens_k: dict[str, list[int]],
-        logits_indices: list[int],
-    ) -> None:
-        """Builds the actual tensors for the current batch, by modifying the already allocated tensors in place."""
-        to_tensor = partial(torch.tensor, **self.tensor_metadata)
-        # Those kwargs always have the same type regardless of the model
-        self.input_ids[:, : len(input_ids)] = to_tensor(input_ids)
-        self.position_ids[:, : len(position_ids)] = to_tensor(position_ids)
-        self.cumulative_seqlens_q[: len(cumulative_seqlens_q)] = to_tensor(cumulative_seqlens_q)
-        self.logits_indices[: len(logits_indices)] = to_tensor(logits_indices)
-        self.total_seqlen_q = cumulative_seqlens_q[-1]
-        # Those kwargs are either dict of tensors or tensors, so we need to handle both cases
-        for layer_type, layer_type_seqlens_k in cumulative_seqlens_k.items():
-            self.cumulative_seqlens_k[layer_type][: len(layer_type_seqlens_k)] = to_tensor(layer_type_seqlens_k)
-            if self.attention_mask is not None:
-                build_attention_mask(
-                    attention_mask=self.attention_mask[layer_type],
-                    cumulative_seqlens_q=cumulative_seqlens_q,
-                    cumulative_seqlens_k=layer_type_seqlens_k,
-                    sliding_window=self.sliding_window if layer_type == "sliding_attention" else 1,
-                )
-        # The index only contain references to the storage tensors, so we update the storage and their references
-        self.read_index = []
-        self.write_index = []
-        for i, group_read_indices, group_write_indices in zip(count(), read_index, write_index):
-            self.read_index_storage[i][: len(group_read_indices)] = to_tensor(group_read_indices)
-            self.write_index_storage[i][: len(group_write_indices)] = to_tensor(group_write_indices)
-            self.actual_index_sizes[i] = (len(group_read_indices), len(group_write_indices))
-    @traced
-    def _get_new_tokens(self, num_new_tokens: int) -> list[int]:
-        indices = self.logits_indices[:num_new_tokens]
-        new_tokens = self.output_ids[indices]
-        return new_tokens.tolist()
     @traced
     def _maybe_send_output(self, state: RequestState) -> None:
         """Send output to the queue based on streaming mode and request state."""
@@ -606,13 +270,13 @@ class ContinuousBatchProcessor:
     @traced
     def update_batch(self) -> None:
         """Update request states based on generated tokens."""
-        new_tokens = self._get_new_tokens(len(self.requests_in_batch))
+        new_tokens = self.inputs_and_outputs.output_ids[: len(self.requests_in_batch)].tolist()
         current_logits_index = 0
         for state in self.requests_in_batch:
             # If the request has no remaining prompt ids, it means prefill has already ended or just finished
             if len(state.remaining_prefill_tokens) == 0:
-                # If there are no generated tokens yet, it means prefill just ended
-                if state.generated_len() == 0:
+                # If there is just one temporary token, it means prefill just ended
+                if state.generated_len() == 1:
                     self.metrics.record_ttft_metric(state.created_time, state.request_id)
                     state.status = RequestStatus.DECODING
@@ -640,15 +304,15 @@ class ContinuousBatchProcessor:
         copy_source, copy_destination = [], []
         while self.scheduler._requests_to_fork:
             # Get the number of children and reset it so it's not forked again
-            state = self.scheduler._requests_to_fork.pop()
-            num_children = state.num_children
-            state.num_children = 0
+            state_to_fork = self.scheduler._requests_to_fork.pop()
+            num_children = state_to_fork.num_children
+            state_to_fork.num_children = 0
             # Create the new request and add them to the scheduler
-            new_request_ids = [f"{state.request_id}__child#{i}" for i in range(num_children)]
+            new_request_ids = [f"{state_to_fork.request_id}__child#{i}" for i in range(num_children)]
             for new_request_id in new_request_ids:
-                self.scheduler.active_requests[new_request_id] = state.fork(new_request_id)
+                self.scheduler.active_requests[new_request_id] = state_to_fork.fork(new_request_id)
             # Fork the cache
-            copy_src, copy_dst = self.cache.fork_request(state.request_id, new_request_ids)
+            copy_src, copy_dst = self.cache.fork_request(state_to_fork.request_id, new_request_ids)
             copy_source.extend(copy_src)
             copy_destination.extend(copy_dst)
             # FIXME: if fork cant be done, create a new pending request without forking instead of crashing everything
@@ -692,8 +356,8 @@ class ContinuousBatchProcessor:
         self.scheduler.waiting_requests_order.clear()
     @traced
-    @torch.no_grad
-    def _generation_step(self, model: nn.Module, logit_processor: LogitsProcessor, do_sample: bool) -> None:
+    @torch.no_grad()
+    def _generation_step(self, model: nn.Module, logit_processor: LogitsProcessorList, do_sample: bool) -> None:
         """Perform a single generation step."""
         # If a compile config is specified, we compile the forward pass once in a wrapper
@@ -710,14 +374,18 @@ class ContinuousBatchProcessor:
         # If inputs are static sized, we find the padded sizes of the queries and keys/values
         if self._pad_inputs:
-            padded_q = pad_by_intervals(self.actual_query_length, self.max_batch_tokens, self.q_padding_intervals)
-            max_read_index_size = max(self.actual_index_sizes[i][0] for i in range(self.cache.num_groups))
+            actual_query_length = self.inputs_and_outputs.actual_query_length
+            actual_index_sizes = self.inputs_and_outputs.actual_index_sizes
+            padded_q = pad_by_intervals(actual_query_length, self.max_batch_tokens, self.q_padding_intervals)
+            max_read_index_size = max(actual_index_sizes[i][0] for i in range(self.cache.num_groups))
             # The space planned for query tokens will be added later, so we remove it from the space planned for KV
-            padded_read_index_size = pad_by_intervals(max_read_index_size, self.num_pages, self.kv_padding_intervals)
+            padded_read_index_size = pad_by_intervals(
+                max_read_index_size, self.cache.num_pages, self.kv_padding_intervals
+            )
         else:
             padded_q, padded_read_index_size = 0, 0
         # Retrieve the model kwargs with or without padding
-        batch_data = self.get_model_kwargs(padded_q, padded_read_index_size)
+        batch_data = self.inputs_and_outputs.get_model_kwargs(padded_q, padded_read_index_size)
         # If we are not using cuda graphs, we perform the generation step and return
         if self._graphs is None:
@@ -746,21 +414,23 @@ class ContinuousBatchProcessor:
     @traced
     def _forward_process_and_sample(
-        self, model: nn.Module, batch_data: dict, logit_processor: LogitsProcessor, do_sample: bool
+        self, model: nn.Module, batch_data: dict, logit_processor: LogitsProcessorList, do_sample: bool
     ) -> None:
         """This function performs the forward pass, logits processing, and sampling; which are broken down into smaller
         function to be easier to trace with OpenTelemetry."""
         logits = self._model_forward(model, batch_data)
         # if self.log_prob_generation:    batch_processor.output_probs.copy_(logits)  # TODO
         probs = self._process_logit(batch_data, logits, logit_processor)
-        self._sample(probs, do_sample)
+        self._sample(probs, batch_data, do_sample)
     @traced(span_name="model_forward")
     def _model_forward(self, model: nn.Module, batch_data: dict) -> torch.Tensor:
         return model(**batch_data).logits
     @traced(span_name="logit_processing")
-    def _process_logit(self, batch_data: dict, logits: torch.Tensor, logit_processor: LogitsProcessor) -> torch.Tensor:
+    def _process_logit(
+        self, batch_data: dict, logits: torch.Tensor, logit_processor: LogitsProcessorList
+    ) -> torch.Tensor:
         # Pass continuous batching context to logits processor if it supports it.
         if hasattr(logit_processor, "set_continuous_batching_context"):
             logit_processor.set_continuous_batching_context(batch_data["logits_indices"], batch_data["cu_seq_lens_q"])
@@ -770,13 +440,13 @@ class ContinuousBatchProcessor:
         # NOTE: to be an exact match with generate, we should also convert logits2d to float32 here, but it's not needed in practice
         logits_2d = logits.view(batch_size * seq_len, vocab_size)
         input_ids_2d = batch_data["input_ids"].view(batch_size * seq_len)
-        # Process with 2D tensors
-        processed_logits_2d = logit_processor(input_ids_2d, logits_2d)
+        # Process with 2D tensors#
+        processed_logits_2d = logit_processor(input_ids_2d, logits_2d)  # type: ignore[arg-type]
         # Reshape back to 3D
         return processed_logits_2d.view(batch_size, seq_len, vocab_size)
     @traced(span_name="sampling")
-    def _sample(self, probs: torch.Tensor, do_sample: bool) -> None:
+    def _sample(self, probs: torch.Tensor, batch_data: dict, do_sample: bool) -> None:
         if do_sample:
             probs = nn.functional.softmax(probs, dim=-1)
             # probs[0] has shape [seq_len, vocab_size], multinomial returns [seq_len, 1]
@@ -785,7 +455,10 @@ class ContinuousBatchProcessor:
             next_tokens = torch.argmax(probs, dim=-1)  # shape is [1, seq_len]
             next_tokens = next_tokens.squeeze(0)  # shape is [seq_len]
         tokens = next_tokens.size(0)  # Get seq_len dimension
-        self.output_ids[:tokens].copy_(next_tokens)
+        #
+        indices = batch_data["logits_indices"][:tokens]
+        next_tokens = next_tokens[indices]
+        self.inputs_and_outputs.output_ids[:tokens].copy_(next_tokens)
 # Manager Class (User Interface)
@@ -799,7 +472,7 @@ class ContinuousBatchingManager:
     def __init__(
         self,
-        model: nn.Module,
+        model: ProtoPretrainedModel,
         generation_config: GenerationConfig,
         manual_eviction: bool = False,
         max_queue_size: int = 0,
@@ -840,7 +513,7 @@ class ContinuousBatchingManager:
         self.generation_config = generation_config
         self.log_prob_generation = getattr(generation_config, "log_prob_generation", False)
         self.do_sample = getattr(generation_config, "do_sample", True)
-        self.logit_processor = self.model._get_logits_processor(generation_config)
+        self.logit_processor: LogitsProcessorList = self.model._get_logits_processor(generation_config)
         num_return_sequences = getattr(generation_config, "num_return_sequences", None)
         self.num_return_sequences = num_return_sequences if num_return_sequences is not None else 1
@@ -879,6 +552,11 @@ class ContinuousBatchingManager:
         If none of the above criteria are met, we use a default heuristic based on the attention implementation: we turn
         on cuda graphs if and only if no attention mask is needed.
         """
+        # If cuda is not available, we cannot use cuda graphs
+        if not torch.cuda.is_available():
+            if use_cuda_graph:
+                logger.warning(f"use_cuda_graph is True but {torch.cuda.is_available() = }: turning off cuda graphs.")
+            return False
         # If use_cuda_graph is specified, we follow the user's choice
         if use_cuda_graph is not None:
             return use_cuda_graph
@@ -902,7 +580,7 @@ class ContinuousBatchingManager:
         logger.warning(
             f"No behavior specified for use_cuda_graph, defaulting to {use_cuda_graph = } because "
             f"{self.model.config._attn_implementation = }. If you want to save memory, turn off cuda graphs, but "
-            "they can improve performances."
+            "they tend to improve performances by a lot."
         )
         return use_cuda_graph
@@ -1013,14 +691,17 @@ class ContinuousBatchingManager:
         streaming: bool = False,
         record_timestamps: bool = False,
     ) -> None:
-        # If there is prefix sharing, we sort the inputs to maximize cache hits
+        # Infer the request ids of all incoming requests
+        with self._request_lock:
+            request_ids = [f"req_{i}" for i in range(self._request_counter, self._request_counter + len(inputs))]
+            self._request_counter += len(inputs)
+        # If there is prefix sharing, we sort the inputs to maximize cache hits but keep the order of the requests
+        ids_and_inputs = list(zip(request_ids, inputs))
         if self._use_prefix_sharing:
-            inputs = sorted(inputs, reverse=True)
+            ids_and_inputs = sorted(ids_and_inputs, key=lambda x: x[1], reverse=True)
         # Add requests in order
-        for input_ids in inputs:
-            self.add_request(
-                input_ids, max_new_tokens=max_new_tokens, streaming=streaming, record_timestamps=record_timestamps
-            )
+        for request_id, input_ids in ids_and_inputs:
+            self.add_request(input_ids, request_id, max_new_tokens, streaming, record_timestamps)
     def cancel_request(self, request_id: str) -> None:
         """Cancel a request by its ID.
@@ -1073,7 +754,9 @@ class ContinuousBatchingManager:
     @traced
     def _generation_step(self) -> None:
-        """Perform a single generation step. This is cuda graphed"""
+        """Perform a single generation step. This is mostly cuda graphed"""
+        if self.batch_processor is None:
+            raise RuntimeError("Tried to perform a generation step before the batch processor was initialized.")
         self.batch_processor._generation_step(self.model, self.logit_processor, self.do_sample)
     def _run_generation_loop(self) -> None:
@@ -1139,14 +822,6 @@ class ContinuousBatchingManager:
         # Loop body ends if there is no requests in the batch
         if not batch_processor.prepare_next_batch():
             return
-        # Debug logging of the current memory usage -- commented out because it's often not used even in debug
-        # if logger.level < logging.DEBUG:
-        #     device, total, reserved, allocated = get_device_and_memory_breakdown()
-        #     available_memory = total - max(allocated, reserved)
-        #     logger.debug(
-        # f"[Memory] Device: {device}, Total: {total}, Reserved: {reserved}, Allocated: {allocated}, Available: {available_memory}"
-        #     )
         self._generation_step()
         batch_processor.update_batch()
@@ -1181,6 +856,8 @@ class ContinuousBatchingManager:
 class ContinuousMixin:
     """Mixin class for models to add continuous batching capabilities."""
+    generation_config: GenerationConfig
     @contextmanager
     def continuous_batching_context_manager(
         self,
@@ -1246,7 +923,7 @@ class ContinuousMixin:
         # Create and return the manager
         return ContinuousBatchingManager(
-            model=self,
+            model=self,  # type: ignore
             generation_config=gen_config,
             manual_eviction=manual_eviction,
             max_queue_size=max_queue_size,
@@ -1328,8 +1005,19 @@ class ContinuousMixin:
                     else:
                         if not manager.is_running():
                             logger.error("Generation thread terminated unexpectedly.")
+                            # This helps get some information in stdout
+                            print("Returning results of generate_batch despite unexpected termination.")
                             break
             except Exception as e:
                 logger.error(f"Error during batch generation: {e}", exc_info=True)
-        return results
+        # Re-order requests to match the order of the inputs
+        reordered_results = {}
+        for i in range(len(inputs)):
+            # We cannot guarantee that the generation succeeded for all requests, so we need to check if the request is in the results
+            result = results.get(f"req_{i}")
+            if result is not None:
+                reordered_results[f"req_{i}"] = result
+            else:
+                logger.error(f"Request req_{i} not found in results.")
+        return reordered_results

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl