PyPI - transformers - Versions diffs - 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl - Mend

transformers 5.0.0rc1py3-none-any.whl → 5.0.0rc2py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (671) hide show

transformers/generation/continuous_batching/continuous_api.py CHANGED Viewed

@@ -259,7 +259,7 @@ class ContinuousBatchProcessor:
         self.cumulative_seqlens_q = torch.empty((self.max_batch_tokens + 1,), **self.tensor_metadata)
         self.max_seqlen_q = 0
         self.logits_indices = torch.empty((self.max_batch_tokens,), **self.tensor_metadata)
-        self.output_ids = torch.empty((1, self.max_batch_tokens), **self.tensor_metadata)
+        self.output_ids = torch.empty((self.max_batch_tokens,), **self.tensor_metadata)
         # For some kwargs, we have a dict of tensors with as many items as there are attention types
         layer_types = getattr(self.config, "layer_types", None)
@@ -311,7 +311,7 @@ class ContinuousBatchProcessor:
         self.cumulative_seqlens_q[: b_size + 1].zero_()
         self.max_seqlen_q = 0
         self.logits_indices[:q_len].fill_(-1)
-        self.output_ids[:, :q_len].fill_(-1)
+        self.output_ids[:q_len].fill_(-1)
         # Reset the attributes that are either tensors or dict of tensors
         for layer_type in self.cumulative_seqlens_k:
@@ -447,7 +447,7 @@ class ContinuousBatchProcessor:
         self.metrics.record_batch_metrics(self.requests_in_batch)
         # Reset the static tensors used for storage
-        self.reset_static_tensors()  # TODO: this might be unnecessary
+        self.reset_static_tensors()  # FIXME: why does this make the generation faster?
         # Prepare accumulators
         self.actual_query_length = 0
@@ -557,13 +557,10 @@ class ContinuousBatchProcessor:
             self.actual_index_sizes[i] = (len(group_read_indices), len(group_write_indices))
     @traced
-    def _sync(self) -> list[int]:
-        if self.output_ids is not None:
-            try:
-                return self.output_ids.tolist()[0]
-            except Exception:
-                return [0, 1]
-        return [0, 0]
+    def _get_new_tokens(self, num_new_tokens: int) -> list[int]:
+        indices = self.logits_indices[:num_new_tokens]
+        new_tokens = self.output_ids[indices]
+        return new_tokens.tolist()
     @traced
     def _maybe_send_output(self, state: RequestState) -> None:
@@ -574,29 +571,56 @@ class ContinuousBatchProcessor:
     @traced
     def update_batch(self) -> None:
         """Update request states based on generated tokens."""
-        out_tokens = self._sync()
-        for i, state in enumerate(self.requests_in_batch):
+        new_tokens = self._get_new_tokens(len(self.requests_in_batch))
+        current_logits_index = 0
+        for state in self.requests_in_batch:
             # If the request has no remaining prompt ids, it means prefill has already ended or just finished
             if len(state.remaining_prefill_tokens) == 0:
-                self.metrics.record_ttft_metric(state.created_time, state.request_id)
-                state.status = RequestStatus.DECODING
-                token = out_tokens[self.logits_indices[i]]
+                # If there are no generated tokens yet, it means prefill just ended
+                if state.generated_len() == 0:
+                    self.metrics.record_ttft_metric(state.created_time, state.request_id)
+                    state.status = RequestStatus.DECODING
+                token = new_tokens[current_logits_index]
                 state.tokens_to_process = [token]
+                current_logits_index += 1
                 # Update the request and stop if it is complete
                 is_finished = state.update_and_check_completion(token)
                 # We mark the completed blocks as such
-                self.cache.mark_blocks_as_complete(state)
+                self.cache.mark_shareable_blocks_as_complete(state)
                 if is_finished:
                     self.metrics.record_request_completion(state.created_time, state.request_id)
                     self.scheduler.finish_request(state.request_id, evict_from_cache=(not self.manual_eviction))
                 self._maybe_send_output(state)
             #  Otherwise, the request is still prefilling, but the prefill has been split
             elif state.status == RequestStatus.PREFILLING_SPLIT:
-                self.cache.mark_blocks_as_complete(state)
+                self.cache.mark_shareable_blocks_as_complete(state)
                 state.status = RequestStatus.SPLIT_PENDING_REMAINDER
             else:
                 raise ValueError(f"Request {state.request_id} is in an unexpected state: {state.status}")
+        # If some requests need to be forked, we do it now
+        copy_source, copy_destination = [], []
+        while self.scheduler._requests_to_fork:
+            # Get the number of children and reset it so it's not forked again
+            state = self.scheduler._requests_to_fork.pop()
+            num_children = state.num_children
+            state.num_children = 0
+            # Create the new request and add them to the scheduler
+            new_request_ids = [f"{state.request_id}__child#{i}" for i in range(num_children)]
+            for new_request_id in new_request_ids:
+                self.scheduler.active_requests[new_request_id] = state.fork(new_request_id)
+            # Fork the cache
+            copy_src, copy_dst = self.cache.fork_request(state.request_id, new_request_ids)
+            copy_source.extend(copy_src)
+            copy_destination.extend(copy_dst)
+            # FIXME: if fork cant be done, create a new pending request without forking instead of crashing everything
+        # The copy induced by the fork is done in one go (if it's even needed)
+        if copy_source:
+            self.cache.copy_cache(copy_source, copy_destination)
         if self.cache.get_num_free_blocks() == 0:
             raise ValueError("No more free blocks")
@@ -727,12 +751,11 @@ class ContinuousBatchProcessor:
             probs = nn.functional.softmax(probs, dim=-1)
             # probs[0] has shape [seq_len, vocab_size], multinomial returns [seq_len, 1]
             next_tokens = torch.multinomial(probs[0], num_samples=1).squeeze(-1)  # Now [seq_len]
-            # Add batch dimension back to match argmax output
-            next_tokens = next_tokens.unsqueeze(0)  # Now [1, seq_len]
         else:
-            next_tokens = torch.argmax(probs, dim=-1)  # Already [1, seq_len]
-        tokens = next_tokens.size(1)  # Get seq_len dimension
-        self.output_ids[:, :tokens].copy_(next_tokens)
+            next_tokens = torch.argmax(probs, dim=-1)  # shape is [1, seq_len]
+            next_tokens = next_tokens.squeeze(0)  # shape is [seq_len]
+        tokens = next_tokens.size(0)  # Get seq_len dimension
+        self.output_ids[:tokens].copy_(next_tokens)
 # Manager Class (User Interface)
@@ -752,7 +775,7 @@ class ContinuousBatchingManager:
         max_queue_size: int = 0,
         num_q_padding_intervals: int = 0,
         num_kv_padding_intervals: int = 0,
-        allow_prefix_sharing: bool = True,
+        allow_block_sharing: bool = True,
     ) -> None:
         """Initialize the continuous batching manager.
@@ -762,30 +785,37 @@ class ContinuousBatchingManager:
             max_queue_size: Maximum size of the request queue (0 = unlimited)
             num_q_padding_intervals: (optional) Number of intervals used to pad the query dimension
             num_kv_padding_intervals: (optional) Number of intervals used to pad the keys/values dimension
-            allow_prefix_sharing: (optional) Whether to allow prefix sharing if the model has only full attention layers
+            allow_block_sharing: (optional) Whether to allow block sharing if the model has some full attention layers
         """
-        # Reloade paged version if necessary
+        # Reload paged version of the attention implementation if necessary
         if "paged|" not in model.config._attn_implementation:
             model.set_attn_implementation(f"paged|{model.config._attn_implementation}")
+        # Internal arguments
         self.model = model.eval()
-        generation_config = model.generation_config if generation_config is None else generation_config
-        self.generation_config = generation_config
+        self.manual_eviction = manual_eviction
+        self._allow_block_sharing = allow_block_sharing
+        self._use_prefix_sharing = allow_block_sharing  # approximation until the cache is created
         self.input_queue = queue.Queue(maxsize=max_queue_size)
         self.output_queue = queue.Queue()
         self.stop_event = threading.Event()
-        self.log_prob_generation = getattr(generation_config, "log_prob_generation", False)
+        self.batch_processor: ContinuousBatchProcessor | None = None
         self._generation_thread = None
         self._request_counter = 0
         self._request_lock = threading.Lock()
-        self.model.generation_config.top_p = None
+        # Generation config related arguments
+        generation_config = model.generation_config if generation_config is None else generation_config
+        self.generation_config = generation_config
+        self.log_prob_generation = getattr(generation_config, "log_prob_generation", False)
         self.do_sample = getattr(generation_config, "do_sample", True)
         self.logit_processor = self.model._get_logits_processor(generation_config)
-        self.profile = getattr(generation_config, "profile", False)  # TODO: not supported yet
-        self.manual_eviction = manual_eviction
-        self.batch_processor: ContinuousBatchProcessor | None = None
-        self._allow_prefix_sharing = allow_prefix_sharing
+        self.num_return_sequences = getattr(generation_config, "num_return_sequences", 1)
+        # self.model.generation_config.top_p = None NOTE: figure out why this was here
+        # Cuda graph behavior is determined below using either user-specified arguments or heuristics
         self.use_cuda_graph = self._decide_use_cuda_graphs(
             use_cuda_graph=getattr(generation_config, "use_cuda_graph", None),
             num_q_padding_intervals=num_q_padding_intervals,
@@ -799,6 +829,7 @@ class ContinuousBatchingManager:
             num_kv_padding_intervals if num_kv_padding_intervals > 0 else NUM_KV_PADDING_INTERVALS
         )
+        # Log probability generation is not supported yet (TODO)
         if self.log_prob_generation:
             raise NotImplementedError("log_prob_generation is not supported yet")
@@ -932,6 +963,7 @@ class ContinuousBatchingManager:
         state = RequestState(
             request_id=request_id,
             initial_tokens=list(input_ids),
+            num_children=self.num_return_sequences - 1,
             record_timestamps=record_timestamps,
             tokens_to_process=list(input_ids),
             max_new_tokens=max_new_tokens,
@@ -950,6 +982,10 @@ class ContinuousBatchingManager:
         streaming: bool = False,
         record_timestamps: bool = False,
     ) -> None:
+        # If there is prefix sharing, we sort the inputs to maximize cache hits
+        if self._use_prefix_sharing:
+            inputs = sorted(inputs, reverse=True)
+        # Add requests in order
         for input_ids in inputs:
             self.add_request(
                 input_ids, max_new_tokens=max_new_tokens, streaming=streaming, record_timestamps=record_timestamps
@@ -1020,8 +1056,9 @@ class ContinuousBatchingManager:
                 self.model.device,
                 self.model.dtype,
                 tp_size=getattr(self.model, "_tp_size", None),  # Use model's actual TP setting
-                allow_prefix_sharing=self._allow_prefix_sharing,
+                allow_block_sharing=self._allow_block_sharing,
             )
+            self._use_prefix_sharing = paged_attention_cache.use_prefix_sharing  # update the approximation
             logger.debug(f"PagedAttentionCache created in {perf_counter() - t0} seconds")
             scheduler = None
@@ -1080,10 +1117,6 @@ class ContinuousBatchingManager:
             )
         self._generation_step()
-        if torch.cuda.is_available():
-            torch.cuda.synchronize()  # FIXME: why is this needed?
-        # Processor updates the batch after generation step is truly over
         batch_processor.update_batch()
     @traced
@@ -1125,7 +1158,7 @@ class ContinuousMixin:
         max_queue_size: int = 0,
         num_q_cuda_graphs: int = 0,
         num_kv_cuda_graphs: int = 0,
-        allow_prefix_sharing: bool = True,
+        allow_block_sharing: bool = True,
         block: bool = True,
         timeout: float | None = None,
     ) -> Generator[ContinuousBatchingManager]:
@@ -1135,7 +1168,7 @@ class ContinuousMixin:
             max_queue_size,
             num_q_cuda_graphs,
             num_kv_cuda_graphs,
-            allow_prefix_sharing,
+            allow_block_sharing,
         )
         manager.start()
         try:
@@ -1154,7 +1187,7 @@ class ContinuousMixin:
         max_queue_size: int = 0,
         num_q_padding_intervals: int = 0,
         num_kv_padding_intervals: int = 0,
-        allow_prefix_sharing: bool = True,
+        allow_block_sharing: bool = True,
     ) -> ContinuousBatchingManager:
         """Initialize a manager for continuous batching inference.
@@ -1164,7 +1197,7 @@ class ContinuousMixin:
             max_queue_size: Maximum size of the input request queue
             num_q_padding_intervals: Number of intervals used to pad the query dimension
             num_kv_padding_intervals: Number of intervals used to pad the keys/values dimension
-            allow_prefix_sharing: A flag to allow prefix sharing if the model has only full attention layers
+            allow_block_sharing: A flag to allow block sharing if the model has some full attention layers
         Returns:
             `ContinuousBatchingManager`: The manager instance to add requests and retrieve results.
@@ -1188,7 +1221,7 @@ class ContinuousMixin:
             max_queue_size=max_queue_size,
             num_q_padding_intervals=num_q_padding_intervals,
             num_kv_padding_intervals=num_kv_padding_intervals,
-            allow_prefix_sharing=allow_prefix_sharing,
+            allow_block_sharing=allow_block_sharing,
         )
     # TODO: support streaming
@@ -1200,7 +1233,7 @@ class ContinuousMixin:
         generation_config: GenerationConfig | None = None,
         num_q_padding_intervals: int = 0,
         num_kv_padding_intervals: int = 0,
-        allow_prefix_sharing: bool = True,
+        allow_block_sharing: bool = True,
         record_timestamps: bool = False,
         progress_bar: bool = True,
         **kwargs,
@@ -1212,7 +1245,7 @@ class ContinuousMixin:
             generation_config: Optional generation configuration
             num_q_padding_intervals: Number of intervals used to pad the query dimension
             num_kv_padding_intervals: Number of intervals used to pad the keys/values dimension
-            allow_prefix_sharing: A flag to allow prefix sharing if the model has only full attention layers
+            allow_block_sharing: A flag to allow block sharing if the model has some full attention layers
             record_timestamps: If set to true, the requests will have a timestamp for each token generated
             progress_bar: If set to true, a progress bar will be displayed
             **kwargs: Additional generation parameters
@@ -1228,26 +1261,30 @@ class ContinuousMixin:
         # Initialize manager with the batch inputs
         results = {}
-        num_requests = len(inputs)
-        with (
-            self.continuous_batching_context_manager(
-                generation_config=generation_config,
-                num_q_cuda_graphs=num_q_padding_intervals,
-                num_kv_cuda_graphs=num_kv_padding_intervals,
-                allow_prefix_sharing=allow_prefix_sharing,
-                block=True,
-                timeout=5,
-            ) as manager,
-            logging_redirect_tqdm([logger]),
-            tqdm(
-                total=num_requests,
-                disable=(not progress_bar),
-                desc=f"Solving {num_requests} requests",
-                unit="request",
-            ) as pbar,
-        ):
+        gen_cfg = self.generation_config if generation_config is None else generation_config
+        num_requests = len(inputs) * gen_cfg.num_return_sequences
+        # Prepare context managers for the main loop
+        manager_cm = self.continuous_batching_context_manager(
+            generation_config=generation_config,
+            num_q_cuda_graphs=num_q_padding_intervals,
+            num_kv_cuda_graphs=num_kv_padding_intervals,
+            allow_block_sharing=allow_block_sharing,
+            block=True,
+            timeout=5,
+        )
+        logging_cm = logging_redirect_tqdm([logger])
+        pbar_cm = tqdm(
+            total=num_requests,
+            disable=(not progress_bar),
+            desc=f"Solving {num_requests} requests",
+            unit="request",
+        )
+        # Main loop
+        with manager_cm as manager, logging_cm, pbar_cm as pbar:
             try:
-                manager.add_requests(inputs=inputs, max_new_tokens=kwargs.get("max_new_tokens"))
+                manager.add_requests(
+                    inputs=inputs, max_new_tokens=kwargs.get("max_new_tokens"), record_timestamps=record_timestamps
+                )
                 finished_count = 0
                 while finished_count < num_requests:
                     result = manager.get_result(timeout=1)

transformers/generation/continuous_batching/requests.py CHANGED Viewed

@@ -101,6 +101,8 @@ class RequestState:
     Attributes:
         request_id (str): The ID of the generation request.
+        initial_tokens (list[int]): The initial prompt tokens.
+        num_children (int): The number of children requests
         full_prompt_ids (list[int] | None): The tokens IDs of the full prompt.
         prompt_ids (list[int] | None): The tokens IDs currently being processed.
         remaining_prompt_ids (list[int]): The tokens IDs remaining to be processed (for split requests).
@@ -121,6 +123,7 @@ class RequestState:
     initial_tokens: list[int]  # Initial prompt tokens
     # Optional fields
     record_timestamps: bool = False  # Whether to record timestamps for the generated tokens
+    num_children: int = 0  # Number of children requests
     # Internal fields
     tokens_to_process: list[int] | None = None  # Tokens IDs currently being processed
     remaining_prefill_tokens: list[int] = field(default_factory=list)  # For split requests, prefill left to process
@@ -181,7 +184,7 @@ class RequestState:
         Returns:
             bool: True if the request is now complete, False otherwise
         """
-        # Only update if we're in decoding state
+        # Only update if we're in decoding state # TODO: seems useless (always true) -- remove this
         if self.status != RequestStatus.DECODING:
             return False
@@ -227,3 +230,27 @@ class RequestState:
             error=self.error,
             timestamps=self.timestamps,
         )
+    def fork(self, new_request_id: str) -> "RequestState":
+        """Fork the request into a new request with the same state expect for request_id, created_time and lifespan."""
+        t = time.perf_counter()
+        new_request = RequestState(
+            request_id=new_request_id,
+            initial_tokens=self.initial_tokens,
+            num_children=self.num_children,
+            tokens_to_process=self.tokens_to_process[:],
+            remaining_prefill_tokens=self.remaining_prefill_tokens[:],
+            generated_tokens=self.generated_tokens[:],
+            allocated_blocks=self.allocated_blocks,
+            position_offset=self.position_offset,
+            status=self.status,
+            max_new_tokens=self.max_new_tokens,
+            eos_token_id=self.eos_token_id,
+            streaming=self.streaming,
+            created_time=t,
+            lifespan=(t, -1),
+            timestamps=None if self.timestamps is None else self.timestamps[:],
+            error=self.error,
+            record_timestamps=self.record_timestamps,
+        )
+        return new_request

transformers/generation/continuous_batching/scheduler.py CHANGED Viewed

@@ -36,6 +36,7 @@ class Scheduler(ABC):
         self.retain_cache_on_finish = retain_cache_on_finish
         self._cancellation_lock = threading.Lock()
         self._requests_to_cancel: set[str] = set()
+        self._requests_to_fork: list[RequestState] = []
     @traced
     def add_waiting_request(self, state: RequestState):
@@ -151,8 +152,13 @@ class Scheduler(ABC):
         else:
             request_tokens = state.tokens_to_process
+        # If the request has one or more children we make sure not to prefill it entrirely
+        if state.num_children > 0 and token_budget >= len(request_tokens) - 1:
+            token_budget = len(request_tokens) - 1
+            self._requests_to_fork.append(state)
+        # Case: we can process the entire prompt/remainder
         if len(request_tokens) < token_budget:
-            # Can process the entire prompt/remainder
             if state.status == RequestStatus.PENDING:
                 self.active_requests[state.request_id] = state
                 state.status = RequestStatus.PREFILLING
@@ -161,8 +167,9 @@ class Scheduler(ABC):
                 state.status = RequestStatus.PREFILLING
                 state.tokens_to_process = state.remaining_prefill_tokens
                 state.remaining_prefill_tokens = []
+        # Otherwise: we need to split the request
         else:
-            # Need to split the request
             if state.status == RequestStatus.PENDING:
                 self.active_requests[state.request_id] = state
                 state.status = RequestStatus.PREFILLING_SPLIT
@@ -229,7 +236,7 @@ class FIFOScheduler(Scheduler):
             # Update the token budget
             token_budget -= request_len
             # If using prefix sharing, we make note of the blocks that will be computed in the forward pass
-            if self.cache.use_prefix_sharing:
+            if self.cache.allow_block_sharing:
                 tokens_in_current_block = state.current_len() % self.cache.block_size
                 tokens_after_forward = tokens_in_current_block + request_len
                 complete_blocks = tokens_after_forward // self.cache.block_size
@@ -295,7 +302,7 @@ class PrefillFirstScheduler(Scheduler):
             # Update the token budget
             token_budget -= request_len
             # If using prefix sharing, we make note of the blocks that will be computed in the forward pass
-            if self.cache.use_prefix_sharing:
+            if self.cache.allow_block_sharing:
                 tokens_in_current_block = state.current_len() % self.cache.block_size
                 tokens_after_forward = tokens_in_current_block + request_len
                 complete_blocks = tokens_after_forward // self.cache.block_size

transformers/generation/stopping_criteria.py CHANGED Viewed

@@ -430,7 +430,7 @@ class StopStringCriteria(StoppingCriteria):
         initial_match = end_lengths > 0
         # Tokens continue the string if the cumsum() so far is one of the valid positions for that token
-        # Note that we're actually tracking one cumsum() for for each possible end_length
+        # Note that we're actually tracking one cumsum() for each possible end_length
         later_match = torch.any(cumsum[:, :-1, :, None] == valid_positions[:, :, :, :, None], axis=-2)
         # The match vector is a boolean vector that indicates which positions have valid tokens

transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl

transformers 5.0.0rc1py3-none-any.whl → 5.0.0rc2py3-none-any.whl