transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (671)
  1. transformers/__init__.py +20 -1
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +0 -1
  4. transformers/cache_utils.py +17 -15
  5. transformers/configuration_utils.py +114 -70
  6. transformers/conversion_mapping.py +68 -5
  7. transformers/core_model_loading.py +201 -35
  8. transformers/dependency_versions_table.py +1 -1
  9. transformers/feature_extraction_utils.py +54 -22
  10. transformers/generation/candidate_generator.py +79 -31
  11. transformers/generation/configuration_utils.py +162 -122
  12. transformers/generation/continuous_batching/cache.py +47 -18
  13. transformers/generation/continuous_batching/cache_manager.py +131 -34
  14. transformers/generation/continuous_batching/continuous_api.py +101 -64
  15. transformers/generation/continuous_batching/requests.py +28 -1
  16. transformers/generation/continuous_batching/scheduler.py +11 -4
  17. transformers/generation/stopping_criteria.py +1 -1
  18. transformers/generation/utils.py +108 -110
  19. transformers/generation/watermarking.py +8 -5
  20. transformers/image_processing_base.py +2 -12
  21. transformers/image_processing_utils_fast.py +15 -4
  22. transformers/initialization.py +37 -0
  23. transformers/integrations/__init__.py +12 -0
  24. transformers/integrations/accelerate.py +44 -111
  25. transformers/integrations/aqlm.py +3 -5
  26. transformers/integrations/awq.py +2 -5
  27. transformers/integrations/bitnet.py +5 -8
  28. transformers/integrations/bitsandbytes.py +16 -15
  29. transformers/integrations/deepspeed.py +18 -3
  30. transformers/integrations/eetq.py +3 -5
  31. transformers/integrations/fbgemm_fp8.py +1 -1
  32. transformers/integrations/finegrained_fp8.py +6 -16
  33. transformers/integrations/flash_attention.py +2 -2
  34. transformers/integrations/higgs.py +2 -5
  35. transformers/integrations/hub_kernels.py +23 -5
  36. transformers/integrations/integration_utils.py +35 -0
  37. transformers/integrations/mistral.py +12 -0
  38. transformers/integrations/moe.py +240 -0
  39. transformers/integrations/mxfp4.py +4 -10
  40. transformers/integrations/peft.py +5 -0
  41. transformers/integrations/quanto.py +5 -2
  42. transformers/integrations/spqr.py +3 -5
  43. transformers/integrations/tensor_parallel.py +167 -221
  44. transformers/integrations/vptq.py +3 -5
  45. transformers/modeling_gguf_pytorch_utils.py +66 -19
  46. transformers/modeling_rope_utils.py +78 -81
  47. transformers/modeling_utils.py +583 -503
  48. transformers/models/__init__.py +19 -0
  49. transformers/models/afmoe/modeling_afmoe.py +7 -16
  50. transformers/models/afmoe/modular_afmoe.py +5 -13
  51. transformers/models/aimv2/modeling_aimv2.py +4 -0
  52. transformers/models/aimv2/modular_aimv2.py +4 -0
  53. transformers/models/albert/modeling_albert.py +3 -0
  54. transformers/models/align/modeling_align.py +12 -6
  55. transformers/models/altclip/modeling_altclip.py +7 -3
  56. transformers/models/apertus/modeling_apertus.py +4 -2
  57. transformers/models/apertus/modular_apertus.py +4 -1
  58. transformers/models/arcee/modeling_arcee.py +1 -1
  59. transformers/models/aria/modeling_aria.py +8 -4
  60. transformers/models/aria/modular_aria.py +7 -3
  61. transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
  62. transformers/models/auto/auto_factory.py +1 -1
  63. transformers/models/auto/configuration_auto.py +27 -0
  64. transformers/models/auto/feature_extraction_auto.py +7 -3
  65. transformers/models/auto/image_processing_auto.py +4 -2
  66. transformers/models/auto/modeling_auto.py +31 -0
  67. transformers/models/auto/processing_auto.py +4 -0
  68. transformers/models/auto/tokenization_auto.py +132 -153
  69. transformers/models/auto/video_processing_auto.py +5 -2
  70. transformers/models/aya_vision/modeling_aya_vision.py +7 -3
  71. transformers/models/bamba/modeling_bamba.py +18 -19
  72. transformers/models/bamba/modular_bamba.py +17 -16
  73. transformers/models/bark/modeling_bark.py +9 -0
  74. transformers/models/bart/configuration_bart.py +0 -1
  75. transformers/models/bart/modeling_bart.py +7 -0
  76. transformers/models/beit/image_processing_beit_fast.py +0 -1
  77. transformers/models/bert/modeling_bert.py +3 -0
  78. transformers/models/bert_generation/modeling_bert_generation.py +2 -0
  79. transformers/models/big_bird/modeling_big_bird.py +3 -0
  80. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
  81. transformers/models/bit/modeling_bit.py +5 -1
  82. transformers/models/bitnet/modeling_bitnet.py +1 -1
  83. transformers/models/blenderbot/modeling_blenderbot.py +7 -0
  84. transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
  85. transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
  86. transformers/models/blip/modeling_blip.py +2 -0
  87. transformers/models/blip/modeling_blip_text.py +8 -0
  88. transformers/models/blip_2/modeling_blip_2.py +2 -0
  89. transformers/models/bloom/modeling_bloom.py +13 -44
  90. transformers/models/blt/modeling_blt.py +162 -2
  91. transformers/models/blt/modular_blt.py +168 -3
  92. transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
  93. transformers/models/bridgetower/modeling_bridgetower.py +6 -0
  94. transformers/models/bros/modeling_bros.py +8 -0
  95. transformers/models/camembert/modeling_camembert.py +109 -106
  96. transformers/models/canine/modeling_canine.py +6 -0
  97. transformers/models/canine/tokenization_canine.py +2 -0
  98. transformers/models/chameleon/modeling_chameleon.py +9 -4
  99. transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
  100. transformers/models/clap/feature_extraction_clap.py +2 -2
  101. transformers/models/clap/modeling_clap.py +25 -15
  102. transformers/models/clip/modeling_clip.py +2 -0
  103. transformers/models/clipseg/modeling_clipseg.py +4 -0
  104. transformers/models/clvp/modeling_clvp.py +14 -3
  105. transformers/models/code_llama/tokenization_code_llama.py +1 -1
  106. transformers/models/codegen/modeling_codegen.py +13 -4
  107. transformers/models/cohere/modeling_cohere.py +1 -1
  108. transformers/models/cohere2/modeling_cohere2.py +1 -1
  109. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
  110. transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
  111. transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
  112. transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
  113. transformers/models/convbert/modeling_convbert.py +3 -0
  114. transformers/models/convnext/image_processing_convnext.py +2 -2
  115. transformers/models/convnext/image_processing_convnext_fast.py +9 -13
  116. transformers/models/csm/generation_csm.py +19 -22
  117. transformers/models/csm/modeling_csm.py +3 -1
  118. transformers/models/csm/modular_csm.py +2 -0
  119. transformers/models/ctrl/modeling_ctrl.py +14 -2
  120. transformers/models/cvt/modeling_cvt.py +5 -1
  121. transformers/models/cwm/modeling_cwm.py +1 -1
  122. transformers/models/d_fine/configuration_d_fine.py +3 -4
  123. transformers/models/d_fine/modeling_d_fine.py +46 -39
  124. transformers/models/d_fine/modular_d_fine.py +15 -4
  125. transformers/models/dab_detr/configuration_dab_detr.py +2 -2
  126. transformers/models/dab_detr/modeling_dab_detr.py +1 -1
  127. transformers/models/dac/modeling_dac.py +4 -4
  128. transformers/models/data2vec/modeling_data2vec_text.py +7 -0
  129. transformers/models/data2vec/modular_data2vec_text.py +7 -0
  130. transformers/models/dbrx/configuration_dbrx.py +9 -1
  131. transformers/models/dbrx/modeling_dbrx.py +1 -1
  132. transformers/models/deberta/modeling_deberta.py +2 -0
  133. transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
  134. transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
  135. transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
  136. transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
  137. transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
  138. transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
  139. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
  140. transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
  141. transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
  142. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
  143. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
  144. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
  145. transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
  146. transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
  147. transformers/models/depth_anything/configuration_depth_anything.py +2 -3
  148. transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
  149. transformers/models/detr/configuration_detr.py +1 -1
  150. transformers/models/detr/modeling_detr.py +8 -1
  151. transformers/models/dia/generation_dia.py +3 -10
  152. transformers/models/dia/modeling_dia.py +12 -1
  153. transformers/models/dia/modular_dia.py +11 -0
  154. transformers/models/dia/processing_dia.py +1 -1
  155. transformers/models/diffllama/modeling_diffllama.py +3 -3
  156. transformers/models/diffllama/modular_diffllama.py +2 -2
  157. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
  158. transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
  159. transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
  160. transformers/models/distilbert/modeling_distilbert.py +11 -9
  161. transformers/models/doge/modeling_doge.py +1 -1
  162. transformers/models/donut/image_processing_donut_fast.py +0 -1
  163. transformers/models/donut/modeling_donut_swin.py +16 -12
  164. transformers/models/dots1/modeling_dots1.py +14 -5
  165. transformers/models/dpt/configuration_dpt.py +1 -1
  166. transformers/models/dpt/image_processing_dpt_fast.py +1 -2
  167. transformers/models/dpt/modular_dpt.py +1 -2
  168. transformers/models/edgetam/configuration_edgetam.py +1 -1
  169. transformers/models/edgetam/modeling_edgetam.py +5 -2
  170. transformers/models/edgetam/modular_edgetam.py +15 -14
  171. transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
  172. transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
  173. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
  174. transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
  175. transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
  176. transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
  177. transformers/models/efficientnet/modeling_efficientnet.py +5 -1
  178. transformers/models/electra/modeling_electra.py +7 -0
  179. transformers/models/emu3/modeling_emu3.py +8 -2
  180. transformers/models/emu3/modular_emu3.py +7 -1
  181. transformers/models/encodec/modeling_encodec.py +14 -0
  182. transformers/models/eomt/image_processing_eomt_fast.py +46 -14
  183. transformers/models/eomt/modeling_eomt.py +7 -0
  184. transformers/models/eomt/modular_eomt.py +7 -0
  185. transformers/models/ernie/modeling_ernie.py +6 -0
  186. transformers/models/ernie/modular_ernie.py +6 -0
  187. transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
  188. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
  189. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
  190. transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
  191. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
  192. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
  193. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
  194. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
  195. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
  196. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
  197. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
  198. transformers/models/esm/modeling_esm.py +6 -0
  199. transformers/models/esm/modeling_esmfold.py +6 -1
  200. transformers/models/evolla/modeling_evolla.py +9 -1
  201. transformers/models/evolla/modular_evolla.py +8 -0
  202. transformers/models/exaone4/modeling_exaone4.py +1 -1
  203. transformers/models/falcon/modeling_falcon.py +3 -3
  204. transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
  205. transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
  206. transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
  207. transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
  208. transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
  209. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
  210. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
  211. transformers/models/flaubert/modeling_flaubert.py +14 -15
  212. transformers/models/flava/image_processing_flava_fast.py +0 -2
  213. transformers/models/flava/modeling_flava.py +4 -1
  214. transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
  215. transformers/models/florence2/modeling_florence2.py +20 -3
  216. transformers/models/florence2/modular_florence2.py +13 -0
  217. transformers/models/fnet/modeling_fnet.py +7 -0
  218. transformers/models/fuyu/image_processing_fuyu.py +1 -1
  219. transformers/models/fuyu/modeling_fuyu.py +3 -1
  220. transformers/models/fuyu/processing_fuyu.py +16 -0
  221. transformers/models/gemma/modeling_gemma.py +10 -12
  222. transformers/models/gemma/modular_gemma.py +9 -11
  223. transformers/models/gemma2/modeling_gemma2.py +1 -1
  224. transformers/models/gemma2/modular_gemma2.py +1 -1
  225. transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
  226. transformers/models/gemma3/modeling_gemma3.py +28 -7
  227. transformers/models/gemma3/modular_gemma3.py +26 -6
  228. transformers/models/gemma3n/configuration_gemma3n.py +3 -0
  229. transformers/models/gemma3n/modeling_gemma3n.py +47 -9
  230. transformers/models/gemma3n/modular_gemma3n.py +51 -9
  231. transformers/models/git/modeling_git.py +181 -126
  232. transformers/models/glm/modeling_glm.py +1 -1
  233. transformers/models/glm4/modeling_glm4.py +1 -1
  234. transformers/models/glm46v/image_processing_glm46v.py +0 -4
  235. transformers/models/glm46v/modeling_glm46v.py +3 -1
  236. transformers/models/glm46v/modular_glm46v.py +3 -0
  237. transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
  238. transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
  239. transformers/models/glm4v/image_processing_glm4v.py +0 -4
  240. transformers/models/glm4v/modeling_glm4v.py +15 -5
  241. transformers/models/glm4v/modular_glm4v.py +11 -3
  242. transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
  243. transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
  244. transformers/models/glmasr/__init__.py +30 -0
  245. transformers/models/glmasr/configuration_glmasr.py +197 -0
  246. transformers/models/glmasr/modeling_glmasr.py +512 -0
  247. transformers/models/glmasr/modular_glmasr.py +433 -0
  248. transformers/models/glmasr/processing_glmasr.py +332 -0
  249. transformers/models/glpn/image_processing_glpn_fast.py +0 -1
  250. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
  251. transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
  252. transformers/models/gpt2/modeling_gpt2.py +8 -5
  253. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
  254. transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
  255. transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
  256. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
  257. transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
  258. transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
  259. transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
  260. transformers/models/gptj/modeling_gptj.py +15 -6
  261. transformers/models/granite/modeling_granite.py +1 -1
  262. transformers/models/granite_speech/modeling_granite_speech.py +15 -1
  263. transformers/models/granitemoe/modeling_granitemoe.py +2 -3
  264. transformers/models/granitemoe/modular_granitemoe.py +1 -2
  265. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
  266. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
  267. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
  268. transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
  269. transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
  270. transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
  271. transformers/models/groupvit/modeling_groupvit.py +6 -1
  272. transformers/models/helium/modeling_helium.py +1 -1
  273. transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
  274. transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
  275. transformers/models/hubert/modeling_hubert.py +4 -0
  276. transformers/models/hubert/modular_hubert.py +4 -0
  277. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
  278. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
  279. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  280. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
  281. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
  282. transformers/models/ibert/modeling_ibert.py +16 -0
  283. transformers/models/idefics/modeling_idefics.py +10 -0
  284. transformers/models/idefics2/modeling_idefics2.py +7 -1
  285. transformers/models/idefics3/modeling_idefics3.py +5 -1
  286. transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
  287. transformers/models/imagegpt/modeling_imagegpt.py +9 -2
  288. transformers/models/instructblip/modeling_instructblip.py +2 -0
  289. transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
  290. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
  291. transformers/models/internvl/modeling_internvl.py +11 -8
  292. transformers/models/internvl/modular_internvl.py +5 -9
  293. transformers/models/internvl/video_processing_internvl.py +0 -1
  294. transformers/models/jais2/__init__.py +27 -0
  295. transformers/models/jais2/configuration_jais2.py +152 -0
  296. transformers/models/jais2/modeling_jais2.py +486 -0
  297. transformers/models/jais2/modular_jais2.py +196 -0
  298. transformers/models/jamba/modeling_jamba.py +24 -19
  299. transformers/models/jamba/modular_jamba.py +17 -17
  300. transformers/models/janus/image_processing_janus_fast.py +0 -1
  301. transformers/models/janus/modeling_janus.py +15 -7
  302. transformers/models/janus/modular_janus.py +16 -7
  303. transformers/models/jetmoe/modeling_jetmoe.py +2 -2
  304. transformers/models/jetmoe/modular_jetmoe.py +1 -0
  305. transformers/models/kosmos2/modeling_kosmos2.py +14 -2
  306. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
  307. transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
  308. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
  309. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
  310. transformers/models/lasr/configuration_lasr.py +4 -0
  311. transformers/models/lasr/modeling_lasr.py +3 -2
  312. transformers/models/lasr/modular_lasr.py +8 -1
  313. transformers/models/lasr/processing_lasr.py +0 -2
  314. transformers/models/layoutlm/modeling_layoutlm.py +5 -3
  315. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
  316. transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
  317. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
  318. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
  319. transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
  320. transformers/models/led/modeling_led.py +6 -0
  321. transformers/models/levit/modeling_levit.py +18 -0
  322. transformers/models/lfm2/modeling_lfm2.py +1 -1
  323. transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
  324. transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
  325. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  326. transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
  327. transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
  328. transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
  329. transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
  330. transformers/models/lilt/modeling_lilt.py +19 -15
  331. transformers/models/llama/modeling_llama.py +1 -1
  332. transformers/models/llama4/image_processing_llama4_fast.py +1 -2
  333. transformers/models/llama4/modeling_llama4.py +8 -4
  334. transformers/models/llava/image_processing_llava_fast.py +0 -1
  335. transformers/models/llava/modeling_llava.py +12 -7
  336. transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
  337. transformers/models/llava_next/modeling_llava_next.py +7 -3
  338. transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
  339. transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
  340. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
  341. transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
  342. transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
  343. transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
  344. transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
  345. transformers/models/longt5/modeling_longt5.py +0 -4
  346. transformers/models/m2m_100/modeling_m2m_100.py +10 -0
  347. transformers/models/mamba/modeling_mamba.py +2 -1
  348. transformers/models/mamba2/modeling_mamba2.py +24 -23
  349. transformers/models/marian/configuration_marian.py +1 -1
  350. transformers/models/marian/modeling_marian.py +3 -0
  351. transformers/models/markuplm/modeling_markuplm.py +5 -8
  352. transformers/models/mask2former/configuration_mask2former.py +3 -3
  353. transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
  354. transformers/models/mask2former/modeling_mask2former.py +9 -0
  355. transformers/models/maskformer/configuration_maskformer.py +3 -3
  356. transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
  357. transformers/models/maskformer/modeling_maskformer.py +9 -1
  358. transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
  359. transformers/models/mbart/configuration_mbart.py +1 -0
  360. transformers/models/mbart/modeling_mbart.py +7 -0
  361. transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
  362. transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
  363. transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
  364. transformers/models/mimi/modeling_mimi.py +25 -4
  365. transformers/models/minimax/modeling_minimax.py +16 -3
  366. transformers/models/minimax/modular_minimax.py +12 -1
  367. transformers/models/ministral/modeling_ministral.py +1 -1
  368. transformers/models/ministral3/modeling_ministral3.py +1 -1
  369. transformers/models/mistral/modeling_mistral.py +1 -1
  370. transformers/models/mistral3/modeling_mistral3.py +10 -4
  371. transformers/models/mistral3/modular_mistral3.py +3 -1
  372. transformers/models/mixtral/modeling_mixtral.py +12 -4
  373. transformers/models/mixtral/modular_mixtral.py +6 -2
  374. transformers/models/mlcd/modeling_mlcd.py +6 -0
  375. transformers/models/mlcd/modular_mlcd.py +4 -0
  376. transformers/models/mllama/modeling_mllama.py +13 -2
  377. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
  378. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
  379. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
  380. transformers/models/mobilebert/modeling_mobilebert.py +2 -0
  381. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
  382. transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
  383. transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
  384. transformers/models/mobilevit/modeling_mobilevit.py +4 -0
  385. transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
  386. transformers/models/modernbert/modeling_modernbert.py +12 -1
  387. transformers/models/modernbert/modular_modernbert.py +12 -1
  388. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
  389. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
  390. transformers/models/moonshine/modeling_moonshine.py +1 -1
  391. transformers/models/moshi/modeling_moshi.py +21 -51
  392. transformers/models/mpnet/modeling_mpnet.py +2 -0
  393. transformers/models/mra/modeling_mra.py +4 -1
  394. transformers/models/mt5/configuration_mt5.py +2 -3
  395. transformers/models/mt5/modeling_mt5.py +0 -10
  396. transformers/models/musicgen/modeling_musicgen.py +5 -9
  397. transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
  398. transformers/models/mvp/modeling_mvp.py +7 -0
  399. transformers/models/nanochat/modeling_nanochat.py +1 -1
  400. transformers/models/nemotron/modeling_nemotron.py +3 -3
  401. transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
  402. transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
  403. transformers/models/nougat/image_processing_nougat_fast.py +0 -1
  404. transformers/models/nougat/tokenization_nougat.py +11 -16
  405. transformers/models/nystromformer/modeling_nystromformer.py +7 -0
  406. transformers/models/olmo/modeling_olmo.py +1 -1
  407. transformers/models/olmo2/modeling_olmo2.py +1 -1
  408. transformers/models/olmo3/modeling_olmo3.py +1 -1
  409. transformers/models/olmoe/modeling_olmoe.py +12 -4
  410. transformers/models/olmoe/modular_olmoe.py +4 -2
  411. transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
  412. transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
  413. transformers/models/oneformer/configuration_oneformer.py +3 -3
  414. transformers/models/oneformer/modeling_oneformer.py +7 -38
  415. transformers/models/openai/modeling_openai.py +12 -0
  416. transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
  417. transformers/models/ovis2/modeling_ovis2.py +15 -3
  418. transformers/models/ovis2/modular_ovis2.py +8 -0
  419. transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
  420. transformers/models/owlv2/modeling_owlv2.py +7 -3
  421. transformers/models/owlv2/modular_owlv2.py +0 -2
  422. transformers/models/owlvit/modeling_owlvit.py +7 -3
  423. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
  424. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
  425. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
  426. transformers/models/paligemma/modeling_paligemma.py +25 -17
  427. transformers/models/parakeet/modeling_parakeet.py +5 -0
  428. transformers/models/parakeet/modular_parakeet.py +5 -0
  429. transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
  430. transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
  431. transformers/models/patchtst/modeling_patchtst.py +5 -4
  432. transformers/models/pe_audio/__init__.py +30 -0
  433. transformers/models/pe_audio/configuration_pe_audio.py +206 -0
  434. transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
  435. transformers/models/pe_audio/modeling_pe_audio.py +820 -0
  436. transformers/models/pe_audio/modular_pe_audio.py +299 -0
  437. transformers/models/pe_audio/processing_pe_audio.py +24 -0
  438. transformers/models/pe_audio_video/__init__.py +29 -0
  439. transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
  440. transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
  441. transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
  442. transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
  443. transformers/models/pe_video/__init__.py +30 -0
  444. transformers/models/pe_video/configuration_pe_video.py +211 -0
  445. transformers/models/pe_video/modeling_pe_video.py +636 -0
  446. transformers/models/pe_video/modular_pe_video.py +219 -0
  447. transformers/models/pe_video/processing_pe_video.py +10 -0
  448. transformers/models/pe_video/video_processing_pe_video.py +66 -0
  449. transformers/models/pegasus/configuration_pegasus.py +1 -0
  450. transformers/models/pegasus/modeling_pegasus.py +3 -0
  451. transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
  452. transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
  453. transformers/models/perceiver/modeling_perceiver.py +5 -1
  454. transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
  455. transformers/models/perception_lm/modeling_perception_lm.py +7 -3
  456. transformers/models/perception_lm/modular_perception_lm.py +7 -3
  457. transformers/models/persimmon/modeling_persimmon.py +1 -1
  458. transformers/models/phi/modeling_phi.py +1 -1
  459. transformers/models/phi3/modeling_phi3.py +1 -1
  460. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
  461. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
  462. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
  463. transformers/models/phimoe/modeling_phimoe.py +12 -4
  464. transformers/models/phimoe/modular_phimoe.py +1 -1
  465. transformers/models/pix2struct/processing_pix2struct.py +0 -4
  466. transformers/models/pixio/__init__.py +30 -0
  467. transformers/models/pixio/configuration_pixio.py +151 -0
  468. transformers/models/pixio/modeling_pixio.py +507 -0
  469. transformers/models/pixio/modular_pixio.py +404 -0
  470. transformers/models/pixtral/modeling_pixtral.py +1 -1
  471. transformers/models/pixtral/processing_pixtral.py +3 -1
  472. transformers/models/plbart/configuration_plbart.py +1 -0
  473. transformers/models/plbart/modeling_plbart.py +7 -0
  474. transformers/models/plbart/modular_plbart.py +6 -0
  475. transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
  476. transformers/models/poolformer/modeling_poolformer.py +11 -1
  477. transformers/models/pop2piano/configuration_pop2piano.py +0 -1
  478. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
  479. transformers/models/prophetnet/modeling_prophetnet.py +2 -1
  480. transformers/models/qwen2/modeling_qwen2.py +1 -1
  481. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
  482. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
  483. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
  484. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
  485. transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
  486. transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
  487. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
  488. transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
  489. transformers/models/qwen3/modeling_qwen3.py +1 -1
  490. transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
  491. transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
  492. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
  493. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
  494. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
  495. transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
  496. transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
  497. transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
  498. transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
  499. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
  500. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
  501. transformers/models/rag/configuration_rag.py +0 -8
  502. transformers/models/rag/modeling_rag.py +7 -9
  503. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
  504. transformers/models/reformer/modeling_reformer.py +9 -1
  505. transformers/models/regnet/modeling_regnet.py +4 -0
  506. transformers/models/rembert/modeling_rembert.py +7 -1
  507. transformers/models/resnet/modeling_resnet.py +8 -3
  508. transformers/models/roberta/modeling_roberta.py +3 -0
  509. transformers/models/roberta/modular_roberta.py +3 -0
  510. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
  511. transformers/models/roc_bert/modeling_roc_bert.py +3 -0
  512. transformers/models/rt_detr/configuration_rt_detr.py +1 -1
  513. transformers/models/rt_detr/modeling_rt_detr.py +4 -0
  514. transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
  515. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
  516. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
  517. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
  518. transformers/models/rwkv/modeling_rwkv.py +1 -1
  519. transformers/models/sam/configuration_sam.py +1 -0
  520. transformers/models/sam/image_processing_sam_fast.py +0 -1
  521. transformers/models/sam/modeling_sam.py +4 -1
  522. transformers/models/sam2/configuration_sam2.py +1 -1
  523. transformers/models/sam2/modeling_sam2.py +5 -1
  524. transformers/models/sam2/modular_sam2.py +5 -1
  525. transformers/models/sam2_video/modeling_sam2_video.py +51 -43
  526. transformers/models/sam2_video/modular_sam2_video.py +31 -18
  527. transformers/models/sam3/configuration_sam3.py +21 -1
  528. transformers/models/sam3/modeling_sam3.py +23 -0
  529. transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
  530. transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
  531. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
  532. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
  533. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
  534. transformers/models/sam3_video/configuration_sam3_video.py +14 -0
  535. transformers/models/sam3_video/modeling_sam3_video.py +3 -3
  536. transformers/models/sam3_video/processing_sam3_video.py +1 -1
  537. transformers/models/sam_hq/configuration_sam_hq.py +1 -0
  538. transformers/models/sam_hq/modeling_sam_hq.py +26 -23
  539. transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
  540. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
  541. transformers/models/seed_oss/modeling_seed_oss.py +1 -1
  542. transformers/models/segformer/image_processing_segformer_fast.py +0 -1
  543. transformers/models/segformer/modeling_segformer.py +2 -2
  544. transformers/models/segformer/modular_segformer.py +0 -1
  545. transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
  546. transformers/models/siglip/modeling_siglip.py +24 -2
  547. transformers/models/siglip2/modeling_siglip2.py +63 -41
  548. transformers/models/smollm3/modeling_smollm3.py +1 -1
  549. transformers/models/smolvlm/modeling_smolvlm.py +5 -1
  550. transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
  551. transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
  552. transformers/models/speecht5/modeling_speecht5.py +28 -0
  553. transformers/models/splinter/modeling_splinter.py +9 -3
  554. transformers/models/squeezebert/modeling_squeezebert.py +2 -0
  555. transformers/models/stablelm/modeling_stablelm.py +1 -1
  556. transformers/models/starcoder2/modeling_starcoder2.py +1 -1
  557. transformers/models/superglue/image_processing_superglue_fast.py +1 -2
  558. transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
  559. transformers/models/swiftformer/modeling_swiftformer.py +4 -0
  560. transformers/models/swin/modeling_swin.py +16 -12
  561. transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
  562. transformers/models/swin2sr/modeling_swin2sr.py +49 -33
  563. transformers/models/swinv2/modeling_swinv2.py +41 -33
  564. transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
  565. transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
  566. transformers/models/t5/configuration_t5.py +7 -1
  567. transformers/models/t5/modeling_t5.py +1 -7
  568. transformers/models/t5gemma/modeling_t5gemma.py +1 -1
  569. transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
  570. transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
  571. transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
  572. transformers/models/table_transformer/configuration_table_transformer.py +1 -1
  573. transformers/models/table_transformer/modeling_table_transformer.py +1 -1
  574. transformers/models/textnet/image_processing_textnet_fast.py +0 -1
  575. transformers/models/timesfm/modeling_timesfm.py +12 -0
  576. transformers/models/timesfm/modular_timesfm.py +12 -0
  577. transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
  578. transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
  579. transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
  580. transformers/models/trocr/modeling_trocr.py +1 -2
  581. transformers/models/tvp/configuration_tvp.py +5 -1
  582. transformers/models/tvp/modeling_tvp.py +4 -4
  583. transformers/models/udop/configuration_udop.py +1 -0
  584. transformers/models/udop/modeling_udop.py +3 -7
  585. transformers/models/umt5/configuration_umt5.py +2 -2
  586. transformers/models/umt5/modeling_umt5.py +0 -6
  587. transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
  588. transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
  589. transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
  590. transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
  591. transformers/models/video_llava/modeling_video_llava.py +7 -3
  592. transformers/models/vilt/configuration_vilt.py +2 -2
  593. transformers/models/vilt/modeling_vilt.py +7 -0
  594. transformers/models/vipllava/modeling_vipllava.py +7 -3
  595. transformers/models/visual_bert/modeling_visual_bert.py +2 -0
  596. transformers/models/vitmatte/configuration_vitmatte.py +1 -1
  597. transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
  598. transformers/models/vitmatte/modeling_vitmatte.py +4 -0
  599. transformers/models/vitpose/configuration_vitpose.py +1 -1
  600. transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
  601. transformers/models/voxtral/modeling_voxtral.py +2 -2
  602. transformers/models/voxtral/modular_voxtral.py +2 -2
  603. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
  604. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
  605. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
  606. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
  607. transformers/models/whisper/generation_whisper.py +1 -0
  608. transformers/models/whisper/modeling_whisper.py +5 -3
  609. transformers/models/x_clip/modeling_x_clip.py +2 -0
  610. transformers/models/xcodec/modeling_xcodec.py +5 -0
  611. transformers/models/xglm/modeling_xglm.py +10 -0
  612. transformers/models/xlm/modeling_xlm.py +13 -14
  613. transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
  614. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
  615. transformers/models/xlnet/modeling_xlnet.py +3 -1
  616. transformers/models/xmod/modeling_xmod.py +3 -0
  617. transformers/models/yoso/modeling_yoso.py +4 -1
  618. transformers/models/zamba/modeling_zamba.py +2 -1
  619. transformers/models/zamba2/modeling_zamba2.py +3 -2
  620. transformers/models/zoedepth/configuration_zoedepth.py +1 -1
  621. transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
  622. transformers/models/zoedepth/modeling_zoedepth.py +7 -0
  623. transformers/pipelines/__init__.py +9 -6
  624. transformers/pipelines/automatic_speech_recognition.py +20 -12
  625. transformers/pipelines/base.py +1 -1
  626. transformers/pipelines/document_question_answering.py +1 -1
  627. transformers/pipelines/question_answering.py +1 -1
  628. transformers/pipelines/text_to_audio.py +2 -2
  629. transformers/processing_utils.py +127 -56
  630. transformers/quantizers/auto.py +2 -4
  631. transformers/quantizers/base.py +9 -64
  632. transformers/quantizers/quantizer_aqlm.py +1 -18
  633. transformers/quantizers/quantizer_auto_round.py +1 -10
  634. transformers/quantizers/quantizer_awq.py +3 -8
  635. transformers/quantizers/quantizer_bitnet.py +1 -6
  636. transformers/quantizers/quantizer_bnb_4bit.py +9 -49
  637. transformers/quantizers/quantizer_bnb_8bit.py +9 -19
  638. transformers/quantizers/quantizer_compressed_tensors.py +1 -4
  639. transformers/quantizers/quantizer_eetq.py +2 -12
  640. transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
  641. transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
  642. transformers/quantizers/quantizer_fp_quant.py +4 -4
  643. transformers/quantizers/quantizer_gptq.py +1 -4
  644. transformers/quantizers/quantizer_higgs.py +2 -6
  645. transformers/quantizers/quantizer_mxfp4.py +2 -28
  646. transformers/quantizers/quantizer_quanto.py +14 -14
  647. transformers/quantizers/quantizer_spqr.py +3 -8
  648. transformers/quantizers/quantizer_torchao.py +28 -124
  649. transformers/quantizers/quantizer_vptq.py +1 -10
  650. transformers/testing_utils.py +28 -12
  651. transformers/tokenization_mistral_common.py +3 -2
  652. transformers/tokenization_utils_base.py +3 -2
  653. transformers/tokenization_utils_tokenizers.py +25 -2
  654. transformers/trainer.py +24 -2
  655. transformers/trainer_callback.py +8 -0
  656. transformers/trainer_seq2seq.py +4 -0
  657. transformers/training_args.py +8 -10
  658. transformers/utils/__init__.py +4 -0
  659. transformers/utils/attention_visualizer.py +4 -4
  660. transformers/utils/auto_docstring.py +34 -25
  661. transformers/utils/generic.py +20 -0
  662. transformers/utils/import_utils.py +51 -9
  663. transformers/utils/kernel_config.py +71 -18
  664. transformers/utils/quantization_config.py +8 -8
  665. transformers/video_processing_utils.py +16 -12
  666. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
  667. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
  668. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
  669. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
  670. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
  671. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
@@ -121,7 +121,7 @@ class PagedAttentionCache:
121
121
  device: torch.device,
122
122
  dtype: torch.dtype = torch.float16,
123
123
  tp_size: int | None = None,
124
- allow_prefix_sharing: bool = True,
124
+ allow_block_sharing: bool = True,
125
125
  ) -> None:
126
126
  """Initialize a paged attention cache for efficient memory usage. Also turns in prefix sharing if the model has
127
127
  only full attention layers.
@@ -132,7 +132,8 @@ class PagedAttentionCache:
132
132
  device: Device for the cache tensors
133
133
  dtype: Data type of the cache
134
134
  tp_size: Tensor parallelism size
135
- allow_prefix_sharing: A flag to allow prefix sharing if the model has only full attention layers.
135
+ allow_block_sharing: A flag to allow block sharing. If the model has some full attention layers, then prefix
136
+ sharing is enabled as well.
136
137
  """
137
138
  self.config = config
138
139
  self.dtype = dtype
@@ -209,7 +210,7 @@ class PagedAttentionCache:
209
210
  self.key_cache: list[torch.Tensor] = []
210
211
  self.value_cache: list[torch.Tensor] = []
211
212
  # We add two extra tokens to the cache to handle padding and generally discard unwanted tokens
212
- self.cache_shape = (num_blocks * self.block_size + 2, self.num_key_value_heads, self.head_dim)
213
+ self.cache_shape = ((num_blocks + 2) * self.block_size, self.num_key_value_heads, self.head_dim)
213
214
  for _ in range(group_size):
214
215
  new_layer_key_cache = torch.empty(self.cache_shape, dtype=self.dtype, device=self.device)
215
216
  new_layer_value_cache = torch.empty(self.cache_shape, dtype=self.dtype, device=self.device)
@@ -220,19 +221,20 @@ class PagedAttentionCache:
220
221
  logger.info(f"{self.cache_shape = } {self.key_cache[0].shape = } {self.key_cache[0].numel() = }")
221
222
 
222
223
  # Block management data structures
224
+ self.allow_block_sharing = allow_block_sharing
223
225
  self.group_cache_managers: list[CacheAllocator] = []
224
226
  for i, group_type in enumerate(group_types):
225
227
  if group_type == "full_attention":
226
- cm = FullAttentionCacheAllocator(i, self.block_size)
228
+ cm = FullAttentionCacheAllocator(i, self.block_size, allow_block_sharing=allow_block_sharing)
227
229
  elif group_type == "sliding_attention":
228
230
  cm = SlidingAttentionCacheAllocator(i, self.block_size, config.sliding_window)
229
231
  else:
230
232
  raise ValueError(f"Invalid group type: {group_type}")
231
233
  self.group_cache_managers.append(cm)
232
234
 
233
- # We only use prefix sharing if the whole model has only full attention layers
234
- self.use_prefix_sharing = allow_prefix_sharing and group_types == ["full_attention"]
235
- self._block_manager = BlockManager(num_blocks, self.block_size, self.use_prefix_sharing)
235
+ # We only use prefix sharing if the whole model has only full attention layers and block sharing is allowed
236
+ self.use_prefix_sharing = allow_block_sharing and group_types == ["full_attention"]
237
+ self._block_manager = BlockManager(num_blocks, self.block_size)
236
238
  self.blocks_to_complete: dict[str, int] = {}
237
239
  self._total_prefix_length: int = 0 # a counter to measure the impact of prefix sharing, also used in tests
238
240
 
@@ -352,7 +354,8 @@ class PagedAttentionCache:
352
354
  allocated_blocks = []
353
355
  for b in range(len(prompt_ids) // self.block_size):
354
356
  tokens = prompt_ids[b * self.block_size : (b + 1) * self.block_size]
355
- current_hash = self._block_manager.compute_hash(current_hash, tokens)
357
+ # Prefix sharing is only supported when there is only one full attention layer group, so group_id=0.
358
+ current_hash = self._block_manager.compute_hash(current_hash, tokens, group_id=0)
356
359
  block_id = self._block_manager._hash_to_id.get(current_hash)
357
360
  if block_id is not None:
358
361
  allocated_blocks.append(block_id)
@@ -369,18 +372,44 @@ class PagedAttentionCache:
369
372
  self._total_prefix_length += prefix_length
370
373
  return prefix_length
371
374
 
372
- def mark_blocks_as_complete(self, state: RequestState) -> None:
373
- """Marks the blocks that have been computed in the forward pass as complete. If prefix sharing is off, this is
374
- a no-op."""
375
- num_complete_blocks = 0 if not self.use_prefix_sharing else self.blocks_to_complete.pop(state.request_id)
375
+ def mark_shareable_blocks_as_complete(self, state: RequestState) -> None:
376
+ """Marks the blocks allocated to a request (state) as complete if they are shareable and they have been computed
377
+ in the forward pass. A complete block is a block where the KV cache has been fully computed: if the block has
378
+ enough space to hold the cache for N tokens, the block is marked as complete when the cache data is present for
379
+ the N tokens. If block sharing is off, this is a no-op."""
380
+ num_complete_blocks = 0 if not self.allow_block_sharing else self.blocks_to_complete.pop(state.request_id)
376
381
  if num_complete_blocks == 0:
377
382
  return None
378
- cm = self.group_cache_managers[0] # if prefix sharing is on, there is only one group
379
- self._block_manager.mark_blocks_as_complete(
380
- num_complete_blocks=num_complete_blocks,
381
- allocated_blocks=cm.block_table[state.request_id],
382
- prompt_ids=(state.initial_tokens + state.generated_tokens),
383
- )
383
+ for cm in self.group_cache_managers:
384
+ if cm.uses_block_sharing:
385
+ self._block_manager.mark_shareable_blocks_as_complete(
386
+ num_complete_blocks=num_complete_blocks,
387
+ allocated_blocks=cm.block_table[state.request_id],
388
+ prompt_ids=(state.initial_tokens + state.generated_tokens),
389
+ )
390
+
391
+ def copy_cache(self, source_blocks: list[int], forked_blocks: list[int]) -> None:
392
+ """Copy the cache from the source blocks to the forked blocks."""
393
+ source_blocks = torch.tensor(source_blocks, device=self.device, dtype=torch.int32)
394
+ forked_blocks = torch.tensor(forked_blocks, device=self.device, dtype=torch.int32)
395
+ for key_cache, value_cache in zip(self.key_cache, self.value_cache):
396
+ key_cache = key_cache.view(-1, self.block_size, self.num_key_value_heads, self.head_dim)
397
+ value_cache = value_cache.view(-1, self.block_size, self.num_key_value_heads, self.head_dim)
398
+ key_cache[forked_blocks] = key_cache[source_blocks]
399
+ value_cache[forked_blocks] = value_cache[source_blocks]
400
+ # FIXME: consolidate the cache into a single tensor of shape (group_size, 2, *self.k_or_v_cache_shape)
401
+ # This will allow for better .update and a single copy instead of one per cache tensor
402
+
403
+ def fork_request(self, source_request_id: str, destination_request_ids: list[str]) -> tuple[list[int], list[int]]:
404
+ """Fork the cache of a request (state) into the one of a list of requests with the given (dst_request_ids)."""
405
+ # These lists will be the accumulators for the source and destination blocks for the cache copy
406
+ source_blocks, destination_blocks = [], []
407
+ # Main fork loop
408
+ for cm in self.group_cache_managers:
409
+ src_blocks, dst_blocks = cm.fork_blocks(source_request_id, destination_request_ids, self._block_manager)
410
+ source_blocks.extend(src_blocks)
411
+ destination_blocks.extend(dst_blocks)
412
+ return source_blocks, destination_blocks
384
413
 
385
414
 
386
415
  # TODO: rework computation with the groups and their sizes
@@ -31,20 +31,21 @@ def reverse_enumerate(xs: list[T]) -> Iterator[tuple[int, T]]:
31
31
  index -= 1
32
32
 
33
33
 
34
- class Block:
34
+ class Block: # TODO: rename to ShareableBlock and update the docs
35
35
  """A class to represent a block managed by the block manager. We say that a block is complete when the physical KV
36
36
  cache it points to is fully computed. A block can have a parent, which is the block that came before in the
37
- sequence. Once a block is complete, it is given a hash, which takes into account the tokens ids of the block and
38
- its parent's hash (if there is a parent)."""
37
+ sequence. Once a block is complete, it is given a hash, which takes into account the tokens ids of the block, the
38
+ layer (group_id) it belong to and its parent's hash (if there is a parent)."""
39
39
 
40
- def __init__(self, id_: int, parent_id: int | None) -> None:
40
+ def __init__(self, id_: int, parent_id: int | None, group_id: int) -> None:
41
41
  self.id: int = id_
42
42
  self.parent_id: int | None = parent_id
43
+ self.group_id: int = group_id
43
44
  self.hash: int | None = None
44
45
  self.ref_count: int = 1
45
46
 
46
47
  def __repr__(self) -> str:
47
- return f"Block(id={self.id}, parent_id={self.parent_id}, hash={self.hash}, ref_count={self.ref_count})"
48
+ return f"Block(id={self.id}, parent_id={self.parent_id}, group_id={self.group_id}, hash={self.hash}, ref_count={self.ref_count})"
48
49
 
49
50
  @property
50
51
  def is_complete(self) -> bool:
@@ -52,8 +53,9 @@ class Block:
52
53
 
53
54
 
54
55
  class BlockManager:
55
- """A class to manage the number of free blocks and block re-use. If prefix sharing is off, the block manager is a
56
- simple FIFO structure where blocks are either free or in use. If prefix sharing is on, blocks can have 3 states:
56
+ """A class to manage the number of free blocks and block re-use. When a block becomes in use, a flag is passed to
57
+ determine if the block is shareable or not. If it is, then a Block object is created and kept track of internally.
58
+ It can have the following states:
57
59
  - in use: one or more requests references this block, thus it cannot be written over. The number of requests
58
60
  referencing this block is stored as ref_count in the Block object.
59
61
  - un-initialized: the block points to a space in the KV cache tensor that contains no data yet. Those blocks can
@@ -63,19 +65,19 @@ class BlockManager:
63
65
  the ref_count of the block and remove it from the list of initialized blocks, because it is now in use.
64
66
  Still, the block can be freed if no un-initialized blocks are left. In that case, we remove its hash from the
65
67
  hash table.
68
+ If the block is not shareable, we just use the block manager as a FIFO structure where blocks are either free or in
69
+ use. Sharability is determined by the type of cache allocator: blocks created for full attention layers are
70
+ shareable, while blocks created for sliding window attention layers are not.
66
71
  There is no structure to keep track of the blocks in use: if a block is neither un-initialized nor initialized,
67
72
  it is in use.
68
73
  """
69
74
 
70
- def __init__(self, num_blocks: int, block_size: int, use_prefix_sharing: bool) -> None:
71
- """Initializes the block manager with a given number of blocks (num_blocks) of size (block_size). Prefix sharing
72
- can be turned on with the (use_prefix_sharing) flag, which only happens if the model has only full attention
73
- layers."""
75
+ def __init__(self, num_blocks: int, block_size: int) -> None:
76
+ """Initializes the block manager with a given number of blocks (num_blocks) of size (block_size)."""
74
77
  self.num_blocks = num_blocks
75
78
  self.block_size = block_size
76
79
  self._uninit_block_ids = deque(range(num_blocks))
77
80
  self._init_block_ids: dict[int, None] = {} # effectively act as an ordered set
78
- self._use_prefix_sharing = use_prefix_sharing
79
81
  self._hash_to_id: dict[int, int] = {}
80
82
  self._id_to_block: dict[int, Block] = {}
81
83
 
@@ -102,22 +104,81 @@ class BlockManager:
102
104
  self._uninit_block_ids.append(id_to_uninitialize)
103
105
  return True
104
106
 
105
- def get_free_blocks(self, n_blocks: int, last_block_id: int | None) -> list[int] | None:
106
- """Returns a list of (n_blocks) free block and mark them as no longuer free in the internal data structures. One
107
- can also pass a (last_block_id) to indicate the last block id in the sequence, which is used to keep track of
108
- the parent block. If the manager cannot find enough free blocks, it returns None."""
107
+ def get_free_blocks(
108
+ self, n_blocks: int, last_block_id: int | None, shareable: bool, group_id: int
109
+ ) -> list[int] | None:
110
+ """Returns a list of (n_blocks) free block and mark them as no longuer free in the internal data structures.
111
+ If the (shareable) flag is set to True, a Block object is created to keep track of the block, with the
112
+ (last_block_id) to indicate the last block id in the sequence, also named the parent block. If the manager
113
+ cannot find enough free blocks, it returns None."""
109
114
  if not self.has_enough_free_blocks(n_blocks):
110
115
  return None
111
116
  allocated_block_ids = [self._uninit_block_ids.popleft() for _ in range(n_blocks)]
112
- # If we use prefix caching, we keep track of the allocated blocks as partial blocks
113
- if self._use_prefix_sharing:
117
+ # If the block is shareable, we keep track of the allocated blocks as partial blocks
118
+ if shareable:
114
119
  for block_id in allocated_block_ids:
115
- block = Block(block_id, last_block_id)
120
+ block = Block(block_id, last_block_id, group_id)
116
121
  self._id_to_block[block_id] = block
117
122
  last_block_id = block_id
118
123
  # In both cases, we return the allocated block ids
119
124
  return allocated_block_ids
120
125
 
126
+ def fork_blocks(
127
+ self, parent_blocks: list[int], num_forks: int, shareable: bool, group_id: int
128
+ ) -> tuple[list[list[int]], list[int], list[int]]:
129
+ """Fork a given list of (parent_blocks) as many times as (num_forks). If the blocks are (shareable), we use
130
+ reference on the blocks that are complete. Otherwise, we allocate new blocks and keep track of their indices to
131
+ later copy the physical cache. For instance, when forking 4 blocks for 2 children:
132
+
133
+ Parent blocks: [0, 1, 2, 3], with all blocks being complete except the last one (block 3).
134
+
135
+ ----------------------------------------- IF BLOCKS ARE NOT SHAREABLE -----------------------------------------
136
+
137
+ Forked blocks lists: [[5, 6, 7, 8], [9, 10, 11, 12]]
138
+ Copy source: [0, 1, 2, 3, 0, 1, 2, 3]
139
+ ↓ ↓ ↓ ↓ ↓ ↓ ↓ ↓
140
+ Copy destination: [5, 6, 7, 8, 9, 10, 11, 12] → 8 blocks are newly allocated and copied
141
+
142
+ ----------------------------------------- IF BLOCKS ARE SHAREABLE ---------------------------------------------
143
+
144
+ Forked blocks lists: [[0, 1, 2, 5], [0, 1, 2, 6]]
145
+ Copy source: [ 3, 3] (block 3 is not complete so it's copied, not referenced)
146
+ ↓ ↓
147
+ Copy destination: [ 5, 6] → only 2 blocks are newly allocated and copied
148
+ """
149
+ # First phase: reference all complete blocks
150
+ forked_by_reference = []
151
+
152
+ if shareable:
153
+ for block_id in parent_blocks:
154
+ block = self._id_to_block[block_id]
155
+ if block.is_complete:
156
+ forked_by_reference.append(block.id)
157
+ block.ref_count += num_forks
158
+ else:
159
+ break
160
+
161
+ # Early return if we have forked all blocks by reference
162
+ blocks_to_copy = len(parent_blocks) - len(forked_by_reference)
163
+ if blocks_to_copy == 0:
164
+ return [forked_by_reference[:] for _ in range(num_forks)], [], []
165
+
166
+ # From now on, each child will have its own list of blocks
167
+ forked_blocks_lists = []
168
+ copy_src = []
169
+ copy_dst = []
170
+
171
+ # Second phase: allocate new blocks if needed
172
+ parent_id = forked_by_reference[-1] if forked_by_reference else None
173
+ for _ in range(num_forks):
174
+ allocated_block_ids = self.get_free_blocks(blocks_to_copy, parent_id, shareable, group_id)
175
+ if allocated_block_ids is None:
176
+ return None, [], []
177
+ forked_blocks_lists.append(forked_by_reference + allocated_block_ids)
178
+ copy_src.extend(parent_blocks[-blocks_to_copy:])
179
+ copy_dst.extend(allocated_block_ids)
180
+ return forked_blocks_lists, copy_src, copy_dst
181
+
121
182
  def increase_ref_count(self, block_id: int) -> None:
122
183
  """Increases the reference count of a given (block_id)."""
123
184
  block = self._id_to_block[block_id]
@@ -137,23 +198,23 @@ class BlockManager:
137
198
  self._id_to_block.pop(block_id)
138
199
  self._uninit_block_ids.append(block_id)
139
200
 
140
- def free_blocks(self, blocks: list[int]) -> None:
141
- """Marks a list of (blocks) as free. If there is no prefix sharing, we simply add them to the uninitialized
201
+ def free_blocks(self, blocks: list[int], shareable: bool) -> None:
202
+ """Marks a list of (blocks) as free. If the blocks were not (shareable), we simply add them to the uninitialized
142
203
  blocks queue. Otherwise, their new state depends on whether they are complete."""
143
- if self._use_prefix_sharing:
204
+ if shareable:
144
205
  for block_id in blocks:
145
206
  self.decrease_ref_count(block_id)
146
207
  else:
147
208
  self._uninit_block_ids.extend(blocks)
148
209
 
149
- def mark_blocks_as_complete(
210
+ def mark_shareable_blocks_as_complete(
150
211
  self, num_complete_blocks: int, allocated_blocks: list[int], prompt_ids: list[int]
151
212
  ) -> None:
152
213
  """Among the list of (allocated_blocks), mark (num_complete_blocks) incomplete blocks as now complete. The list
153
214
  of (prompt_ids) is used to compute the hash of the new block."""
154
215
  # Look for the first complete block, starting from the last block in the sequence
155
216
  parent_hash = None
156
- incomplete_blocks: list[Block] = []
217
+ incomplete_blocks: list[tuple[int, Block]] = []
157
218
  for i, block_id in reverse_enumerate(allocated_blocks):
158
219
  block = self._id_to_block[block_id]
159
220
  if block.is_complete:
@@ -178,7 +239,7 @@ class BlockManager:
178
239
  # Otherwise, we compute the hash
179
240
  num_complete_blocks -= 1
180
241
  tokens = prompt_ids[i * self.block_size : (i + 1) * self.block_size]
181
- block.hash = self.compute_hash(parent_hash, tokens)
242
+ block.hash = self.compute_hash(parent_hash, tokens, block.group_id)
182
243
 
183
244
  existing_block_id = self._hash_to_id.get(block.hash)
184
245
  # If the block hash is already in the hash to id mapping, we reference the existing block instead
@@ -187,19 +248,20 @@ class BlockManager:
187
248
  allocated_blocks[i] = existing_block_id
188
249
  self._id_to_block[existing_block_id].ref_count += 1
189
250
  new_parent_id = existing_block_id
190
- self.free_blocks([block.id])
251
+ self.free_blocks([block.id], shareable=True)
191
252
 
192
253
  # Otherwise, we add the completed block to the hash table
193
254
  else:
255
+ logger.debug(f"Adding new block {block.id} (group {block.group_id}) with hash {block.hash}")
194
256
  self._hash_to_id[block.hash] = block.id
195
257
 
196
258
  # Update loop variables
197
259
  parent_hash = block.hash
198
260
 
199
- def compute_hash(self, parent_hash: int | None, tokens: list[int]) -> int:
200
- """Computes the hash of a block containing the given (tokens) with a given (parent_hash). If the block has no
201
- parent, the parent hash is None."""
202
- return hash((parent_hash, tuple(tokens)))
261
+ def compute_hash(self, parent_hash: int | None, tokens: list[int], group_id: int) -> int:
262
+ """Computes the hash of a block identified by the (tokens) it contains, its (parent_hash) and the layer
263
+ (group_id) it belong to. If the block has no parent, the parent hash is None."""
264
+ return hash((parent_hash, tuple(tokens), group_id))
203
265
 
204
266
 
205
267
  class CacheAllocator(ABC):
@@ -208,6 +270,7 @@ class CacheAllocator(ABC):
208
270
 
209
271
  _index: int
210
272
  block_table: dict[str, list[int]] # request_id -> list of block_ids allocated to the request
273
+ uses_block_sharing: bool # flag to determine if the blocks are shareable
211
274
 
212
275
  @abstractmethod
213
276
  def allocate_blocks(self, n_blocks: int, request_id: str, block_manager: BlockManager) -> int | None:
@@ -218,7 +281,7 @@ class CacheAllocator(ABC):
218
281
  """Frees all blocks associated with a (request_id) using the (block_manager)."""
219
282
  if request_id in self.block_table:
220
283
  blocks_to_free = self.block_table.pop(request_id)
221
- block_manager.free_blocks(blocks_to_free)
284
+ block_manager.free_blocks(blocks_to_free, shareable=self.uses_block_sharing)
222
285
  else:
223
286
  logger.warning(
224
287
  f"CacheAllocator {self._index} attempted to free blocks for non-existent request_id: {request_id}"
@@ -236,17 +299,48 @@ class CacheAllocator(ABC):
236
299
  def get_seqlens_k(self, request_id: str, past_length: int, query_length: int) -> tuple[str, int]:
237
300
  """Returns the attention type of the cache allocator and the key sequence length for the given request_id."""
238
301
 
302
+ def fork_blocks(
303
+ self, parent_request_id: str, children_request_ids: list[str], block_manager: BlockManager
304
+ ) -> tuple[list[int], list[int]]:
305
+ """Forks the cache blocks of a (parent_request_id) to a list of (children_request_ids). To manage the blocks,
306
+ the (block_manager) is used. When forking, the child's block are either shared with the parent, or they need to
307
+ be copied from the parent. Hence we return two lists of blocks that need to be copied: one for the source and
308
+ one for the destination."""
309
+
310
+ # Sanity checks
311
+ if parent_request_id not in self.block_table:
312
+ raise ValueError(f"No block table found for request {parent_request_id}")
313
+
314
+ # Actual forking
315
+ parent_blocks = self.block_table[parent_request_id]
316
+ list_forked_blocks, copy_src, copy_dst = block_manager.fork_blocks(
317
+ parent_blocks=parent_blocks,
318
+ num_forks=len(children_request_ids),
319
+ shareable=self.uses_block_sharing,
320
+ group_id=self._index,
321
+ )
322
+ if list_forked_blocks is None:
323
+ raise ValueError(f"Failed to fork blocks for request {parent_request_id}")
324
+
325
+ # Update the block table for all children requests
326
+ for children_request_id, forked_blocks in zip(children_request_ids, list_forked_blocks):
327
+ if children_request_id in self.block_table:
328
+ raise ValueError(f"Block table already exists for request {children_request_id}")
329
+ self.block_table[children_request_id] = forked_blocks
330
+ return copy_src, copy_dst
331
+
239
332
 
240
333
  class FullAttentionCacheAllocator(CacheAllocator):
241
334
  """Cache manager for a group of full attention layers."""
242
335
 
243
- def __init__(self, index: int, block_size: int) -> None:
336
+ def __init__(self, index: int, block_size: int, allow_block_sharing: bool) -> None:
244
337
  """Initializes the cache manager for a group of full attention layers.
245
338
  Args:
246
339
  - index: the index of the associated layer group
247
340
  - block_size: the size of the blocks in the cache
248
341
  """
249
342
  self._index = index
343
+ self.uses_block_sharing = allow_block_sharing
250
344
  self.block_size = block_size
251
345
  self.block_table = {}
252
346
 
@@ -261,7 +355,7 @@ class FullAttentionCacheAllocator(CacheAllocator):
261
355
  else:
262
356
  last_block_id = self.block_table[request_id][-1]
263
357
  # Actual allocation, return early if failed
264
- allocated_blocks = block_manager.get_free_blocks(n_blocks, last_block_id)
358
+ allocated_blocks = block_manager.get_free_blocks(n_blocks, last_block_id, self.uses_block_sharing, self._index)
265
359
  if allocated_blocks is None:
266
360
  return None
267
361
  self.block_table[request_id].extend(allocated_blocks)
@@ -315,6 +409,7 @@ class SlidingAttentionCacheAllocator(CacheAllocator):
315
409
  - sliding_window: the size of the sliding window
316
410
  """
317
411
  self._index = index
412
+ self.uses_block_sharing = False
318
413
  self.block_size = block_size
319
414
  self.sliding_window = sliding_window
320
415
  self._max_blocks_per_request = ceil(self.sliding_window / self.block_size)
@@ -334,7 +429,9 @@ class SlidingAttentionCacheAllocator(CacheAllocator):
334
429
  after_allocation = min(already_allocated + n_blocks, self._max_blocks_per_request)
335
430
  actual_n_blocks = after_allocation - already_allocated
336
431
  # Classic allocation
337
- allocated_blocks = block_manager.get_free_blocks(actual_n_blocks, None) # no prefix caching w/ sliding window
432
+ allocated_blocks = block_manager.get_free_blocks(
433
+ actual_n_blocks, None, self.uses_block_sharing, self._index
434
+ ) # no block sharing w/ sliding window
338
435
  if allocated_blocks is None:
339
436
  return None
340
437
  self.block_table[request_id].extend(allocated_blocks)