PyPI - vllm-npu - Versions diffs - 0.4.2__py3-none-any.whl - Mend

vllm-npu 0.4.2__py3-none-any.whl

Files changed (219) hide show

vllm/__init__.py +23 -0
vllm/_custom_ops.py +251 -0
vllm/attention/__init__.py +13 -0
vllm/attention/backends/__init__.py +0 -0
vllm/attention/backends/abstract.py +127 -0
vllm/attention/backends/flash_attn.py +271 -0
vllm/attention/backends/flashinfer.py +220 -0
vllm/attention/backends/rocm_flash_attn.py +374 -0
vllm/attention/backends/torch_sdpa.py +250 -0
vllm/attention/backends/xformers.py +393 -0
vllm/attention/layer.py +56 -0
vllm/attention/ops/__init__.py +0 -0
vllm/attention/ops/paged_attn.py +216 -0
vllm/attention/ops/prefix_prefill.py +792 -0
vllm/attention/ops/triton_flash_attention.py +810 -0
vllm/attention/selector.py +91 -0
vllm/block.py +84 -0
vllm/config.py +1225 -0
vllm/core/__init__.py +0 -0
vllm/core/block/__init__.py +0 -0
vllm/core/block/block_table.py +295 -0
vllm/core/block/common.py +199 -0
vllm/core/block/cpu_gpu_block_allocator.py +228 -0
vllm/core/block/interfaces.py +205 -0
vllm/core/block/naive_block.py +318 -0
vllm/core/block/prefix_caching_block.py +606 -0
vllm/core/block_manager_v1.py +625 -0
vllm/core/block_manager_v2.py +258 -0
vllm/core/evictor_v1.py +105 -0
vllm/core/evictor_v2.py +127 -0
vllm/core/interfaces.py +113 -0
vllm/core/policy.py +45 -0
vllm/core/scheduler.py +1163 -0
vllm/distributed/__init__.py +3 -0
vllm/distributed/communication_op.py +237 -0
vllm/distributed/device_communicators/__init__.py +0 -0
vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
vllm/distributed/device_communicators/pynccl.py +287 -0
vllm/distributed/device_communicators/pynccl_utils.py +66 -0
vllm/distributed/parallel_state.py +339 -0
vllm/distributed/utils.py +136 -0
vllm/engine/__init__.py +0 -0
vllm/engine/arg_utils.py +649 -0
vllm/engine/async_llm_engine.py +737 -0
vllm/engine/llm_engine.py +784 -0
vllm/engine/metrics.py +368 -0
vllm/engine/output_processor/__init__.py +0 -0
vllm/engine/output_processor/interfaces.py +76 -0
vllm/engine/output_processor/multi_step.py +142 -0
vllm/engine/output_processor/single_step.py +284 -0
vllm/engine/output_processor/stop_checker.py +101 -0
vllm/engine/output_processor/util.py +19 -0
vllm/entrypoints/__init__.py +0 -0
vllm/entrypoints/api_server.py +119 -0
vllm/entrypoints/llm.py +259 -0
vllm/entrypoints/openai/__init__.py +0 -0
vllm/entrypoints/openai/api_server.py +186 -0
vllm/entrypoints/openai/cli_args.py +115 -0
vllm/entrypoints/openai/protocol.py +460 -0
vllm/entrypoints/openai/serving_chat.py +392 -0
vllm/entrypoints/openai/serving_completion.py +347 -0
vllm/entrypoints/openai/serving_engine.py +234 -0
vllm/envs.py +217 -0
vllm/executor/__init__.py +0 -0
vllm/executor/cpu_executor.py +152 -0
vllm/executor/distributed_gpu_executor.py +115 -0
vllm/executor/executor_base.py +115 -0
vllm/executor/gpu_executor.py +150 -0
vllm/executor/multiproc_worker_utils.py +263 -0
vllm/executor/neuron_executor.py +91 -0
vllm/executor/ray_gpu_executor.py +327 -0
vllm/executor/ray_utils.py +119 -0
vllm/logger.py +153 -0
vllm/logging/__init__.py +5 -0
vllm/logging/formatter.py +15 -0
vllm/lora/__init__.py +0 -0
vllm/lora/fully_sharded_layers.py +262 -0
vllm/lora/layers.py +1181 -0
vllm/lora/lora.py +167 -0
vllm/lora/models.py +645 -0
vllm/lora/punica.py +213 -0
vllm/lora/request.py +32 -0
vllm/lora/utils.py +98 -0
vllm/lora/worker_manager.py +251 -0
vllm/model_executor/__init__.py +7 -0
vllm/model_executor/guided_decoding/__init__.py +25 -0
vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
vllm/model_executor/layers/__init__.py +0 -0
vllm/model_executor/layers/activation.py +173 -0
vllm/model_executor/layers/fused_moe/__init__.py +7 -0
vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
vllm/model_executor/layers/layernorm.py +71 -0
vllm/model_executor/layers/linear.py +709 -0
vllm/model_executor/layers/logits_processor.py +115 -0
vllm/model_executor/layers/ops/__init__.py +0 -0
vllm/model_executor/layers/ops/rand.py +157 -0
vllm/model_executor/layers/ops/sample.py +406 -0
vllm/model_executor/layers/quantization/__init__.py +35 -0
vllm/model_executor/layers/quantization/aqlm.py +376 -0
vllm/model_executor/layers/quantization/awq.py +175 -0
vllm/model_executor/layers/quantization/base_config.py +97 -0
vllm/model_executor/layers/quantization/fp8.py +265 -0
vllm/model_executor/layers/quantization/gptq.py +224 -0
vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
vllm/model_executor/layers/quantization/marlin.py +227 -0
vllm/model_executor/layers/quantization/schema.py +84 -0
vllm/model_executor/layers/quantization/squeezellm.py +137 -0
vllm/model_executor/layers/rejection_sampler.py +405 -0
vllm/model_executor/layers/rotary_embedding.py +525 -0
vllm/model_executor/layers/sampler.py +1051 -0
vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
vllm/model_executor/model_loader/__init__.py +30 -0
vllm/model_executor/model_loader/loader.py +362 -0
vllm/model_executor/model_loader/neuron.py +136 -0
vllm/model_executor/model_loader/tensorizer.py +368 -0
vllm/model_executor/model_loader/utils.py +41 -0
vllm/model_executor/model_loader/weight_utils.py +372 -0
vllm/model_executor/models/__init__.py +119 -0
vllm/model_executor/models/baichuan.py +410 -0
vllm/model_executor/models/bloom.py +327 -0
vllm/model_executor/models/chatglm.py +386 -0
vllm/model_executor/models/commandr.py +373 -0
vllm/model_executor/models/dbrx.py +413 -0
vllm/model_executor/models/decilm.py +122 -0
vllm/model_executor/models/deepseek.py +438 -0
vllm/model_executor/models/falcon.py +444 -0
vllm/model_executor/models/gemma.py +393 -0
vllm/model_executor/models/gpt2.py +266 -0
vllm/model_executor/models/gpt_bigcode.py +274 -0
vllm/model_executor/models/gpt_j.py +281 -0
vllm/model_executor/models/gpt_neox.py +295 -0
vllm/model_executor/models/internlm2.py +323 -0
vllm/model_executor/models/jais.py +333 -0
vllm/model_executor/models/llama.py +442 -0
vllm/model_executor/models/llava.py +239 -0
vllm/model_executor/models/minicpm.py +531 -0
vllm/model_executor/models/mixtral.py +583 -0
vllm/model_executor/models/mixtral_quant.py +404 -0
vllm/model_executor/models/mpt.py +295 -0
vllm/model_executor/models/olmo.py +356 -0
vllm/model_executor/models/opt.py +349 -0
vllm/model_executor/models/orion.py +319 -0
vllm/model_executor/models/phi.py +300 -0
vllm/model_executor/models/qwen.py +284 -0
vllm/model_executor/models/qwen2.py +367 -0
vllm/model_executor/models/qwen2_moe.py +447 -0
vllm/model_executor/models/stablelm.py +301 -0
vllm/model_executor/models/starcoder2.py +302 -0
vllm/model_executor/models/xverse.py +366 -0
vllm/model_executor/sampling_metadata.py +588 -0
vllm/model_executor/utils.py +35 -0
vllm/outputs.py +150 -0
vllm/py.typed +2 -0
vllm/sampling_params.py +340 -0
vllm/sequence.py +766 -0
vllm/spec_decode/__init__.py +0 -0
vllm/spec_decode/batch_expansion.py +397 -0
vllm/spec_decode/interfaces.py +73 -0
vllm/spec_decode/metrics.py +191 -0
vllm/spec_decode/multi_step_worker.py +203 -0
vllm/spec_decode/ngram_worker.py +176 -0
vllm/spec_decode/spec_decode_worker.py +472 -0
vllm/spec_decode/top1_proposer.py +200 -0
vllm/spec_decode/util.py +228 -0
vllm/test_utils.py +41 -0
vllm/transformers_utils/__init__.py +0 -0
vllm/transformers_utils/config.py +58 -0
vllm/transformers_utils/configs/__init__.py +16 -0
vllm/transformers_utils/configs/chatglm.py +68 -0
vllm/transformers_utils/configs/dbrx.py +278 -0
vllm/transformers_utils/configs/falcon.py +87 -0
vllm/transformers_utils/configs/jais.py +236 -0
vllm/transformers_utils/configs/mpt.py +178 -0
vllm/transformers_utils/detokenizer.py +313 -0
vllm/transformers_utils/tokenizer.py +149 -0
vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
vllm/transformers_utils/tokenizers/__init__.py +5 -0
vllm/transformers_utils/tokenizers/baichuan.py +255 -0
vllm/usage/__init__.py +0 -0
vllm/usage/usage_lib.py +209 -0
vllm/utils.py +677 -0
vllm/worker/__init__.py +0 -0
vllm/worker/cache_engine.py +105 -0
vllm/worker/cpu_model_runner.py +346 -0
vllm/worker/cpu_worker.py +321 -0
vllm/worker/model_runner.py +1168 -0
vllm/worker/neuron_model_runner.py +196 -0
vllm/worker/neuron_worker.py +98 -0
vllm/worker/worker.py +345 -0
vllm/worker/worker_base.py +146 -0
vllm_npu-0.4.2.dist-info/LICENSE +201 -0
vllm_npu-0.4.2.dist-info/METADATA +173 -0
vllm_npu-0.4.2.dist-info/RECORD +219 -0
vllm_npu-0.4.2.dist-info/WHEEL +5 -0
vllm_npu-0.4.2.dist-info/top_level.txt +1 -0

vllm/core/block/prefix_caching_block.py ADDED Viewed

@@ -0,0 +1,606 @@
+"""Token blocks."""
+from itertools import takewhile
+from os.path import commonprefix
+from typing import Dict, FrozenSet, Iterable, List, Optional
+from vllm.core.block.common import (CopyOnWriteTracker,
+                                    get_all_blocks_recursively)
+from vllm.core.block.interfaces import Block, BlockAllocator, BlockId, Device
+from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
+from vllm.core.evictor_v2 import EvictionPolicy, Evictor, make_evictor
+PrefixHash = int
+# By default, we init our block access time as _DEFAULT_LAST_ACCESSED_TIME
+# so that if we find one block is still hold _DEFAULT_LAST_ACCESSED_TIME,
+# then we know this block hasn't been accessed yet.
+_DEFAULT_LAST_ACCESSED_TIME = -1
+class PrefixCachingBlockAllocator(BlockAllocator):
+    """A block allocator that implements prefix caching.
+    The PrefixCachingBlockAllocator maintains a cache of blocks based on their
+    content hash. It reuses blocks with the same content hash to avoid redundant
+    memory allocation. The allocator also supports copy-on-write operations.
+    Args:
+        num_blocks (int): The total number of blocks to manage.
+        block_size (int): The size of each block in tokens.
+        block_ids(Optional[Iterable[int]], optional): An optional iterable of
+            block IDs. If not provided, block IDs will be assigned sequentially
+            from 0 to num_blocks - 1.
+    """
+    def __init__(
+        self,
+        num_blocks: int,
+        block_size: int,
+        block_ids: Optional[Iterable[int]] = None,
+        eviction_policy: EvictionPolicy = EvictionPolicy.LRU,
+    ):
+        # A mapping of prefix hash to block index. All blocks which have a
+        # prefix hash will be in this dict, even if they have refcount 0.
+        self._cached_blocks: Dict[PrefixHash, BlockId] = {}
+        # A mapping of blockId to Block to track those cached blocks
+        self._blocks: Dict[BlockId, Block] = {}
+        # An allocator for blocks that do not have prefix hashes.
+        self._hashless_allocator = NaiveBlockAllocator(
+            create_block=self._create_block,  # type: ignore
+            num_blocks=num_blocks,
+            block_size=block_size,
+            block_ids=block_ids,
+        )
+        self._block_size = block_size
+        # Evitor used to maintain how we want to handle those computed blocks
+        # if we find memory pressure is high.
+        self.evictor: Evictor = make_evictor(eviction_policy)
+        # We share the refcounter between allocators. This allows us to promote
+        # blocks originally allocated in the hashless allocator to immutable
+        # blocks.
+        self._refcounter = self._hashless_allocator.refcounter
+        self._cow_tracker = CopyOnWriteTracker(
+            refcounter=self._refcounter.as_readonly(),
+            allocator=self,
+        )
+    # Implements Block.Factory.
+    def _create_block(
+        self,
+        prev_block: Optional[Block],
+        token_ids: List[int],
+        block_size: int,
+        allocator: BlockAllocator,
+        block_id: Optional[int] = None,
+        computed: bool = False,
+    ) -> Block:
+        # Bind block to self.
+        allocator = self
+        return PrefixCachingBlock(
+            prev_block=prev_block,
+            token_ids=token_ids,
+            block_size=block_size,
+            block_id=block_id,
+            prefix_caching_allocator=allocator,
+            computed=computed,
+        )
+    def allocate_immutable(self,
+                           prev_block: Optional[Block],
+                           token_ids: List[int],
+                           device: Optional[Device] = None) -> Block:
+        """Allocates an immutable block with the given token IDs, reusing cached
+        blocks if possible.
+        Args:
+            prev_block (Optional[Block]): The previous block in the sequence.
+            token_ids (List[int]): The token IDs to be stored in the block.
+        Returns:
+            Block: The allocated immutable block.
+        """
+        assert device is None
+        assert_prefix_caching_block_or_none(prev_block)
+        block = self._create_block(
+            prev_block=prev_block,
+            token_ids=token_ids,
+            block_size=self._block_size,
+            allocator=self,
+        )
+        assert block.content_hash is not None
+        cached_block_id = self._cached_blocks.get(block.content_hash, None)
+        if cached_block_id is not None:
+            block.block_id = cached_block_id
+            self._incr_refcount_cached_block(block, block.block_id)
+            return block
+        block = self.allocate_mutable(prev_block)
+        block.append_token_ids(token_ids)
+        assert block.content_hash is not None
+        return block
+    def allocate_mutable(self,
+                         prev_block: Optional[Block],
+                         device: Optional[Device] = None) -> Block:
+        """Allocates a mutable block. If there are no free blocks, this will
+        evict unused cached blocks.
+        Args:
+            prev_block (Block): The previous block in the sequence.
+                None is not allowed unlike it is super class.
+        Returns:
+            Block: The allocated mutable block.
+        """
+        assert device is None
+        assert_prefix_caching_block_or_none(prev_block)
+        try:
+            block = self._hashless_allocator.allocate_mutable(
+                prev_block=prev_block)
+            assert block.block_id not in self._blocks
+            assert block.block_id is not None
+            self._blocks[block.block_id] = block
+            return block
+        except BlockAllocator.NoFreeBlocksError:
+            # We must check the unused cached blocks before raising OOM.
+            pass
+        # If the evictor has blocks available for eviction, evict a block
+        # and return it.
+        if self.evictor.num_blocks > 0:
+            block_id, content_hash_to_evict = self.evictor.evict()
+            # Here we may have scenario that several blocks have
+            # the same content hash, but due to the latter coming block
+            # is coming from mutable to immutable path, their physical
+            # block is added into evictor.
+            # However in this case, we shall not pop the _cached_blocks,
+            # as the same content is still used by others, which means
+            # we need to check ref before decide to pop the list.
+            _block_id = self._cached_blocks[content_hash_to_evict]
+            refcount = self._refcounter.get(_block_id)
+            if refcount == 1:
+                self._cached_blocks.pop(content_hash_to_evict)
+                assert _block_id == block_id
+            self._refcounter.incr(block_id)
+            # the block comes from evictor already contain computed result
+            block = self._create_block(
+                prev_block=prev_block,
+                token_ids=[],
+                block_size=self._block_size,
+                allocator=self,
+                block_id=block_id,
+                computed=True,
+            )
+            assert block.content_hash is None
+            assert block.block_id not in self._blocks
+            assert block.block_id is not None
+            self._blocks[block.block_id] = block
+            return block
+        # No block available in hashless allocator, nor in unused cache blocks.
+        raise BlockAllocator.NoFreeBlocksError()
+    def _incr_refcount_cached_block(self, block: Block,
+                                    block_id: BlockId) -> None:
+        # since block is already computed, mark it
+        block.computed = True
+        refcount = self._refcounter.incr(block_id)
+        if refcount == 1:
+            # if block get referred, then it shall not be in evictor
+            # and put it into _blocks for tracking
+            if block_id in self.evictor:
+                self.evictor.remove(block_id)
+            self._blocks[block_id] = block
+    def free(self, block: Block) -> None:
+        """Decrement the refcount of the block. If the decremented refcount is
+        zero, store the block in the freelist.
+        If the block has a content hash (meaning it is immutable), then we will
+        keep the block around in case future allocations require it.
+        """
+        assert (block.block_id
+                is not None), "freeing unallocated block is undefined"
+        self._free_block_id_for_block(block.block_id, block)
+        block.block_id = None
+    def _free_block_id_for_block(self, block_id: BlockId,
+                                 block: Block) -> None:
+        assert isinstance(block, PrefixCachingBlock)
+        if block.content_hash is None:
+            refcount = self._refcounter.get(block_id)
+            # We have fork case where block would get more than one ref,
+            # so we cannot free it from tracking if ref cnt large than 1
+            if refcount <= 1:
+                assert block.block_id is not None
+                del self._blocks[block.block_id]
+            return self._hashless_allocator.free(block)
+        refcount = self._refcounter.decr(block_id)
+        # If no longer used, add the block to the evictor.
+        if refcount == 0:
+            assert block.content_hash in self._cached_blocks
+            assert block.block_id is not None
+            del self._blocks[block.block_id]
+            self.evictor.add(block.block_id, block.content_hash,
+                             block.num_tokens_total, block.last_accessed)
+    def fork(self, last_block: Block) -> List[Block]:
+        """Creates a new sequence of blocks that shares the same underlying
+        memory as the original sequence.
+        Args:
+            last_block (Block): The last block in the original sequence.
+        Returns:
+            List[Block]: The new sequence of blocks that shares the same memory
+                as the original sequence.
+        """
+        source_blocks = get_all_blocks_recursively(last_block)
+        forked_blocks = []
+        prev_block = None
+        for block in source_blocks:
+            refcount = self._refcounter.incr(block.block_id)
+            assert refcount != 1, "can't fork free'd block"
+            forked_blocks.append(
+                self._create_block(
+                    prev_block=prev_block,
+                    token_ids=block.token_ids,
+                    block_id=block.block_id,
+                    block_size=self._block_size,
+                    allocator=self,
+                ))
+            prev_block = forked_blocks[-1]
+        return forked_blocks
+    def get_num_free_blocks(self, device: Optional[Device] = None) -> int:
+        assert device is None
+        # The number of free blocks is the number of hashless free blocks
+        # plus the number of blocks evictor could free from its list.
+        return self._hashless_allocator.get_num_free_blocks(
+        ) + self.evictor.num_blocks
+    def get_num_total_blocks(self) -> int:
+        return self._hashless_allocator.get_num_total_blocks()
+    @property
+    def all_block_ids(self) -> FrozenSet[int]:
+        return self._hashless_allocator.all_block_ids
+    def promote_to_immutable_block(self, block: Block) -> BlockId:
+        """Once a mutable block is full, it can be promoted to an immutable
+        block. This means that its content can be referenced by future blocks
+        having the same prefix.
+        Note that if we already have a cached block with the same content, we
+        will replace the newly-promoted block's mapping with the existing cached
+        block.
+        Args:
+            block: The mutable block to be promoted.
+        Returns:
+            BlockId: Either the original block index, or the block index of
+                the previously cached block matching the same content.
+        """
+        assert block.content_hash is not None
+        assert block.block_id is not None
+        assert self._refcounter.get(block.block_id) > 0
+        # If the content hash does not have a corresponding cached block,
+        # set this block as the cached block.
+        if block.content_hash not in self._cached_blocks:
+            self._cached_blocks[block.content_hash] = block.block_id
+        else:
+            self._free_block_id_for_block(block.block_id, block)
+            self._incr_refcount_cached_block(
+                block, self._cached_blocks[block.content_hash])
+        return self._cached_blocks[block.content_hash]
+    def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]:
+        """Performs a copy-on-write operation on the given block if it is not
+        appendable.
+        Args:
+            block (Block): The block to check for copy-on-write.
+        Returns:
+            Optional[BlockId]: The block index of the new block if a copy-on
+                -write operation was performed, or the original block index if
+                no copy-on-write was necessary.
+        """
+        return self._cow_tracker.cow_block_if_not_appendable(block)
+    def clear_copy_on_writes(self) -> Dict[BlockId, List[BlockId]]:
+        """Returns the copy-on-write source->destination mapping and clears it.
+        Returns:
+            Dict[BlockId, List[BlockId]]: A dictionary mapping source
+                block indices to lists of destination block indices.
+        """
+        return self._cow_tracker.clear_cows()
+    def mark_blocks_as_accessed(self, block_ids: List[int],
+                                now: float) -> None:
+        """Mark blocks as accessed, used in prefix caching.
+        If the block is added into evictor, we need to update corresponding
+        info in evictor's metadata.
+        """
+        for block_id in block_ids:
+            if block_id in self._blocks:
+                self._blocks[block_id].last_accessed = now
+            elif block_id in self.evictor:
+                self.evictor.update(block_id, now)
+            else:
+                raise ValueError(
+                    "Mark block as accessed which is not belonged to GPU")
+    def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
+        """Mark blocks as computed, used in prefix caching."""
+        for block_id in block_ids:
+            if block_id in self._blocks:
+                # only those full block is valid for prefix caching
+                if self._blocks[block_id].is_full:
+                    self._blocks[block_id].computed = True
+            elif block_id not in self.evictor:
+                raise ValueError(f"Mark {block_id=} as computed which "
+                                 "is not belonged to GPU")
+    def block_is_computed(self, block_id: int) -> bool:
+        if block_id in self._blocks:
+            return self._blocks[block_id].computed
+        else:
+            return block_id in self.evictor
+    def get_common_computed_block_ids(
+            self, seq_block_ids: List[List[int]]) -> List[int]:
+        """Return the block ids that are common for a given sequence group.
+        Only those blocks that are immutable and already be marked
+        compyted would be taken consideration.
+        """
+        # NOTE We exclude the last block to avoid the case where the entire
+        # prompt is cached. This would cause erroneous behavior in model
+        # runner.
+        ids_list = [
+            list(
+                takewhile(lambda block_id: self.block_is_computed(block_id),
+                          seq[:-1])) for seq in seq_block_ids
+        ]
+        # It returns a list of int although type annotation says list of string.
+        return commonprefix([
+            ids for ids in ids_list  # type: ignore
+            if ids != []
+        ])
+class PrefixCachingBlock(Block):
+    """A block implementation that supports prefix caching.
+    The PrefixCachingBlock class represents a block of token IDs with prefix
+    caching capabilities. It wraps a NaiveBlock internally and provides
+    additional functionality for content hashing and promoting immutable blocks
+    with the prefix caching allocator.
+    Args:
+        prev_block (Optional[PrefixCachingBlock]): The previous block in the
+            sequence.
+        token_ids (List[int]): The initial token IDs to be stored in the block.
+        block_size (int): The maximum number of token IDs that can be stored in
+            the block.
+        prefix_caching_allocator (BlockAllocator): The prefix
+            caching block allocator associated with this block.
+        block_id (Optional[int], optional): The physical block index
+            of this block. Defaults to None.
+    """
+    def __init__(
+        self,
+        prev_block: Optional[Block],
+        token_ids: List[int],
+        block_size: int,
+        prefix_caching_allocator: BlockAllocator,
+        block_id: Optional[int] = None,
+        computed: bool = False,
+    ):
+        assert isinstance(prefix_caching_allocator,
+                          PrefixCachingBlockAllocator), (
+                              "Currently this class is only tested with "
+                              "PrefixCachingBlockAllocator.")
+        assert_prefix_caching_block_or_none(prev_block)
+        self._prev_block = prev_block
+        self._cached_content_hash: Optional[int] = None
+        self._cached_num_tokens_total: Optional[int] = None
+        self._prefix_caching_allocator = prefix_caching_allocator
+        self._last_accessed: float = _DEFAULT_LAST_ACCESSED_TIME
+        self._computed = computed
+        self._block = NaiveBlock(
+            prev_block=prev_block,
+            token_ids=token_ids,
+            block_size=block_size,
+            block_id=block_id,
+            allocator=prefix_caching_allocator,
+            _cow_target=self,
+        )
+    @property
+    def computed(self) -> bool:
+        return self._computed
+    @computed.setter
+    def computed(self, value) -> None:
+        self._computed = value
+    @property
+    def last_accessed(self) -> float:
+        return self._last_accessed
+    @last_accessed.setter
+    def last_accessed(self, last_accessed_ts: float):
+        self._last_accessed = last_accessed_ts
+    def append_token_ids(self, token_ids: List[int]) -> None:
+        """Appends the given token IDs to the block and registers the block as
+        immutable if the block becomes full.
+        Internally, the naive block handles CoW.
+        Args:
+            token_ids (List[int]): The token IDs to be appended to the block.
+        """
+        assert token_ids
+        # naive block handles CoW.
+        self._block.append_token_ids(token_ids)
+        # If the content hash is present, then the block can be made immutable.
+        # Register ourselves with the allocator, potentially replacing the
+        # physical block index.
+        if self.content_hash is not None:
+            self.block_id = (self._prefix_caching_allocator.
+                             promote_to_immutable_block(self))
+    @property
+    def block_id(self) -> Optional[int]:
+        return self._block.block_id
+    @block_id.setter
+    def block_id(self, value) -> None:
+        self._block.block_id = value
+    @property
+    def is_full(self) -> bool:
+        return self._block.is_full
+    @property
+    def num_empty_slots(self) -> int:
+        return self._block.num_empty_slots
+    @property
+    def num_tokens_total(self) -> int:
+        """return the total tokens so far.
+        Here we iterate the block chain till to the first block, while
+        cache the result in local to prevent repeated computations.
+        """
+        if self._cached_num_tokens_total is not None:
+            return self._cached_num_tokens_total
+        _block: Optional[Block] = self
+        self._cached_num_tokens_total = 0
+        # TODO: current implement here take O(N^2), we expect future
+        # we have O(1) here
+        while _block is not None:
+            self._cached_num_tokens_total += len(_block.token_ids)
+            _block = _block.prev_block
+        return self._cached_num_tokens_total
+    @property
+    def block_size(self) -> int:
+        return self._block.block_size
+    @property
+    def token_ids(self) -> List[int]:
+        return self._block.token_ids
+    @property
+    def prev_block(self) -> Optional[Block]:
+        return self._prev_block
+    @property
+    def content_hash(self) -> Optional[int]:
+        """Return the content-based hash of the current block, or None if it is
+        not yet defined.
+        For the content-based hash to be defined, the current block must be
+        full.
+        """
+        # If the hash is already computed, return it.
+        if self._cached_content_hash is not None:
+            return self._cached_content_hash
+        # We cannot compute a hash for the current block because it is not full.
+        if not self.is_full:
+            return None
+        is_first_block = self._prev_block is None
+        prev_block_hash = (
+            None if is_first_block else
+            self._prev_block.content_hash  # type: ignore
+        )
+        # Previous block exists but does not yet have a hash.
+        # Return no hash in this case.
+        if prev_block_hash is None and not is_first_block:
+            return None
+        self._cached_content_hash = PrefixCachingBlock.hash_block_tokens(
+            is_first_block,
+            prev_block_hash,
+            cur_block_token_ids=self.token_ids)
+        return self._cached_content_hash
+    @staticmethod
+    def hash_block_tokens(is_first_block: bool, prev_block_hash: Optional[int],
+                          cur_block_token_ids: List[int]) -> int:
+        """Computes a hash value corresponding to the contents of a block and
+        the contents of the preceding block(s). The hash value is used for
+        prefix caching.
+        NOTE: Content-based hashing does not yet support LoRA.
+        Parameters:
+        - is_first_block (bool): A flag indicating if the block is the first in
+            the sequence.
+        - prev_block_hash (Optional[int]): The hash of the previous block. None
+            if this is the first block.
+        - cur_block_token_ids (List[int]): A list of token ids in the current
+            block. The current block is assumed to be full.
+        Returns:
+        - int: The computed hash value for the block.
+        """
+        assert (prev_block_hash is None) == is_first_block
+        return hash((is_first_block, prev_block_hash, *cur_block_token_ids))
+def assert_prefix_caching_block_or_none(block: Optional[Block]):
+    if block is None:
+        return
+    assert isinstance(block, PrefixCachingBlock)