PyPI - vllm-cpu-amxbf16 - Versions diffs - 0.11.2.post2__cp310-cp310-manylinux_2_17_x86_64.whl - Mend

vllm-cpu-amxbf16 0.11.2.post2__cp310-cp310-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1536) hide show

vllm/v1/kv_offload/arc_manager.py ADDED Viewed

@@ -0,0 +1,237 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections import OrderedDict
+from collections.abc import Iterable
+from vllm.v1.core.kv_cache_utils import BlockHash
+from vllm.v1.kv_offload.abstract import (
+    LoadStoreSpec,
+    OffloadingEvent,
+    OffloadingManager,
+    PrepareStoreOutput,
+)
+from vllm.v1.kv_offload.backend import Backend, BlockStatus
+class ARCOffloadingManager(OffloadingManager):
+    """
+    An OffloadingManager implementing the ARC (Adaptive Replacement Cache)
+    eviction policy with a pluggable backend.
+    Data Structures:
+        T1: Recent cache containing blocks accessed once.
+        T2: Frequent cache containing blocks accessed multiple times.
+        B1/B2: Ghost lists tracking recently evicted blocks from T1/T2.
+        target_t1_size: Adaptive target size for the T1 partition.
+    Algorithm Flow:
+        1. Cache lookup (lookup):
+           Searches T1 and T2 for block hashes and counts consecutive hits
+           until a miss or non-ready block is encountered.
+        2. Cache touch (touch) - Adaptive Learning:
+           For each block_hash (in reverse order):
+           - If in T1: Move to T2 (promotion from recent to frequent).
+           - If in T2: Move to MRU position (end of queue).
+           - If in B1 ghost list: Increase target_t1_size.
+           - If in B2 ghost list: Decrease target_t1_size.
+        3. Block eviction (prepare_store) - Adaptive Replacement:
+           Determines eviction source based on adaptive target:
+           - If T1 size > target_t1_size: Evict from T1, add to B1.
+           - Otherwise: Evict from T2, add to B2.
+           Finally, bound each ghost list size.
+        4. Block insertion (prepare_store):
+           New blocks are always inserted into T1 and removed from B1/B2 if
+           present. Blocks may later be promoted to T2 during touch operations.
+    Adaptive Behavior:
+        The algorithm self-tunes the recency vs. frequency trade-off:
+        - B1 hit: Recent access patterns matter more → increase T1.
+        - B2 hit: Frequent access patterns matter more → decrease T1.
+    """
+    def __init__(self, backend: Backend, enable_events: bool = False):
+        self.backend: Backend = backend
+        self.target_t1_size: float = 0.0
+        self.t1: OrderedDict[BlockHash, BlockStatus] = OrderedDict()
+        self.t2: OrderedDict[BlockHash, BlockStatus] = OrderedDict()
+        # block_hash -> None (only care about presence)
+        self.b1: OrderedDict[BlockHash, None] = OrderedDict()
+        self.b2: OrderedDict[BlockHash, None] = OrderedDict()
+        self.events: list[OffloadingEvent] | None = [] if enable_events else None
+        self.cache_capacity: int = self.backend.get_num_free_blocks()
+    def lookup(self, block_hashes: Iterable[BlockHash]) -> int:
+        hit_count = 0
+        for block_hash in block_hashes:
+            block = self.t1.get(block_hash) or self.t2.get(block_hash)
+            if block is None or not block.is_ready:
+                break
+            hit_count += 1
+        return hit_count
+    def prepare_load(self, block_hashes: Iterable[BlockHash]) -> LoadStoreSpec:
+        blocks = []
+        for block_hash in block_hashes:
+            block = self.t1.get(block_hash) or self.t2.get(block_hash)
+            assert block is not None, f"Block {block_hash!r} not found in cache"
+            assert block.is_ready, f"Block {block_hash!r} is not ready for reading"
+            block.ref_cnt += 1
+            blocks.append(block)
+        return self.backend.get_load_store_spec(block_hashes, blocks)
+    def touch(self, block_hashes: Iterable[BlockHash]):
+        for block_hash in reversed(list(block_hashes)):
+            if block_hash in self.t1:
+                block = self.t1.pop(block_hash)
+                if not block.is_ready:
+                    # block was just prepared to be stored, not really touched twice
+                    self.t1.move_to_end(block_hash)
+                else:
+                    self.t2[block_hash] = block
+            elif block_hash in self.t2:
+                self.t2.move_to_end(block_hash)
+            elif block_hash in self.b1:
+                delta = max(1, len(self.b2) / len(self.b1))
+                self.target_t1_size = min(
+                    self.target_t1_size + delta, self.cache_capacity
+                )
+                # move to MRU position (end) to keep it fresh in the ghost list
+                self.b1.move_to_end(block_hash)
+            elif block_hash in self.b2:
+                delta = max(1, len(self.b1) / len(self.b2))
+                self.target_t1_size = max(self.target_t1_size - delta, 0)
+                # move to MRU position (end) to keep it fresh in the ghost list
+                self.b2.move_to_end(block_hash)
+    def complete_load(self, block_hashes: Iterable[BlockHash]):
+        for block_hash in block_hashes:
+            block = self.t1.get(block_hash) or self.t2.get(block_hash)
+            assert block is not None, f"Block {block_hash!r} not found"
+            assert block.ref_cnt > 0, f"Block {block_hash!r} ref_cnt is already 0"
+            block.ref_cnt -= 1
+    def prepare_store(
+        self, block_hashes: Iterable[BlockHash]
+    ) -> PrepareStoreOutput | None:
+        block_hashes_to_store = []
+        for block_hash in block_hashes:
+            if block_hash not in self.t1 and block_hash not in self.t2:
+                block_hashes_to_store.append(block_hash)
+        if not block_hashes_to_store:
+            return PrepareStoreOutput(
+                block_hashes_to_store=[],
+                store_spec=self.backend.get_load_store_spec([], []),
+                block_hashes_evicted=[],
+            )
+        num_blocks_to_evict = (
+            len(block_hashes_to_store) - self.backend.get_num_free_blocks()
+        )
+        to_evict = []
+        while num_blocks_to_evict > 0:
+            block_to_evict = None
+            if len(self.t1) >= int(self.target_t1_size):
+                # try to evict the least recently used (oldest) block from T1
+                for block_hash, block in self.t1.items():
+                    if block.ref_cnt == 0:
+                        block_to_evict = (block_hash, block)
+                        eviction_t = self.t1
+                        eviction_b = self.b1
+                        break
+            if not block_to_evict:
+                # try to evict the least recently used (oldest) block from T2
+                for block_hash, block in self.t2.items():
+                    if block.ref_cnt == 0:
+                        block_to_evict = (block_hash, block)
+                        eviction_t = self.t2
+                        eviction_b = self.b2
+                        break
+                else:
+                    # cannot evict enough blocks, cache is full of in-use items
+                    return None
+            block_hash, block = block_to_evict
+            del eviction_t[block_hash]
+            eviction_b[block_hash] = None
+            to_evict.append(block_hash)
+            self.backend.free(block)
+            num_blocks_to_evict -= 1
+        for b in [self.b1, self.b2]:
+            for i in range(len(b) - self.cache_capacity):
+                b.popitem(last=False)
+        if to_evict and self.events is not None:
+            self.events.append(
+                OffloadingEvent(
+                    block_hashes=to_evict,
+                    block_size=self.backend.block_size,
+                    medium=self.backend.medium,
+                    removed=True,
+                )
+            )
+        blocks = self.backend.allocate_blocks(block_hashes_to_store)
+        assert len(blocks) == len(block_hashes_to_store), (
+            "Backend did not allocate the expected number of blocks"
+        )
+        for block_hash, block in zip(block_hashes_to_store, blocks):
+            self.t1[block_hash] = block
+            self.b1.pop(block_hash, None)
+            self.b2.pop(block_hash, None)
+        store_spec = self.backend.get_load_store_spec(block_hashes_to_store, blocks)
+        return PrepareStoreOutput(
+            block_hashes_to_store=block_hashes_to_store,
+            store_spec=store_spec,
+            block_hashes_evicted=to_evict,
+        )
+    def complete_store(self, block_hashes: Iterable[BlockHash], success: bool = True):
+        stored_block_hashes: list[BlockHash] = []
+        if success:
+            for block_hash in block_hashes:
+                block = self.t1.get(block_hash) or self.t2.get(block_hash)
+                if block is not None and not block.is_ready:
+                    block.ref_cnt = 0
+                    stored_block_hashes.append(block_hash)
+        else:
+            for block_hash in block_hashes:
+                block = self.t1.pop(block_hash, None)
+                if block is None:
+                    block = self.t2.pop(block_hash, None)
+                if block is not None and not block.is_ready:
+                    self.backend.free(block)
+        if stored_block_hashes and self.events is not None:
+            self.events.append(
+                OffloadingEvent(
+                    block_hashes=stored_block_hashes,
+                    block_size=self.backend.block_size,
+                    medium=self.backend.medium,
+                    removed=False,
+                )
+            )
+    def take_events(self) -> Iterable[OffloadingEvent]:
+        if self.events is not None:
+            yield from self.events
+            self.events.clear()

vllm/v1/kv_offload/backend.py ADDED Viewed

@@ -0,0 +1,97 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import ctypes
+from abc import ABC, abstractmethod
+from collections.abc import Iterable
+from vllm.v1.core.kv_cache_utils import BlockHash
+from vllm.v1.kv_offload.abstract import LoadStoreSpec
+class BlockStatus(ctypes.Structure):
+    """
+    Offloading status for a single block of KV data.
+    Holds the following information:
+    ref_cnt - the current number of transfers using this block as a source.
+        A value of -1 indicates the block is not yet ready to be read.
+    load_store_spec - backend-specific information on how to actually
+        read/write the block.
+    """
+    _fields_ = [("ref_cnt", ctypes.c_int32)]
+    def __init__(self):
+        super().__init__()
+        # initialize block as "not ready" (ref_cnt = -1)
+        self.ref_cnt = -1
+    @property
+    def is_ready(self) -> bool:
+        """
+        Returns whether the block is ready to be read.
+        """
+        return self.ref_cnt >= 0
+class Backend(ABC):
+    """
+    An abstract class for allocating and returning specs for writing
+    KV blocks to some backend.
+    """
+    def __init__(self, block_size: int, medium: str):
+        self.block_size = block_size
+        self.medium = medium
+    @abstractmethod
+    def get_num_free_blocks(self):
+        """
+        Returns the number of current number of blocks that can be allocated.
+        """
+        pass
+    @abstractmethod
+    def allocate_blocks(self, block_hashes: list[BlockHash]) -> list[BlockStatus]:
+        """
+        Allocate space for writing blocks.
+        This method assumes there is enough space for allocation.
+        It is unsafe to use without checking get_num_free_blocks beforehand.
+        Args:
+            block_hashes: the hashes identifying the blocks to be written.
+        Returns:
+            A list of BlockStatus for the allocated blocks.
+            The ref_cnt of each returned item will be -1, meaning the block
+            is not yet ready to be read.
+        """
+        pass
+    @abstractmethod
+    def free(self, block: BlockStatus):
+        """
+        Free a previously allocated block.
+        You should only call this function with blocks returned by
+        allocate_blocks, and only once per each block.
+        Args:
+            block: The block to be freed.
+        """
+        pass
+    def get_load_store_spec(
+        self, block_hashes: Iterable[BlockHash], blocks: Iterable[BlockStatus]
+    ) -> LoadStoreSpec:
+        """
+        Get backend-specific information on how to read/write blocks.
+        Args:
+            block_hashes: the list of block hashes identifying the blocks.
+            blocks: the list of blocks.
+        Returns:
+            A LoadStoreSpec that can be used by a worker
+            to read/write the blocks.
+        """
+        raise NotImplementedError

vllm/v1/kv_offload/backends/__init__.py ADDED Viewed

File without changes

vllm/v1/kv_offload/backends/cpu.py ADDED Viewed

@@ -0,0 +1,62 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import ctypes
+from collections.abc import Iterable
+from vllm.v1.core.kv_cache_utils import BlockHash
+from vllm.v1.kv_offload.abstract import LoadStoreSpec
+from vllm.v1.kv_offload.backend import Backend, BlockStatus
+from vllm.v1.kv_offload.mediums import CPULoadStoreSpec
+class CPUBlockStatus(BlockStatus):
+    _fields_ = BlockStatus._fields_ + [("block_id", ctypes.c_int64)]  # type: ignore
+    def __init__(self, block_id: int):
+        super().__init__()
+        self.block_id = block_id
+class CPUBackend(Backend):
+    def __init__(self, block_size: int, num_blocks: int):
+        super().__init__(block_size=block_size, medium=CPULoadStoreSpec.medium())
+        self.num_blocks: int = num_blocks
+        self.num_allocated_blocks: int = 0
+        self.allocated_blocks_free_list: list[int] = []
+    def get_num_free_blocks(self):
+        return (
+            len(self.allocated_blocks_free_list)
+            + self.num_blocks
+            - self.num_allocated_blocks
+        )
+    def allocate_blocks(self, block_hashes: list[BlockHash]) -> list[BlockStatus]:
+        num_fresh_blocks = min(
+            len(block_hashes), self.num_blocks - self.num_allocated_blocks
+        )
+        num_reused_blocks = len(block_hashes) - num_fresh_blocks
+        assert len(self.allocated_blocks_free_list) >= num_reused_blocks
+        # allocate fresh blocks
+        blocks: list[BlockStatus] = []
+        for _ in range(num_fresh_blocks):
+            blocks.append(CPUBlockStatus(self.num_allocated_blocks))
+            self.num_allocated_blocks += 1
+        # allocate reused blocks
+        for _ in range(num_reused_blocks):
+            block_id = self.allocated_blocks_free_list.pop()
+            blocks.append(CPUBlockStatus(block_id))
+        return blocks
+    def free(self, block: BlockStatus):
+        assert isinstance(block, CPUBlockStatus)
+        self.allocated_blocks_free_list.append(block.block_id)
+    def get_load_store_spec(
+        self, block_hashes: Iterable[BlockHash], blocks: Iterable[BlockStatus]
+    ) -> LoadStoreSpec:
+        return CPULoadStoreSpec([block.block_id for block in blocks])

vllm/v1/kv_offload/cpu.py ADDED Viewed

@@ -0,0 +1,93 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Iterator
+import torch
+from vllm.config import VllmConfig, get_layers_from_vllm_config
+from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
+from vllm.platforms import current_platform
+from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager
+from vllm.v1.kv_offload.arc_manager import ARCOffloadingManager
+from vllm.v1.kv_offload.backends.cpu import CPUBackend
+from vllm.v1.kv_offload.lru_manager import LRUOffloadingManager
+from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
+from vllm.v1.kv_offload.spec import OffloadingSpec
+from vllm.v1.kv_offload.worker.cpu_gpu import CpuGpuOffloadingHandler
+from vllm.v1.kv_offload.worker.worker import OffloadingHandler
+class CPUOffloadingSpec(OffloadingSpec):
+    def __init__(self, vllm_config: VllmConfig):
+        super().__init__(vllm_config)
+        num_cpu_blocks = self.extra_config.get("num_cpu_blocks")
+        if not num_cpu_blocks:
+            raise Exception(
+                "num_cpu_blocks must be specified in kv_connector_extra_config"
+            )
+        self.num_cpu_blocks: int = num_cpu_blocks
+        # scheduler-side
+        self._manager: OffloadingManager | None = None
+        # worker-side
+        self._handler: OffloadingHandler | None = None
+        self.eviction_policy: str = self.extra_config.get("eviction_policy", "lru")
+    def get_manager(self) -> OffloadingManager:
+        if not self._manager:
+            kv_events_config = self.vllm_config.kv_events_config
+            enable_events = (
+                kv_events_config is not None and kv_events_config.enable_kv_cache_events
+            )
+            backend = CPUBackend(
+                block_size=self.offloaded_block_size, num_blocks=self.num_cpu_blocks
+            )
+            if self.eviction_policy == "lru":
+                self._manager = LRUOffloadingManager(
+                    backend=backend, enable_events=enable_events
+                )
+            elif self.eviction_policy == "arc":
+                self._manager = ARCOffloadingManager(
+                    backend=backend, enable_events=enable_events
+                )
+            else:
+                raise ValueError(
+                    f"Unknown eviction policy: {self.eviction_policy}. "
+                    f"Supported policies: lru, arc"
+                )
+        return self._manager
+    def get_handlers(
+        self, kv_caches: dict[str, torch.Tensor]
+    ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]:
+        if not self._handler:
+            if not current_platform.is_cuda_alike():
+                raise Exception(
+                    "CPU Offloading is currently only supported on CUDA-alike GPUs"
+                )
+            layer_names = list(kv_caches.keys())
+            layers = get_layers_from_vllm_config(
+                self.vllm_config, AttentionLayerBase, layer_names
+            )
+            attn_backends = {
+                layer_name: layers[layer_name].get_attn_backend()
+                for layer_name in layer_names
+            }
+            self._handler = CpuGpuOffloadingHandler(
+                attn_backends=attn_backends,
+                gpu_block_size=self.gpu_block_size,
+                cpu_block_size=self.offloaded_block_size,
+                num_cpu_blocks=self.num_cpu_blocks,
+                gpu_caches=kv_caches,
+            )
+        assert self._handler is not None
+        yield GPULoadStoreSpec, CPULoadStoreSpec, self._handler
+        yield CPULoadStoreSpec, GPULoadStoreSpec, self._handler

vllm/v1/kv_offload/factory.py ADDED Viewed

@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import importlib
+from collections.abc import Callable
+from typing import TYPE_CHECKING
+from vllm.logger import init_logger
+from vllm.v1.kv_offload.spec import OffloadingSpec
+# VllmConfig is imported for type checking only; the annotation that uses it
+# in this module is written as a string.
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+# module-level logger
+logger = init_logger(__name__)
+class OffloadingSpecFactory:
+    _registry: dict[str, Callable[[], type[OffloadingSpec]]] = {}
+    @classmethod
+    def register_spec(cls, name: str, module_path: str, class_name: str) -> None:
+        """Register a spec with a lazy-loading module and class name."""
+        if name in cls._registry:
+            raise ValueError(f"Connector '{name}' is already registered.")
+        def loader() -> type[OffloadingSpec]:
+            module = importlib.import_module(module_path)
+            return getattr(module, class_name)
+        cls._registry[name] = loader
+    @classmethod
+    def create_spec(
+        cls,
+        config: "VllmConfig",
+    ) -> OffloadingSpec:
+        kv_transfer_config = config.kv_transfer_config
+        assert kv_transfer_config is not None
+        extra_config = kv_transfer_config.kv_connector_extra_config
+        spec_name = extra_config.get("spec_name", "CPUOffloadingSpec")
+        if spec_name in cls._registry:
+            spec_cls = cls._registry[spec_name]()
+        else:
+            spec_module_path = extra_config.get("spec_module_path")
+            if spec_module_path is None:
+                raise ValueError(f"Unsupported spec type: {spec_name}")
+            spec_module = importlib.import_module(spec_module_path)
+            spec_cls = getattr(spec_module, spec_name)
+        assert issubclass(spec_cls, OffloadingSpec)
+        logger.info("Creating offloading spec with name: %s", spec_name)
+        return spec_cls(config)
+# Register various specs here.
+# Each spec is registered by module path and class name, so the implementing
+# module is imported only when the spec's loader is called.
+OffloadingSpecFactory.register_spec(
+    "CPUOffloadingSpec", "vllm.v1.kv_offload.cpu", "CPUOffloadingSpec"
+)

vllm/v1/kv_offload/lru_manager.py ADDED Viewed

@@ -0,0 +1,139 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections import OrderedDict
+from collections.abc import Iterable
+from vllm.v1.core.kv_cache_utils import BlockHash
+from vllm.v1.kv_offload.abstract import (
+    LoadStoreSpec,
+    OffloadingEvent,
+    OffloadingManager,
+    PrepareStoreOutput,
+)
+from vllm.v1.kv_offload.backend import Backend, BlockStatus
+class LRUOffloadingManager(OffloadingManager):
+    """
+    An OffloadingManager with a pluggable backend, which evicts blocks by LRU.
+    """
+    def __init__(self, backend: Backend, enable_events: bool = False):
+        self.backend: Backend = backend
+        # block_hash -> BlockStatus
+        self.blocks: OrderedDict[BlockHash, BlockStatus] = OrderedDict()
+        self.events: list[OffloadingEvent] | None = [] if enable_events else None
+    def lookup(self, block_hashes: Iterable[BlockHash]) -> int:
+        hit_count = 0
+        for block_hash in block_hashes:
+            block = self.blocks.get(block_hash)
+            if block is None or not block.is_ready:
+                break
+            hit_count += 1
+        return hit_count
+    def prepare_load(self, block_hashes: Iterable[BlockHash]) -> LoadStoreSpec:
+        blocks = []
+        for block_hash in block_hashes:
+            block = self.blocks[block_hash]
+            assert block.is_ready
+            block.ref_cnt += 1
+            blocks.append(block)
+        return self.backend.get_load_store_spec(block_hashes, blocks)
+    def touch(self, block_hashes: Iterable[BlockHash]):
+        for block_hash in reversed(list(block_hashes)):
+            if self.blocks.get(block_hash):
+                self.blocks.move_to_end(block_hash)
+    def complete_load(self, block_hashes: Iterable[BlockHash]):
+        for block_hash in block_hashes:
+            block = self.blocks[block_hash]
+            assert block.ref_cnt > 0
+            block.ref_cnt -= 1
+    def prepare_store(
+        self, block_hashes: Iterable[BlockHash]
+    ) -> PrepareStoreOutput | None:
+        # filter out blocks that are already stored
+        block_hashes_to_store = [
+            block_hash for block_hash in block_hashes if block_hash not in self.blocks
+        ]
+        num_blocks_to_evict = (
+            len(block_hashes_to_store) - self.backend.get_num_free_blocks()
+        )
+        # build list of blocks to evict
+        to_evict = []
+        if num_blocks_to_evict > 0:
+            for block_hash, block in self.blocks.items():
+                if block.ref_cnt == 0:
+                    to_evict.append(block_hash)
+                    num_blocks_to_evict -= 1
+                    if num_blocks_to_evict == 0:
+                        break
+            else:
+                # we could not evict enough blocks
+                return None
+        # evict blocks
+        for block_hash in to_evict:
+            self.backend.free(self.blocks.pop(block_hash))
+        if to_evict and self.events is not None:
+            self.events.append(
+                OffloadingEvent(
+                    block_hashes=to_evict,
+                    block_size=self.backend.block_size,
+                    medium=self.backend.medium,
+                    removed=True,
+                )
+            )
+        blocks = self.backend.allocate_blocks(block_hashes_to_store)
+        assert len(blocks) == len(block_hashes_to_store)
+        for block_hash, block in zip(block_hashes_to_store, blocks):
+            self.blocks[block_hash] = block
+        # build store specs for allocated blocks
+        store_spec = self.backend.get_load_store_spec(block_hashes_to_store, blocks)
+        return PrepareStoreOutput(
+            block_hashes_to_store=block_hashes_to_store,
+            store_spec=store_spec,
+            block_hashes_evicted=to_evict,
+        )
+    def complete_store(self, block_hashes: Iterable[BlockHash], success: bool = True):
+        stored_block_hashes: list[BlockHash] = []
+        if success:
+            for block_hash in block_hashes:
+                block = self.blocks[block_hash]
+                if not block.is_ready:
+                    block.ref_cnt = 0
+                    stored_block_hashes.append(block_hash)
+        else:
+            for block_hash in block_hashes:
+                block = self.blocks[block_hash]
+                if not block.is_ready:
+                    self.backend.free(block)
+                    del self.blocks[block_hash]
+        if stored_block_hashes and self.events is not None:
+            self.events.append(
+                OffloadingEvent(
+                    block_hashes=stored_block_hashes,
+                    block_size=self.backend.block_size,
+                    medium=self.backend.medium,
+                    removed=False,
+                )
+            )
+    def take_events(self) -> Iterable[OffloadingEvent]:
+        if self.events is not None:
+            yield from self.events
+            self.events.clear()