PyPI - tpu-inference - Versions diffs - 0.11.1__py3-none-any.whl - Mend

tpu-inference 0.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of tpu-inference might be problematic. Click here for more details.

Files changed (168) hide show

tests/__init__.py +0 -0
tests/core/__init__.py +0 -0
tests/core/test_adapters.py +83 -0
tests/core/test_core_tpu.py +523 -0
tests/core/test_disagg_executor.py +60 -0
tests/core/test_disagg_utils.py +53 -0
tests/core/test_init.py +49 -0
tests/kernels/__init__.py +0 -0
tests/kernels/quantized_matmul_kernel_test.py +191 -0
tests/kernels/ragged_kv_cache_update_v2_test.py +234 -0
tests/kernels/ragged_paged_attention_kernel_v2_test.py +400 -0
tests/kernels/ragged_paged_attention_kernel_v3_test.py +504 -0
tests/lora/__init__.py +0 -0
tests/lora/test_lora.py +123 -0
tests/test_base.py +201 -0
tests/test_quantization.py +836 -0
tests/test_tpu_info.py +120 -0
tests/test_utils.py +218 -0
tests/tpu_backend_test.py +59 -0
tpu_inference/__init__.py +30 -0
tpu_inference/adapters/__init__.py +0 -0
tpu_inference/adapters/vllm_adapters.py +42 -0
tpu_inference/adapters/vllm_config_adapters.py +134 -0
tpu_inference/backend.py +69 -0
tpu_inference/core/__init__.py +0 -0
tpu_inference/core/adapters.py +153 -0
tpu_inference/core/core_tpu.py +776 -0
tpu_inference/core/disagg_executor.py +117 -0
tpu_inference/core/disagg_utils.py +51 -0
tpu_inference/di/__init__.py +0 -0
tpu_inference/di/abstracts.py +28 -0
tpu_inference/di/host.py +76 -0
tpu_inference/di/interfaces.py +51 -0
tpu_inference/distributed/__init__.py +0 -0
tpu_inference/distributed/tpu_connector.py +699 -0
tpu_inference/distributed/utils.py +59 -0
tpu_inference/executors/__init__.py +0 -0
tpu_inference/executors/ray_distributed_executor.py +346 -0
tpu_inference/experimental/__init__.py +0 -0
tpu_inference/experimental/llama3_jax_stashed.py +258 -0
tpu_inference/interfaces/__init__.py +0 -0
tpu_inference/interfaces/cache.py +31 -0
tpu_inference/interfaces/config.py +47 -0
tpu_inference/interfaces/config_parts.py +117 -0
tpu_inference/interfaces/engine.py +51 -0
tpu_inference/interfaces/outputs.py +22 -0
tpu_inference/interfaces/params.py +21 -0
tpu_inference/interfaces/platform.py +74 -0
tpu_inference/interfaces/request.py +39 -0
tpu_inference/interfaces/scheduler.py +31 -0
tpu_inference/kernels/__init__.py +0 -0
tpu_inference/kernels/collectives/__init__.py +0 -0
tpu_inference/kernels/collectives/all_gather_matmul.py +735 -0
tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +60 -0
tpu_inference/kernels/collectives/util.py +47 -0
tpu_inference/kernels/flash_attention/__init__.py +0 -0
tpu_inference/kernels/flash_attention/kernel.py +772 -0
tpu_inference/kernels/quantized_matmul/__init__.py +0 -0
tpu_inference/kernels/quantized_matmul/kernel.py +395 -0
tpu_inference/kernels/quantized_matmul/tuned_block_sizes.py +609 -0
tpu_inference/kernels/quantized_matmul/util.py +58 -0
tpu_inference/kernels/ragged_paged_attention/__init__.py +0 -0
tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +0 -0
tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +875 -0
tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +287 -0
tpu_inference/kernels/ragged_paged_attention/v2/tuned_block_sizes.py +1482 -0
tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +0 -0
tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +1447 -0
tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +3834 -0
tpu_inference/kernels/ragged_paged_attention/v3/util.py +47 -0
tpu_inference/layers/__init__.py +0 -0
tpu_inference/layers/common/__init__.py +0 -0
tpu_inference/layers/common/attention_metadata.py +34 -0
tpu_inference/layers/jax/__init__.py +0 -0
tpu_inference/layers/jax/attention/__init__.py +0 -0
tpu_inference/layers/jax/attention/attention.py +254 -0
tpu_inference/layers/jax/attention/deepseek_v3_attention.py +354 -0
tpu_inference/layers/jax/attention/llama4_attention.py +153 -0
tpu_inference/layers/jax/attention_interface.py +356 -0
tpu_inference/layers/jax/base.py +151 -0
tpu_inference/layers/jax/binary_search.py +295 -0
tpu_inference/layers/jax/constants.py +88 -0
tpu_inference/layers/jax/layers.py +301 -0
tpu_inference/layers/jax/misc.py +16 -0
tpu_inference/layers/jax/moe/__init__.py +0 -0
tpu_inference/layers/jax/moe/deepseek_v3_moe.py +608 -0
tpu_inference/layers/jax/moe/moe.py +209 -0
tpu_inference/layers/jax/rope.py +172 -0
tpu_inference/layers/jax/rope_interface.py +214 -0
tpu_inference/layers/jax/sample/__init__.py +0 -0
tpu_inference/layers/jax/sample/rejection_sampler.py +515 -0
tpu_inference/layers/jax/sample/sampling.py +95 -0
tpu_inference/layers/jax/sample/sampling_metadata.py +69 -0
tpu_inference/layers/jax/sharding.py +406 -0
tpu_inference/layers/jax/transformer_block.py +76 -0
tpu_inference/layers/vllm/__init__.py +0 -0
tpu_inference/layers/vllm/attention.py +184 -0
tpu_inference/layers/vllm/fused_moe.py +399 -0
tpu_inference/layers/vllm/linear_common.py +186 -0
tpu_inference/layers/vllm/quantization/__init__.py +34 -0
tpu_inference/layers/vllm/quantization/awq.py +207 -0
tpu_inference/layers/vllm/quantization/common.py +105 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +0 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +121 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +0 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +208 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +136 -0
tpu_inference/layers/vllm/quantization/unquantized.py +263 -0
tpu_inference/layers/vllm/sharding.py +151 -0
tpu_inference/logger.py +10 -0
tpu_inference/lora/__init__.py +0 -0
tpu_inference/lora/torch_lora_ops.py +103 -0
tpu_inference/lora/torch_punica_tpu.py +308 -0
tpu_inference/mock/__init__.py +0 -0
tpu_inference/mock/vllm_config_utils.py +28 -0
tpu_inference/mock/vllm_envs.py +1233 -0
tpu_inference/mock/vllm_logger.py +212 -0
tpu_inference/mock/vllm_logging_utils.py +15 -0
tpu_inference/models/__init__.py +0 -0
tpu_inference/models/common/__init__.py +0 -0
tpu_inference/models/common/model_loader.py +433 -0
tpu_inference/models/jax/__init__.py +0 -0
tpu_inference/models/jax/deepseek_v3.py +868 -0
tpu_inference/models/jax/llama3.py +366 -0
tpu_inference/models/jax/llama4.py +473 -0
tpu_inference/models/jax/llama_eagle3.py +333 -0
tpu_inference/models/jax/phi3.py +376 -0
tpu_inference/models/jax/qwen2.py +375 -0
tpu_inference/models/jax/qwen2_5_vl.py +976 -0
tpu_inference/models/jax/qwen3.py +302 -0
tpu_inference/models/jax/utils/__init__.py +0 -0
tpu_inference/models/jax/utils/file_utils.py +96 -0
tpu_inference/models/jax/utils/multi_modal_utils.py +164 -0
tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
tpu_inference/models/jax/utils/quantization/quantization_utils.py +588 -0
tpu_inference/models/jax/utils/weight_utils.py +510 -0
tpu_inference/models/vllm/__init__.py +0 -0
tpu_inference/models/vllm/vllm_model_wrapper.py +272 -0
tpu_inference/models/vllm/vllm_model_wrapper_context.py +45 -0
tpu_inference/platforms/__init__.py +2 -0
tpu_inference/platforms/tpu_jax.py +257 -0
tpu_inference/runner/__init__.py +0 -0
tpu_inference/runner/block_table_jax.py +122 -0
tpu_inference/runner/compilation_manager.py +672 -0
tpu_inference/runner/input_batch_jax.py +435 -0
tpu_inference/runner/kv_cache.py +119 -0
tpu_inference/runner/kv_cache_manager.py +460 -0
tpu_inference/runner/lora_utils.py +92 -0
tpu_inference/runner/multimodal_manager.py +208 -0
tpu_inference/runner/persistent_batch_manager.py +244 -0
tpu_inference/runner/speculative_decoding_manager.py +250 -0
tpu_inference/runner/structured_decoding_manager.py +89 -0
tpu_inference/runner/tpu_jax_runner.py +771 -0
tpu_inference/runner/utils.py +426 -0
tpu_inference/spec_decode/__init__.py +0 -0
tpu_inference/spec_decode/jax/__init__.py +0 -0
tpu_inference/spec_decode/jax/eagle3.py +334 -0
tpu_inference/tpu_info.py +77 -0
tpu_inference/utils.py +294 -0
tpu_inference/worker/__init__.py +0 -0
tpu_inference/worker/_temporary_vllm_compat.py +129 -0
tpu_inference/worker/base.py +100 -0
tpu_inference/worker/tpu_worker_jax.py +321 -0
tpu_inference-0.11.1.dist-info/METADATA +101 -0
tpu_inference-0.11.1.dist-info/RECORD +168 -0
tpu_inference-0.11.1.dist-info/WHEEL +5 -0
tpu_inference-0.11.1.dist-info/licenses/LICENSE +201 -0
tpu_inference-0.11.1.dist-info/top_level.txt +2 -0

tpu_inference/distributed/tpu_connector.py ADDED Viewed

@@ -0,0 +1,699 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Proxy server routes the request to P with max_output_tokens=1
+P workflow:
+    P receives the request
+    P scheduler checks if the prefill is fully done in `request_finished()`
+    If done:
+        P puts the request-id in `scheduler_output.finished_req_ids`
+            and puts the request in `scheduler_output.kv_connector_metadata.reqs_to_send`
+        P responds to the proxy server with `finished_req_ids` and the `kv_transfer_params`
+        P worker gets `reqs_to_send` and runs async `_prepare_kv_and_wait()`
+    Else:
+        P schedules the prefill with multiple turns due to chunked-prefill.
+    P worker checks if the request has been pulled by D
+    If done:
+        P worker puts the request-id in `done_sending()`
+        P scheduler frees blocks for the request in done sending.
+    Else:
+        P holds the blocks for the request until it's pulled by D
+    (
+        One scheduler step can finish:
+            scheduler RUNNING -> connector reqs_to_send -> worker prefill -> output
+        The waiting buffer will get freed after notified by D or expired.
+    )
+Proxy server receives the response from P and forwards it to D
+D workflow:
+    D receives the request
+    D scheduler calculates the num of tokens needing to pull from P in `get_num_new_matched_tokens()`
+    D checks if need to pull from P
+    If true:
+        D puts the request in `scheduler_output.kv_connector_metadata.reqs_to_load`
+        D worker gets `reqs_to_load` and runs `_pull_and_write_kv()` in separate threads (to be async)
+        D worker checks if the async loading is done:
+            If done:
+                D worker puts the request-id in `done_recving`.
+                D scheduler then knows the request can be scheduled for decoding now. The model decode
+                  will happen in the next scheduler step.
+            Else:
+                D worker handles other requests first.
+    Else (too short prompt, full local prefix-cache):
+        D still needs to put the request in `reqs_to_load` but with None metadata, because D needs to
+            notify P that the prefilled KV cache is no longer needed and can be freed in P.
+    (
+        Two scheduler steps can finish:
+            scheduler WAITING_FOR_REMOTE_KVS -> connector reqs_to_load -> worker wait for pulling
+            worker pulling done, notify P to free blocks
+            scheduler RUNNING -> connector reqs_to_load=None -> worker decode -> output
+        The waiting buffer will get freed after notified by D or expired.
+    )
+"""
+import copy
+import functools
+import os
+import threading
+import time
+from concurrent.futures import Future, ThreadPoolExecutor
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any, Optional
+from uuid import uuid4
+import jax
+import jax.numpy as jnp
+import numpy as np
+import zmq
+from jax.experimental.transfer import start_transfer_server
+from jax.sharding import Mesh
+from vllm.config import VllmConfig
+from vllm.distributed.kv_transfer.kv_connector.v1.base import (
+    KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
+from vllm.utils import make_zmq_path, make_zmq_socket, round_down
+from vllm.v1.core.sched.output import SchedulerOutput
+from vllm.v1.request import RequestStatus
+if TYPE_CHECKING:
+    from vllm.v1.core.kv_cache_manager import KVCacheBlocks
+    from vllm.v1.request import Request
+from tpu_inference.distributed.utils import (get_host_ip, get_kv_ips,
+                                             get_kv_ports,
+                                             get_kv_transfer_port, get_node_id,
+                                             get_side_channel_port)
+from tpu_inference.logger import init_logger
+from tpu_inference.runner.tpu_jax_runner import TPUModelRunner
+from tpu_inference.utils import device_array
+# Alias for request ids; used as the key type of the per-request dicts in this module.
+ReqId = str
+# Feature requests:
+# 1. support async pulling natively
+# 2. partial pulling (like RDMA)
+# 3. non-blocking jax array read/write
+# A KV cache that is waiting to be pulled will be cleared after
+# this time (in seconds) if no pull occurred on it.
+P2P_WAIT_PULL_TIMEOUT = 120
+logger = init_logger(__name__)
+@dataclass
+class SendMeta:
+    """Scheduler-to-worker record for one request whose KV cache P keeps available for D to pull."""
+    # Transfer id created in request_finished(); the worker passes it to await_pull().
+    uuid: int
+    # Block ids whose KV is selected from the runner's kv_caches and offered for pulling.
+    local_block_ids: list[int]
+    # Deadline computed as time.perf_counter() + P2P_WAIT_PULL_TIMEOUT.
+    expiration_time: float
+@dataclass
+class LoadMeta:
+    """Scheduler-to-worker record telling a D worker what to pull from P, or which P to notify."""
+    # Transfer id copied from the request's kv_transfer_params["uuid"].
+    uuid: int
+    # Blocks allocated on D; None when the entry only carries a "pull done" notification.
+    local_block_ids: list[int] | None
+    # Blocks prepared on P; None marks a notification-only entry (see process_send_load()).
+    remote_block_ids: list[int] | None
+    # A single IP for single-host, or a list of IPs for multi-host.
+    remote_host: str | list[str]
+    # A single port for single-host, or a list of ports for multi-host.
+    remote_port: int | list[int]
+@dataclass
+class _kv_transfer_params:
+    """
+    P prepares this in request_finished() and responds to proxy server.
+    D receives this from proxy server and uses this to create LoadMeta.
+    """
+    uuid: int
+    remote_block_ids: list[int]
+    # A single IP for single-host, or a list of IPs for multi-host.
+    remote_host: str | list[str]
+    # A single port for single-host, or a list of ports for multi-host.
+    remote_port: int | list[int]
+# The metadata used for communicating between scheduler and worker connectors.
+@dataclass
+class TPUConnectorMetadata(KVConnectorMetadata):
+    """Per-step payload built by the scheduler connector and consumed by the worker connector."""
+    # Filled on P: requests that finished prefilling and wait to be pulled.
+    reqs_to_send: dict[ReqId, SendMeta] = field(default_factory=dict)
+    # Filled on D: requests that must pull KV from P, or notify P that pulling is done.
+    reqs_to_load: dict[ReqId, LoadMeta] = field(default_factory=dict)
+class TPUConnector(KVConnectorBase_V1):
+    """
+    KV connector entry point for TPU disaggregated serving (prefill "P" / decode "D").
+    Depending on `role`, an instance owns either a TPUConnectorScheduler or a
+    TPUConnectorWorker and forwards every method to it.
+    """
+    def __init__(self, vllm_config: VllmConfig, role: KVConnectorRole):
+        # NOTE(review): the base-class __init__ is not invoked here; whatever state it
+        # would set up (presumably `_connector_metadata`) must be assigned elsewhere
+        # before use -- confirm this is intended.
+        assert vllm_config.kv_transfer_config is not None
+        # Exactly one of the two halves is created; any other role leaves both
+        # attributes unset.
+        if role == KVConnectorRole.SCHEDULER:
+            self.connector_scheduler = \
+                TPUConnectorScheduler(vllm_config)
+            self.connector_worker = None
+        elif role == KVConnectorRole.WORKER:
+            self.connector_scheduler = None
+            self.connector_worker = TPUConnectorWorker(vllm_config)
+    ############################################################
+    # Scheduler Side Methods
+    ############################################################
+    def get_num_new_matched_tokens(
+            self, request: "Request",
+            num_computed_tokens: int) -> tuple[int, bool]:
+        """Forward to TPUConnectorScheduler.get_num_new_matched_tokens()."""
+        assert self.connector_scheduler is not None
+        return self.connector_scheduler.get_num_new_matched_tokens(
+            request, num_computed_tokens)
+    def update_state_after_alloc(self, request: "Request",
+                                 blocks: "KVCacheBlocks",
+                                 num_external_tokens: int):
+        """Forward to TPUConnectorScheduler.update_state_after_alloc()."""
+        assert self.connector_scheduler is not None
+        return self.connector_scheduler.update_state_after_alloc(
+            request, blocks, num_external_tokens)
+    def build_connector_meta(
+        self,
+        scheduler_output: SchedulerOutput,
+    ) -> TPUConnectorMetadata:
+        """
+        Return the metadata accumulated by the scheduler connector.
+        `scheduler_output` is accepted for interface compatibility but not used.
+        """
+        assert self.connector_scheduler is not None
+        return self.connector_scheduler.build_connector_meta()
+    def request_finished(
+        self,
+        request: "Request",
+        block_ids: list[int],
+    ) -> tuple[bool, Optional[dict[str, Any]]]:
+        """Forward to TPUConnectorScheduler.request_finished()."""
+        assert self.connector_scheduler is not None
+        return self.connector_scheduler.request_finished(request, block_ids)
+    ############################################################
+    # Worker Side Methods
+    ############################################################
+    def register_kv_caches(self, kv_caches: list[jax.Array]):
+        """
+        We don't register kv_caches in connector, we call `register_runner` and
+        use runner.kv_caches directly instead because the ref of runner.kv_caches
+        would be reassigned during model forward.
+        """
+        pass
+    def register_runner(self, runner: TPUModelRunner) -> None:
+        """Hand the model runner to the worker connector (worker role only)."""
+        assert self.connector_worker is not None
+        self.connector_worker.register_runner(runner)
+    def start_load_kv(self, _, **kwargs) -> None:
+        """
+        Process both the pending sends (P) and the pending loads (D) for this step.
+        The positional argument and any keyword arguments are ignored.
+        """
+        assert self.connector_worker is not None
+        # `_connector_metadata` is not assigned in this class; presumably it is bound
+        # by the base class before this call -- verify.
+        assert isinstance(self._connector_metadata, TPUConnectorMetadata)
+        self.connector_worker.process_send_load(self._connector_metadata)
+    def wait_for_layer_load(self, layer_name: str) -> None:
+        """TPU connector doesn't support layer wise load."""
+        pass
+    def save_kv_layer(self, **kwargs) -> None:
+        """TPU connector doesn't support layer wise save."""
+        pass
+    def wait_for_save(self):
+        """
+        Not useful for TPU, because by the design of vLLM KVConnectorModelRunnerMixin,
+        this function is only called when scheduler_output.total_num_scheduled_tokens is not 0.
+        But the reqs_to_send is only available after the req finished prefilling where the
+        total_num_scheduled_tokens could be 0 if no other running reqs.
+        So we run saving logic in `start_load_kv -> process_send_load` instead.
+        """
+        pass
+    def get_finished(self,
+                     finished_req_ids: set[str]) -> tuple[set[str], set[str]]:
+        """
+        Return the worker connector's get_finished() result.
+        `finished_req_ids` is not forwarded to the worker connector.
+        """
+        assert self.connector_worker is not None
+        return self.connector_worker.get_finished()
+class TPUConnectorScheduler():
+    """
+    Scheduler-side half of the TPU connector.
+    On P it records finished prefills in `reqs_to_send`; on D it records requests
+    that must pull KV from P (or notify P) in `reqs_to_load`. Both dicts are handed
+    to the worker through build_connector_meta() and then reset.
+    """
+    def __init__(self, vllm_config: "VllmConfig"):
+        self.vllm_config = vllm_config
+        self.config = vllm_config.kv_transfer_config
+        self.is_producer = self.config.is_kv_producer
+        self.block_size = vllm_config.cache_config.block_size
+        # This is updated in self.request_finished() for P,
+        # each request that finished prefilling will be added to it.
+        self.reqs_to_send: dict[ReqId, SendMeta] = {}
+        # This is updated in self.update_state_after_alloc() for D,
+        # each request that needs to pull KV cache from remote (or to notify P
+        # that no pull is needed anymore) will be added to it.
+        self.reqs_to_load: dict[ReqId, LoadMeta] = {}
+        # Advertised to D as remote_host / remote_port in request_finished().
+        self.kv_ip = get_kv_ips()
+        self.kv_port = get_kv_ports()
+        logger.info(
+            f"Scheduler --> kv_ip={self.kv_ip} | kv_port={self.kv_port}")
+    def get_num_new_matched_tokens(
+        self,
+        request: "Request",
+        num_computed_tokens: int,
+    ) -> tuple[int, bool]:
+        """
+        D workers use this to get the number of new tokens
+        that can be loaded from remote P workers.
+        No-op for P workers.
+        Args:
+            request (Request): the request object.
+            num_computed_tokens (int): the number of locally
+                computed tokens for this request
+        Returns:
+            A tuple with the following elements:
+                - The number of tokens that will be loaded from the
+                  external KV cache.
+                - Whether the load is asynchronous. True whenever there is
+                  something to load (the blocking pull runs in a separate
+                  thread), False otherwise.
+        """
+        if self.is_producer:
+            return 0, False
+        assert num_computed_tokens % self.block_size == 0
+        # This rounding logic must be consistent with calculating
+        # remote_block_ids in P's request_finished()
+        rounded_num_prompt_tokens = round_down(len(request.prompt_token_ids),
+                                               self.block_size)
+        count = max(rounded_num_prompt_tokens - num_computed_tokens, 0)
+        # NOTE(xiang): Although the JAX P2P pulling is a blocking op, we will run it in a
+        # separate thread to make it async, so we are safe to return True here.
+        if count > 0:
+            return count, True
+        return 0, False
+    def update_state_after_alloc(self, request: "Request",
+                                 blocks: "KVCacheBlocks",
+                                 num_external_tokens: int):
+        """
+        Update states after block allocation.
+        No-op for P workers.
+        Args:
+            request (Request): the request object.
+            blocks (KVCacheBlocks): the blocks allocated for the request.
+            num_external_tokens (int): the number of tokens that will be
+                loaded from the external KV cache.
+        """
+        if self.is_producer:
+            return
+        # NOTE(review): `params` is indexed unconditionally below. request_finished()
+        # returns an empty dict when nothing is transferred, so if such params (or
+        # None) ever reach D the `params["uuid"]` lookup would raise -- confirm the
+        # proxy never forwards that case.
+        params = request.kv_transfer_params
+        if num_external_tokens > 0:
+            # We need to load KV-cache from remote (partial prefix cache hit).
+            local_block_ids = blocks.get_block_ids()[0]
+            # NOTE(xiang): D needs to pull the whole prefill blocks from the remote
+            # regardless how much ratio the prefix cache hits.
+            # The reason is JAX P2P doesn't work as RDMA, instead it works like:
+            # P just prepares the whole prefilled data and waits for pulling, then D pulls the
+            # whole data. Which means even with partial prefix cache hit on D, D cannot only
+            # pull the remaining partial data from P.
+            # Unless we implement a side channel to let P know the prefix cache hit info on D,
+            # so P can prepare those non-hit KV only, with that we need to change to:
+            # local_block_ids = blocks.get_unhashed_block_ids()
+            self.reqs_to_load[request.request_id] = LoadMeta(
+                uuid=params["uuid"],
+                local_block_ids=local_block_ids,
+                remote_block_ids=params["remote_block_ids"],
+                remote_host=params["remote_host"],
+                remote_port=params["remote_port"],
+            )
+        else:
+            # This branch means two cases:
+            # 1. We don't need to load KV-cache from remote because of full local cache.
+            # 2. The async pulling is done.
+            # In both cases we need to send notification to let P free memory.
+            # The None block ids are what the worker uses to tell a notification
+            # apart from a real pull.
+            self.reqs_to_load[request.request_id] = LoadMeta(
+                uuid=params["uuid"],
+                local_block_ids=None,
+                remote_block_ids=None,
+                remote_host=params["remote_host"],
+                remote_port=params["remote_port"],
+            )
+        logger.info(f"Scheduler -->  reqs_to_load={self.reqs_to_load}")
+    def build_connector_meta(self) -> TPUConnectorMetadata:
+        """
+        Build the scheduler metadata and pass to the downstream worker.
+        This function should NOT modify fields in the scheduler_output.
+        Also, calling this function will reset the state of the connector.
+        """
+        meta = TPUConnectorMetadata()
+        # Hand over the accumulated dict and start a fresh one, so each entry is
+        # delivered to the worker exactly once.
+        if self.is_producer:
+            meta.reqs_to_send = self.reqs_to_send
+            self.reqs_to_send = {}
+        else:
+            meta.reqs_to_load = self.reqs_to_load
+            self.reqs_to_load = {}
+        return meta
+    def request_finished(
+        self,
+        request: "Request",
+        block_ids: list[int],
+    ) -> tuple[bool, Optional[dict[str, Any]]]:
+        """
+        Called when a request has finished, before its blocks are freed.
+        No-op for D workers.
+        Args:
+            request (Request): the request object.
+            block_ids: The block IDs allocated for this request and need to be freed.
+        Returns:
+            True if the request is being saved/sent asynchronously and blocks
+            should not be freed until the request_id is returned from
+            get_finished().
+            Optional KVTransferParams to be included in the request outputs
+            returned by the engine (an empty dict when no block is transferred).
+        """
+        if not self.is_producer:
+            return False, None
+        # Mark the request finished only if the prefill is done and generates 1 output token.
+        # The request's max_tokens has been reset to 1, so it must have finished by hitting
+        # the length cap.
+        if request.status != RequestStatus.FINISHED_LENGTH_CAPPED:
+            return False, None
+        # NOTE(xiang): Get computed blocks rounded by block_size.
+        # This means that for the last partially filled block, we won't bother transferring
+        # KV-cache, will just let D run prefill locally.
+        all_full = request.num_computed_tokens % self.block_size == 0
+        computed_block_ids = block_ids if all_full else block_ids[:-1]
+        # If prompt < block_size, no transfer so free blocks immediately.
+        delay_free_blocks = len(computed_block_ids) > 0
+        if delay_free_blocks:
+            uuid = get_uuid()
+            expiration_time = time.perf_counter() + P2P_WAIT_PULL_TIMEOUT
+            self.reqs_to_send[request.request_id] = SendMeta(
+                uuid=uuid,
+                local_block_ids=computed_block_ids,
+                expiration_time=expiration_time)
+            # These params travel back through the proxy to D, which turns them
+            # into a LoadMeta in update_state_after_alloc().
+            kv_transfer_params = dict(uuid=uuid,
+                                      remote_block_ids=computed_block_ids,
+                                      remote_host=self.kv_ip,
+                                      remote_port=self.kv_port)
+            logger.info(f"Scheduler ---->  reqs_to_send={self.reqs_to_send} | "
+                        f"kv_transfer_params={kv_transfer_params}")
+        else:
+            kv_transfer_params = {}
+        return delay_free_blocks, kv_transfer_params
+class TPUConnectorWorker:
+    def __init__(self, vllm_config: VllmConfig):
+        """
+        Set up the worker-side connector: start the KV transfer server, then either
+        the pull-notification listener thread (P) or the pull thread pool (D).
+        The runner and mesh stay None until register_runner() is called.
+        """
+        self.vllm_config = vllm_config
+        self.config = vllm_config.kv_transfer_config
+        self.is_producer = self.config.is_kv_producer
+        self.runner: TPUModelRunner = None
+        self.mesh: Mesh = None
+        self.multi_host = os.getenv("TPU_MULTIHOST_BACKEND",
+                                    "").lower() == "ray"
+        # NOTE(xiang): This can not be the worker rank set in RayDistributedExecutor.
+        # The worker rank is assigned with vLLM's sorting logic, which does not work
+        # for TPU host topology.
+        self.node_id = get_node_id()
+        # req_id: [kv, expiration_time] -- a mutable list, because the listener
+        # thread overwrites the expiration time with -1 once D reports it is done.
+        self.reqs_wait_pull: dict[ReqId, list[list[jax.Array], float]] = {}
+        # req_id: thread_future
+        self.reqs_pulling: dict[ReqId, Future] = {}
+        self.host_ip = get_host_ip()
+        self.kv_transfer_port = get_kv_transfer_port()
+        self.side_channel_port = get_side_channel_port()
+        self.kv_transfer_server = None
+        self._maybe_start_p2p_server()
+        self.zmq_cxt = zmq.Context()
+        if self.is_producer:
+            # P: listen for "pull done" notifications from D on the side channel.
+            ready_event = threading.Event()
+            self.pull_notify_listener_t = threading.Thread(
+                target=self._pull_notify_listener,
+                args=(ready_event, ),
+                daemon=True,
+            )
+            self.pull_notify_listener_t.start()
+            # NOTE(review): this wait has no timeout; the listener only sets the event
+            # after its socket is created, so a failure there would block forever.
+            ready_event.wait()
+        else:
+            # D: pulls run in background threads; connections and notification
+            # sockets are cached in these dicts.
+            self.pull_executor = ThreadPoolExecutor(max_workers=64)
+            self.pull_conns: dict[str, Any] = {}
+            self.notif_sockets: dict[str, zmq.Socket] = {}
+        logger.info(f"Worker {self.node_id} --> init | "
+                    f"ip={self.host_ip} | "
+                    f"kv_transfer_port={self.kv_transfer_port} | "
+                    f"side_channel_port={self.side_channel_port}")
+    def __del__(self):
+        if self.is_producer:
+            self.pull_notify_listener_t.join(timeout=0)
+        else:
+            self.pull_executor.shutdown(wait=False)
+        self.zmq_cxt.destroy(linger=0)
+    def register_runner(self, runner: TPUModelRunner):
+        """
+        Keep a reference to the model runner and record the layout of its KV cache.
+        The first layer's shape, dtype and sharding are taken as the spec for the
+        KV cache; the number of layers is the length of `runner.kv_caches`.
+        """
+        self.runner = runner
+        self.mesh = runner.mesh
+        # Get the spec of the kv_caches
+        kv_caches = runner.kv_caches
+        kv_layer = kv_caches[0]
+        self.num_layers = len(kv_caches)
+        # Stored as a list so a copy can have its leading dimension replaced later.
+        self.shape = list(kv_layer.shape)
+        self.dtype = kv_layer.dtype
+        self.sharding = kv_layer.sharding
+    def _maybe_start_p2p_server(self):
+        """Start the JAX transfer server on this host once; later calls are no-ops."""
+        if self.kv_transfer_server is not None:
+            return
+        server_addr = f"{self.host_ip}:{self.kv_transfer_port}"
+        # Port 0 here -- presumably lets the OS pick the transport port; verify
+        # against the jax transfer server API.
+        transport_addr = f'{self.host_ip}:0'
+        self.kv_transfer_server = start_transfer_server(
+            jax.local_devices()[0].client,
+            server_addr,
+            [transport_addr],
+            max_num_parallel_copies=8,
+            transfer_size=256 * 1024 * 1024,
+            # Raw buffers are disabled on purpose; see the note in
+            # _prepare_kv_and_wait() about buffers that are never pulled.
+            use_raw_buffers=False,
+        )
+        logger.info(
+            f"Worker {self.node_id} --> kv transfer | addr={self.kv_transfer_server.address()}"
+        )
+    def _pull_notify_listener(self, ready_event: threading.Event):
+        sock_path = make_zmq_path("tcp", "*", self.side_channel_port)
+        sock = make_zmq_socket(ctx=self.zmq_cxt,
+                               path=sock_path,
+                               socket_type=zmq.ROUTER,
+                               bind=True)
+        ready_event.set()
+        logger.info(
+            f"Worker {self.node_id} --> zmq listener | sock_path={sock_path}")
+        while True:
+            client_id, req_id_bytes = sock.recv_multipart()
+            req_id = req_id_bytes.decode('utf-8')
+            logger.info(
+                f"Worker {self.node_id} --> zmq recieve | req_id={req_id}")
+            if req_id in self.reqs_wait_pull:
+                # Set the expiration time of this request to -1, mark to be done
+                self.reqs_wait_pull[req_id][1] = -1
+            else:
+                raise ValueError(
+                    f"Disagg producer recives a non-exist pulling finished notification request {req_id}"
+                )
+            time.sleep(0)
+            # The response is not really needed.
+            # sock.send_multipart([client_id, b"", b"ACK"])
+    def process_send_load(self, metadata: TPUConnectorMetadata):
+        """
+        This is called in runner before calling model forward,
+        whether the scheduler_output.total_num_scheduled_tokens is empty or not.
+        On P it publishes the KV of every request in `reqs_to_send`; on D it either
+        starts a background pull or notifies P for every request in `reqs_to_load`.
+        """
+        reqs = metadata.reqs_to_send
+        if reqs:
+            assert self.is_producer
+            logger.info(f"Worker {self.node_id} -->  reqs_to_send={reqs}")
+        for req_id, req_meta in reqs.items():
+            self._prepare_kv_and_wait(req_id, req_meta)
+        reqs = metadata.reqs_to_load
+        if reqs:
+            assert not self.is_producer
+            logger.info(f"Worker {self.node_id} -->  reqs_to_load={reqs}")
+        for req_id, req_meta in reqs.items():
+            if req_meta.remote_block_ids is not None:
+                # The request requires to pull KV from P, build the connection and pull
+                # the data asynchronously; the future is kept in reqs_pulling.
+                conn = self._maybe_build_kv_connection(req_meta)
+                self.reqs_pulling[req_id] = self.pull_executor.submit(
+                    self._pull_kv, conn, req_meta)
+            else:
+                # The request has finished pulling the KV from remote, or it has full local
+                # prefix cache, need to notify P to let it free blocks.
+                socket = self._maybe_build_notif_socket(req_meta)
+                self._notify_pull_done(socket, req_id)
+    def _prepare_kv_and_wait(self, req_id: str, req_meta: SendMeta):
+        """
+        On P: gather the KV of the request's blocks and offer it to D for pulling.
+        The gathered KV is stored in `reqs_wait_pull` together with its expiration
+        time, then registered with the transfer server under `req_meta.uuid`.
+        """
+        local_block_ids = req_meta.local_block_ids
+        # TODO(xiang): pad block_ids to avoid recompilation
+        indices = device_array(self.mesh, np.array(local_block_ids))
+        kv = select_from_kv_caches(self.runner.kv_caches, indices)
+        # NOTE(xiang): We need to manually store the kv because:
+        # Although we can set use_raw_buffers=True to let kv be safely destroyed after
+        # calling await_pull, it could become a stranded buffer if D never pulls it.
+        # So we have to set use_raw_buffers=False and store the kv, then the kv buffer
+        # will be safely destroyed by either D notifying or expiration.
+        self.reqs_wait_pull[req_id] = [kv, req_meta.expiration_time]
+        self.kv_transfer_server.await_pull(req_meta.uuid, kv)
+    def _maybe_build_kv_connection(self, req_meta: LoadMeta) -> Any:
+        remote_addr = f"{req_meta.remote_host}:{req_meta.remote_port}"
+        if remote_addr in self.pull_conns:
+            conn = self.pull_conns[remote_addr]
+        else:
+            conn = self.kv_transfer_server.connect(remote_addr)
+            self.pull_conns[remote_addr] = conn
+            logger.info(
+                f"Worker {self.node_id} --> kv transfer | connect={remote_addr}"
+            )
+        return conn
+    def _pull_kv(self, conn: Any, req_meta: LoadMeta):
+        # The local allocated blocks which don't hit prefix caching.
+        local_block_ids = req_meta.local_block_ids
+        # The remote computed blocks which need to pull from P.
+        remote_block_ids = req_meta.remote_block_ids
+        # Make sure they have the same num blocks because we don't care
+        # if partial prefix cache hit now.
+        assert len(local_block_ids) == len(remote_block_ids)
+        kv_spec = self._get_kv_spec(len(remote_block_ids))
+        # TODO(xiang): pad block_ids to avoid recompilation
+        indices = device_array(self.mesh, np.array(local_block_ids))
+        kv = conn.pull(req_meta.uuid, kv_spec)
+        logger.info(
+            f"Worker {self.node_id} --> kv transfer | pull uuid={req_meta.uuid}"
+        )
+        return kv, indices
+    def _get_kv_spec(self, num_blocks: int) -> list[jax.ShapeDtypeStruct]:
+        assert num_blocks <= self.shape[0]
+        shape = copy.copy(self.shape)
+        shape[0] = num_blocks
+        return [
+            jax.ShapeDtypeStruct(shape, self.dtype, sharding=self.sharding)
+        ] * self.num_layers
+    def _maybe_build_notif_socket(self, req_meta: LoadMeta) -> zmq.Socket:
+        sock_path = make_zmq_path("tcp", req_meta.remote_host,
+                                  self.side_channel_port)
+        if sock_path in self.notif_sockets:
+            sock = self.notif_sockets[sock_path]
+        else:
+            sock = make_zmq_socket(ctx=self.zmq_cxt,
+                                   path=sock_path,
+                                   socket_type=zmq.DEALER,
+                                   bind=False)
+            logger.info(
+                f"Worker {self.node_id} --> zmq notify | sock_path={sock_path}"
+            )
+        return sock
+    def _notify_pull_done(self, sock: zmq.Socket, req_id: str):
+        """Log and send ``req_id`` as a string message over ``sock``.
+
+        No reply is read back from the socket.
+
+        Args:
+            sock: Socket the notification is sent on.
+            req_id: Id of the request being reported.
+        """
+        logger.info(f"Worker {self.node_id} --> zmq notify | req_id={req_id}")
+        sock.send_string(req_id)
+        # The response is not really needed.
+        # ack = sock.recv_string()
+    def get_finished(self) -> tuple[set[str], set[str]]:
+        done_sending: set[str] = set()
+        done_recving: set[str] = set()
+        if not self.reqs_wait_pull and not self.reqs_pulling:
+            return done_sending, done_recving
+        # Mark a req as done recieving after its pulling thread returns.
+        # This req can then be scheduled for decoding in the next scheduler step.
+        for req_id in list(self.reqs_pulling.keys()):
+            future = self.reqs_pulling[req_id]
+            if future.done():
+                # NOTE(xiang): we do the scatter in main thread to avoid data racing.
+                # The data racing is not for the kv_caches buffer, it's for the runner.kv_caches ref.
+                kv, indices = future.result()
+                self.runner.kv_caches = scatter_kv_slices(
+                    self.runner.kv_caches, kv, indices)
+                del self.reqs_pulling[req_id]
+                done_recving.add(req_id)
+        # Mark a req as done seding when it's expired.
+        # This req can then be released blocks in the current scheduler step.
+        now = time.perf_counter()
+        for req_id in list(self.reqs_wait_pull):
+            _, expires = self.reqs_wait_pull[req_id]
+            if now > expires:
+                del self.reqs_wait_pull[req_id]
+                done_sending.add(req_id)
+        if done_sending:
+            logger.info(
+                f"Worker {self.node_id} -->  done_sending={done_sending}")
+        if done_recving:
+            logger.info(
+                f"Worker {self.node_id} -->  done_recving={done_recving}")
+        return done_sending, done_recving
+def get_uuid() -> int:
+    int128 = uuid4().int
+    # Must be 64-bit int, otherwise vllm output encoder would raise error.
+    int64 = int128 >> 64
+    return int64
+@jax.jit
+def select_from_kv_caches(kv_caches: list[jax.Array],
+                          indices: list[jax.Array]) -> list[jax.Array]:
+    selected = [cache.at[indices].get() for cache in kv_caches]
+    return selected
+@functools.partial(
+    jax.jit,
+    donate_argnames=("kv_caches", ),
+)
+def scatter_kv_slices(kv_caches: list[jax.Array], kv_slices: list[jax.Array],
+                      indices: list[jax.Array]) -> list[jax.Array]:
+    num_indices = indices.shape[0]
+    num_slices = kv_slices[0].shape[0]
+    # indices might be padded
+    assert num_slices <= num_indices
+    new_kv_caches = []
+    for cache, slice in zip(kv_caches, kv_slices):
+        if num_slices < num_indices:
+            slice = jnp.pad(slice, ((0, num_indices - num_slices), (0, 0),
+                                    (0, 0), (0, 0)))
+        new_cache = cache.at[indices].set(slice)
+        new_kv_caches.append(new_cache)
+    return new_kv_caches