sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. The information in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +16 -10
- sglang/bench_one_batch.py +5 -4
- sglang/bench_one_batch_server.py +86 -22
- sglang/bench_serving.py +197 -110
- sglang/compile_deep_gemm.py +4 -4
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +66 -29
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +47 -9
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +187 -134
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +4 -13
- sglang/srt/disaggregation/kv_events.py +412 -0
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +84 -70
- sglang/srt/disaggregation/mooncake/conn.py +441 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
- sglang/srt/disaggregation/nixl/conn.py +124 -442
- sglang/srt/disaggregation/prefill.py +128 -44
- sglang/srt/disaggregation/utils.py +154 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +11 -0
- sglang/srt/entrypoints/engine.py +129 -12
- sglang/srt/entrypoints/http_server.py +21 -6
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +302 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +205 -0
- sglang/srt/function_call/ebnf_composer.py +248 -0
- sglang/srt/function_call/function_call_parser.py +202 -0
- sglang/srt/function_call/llama32_detector.py +93 -0
- sglang/srt/function_call/mistral_detector.py +131 -0
- sglang/srt/function_call/pythonic_detector.py +229 -0
- sglang/srt/function_call/qwen25_detector.py +121 -0
- sglang/srt/function_call/utils.py +52 -0
- sglang/srt/hf_transformers_utils.py +50 -7
- sglang/srt/layers/attention/aiter_backend.py +878 -0
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +166 -35
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +247 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/utils.py +2 -2
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +517 -0
- sglang/srt/layers/dp_attention.py +6 -15
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +370 -0
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
- sglang/srt/layers/moe/ep_moe/layer.py +195 -87
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
- sglang/srt/layers/moe/topk.py +107 -24
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +10 -4
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +60 -59
- sglang/srt/layers/quantization/fp8.py +113 -18
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -43
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +18 -5
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +20 -8
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
- sglang/srt/managers/eplb_manager.py +96 -0
- sglang/srt/managers/expert_distribution.py +878 -56
- sglang/srt/managers/expert_location.py +448 -0
- sglang/srt/managers/expert_location_dispatch.py +108 -0
- sglang/srt/managers/io_struct.py +29 -5
- sglang/srt/managers/mm_utils.py +355 -151
- sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
- sglang/srt/managers/multimodal_processors/internvl.py +18 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
- sglang/srt/managers/multimodal_processors/llava.py +3 -3
- sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
- sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
- sglang/srt/managers/schedule_batch.py +185 -55
- sglang/srt/managers/schedule_policy.py +4 -5
- sglang/srt/managers/scheduler.py +389 -154
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +231 -39
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +74 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +11 -2
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +87 -65
- sglang/srt/model_executor/expert_location_updater.py +557 -0
- sglang/srt/model_executor/forward_batch_info.py +39 -14
- sglang/srt/model_executor/model_runner.py +231 -101
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +732 -403
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +75 -33
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/llama4.py +10 -2
- sglang/srt/models/llava.py +26 -18
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +7 -17
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +133 -35
- sglang/srt/models/qwen2_5_vl.py +5 -3
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +206 -69
- sglang/srt/models/qwen2_vl.py +3 -3
- sglang/srt/models/qwen3.py +92 -19
- sglang/srt/models/qwen3_moe.py +457 -55
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +114 -40
- sglang/srt/openai_api/protocol.py +37 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +189 -0
- sglang/srt/operations_strategy.py +207 -0
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +235 -38
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +181 -90
- sglang/srt/speculative/eagle_worker.py +146 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +197 -19
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +81 -42
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
- sglang-0.4.7.dist-info/RECORD +699 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- sglang-0.4.6.post4.dist-info/RECORD +0 -646
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
@@ -9,6 +9,8 @@ import queue
|
|
9
9
|
import socket
|
10
10
|
import struct
|
11
11
|
import threading
|
12
|
+
import time
|
13
|
+
from collections import defaultdict
|
12
14
|
from functools import cache
|
13
15
|
from typing import Dict, List, Optional, Tuple, Union
|
14
16
|
|
@@ -27,30 +29,33 @@ from sglang.srt.disaggregation.base.conn import (
|
|
27
29
|
KVPoll,
|
28
30
|
)
|
29
31
|
from sglang.srt.disaggregation.mooncake.transfer_engine import MooncakeTransferEngine
|
30
|
-
from sglang.srt.disaggregation.utils import
|
32
|
+
from sglang.srt.disaggregation.utils import (
|
33
|
+
DisaggregationMode,
|
34
|
+
FastQueue,
|
35
|
+
group_concurrent_contiguous,
|
36
|
+
)
|
31
37
|
from sglang.srt.server_args import ServerArgs
|
32
|
-
from sglang.srt.utils import
|
38
|
+
from sglang.srt.utils import (
|
39
|
+
get_free_port,
|
40
|
+
get_int_env_var,
|
41
|
+
get_ip,
|
42
|
+
get_local_ip_by_remote,
|
43
|
+
)
|
33
44
|
|
34
45
|
logger = logging.getLogger(__name__)
|
35
46
|
|
36
47
|
|
37
|
-
|
38
|
-
|
39
|
-
)
|
40
|
-
|
41
|
-
|
42
|
-
return [], []
|
43
|
-
|
44
|
-
brk = np.where((np.diff(src_indices) != 1) | (np.diff(dst_indices) != 1))[0] + 1
|
45
|
-
src_groups = np.split(src_indices, brk)
|
46
|
-
dst_groups = np.split(dst_indices, brk)
|
47
|
-
|
48
|
-
src_groups = [g.tolist() for g in src_groups]
|
49
|
-
dst_groups = [g.tolist() for g in dst_groups]
|
48
|
+
class KVTransferError(Exception):
|
49
|
+
def __init__(self, bootstrap_room: int, failure_reason: str):
|
50
|
+
super().__init__(failure_reason)
|
51
|
+
self.bootstrap_room = bootstrap_room
|
52
|
+
self.failure_reason = failure_reason
|
50
53
|
|
51
|
-
|
54
|
+
def __str__(self):
|
55
|
+
return f"KVTransferError(bootstrap_room={self.bootstrap_room}): {self.failure_reason}"
|
52
56
|
|
53
57
|
|
58
|
+
# prefill
|
54
59
|
@dataclasses.dataclass
|
55
60
|
class TransferKVChunk:
|
56
61
|
room: int
|
@@ -60,6 +65,7 @@ class TransferKVChunk:
|
|
60
65
|
prefill_aux_index: Optional[int]
|
61
66
|
|
62
67
|
|
68
|
+
# decode
|
63
69
|
@dataclasses.dataclass
|
64
70
|
class TransferInfo:
|
65
71
|
room: int
|
@@ -93,6 +99,7 @@ class TransferInfo:
|
|
93
99
|
)
|
94
100
|
|
95
101
|
|
102
|
+
# decode
|
96
103
|
@dataclasses.dataclass
|
97
104
|
class KVArgsRegisterInfo:
|
98
105
|
room: str
|
@@ -145,18 +152,55 @@ class MooncakeKVManager(BaseKVManager):
|
|
145
152
|
self.server_socket = zmq.Context().socket(zmq.PULL)
|
146
153
|
self.register_buffer_to_engine()
|
147
154
|
if self.disaggregation_mode == DisaggregationMode.PREFILL:
|
148
|
-
self.transfer_queue = queue.Queue()
|
149
155
|
self.transfer_infos: Dict[int, Dict[str, TransferInfo]] = {}
|
150
156
|
self.decode_kv_args_table: Dict[str, KVArgsRegisterInfo] = {}
|
151
157
|
self.start_prefill_thread()
|
152
158
|
self._register_to_bootstrap()
|
153
|
-
|
159
|
+
self.session_failures = defaultdict(int)
|
160
|
+
self.failed_sessions = set()
|
161
|
+
self.session_lock = threading.Lock()
|
154
162
|
# Determine the number of threads to use for kv sender
|
155
163
|
cpu_count = os.cpu_count()
|
156
|
-
|
157
|
-
|
164
|
+
transfer_thread_pool_size = get_int_env_var(
|
165
|
+
"SGLANG_DISAGGREGATION_THREAD_POOL_SIZE",
|
166
|
+
min(max(4, int(0.75 * cpu_count) // 8), 12),
|
167
|
+
)
|
168
|
+
transfer_queue_size = get_int_env_var("SGLANG_DISAGGREGATION_QUEUE_SIZE", 4)
|
169
|
+
self.transfer_queues: List[FastQueue] = [
|
170
|
+
FastQueue() for _ in range(transfer_queue_size)
|
171
|
+
]
|
172
|
+
assert transfer_thread_pool_size >= transfer_queue_size, (
|
173
|
+
f"The environment variable SGLANG_DISAGGREGATION_THREAD_POOL_SIZE={transfer_thread_pool_size} must be "
|
174
|
+
f"greater than or equal to SGLANG_DISAGGREGATION_QUEUE_SIZE={transfer_queue_size}."
|
175
|
+
)
|
176
|
+
self.executors = [
|
177
|
+
concurrent.futures.ThreadPoolExecutor(
|
178
|
+
transfer_thread_pool_size // transfer_queue_size
|
179
|
+
)
|
180
|
+
for _ in range(transfer_queue_size)
|
181
|
+
]
|
182
|
+
for queue, executor in zip(self.transfer_queues, self.executors):
|
183
|
+
threading.Thread(
|
184
|
+
target=self.transfer_worker, args=(queue, executor), daemon=True
|
185
|
+
).start()
|
186
|
+
|
187
|
+
self.bootstrap_time_out = get_int_env_var(
|
188
|
+
"SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT", 30
|
158
189
|
)
|
159
190
|
elif self.disaggregation_mode == DisaggregationMode.DECODE:
|
191
|
+
self.heartbeat_failures = {}
|
192
|
+
self.session_pool = defaultdict(requests.Session)
|
193
|
+
self.session_pool_lock = threading.Lock()
|
194
|
+
self.addr_to_rooms_tracker = defaultdict(set)
|
195
|
+
self.connection_lock = threading.Lock()
|
196
|
+
# Heartbeat interval should be at least 2 seconds
|
197
|
+
self.heartbeat_interval = max(
|
198
|
+
float(os.getenv("SGLANG_DISAGGREGATION_HEARTBEAT_INTERVAL", 5.0)), 2.0
|
199
|
+
)
|
200
|
+
# Heartbeat failure should be at least 1
|
201
|
+
self.max_failures = max(
|
202
|
+
get_int_env_var("SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE", 2), 1
|
203
|
+
)
|
160
204
|
self.start_decode_thread()
|
161
205
|
self.connection_pool: Dict[str, Dict[str, Union[str, int]]] = {}
|
162
206
|
self.prefill_tp_size_table: Dict[str, int] = {}
|
@@ -166,6 +210,9 @@ class MooncakeKVManager(BaseKVManager):
|
|
166
210
|
f"Unsupported DisaggregationMode: {self.disaggregation_mode}"
|
167
211
|
)
|
168
212
|
|
213
|
+
self.failure_records: Dict[int, str] = {}
|
214
|
+
self.failure_lock = threading.Lock()
|
215
|
+
|
169
216
|
def register_buffer_to_engine(self):
|
170
217
|
for kv_data_ptr, kv_data_len in zip(
|
171
218
|
self.kv_args.kv_data_ptrs, self.kv_args.kv_data_lens
|
@@ -189,6 +236,7 @@ class MooncakeKVManager(BaseKVManager):
|
|
189
236
|
prefill_kv_indices: npt.NDArray[np.int64],
|
190
237
|
dst_kv_ptrs: list[int],
|
191
238
|
dst_kv_indices: npt.NDArray[np.int64],
|
239
|
+
executor: concurrent.futures.ThreadPoolExecutor,
|
192
240
|
):
|
193
241
|
# Group by indices
|
194
242
|
prefill_kv_blocks, dst_kv_blocks = group_concurrent_contiguous(
|
@@ -220,7 +268,7 @@ class MooncakeKVManager(BaseKVManager):
|
|
220
268
|
return 0
|
221
269
|
|
222
270
|
futures = [
|
223
|
-
|
271
|
+
executor.submit(
|
224
272
|
process_layer,
|
225
273
|
src_ptr,
|
226
274
|
dst_ptr,
|
@@ -232,8 +280,6 @@ class MooncakeKVManager(BaseKVManager):
|
|
232
280
|
for future in concurrent.futures.as_completed(futures):
|
233
281
|
status = future.result()
|
234
282
|
if status != 0:
|
235
|
-
# Immediate shutdown on first error (existing tasks will finish)
|
236
|
-
self.executor.shutdown(wait=False)
|
237
283
|
for f in futures:
|
238
284
|
f.cancel()
|
239
285
|
return status
|
@@ -252,23 +298,138 @@ class MooncakeKVManager(BaseKVManager):
|
|
252
298
|
self.kv_args.aux_data_ptrs[0] + prefill_aux_index * aux_item_len
|
253
299
|
)
|
254
300
|
decode_aux_addr = dst_aux_ptrs[0] + dst_aux_index * aux_item_len
|
255
|
-
# TODO: mooncake transfer engine can do async transfer. Do async later
|
256
|
-
# Not sure about the amount of aux data, maybe transfer it by zmq is more effective
|
257
301
|
status = self.engine.transfer_sync(
|
258
302
|
mooncake_session_id, prefill_aux_addr, decode_aux_addr, aux_item_len
|
259
303
|
)
|
260
304
|
return status
|
261
305
|
|
262
|
-
def sync_status_to_decode_endpoint(
|
306
|
+
def sync_status_to_decode_endpoint(
|
307
|
+
self, remote: str, dst_port: int, room: int, status: int
|
308
|
+
):
|
263
309
|
if ":" in remote:
|
264
310
|
remote = remote.split(":")[0]
|
265
311
|
self._connect("tcp://" + remote + ":" + str(dst_port)).send_multipart(
|
266
312
|
[
|
267
313
|
str(room).encode("ascii"),
|
268
|
-
str(
|
314
|
+
str(status).encode("ascii"),
|
269
315
|
]
|
270
316
|
)
|
271
317
|
|
318
|
+
def transfer_worker(
|
319
|
+
self, queue: FastQueue, executor: concurrent.futures.ThreadPoolExecutor
|
320
|
+
):
|
321
|
+
while True:
|
322
|
+
try:
|
323
|
+
kv_chunk: TransferKVChunk = queue.get()
|
324
|
+
reqs_to_be_processed = (
|
325
|
+
self.transfer_infos[kv_chunk.room].values()
|
326
|
+
if kv_chunk.room in self.transfer_infos
|
327
|
+
else []
|
328
|
+
)
|
329
|
+
polls = []
|
330
|
+
dst_ranks_infos = []
|
331
|
+
for req in reqs_to_be_processed:
|
332
|
+
if not req.is_dummy:
|
333
|
+
# Early exit if the request has failed
|
334
|
+
with self.session_lock:
|
335
|
+
if req.mooncake_session_id in self.failed_sessions:
|
336
|
+
self.record_failure(
|
337
|
+
kv_chunk.room,
|
338
|
+
f"Decode instance could be dead, remote mooncake session {req.mooncake_session_id} is not alive",
|
339
|
+
)
|
340
|
+
self.update_status(kv_chunk.room, KVPoll.Failed)
|
341
|
+
self.sync_status_to_decode_endpoint(
|
342
|
+
req.endpoint,
|
343
|
+
req.dst_port,
|
344
|
+
req.room,
|
345
|
+
KVPoll.Failed,
|
346
|
+
)
|
347
|
+
break
|
348
|
+
|
349
|
+
chunked_dst_kv_indice = req.dst_kv_indices[kv_chunk.index_slice]
|
350
|
+
|
351
|
+
# NOTE: This is temporarily a workaround to deal with the case where the prefill_kv_indices
|
352
|
+
# is mismatched with the dst_kv_indices when page size > 1, this should never happen.
|
353
|
+
if len(chunked_dst_kv_indice) < len(
|
354
|
+
kv_chunk.prefill_kv_indices
|
355
|
+
):
|
356
|
+
kv_chunk.prefill_kv_indices = kv_chunk.prefill_kv_indices[
|
357
|
+
: len(chunked_dst_kv_indice)
|
358
|
+
]
|
359
|
+
logger.warning(
|
360
|
+
f"len(chunked_dst_kv_indice) = {len(chunked_dst_kv_indice)}, len(kv_chunk.prefill_kv_indices) = {len(kv_chunk.prefill_kv_indices)}"
|
361
|
+
)
|
362
|
+
|
363
|
+
ret = self.send_kvcache(
|
364
|
+
req.mooncake_session_id,
|
365
|
+
kv_chunk.prefill_kv_indices,
|
366
|
+
self.decode_kv_args_table[
|
367
|
+
req.mooncake_session_id
|
368
|
+
].dst_kv_ptrs,
|
369
|
+
chunked_dst_kv_indice,
|
370
|
+
executor,
|
371
|
+
)
|
372
|
+
if ret != 0:
|
373
|
+
with self.session_lock:
|
374
|
+
self.session_failures[req.mooncake_session_id] += 1
|
375
|
+
# Failures should never happen if the session is not dead, if the session fails once, mark it as failed
|
376
|
+
if self.session_failures[req.mooncake_session_id] >= 1:
|
377
|
+
self.failed_sessions.add(req.mooncake_session_id)
|
378
|
+
logger.error(
|
379
|
+
f"Session {req.mooncake_session_id} failed."
|
380
|
+
)
|
381
|
+
self.record_failure(
|
382
|
+
kv_chunk.room,
|
383
|
+
f"Failed to send kv chunk of {kv_chunk.room} to {req.endpoint}:{req.dst_port}",
|
384
|
+
)
|
385
|
+
self.update_status(kv_chunk.room, KVPoll.Failed)
|
386
|
+
self.sync_status_to_decode_endpoint(
|
387
|
+
req.endpoint, req.dst_port, req.room, KVPoll.Failed
|
388
|
+
)
|
389
|
+
break
|
390
|
+
|
391
|
+
if kv_chunk.is_last:
|
392
|
+
# Only the last chunk we need to send the aux data
|
393
|
+
ret = self.send_aux(
|
394
|
+
req.mooncake_session_id,
|
395
|
+
kv_chunk.prefill_aux_index,
|
396
|
+
self.decode_kv_args_table[
|
397
|
+
req.mooncake_session_id
|
398
|
+
].dst_aux_ptrs,
|
399
|
+
req.dst_aux_index,
|
400
|
+
)
|
401
|
+
polls.append(True if ret == 0 else False)
|
402
|
+
dst_ranks_infos.append(
|
403
|
+
(req.endpoint, req.dst_port, req.room)
|
404
|
+
)
|
405
|
+
|
406
|
+
# Only sync status when all the dst ranks have received the kvcache
|
407
|
+
if len(polls) == req.required_dst_info_num:
|
408
|
+
status = KVPoll.Success if all(polls) else KVPoll.Failed
|
409
|
+
self.update_status(req.room, status)
|
410
|
+
for endpoint, dst_port, room in dst_ranks_infos:
|
411
|
+
self.sync_status_to_decode_endpoint(
|
412
|
+
endpoint, dst_port, room, status
|
413
|
+
)
|
414
|
+
else:
|
415
|
+
# Dummy request means the decode instance is not used, so its status can be marked as success directly
|
416
|
+
# Dummy request does not need to sync status to decode endpoint
|
417
|
+
if kv_chunk.is_last and req.room in self.request_status:
|
418
|
+
self.update_status(req.room, KVPoll.Success)
|
419
|
+
|
420
|
+
if (
|
421
|
+
kv_chunk.room not in self.request_status
|
422
|
+
or self.check_status(kv_chunk.room) == KVPoll.Success
|
423
|
+
):
|
424
|
+
if kv_chunk.room in self.transfer_infos:
|
425
|
+
self.transfer_infos.pop(kv_chunk.room)
|
426
|
+
|
427
|
+
except Exception as e:
|
428
|
+
# NOTE(shangming): Remove this when we make sure the transfer thread is bug-free
|
429
|
+
raise RuntimeError(
|
430
|
+
f"Transfer thread failed because of {e}. Prefill instance with bootstrap_port={self.bootstrap_port} is dead."
|
431
|
+
)
|
432
|
+
|
272
433
|
def start_prefill_thread(self):
|
273
434
|
self.rank_port = get_free_port()
|
274
435
|
self.server_socket.bind(f"tcp://{get_local_ip_by_remote()}:{self.rank_port}")
|
@@ -284,6 +445,11 @@ class MooncakeKVManager(BaseKVManager):
|
|
284
445
|
self.decode_kv_args_table[mooncake_session_id] = (
|
285
446
|
KVArgsRegisterInfo.from_zmq(waiting_req_bytes)
|
286
447
|
)
|
448
|
+
with self.session_lock:
|
449
|
+
if mooncake_session_id in self.failed_sessions:
|
450
|
+
self.failed_sessions.remove(mooncake_session_id)
|
451
|
+
if mooncake_session_id in self.session_failures:
|
452
|
+
del self.session_failures[mooncake_session_id]
|
287
453
|
logger.debug(
|
288
454
|
f"Register KVArgs from {mooncake_session_id} successfully"
|
289
455
|
)
|
@@ -301,77 +467,7 @@ class MooncakeKVManager(BaseKVManager):
|
|
301
467
|
if len(self.transfer_infos[room]) == required_dst_info_num:
|
302
468
|
self.update_status(room, KVPoll.WaitingForInput)
|
303
469
|
|
304
|
-
def transfer_thread():
|
305
|
-
# TODO: Shall we use KVPoll.Transferring state?
|
306
|
-
while True:
|
307
|
-
try:
|
308
|
-
kv_chunk: TransferKVChunk = self.transfer_queue.get(timeout=0.01)
|
309
|
-
reqs_to_be_processed = self.transfer_infos[kv_chunk.room].values()
|
310
|
-
polls = []
|
311
|
-
dst_ranks_infos = []
|
312
|
-
for req in reqs_to_be_processed:
|
313
|
-
if not req.is_dummy:
|
314
|
-
chunked_dst_kv_indice = req.dst_kv_indices[
|
315
|
-
kv_chunk.index_slice
|
316
|
-
]
|
317
|
-
assert len(chunked_dst_kv_indice) == len(
|
318
|
-
kv_chunk.prefill_kv_indices
|
319
|
-
), f"len(chunked_dst_kv_indice) = {len(chunked_dst_kv_indice)}, len(kv_chunk.prefill_kv_indices) = {len(kv_chunk.prefill_kv_indices)}"
|
320
|
-
|
321
|
-
ret = self.send_kvcache(
|
322
|
-
req.mooncake_session_id,
|
323
|
-
kv_chunk.prefill_kv_indices,
|
324
|
-
self.decode_kv_args_table[
|
325
|
-
req.mooncake_session_id
|
326
|
-
].dst_kv_ptrs,
|
327
|
-
chunked_dst_kv_indice,
|
328
|
-
)
|
329
|
-
if ret != 0:
|
330
|
-
self.update_status(kv_chunk.room, KVPoll.Failed)
|
331
|
-
self.sync_status_to_decode_endpoint(
|
332
|
-
req.endpoint, req.dst_port, req.room
|
333
|
-
)
|
334
|
-
continue
|
335
|
-
|
336
|
-
if kv_chunk.is_last:
|
337
|
-
# Only the last chunk we need to send the aux data
|
338
|
-
ret = self.send_aux(
|
339
|
-
req.mooncake_session_id,
|
340
|
-
kv_chunk.prefill_aux_index,
|
341
|
-
self.decode_kv_args_table[
|
342
|
-
req.mooncake_session_id
|
343
|
-
].dst_aux_ptrs,
|
344
|
-
req.dst_aux_index,
|
345
|
-
)
|
346
|
-
polls.append(True if ret == 0 else False)
|
347
|
-
dst_ranks_infos.append(
|
348
|
-
(req.endpoint, req.dst_port, req.room)
|
349
|
-
)
|
350
|
-
|
351
|
-
# Only sync status when all the dst ranks have received the kvcache
|
352
|
-
if len(polls) == req.required_dst_info_num:
|
353
|
-
self.update_status(
|
354
|
-
req.room,
|
355
|
-
KVPoll.Success if all(polls) else KVPoll.Failed,
|
356
|
-
)
|
357
|
-
for endpoint, dst_port, room in dst_ranks_infos:
|
358
|
-
self.sync_status_to_decode_endpoint(
|
359
|
-
endpoint, dst_port, room
|
360
|
-
)
|
361
|
-
else:
|
362
|
-
# Dummy request means the decode instance is not used, so its status can be marked as success directly
|
363
|
-
# Dummy request does not need to sync status to decode endpoint
|
364
|
-
if kv_chunk.is_last:
|
365
|
-
self.update_status(req.room, KVPoll.Success)
|
366
|
-
|
367
|
-
if self.check_status(kv_chunk.room) == KVPoll.Success:
|
368
|
-
self.transfer_infos.pop(kv_chunk.room)
|
369
|
-
|
370
|
-
except queue.Empty:
|
371
|
-
continue
|
372
|
-
|
373
470
|
threading.Thread(target=bootstrap_thread).start()
|
374
|
-
threading.Thread(target=transfer_thread).start()
|
375
471
|
|
376
472
|
def start_decode_thread(self):
|
377
473
|
self.rank_port = get_free_port()
|
```diff
@@ -382,9 +478,69 @@ class MooncakeKVManager(BaseKVManager):
                 (bootstrap_room, status) = self.server_socket.recv_multipart()
                 status = int(status.decode("ascii"))
                 bootstrap_room = int(bootstrap_room.decode("ascii"))
+                if status == KVPoll.Failed:
+                    self.record_failure(
+                        bootstrap_room,
+                        f"Failed to get kvcache from prefill instance, it might be dead",
+                    )
                 self.update_status(bootstrap_room, status)

+        def heartbeat_checker():
+            while True:
+                time.sleep(self.heartbeat_interval)
+                with self.connection_lock:
+                    addresses = list(self.prefill_dp_size_table.keys())
+
+                for bootstrap_addr in addresses:
+                    session = None
+                    try:
+                        with self.session_pool_lock:
+                            session = self.session_pool[bootstrap_addr]
+                        response = session.get(
+                            f"http://{bootstrap_addr}/health",
+                            timeout=(2, 3),
+                            headers={"Connection": "keep-alive"},
+                        )
+                        if response.status_code == 200:
+                            self.heartbeat_failures[bootstrap_addr] = 0
+
+                            current_rooms = self.addr_to_rooms_tracker[
+                                bootstrap_addr
+                            ].copy()
+
+                            for bootstrap_room in current_rooms:
+                                # Remove KVPoll.Success requests from the tracker
+                                if bootstrap_room not in self.request_status:
+                                    self.addr_to_rooms_tracker[bootstrap_addr].discard(
+                                        bootstrap_room
+                                    )
+                        else:
+                            logger.info(
+                                f"Attempting to reconnect to {bootstrap_addr}..."
+                            )
+                            self.heartbeat_failures[bootstrap_addr] = (
+                                self.heartbeat_failures.get(bootstrap_addr, 0) + 1
+                            )
+                            with self.session_pool_lock:
+                                if bootstrap_addr in self.session_pool:
+                                    del self.session_pool[bootstrap_addr]
+                    except Exception:
+                        logger.info(f"Attempting to reconnect to {bootstrap_addr}...")
+                        self.heartbeat_failures[bootstrap_addr] = (
+                            self.heartbeat_failures.get(bootstrap_addr, 0) + 1
+                        )
+
+                    if (
+                        self.heartbeat_failures.get(bootstrap_addr, 0)
+                        >= self.max_failures
+                    ):
+                        self._handle_node_failure(bootstrap_addr)
+                        with self.session_pool_lock:
+                            if bootstrap_addr in self.session_pool:
+                                del self.session_pool[bootstrap_addr]
+
         threading.Thread(target=decode_thread).start()
+        threading.Thread(target=heartbeat_checker).start()

     def add_transfer_request(
         self,
```
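The new `heartbeat_checker` probes each known prefill bootstrap address over HTTP, resets its failure counter on a 200 response, and escalates to `_handle_node_failure` after `max_failures` consecutive misses. A stripped-down sketch of the same pattern (the interval, threshold, and `on_dead` callback are placeholders, not the values sglang uses; `addresses` is assumed to be a mutable set):

```python
import time

import requests


def heartbeat_loop(addresses: set, interval_s: float = 5.0,
                   max_failures: int = 3, on_dead=print) -> None:
    """Probe each address; after max_failures consecutive misses, call on_dead."""
    failures = {}
    while True:
        time.sleep(interval_s)
        for addr in list(addresses):
            try:
                r = requests.get(f"http://{addr}/health", timeout=(2, 3))
                if r.status_code == 200:
                    failures[addr] = 0  # healthy: reset the counter
                    continue
            except requests.RequestException:
                pass  # fall through to the failure path below
            failures[addr] = failures.get(addr, 0) + 1
            if failures[addr] >= max_failures:
                on_dead(addr)  # e.g. drop pooled sessions, fail open requests
                addresses.discard(addr)
```

The `(2, 3)` value is the standard `requests` (connect, read) timeout pair, matching the probe in the diff.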
```diff
@@ -397,7 +553,29 @@ class MooncakeKVManager(BaseKVManager):
         assert self.disaggregation_mode == DisaggregationMode.PREFILL
         assert not is_last or (is_last and aux_index is not None)

-        self.transfer_queue.put(
+        if (
+            bootstrap_room not in self.request_status
+            or self.check_status(bootstrap_room) == KVPoll.Failed
+        ):
+            logger.debug(
+                "Request with bootstrap_room=%s already failed", bootstrap_room
+            )
+            return
+
+        if bootstrap_room not in self.transfer_infos:
+            # This means that the current rank is a dummy rank for this request,
+            # and it has already been marked as success, so there is no need to
+            # add further chunks into the transfer queue.
+            return
+
+        # NOTE(shangming): sharding according to the dst_infos to make sure
+        # requests with the same dst_sessions will be added into the same
+        # queue, which enables early abort with failed sessions.
+        dst_infos = self.transfer_infos[bootstrap_room].keys()
+        session_port_sum = sum(int(session.split(":")[1]) for session in dst_infos)
+        shard_idx = session_port_sum % len(self.transfer_queues)
+
+        self.transfer_queues[shard_idx].put(
             TransferKVChunk(
                 room=bootstrap_room,
                 prefill_kv_indices=kv_indices,
```
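The NOTE(shangming) comment is the key design point here: the shard index depends only on the destination session ports, so every chunk of a request with the same set of destination sessions lands in the same queue, and a failed session can abort its queue early without touching chunks bound elsewhere. A small model of that mapping (the session strings are hypothetical; only the modulo trick is taken from the diff):

```python
def shard_for(dst_sessions: list, num_queues: int) -> int:
    """Map a request to a transfer queue by summing its destination ports."""
    port_sum = sum(int(s.split(":")[1]) for s in dst_sessions)
    return port_sum % num_queues


# Requests sharing the same destination sessions always share a queue.
a = shard_for(["10.0.0.1:7001", "10.0.0.2:7002"], num_queues=4)
b = shard_for(["10.0.0.1:7001", "10.0.0.2:7002"], num_queues=4)
assert a == b
```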
```diff
@@ -406,7 +584,6 @@ class MooncakeKVManager(BaseKVManager):
                 prefill_aux_index=aux_index,
             )
         )
-        self.update_status(bootstrap_room, KVPoll.WaitingForInput)

     def check_status(self, bootstrap_room: int):
         return self.request_status[bootstrap_room]
```
```diff
@@ -415,10 +592,17 @@ class MooncakeKVManager(BaseKVManager):
         if bootstrap_room not in self.request_status:
             self.request_status[bootstrap_room] = status
         else:
-            # NOTE:
-
-            self.request_status[bootstrap_room]
-
+            # NOTE: status is only allowed to be incremented unless it is KVPoll.Failed
+            if status == KVPoll.Failed:
+                self.request_status[bootstrap_room] = KVPoll.Failed
+            else:
+                self.request_status[bootstrap_room] = max(
+                    self.request_status[bootstrap_room], status
+                )
+
+    def record_failure(self, bootstrap_room: int, failure_reason: str):
+        with self.failure_lock:
+            self.failure_records[bootstrap_room] = failure_reason

     def get_session_id(self):
         return self.engine.get_session_id()
```
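`update_status` now enforces a one-way ratchet: a room's status may only move forward (via `max`), except that `KVPoll.Failed` overrides anything and sticks. A compact model of that rule, assuming the `KVPoll` states are ordered integers with `Failed` below the others, as in sglang:

```python
# Assumed ordering: Failed sits below all in-progress and terminal states.
FAILED = 0
BOOTSTRAPPING, WAITING_FOR_INPUT, TRANSFERRING, SUCCESS = 1, 2, 3, 4


def next_status(current: int, incoming: int) -> int:
    """Failure is sticky and immediate; otherwise status only increases."""
    if incoming == FAILED:
        return FAILED
    return max(current, incoming)


assert next_status(WAITING_FOR_INPUT, BOOTSTRAPPING) == WAITING_FOR_INPUT  # stale update ignored
assert next_status(TRANSFERRING, FAILED) == FAILED                         # failure always wins
```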
```diff
@@ -442,15 +626,52 @@ class MooncakeKVManager(BaseKVManager):
         }

         try:
-            response = requests.put(url, json=payload)
+            response = requests.put(url, json=payload, timeout=5)
             if response.status_code == 200:
                 logger.debug("Prefill successfully registered to bootstrap server.")
             else:
                 logger.error(
-                    f"Prefill
+                    f"Prefill instance failed to connect to bootstrap server: {response.status_code}, {response.text}"
                 )
         except Exception as e:
-            logger.error(
+            logger.error(
+                f"Prefill instance failed to register to bootstrap server: {e}"
+            )
+
+    def _handle_node_failure(self, failed_bootstrap_addr):
+        with self.connection_lock:
+            keys_to_remove = [
+                k for k in self.connection_pool if k.startswith(failed_bootstrap_addr)
+            ]
+            for k in keys_to_remove:
+                del self.connection_pool[k]
+            if failed_bootstrap_addr in self.prefill_tp_size_table:
+                del self.prefill_tp_size_table[failed_bootstrap_addr]
+            if failed_bootstrap_addr in self.prefill_dp_size_table:
+                del self.prefill_dp_size_table[failed_bootstrap_addr]
+
+            possible_affected_rooms = self.addr_to_rooms_tracker.get(
+                failed_bootstrap_addr, []
+            )
+            if failed_bootstrap_addr in self.addr_to_rooms_tracker:
+                del self.addr_to_rooms_tracker[failed_bootstrap_addr]
+
+        # Report the requests associated with the failed bootstrap addr and mark their status as KVPoll.Failed
+        affected_rooms = []
+        for room in possible_affected_rooms:
+            if (
+                room in self.request_status
+                and self.check_status(room) != KVPoll.Success
+            ):
+                self.record_failure(
+                    room,
+                    f"Losing connection with prefill instance (bootstrap_addr: {failed_bootstrap_addr})",
+                )
+                self.update_status(room, KVPoll.Failed)
+                affected_rooms.append(room)
+        logger.error(
+            f"Losing connection with prefill instance (bootstrap_addr: {failed_bootstrap_addr}), affected {len(affected_rooms)} requests"
+        )


 class MooncakeKVSender(BaseKVSender):
```
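Both the registration and info-fetch calls now pass an explicit `timeout` to `requests`; without one, `requests` can block indefinitely on a dead peer, which would wedge these background threads. For reference (the URL below is a hypothetical example), a scalar timeout caps the connect phase and each read separately, while a tuple sets the two independently:

```python
import requests

try:
    # Scalar: a 5 s cap applied to the connect and to each socket read.
    requests.put("http://bootstrap.example:8080/route", json={}, timeout=5)
    # Tuple: 2 s to connect, 3 s per read, as the heartbeat probe uses.
    requests.get("http://bootstrap.example:8080/health", timeout=(2, 3))
except requests.RequestException as e:
    print(f"bootstrap unreachable: {e}")
```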
```diff
@@ -463,18 +684,24 @@ class MooncakeKVSender(BaseKVSender):
         self.kv_mgr.update_status(bootstrap_room, KVPoll.Bootstrapping)
         self.aux_index = None
         self.bootstrap_server_url = bootstrap_addr
-        self.
+        self.conclude_state = None
+        self.init_time = None
+        # inner state
+        self.curr_idx = 0

     def init(self, num_kv_indices: int, aux_index: Optional[int] = None):
         self.num_kv_indices = num_kv_indices
         self.aux_index = aux_index
+        self.init_time = time.time()

     def send(
         self,
         kv_indices: npt.NDArray[np.int64],
-        index_slice: slice,
-        is_last: bool,
     ):
+        index_slice = slice(self.curr_idx, self.curr_idx + len(kv_indices))
+        self.curr_idx += len(kv_indices)
+        is_last = self.curr_idx == self.num_kv_indices
+
         if not is_last:
             self.kv_mgr.add_transfer_request(
                 self.bootstrap_room, kv_indices, index_slice, False
```
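`send()` no longer takes `index_slice` and `is_last` from the caller; the sender derives them from a running `curr_idx` and marks the final chunk when the cumulative count reaches `num_kv_indices`. A self-contained sketch of that bookkeeping (plain lists stand in for the numpy index arrays; the class name is illustrative):

```python
class ChunkCursor:
    """Track how much of a fixed-size transfer has been enqueued so far."""

    def __init__(self, num_kv_indices: int):
        self.num_kv_indices = num_kv_indices
        self.curr_idx = 0

    def next_chunk(self, kv_indices: list):
        """Return (slice into the full index array, whether this chunk is last)."""
        index_slice = slice(self.curr_idx, self.curr_idx + len(kv_indices))
        self.curr_idx += len(kv_indices)
        is_last = self.curr_idx == self.num_kv_indices
        return index_slice, is_last


cursor = ChunkCursor(num_kv_indices=5)
assert cursor.next_chunk([0, 1, 2]) == (slice(0, 3), False)
assert cursor.next_chunk([3, 4]) == (slice(3, 5), True)  # final chunk detected
```

This moves chunk bookkeeping out of the caller and into the sender, so callers of `send()` only hand over the next batch of indices.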
```diff
@@ -489,10 +716,42 @@ class MooncakeKVSender(BaseKVSender):
             )

     def poll(self) -> KVPoll:
-
+        if self.conclude_state is None:
+            status = self.kv_mgr.check_status(self.bootstrap_room)
+            if status in (KVPoll.Success, KVPoll.Failed):
+                self.conclude_state = status
+            elif status == KVPoll.Bootstrapping:
+                if self.init_time is not None:
+                    now = time.time()
+                    elapsed = now - self.init_time
+                    if elapsed >= self.kv_mgr.bootstrap_time_out:
+                        self.kv_mgr.record_failure(
+                            self.bootstrap_room,
+                            f"Request {self.bootstrap_room} timed out after {elapsed:.1f}s in KVPoll.Bootstrapping",
+                        )
+                        self.conclude_state = KVPoll.Failed
+                        return KVPoll.Failed
+
+            return status
+        else:
+            return self.conclude_state
+
+    def clear(self) -> None:
+        if self.bootstrap_room in self.kv_mgr.request_status:
+            self.kv_mgr.request_status.pop(self.bootstrap_room)

     def failure_exception(self):
-
+        self.clear()
+
+        # Explicitly set the status to failure since this request has failed in another rank
+        if self.conclude_state is None:
+            self.conclude_state = KVPoll.Failed
+
+        with self.kv_mgr.failure_lock:
+            failure_reason = self.kv_mgr.failure_records.pop(
+                self.bootstrap_room, "Failed due to an unknown reason from another rank"
+            )
+        raise KVTransferError(self.bootstrap_room, failure_reason)
```
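Timeout detection lives in `poll()` rather than in a separate timer thread: each poll while still `Bootstrapping` compares the elapsed time since `init()` against the manager's `bootstrap_time_out` and concludes the request as failed once exceeded. A minimal sketch of polling-side timeout, with string states and a hypothetical `TIMEOUT_S` standing in for `bootstrap_time_out`:

```python
import time

TIMEOUT_S = 30.0  # stand-in for kv_mgr.bootstrap_time_out


class BootstrapPoller:
    def __init__(self):
        self.init_time = time.time()
        self.concluded = None  # latched terminal state

    def poll(self, status: str) -> str:
        if self.concluded is not None:
            return self.concluded  # terminal states are sticky
        if status == "bootstrapping" and time.time() - self.init_time >= TIMEOUT_S:
            self.concluded = "failed"  # conclude once, report forever after
            return "failed"
        if status in ("success", "failed"):
            self.concluded = status
        return status
```

Latching the concluded state keeps later polls cheap and stable even after the manager-side status entry is cleaned up by `clear()`.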
```diff
@@ -506,22 +765,31 @@ class MooncakeKVReceiver(BaseKVReceiver):
         mgr: MooncakeKVManager,
         bootstrap_addr: str,
         bootstrap_room: Optional[int] = None,
+        data_parallel_rank: Optional[int] = None,
     ):
         self.bootstrap_room = bootstrap_room
         self.bootstrap_addr = bootstrap_addr
         self.kv_mgr = mgr
         self.session_id = self.kv_mgr.get_session_id()
-        self.kv_mgr.update_status(bootstrap_room, KVPoll.Bootstrapping)
+        self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Bootstrapping)
+        self.conclude_state = None
+        self.data_parallel_rank = data_parallel_rank

         if self.bootstrap_addr not in self.kv_mgr.prefill_dp_size_table:
             self.prefill_tp_size, self.prefill_dp_size = (
-                self.
+                self._get_prefill_parallel_info_from_server()
             )
             if self.prefill_tp_size is None or self.prefill_dp_size is None:
-
-
+                self.kv_mgr.record_failure(
+                    self.bootstrap_room,
+                    f"Could not fetch prefill parallel info from bootstrap_addr: {self.bootstrap_addr}",
                 )
+                self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Failed)
+                return
             else:
+                logger.debug(
+                    f"Fetch prefill parallel info from [{self.bootstrap_addr}]: DP size:{self.prefill_dp_size}, TP size:{self.prefill_tp_size}"
+                )
                 self.kv_mgr.prefill_tp_size_table[self.bootstrap_addr] = (
                     self.prefill_tp_size
                 )
```
```diff
@@ -579,7 +847,11 @@ class MooncakeKVReceiver(BaseKVReceiver):
             self.target_tp_rank = self.target_tp_ranks[0]
             self.required_dst_info_num = 1

-        self.
+        if self.data_parallel_rank is not None:
+            logger.debug(f"Targeting DP rank: {self.data_parallel_rank}")
+            self.target_dp_group = self.data_parallel_rank
+        else:
+            self.target_dp_group = bootstrap_room % self.prefill_dp_size

         # NOTE: key distinguished by bootstrap_addr, target_dp_group, and target_tp_rank
         bootstrap_key = (
```
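A receiver can now pin a specific prefill data-parallel group via the new `data_parallel_rank` argument; otherwise the group is still picked by taking the room id modulo `prefill_dp_size`. The selection rule in isolation (function name is illustrative):

```python
from typing import Optional


def pick_dp_group(bootstrap_room: int, prefill_dp_size: int,
                  data_parallel_rank: Optional[int] = None) -> int:
    """An explicit DP rank wins; otherwise spread rooms across groups by modulo."""
    if data_parallel_rank is not None:
        return data_parallel_rank
    return bootstrap_room % prefill_dp_size


assert pick_dp_group(10, 4) == 2                          # hashed placement
assert pick_dp_group(10, 4, data_parallel_rank=1) == 1    # pinned placement
```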
```diff
@@ -599,32 +871,35 @@ class MooncakeKVReceiver(BaseKVReceiver):
                         target_tp_rank == self.target_tp_rank
                         or self.target_tp_rank is None
                     )
+                    logger.debug(
+                        f"Fetched bootstrap info: {bootstrap_info} for DP {self.target_dp_group} TP {target_tp_rank}"
+                    )
                     bootstrap_infos.append(bootstrap_info)
                 else:
-
-
+                    self.kv_mgr.record_failure(
+                        self.bootstrap_room,
+                        f"Could not fetch bootstrap info for engine rank: {self.kv_mgr.kv_args.engine_rank} and target_dp_group: {self.target_dp_group}",
                     )
+                    self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Failed)
+                    return
+
             self.bootstrap_infos = bootstrap_infos
+            self.kv_mgr.connection_pool[bootstrap_key] = self.bootstrap_infos

-
-
-                    f"Could not fetch bootstrap info for engine rank: {self.kv_mgr.kv_args.engine_rank}"
-                )
-            else:
-                self.kv_mgr.connection_pool[bootstrap_key] = self.bootstrap_infos
-                # Register kv_args only once to prefill KVManager according to the info fetched from the bootstrap server
-                self._register_kv_args()
+            # Register kv_args only once to prefill KVManager according to the info fetched from the bootstrap server
+            self._register_kv_args()
         else:
             self.bootstrap_infos = self.kv_mgr.connection_pool[bootstrap_key]

         assert len(self.bootstrap_infos) > 0
-        self.kv_mgr.
+        self.kv_mgr.addr_to_rooms_tracker[self.bootstrap_addr].add(self.bootstrap_room)
+        self.kv_mgr.update_status(self.bootstrap_room, KVPoll.WaitingForInput)

     def _get_bootstrap_info_from_server(self, engine_rank, target_dp_group):
         """Fetch the bootstrap info from the bootstrap server."""
         try:
             url = f"http://{self.bootstrap_addr}/route?engine_rank={engine_rank}&target_dp_group={target_dp_group}"
-            response = requests.get(url)
+            response = requests.get(url, timeout=5)
             if response.status_code == 200:
                 bootstrap_info = response.json()
                 return bootstrap_info
```
```diff
@@ -637,7 +912,7 @@ class MooncakeKVReceiver(BaseKVReceiver):
             logger.error(f"Error fetching prefill info from bootstrap: {e}")
             return None

-    def
+    def _get_prefill_parallel_info_from_server(self) -> Tuple[int, int]:
         """Fetch the prefill parallel info from the bootstrap server."""
         try:
             url = f"http://{self.bootstrap_addr}/route?engine_rank={-1}&target_dp_group={-1}"
```
```diff
@@ -651,10 +926,10 @@ class MooncakeKVReceiver(BaseKVReceiver):
                 logger.error(
                     f"Failed to get prefill parallel info: {response.status_code}, {response.text}"
                 )
-                return None
+                return None, None
         except Exception as e:
             logger.error(f"Error fetching prefill parallel info from bootstrap: {e}")
-            return None
+            return None, None

     def _register_kv_args(self):
         for bootstrap_info in self.bootstrap_infos:
```
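The `return None` to `return None, None` change matters because the caller unpacks the result into two names (`self.prefill_tp_size, self.prefill_dp_size = ...`): a bare `None` would raise `TypeError` at the unpack site instead of flowing into the `is None` checks. A toy illustration:

```python
def fetch_parallel_info(ok: bool):
    # Returning a pair keeps an unpacking caller on the graceful error path.
    return (8, 2) if ok else (None, None)


tp, dp = fetch_parallel_info(ok=False)
if tp is None or dp is None:
    print("could not fetch prefill parallel info")  # graceful failure

# With `return None` instead, the unpack itself would blow up:
#   tp, dp = None
#   TypeError: cannot unpack non-iterable NoneType object
```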
```diff
@@ -696,9 +971,6 @@ class MooncakeKVReceiver(BaseKVReceiver):
             self.prefill_server_url = (
                 f"{bootstrap_info['rank_ip']}:{bootstrap_info['rank_port']}"
             )
-            logger.debug(
-                f"Fetched bootstrap info: {bootstrap_info} for engine rank: {self.kv_mgr.kv_args.engine_rank}"
-            )
             is_dummy = bootstrap_info["is_dummy"]

             sock, lock = self._connect("tcp://" + self.prefill_server_url)
```
```diff
@@ -716,10 +988,31 @@ class MooncakeKVReceiver(BaseKVReceiver):
         )

     def poll(self) -> KVPoll:
-
+        if self.conclude_state is None:
+            status = self.kv_mgr.check_status(self.bootstrap_room)
+            if status in (KVPoll.Success, KVPoll.Failed):
+                self.conclude_state = status
+
+            return status
+        else:
+            return self.conclude_state
+
+    def clear(self) -> None:
+        if self.bootstrap_room in self.kv_mgr.request_status:
+            self.kv_mgr.request_status.pop(self.bootstrap_room)

     def failure_exception(self):
-
+        self.clear()
+
+        # Explicitly set the status to failure since this request has failed in another rank
+        if self.conclude_state is None:
+            self.conclude_state = KVPoll.Failed
+
+        with self.kv_mgr.failure_lock:
+            failure_reason = self.kv_mgr.failure_records.pop(
+                self.bootstrap_room, "Failed due to an unknown reason from another rank"
+            )
+        raise KVTransferError(self.bootstrap_room, failure_reason)


 class MooncakeKVBootstrapServer(BaseKVBootstrapServer):
```
```diff
@@ -743,6 +1036,10 @@ class MooncakeKVBootstrapServer(BaseKVBootstrapServer):

     def _setup_routes(self):
         self.app.router.add_route("*", "/route", self._handle_route)
+        self.app.router.add_get("/health", self._handle_health_check)
+
+    async def _handle_health_check(self, request):
+        return web.Response(text="OK", status=200)

     async def _handle_route(self, request: web.Request):
         method = request.method
```
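The bootstrap server gains the `/health` route that the decode-side `heartbeat_checker` above probes. A self-contained aiohttp server exposing the same kind of liveness endpoint (the app and port here are arbitrary, not sglang's):

```python
from aiohttp import web


async def handle_health_check(request: web.Request) -> web.Response:
    # Liveness only: a 200 means the event loop is up and serving requests.
    return web.Response(text="OK", status=200)


app = web.Application()
app.router.add_get("/health", handle_health_check)

if __name__ == "__main__":
    web.run_app(app, port=8080)  # probe with: curl http://localhost:8080/health
```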
```diff
@@ -771,14 +1068,14 @@ class MooncakeKVBootstrapServer(BaseKVBootstrapServer):
         self.dp_size = dp_size

         tp_size_per_dp_rank = tp_size // dp_size
-        if self.tp_size_per_dp_rank
+        if self.tp_size_per_dp_rank is None:
             self.tp_size_per_dp_rank = tp_size_per_dp_rank

-        # Add lock to make sure thread-safe
         if role == "Prefill":
             dp_group = engine_rank // tp_size_per_dp_rank
             tp_rank_in_dp_group = engine_rank % tp_size_per_dp_rank

+            # Add lock to make sure thread-safe
             async with self.lock:
                 if dp_group not in self.prefill_port_table:
                     self.prefill_port_table[dp_group] = {}
```
```diff
@@ -788,7 +1085,7 @@ class MooncakeKVBootstrapServer(BaseKVBootstrapServer):
                     "rank_port": rank_port,
                 }
                 logger.debug(
-                    f"Register
+                    f"Register prefill bootstrap: {engine_rank} with rank_ip: {rank_ip} and rank_port: {rank_port}"
                 )

         return web.Response(text="OK", status=200)
```
```diff
@@ -824,7 +1121,11 @@ class MooncakeKVBootstrapServer(BaseKVBootstrapServer):
         self._loop = asyncio.new_event_loop()
         asyncio.set_event_loop(self._loop)

-        self._runner = web.AppRunner(self.app)
+        access_log = None
+        if logging.getLogger(__name__).getEffectiveLevel() <= logging.DEBUG:
+            access_log = self.app.logger
+
+        self._runner = web.AppRunner(self.app, access_log=access_log)
         self._loop.run_until_complete(self._runner.setup())

         site = web.TCPSite(self._runner, port=self.port)
```