sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Files changed (359)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +7 -0
  3. sglang/bench_offline_throughput.py +10 -4
  4. sglang/bench_one_batch_server.py +67 -11
  5. sglang/bench_serving.py +86 -75
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/lang/interpreter.py +40 -1
  8. sglang/lang/ir.py +27 -0
  9. sglang/math_utils.py +8 -0
  10. sglang/profiler.py +167 -0
  11. sglang/srt/_custom_ops.py +34 -0
  12. sglang/srt/configs/internvl.py +8 -12
  13. sglang/srt/configs/model_config.py +33 -1
  14. sglang/srt/constrained/base_grammar_backend.py +5 -2
  15. sglang/srt/constrained/llguidance_backend.py +9 -8
  16. sglang/srt/constrained/outlines_backend.py +5 -4
  17. sglang/srt/constrained/xgrammar_backend.py +18 -18
  18. sglang/srt/conversation.py +52 -8
  19. sglang/srt/custom_op.py +38 -3
  20. sglang/srt/debug_utils.py +74 -0
  21. sglang/srt/disaggregation/base/__init__.py +1 -1
  22. sglang/srt/disaggregation/base/conn.py +25 -11
  23. sglang/srt/disaggregation/common/__init__.py +5 -0
  24. sglang/srt/disaggregation/common/conn.py +407 -0
  25. sglang/srt/disaggregation/common/utils.py +42 -0
  26. sglang/srt/disaggregation/decode.py +261 -52
  27. sglang/srt/disaggregation/fake/__init__.py +1 -1
  28. sglang/srt/disaggregation/fake/conn.py +16 -9
  29. sglang/srt/disaggregation/kv_events.py +60 -5
  30. sglang/srt/disaggregation/launch_lb.py +140 -0
  31. sglang/srt/disaggregation/mini_lb.py +29 -48
  32. sglang/srt/disaggregation/mooncake/__init__.py +1 -1
  33. sglang/srt/disaggregation/mooncake/conn.py +446 -149
  34. sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
  35. sglang/srt/disaggregation/nixl/__init__.py +6 -1
  36. sglang/srt/disaggregation/nixl/conn.py +134 -437
  37. sglang/srt/disaggregation/prefill.py +130 -43
  38. sglang/srt/disaggregation/utils.py +127 -86
  39. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  40. sglang/srt/distributed/parallel_state.py +52 -5
  41. sglang/srt/entrypoints/EngineBase.py +6 -0
  42. sglang/srt/entrypoints/engine.py +116 -5
  43. sglang/srt/entrypoints/http_server.py +28 -4
  44. sglang/srt/eplb_simulator/__init__.py +1 -0
  45. sglang/srt/eplb_simulator/reader.py +51 -0
  46. sglang/srt/function_call/base_format_detector.py +138 -86
  47. sglang/srt/function_call/deepseekv3_detector.py +54 -6
  48. sglang/srt/function_call/ebnf_composer.py +33 -19
  49. sglang/srt/function_call/function_call_parser.py +27 -0
  50. sglang/srt/function_call/llama32_detector.py +33 -14
  51. sglang/srt/function_call/mistral_detector.py +73 -26
  52. sglang/srt/function_call/pythonic_detector.py +86 -20
  53. sglang/srt/function_call/qwen25_detector.py +64 -10
  54. sglang/srt/function_call/utils.py +17 -0
  55. sglang/srt/hf_transformers_utils.py +4 -0
  56. sglang/srt/layers/activation.py +19 -0
  57. sglang/srt/layers/attention/aiter_backend.py +503 -125
  58. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  59. sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
  60. sglang/srt/layers/attention/flashattention_backend.py +137 -63
  61. sglang/srt/layers/attention/flashinfer_backend.py +46 -3
  62. sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
  63. sglang/srt/layers/attention/flashmla_backend.py +2 -10
  64. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  65. sglang/srt/layers/attention/tbo_backend.py +232 -0
  66. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  67. sglang/srt/layers/attention/triton_backend.py +304 -65
  68. sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
  69. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  70. sglang/srt/layers/attention/vision.py +51 -24
  71. sglang/srt/layers/communicator.py +281 -197
  72. sglang/srt/layers/dp_attention.py +6 -5
  73. sglang/srt/layers/layernorm.py +30 -19
  74. sglang/srt/layers/linear.py +0 -4
  75. sglang/srt/layers/logits_processor.py +0 -12
  76. sglang/srt/layers/moe/cutlass_moe.py +170 -7
  77. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  78. sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
  79. sglang/srt/layers/moe/ep_moe/layer.py +136 -72
  80. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
  81. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  82. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  83. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  84. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  85. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  86. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  87. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  88. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  89. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  90. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
  91. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
  92. sglang/srt/layers/moe/topk.py +60 -26
  93. sglang/srt/layers/multimodal.py +3 -3
  94. sglang/srt/layers/pooler.py +56 -0
  95. sglang/srt/layers/quantization/__init__.py +3 -2
  96. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  97. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  98. sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
  99. sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
  100. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
  101. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
  102. sglang/srt/layers/quantization/fp8.py +28 -23
  103. sglang/srt/layers/quantization/fp8_kernel.py +156 -75
  104. sglang/srt/layers/quantization/fp8_utils.py +250 -69
  105. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  106. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  107. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  108. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  109. sglang/srt/layers/radix_attention.py +2 -3
  110. sglang/srt/layers/rotary_embedding.py +6 -12
  111. sglang/srt/layers/sampler.py +80 -79
  112. sglang/srt/layers/utils.py +6 -0
  113. sglang/srt/lora/layers.py +12 -15
  114. sglang/srt/lora/lora.py +49 -5
  115. sglang/srt/lora/lora_manager.py +98 -39
  116. sglang/srt/lora/mem_pool.py +28 -21
  117. sglang/srt/lora/utils.py +17 -13
  118. sglang/srt/managers/cache_controller.py +2 -1
  119. sglang/srt/managers/data_parallel_controller.py +13 -5
  120. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  121. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  122. sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
  123. sglang/srt/managers/eplb_manager.py +55 -14
  124. sglang/srt/managers/expert_distribution.py +220 -46
  125. sglang/srt/managers/expert_location.py +110 -56
  126. sglang/srt/managers/expert_location_dispatch.py +23 -6
  127. sglang/srt/managers/io_struct.py +43 -8
  128. sglang/srt/managers/mm_utils.py +88 -38
  129. sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
  130. sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
  131. sglang/srt/managers/multimodal_processors/internvl.py +4 -0
  132. sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
  133. sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
  134. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  135. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
  136. sglang/srt/managers/multimodal_processors/vila.py +85 -0
  137. sglang/srt/managers/schedule_batch.py +173 -38
  138. sglang/srt/managers/scheduler.py +376 -127
  139. sglang/srt/managers/tokenizer_manager.py +163 -19
  140. sglang/srt/managers/utils.py +0 -4
  141. sglang/srt/mem_cache/chunk_cache.py +1 -0
  142. sglang/srt/mem_cache/hiradix_cache.py +4 -2
  143. sglang/srt/mem_cache/memory_pool.py +111 -407
  144. sglang/srt/mem_cache/memory_pool_host.py +380 -0
  145. sglang/srt/mem_cache/radix_cache.py +36 -12
  146. sglang/srt/metrics/collector.py +9 -0
  147. sglang/srt/model_executor/cuda_graph_runner.py +191 -113
  148. sglang/srt/model_executor/expert_location_updater.py +157 -22
  149. sglang/srt/model_executor/forward_batch_info.py +52 -22
  150. sglang/srt/model_executor/model_runner.py +102 -62
  151. sglang/srt/model_loader/loader.py +8 -1
  152. sglang/srt/model_loader/utils.py +67 -1
  153. sglang/srt/models/bert.py +113 -13
  154. sglang/srt/models/deepseek_nextn.py +1 -1
  155. sglang/srt/models/deepseek_v2.py +623 -290
  156. sglang/srt/models/gemma3_causal.py +7 -0
  157. sglang/srt/models/gemma3_mm.py +19 -14
  158. sglang/srt/models/idefics2.py +342 -0
  159. sglang/srt/models/internvl.py +46 -102
  160. sglang/srt/models/kimi_vl.py +4 -4
  161. sglang/srt/models/llama.py +1 -1
  162. sglang/srt/models/minicpmo.py +2 -5
  163. sglang/srt/models/minicpmv.py +3 -295
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +38 -9
  166. sglang/srt/models/qwen2_5_vl.py +3 -9
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +58 -191
  169. sglang/srt/models/qwen2_vl.py +3 -9
  170. sglang/srt/models/qwen3.py +41 -10
  171. sglang/srt/models/qwen3_moe.py +230 -191
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/roberta.py +117 -9
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/models/vila.py +305 -0
  176. sglang/srt/openai_api/adapter.py +248 -28
  177. sglang/srt/openai_api/protocol.py +68 -3
  178. sglang/srt/openai_api/utils.py +172 -0
  179. sglang/srt/operations.py +37 -2
  180. sglang/srt/operations_strategy.py +200 -24
  181. sglang/srt/sampling/sampling_batch_info.py +37 -1
  182. sglang/srt/sampling/sampling_params.py +4 -1
  183. sglang/srt/server_args.py +381 -209
  184. sglang/srt/speculative/build_eagle_tree.py +9 -9
  185. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
  186. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
  187. sglang/srt/speculative/eagle_utils.py +440 -200
  188. sglang/srt/speculative/eagle_worker.py +234 -63
  189. sglang/srt/two_batch_overlap.py +637 -0
  190. sglang/srt/utils.py +187 -7
  191. sglang/test/attention/test_prefix_chunk_info.py +2 -0
  192. sglang/test/runners.py +54 -10
  193. sglang/test/send_one.py +4 -0
  194. sglang/test/test_block_fp8.py +1 -0
  195. sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
  196. sglang/test/test_block_fp8_ep.py +1 -0
  197. sglang/test/test_cutlass_moe.py +3 -3
  198. sglang/test/test_fp4_moe.py +248 -0
  199. sglang/test/test_utils.py +82 -7
  200. sglang/utils.py +9 -0
  201. sglang/version.py +1 -1
  202. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
  203. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
  204. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  356. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  357. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  358. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
  359. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/srt/disaggregation/prefill.py

@@ -25,12 +25,13 @@ from collections import deque
 from http import HTTPStatus
 from typing import TYPE_CHECKING, List, Optional
 
+import numpy as np
 import torch
 
-from sglang.srt.disaggregation.base import BaseKVManager, KVArgs, KVPoll
+from sglang.srt.disaggregation.base import BaseKVManager, KVPoll
 from sglang.srt.disaggregation.utils import (
+    FAKE_BOOTSTRAP_HOST,
     DisaggregationMode,
-    FakeBootstrapHost,
     KVClassType,
     MetadataBuffers,
     ReqToMetadataIdxAllocator,
@@ -51,7 +52,6 @@ if TYPE_CHECKING:
     from sglang.srt.managers.scheduler import GenerationBatchResult, Scheduler
     from sglang.srt.mem_cache.memory_pool import KVCache
 
-
 logger = logging.getLogger(__name__)
 
 
@@ -68,35 +68,45 @@ class PrefillBootstrapQueue:
         metadata_buffers: MetadataBuffers,
         tp_rank: int,
         tp_size: int,
+        gpu_id: int,
         bootstrap_port: int,
         gloo_group: ProcessGroup,
-        transfer_backend: TransferBackend,
+        max_total_num_tokens: int,
+        decode_tp_size: int,
+        decode_dp_size: int,
         scheduler: Scheduler,
+        pp_rank: int,
+        pp_size: int,
+        transfer_backend: TransferBackend,
     ):
         self.token_to_kv_pool = token_to_kv_pool
         self.draft_token_to_kv_pool = draft_token_to_kv_pool
-
         self.is_mla_backend = is_mla_backend(token_to_kv_pool)
-
         self.metadata_buffers = metadata_buffers
         self.req_to_metadata_buffer_idx_allocator = req_to_metadata_buffer_idx_allocator
         self.tp_rank = tp_rank
         self.tp_size = tp_size
-        self.transfer_backend = transfer_backend
-        self.scheduler = scheduler
-        self.kv_manager = self._init_kv_manager()
+        self.decode_tp_size = decode_tp_size
+        self.decode_dp_size = decode_dp_size
+        self.pp_rank = pp_rank
+        self.pp_size = pp_size
+        self.gpu_id = gpu_id
+        self.bootstrap_port = bootstrap_port
         self.queue: List[Req] = []
+        self.pp_rank = pp_rank
+        self.pp_size = pp_size
         self.gloo_group = gloo_group
-        self.bootstrap_port = bootstrap_port
-
-    def store_prefill_results(self, idx: int, token_id: int):
-        assert token_id >= 0, f"token_id: {token_id} is negative"
-        output_id_buffer = self.metadata_buffers[0]
-        output_id_buffer[idx] = token_id
+        self.max_total_num_tokens = max_total_num_tokens
+        self.scheduler = scheduler
+        self.transfer_backend = transfer_backend
+        self.kv_manager = self._init_kv_manager()
 
     def _init_kv_manager(self) -> BaseKVManager:
-        kv_args = KVArgs()
+        kv_args_class = get_kv_class(self.transfer_backend, KVClassType.KVARGS)
+        kv_args = kv_args_class()
         kv_args.engine_rank = self.tp_rank
+        kv_args.decode_tp_size = self.decode_tp_size // self.decode_dp_size
+        kv_args.prefill_pp_size = self.pp_size
         kv_data_ptrs, kv_data_lens, kv_item_lens = (
             self.token_to_kv_pool.get_contiguous_buf_infos()
         )
@@ -115,12 +125,12 @@ class PrefillBootstrapQueue:
         kv_args.kv_data_lens = kv_data_lens
         kv_args.kv_item_lens = kv_item_lens
 
-        # Define req -> input ids buffer
         kv_args.aux_data_ptrs, kv_args.aux_data_lens, kv_args.aux_item_lens = (
             self.metadata_buffers.get_buf_infos()
         )
         kv_args.ib_device = self.scheduler.server_args.disaggregation_ib_device
         kv_args.gpu_id = self.scheduler.gpu_id
+
        kv_manager_class = get_kv_class(self.transfer_backend, KVClassType.MANAGER)
         kv_manager = kv_manager_class(
             kv_args,
@@ -130,23 +140,39 @@ class PrefillBootstrapQueue:
         )
         return kv_manager
 
-    def add(self, req: Req) -> None:
-        if req.bootstrap_host == FakeBootstrapHost:
-            # Fake transfer for warmup reqs
+    def add(self, req: Req, num_kv_heads: int) -> None:
+        if self._check_if_req_exceed_kv_capacity(req):
+            return
+
+        if req.bootstrap_host == FAKE_BOOTSTRAP_HOST:
             kv_sender_class = get_kv_class(TransferBackend.FAKE, KVClassType.SENDER)
         else:
             kv_sender_class = get_kv_class(self.transfer_backend, KVClassType.SENDER)
+
+        dest_tp_ranks = [self.tp_rank]
+
         req.disagg_kv_sender = kv_sender_class(
             mgr=self.kv_manager,
             bootstrap_addr=f"{req.bootstrap_host}:{self.bootstrap_port}",
             bootstrap_room=req.bootstrap_room,
+            dest_tp_ranks=dest_tp_ranks,
+            pp_rank=self.pp_rank,
         )
         self._process_req(req)
         self.queue.append(req)
 
-    def extend(self, reqs: List[Req]) -> None:
+    def extend(self, reqs: List[Req], num_kv_heads: int) -> None:
         for req in reqs:
-            self.add(req)
+            self.add(req, num_kv_heads)
+
+    def _check_if_req_exceed_kv_capacity(self, req: Req) -> bool:
+        if len(req.origin_input_ids) > self.max_total_num_tokens:
+            message = f"Request {req.rid} exceeds the maximum number of tokens: {len(req.origin_input_ids)} > {self.max_total_num_tokens}"
+            logger.error(message)
+            prepare_abort(req, message)
+            self.scheduler.stream_output([req], req.return_logprob)
+            return True
+        return False
 
     def _process_req(self, req: Req) -> None:
         """
@@ -154,19 +180,40 @@ class PrefillBootstrapQueue:
         """
         req.sampling_params.max_new_tokens = 1
 
-    def pop_bootstrapped(self) -> List[Req]:
-        """pop the reqs which has finished bootstrapping"""
+    def pop_bootstrapped(
+        self,
+        return_failed_reqs: bool = False,
+        rids_to_check: Optional[List[str]] = None,
+    ) -> List[Req]:
+        """
+        pop the reqs which has finished bootstrapping
+
+        return_failed_reqs: For PP, on rank 0, also return the failed reqs to notify the next rank
+        rids_to_check: For PP, on rank > 0, check the rids from the previous rank has consensus with the current rank.
+        """
+
         bootstrapped_reqs = []
+        failed_reqs = []
         indices_to_remove = set()
 
         if len(self.queue) == 0:
-            return []
+            if return_failed_reqs is False:
+                return []
+            else:
+                return [], []
 
         polls = poll_and_all_reduce(
             [req.disagg_kv_sender for req in self.queue], self.gloo_group
         )
-
         for i, (req, poll) in enumerate(zip(self.queue, polls)):
+
+            if rids_to_check is not None:
+                # if req not in reqs_info_to_check, skip
+                if req.rid not in rids_to_check:
+                    continue
+                # Either waiting for input or failed
+                assert poll == KVPoll.WaitingForInput or poll == KVPoll.Failed
+
             if poll == KVPoll.Bootstrapping:
                 continue
             elif poll == KVPoll.Failed:
@@ -181,9 +228,10 @@ class PrefillBootstrapQueue:
                 )
                 self.scheduler.stream_output([req], req.return_logprob)
                 indices_to_remove.add(i)
+                failed_reqs.append(req)
                 continue
 
-            # KV.WaitingForInput
+            # KV.WaitingForInput - init here
             num_kv_indices = len(req.origin_input_ids)
             if self.req_to_metadata_buffer_idx_allocator.available_size() == 0:
                 break
@@ -192,9 +240,9 @@ class PrefillBootstrapQueue:
                 self.req_to_metadata_buffer_idx_allocator.alloc()
             )
             assert req.metadata_buffer_index is not None
+
             num_pages = kv_to_page_num(num_kv_indices, self.token_to_kv_pool.page_size)
             req.disagg_kv_sender.init(num_pages, req.metadata_buffer_index)
-
             bootstrapped_reqs.append(req)
             indices_to_remove.add(i)
 
@@ -202,7 +250,10 @@ class PrefillBootstrapQueue:
             entry for i, entry in enumerate(self.queue) if i not in indices_to_remove
         ]
 
-        return bootstrapped_reqs
+        if return_failed_reqs is False:
+            return bootstrapped_reqs
+        else:
+            return bootstrapped_reqs, failed_reqs
 
 
 class SchedulerDisaggregationPrefillMixin:
@@ -211,7 +262,7 @@ class SchedulerDisaggregationPrefillMixin:
     """
 
     @torch.no_grad()
-    def event_loop_normal_disagg_prefill(self: Scheduler):
+    def event_loop_normal_disagg_prefill(self: Scheduler) -> None:
         """A normal scheduler loop for prefill worker in disaggregation mode."""
 
         while True:
@@ -229,7 +280,6 @@ class SchedulerDisaggregationPrefillMixin:
                 or self.server_args.enable_sp_layernorm
             ):
                 batch, _ = self.prepare_dp_attn_batch(batch)
-
             self.cur_batch = batch
 
             if batch:
@@ -242,6 +292,7 @@ class SchedulerDisaggregationPrefillMixin:
             if batch is None and len(self.disagg_prefill_inflight_queue) == 0:
                 self.check_memory()
                 self.new_token_ratio = self.init_new_token_ratio
+                self.maybe_sleep_on_idle()
 
             self.last_batch = batch
             # HACK (byronhsu): reset the batch_is_full flag because we never enter update_running_batch which resets it
@@ -249,7 +300,7 @@ class SchedulerDisaggregationPrefillMixin:
             self.running_batch.batch_is_full = False
 
     @torch.no_grad()
-    def event_loop_overlap_disagg_prefill(self: Scheduler):
+    def event_loop_overlap_disagg_prefill(self: Scheduler) -> None:
         self.result_queue = deque()
 
         while True:
@@ -267,9 +318,7 @@ class SchedulerDisaggregationPrefillMixin:
                 or self.server_args.enable_sp_layernorm
             ):
                 batch, _ = self.prepare_dp_attn_batch(batch)
-
             self.cur_batch = batch
-
             if batch:
                 result = self.run_batch(batch)
                 self.result_queue.append((batch.copy(), result))
@@ -286,6 +335,9 @@ class SchedulerDisaggregationPrefillMixin:
 
             if self.last_batch:
                 tmp_batch, tmp_result = self.result_queue.popleft()
+                tmp_batch.next_batch_sampling_info = (
+                    self.tp_worker.cur_sampling_info if batch else None
+                )
                 self.process_batch_result_disagg_prefill(tmp_batch, tmp_result)
 
             if len(self.disagg_prefill_inflight_queue) > 0:
@@ -294,6 +346,7 @@ class SchedulerDisaggregationPrefillMixin:
             if batch is None and len(self.disagg_prefill_inflight_queue) == 0:
                 self.check_memory()
                 self.new_token_ratio = self.init_new_token_ratio
+                self.maybe_sleep_on_idle()
 
             self.last_batch = batch
             # HACK (byronhsu): reset the batch_is_full flag because we never enter update_running_batch which resets it
@@ -307,7 +360,7 @@ class SchedulerDisaggregationPrefillMixin:
         launch_done: Optional[threading.Event] = None,
     ) -> None:
         """
-        Transfer kv for prefill completed requests and add it into disagg_prefill_infight_queue
+        Transfer kv for prefill completed requests and add it into disagg_prefill_inflight_queue
         Adapted from process_batch_result_prefill
         """
         (
@@ -323,7 +376,7 @@ class SchedulerDisaggregationPrefillMixin:
         )
 
         logprob_pt = 0
-        # Transfer kv for prefill completed requests and add it into disagg_prefill_infight_queue
+        # Transfer kv for prefill completed requests and add it into disagg_prefill_inflight_queue
         if self.enable_overlap:
             # wait
             logits_output, next_token_ids, _ = self.tp_worker.resolve_last_batch_result(
@@ -395,11 +448,15 @@ class SchedulerDisaggregationPrefillMixin:
         # We need to remove the sync in the following function for overlap schedule.
         self.set_next_batch_sampling_info_done(batch)
 
-    def process_disagg_prefill_inflight_queue(self: Scheduler) -> None:
+    def process_disagg_prefill_inflight_queue(
+        self: Scheduler, rids_to_check: Optional[List[str]] = None
+    ) -> List[Req]:
         """
         Poll the requests in the middle of transfer. If done, return the request.
+        rids_to_check: For PP, on rank > 0, check the rids from the previous rank has consensus with the current rank.
         """
-        assert len(self.disagg_prefill_inflight_queue) > 0
+        if len(self.disagg_prefill_inflight_queue) == 0:
+            return []
 
         done_reqs = []
 
@@ -411,12 +468,22 @@ class SchedulerDisaggregationPrefillMixin:
         undone_reqs: List[Req] = []
         # Check .poll() for the reqs in disagg_prefill_inflight_queue. If Success, respond to the client and remove it from the queue
         for req, poll in zip(self.disagg_prefill_inflight_queue, polls):
+
+            if rids_to_check is not None:
+                if req.rid not in rids_to_check:
+                    undone_reqs.append(req)
+                    continue
+
+                assert poll == KVPoll.Success or poll == KVPoll.Failed
+
             if poll in [KVPoll.WaitingForInput, KVPoll.Transferring]:
                 undone_reqs.append(req)
             elif poll == KVPoll.Success:  # transfer done
                 self.tree_cache.cache_finished_req(req)  # unlock the tree
                 req.finished_reason = FINISH_LENGTH(length=0)
                 # FIXME: clean up req's data in transfer engine
+                if hasattr(req.disagg_kv_sender, "clear"):
+                    req.disagg_kv_sender.clear()
                 done_reqs.append(req)
             elif poll == KVPoll.Failed:
                 error_message = f"Prefill transfer failed for request rank={self.tp_rank} {req.rid=} {req.bootstrap_room=}"
@@ -430,11 +497,8 @@ class SchedulerDisaggregationPrefillMixin:
                     req, error_message, status_code=HTTPStatus.INTERNAL_SERVER_ERROR
                 )
                 done_reqs.append(req)
-
-        for req in done_reqs:
-            self.disagg_prefill_bootstrap_queue.req_to_metadata_buffer_idx_allocator.free(
-                req.metadata_buffer_index
-            )
+            else:
+                assert False, f"Unexpected polling state {poll=}"
 
         # Stream requests which have finished transfer
         self.stream_output(
@@ -442,9 +506,32 @@ class SchedulerDisaggregationPrefillMixin:
             any(req.return_logprob for req in done_reqs),
             None,
         )
+        for req in done_reqs:
+            req: Req
+            self.req_to_metadata_buffer_idx_allocator.free(req.metadata_buffer_index)
+            req.metadata_buffer_index = -1
 
         self.disagg_prefill_inflight_queue = undone_reqs
 
+        return done_reqs
+
+    def get_transferred_rids(self: Scheduler) -> List[str]:
+        """
+        Used by PP, get the transferred rids but **do not pop**
+        """
+        polls = poll_and_all_reduce(
+            [req.disagg_kv_sender for req in self.disagg_prefill_inflight_queue],
+            self.tp_worker.get_tp_group().cpu_group,
+        )
+
+        transferred_rids: List[str] = []
+
+        for req, poll in zip(self.disagg_prefill_inflight_queue, polls):
+            if poll == KVPoll.Success or poll == KVPoll.Failed:
+                transferred_rids.append(req.rid)
+
+        return transferred_rids
+
     def process_prefill_chunk(self: Scheduler) -> None:
         if self.last_batch and self.last_batch.forward_mode.is_extend():
             if self.chunked_req:
@@ -3,6 +3,7 @@ from __future__ import annotations
3
3
  import dataclasses
4
4
  import os
5
5
  import random
6
+ import threading
6
7
  import warnings
7
8
  from collections import deque
8
9
  from enum import Enum
@@ -18,10 +19,10 @@ from sglang.srt.utils import get_ip
18
19
  if TYPE_CHECKING:
19
20
  from sglang.srt.managers.schedule_batch import Req
20
21
 
21
- FakeBootstrapHost = "2.2.2.2"
22
-
23
- # env var for testing failure, convert to float explicitly
24
- FAILURE_PROB = float(os.getenv("DISAGGREGATION_TEST_FAILURE_PROB", 0))
22
+ #########################
23
+ # Constants & Enums
24
+ #########################
25
+ FAKE_BOOTSTRAP_HOST = "2.2.2.2"
25
26
 
26
27
 
27
28
  class DisaggregationMode(Enum):
@@ -30,6 +31,14 @@ class DisaggregationMode(Enum):
30
31
  DECODE = "decode"
31
32
 
32
33
 
34
+ #########################
35
+ # Synchronization
36
+ #########################
37
+
38
+ # env var for testing failure, convert to float explicitly
39
+ FAILURE_PROB = float(os.getenv("DISAGGREGATION_TEST_FAILURE_PROB", 0))
40
+
41
+
33
42
  def poll_and_all_reduce(pollers, gloo_group):
34
43
  # at a certain prob, the poll is failed to simulate failure
35
44
  if FAILURE_PROB > 0:
@@ -46,6 +55,11 @@ def poll_and_all_reduce(pollers, gloo_group):
46
55
  return tensor_to_reduce.tolist()
47
56
 
48
57
 
58
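poll_and_all_reduce is what keeps the per-rank views of a transfer consistent: each rank polls its local senders, packs the states into an int tensor, and all-reduces it over the TP CPU (gloo) group so every rank acts on the same state. A minimal sketch of that shape, assuming a MIN reduction (the most conservative state wins) and ignoring the FAILURE_PROB test hook; it is not necessarily identical to the shipped implementation.

    import torch
    import torch.distributed as dist

    def poll_and_all_reduce_sketch(pollers, gloo_group):
        # Each poller reports its local KVPoll state as an integer.
        local_states = [int(poller.poll()) for poller in pollers]
        tensor_to_reduce = torch.tensor(local_states, dtype=torch.int32)
        # MIN across ranks keeps the most conservative state, so no rank
        # advances a request ahead of its peers (assumption about the ReduceOp).
        dist.all_reduce(tensor_to_reduce, op=dist.ReduceOp.MIN, group=gloo_group)
        return tensor_to_reduce.tolist()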
+ #########################
59
+ # Metadata Buffers
60
+ #########################
61
+
62
+
49
63
  class ReqToMetadataIdxAllocator:
50
64
  """A memory pool that maps a request to its first output token location."""
51
65
 
@@ -69,6 +83,91 @@ class ReqToMetadataIdxAllocator:
69
83
  self.free_slots.append(free_index)
70
84
 
71
85
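Only the tail of ReqToMetadataIdxAllocator is visible in this hunk, but its role is simple: a fixed-size free list that hands each in-flight request one metadata-buffer slot and reclaims the slot once the transfer is streamed out. A minimal sketch of that shape (not the exact shipped class):

    from collections import deque
    from typing import Optional

    class FreeSlotAllocatorSketch:
        """Hands out integer slot indices from a fixed-size pool."""

        def __init__(self, size: int):
            self.free_slots = deque(range(size))

        def alloc(self) -> Optional[int]:
            # Return None when the pool is exhausted so the caller can back off.
            return self.free_slots.popleft() if self.free_slots else None

        def free(self, free_index: int) -> None:
            self.free_slots.append(free_index)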
 
86
+ class MetadataBuffers:
87
+ def __init__(self, size: int, max_top_logprobs_num: int = 128):
88
+ # TODO: abort top_logprobs_num > 128 in PD
89
+
90
+ # We transfer the metadata of the first output token to the decode side
91
+ # The minimal RDMA message size is 64 bytes, so we pad each row to at least 64 bytes
92
+ self.output_ids = torch.zeros((size, 16), dtype=torch.int32, device="cpu")
93
+ self.output_token_logprobs_val = torch.zeros(
94
+ (size, 16), dtype=torch.float32, device="cpu"
95
+ )
96
+ self.output_token_logprobs_idx = torch.zeros(
97
+ (size, 16), dtype=torch.int32, device="cpu"
98
+ )
99
+ self.output_top_logprobs_val = torch.zeros(
100
+ (size, max_top_logprobs_num), dtype=torch.float32, device="cpu"
101
+ )
102
+ self.output_top_logprobs_idx = torch.zeros(
103
+ (size, max_top_logprobs_num), dtype=torch.int32, device="cpu"
104
+ )
105
+
106
+ def get_buf_infos(self):
107
+ ptrs = [
108
+ self.output_ids.data_ptr(),
109
+ self.output_token_logprobs_val.data_ptr(),
110
+ self.output_token_logprobs_idx.data_ptr(),
111
+ self.output_top_logprobs_val.data_ptr(),
112
+ self.output_top_logprobs_idx.data_ptr(),
113
+ ]
114
+ data_lens = [
115
+ self.output_ids.nbytes,
116
+ self.output_token_logprobs_val.nbytes,
117
+ self.output_token_logprobs_idx.nbytes,
118
+ self.output_top_logprobs_val.nbytes,
119
+ self.output_top_logprobs_idx.nbytes,
120
+ ]
121
+ item_lens = [
122
+ self.output_ids[0].nbytes,
123
+ self.output_token_logprobs_val[0].nbytes,
124
+ self.output_token_logprobs_idx[0].nbytes,
125
+ self.output_top_logprobs_val[0].nbytes,
126
+ self.output_top_logprobs_idx[0].nbytes,
127
+ ]
128
+ return ptrs, data_lens, item_lens
129
+
130
+ def get_buf(self, idx: int):
131
+ return (
132
+ self.output_ids[idx],
133
+ self.output_token_logprobs_val[idx],
134
+ self.output_token_logprobs_idx[idx],
135
+ self.output_top_logprobs_val[idx],
136
+ self.output_top_logprobs_idx[idx],
137
+ )
138
+
139
+ def set_buf(self, req: Req):
140
+
141
+ self.output_ids[req.metadata_buffer_index][0] = req.output_ids[0]
142
+ if req.return_logprob:
143
+ if req.output_token_logprobs_val: # not none or empty list
144
+ self.output_token_logprobs_val[req.metadata_buffer_index][0] = (
145
+ req.output_token_logprobs_val[0]
146
+ )
147
+ if req.output_token_logprobs_idx: # not none or empty list
148
+ self.output_token_logprobs_idx[req.metadata_buffer_index][0] = (
149
+ req.output_token_logprobs_idx[0]
150
+ )
151
+
152
+ if req.output_top_logprobs_val: # not none or empty list
153
+ self.output_top_logprobs_val[req.metadata_buffer_index][
154
+ : len(req.output_top_logprobs_val[0])
155
+ ] = torch.tensor(
156
+ req.output_top_logprobs_val[0], dtype=torch.float32, device="cpu"
157
+ )
158
+ if req.output_top_logprobs_idx: # not none or empty list
159
+ self.output_top_logprobs_idx[req.metadata_buffer_index][
160
+ : len(req.output_top_logprobs_idx[0])
161
+ ] = torch.tensor(
162
+ req.output_top_logprobs_idx[0], dtype=torch.int32, device="cpu"
163
+ )
164
+
165
+
166
+ #########################
167
+ # Transfer Backend
168
+ #########################
169
+
170
+
72
171
  class TransferBackend(Enum):
73
172
  MOONCAKE = "mooncake"
74
173
  NIXL = "nixl"
@@ -76,6 +175,7 @@ class TransferBackend(Enum):
76
175
 
77
176
 
78
177
  class KVClassType(Enum):
178
+ KVARGS = "kvargs"
79
179
  MANAGER = "manager"
80
180
  SENDER = "sender"
81
181
  RECEIVER = "receiver"
@@ -86,6 +186,7 @@ def get_kv_class(transfer_backend: TransferBackend, class_type: KVClassType):
86
186
  from sglang.srt.disaggregation.fake import FakeKVReceiver, FakeKVSender
87
187
 
88
188
  if transfer_backend == TransferBackend.MOONCAKE:
189
+ from sglang.srt.disaggregation.base import KVArgs
89
190
  from sglang.srt.disaggregation.mooncake import (
90
191
  MooncakeKVBootstrapServer,
91
192
  MooncakeKVManager,
@@ -94,13 +195,15 @@ def get_kv_class(transfer_backend: TransferBackend, class_type: KVClassType):
94
195
  )
95
196
 
96
197
  class_mapping = {
198
+ KVClassType.KVARGS: KVArgs,
97
199
  KVClassType.MANAGER: MooncakeKVManager,
98
200
  KVClassType.SENDER: MooncakeKVSender,
99
201
  KVClassType.RECEIVER: (MooncakeKVReceiver),
100
202
  KVClassType.BOOTSTRAP_SERVER: MooncakeKVBootstrapServer,
101
203
  }
102
204
  return class_mapping.get(class_type)
103
- if transfer_backend == TransferBackend.NIXL:
205
+ elif transfer_backend == TransferBackend.NIXL:
206
+ from sglang.srt.disaggregation.base import KVArgs
104
207
  from sglang.srt.disaggregation.nixl import (
105
208
  NixlKVBootstrapServer,
106
209
  NixlKVManager,
@@ -109,16 +212,19 @@ def get_kv_class(transfer_backend: TransferBackend, class_type: KVClassType):
109
212
  )
110
213
 
111
214
  class_mapping = {
215
+ KVClassType.KVARGS: KVArgs,
112
216
  KVClassType.MANAGER: NixlKVManager,
113
217
  KVClassType.SENDER: NixlKVSender,
114
218
  KVClassType.RECEIVER: (NixlKVReceiver),
115
219
  KVClassType.BOOTSTRAP_SERVER: NixlKVBootstrapServer,
116
220
  }
117
221
  return class_mapping.get(class_type)
118
- if transfer_backend == TransferBackend.FAKE:
222
+ elif transfer_backend == TransferBackend.FAKE:
223
+ from sglang.srt.disaggregation.base import KVArgs
119
224
  from sglang.srt.disaggregation.fake import FakeKVReceiver, FakeKVSender
120
225
 
121
226
  class_mapping = {
227
+ KVClassType.KVARGS: KVArgs,
122
228
  KVClassType.SENDER: FakeKVSender,
123
229
  KVClassType.RECEIVER: (FakeKVReceiver),
124
230
  }
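With the new KVARGS entry, every backend-specific piece a scheduler needs (the args container, manager, sender/receiver, bootstrap server) is resolved through the same lookup instead of per-backend imports. A minimal usage sketch; the backend choice is assumed to come from server configuration:

    from sglang.srt.disaggregation.utils import (
        KVClassType,
        TransferBackend,
        get_kv_class,
    )

    backend = TransferBackend.MOONCAKE  # e.g. picked from server args
    KVArgsCls = get_kv_class(backend, KVClassType.KVARGS)
    SenderCls = get_kv_class(backend, KVClassType.SENDER)
    ManagerCls = get_kv_class(backend, KVClassType.MANAGER)
    # The scheduler then wires these together; constructor arguments (engine
    # config, bootstrap address, room id, ...) remain backend-specific.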
@@ -127,6 +233,11 @@ def get_kv_class(transfer_backend: TransferBackend, class_type: KVClassType):
127
233
  raise ValueError(f"Unsupported transfer backend: {transfer_backend}")
128
234
 
129
235
 
236
+ #########################
237
+ # KV Pages
238
+ #########################
239
+
240
+
130
241
  def kv_to_page_indices(kv_indices: np.ndarray, page_size: int):
131
242
  # 1. The page is guaranteed to be full except the last page.
132
243
  # 2. page index = kv_index // page_size
@@ -142,6 +253,11 @@ def kv_to_page_num(num_kv_indices: int, page_size: int):
142
253
  return (num_kv_indices + page_size - 1) // page_size
143
254
 
144
255
 
256
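The two comments above state the invariant these helpers rely on: a request's KV indices fill whole pages except possibly the last, and a page index is just kv_index // page_size. Under that invariant, sampling one kv index per page is enough to recover the page list; the snippet below is a sketch consistent with those comments and with the ceil division shown for kv_to_page_num, not a verbatim copy of the shipped functions.

    import numpy as np

    def kv_to_page_indices_sketch(kv_indices: np.ndarray, page_size: int) -> np.ndarray:
        if page_size == 1:
            return kv_indices
        # Every page is full except (possibly) the last, so one kv index per
        # page-sized chunk identifies all touched pages.
        return kv_indices[::page_size] // page_size

    def kv_to_page_num_sketch(num_kv_indices: int, page_size: int) -> int:
        return (num_kv_indices + page_size - 1) // page_size  # ceil division

    assert kv_to_page_num_sketch(10, 4) == 3
    assert np.array_equal(
        kv_to_page_indices_sketch(np.arange(8, 18), 4), np.array([2, 3, 4])
    )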
+ #########################
257
+ # PDLB Registry
258
+ #########################
259
+
260
+
145
261
  @dataclasses.dataclass
146
262
  class PDRegistryRequest:
147
263
  """A request to register a machine itself to the LB."""
@@ -180,6 +296,11 @@ def register_disaggregation_server(
180
296
  )
181
297
 
182
298
 
299
+ #########################
300
+ # Misc
301
+ #########################
302
+
303
+
183
304
  def is_mla_backend(target_kv_pool) -> bool:
184
305
  from sglang.srt.mem_cache.memory_pool import MLATokenToKVPool
185
306
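The visible context only shows that is_mla_backend imports MLATokenToKVPool locally, so the check presumably reduces to an isinstance test against that pool class; a sketch under that assumption:

    def is_mla_backend_sketch(target_kv_pool) -> bool:
        # Assumed behavior: MLA models use a dedicated token-to-KV pool class.
        from sglang.srt.mem_cache.memory_pool import MLATokenToKVPool
        return isinstance(target_kv_pool, MLATokenToKVPool)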
 
@@ -199,83 +320,3 @@ def prepare_abort(req: Req, error_message: str, status_code=None):
199
320
  req.input_top_logprobs_idx = []
200
321
  req.input_token_ids_logprobs_val = []
201
322
  req.input_token_ids_logprobs_idx = []
202
-
203
-
204
- class MetadataBuffers:
205
- def __init__(self, size: int, max_top_logprobs_num: int = 128):
206
- # TODO: abort top_logprobs_num > 128 in PD
207
-
208
- # We transfer the metadata of first output token to decode
209
- # The minimal size for RDMA is 64Bytes, so we pad it to > 64Bytes
210
- self.output_ids = torch.zeros((size, 16), dtype=torch.int32, device="cpu")
211
- self.output_token_logprobs_val = torch.zeros(
212
- (size, 16), dtype=torch.float32, device="cpu"
213
- )
214
- self.output_token_logprobs_idx = torch.zeros(
215
- (size, 16), dtype=torch.int32, device="cpu"
216
- )
217
- self.output_top_logprobs_val = torch.zeros(
218
- (size, max_top_logprobs_num), dtype=torch.float32, device="cpu"
219
- )
220
- self.output_top_logprobs_idx = torch.zeros(
221
- (size, max_top_logprobs_num), dtype=torch.int32, device="cpu"
222
- )
223
-
224
- def get_buf_infos(self):
225
- ptrs = [
226
- self.output_ids.data_ptr(),
227
- self.output_token_logprobs_val.data_ptr(),
228
- self.output_token_logprobs_idx.data_ptr(),
229
- self.output_top_logprobs_val.data_ptr(),
230
- self.output_top_logprobs_idx.data_ptr(),
231
- ]
232
- data_lens = [
233
- self.output_ids.nbytes,
234
- self.output_token_logprobs_val.nbytes,
235
- self.output_token_logprobs_idx.nbytes,
236
- self.output_top_logprobs_val.nbytes,
237
- self.output_top_logprobs_idx.nbytes,
238
- ]
239
- item_lens = [
240
- self.output_ids[0].nbytes,
241
- self.output_token_logprobs_val[0].nbytes,
242
- self.output_token_logprobs_idx[0].nbytes,
243
- self.output_top_logprobs_val[0].nbytes,
244
- self.output_top_logprobs_idx[0].nbytes,
245
- ]
246
- return ptrs, data_lens, item_lens
247
-
248
- def get_buf(self, idx: int):
249
- return (
250
- self.output_ids[idx],
251
- self.output_token_logprobs_val[idx],
252
- self.output_token_logprobs_idx[idx],
253
- self.output_top_logprobs_val[idx],
254
- self.output_top_logprobs_idx[idx],
255
- )
256
-
257
- def set_buf(self, req: Req):
258
-
259
- self.output_ids[req.metadata_buffer_index][0] = req.output_ids[0]
260
- if req.return_logprob:
261
- if req.output_token_logprobs_val: # not none or empty list
262
- self.output_token_logprobs_val[req.metadata_buffer_index][0] = (
263
- req.output_token_logprobs_val[0]
264
- )
265
- if req.output_token_logprobs_idx: # not none or empty list
266
- self.output_token_logprobs_idx[req.metadata_buffer_index][0] = (
267
- req.output_token_logprobs_idx[0]
268
- )
269
-
270
- if req.output_top_logprobs_val: # not none or empty list
271
- self.output_top_logprobs_val[req.metadata_buffer_index][
272
- : len(req.output_top_logprobs_val[0])
273
- ] = torch.tensor(
274
- req.output_top_logprobs_val[0], dtype=torch.float32, device="cpu"
275
- )
276
- if req.output_top_logprobs_idx: # not none or empty list
277
- self.output_top_logprobs_idx[req.metadata_buffer_index][
278
- : len(req.output_top_logprobs_idx[0])
279
- ] = torch.tensor(
280
- req.output_top_logprobs_idx[0], dtype=torch.int32, device="cpu"
281
- )