sglang 0.4.6.post4__py3-none-any.whl → 0.4.6.post5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +6 -6
- sglang/bench_one_batch.py +5 -4
- sglang/bench_one_batch_server.py +23 -15
- sglang/bench_serving.py +133 -57
- sglang/compile_deep_gemm.py +4 -4
- sglang/srt/configs/model_config.py +39 -28
- sglang/srt/conversation.py +1 -1
- sglang/srt/disaggregation/decode.py +122 -133
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +3 -13
- sglang/srt/disaggregation/kv_events.py +357 -0
- sglang/srt/disaggregation/mini_lb.py +57 -24
- sglang/srt/disaggregation/mooncake/conn.py +11 -2
- sglang/srt/disaggregation/mooncake/transfer_engine.py +2 -1
- sglang/srt/disaggregation/nixl/conn.py +9 -19
- sglang/srt/disaggregation/prefill.py +126 -44
- sglang/srt/disaggregation/utils.py +116 -5
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +5 -0
- sglang/srt/entrypoints/engine.py +28 -8
- sglang/srt/entrypoints/http_server.py +6 -4
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +250 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +157 -0
- sglang/srt/function_call/ebnf_composer.py +234 -0
- sglang/srt/function_call/function_call_parser.py +175 -0
- sglang/srt/function_call/llama32_detector.py +74 -0
- sglang/srt/function_call/mistral_detector.py +84 -0
- sglang/srt/function_call/pythonic_detector.py +163 -0
- sglang/srt/function_call/qwen25_detector.py +67 -0
- sglang/srt/function_call/utils.py +35 -0
- sglang/srt/hf_transformers_utils.py +46 -7
- sglang/srt/layers/attention/aiter_backend.py +513 -0
- sglang/srt/layers/attention/flashattention_backend.py +63 -17
- sglang/srt/layers/attention/flashinfer_mla_backend.py +8 -4
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/triton_backend.py +3 -0
- sglang/srt/layers/attention/utils.py +2 -2
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +451 -0
- sglang/srt/layers/dp_attention.py +0 -10
- sglang/srt/layers/moe/cutlass_moe.py +207 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
- sglang/srt/layers/moe/ep_moe/layer.py +104 -50
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +82 -7
- sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -0
- sglang/srt/layers/moe/topk.py +66 -9
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +7 -2
- sglang/srt/layers/quantization/deep_gemm.py +5 -3
- sglang/srt/layers/quantization/fp8.py +90 -0
- sglang/srt/layers/quantization/fp8_utils.py +6 -0
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +18 -5
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/lora/lora_manager.py +1 -3
- sglang/srt/managers/deepseek_eplb.py +278 -0
- sglang/srt/managers/eplb_manager.py +55 -0
- sglang/srt/managers/expert_distribution.py +704 -56
- sglang/srt/managers/expert_location.py +394 -0
- sglang/srt/managers/expert_location_dispatch.py +91 -0
- sglang/srt/managers/io_struct.py +16 -3
- sglang/srt/managers/mm_utils.py +293 -139
- sglang/srt/managers/multimodal_processors/base_processor.py +127 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +31 -6
- sglang/srt/managers/multimodal_processors/internvl.py +14 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +7 -6
- sglang/srt/managers/multimodal_processors/llava.py +3 -3
- sglang/srt/managers/multimodal_processors/minicpm.py +25 -31
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
- sglang/srt/managers/multimodal_processors/qwen_vl.py +58 -16
- sglang/srt/managers/schedule_batch.py +49 -21
- sglang/srt/managers/schedule_policy.py +4 -5
- sglang/srt/managers/scheduler.py +92 -50
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +99 -24
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +74 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +2 -2
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +20 -9
- sglang/srt/model_executor/expert_location_updater.py +422 -0
- sglang/srt/model_executor/forward_batch_info.py +4 -0
- sglang/srt/model_executor/model_runner.py +144 -54
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_v2.py +297 -343
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_mm.py +70 -33
- sglang/srt/models/llama4.py +10 -2
- sglang/srt/models/llava.py +26 -18
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +5 -12
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/qwen2.py +95 -26
- sglang/srt/models/qwen2_5_vl.py +8 -0
- sglang/srt/models/qwen2_moe.py +330 -60
- sglang/srt/models/qwen2_vl.py +6 -0
- sglang/srt/models/qwen3.py +52 -10
- sglang/srt/models/qwen3_moe.py +411 -48
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/openai_api/adapter.py +28 -16
- sglang/srt/openai_api/protocol.py +6 -0
- sglang/srt/operations.py +154 -0
- sglang/srt/operations_strategy.py +31 -0
- sglang/srt/server_args.py +134 -24
- sglang/srt/speculative/eagle_utils.py +131 -0
- sglang/srt/speculative/eagle_worker.py +47 -2
- sglang/srt/utils.py +68 -12
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_utils.py +2 -36
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.6.post4.dist-info → sglang-0.4.6.post5.dist-info}/METADATA +20 -11
- {sglang-0.4.6.post4.dist-info → sglang-0.4.6.post5.dist-info}/RECORD +128 -102
- {sglang-0.4.6.post4.dist-info → sglang-0.4.6.post5.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.6.post5.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.6.post5.dist-info}/top_level.txt +0 -0
sglang/srt/disaggregation/mooncake/transfer_engine.py
CHANGED
@@ -61,7 +61,8 @@ class MooncakeTransferEngine:
         self, session_id: str, buffer: int, peer_buffer_address: int, length: int
     ) -> int:
         """Synchronously transfer data to the specified address."""
-
+        # the first time: based on session_id (which contains remote_ip) to construct a queue pair, and cache the queue pair
+        # later: based on the cached queue pair to send data
         ret = self.engine.transfer_sync_write(
             session_id, buffer, peer_buffer_address, length
         )
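Note: the added comments describe a connect-once pattern. The first transfer for a given session_id (which embeds the remote IP) constructs a queue pair and caches it; later transfers reuse the cached pair. A minimal sketch of that caching idea, assuming a hypothetical create_queue_pair helper (the real caching lives inside the Mooncake transfer engine, not in sglang):

    _queue_pair_cache: dict = {}

    def _get_queue_pair(session_id: str):
        # session_id embeds the remote IP, so it keys the cached connection
        if session_id not in _queue_pair_cache:
            _queue_pair_cache[session_id] = create_queue_pair(session_id)  # hypothetical helper
        return _queue_pair_cache[session_id]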
sglang/srt/disaggregation/nixl/conn.py
CHANGED
@@ -35,29 +35,19 @@ logger = logging.getLogger(__name__)
 NixlEngineInfo: TypeAlias = Dict[str, Union[str, int]]
 
 
-# From Mooncake backend.
 def group_concurrent_contiguous(
     src_indices: npt.NDArray[np.int64], dst_indices: npt.NDArray[np.int64]
 ) -> Tuple[List[npt.NDArray[np.int64]], List[npt.NDArray[np.int64]]]:
-    src_groups = []
-    dst_groups = []
-    current_src = [src_indices[0]]
-    current_dst = [dst_indices[0]]
-
-    for i in range(1, len(src_indices)):
-        src_contiguous = src_indices[i] == src_indices[i - 1] + 1
-        dst_contiguous = dst_indices[i] == dst_indices[i - 1] + 1
-        if src_contiguous and dst_contiguous:
-            current_src.append(src_indices[i])
-            current_dst.append(dst_indices[i])
-        else:
-            src_groups.append(current_src)
-            dst_groups.append(current_dst)
-            current_src = [src_indices[i]]
-            current_dst = [dst_indices[i]]
+    """Vectorised NumPy implementation."""
+    if src_indices.size == 0:
+        return [], []
+
+    brk = np.where((np.diff(src_indices) != 1) | (np.diff(dst_indices) != 1))[0] + 1
+    src_groups = np.split(src_indices, brk)
+    dst_groups = np.split(dst_indices, brk)
 
-    src_groups.append(current_src)
-    dst_groups.append(current_dst)
+    src_groups = [g.tolist() for g in src_groups]
+    dst_groups = [g.tolist() for g in dst_groups]
 
     return src_groups, dst_groups
 
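Note: the rewrite replaces the element-by-element Python loop with a vectorised split. np.diff marks every position where either index sequence stops being contiguous, and np.split cuts both arrays at those positions. A self-contained illustration of the same logic:

    import numpy as np

    src = np.array([3, 4, 5, 9, 10], dtype=np.int64)
    dst = np.array([0, 1, 2, 3, 5], dtype=np.int64)

    # breaks fall after index 2 (src jumps 5 -> 9) and after index 3 (dst jumps 3 -> 5)
    brk = np.where((np.diff(src) != 1) | (np.diff(dst) != 1))[0] + 1
    print([g.tolist() for g in np.split(src, brk)])  # [[3, 4, 5], [9], [10]]
    print([g.tolist() for g in np.split(dst, brk)])  # [[0, 1, 2], [3], [5]]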
sglang/srt/disaggregation/prefill.py
CHANGED
@@ -22,6 +22,7 @@ from __future__ import annotations
 import logging
 import threading
 from collections import deque
+from http import HTTPStatus
 from typing import TYPE_CHECKING, List, Optional
 
 import torch
@@ -31,6 +32,7 @@ from sglang.srt.disaggregation.utils import (
     DisaggregationMode,
     FakeBootstrapHost,
     KVClassType,
+    MetadataBuffers,
     ReqToMetadataIdxAllocator,
     TransferBackend,
     get_kv_class,
@@ -38,8 +40,10 @@ from sglang.srt.disaggregation.utils import (
     kv_to_page_indices,
     kv_to_page_num,
     poll_and_all_reduce,
+    prepare_abort,
 )
 from sglang.srt.managers.schedule_batch import FINISH_LENGTH, Req, ScheduleBatch
+from sglang.srt.model_executor.forward_batch_info import ForwardMode
 
 if TYPE_CHECKING:
     from torch.distributed import ProcessGroup
@@ -59,9 +63,9 @@ class PrefillBootstrapQueue:
     def __init__(
         self,
         token_to_kv_pool: KVCache,
+        draft_token_to_kv_pool: Optional[KVCache],
         req_to_metadata_buffer_idx_allocator: ReqToMetadataIdxAllocator,
-        metadata_buffers: List[torch.Tensor],
-        aux_dtype: torch.dtype,
+        metadata_buffers: MetadataBuffers,
         tp_rank: int,
         tp_size: int,
         bootstrap_port: int,
@@ -70,8 +74,9 @@ class PrefillBootstrapQueue:
         scheduler: Scheduler,
     ):
         self.token_to_kv_pool = token_to_kv_pool
+        self.draft_token_to_kv_pool = draft_token_to_kv_pool
+
         self.is_mla_backend = is_mla_backend(token_to_kv_pool)
-        self.aux_dtype = aux_dtype
 
         self.metadata_buffers = metadata_buffers
         self.req_to_metadata_buffer_idx_allocator = req_to_metadata_buffer_idx_allocator
@@ -96,20 +101,24 @@ class PrefillBootstrapQueue:
             self.token_to_kv_pool.get_contiguous_buf_infos()
         )
 
+        if self.draft_token_to_kv_pool is not None:
+            # We should also transfer draft model kv cache. The indices are
+            # always shared with a target model.
+            draft_kv_data_ptrs, draft_kv_data_lens, draft_kv_item_lens = (
+                self.draft_token_to_kv_pool.get_contiguous_buf_infos()
+            )
+            kv_data_ptrs += draft_kv_data_ptrs
+            kv_data_lens += draft_kv_data_lens
+            kv_item_lens += draft_kv_item_lens
+
         kv_args.kv_data_ptrs = kv_data_ptrs
         kv_args.kv_data_lens = kv_data_lens
         kv_args.kv_item_lens = kv_item_lens
 
         # Define req -> input ids buffer
-        kv_args.aux_data_ptrs = [
-            metadata_buffer.data_ptr() for metadata_buffer in self.metadata_buffers
-        ]
-        kv_args.aux_data_lens = [
-            metadata_buffer.nbytes for metadata_buffer in self.metadata_buffers
-        ]
-        kv_args.aux_item_lens = [
-            metadata_buffer[0].nbytes for metadata_buffer in self.metadata_buffers
-        ]
+        kv_args.aux_data_ptrs, kv_args.aux_data_lens, kv_args.aux_item_lens = (
+            self.metadata_buffers.get_buf_infos()
+        )
         kv_args.ib_device = self.scheduler.server_args.disaggregation_ib_device
         kv_args.gpu_id = self.scheduler.gpu_id
         kv_manager_class = get_kv_class(self.transfer_backend, KVClassType.MANAGER)
@@ -135,6 +144,10 @@ class PrefillBootstrapQueue:
         self._process_req(req)
         self.queue.append(req)
 
+    def extend(self, reqs: List[Req]) -> None:
+        for req in reqs:
+            self.add(req)
+
     def _process_req(self, req: Req) -> None:
         """
         Set max_new_tokens = 1, so PrefillAdder memory estimation is accurate
@@ -157,7 +170,18 @@ class PrefillBootstrapQueue:
             if poll == KVPoll.Bootstrapping:
                 continue
             elif poll == KVPoll.Failed:
-
+                error_message = f"Prefill bootstrap failed for request rank={self.tp_rank} {req.rid=} {req.bootstrap_room=}"
+                try:
+                    req.disagg_kv_sender.failure_exception()
+                except Exception as e:
+                    error_message += f" with exception {e}"
+                logger.error(error_message)
+                prepare_abort(
+                    req, error_message, status_code=HTTPStatus.INTERNAL_SERVER_ERROR
+                )
+                self.scheduler.stream_output([req], req.return_logprob)
+                indices_to_remove.add(i)
+                continue
 
             # KV.WaitingForInput
             num_kv_indices = len(req.origin_input_ids)
@@ -250,6 +274,16 @@ class SchedulerDisaggregationPrefillMixin:
             result = self.run_batch(batch)
             self.result_queue.append((batch.copy(), result))
 
+            if self.last_batch is None:
+                # Create a dummy first batch to start the pipeline for overlap schedule.
+                # It is now used for triggering the sampling_info_done event.
+                tmp_batch = ScheduleBatch(
+                    reqs=None,
+                    forward_mode=ForwardMode.DUMMY_FIRST,
+                    next_batch_sampling_info=self.tp_worker.cur_sampling_info,
+                )
+                self.set_next_batch_sampling_info_done(tmp_batch)
+
         if self.last_batch:
             tmp_batch, tmp_result = self.result_queue.popleft()
             self.process_batch_result_disagg_prefill(tmp_batch, tmp_result)
@@ -273,10 +307,9 @@ class SchedulerDisaggregationPrefillMixin:
         launch_done: Optional[threading.Event] = None,
     ) -> None:
         """
-        Transfer kv for prefill completed requests and add it into
+        Transfer kv for prefill completed requests and add it into disagg_prefill_infight_queue
         Adapted from process_batch_result_prefill
         """
-
         (
             logits_output,
             next_token_ids,
@@ -289,27 +322,78 @@ class SchedulerDisaggregationPrefillMixin:
             result.extend_logprob_start_len_per_req,
         )
 
+        logprob_pt = 0
         # Transfer kv for prefill completed requests and add it into disagg_prefill_infight_queue
         if self.enable_overlap:
             # wait
-
+            logits_output, next_token_ids, _ = self.tp_worker.resolve_last_batch_result(
+                launch_done
+            )
         else:
             next_token_ids = result.next_token_ids.tolist()
-
-
+            if batch.return_logprob:
+                if logits_output.next_token_logprobs is not None:
+                    logits_output.next_token_logprobs = (
+                        logits_output.next_token_logprobs.tolist()
+                    )
+                if logits_output.input_token_logprobs is not None:
+                    logits_output.input_token_logprobs = tuple(
+                        logits_output.input_token_logprobs.tolist()
+                    )
+
+        for i, (req, next_token_id) in enumerate(
+            zip(batch.reqs, next_token_ids, strict=True)
+        ):
             req: Req
             if req.is_chunked <= 0:
                 # There is no output_ids for prefill
                 req.output_ids.append(next_token_id)
                 self.tree_cache.cache_unfinished_req(req)  # update the tree and lock
-                self.send_kv_chunk(req, token_id=next_token_id)
                 self.disagg_prefill_inflight_queue.append(req)
+                if req.return_logprob:
+                    assert extend_logprob_start_len_per_req is not None
+                    assert extend_input_len_per_req is not None
+                    extend_logprob_start_len = extend_logprob_start_len_per_req[i]
+                    extend_input_len = extend_input_len_per_req[i]
+                    num_input_logprobs = extend_input_len - extend_logprob_start_len
+                    self.add_logprob_return_values(
+                        i,
+                        req,
+                        logprob_pt,
+                        next_token_ids,
+                        num_input_logprobs,
+                        logits_output,
+                    )
+                    logprob_pt += num_input_logprobs
+                self.send_kv_chunk(req, last_chunk=True)
+
+                if req.grammar is not None:
+                    req.grammar.accept_token(next_token_id)
+                    req.grammar.finished = req.finished()
             else:
                 # being chunked reqs' prefill is not finished
                 req.is_chunked -= 1
 
+                if req.return_logprob:
+                    extend_logprob_start_len = extend_logprob_start_len_per_req[i]
+                    extend_input_len = extend_input_len_per_req[i]
+                    if extend_logprob_start_len < extend_input_len:
+                        # Update input logprobs.
+                        num_input_logprobs = extend_input_len - extend_logprob_start_len
+                        self.add_input_logprob_return_values(
+                            i,
+                            req,
+                            logits_output,
+                            logprob_pt,
+                            num_input_logprobs,
+                            last_prefill_chunk=False,
+                        )
+                        logprob_pt += num_input_logprobs
+
                 if self.enable_overlap:
-                    self.send_kv_chunk(req, end_idx=req.tmp_end_idx)
+                    self.send_kv_chunk(req, last_chunk=False, end_idx=req.tmp_end_idx)
+
+        # We need to remove the sync in the following function for overlap schedule.
+        self.set_next_batch_sampling_info_done(batch)
 
     def process_disagg_prefill_inflight_queue(self: Scheduler) -> None:
         """
@@ -335,7 +419,17 @@ class SchedulerDisaggregationPrefillMixin:
                 # FIXME: clean up req's data in transfer engine
                 done_reqs.append(req)
             elif poll == KVPoll.Failed:
-
+                error_message = f"Prefill transfer failed for request rank={self.tp_rank} {req.rid=} {req.bootstrap_room=}"
+                try:
+                    req.disagg_kv_sender.failure_exception()
+                except Exception as e:
+                    error_message += f" with exception {e}"
+                logger.warning(error_message)
+                self.tree_cache.cache_finished_req(req)  # unlock the tree
+                prepare_abort(
+                    req, error_message, status_code=HTTPStatus.INTERNAL_SERVER_ERROR
+                )
+                done_reqs.append(req)
 
         for req in done_reqs:
             self.disagg_prefill_bootstrap_queue.req_to_metadata_buffer_idx_allocator.free(
@@ -343,7 +437,11 @@ class SchedulerDisaggregationPrefillMixin:
             )
 
         # Stream requests which have finished transfer
-        self.stream_output(
+        self.stream_output(
+            done_reqs,
+            any(req.return_logprob for req in done_reqs),
+            None,
+        )
 
         self.disagg_prefill_inflight_queue = undone_reqs
 
@@ -369,7 +467,7 @@ class SchedulerDisaggregationPrefillMixin:
     def send_kv_chunk(
         self: Scheduler,
         req: Req,
-        token_id: Optional[int] = None,
+        last_chunk: bool = False,
         end_idx: Optional[int] = None,
     ) -> None:
         """
@@ -377,44 +475,28 @@ class SchedulerDisaggregationPrefillMixin:
         """
         page_size = self.token_to_kv_pool_allocator.page_size
         start_idx = req.start_send_idx
-        # if end_idx is specified, use it as the end index of the kv chunk because in overlap schedule,
-        # the resolved length is not the same as fill_ids's length
         end_idx = (
             end_idx
             if end_idx is not None
             else min(len(req.fill_ids), len(req.origin_input_ids))
         )
-        last_chunk = token_id is not None
 
-        if not last_chunk and (
-            end_idx % page_size != 0
-        ):  # todo: remove the second condition
+        if not last_chunk:
             # if not the last chunk and the last page is partial, delay the last partial page to the next send
             end_idx = end_idx - end_idx % page_size
 
-        # Update next start_send_idx
-        req.start_send_idx = end_idx
-
         kv_indices = (
             self.req_to_token_pool.req_to_token[req.req_pool_idx, start_idx:end_idx]
             .cpu()
             .numpy()
         )
-
-
-
-        )
+        req.start_send_idx = end_idx
+        if last_chunk:
+            self.disagg_metadata_buffers.set_buf(req)
         page_indices = kv_to_page_indices(kv_indices, page_size)
-
-        page_start_idx = start_idx // page_size
-        page_end_idx = page_start_idx + len(page_indices)
-
         if len(page_indices) == 0:
             logger.info(
                 f"Skip sending kv chunk for request {req.rid=} {req.bootstrap_room=} because page_indices is empty"
             )
             return
-
-        req.disagg_kv_sender.send(
-            page_indices, slice(page_start_idx, page_end_idx), last_chunk
-        )
+        req.disagg_kv_sender.send(page_indices)
sglang/srt/disaggregation/utils.py
CHANGED
@@ -1,10 +1,12 @@
 from __future__ import annotations
 
 import dataclasses
+import os
+import random
 import warnings
 from collections import deque
 from enum import Enum
-from typing import List, Optional
+from typing import TYPE_CHECKING, List, Optional
 
 import numpy as np
 import requests
@@ -13,6 +15,14 @@ import torch.distributed as dist
 
 from sglang.srt.utils import get_ip
 
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import Req
+
+FakeBootstrapHost = "2.2.2.2"
+
+# env var for testing failure, convert to float explicitly
+FAILURE_PROB = float(os.getenv("DISAGGREGATION_TEST_FAILURE_PROB", 0))
+
 
 class DisaggregationMode(Enum):
     NULL = "null"
@@ -20,11 +30,17 @@ class DisaggregationMode(Enum):
     DECODE = "decode"
 
 
-FakeBootstrapHost = "2.2.2.2"
-
-
 def poll_and_all_reduce(pollers, gloo_group):
-    polls = [int(poller.poll()) for poller in pollers]
+    # at a certain prob, the poll is failed to simulate failure
+    if FAILURE_PROB > 0:
+        from sglang.srt.disaggregation.base import KVPoll
+
+        polls = [
+            int(KVPoll.Failed) if random.random() < FAILURE_PROB else int(poller.poll())
+            for poller in pollers
+        ]
+    else:
+        polls = [int(poller.poll()) for poller in pollers]
     tensor_to_reduce = torch.tensor(polls, dtype=torch.uint8, device="cpu")
     dist.all_reduce(tensor_to_reduce, op=dist.ReduceOp.MIN, group=gloo_group)
     return tensor_to_reduce.tolist()
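Note: the new FAILURE_PROB hook lets tests randomly replace poll results with KVPoll.Failed so the abort paths added in prefill.py can be exercised. A hedged usage sketch; the variable must be set before the sglang process imports this module:

    import os

    # make roughly 5% of KV-transfer polls report failure (fault-injection testing only)
    os.environ["DISAGGREGATION_TEST_FAILURE_PROB"] = "0.05"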
@@ -168,3 +184,98 @@ def is_mla_backend(target_kv_pool) -> bool:
     from sglang.srt.mem_cache.memory_pool import MLATokenToKVPool
 
     return isinstance(target_kv_pool, MLATokenToKVPool)
+
+
+def prepare_abort(req: Req, error_message: str, status_code=None):
+    from sglang.srt.managers.schedule_batch import FINISH_ABORT
+
+    # populate finish metadata and stream output
+    req.finished_reason = FINISH_ABORT(error_message, status_code)
+
+    if req.return_logprob:
+        req.input_token_logprobs_val = []
+        req.input_token_logprobs_idx = []
+        req.input_top_logprobs_val = []
+        req.input_top_logprobs_idx = []
+        req.input_token_ids_logprobs_val = []
+        req.input_token_ids_logprobs_idx = []
+
+
+class MetadataBuffers:
+    def __init__(self, size: int, max_top_logprobs_num: int = 128):
+        # TODO: abort top_logprobs_num > 128 in PD
+
+        # We transfer the metadata of first output token to decode
+        # The minimal size for RDMA is 64Bytes, so we pad it to > 64Bytes
+        self.output_ids = torch.zeros((size, 16), dtype=torch.int32, device="cpu")
+        self.output_token_logprobs_val = torch.zeros(
+            (size, 16), dtype=torch.float32, device="cpu"
+        )
+        self.output_token_logprobs_idx = torch.zeros(
+            (size, 16), dtype=torch.int32, device="cpu"
+        )
+        self.output_top_logprobs_val = torch.zeros(
+            (size, max_top_logprobs_num), dtype=torch.float32, device="cpu"
+        )
+        self.output_top_logprobs_idx = torch.zeros(
+            (size, max_top_logprobs_num), dtype=torch.int32, device="cpu"
+        )
+
+    def get_buf_infos(self):
+        ptrs = [
+            self.output_ids.data_ptr(),
+            self.output_token_logprobs_val.data_ptr(),
+            self.output_token_logprobs_idx.data_ptr(),
+            self.output_top_logprobs_val.data_ptr(),
+            self.output_top_logprobs_idx.data_ptr(),
+        ]
+        data_lens = [
+            self.output_ids.nbytes,
+            self.output_token_logprobs_val.nbytes,
+            self.output_token_logprobs_idx.nbytes,
+            self.output_top_logprobs_val.nbytes,
+            self.output_top_logprobs_idx.nbytes,
+        ]
+        item_lens = [
+            self.output_ids[0].nbytes,
+            self.output_token_logprobs_val[0].nbytes,
+            self.output_token_logprobs_idx[0].nbytes,
+            self.output_top_logprobs_val[0].nbytes,
+            self.output_top_logprobs_idx[0].nbytes,
+        ]
+        return ptrs, data_lens, item_lens
+
+    def get_buf(self, idx: int):
+        return (
+            self.output_ids[idx],
+            self.output_token_logprobs_val[idx],
+            self.output_token_logprobs_idx[idx],
+            self.output_top_logprobs_val[idx],
+            self.output_top_logprobs_idx[idx],
+        )
+
+    def set_buf(self, req: Req):
+
+        self.output_ids[req.metadata_buffer_index][0] = req.output_ids[0]
+        if req.return_logprob:
+            if req.output_token_logprobs_val:  # not none or empty list
+                self.output_token_logprobs_val[req.metadata_buffer_index][0] = (
+                    req.output_token_logprobs_val[0]
+                )
+            if req.output_token_logprobs_idx:  # not none or empty list
+                self.output_token_logprobs_idx[req.metadata_buffer_index][0] = (
+                    req.output_token_logprobs_idx[0]
+                )
+
+            if req.output_top_logprobs_val:  # not none or empty list
+                self.output_top_logprobs_val[req.metadata_buffer_index][
+                    : len(req.output_top_logprobs_val[0])
+                ] = torch.tensor(
+                    req.output_top_logprobs_val[0], dtype=torch.float32, device="cpu"
+                )
+            if req.output_top_logprobs_idx:  # not none or empty list
+                self.output_top_logprobs_idx[req.metadata_buffer_index][
+                    : len(req.output_top_logprobs_idx[0])
+                ] = torch.tensor(
+                    req.output_top_logprobs_idx[0], dtype=torch.int32, device="cpu"
+                )
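Note: MetadataBuffers packs the first output token's metadata into five fixed-size CPU tensors, with rows padded to 16 elements because RDMA transfers need at least 64 bytes, and exposes them as parallel pointer/length lists. A short sketch of how a consumer could inspect them, assuming only the class defined above:

    buffers = MetadataBuffers(size=64)  # one row per metadata slot
    ptrs, data_lens, item_lens = buffers.get_buf_infos()
    assert len(ptrs) == len(data_lens) == len(item_lens) == 5  # five parallel tensors
    assert buffers.output_ids[0].nbytes == 16 * 4  # 16 int32 slots meets the 64-byte minimum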
sglang/srt/distributed/utils.py
CHANGED
@@ -127,14 +127,14 @@ class StatelessProcessGroup:
         key = f"send_to/{dst}/{self.send_dst_counter[dst]}"
         self.store.set(key, pickle.dumps(obj))
         self.send_dst_counter[dst] += 1
-        self.entries.append((key, time.time()))
+        self.entries.append((key, time.perf_counter()))
 
     def expire_data(self):
         """Expire data that is older than `data_expiration_seconds` seconds."""
         while self.entries:
             # check the oldest entry
             key, timestamp = self.entries[0]
-            if time.time() - timestamp > self.data_expiration_seconds:
+            if time.perf_counter() - timestamp > self.data_expiration_seconds:
                 self.store.delete_key(key)
                 self.entries.popleft()
             else:
@@ -158,7 +158,7 @@ class StatelessProcessGroup:
         key = f"broadcast_from/{src}/" f"{self.broadcast_send_counter}"
         self.store.set(key, pickle.dumps(obj))
         self.broadcast_send_counter += 1
-        self.entries.append((key, time.time()))
+        self.entries.append((key, time.perf_counter()))
         return obj
     else:
         key = f"broadcast_from/{src}/" f"{self.broadcast_recv_src_counter[src]}"
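Note: these hunks swap time.time() for time.perf_counter() when timestamping store entries. Both measure the interval here, but perf_counter is monotonic, so the expiry check cannot misfire if the system wall clock is adjusted. A tiny illustration:

    import time

    t0 = time.perf_counter()  # monotonic; immune to wall-clock changes
    time.sleep(0.01)
    assert time.perf_counter() - t0 >= 0.01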
sglang/srt/entrypoints/engine.py
CHANGED
@@ -47,6 +47,7 @@ from sglang.srt.managers.io_struct import (
     EmbeddingReqInput,
     GenerateReqInput,
     GetWeightsByNameReqInput,
+    ImageDataItem,
     InitWeightsUpdateGroupReqInput,
     ReleaseMemoryOccupationReqInput,
     ResumeMemoryOccupationReqInput,
@@ -150,9 +151,9 @@ class Engine(EngineBase):
         # See also python/sglang/srt/utils.py:load_image for more details.
         image_data: Optional[
             Union[
-                List[List[
-                List[
-
+                List[List[ImageDataItem]],
+                List[ImageDataItem],
+                ImageDataItem,
             ]
         ] = None,
         return_logprob: Optional[Union[List[bool], bool]] = False,
@@ -221,9 +222,9 @@ class Engine(EngineBase):
         # See also python/sglang/srt/utils.py:load_image for more details.
         image_data: Optional[
             Union[
-                List[List[
-                List[
-
+                List[List[ImageDataItem]],
+                List[ImageDataItem],
+                ImageDataItem,
             ]
         ] = None,
         return_logprob: Optional[Union[List[bool], bool]] = False,
@@ -320,7 +321,26 @@ class Engine(EngineBase):
         loop.run_until_complete(self.tokenizer_manager.start_profile())
 
     def stop_profile(self):
-        self.tokenizer_manager.stop_profile()
+        loop = asyncio.get_event_loop()
+        loop.run_until_complete(self.tokenizer_manager.stop_profile())
+
+    def start_expert_distribution_record(self):
+        loop = asyncio.get_event_loop()
+        loop.run_until_complete(
+            self.tokenizer_manager.start_expert_distribution_record()
+        )
+
+    def stop_expert_distribution_record(self):
+        loop = asyncio.get_event_loop()
+        loop.run_until_complete(
+            self.tokenizer_manager.stop_expert_distribution_record()
+        )
+
+    def dump_expert_distribution_record(self):
+        loop = asyncio.get_event_loop()
+        loop.run_until_complete(
+            self.tokenizer_manager.dump_expert_distribution_record()
+        )
 
     def get_server_info(self):
         loop = asyncio.get_event_loop()
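Note: stop_profile becomes a blocking wrapper around the now-asynchronous tokenizer_manager coroutine, and three new pass-throughs expose expert-distribution recording on the Engine. A hedged usage sketch; the model path is a placeholder and any served MoE model applies:

    import sglang as sgl

    engine = sgl.Engine(model_path="Qwen/Qwen1.5-MoE-A2.7B")  # placeholder MoE model
    engine.start_expert_distribution_record()
    engine.generate("Hello")
    engine.stop_expert_distribution_record()
    engine.dump_expert_distribution_record()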
@@ -486,7 +506,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda:
         assert_pkg_version(
             "sgl-kernel",
-            "0.1.
+            "0.1.4",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )
 
sglang/srt/entrypoints/http_server.py
CHANGED
@@ -47,7 +47,7 @@ from sglang.srt.disaggregation.utils import (
     register_disaggregation_server,
 )
 from sglang.srt.entrypoints.engine import _launch_subprocesses
-from sglang.srt.function_call_parser import FunctionCallParser
+from sglang.srt.function_call.function_call_parser import FunctionCallParser
 from sglang.srt.managers.io_struct import (
     AbortReq,
     CloseSessionReqInput,
@@ -182,13 +182,14 @@ async def health_generate(request: Request) -> Response:
     async for _ in _global_state.tokenizer_manager.generate_request(gri, request):
         break
 
-    tic = time.time()
+    tic = time.perf_counter()
     task = asyncio.create_task(gen())
-    while time.time() < tic + HEALTH_CHECK_TIMEOUT:
+    while time.perf_counter() < tic + HEALTH_CHECK_TIMEOUT:
         await asyncio.sleep(1)
         if _global_state.tokenizer_manager.last_receive_tstamp > tic:
             task.cancel()
             _global_state.tokenizer_manager.rid_to_state.pop(rid, None)
+            _global_state.tokenizer_manager.health_check_failed = False
             return Response(status_code=200)
 
     task.cancel()
@@ -202,6 +203,7 @@ async def health_generate(request: Request) -> Response:
         f"last_heartbeat time: {last_receive_time}"
     )
     _global_state.tokenizer_manager.rid_to_state.pop(rid, None)
+    _global_state.tokenizer_manager.health_check_failed = True
     return Response(status_code=503)
 
 
@@ -353,7 +355,7 @@ async def start_profile_async(obj: Optional[ProfileReqInput] = None):
 @app.api_route("/stop_profile", methods=["GET", "POST"])
 async def stop_profile_async():
     """Stop profiling."""
-    _global_state.tokenizer_manager.stop_profile()
+    await _global_state.tokenizer_manager.stop_profile()
     return Response(
         content="Stop profiling. This will take some time.\n",
         status_code=200,