sglang 0.5.3__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
Files changed (112)
  1. sglang/bench_one_batch.py +0 -2
  2. sglang/bench_serving.py +224 -127
  3. sglang/compile_deep_gemm.py +3 -0
  4. sglang/launch_server.py +0 -14
  5. sglang/srt/configs/__init__.py +2 -0
  6. sglang/srt/configs/falcon_h1.py +12 -58
  7. sglang/srt/configs/mamba_utils.py +117 -0
  8. sglang/srt/configs/model_config.py +68 -31
  9. sglang/srt/configs/nemotron_h.py +286 -0
  10. sglang/srt/configs/qwen3_next.py +11 -43
  11. sglang/srt/disaggregation/decode.py +7 -18
  12. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  13. sglang/srt/disaggregation/nixl/conn.py +55 -23
  14. sglang/srt/disaggregation/prefill.py +17 -32
  15. sglang/srt/entrypoints/engine.py +2 -2
  16. sglang/srt/entrypoints/grpc_request_manager.py +10 -23
  17. sglang/srt/entrypoints/grpc_server.py +220 -80
  18. sglang/srt/entrypoints/http_server.py +49 -1
  19. sglang/srt/entrypoints/openai/protocol.py +159 -31
  20. sglang/srt/entrypoints/openai/serving_chat.py +13 -71
  21. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  22. sglang/srt/environ.py +4 -0
  23. sglang/srt/function_call/function_call_parser.py +8 -6
  24. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  25. sglang/srt/grpc/sglang_scheduler_pb2.pyi +64 -6
  26. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +88 -0
  27. sglang/srt/layers/attention/attention_registry.py +31 -22
  28. sglang/srt/layers/attention/fla/layernorm_gated.py +47 -30
  29. sglang/srt/layers/attention/flashattention_backend.py +0 -1
  30. sglang/srt/layers/attention/flashinfer_backend.py +223 -6
  31. sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -1
  32. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -59
  33. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  34. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -4
  35. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  36. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  37. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  38. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  39. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  40. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  41. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  42. sglang/srt/layers/attention/triton_backend.py +1 -1
  43. sglang/srt/layers/logits_processor.py +136 -6
  44. sglang/srt/layers/modelopt_utils.py +11 -0
  45. sglang/srt/layers/moe/cutlass_w4a8_moe.py +18 -21
  46. sglang/srt/layers/moe/ep_moe/kernels.py +31 -452
  47. sglang/srt/layers/moe/ep_moe/layer.py +8 -286
  48. sglang/srt/layers/moe/fused_moe_triton/layer.py +6 -11
  49. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  50. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  51. sglang/srt/layers/moe/utils.py +7 -1
  52. sglang/srt/layers/quantization/__init__.py +1 -1
  53. sglang/srt/layers/quantization/fp8.py +84 -18
  54. sglang/srt/layers/quantization/modelopt_quant.py +1 -1
  55. sglang/srt/layers/quantization/quark/quark.py +3 -1
  56. sglang/srt/layers/quantization/w4afp8.py +2 -16
  57. sglang/srt/lora/lora_manager.py +0 -8
  58. sglang/srt/managers/overlap_utils.py +18 -16
  59. sglang/srt/managers/schedule_batch.py +119 -90
  60. sglang/srt/managers/schedule_policy.py +1 -1
  61. sglang/srt/managers/scheduler.py +213 -126
  62. sglang/srt/managers/scheduler_metrics_mixin.py +1 -1
  63. sglang/srt/managers/scheduler_output_processor_mixin.py +180 -86
  64. sglang/srt/managers/tokenizer_manager.py +270 -53
  65. sglang/srt/managers/tp_worker.py +39 -28
  66. sglang/srt/mem_cache/allocator.py +7 -2
  67. sglang/srt/mem_cache/chunk_cache.py +1 -1
  68. sglang/srt/mem_cache/memory_pool.py +162 -68
  69. sglang/srt/mem_cache/radix_cache.py +8 -3
  70. sglang/srt/mem_cache/swa_radix_cache.py +70 -14
  71. sglang/srt/model_executor/cuda_graph_runner.py +1 -1
  72. sglang/srt/model_executor/forward_batch_info.py +4 -18
  73. sglang/srt/model_executor/model_runner.py +55 -51
  74. sglang/srt/model_loader/__init__.py +1 -1
  75. sglang/srt/model_loader/loader.py +187 -6
  76. sglang/srt/model_loader/weight_utils.py +3 -0
  77. sglang/srt/models/falcon_h1.py +11 -9
  78. sglang/srt/models/gemma3_mm.py +16 -0
  79. sglang/srt/models/grok.py +5 -13
  80. sglang/srt/models/mixtral.py +1 -3
  81. sglang/srt/models/mllama4.py +11 -1
  82. sglang/srt/models/nemotron_h.py +514 -0
  83. sglang/srt/models/utils.py +5 -1
  84. sglang/srt/sampling/sampling_batch_info.py +11 -9
  85. sglang/srt/server_args.py +100 -33
  86. sglang/srt/speculative/eagle_worker.py +11 -13
  87. sglang/srt/speculative/ngram_worker.py +12 -11
  88. sglang/srt/speculative/spec_utils.py +0 -1
  89. sglang/srt/two_batch_overlap.py +1 -0
  90. sglang/srt/utils/common.py +18 -0
  91. sglang/srt/utils/hf_transformers_utils.py +2 -0
  92. sglang/test/longbench_v2/__init__.py +1 -0
  93. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  94. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  95. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  96. sglang/test/run_eval.py +40 -0
  97. sglang/test/simple_eval_longbench_v2.py +332 -0
  98. sglang/test/test_cutlass_w4a8_moe.py +9 -19
  99. sglang/test/test_deterministic.py +18 -2
  100. sglang/test/test_deterministic_utils.py +81 -0
  101. sglang/test/test_disaggregation_utils.py +63 -0
  102. sglang/test/test_utils.py +32 -11
  103. sglang/version.py +1 -1
  104. {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +4 -4
  105. {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +109 -98
  106. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  107. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  108. sglang/test/test_block_fp8_ep.py +0 -358
  109. /sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +0 -0
  110. {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
  111. {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
  112. {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
sglang/srt/managers/schedule_policy.py
@@ -174,7 +174,7 @@ class SchedulePolicy:
         self.waiting_queue_radix_tree.reset()

         for r in waiting_queue:
-            prefix_ids = r.adjust_max_prefix_ids()
+            prefix_ids = r.origin_input_ids + r.output_ids
             extra_key = r.extra_key

             # NOTE: the prefix_indices must always be aligned with last_node
sglang/srt/managers/scheduler.py
@@ -25,12 +25,14 @@ from concurrent import futures
 from dataclasses import dataclass
 from http import HTTPStatus
 from types import SimpleNamespace
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Deque, Dict, List, Optional, Tuple, Union

 import psutil
 import setproctitle
 import torch
 import zmq
+from torch.cuda import Stream as CudaStream
+from torch.cuda import StreamContext as CudaStreamContext
 from torch.distributed import barrier

 from sglang.global_config import global_config
@@ -112,8 +114,10 @@ from sglang.srt.managers.io_struct import (
     UpdateWeightsFromTensorReqInput,
 )
 from sglang.srt.managers.mm_utils import init_embedding_cache
+from sglang.srt.managers.overlap_utils import FutureIndices, FutureMap
 from sglang.srt.managers.schedule_batch import (
     FINISH_ABORT,
+    ModelWorkerBatch,
     MultimodalInputs,
     Req,
     RequestStage,
@@ -139,15 +143,13 @@ from sglang.srt.managers.scheduler_update_weights_mixin import (
     SchedulerUpdateWeightsMixin,
 )
 from sglang.srt.managers.session_controller import Session
-from sglang.srt.managers.tp_worker import TpModelWorker
-from sglang.srt.managers.tp_worker_overlap_thread import TpModelWorkerClient
 from sglang.srt.managers.utils import validate_input_length
 from sglang.srt.mem_cache.chunk_cache import ChunkCache, SWAChunkCache
 from sglang.srt.mem_cache.hiradix_cache import HiRadixCache
 from sglang.srt.mem_cache.radix_cache import RadixCache
 from sglang.srt.mem_cache.swa_radix_cache import SWARadixCache
 from sglang.srt.model_executor.forward_batch_info import (
-    ForwardBatchOutput,
+    ForwardBatch,
     ForwardMode,
     PPProxyTensors,
 )
@@ -201,40 +203,48 @@ GRAMMAR_TIMEOUT = float(os.environ.get("SGLANG_GRAMMAR_TIMEOUT", 300))

 @dataclass
 class GenerationBatchResult:
-    logits_output: Optional[LogitsProcessorOutput]
-    pp_hidden_states_proxy_tensors: Optional[PPProxyTensors]
-    next_token_ids: Optional[List[int]]
-    can_run_cuda_graph: bool
+    logits_output: Optional[LogitsProcessorOutput] = None
+    pp_hidden_states_proxy_tensors: Optional[PPProxyTensors] = None
+    next_token_ids: Optional[torch.Tensor] = None
+    num_accepted_tokens: Optional[int] = None
+    can_run_cuda_graph: bool = False

     # For output processing
-    extend_input_len_per_req: List[int]
-    extend_logprob_start_len_per_req: List[int]
-
-    @classmethod
-    def from_forward_batch_output(
-        cls,
-        forward_batch_output: ForwardBatchOutput,
-        extend_input_len_per_req: List[int],
-        extend_logprob_start_len_per_req: List[int],
-    ):
-        # TODO(lsyin): remove this workaround logic and try to unify output classes
-
-        return cls(
-            logits_output=forward_batch_output.logits_output,
-            pp_hidden_states_proxy_tensors=forward_batch_output.pp_proxy_tensors,
-            next_token_ids=forward_batch_output.next_token_ids,
-            extend_input_len_per_req=extend_input_len_per_req,
-            extend_logprob_start_len_per_req=extend_logprob_start_len_per_req,
-            can_run_cuda_graph=forward_batch_output.can_run_cuda_graph,
-        )
+    extend_input_len_per_req: Optional[List[int]] = None
+    extend_logprob_start_len_per_req: Optional[List[int]] = None
+
+    # For overlap scheduling
+    copy_done: Optional[torch.cuda.Event] = None
+    delay_sample_launch: bool = False
+    forward_batch: Optional[ForwardBatch] = None
+    future_indices: Optional[FutureIndices] = None
+
+    def copy_to_cpu(self, return_logprob: bool = False):
+        """Copy tensors to CPU in overlap scheduling.
+        Only the tensors which are needed for processing results are copied,
+        e.g., next_token_ids, logits outputs
+        """
+        if return_logprob:
+            if self.logits_output.next_token_logits is not None:
+                self.logits_output.next_token_logits = (
+                    self.logits_output.next_token_logits.to("cpu", non_blocking=True)
+                )
+            if self.logits_output.input_token_logprobs is not None:
+                self.logits_output.input_token_logprobs = (
+                    self.logits_output.input_token_logprobs.to("cpu", non_blocking=True)
+                )
+        if self.logits_output.hidden_states is not None:
+            self.logits_output.hidden_states = self.logits_output.hidden_states.to(
+                "cpu", non_blocking=True
+            )
+        self.next_token_ids = self.next_token_ids.to("cpu", non_blocking=True)
+        self.copy_done.record()

     @classmethod
     def from_pp_proxy(
         cls, logits_output, next_pp_outputs: PPProxyTensors, can_run_cuda_graph
     ):
-        # TODO(lsyin): also simplify this logic
-        # Current PP implementation in scheduler is not compatible with ForwardBatchOutput
-        # Maybe introduce a ProxyBatchOutput for PP and the original ForwardBatchOutput for TP
+        # TODO(lsyin): refactor PP and avoid using dict
         proxy_dict = next_pp_outputs.tensors
         return cls(
             logits_output=logits_output,
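The new copy_to_cpu above uses a standard CUDA pattern: enqueue non-blocking device-to-host copies, record an event on the same stream, and have the consumer synchronize on that event before touching the host tensors. The sketch below illustrates only that pattern; async_copy_to_cpu and its arguments are illustrative names, not part of sglang.

```python
import torch

def async_copy_to_cpu(gpu_tensor: torch.Tensor, copy_done: torch.cuda.Event) -> torch.Tensor:
    # Enqueue the device-to-host copy on the current stream without blocking
    # the Python thread (it is truly asynchronous only with pinned host memory).
    cpu_tensor = gpu_tensor.to("cpu", non_blocking=True)
    # Mark the point after the copy; consumers wait on this event.
    copy_done.record()
    return cpu_tensor

if torch.cuda.is_available():
    next_token_ids = torch.randint(0, 32000, (8,), device="cuda")
    done = torch.cuda.Event()
    host_ids = async_copy_to_cpu(next_token_ids, done)
    done.synchronize()  # host_ids is only safe to read after this
    print(host_ids.tolist())
```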
@@ -263,6 +273,48 @@ class Scheduler(
 ):
     """A scheduler that manages a tensor parallel GPU worker."""

+    def launch_draft_worker(
+        self, gpu_id, tp_rank, moe_ep_rank, server_args, port_args, dp_rank
+    ):
+        if self.spec_algorithm.is_eagle():
+            from sglang.srt.speculative.eagle_worker import EAGLEWorker
+
+            self.draft_worker = EAGLEWorker(
+                gpu_id=gpu_id,
+                tp_rank=tp_rank,
+                moe_ep_rank=moe_ep_rank,
+                server_args=server_args,
+                nccl_port=port_args.nccl_port,
+                target_worker=self.tp_worker,
+                dp_rank=dp_rank,
+            )
+        elif self.spec_algorithm.is_standalone():
+            from sglang.srt.speculative.standalone_worker import StandaloneWorker
+
+            self.draft_worker = StandaloneWorker(
+                gpu_id=gpu_id,
+                tp_rank=tp_rank,
+                moe_ep_rank=moe_ep_rank,
+                server_args=server_args,
+                nccl_port=port_args.nccl_port,
+                target_worker=self.tp_worker,
+                dp_rank=dp_rank,
+            )
+        elif self.spec_algorithm.is_ngram():
+            from sglang.srt.speculative.ngram_worker import NGRAMWorker
+
+            self.draft_worker = NGRAMWorker(
+                gpu_id=gpu_id,
+                tp_rank=tp_rank,
+                moe_ep_rank=moe_ep_rank,
+                server_args=server_args,
+                nccl_port=port_args.nccl_port,
+                target_worker=self.tp_worker,
+                dp_rank=dp_rank,
+            )
+        else:
+            self.draft_worker = None
+
     def __init__(
         self,
         server_args: ServerArgs,
@@ -388,12 +440,10 @@ class Scheduler(
             logger.info("Overlap scheduler is disabled for embedding models.")

         # Launch a tensor parallel worker
-        if self.enable_overlap:
-            TpWorkerClass = TpModelWorkerClient
-        else:
-            TpWorkerClass = TpModelWorker

-        self.tp_worker = TpWorkerClass(
+        from sglang.srt.managers.tp_worker import TpModelWorker
+
+        self.tp_worker = TpModelWorker(
             server_args=server_args,
             gpu_id=gpu_id,
             tp_rank=tp_rank,
@@ -404,44 +454,9 @@ class Scheduler(
         )

         # Launch a draft worker for speculative decoding
-        if self.spec_algorithm.is_eagle():
-            from sglang.srt.speculative.eagle_worker import EAGLEWorker
-
-            self.draft_worker = EAGLEWorker(
-                gpu_id=gpu_id,
-                tp_rank=tp_rank,
-                moe_ep_rank=moe_ep_rank,
-                server_args=server_args,
-                nccl_port=port_args.nccl_port,
-                target_worker=self.tp_worker,
-                dp_rank=dp_rank,
-            )
-        elif self.spec_algorithm.is_standalone():
-            from sglang.srt.speculative.standalone_worker import StandaloneWorker
-
-            self.draft_worker = StandaloneWorker(
-                gpu_id=gpu_id,
-                tp_rank=tp_rank,
-                moe_ep_rank=moe_ep_rank,
-                server_args=server_args,
-                nccl_port=port_args.nccl_port,
-                target_worker=self.tp_worker,
-                dp_rank=dp_rank,
-            )
-        elif self.spec_algorithm.is_ngram():
-            from sglang.srt.speculative.ngram_worker import NGRAMWorker
-
-            self.draft_worker = NGRAMWorker(
-                gpu_id=gpu_id,
-                tp_rank=tp_rank,
-                moe_ep_rank=moe_ep_rank,
-                server_args=server_args,
-                nccl_port=port_args.nccl_port,
-                target_worker=self.tp_worker,
-                dp_rank=dp_rank,
-            )
-        else:
-            self.draft_worker = None
+        self.launch_draft_worker(
+            gpu_id, tp_rank, moe_ep_rank, server_args, port_args, dp_rank
+        )

         # Dispatch the model worker
         if self.spec_algorithm.is_none():
@@ -464,8 +479,8 @@ class Scheduler(
             _,
             _,
         ) = self.tp_worker.get_worker_info()
-        if global_server_args_dict["max_micro_batch_size"] is None:
-            global_server_args_dict["max_micro_batch_size"] = max(
+        if global_server_args_dict["pp_max_micro_batch_size"] is None:
+            global_server_args_dict["pp_max_micro_batch_size"] = max(
                 self.max_running_requests // server_args.pp_size, 1
             )

@@ -525,9 +540,11 @@ class Scheduler(
         self.kv_transfer_speed_gb_s: float = 0.0
         self.kv_transfer_latency_ms: float = 0.0
         self.sessions: Dict[str, Session] = {}
-        self.current_stream = torch.get_device_module(self.device).current_stream()
+        self.default_stream: CudaStream = torch.get_device_module(
+            self.device
+        ).current_stream()
         if self.device == "cpu":
-            self.current_stream.synchronize = lambda: None  # No-op for CPU
+            self.default_stream.synchronize = lambda: None  # No-op for CPU
         self.forward_sleep_time = None

         # Init chunked prefill
@@ -618,6 +635,9 @@ class Scheduler(
         # Init prefill kv split size when deterministic inference is enabled with various attention backends
         self.init_deterministic_inference_config()

+        # Init overlap
+        self.init_overlap()
+
         # Init request dispatcher
         self._request_dispatcher = TypeBasedDispatcher(
             [
@@ -777,6 +797,7 @@ class Scheduler(
                 sliding_window_size=self.sliding_window_size,
                 page_size=self.page_size,
                 disable=server_args.disable_radix_cache,
+                is_eagle=self.spec_algorithm.is_eagle(),
             )
         elif server_args.enable_lmcache:
             from sglang.srt.mem_cache.storage.lmcache.lmc_radix_cache import (
@@ -931,6 +952,32 @@ class Scheduler(
         # The prefill requests that are in the middle of kv sending
         self.disagg_prefill_inflight_queue: List[Req] = []

+    def init_overlap(self):
+        if not self.enable_overlap:
+            return
+
+        self.forward_stream: CudaStream = torch.get_device_module(self.device).Stream()
+        self.forward_stream_ctx: CudaStreamContext = torch.get_device_module(
+            self.device
+        ).stream(self.forward_stream)
+        self.copy_stream: CudaStream = torch.get_device_module(self.device).Stream()
+        self.copy_stream_ctx: CudaStreamContext = torch.get_device_module(
+            self.device
+        ).stream(self.copy_stream)
+
+        self.future_map = FutureMap(self.max_running_requests, self.device)
+        self.batch_record_buf = [None] * 2
+        self.batch_record_ct = 0
+
+    def record_batch_in_overlap(self, model_worker_batch: ModelWorkerBatch):
+        # FIXME(lsyin): hacky way to keep a reference to avoid GPU tensors being freed by torch GC
+        # NOTE: More Reliable: record all tensors into the forward stream
+        # NOTE: - for all future tensors, we shall always read from future map
+        #       - for all non-future tensors (produced only by schedule stream),
+        #         we shall keep its reference not being release during all the forwarding pass
+        self.batch_record_ct = (self.batch_record_ct + 1) % 2
+        self.batch_record_buf[self.batch_record_ct] = model_worker_batch
+
     def init_moe_config(self):
         if hasattr(self.model_config.hf_config, "num_experts_per_tok"):
             initialize_moe_config(self.server_args)
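The FutureMap used above (imported from sglang.srt.managers.overlap_utils in an earlier hunk) lets the scheduler build the next batch before the previous batch's tokens exist: it hands out slot indices, the schedule batch stores their negation as placeholder token ids, and the forward stream later fills the slots and resolves the placeholders. The toy class below only illustrates that idea under simplifying assumptions (no ring buffer, no slot reuse); it is not the real implementation.

```python
import torch

class ToyFutureMap:
    """Placeholder-token bookkeeping, greatly simplified."""

    def __init__(self, max_slots: int, device: str = "cpu"):
        # Slot 0 is unused so that -index is always a valid non-zero placeholder.
        self.token_buf = torch.zeros(max_slots + 1, dtype=torch.int64, device=device)
        self.next_slot = 1

    def alloc_future_indices(self, bs: int) -> torch.Tensor:
        indices = torch.arange(
            self.next_slot, self.next_slot + bs, device=self.token_buf.device
        )
        self.next_slot += bs
        return indices

    def store_to_map(self, indices: torch.Tensor, next_token_ids: torch.Tensor):
        # Called on the forward stream once sampling has produced real token ids.
        self.token_buf[indices] = next_token_ids

    def resolve(self, maybe_future_ids: torch.Tensor) -> torch.Tensor:
        # Negative entries are placeholders (-slot); swap them for the real ids.
        out = maybe_future_ids.clone()
        mask = out < 0
        out[mask] = self.token_buf[-out[mask]]
        return out

fm = ToyFutureMap(max_slots=16)
idx = fm.alloc_future_indices(4)          # e.g. tensor([1, 2, 3, 4])
placeholders = -idx                        # what the schedule batch carries
fm.store_to_map(idx, torch.tensor([11, 22, 33, 44]))
print(fm.resolve(placeholders).tolist())   # [11, 22, 33, 44]
```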
@@ -957,9 +1004,11 @@ class Scheduler(
     @DynamicGradMode()
     def event_loop_overlap(self):
         """A scheduler loop that overlaps the CPU processing and GPU computation."""
-        self.result_queue = deque()
+        self.result_queue: Deque[Tuple[ScheduleBatch, GenerationBatchResult]] = deque()

         while True:
+            self.launch_last_batch_sample_if_needed()
+
             recv_reqs = self.recv_requests()
             self.process_input_requests(recv_reqs)

@@ -967,30 +1016,13 @@
             self.cur_batch = batch

             if batch:
-                batch.launch_done = threading.Event()
                 result = self.run_batch(batch)
                 self.result_queue.append((batch.copy(), result))

-                if self.last_batch is None:
-                    # Create a dummy first batch to start the pipeline for overlap schedule.
-                    # It is now used for triggering the sampling_info_done event.
-                    tmp_batch = ScheduleBatch(
-                        reqs=None,
-                        forward_mode=ForwardMode.DUMMY_FIRST,
-                        next_batch_sampling_info=self.tp_worker.cur_sampling_info,
-                    )
-                    self.process_batch_result(tmp_batch, None, batch.launch_done)
-
             if self.last_batch:
                 # Process the results of the last batch
                 tmp_batch, tmp_result = self.result_queue.popleft()
-                tmp_batch.next_batch_sampling_info = (
-                    self.tp_worker.cur_sampling_info if batch else None
-                )
-                # NOTE: we should use current launched batch's launch_done event Instead of the last batch's
-                self.process_batch_result(
-                    tmp_batch, tmp_result, batch.launch_done if batch else None
-                )
+                self.process_batch_result(tmp_batch, tmp_result)
             elif batch is None:
                 # When the server is idle, do self-check and re-init some states
                 self.self_check_during_idle()
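The reworked loop keeps a one-entry pipeline: each iteration launches the next batch and only then processes the result of the previous one, so CPU-side output processing overlaps with GPU execution. A schematic of that control flow, with run_batch_async and process_result as stand-in callables rather than scheduler APIs:

```python
from collections import deque

def overlap_loop(batches, run_batch_async, process_result):
    # run_batch_async(batch) must only enqueue GPU work and return a handle;
    # process_result(batch, handle) does the CPU-side bookkeeping.
    result_queue = deque()
    last_batch = None
    for batch in batches:
        result_queue.append((batch, run_batch_async(batch)))  # launch batch i+1
        if last_batch is not None:
            prev_batch, prev_handle = result_queue.popleft()
            process_result(prev_batch, prev_handle)           # overlaps with GPU work
        last_batch = batch
    while result_queue:                                        # drain the tail
        prev_batch, prev_handle = result_queue.popleft()
        process_result(prev_batch, prev_handle)
```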
@@ -1745,7 +1777,7 @@ class Scheduler(
             chunked_req_to_exclude.add(self.chunked_req)
             self.tree_cache.cache_unfinished_req(self.chunked_req, chunked=True)
             # chunked request keeps its rid but will get a new req_pool_idx
-            if self.tp_worker.worker.model_runner.is_hybrid_gdn:
+            if self.tp_worker.worker.model_runner.mambaish_config is not None:
                 self.req_to_token_pool.free(
                     self.chunked_req.req_pool_idx, free_mamba_cache=False
                 )
@@ -1802,7 +1834,7 @@ class Scheduler(
         return ret

     def get_num_allocatable_reqs(self, running_bs):
-        res = global_server_args_dict["max_micro_batch_size"] - running_bs
+        res = global_server_args_dict["pp_max_micro_batch_size"] - running_bs
         if self.pp_size > 1:
             res = min(res, self.req_to_token_pool.available_size())
         return res
@@ -2055,18 +2087,59 @@ class Scheduler(
             # FIXME(lsyin): remove this if and finally unify the abstraction
             batch_or_worker_batch = batch.get_model_worker_batch()

-            forward_batch_output = self.model_worker.forward_batch_generation(
-                batch_or_worker_batch
-            )
+            if self.enable_overlap:
+                # FIXME: remove this assert
+                assert isinstance(batch_or_worker_batch, ModelWorkerBatch)
+                model_worker_batch = batch_or_worker_batch
+                self.record_batch_in_overlap(model_worker_batch)
+
+                # Sampling info will be modified during forward
+                model_worker_batch.sampling_info = (
+                    model_worker_batch.sampling_info.copy_for_forward()
+                )
+
+                bs = len(model_worker_batch.seq_lens)
+                future_indices = self.future_map.alloc_future_indices(bs)
+
+                with self.forward_stream_ctx:
+                    self.forward_stream.wait_stream(self.default_stream)
+                    self.future_map.resolve_future(model_worker_batch)
+                    if batch.sampling_info.grammars is not None:
+                        model_worker_batch.delay_sample_launch = True
+                    batch_result = self.model_worker.forward_batch_generation(
+                        batch_or_worker_batch
+                    )
+                    # FIXME(lsyin): maybe move this to forward_batch_generation
+                    batch_result.copy_done = torch.get_device_module(
+                        self.device
+                    ).Event()
+                    if not model_worker_batch.delay_sample_launch:
+                        self.future_map.store_to_map(
+                            future_indices, batch_result.next_token_ids
+                        )
+                        batch_result.copy_to_cpu()
+                    else:
+                        batch_result.future_indices = future_indices
+
+                # FIXME(lsyin): move this assignment elsewhere
+                maybe_future_next_token_ids = -future_indices.indices
+            else:
+                batch_result = self.model_worker.forward_batch_generation(
+                    batch_or_worker_batch
+                )
+                maybe_future_next_token_ids = batch_result.next_token_ids

             if not self.spec_algorithm.is_none():
                 # TODO(lsyin): unify this metric-updating logic with non-spec, and move it to decode processing
-                self.udpate_spec_metrics(
-                    batch.batch_size(), forward_batch_output.num_accepted_tokens
+                self.update_spec_metrics(
+                    batch.batch_size(), batch_result.num_accepted_tokens
                 )

-            # update batch's output ids
-            batch.output_ids = forward_batch_output.next_token_ids
+            # NOTE: maybe_future_next_token_ids is used in ScheduleBatch,
+            # which can probably be replaced by future_indices later [TODO(lsyin)].
+            # we shall still keep the original outputs, e.g. next_token_ids
+            # in the GenerationBatchOutput for processing after copy_done.
+            batch.output_ids = maybe_future_next_token_ids

             # These 2 values are needed for processing the output, but the values can be
             # modified by overlap schedule. So we have to copy them here so that
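In the overlap branch above, the forward pass is enqueued on a dedicated forward stream that first waits on the default (schedule) stream, so GPU ordering is preserved while the Python thread returns immediately. A minimal version of that launch pattern, assuming fn only enqueues CUDA work:

```python
import torch

def launch_on_side_stream(forward_stream: torch.cuda.Stream, fn, *args):
    # Order the side stream after everything already queued on the current
    # (default) stream, run fn's kernels there, and return an event that
    # marks their completion; the caller can later synchronize on it.
    forward_stream.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(forward_stream):
        out = fn(*args)
        done = torch.cuda.Event()
        done.record()  # recorded on forward_stream
    return out, done
```

The scheduler's version additionally resolves future token ids from the FutureMap and, when grammar constraints are present, defers sampling before recording copy_done.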
@@ -2083,39 +2156,60 @@ class Scheduler(
             else:
                 extend_logprob_start_len_per_req = None

-            return GenerationBatchResult.from_forward_batch_output(
-                forward_batch_output=forward_batch_output,
-                extend_input_len_per_req=extend_input_len_per_req,
-                extend_logprob_start_len_per_req=extend_logprob_start_len_per_req,
+            batch_result.extend_input_len_per_req = extend_input_len_per_req
+            batch_result.extend_logprob_start_len_per_req = (
+                extend_logprob_start_len_per_req
             )
+            return batch_result
         else:  # embedding or reward model
             model_worker_batch = batch.get_model_worker_batch()
             embeddings = self.tp_worker.forward_batch_embedding(model_worker_batch)
             ret = EmbeddingBatchResult(embeddings=embeddings)
             return ret

+    def launch_last_batch_sample_if_needed(
+        self,
+    ) -> Union[GenerationBatchResult, EmbeddingBatchResult]:
+        if len(self.result_queue) == 0:
+            return
+
+        tmp_batch, tmp_result = self.result_queue.popleft()
+
+        tmp_result: GenerationBatchResult
+        if not tmp_result.delay_sample_launch:
+            self.result_queue.appendleft((tmp_batch, tmp_result))
+            return
+
+        with self.forward_stream_ctx:
+            self.forward_stream.wait_stream(self.default_stream)
+            tmp_result.next_token_ids = self.model_worker.model_runner.sample(
+                tmp_result.logits_output,
+                tmp_result.forward_batch,
+            )
+            future_indices = tmp_result.future_indices
+            self.future_map.store_to_map(future_indices, tmp_result.next_token_ids)
+            tmp_result.copy_to_cpu()
+        self.result_queue.appendleft((tmp_batch, tmp_result))
+
     def process_batch_result(
         self,
         batch: ScheduleBatch,
         result: Union[GenerationBatchResult, EmbeddingBatchResult],
-        launch_done: Optional[threading.Event] = None,
     ):
         if batch.forward_mode.is_decode():
-            self.process_batch_result_decode(batch, result, launch_done)
+            self.process_batch_result_decode(batch, result)
             if self.enable_trace:
                 trace_slice_batch("decode loop", batch.reqs)

         elif batch.forward_mode.is_extend():
-            self.process_batch_result_prefill(batch, result, launch_done)
+            self.process_batch_result_prefill(batch, result)
             if self.enable_trace:
                 trace_slice_batch("prefill", batch.reqs)

         elif batch.forward_mode.is_idle():
             if self.enable_overlap:
-                self.tp_worker.resolve_last_batch_result(launch_done)
-                self.set_next_batch_sampling_info_done(batch)
-            elif batch.forward_mode.is_dummy_first():
-                self.set_next_batch_sampling_info_done(batch)
+                if result.copy_done is not None:
+                    result.copy_done.synchronize()

         self.maybe_send_health_check_signal()

@@ -2325,13 +2419,6 @@ class Scheduler(
                 self._add_request_to_queue(req)
         self.grammar_queue = self.grammar_queue[num_ready_reqs:]

-    def set_next_batch_sampling_info_done(self, batch: ScheduleBatch):
-        if batch.next_batch_sampling_info:
-            if batch.next_batch_sampling_info.grammars is not None:
-                batch.next_batch_sampling_info.update_regex_vocab_mask()
-                self.current_stream.synchronize()
-            batch.next_batch_sampling_info.sampling_info_done.set()
-
     def watchdog_thread(self):
         """A watch dog thread that will try to kill the server itself if one forward batch takes too long."""
         self.watchdog_last_forward_ct = 0
@@ -2510,7 +2597,7 @@ class Scheduler(
         server_args_dict = recv_req.server_args
         args_allow_update = set(
             [
-                "max_micro_batch_size",
+                "pp_max_micro_batch_size",
                 "speculative_accept_threshold_single",
                 "speculative_accept_threshold_acc",
             ]
@@ -2521,7 +2608,7 @@ class Scheduler(
                 logging.warning(f"Updating {k} is not supported.")
                 if_success = False
                 break
-            elif k == "max_micro_batch_size" and (
+            elif k == "pp_max_micro_batch_size" and (
                 v > self.max_running_requests // self.pp_size or v < 1
             ):
                 logging.warning(
sglang/srt/managers/scheduler_metrics_mixin.py
@@ -69,7 +69,7 @@ class SchedulerMetricsMixin:
             kv_events_config, self.attn_dp_rank
         )

-    def udpate_spec_metrics(self, bs: int, num_accepted_tokens: int):
+    def update_spec_metrics(self, bs: int, num_accepted_tokens: int):
         self.spec_num_total_accepted_tokens += num_accepted_tokens + bs
         self.spec_num_total_forward_ct += bs
         self.num_generated_tokens += num_accepted_tokens