sglang 0.4.5.post3__py3-none-any.whl → 0.4.6.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. sglang/bench_one_batch.py +19 -3
  2. sglang/bench_serving.py +8 -9
  3. sglang/compile_deep_gemm.py +45 -4
  4. sglang/srt/code_completion_parser.py +1 -1
  5. sglang/srt/configs/deepseekvl2.py +1 -1
  6. sglang/srt/configs/model_config.py +9 -3
  7. sglang/srt/constrained/llguidance_backend.py +78 -61
  8. sglang/srt/conversation.py +34 -1
  9. sglang/srt/disaggregation/decode.py +67 -13
  10. sglang/srt/disaggregation/fake/__init__.py +1 -0
  11. sglang/srt/disaggregation/fake/conn.py +88 -0
  12. sglang/srt/disaggregation/mini_lb.py +45 -8
  13. sglang/srt/disaggregation/mooncake/conn.py +198 -31
  14. sglang/srt/disaggregation/prefill.py +36 -12
  15. sglang/srt/disaggregation/utils.py +16 -2
  16. sglang/srt/entrypoints/engine.py +9 -0
  17. sglang/srt/entrypoints/http_server.py +35 -4
  18. sglang/srt/function_call_parser.py +77 -5
  19. sglang/srt/layers/attention/base_attn_backend.py +3 -0
  20. sglang/srt/layers/attention/cutlass_mla_backend.py +278 -0
  21. sglang/srt/layers/attention/flashattention_backend.py +28 -10
  22. sglang/srt/layers/attention/flashmla_backend.py +8 -11
  23. sglang/srt/layers/attention/utils.py +1 -1
  24. sglang/srt/layers/attention/vision.py +2 -0
  25. sglang/srt/layers/layernorm.py +38 -16
  26. sglang/srt/layers/logits_processor.py +2 -2
  27. sglang/srt/layers/moe/fused_moe_native.py +2 -4
  28. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  29. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  30. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  31. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  32. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  33. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  34. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  35. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  36. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  37. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  38. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  39. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  40. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +41 -41
  41. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  42. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +20 -17
  43. sglang/srt/layers/moe/fused_moe_triton/layer.py +15 -17
  44. sglang/srt/layers/pooler.py +6 -0
  45. sglang/srt/layers/quantization/awq.py +5 -1
  46. sglang/srt/layers/quantization/deep_gemm.py +17 -10
  47. sglang/srt/layers/quantization/fp8.py +20 -22
  48. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  49. sglang/srt/layers/quantization/int8_kernel.py +32 -1
  50. sglang/srt/layers/radix_attention.py +13 -3
  51. sglang/srt/layers/rotary_embedding.py +170 -126
  52. sglang/srt/managers/data_parallel_controller.py +10 -3
  53. sglang/srt/managers/io_struct.py +7 -0
  54. sglang/srt/managers/mm_utils.py +85 -28
  55. sglang/srt/managers/multimodal_processors/base_processor.py +14 -1
  56. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +9 -2
  57. sglang/srt/managers/multimodal_processors/gemma3.py +2 -5
  58. sglang/srt/managers/multimodal_processors/janus_pro.py +2 -2
  59. sglang/srt/managers/multimodal_processors/minicpm.py +4 -3
  60. sglang/srt/managers/multimodal_processors/qwen_vl.py +38 -13
  61. sglang/srt/managers/schedule_batch.py +38 -12
  62. sglang/srt/managers/scheduler.py +41 -28
  63. sglang/srt/managers/scheduler_output_processor_mixin.py +25 -9
  64. sglang/srt/managers/tokenizer_manager.py +5 -1
  65. sglang/srt/managers/tp_worker.py +3 -3
  66. sglang/srt/managers/tp_worker_overlap_thread.py +9 -4
  67. sglang/srt/mem_cache/memory_pool.py +87 -0
  68. sglang/srt/model_executor/cuda_graph_runner.py +4 -3
  69. sglang/srt/model_executor/forward_batch_info.py +51 -95
  70. sglang/srt/model_executor/model_runner.py +19 -25
  71. sglang/srt/models/deepseek.py +12 -2
  72. sglang/srt/models/deepseek_nextn.py +101 -6
  73. sglang/srt/models/deepseek_v2.py +144 -70
  74. sglang/srt/models/deepseek_vl2.py +9 -4
  75. sglang/srt/models/gemma3_causal.py +1 -1
  76. sglang/srt/models/llama4.py +0 -1
  77. sglang/srt/models/minicpmo.py +5 -1
  78. sglang/srt/models/mllama4.py +2 -2
  79. sglang/srt/models/qwen2_5_vl.py +3 -6
  80. sglang/srt/models/qwen2_vl.py +3 -7
  81. sglang/srt/models/roberta.py +178 -0
  82. sglang/srt/openai_api/adapter.py +50 -11
  83. sglang/srt/openai_api/protocol.py +2 -0
  84. sglang/srt/reasoning_parser.py +25 -1
  85. sglang/srt/server_args.py +31 -24
  86. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
  87. sglang/srt/torch_memory_saver_adapter.py +10 -1
  88. sglang/srt/utils.py +5 -1
  89. sglang/test/runners.py +6 -13
  90. sglang/test/send_one.py +84 -28
  91. sglang/test/test_utils.py +74 -18
  92. sglang/version.py +1 -1
  93. {sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/METADATA +5 -6
  94. {sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/RECORD +97 -80
  95. {sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/WHEEL +1 -1
  96. {sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/licenses/LICENSE +0 -0
  97. {sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/top_level.txt +0 -0
@@ -35,6 +35,7 @@ ScheduleBatch -> ModelWorkerBatch -> ForwardBatch
  import copy
  import dataclasses
  import logging
+ import threading
  from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union

  import numpy as np
@@ -285,6 +286,7 @@ class MultimodalInputs:
  num_image_tokens: Optional[int] = None

  # QWen2-VL related
+ mrope_positions: Optional[torch.Tensor] = None
  mrope_position_delta: Optional[torch.Tensor] = None

  # image
@@ -310,16 +312,12 @@ class MultimodalInputs:
  assert isinstance(ret.mm_items, list)
  ret.mm_items = [item for item in ret.mm_items if item.is_valid()]

- assert len(ret.mm_items) != 0
-
- # Use image hash as fake token_ids. We use this as the key for prefix matching in the radix cache.
- # Please note that if the `input_ids` is later used in the model forward,
- # you also need to clamp the values within the range of [0, vocab_size) to avoid out-of-bound
- # errors in cuda kernels. See also llava.py for example.
  for item in ret.mm_items:
      item.set_pad_value()

  optional_args = [
+     "mrope_positions",
+     "mrope_position_delta",
      "im_token_id",
      "im_start_id",
      "im_end_id",
@@ -350,11 +348,6 @@ class MultimodalInputs:
  merge image inputs when requests are being merged
  """

- # Use image hash as fake token_ids. We use this as the key for prefix matching in the radix cache.
- # Please note that if the `input_ids` is later used in the model forward,
- # you also need to clamp the values within the range of [0, vocab_size) to avoid out-of-bound
- # errors in cuda kernels. See also llava.py for example.
-
  # args needed to be merged
  optional_args = [
      "mm_items",
@@ -364,6 +357,30 @@ class MultimodalInputs:
  self_arg = getattr(self, arg, None)
  if self_arg is not None:
      setattr(self, arg, self_arg + getattr(other, arg))
+
+ mrope_positions = self.mrope_positions
+ if mrope_positions is not None:
+     if other.mrope_positions is None:
+         self.mrope_positions = mrope_positions
+     else:
+         self.mrope_positions = torch.cat(
+             [self.mrope_positions, other.mrope_positions], dim=1
+         )
+
+ mrope_position_delta = self.mrope_position_delta
+ if mrope_position_delta is not None:
+     if other.mrope_position_delta is None:
+         self.mrope_position_delta = mrope_position_delta
+     else:
+         self.mrope_position_delta = torch.cat(
+             [self.mrope_position_delta, other.mrope_position_delta], dim=0
+         )
+
+ for key, val in other.__dict__.items():
+     if "_id" in key:
+         # set token_ids
+         if getattr(self, key, None) is None:
+             setattr(self, key, getattr(other, key, None))
  # other args would be kept intact


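For illustration only (not part of the package diff): a minimal sketch of the merge semantics added above, assuming the Qwen2-VL convention that mrope_positions has shape [3, seq_len] (temporal/height/width rows, hence concatenation along dim=1) while per-request mrope_position_delta entries stack along dim=0.

    import torch

    # Hypothetical standalone illustration of merging the mrope fields of two requests.
    a_pos = torch.arange(12).reshape(3, 4)   # request A: 4 tokens, 3 mrope rows
    b_pos = torch.arange(6).reshape(3, 2)    # request B: 2 tokens
    merged_pos = torch.cat([a_pos, b_pos], dim=1)        # shape [3, 6]

    a_delta = torch.tensor([[0]])
    b_delta = torch.tensor([[3]])
    merged_delta = torch.cat([a_delta, b_delta], dim=0)  # shape [2, 1]

    print(merged_pos.shape, merged_delta.shape)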
@@ -388,6 +405,7 @@ class Req:
  return_hidden_states: bool = False,
  eos_token_ids: Optional[Set[int]] = None,
  bootstrap_host: Optional[str] = None,
+ bootstrap_port: Optional[int] = None,
  bootstrap_room: Optional[int] = None,
  ):
  # Input and output info
@@ -523,6 +541,7 @@

  # For disaggregation
  self.bootstrap_host: str = bootstrap_host
+ self.bootstrap_port: Optional[int] = bootstrap_port
  self.bootstrap_room: Optional[int] = bootstrap_room
  self.disagg_kv_sender: Optional[BaseKVSender] = None

@@ -706,6 +725,9 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
  # This is an optimization to reduce the overhead of the prefill check.
  batch_is_full: bool = False

+ # Events
+ launch_done: Optional[threading.Event] = None
+
  # Sampling info
  sampling_info: SamplingBatchInfo = None
  next_batch_sampling_info: SamplingBatchInfo = None
@@ -1450,7 +1472,6 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
  if self.model_config.is_encoder_decoder:
      self.encoder_lens = torch.cat([self.encoder_lens, other.encoder_lens])
      self.encoder_lens_cpu.extend(other.encoder_lens_cpu)
-
  self.req_pool_indices = torch.cat(
      [self.req_pool_indices, other.req_pool_indices]
  )
@@ -1494,6 +1515,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
  )
  or global_server_args_dict["attention_backend"] == "flashmla"
  or global_server_args_dict["attention_backend"] == "fa3"
+ or global_server_args_dict["attention_backend"] == "cutlass_mla"
  ):
      seq_lens_cpu = self.seq_lens.cpu()
  else:
@@ -1548,6 +1570,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
  )
  ),
  extend_input_logprob_token_ids=self.extend_input_logprob_token_ids,
+ launch_done=self.launch_done,
  )

  def copy(self):
@@ -1630,6 +1653,9 @@ class ModelWorkerBatch:
  # If set, the output of the batch contains the hidden states of the run.
  capture_hidden_mode: CaptureHiddenMode = None

+ # Overlap event
+ launch_done: Optional[threading.Event] = None
+

  @triton.jit
  def write_req_to_token_pool_triton(
@@ -248,9 +248,6 @@ class Scheduler(
  if not self.is_generation:
      self.enable_overlap = False
      logger.info("Overlap scheduler is disabled for embedding models.")
- if self.model_config.is_multimodal:
-     self.enable_overlap = False
-     logger.info("Overlap scheduler is disabled for multimodal models.")

  # Launch a tensor parallel worker
  if self.enable_overlap:
@@ -578,6 +575,10 @@
  bootstrap_port=self.server_args.disaggregation_bootstrap_port,
  transfer_backend=self.transfer_backend,
  )
+
+ # Metric for pre-allocation
+ self.num_tokens_pre_allocated = 0
+
  elif self.disaggregation_mode == DisaggregationMode.PREFILL:
      # *2 for the headroom.
      buffer_size = self.max_running_requests * 2
@@ -593,7 +594,7 @@
  )
  metadata_buffers = [output_id_buffer]

- self.disagg_prefill_pending_queue = PrefillBootstrapQueue(
+ self.disagg_prefill_bootstrap_queue = PrefillBootstrapQueue(
      token_to_kv_pool=self.token_to_kv_pool_allocator.get_kvcache(),
      req_to_metadata_buffer_idx_allocator=req_to_metadata_buffer_idx_allocator,
      metadata_buffers=metadata_buffers,
@@ -641,6 +642,7 @@
  self.cur_batch = batch

  if batch:
+     batch.launch_done = threading.Event()
      result = self.run_batch(batch)
      self.result_queue.append((batch.copy(), result))

@@ -652,7 +654,7 @@
  forward_mode=ForwardMode.DUMMY_FIRST,
  next_batch_sampling_info=self.tp_worker.cur_sampling_info,
  )
- self.process_batch_result(tmp_batch, None)
+ self.process_batch_result(tmp_batch, None, batch.launch_done)

  if self.last_batch:
      # Process the results of the last batch
@@ -660,7 +662,10 @@
  tmp_batch.next_batch_sampling_info = (
      self.tp_worker.cur_sampling_info if batch else None
  )
- self.process_batch_result(tmp_batch, tmp_result)
+ # NOTE: we should use current launched batch's launch_done event Instead of the last batch's
+ self.process_batch_result(
+     tmp_batch, tmp_result, batch.launch_done if batch else None
+ )
  elif batch is None:
      # When the server is idle, do self-check and re-init some states
      self.check_memory()
@@ -787,6 +792,7 @@
  return_hidden_states=recv_req.return_hidden_states,
  eos_token_ids=self.model_config.hf_eos_token_id,
  bootstrap_host=recv_req.bootstrap_host,
+ bootstrap_port=recv_req.bootstrap_port,
  bootstrap_room=recv_req.bootstrap_room,
  )
  req.tokenizer = self.tokenizer
@@ -901,7 +907,7 @@
  def _add_request_to_queue(self, req: Req):
      req.queue_time_start = time.time()
      if self.disaggregation_mode == DisaggregationMode.PREFILL:
-         self.disagg_prefill_pending_queue.add(req)
+         self.disagg_prefill_bootstrap_queue.add(req)
      elif self.disaggregation_mode == DisaggregationMode.DECODE:
          self.disagg_decode_prealloc_queue.add(req)
      else:
@@ -991,8 +997,15 @@
  f"#cached-token: {adder.log_hit_tokens}, "
  f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
  f"#running-req: {running_bs}, "
- f"#queue-req: {len(self.waiting_queue)}, "
  )
+
+ if self.disaggregation_mode == DisaggregationMode.PREFILL:
+     f += f"#unbootstrapped-req: {len(self.disagg_prefill_bootstrap_queue.queue)}, "
+     f += f"#queue-req: {len(self.waiting_queue)}, "
+     f += f"#transferring-req: {len(self.disagg_prefill_inflight_queue)} "
+ else:
+     f += f"#queue-req: {len(self.waiting_queue)}"
+
  logger.info(f)

  if self.enable_metrics:
@@ -1028,15 +1041,14 @@
  gap_latency / self.server_args.decode_log_interval
  )

+ msg = (
+     f"Decode batch. "
+     f"#running-req: {num_running_reqs}, "
+     f"#token: {num_used}, "
+     f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
+ )
+
  if self.spec_algorithm.is_none():
-     msg = (
-         f"Decode batch. "
-         f"#running-req: {num_running_reqs}, "
-         f"#token: {num_used}, "
-         f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
-         f"gen throughput (token/s): {self.last_gen_throughput:.2f}, "
-         f"#queue-req: {len(self.waiting_queue)}, "
-     )
      spec_accept_length = 0
  else:
      spec_accept_length = (
@@ -1045,15 +1057,15 @@
  self.cum_spec_accept_length += self.spec_num_total_accepted_tokens
  self.cum_spec_accept_count += self.spec_num_total_forward_ct
  self.spec_num_total_accepted_tokens = self.spec_num_total_forward_ct = 0
- msg = (
-     f"Decode batch. "
-     f"#running-req: {num_running_reqs}, "
-     f"#token: {num_used}, "
-     f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
-     f"accept len: {spec_accept_length:.2f}, "
-     f"gen throughput (token/s): {self.last_gen_throughput:.2f}, "
-     f"#queue-req: {len(self.waiting_queue)}, "
- )
+ msg += f"accept len: {spec_accept_length:.2f}, "
+
+ if self.disaggregation_mode == DisaggregationMode.DECODE:
+     msg += f"pre-allocated usage: {self.num_tokens_pre_allocated / self.max_total_num_tokens:.2f}, "
+
+ msg += (
+     f"gen throughput (token/s): {self.last_gen_throughput:.2f}, "
+     f"#queue-req: {len(self.waiting_queue)}"
+ )

  logger.info(msg)
  if self.enable_metrics:
@@ -1406,14 +1418,15 @@
  self,
  batch: ScheduleBatch,
  result: Union[GenerationBatchResult, EmbeddingBatchResult],
+ launch_done: Optional[threading.Event] = None,
  ):
  if batch.forward_mode.is_decode():
-     self.process_batch_result_decode(batch, result)
+     self.process_batch_result_decode(batch, result, launch_done)
  elif batch.forward_mode.is_extend():
-     self.process_batch_result_prefill(batch, result)
+     self.process_batch_result_prefill(batch, result, launch_done)
  elif batch.forward_mode.is_idle():
      if self.enable_overlap:
-         self.tp_worker.resolve_batch_result(result.bid)
+         self.tp_worker.resolve_last_batch_result(launch_done)
          if batch.next_batch_sampling_info:
              batch.next_batch_sampling_info.update_regex_vocab_mask()
              self.current_stream.synchronize()
@@ -1,5 +1,6 @@
  from __future__ import annotations

+ import threading
  from typing import TYPE_CHECKING, List, Optional, Tuple, Union

  from sglang.srt.layers.logits_processor import LogitsProcessorOutput
@@ -11,6 +12,7 @@ if TYPE_CHECKING:
  EmbeddingBatchResult,
  GenerationBatchResult,
  ScheduleBatch,
+ Scheduler,
  )


@@ -21,9 +23,10 @@ class SchedulerOutputProcessorMixin:
  """

  def process_batch_result_prefill(
-     self,
+     self: Scheduler,
      batch: ScheduleBatch,
      result: Union[GenerationBatchResult, EmbeddingBatchResult],
+     launch_done: Optional[threading.Event] = None,
  ):
      skip_stream_req = None

@@ -43,7 +46,11 @@
  )

  if self.enable_overlap:
-     logits_output, next_token_ids = self.tp_worker.resolve_batch_result(bid)
+     logits_output, next_token_ids = (
+         self.tp_worker.resolve_last_batch_result(
+             launch_done,
+         )
+     )
  else:
      # Move next_token_ids and logprobs to cpu
      next_token_ids = next_token_ids.tolist()
@@ -175,9 +182,10 @@
  self.stream_output(batch.reqs, batch.return_logprob, skip_stream_req)

  def process_batch_result_decode(
-     self,
+     self: Scheduler,
      batch: ScheduleBatch,
      result: GenerationBatchResult,
+     launch_done: Optional[threading.Event] = None,
  ):
      logits_output, next_token_ids, bid = (
          result.logits_output,
@@ -187,7 +195,9 @@
  self.num_generated_tokens += len(batch.reqs)

  if self.enable_overlap:
-     logits_output, next_token_ids = self.tp_worker.resolve_batch_result(bid)
+     logits_output, next_token_ids = self.tp_worker.resolve_last_batch_result(
+         launch_done
+     )
      next_token_logprobs = logits_output.next_token_logprobs
  elif batch.spec_algorithm.is_none():
      # spec decoding handles output logprobs inside verify process.
@@ -271,7 +281,7 @@
  self.log_decode_stats()

  def add_input_logprob_return_values(
-     self,
+     self: Scheduler,
      i: int,
      req: Req,
      output: LogitsProcessorOutput,
@@ -405,7 +415,7 @@
  assert len(req.input_token_ids_logprobs_idx) == relevant_tokens_len

  def add_logprob_return_values(
-     self,
+     self: Scheduler,
      i: int,
      req: Req,
      pt: int,
@@ -436,7 +446,10 @@
  return num_input_logprobs

  def stream_output(
-     self, reqs: List[Req], return_logprob: bool, skip_req: Optional[Req] = None
+     self: Scheduler,
+     reqs: List[Req],
+     return_logprob: bool,
+     skip_req: Optional[Req] = None,
  ):
      """Stream the output to detokenizer."""
      if self.is_generation:
@@ -445,7 +458,10 @@
  self.stream_output_embedding(reqs)

  def stream_output_generation(
-     self, reqs: List[Req], return_logprob: bool, skip_req: Optional[Req] = None
+     self: Scheduler,
+     reqs: List[Req],
+     return_logprob: bool,
+     skip_req: Optional[Req] = None,
  ):
      rids = []
      finished_reasons: List[BaseFinishReason] = []
@@ -593,7 +609,7 @@
  )
  )

- def stream_output_embedding(self, reqs: List[Req]):
+ def stream_output_embedding(self: Scheduler, reqs: List[Req]):
      rids = []
      finished_reasons: List[BaseFinishReason] = []

@@ -419,7 +419,10 @@ class TokenizerManager:
  input_ids = self.tokenizer.encode(input_text)

  image_inputs: Dict = await self.mm_processor.process_mm_data_async(
-     obj.image_data, input_text or input_ids, obj, self.max_req_input_len
+     image_data=obj.image_data,
+     input_text=input_text or input_ids,
+     request_obj=obj,
+     max_req_input_len=self.max_req_input_len,
  )
  if image_inputs and "input_ids" in image_inputs:
      input_ids = image_inputs["input_ids"]
@@ -495,6 +498,7 @@
  token_ids_logprob,
  obj.stream,
  bootstrap_host=obj.bootstrap_host,
+ bootstrap_port=obj.bootstrap_port,
  bootstrap_room=obj.bootstrap_room,
  lora_path=obj.lora_path,
  input_embeds=input_embeds,
@@ -170,13 +170,13 @@ class TpModelWorker:
  def forward_batch_generation(
      self,
      model_worker_batch: ModelWorkerBatch,
-     launch_done: Optional[threading.Event] = None,
      skip_sample: bool = False,
  ) -> Tuple[LogitsProcessorOutput, Optional[torch.Tensor]]:
      forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
      logits_output = self.model_runner.forward(forward_batch)
-     if launch_done:
-         launch_done.set()
+
+     if model_worker_batch.launch_done is not None:
+         model_worker_batch.launch_done.set()

      if skip_sample:
          next_token_ids = None
@@ -132,7 +132,6 @@ class TpModelWorkerClient:
  batch_pt += 1

  # Create event
- self.launch_done = threading.Event()
  copy_done = torch.get_device_module(self.device).Event()

  # Resolve future tokens in the input
@@ -141,7 +140,7 @@

  # Run forward
  logits_output, next_token_ids = self.worker.forward_batch_generation(
-     model_worker_batch, self.launch_done
+     model_worker_batch
  )

  # Update the future token ids map
@@ -168,10 +167,16 @@

  self.output_queue.put((copy_done, logits_output, next_token_ids))

- def resolve_batch_result(self, bid: int):
+ def resolve_last_batch_result(self, launch_done: Optional[threading.Event] = None):
+     """
+     This function is called to resolve the last batch result and
+     wait for the current batch to be launched. Used in overlap mode.
+     """
      copy_done, logits_output, next_token_ids = self.output_queue.get()
+
+     if launch_done is not None:
+         launch_done.wait()
      copy_done.synchronize()
-     self.launch_done.wait()

      if logits_output.next_token_logprobs is not None:
          logits_output.next_token_logprobs = (
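For illustration only (not part of the package diff): the change above replaces the worker-owned self.launch_done with a per-batch threading.Event that the scheduler attaches to the batch, the worker sets once the forward pass has been issued, and resolve_last_batch_result waits on before reading results. A minimal standalone sketch of that handshake, with hypothetical names:

    import queue
    import threading

    out_q = queue.Queue()

    def worker(batch_id, launch_done):
        # Pretend to launch the forward pass, then signal that the launch finished.
        launch_done.set()
        out_q.put(f"tokens-for-batch-{batch_id}")

    launch_done = threading.Event()  # created per batch, like batch.launch_done
    threading.Thread(target=worker, args=(0, launch_done)).start()

    launch_done.wait()   # roughly what resolve_last_batch_result(launch_done) waits on
    print(out_q.get())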
@@ -34,6 +34,8 @@ from typing import List, Optional, Tuple, Union
  import numpy as np
  import psutil
  import torch
+ import triton
+ import triton.language as tl

  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.utils import debug_timing, get_compiler_backend
@@ -405,6 +407,72 @@ def copy_two_array(loc, dst_1, src_1, dst_2, src_2, dtype, store_dtype):
  dst_2[loc] = src_2.to(dtype).view(store_dtype)


+ @triton.jit
+ def set_mla_kv_buffer_kernel(
+     kv_buffer_ptr,
+     cache_k_nope_ptr,
+     cache_k_rope_ptr,
+     loc_ptr,
+     buffer_stride: tl.constexpr,
+     nope_stride: tl.constexpr,
+     rope_stride: tl.constexpr,
+     nope_dim: tl.constexpr,
+     rope_dim: tl.constexpr,
+     BLOCK: tl.constexpr,
+ ):
+     pid_loc = tl.program_id(0)
+     pid_blk = tl.program_id(1)
+
+     base = pid_blk * BLOCK
+     offs = base + tl.arange(0, BLOCK)
+     total_dim = nope_dim + rope_dim
+     mask = offs < total_dim
+
+     loc = tl.load(loc_ptr + pid_loc)
+     dst_ptr = kv_buffer_ptr + loc * buffer_stride + offs
+
+     if base + BLOCK <= nope_dim:
+         src = tl.load(
+             cache_k_nope_ptr + pid_loc * nope_stride + offs,
+             mask=mask,
+         )
+     else:
+         offs_rope = offs - nope_dim
+         src = tl.load(
+             cache_k_rope_ptr + pid_loc * rope_stride + offs_rope,
+             mask=mask,
+         )
+
+     tl.store(dst_ptr, src, mask=mask)
+
+
+ def set_mla_kv_buffer_triton(
+     kv_buffer: torch.Tensor,
+     loc: torch.Tensor,
+     cache_k_nope: torch.Tensor,
+     cache_k_rope: torch.Tensor,
+ ):
+     nope_dim = cache_k_nope.shape[-1]
+     rope_dim = cache_k_rope.shape[-1]
+     total_dim = nope_dim + rope_dim
+     BLOCK = 128
+     n_loc = loc.numel()
+     grid = (n_loc, triton.cdiv(total_dim, BLOCK))
+
+     set_mla_kv_buffer_kernel[grid](
+         kv_buffer,
+         cache_k_nope,
+         cache_k_rope,
+         loc,
+         kv_buffer.stride(0),
+         cache_k_nope.stride(0),
+         cache_k_rope.stride(0),
+         nope_dim,
+         rope_dim,
+         BLOCK=BLOCK,
+     )
+
+
  class MLATokenToKVPool(KVCache):
      def __init__(
          self,
@@ -504,6 +572,25 @@ class MLATokenToKVPool(KVCache):
  else:
      self.kv_buffer[layer_id][loc] = cache_k

+ def set_mla_kv_buffer(
+     self,
+     layer: RadixAttention,
+     loc: torch.Tensor,
+     cache_k_nope: torch.Tensor,
+     cache_k_rope: torch.Tensor,
+ ):
+     layer_id = layer.layer_id
+     if cache_k_nope.dtype != self.dtype:
+         cache_k_nope = cache_k_nope.to(self.dtype)
+         cache_k_rope = cache_k_rope.to(self.dtype)
+     if self.store_dtype != self.dtype:
+         cache_k_nope = cache_k_nope.view(self.store_dtype)
+         cache_k_rope = cache_k_rope.view(self.store_dtype)
+
+     set_mla_kv_buffer_triton(
+         self.kv_buffer[layer_id], loc, cache_k_nope, cache_k_rope
+     )
+
  def get_flat_data(self, indices):
      # prepare a large chunk of contiguous data for efficient transfer
      return torch.stack([self.kv_buffer[i][indices] for i in range(self.layer_num)])
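For orientation only (not part of the package diff): set_mla_kv_buffer_triton scatters each token's concatenated [nope | rope] key row into the KV-buffer slot given by loc. A hedged usage sketch with made-up sizes (needs a CUDA device with Triton installed; the import path follows the file this hunk modifies):

    import torch
    from sglang.srt.mem_cache.memory_pool import set_mla_kv_buffer_triton

    num_slots, nope_dim, rope_dim, n_tokens = 1024, 512, 64, 8
    kv_buffer = torch.zeros(num_slots, nope_dim + rope_dim, dtype=torch.bfloat16, device="cuda")
    loc = torch.randperm(num_slots, device="cuda")[:n_tokens]        # distinct target slots
    cache_k_nope = torch.randn(n_tokens, nope_dim, dtype=torch.bfloat16, device="cuda")
    cache_k_rope = torch.randn(n_tokens, rope_dim, dtype=torch.bfloat16, device="cuda")

    # Write each token's nope and rope halves into one contiguous buffer row.
    set_mla_kv_buffer_triton(kv_buffer, loc, cache_k_nope, cache_k_rope)
    assert torch.equal(kv_buffer[loc, :nope_dim], cache_k_nope)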
@@ -134,7 +134,8 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
  )

  gpu_mem = get_device_memory_capacity()
- if gpu_mem is not None and gpu_mem > 81920:
+ # Batch size of each rank will not become so large when DP is on
+ if gpu_mem is not None and gpu_mem > 81920 and server_args.dp_size == 1:
      capture_bs += list(range(160, 257, 8))

  if max(capture_bs) > model_runner.req_to_token_pool.size:
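Not part of the package diff: a tiny standalone sketch of the gating above, assuming gpu_mem is reported in MiB; the larger CUDA-graph capture sizes (160 through 256 in steps of 8) are only appended on >80 GB GPUs when data parallelism is off, since each DP rank then handles only a slice of the traffic.

    def extra_capture_batch_sizes(gpu_mem_mib, dp_size):
        # Hypothetical helper mirroring the condition in the hunk above.
        if gpu_mem_mib is not None and gpu_mem_mib > 81920 and dp_size == 1:
            return list(range(160, 257, 8))
        return []

    print(extra_capture_batch_sizes(98304, 1))  # large GPU, no DP -> [160, 168, ..., 256]
    print(extra_capture_batch_sizes(98304, 2))  # DP on -> []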
@@ -278,9 +279,9 @@ class CudaGraphRunner:
  f"Capture cuda graph failed: {e}\n"
  "Possible solutions:\n"
  "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
- "2. set --cuda-graph-max-bs to a smaller value (e.g., 32)\n"
+ "2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
  "3. disable torch compile by not using --enable-torch-compile\n"
- "4. disable cuda graph by --disable-cuda-graph\n"
+ "4. disable cuda graph by --disable-cuda-graph. (Not recommonded. Huge perf loss)\n"
  "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
  )