sglang 0.5.4.post1__py3-none-any.whl → 0.5.4.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. sglang/bench_one_batch.py +149 -34
  2. sglang/bench_serving.py +18 -3
  3. sglang/compile_deep_gemm.py +13 -7
  4. sglang/srt/batch_invariant_ops/__init__.py +2 -0
  5. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +120 -0
  6. sglang/srt/checkpoint_engine/__init__.py +9 -0
  7. sglang/srt/checkpoint_engine/update.py +317 -0
  8. sglang/srt/configs/__init__.py +2 -0
  9. sglang/srt/configs/deepseek_ocr.py +542 -10
  10. sglang/srt/configs/deepseekvl2.py +95 -194
  11. sglang/srt/configs/kimi_linear.py +160 -0
  12. sglang/srt/configs/mamba_utils.py +66 -0
  13. sglang/srt/configs/model_config.py +25 -2
  14. sglang/srt/constants.py +7 -0
  15. sglang/srt/debug_utils/tensor_dump_forward_hook.py +149 -0
  16. sglang/srt/disaggregation/decode.py +34 -6
  17. sglang/srt/disaggregation/nixl/conn.py +2 -2
  18. sglang/srt/disaggregation/prefill.py +25 -3
  19. sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -1
  20. sglang/srt/distributed/parallel_state.py +9 -5
  21. sglang/srt/entrypoints/engine.py +13 -5
  22. sglang/srt/entrypoints/http_server.py +22 -3
  23. sglang/srt/entrypoints/openai/protocol.py +7 -1
  24. sglang/srt/entrypoints/openai/serving_chat.py +42 -0
  25. sglang/srt/entrypoints/openai/serving_completions.py +10 -0
  26. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  27. sglang/srt/environ.py +7 -0
  28. sglang/srt/eplb/expert_distribution.py +34 -1
  29. sglang/srt/eplb/expert_location.py +106 -36
  30. sglang/srt/grpc/compile_proto.py +3 -0
  31. sglang/srt/layers/attention/ascend_backend.py +233 -5
  32. sglang/srt/layers/attention/attention_registry.py +3 -0
  33. sglang/srt/layers/attention/fla/chunk_delta_h.py +61 -32
  34. sglang/srt/layers/attention/fla/fused_recurrent.py +17 -4
  35. sglang/srt/layers/attention/fla/kda.py +1359 -0
  36. sglang/srt/layers/attention/fla/layernorm_gated.py +7 -1
  37. sglang/srt/layers/attention/flashattention_backend.py +7 -6
  38. sglang/srt/layers/attention/flashinfer_mla_backend.py +3 -1
  39. sglang/srt/layers/attention/flashmla_backend.py +1 -1
  40. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +223 -0
  41. sglang/srt/layers/attention/mamba/mamba.py +20 -11
  42. sglang/srt/layers/attention/nsa/dequant_k_cache.py +138 -6
  43. sglang/srt/layers/attention/nsa/nsa_indexer.py +45 -22
  44. sglang/srt/layers/attention/nsa/quant_k_cache.py +44 -12
  45. sglang/srt/layers/attention/nsa/transform_index.py +1 -1
  46. sglang/srt/layers/attention/nsa_backend.py +157 -23
  47. sglang/srt/layers/attention/triton_backend.py +4 -1
  48. sglang/srt/layers/attention/trtllm_mha_backend.py +10 -4
  49. sglang/srt/layers/attention/trtllm_mla_backend.py +10 -2
  50. sglang/srt/layers/communicator.py +23 -1
  51. sglang/srt/layers/layernorm.py +16 -2
  52. sglang/srt/layers/logits_processor.py +4 -20
  53. sglang/srt/layers/moe/ep_moe/layer.py +0 -18
  54. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json +164 -0
  56. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +68 -22
  57. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +43 -3
  58. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +106 -26
  59. sglang/srt/layers/moe/moe_runner/deep_gemm.py +53 -33
  60. sglang/srt/layers/moe/token_dispatcher/deepep.py +12 -9
  61. sglang/srt/layers/moe/topk.py +31 -6
  62. sglang/srt/layers/pooler.py +21 -2
  63. sglang/srt/layers/quantization/__init__.py +9 -78
  64. sglang/srt/layers/quantization/auto_round.py +394 -0
  65. sglang/srt/layers/quantization/fp8_kernel.py +1 -1
  66. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  67. sglang/srt/layers/quantization/modelopt_quant.py +168 -11
  68. sglang/srt/layers/rotary_embedding.py +117 -45
  69. sglang/srt/lora/lora_registry.py +9 -0
  70. sglang/srt/managers/async_mm_data_processor.py +122 -0
  71. sglang/srt/managers/data_parallel_controller.py +30 -3
  72. sglang/srt/managers/detokenizer_manager.py +3 -0
  73. sglang/srt/managers/io_struct.py +26 -4
  74. sglang/srt/managers/multi_tokenizer_mixin.py +5 -0
  75. sglang/srt/managers/schedule_batch.py +74 -15
  76. sglang/srt/managers/scheduler.py +164 -129
  77. sglang/srt/managers/scheduler_output_processor_mixin.py +40 -3
  78. sglang/srt/managers/scheduler_pp_mixin.py +7 -2
  79. sglang/srt/managers/scheduler_runtime_checker_mixin.py +45 -0
  80. sglang/srt/managers/scheduler_update_weights_mixin.py +18 -3
  81. sglang/srt/managers/session_controller.py +6 -5
  82. sglang/srt/managers/tokenizer_manager.py +154 -59
  83. sglang/srt/managers/tp_worker.py +24 -1
  84. sglang/srt/mem_cache/base_prefix_cache.py +23 -4
  85. sglang/srt/mem_cache/common.py +1 -0
  86. sglang/srt/mem_cache/memory_pool.py +171 -57
  87. sglang/srt/mem_cache/memory_pool_host.py +12 -5
  88. sglang/srt/mem_cache/radix_cache.py +4 -0
  89. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +1 -1
  90. sglang/srt/metrics/collector.py +46 -3
  91. sglang/srt/model_executor/cuda_graph_runner.py +15 -3
  92. sglang/srt/model_executor/forward_batch_info.py +11 -11
  93. sglang/srt/model_executor/model_runner.py +76 -21
  94. sglang/srt/model_executor/npu_graph_runner.py +7 -3
  95. sglang/srt/model_loader/weight_utils.py +1 -1
  96. sglang/srt/models/bailing_moe.py +9 -2
  97. sglang/srt/models/deepseek_nextn.py +11 -2
  98. sglang/srt/models/deepseek_v2.py +149 -34
  99. sglang/srt/models/glm4.py +391 -77
  100. sglang/srt/models/glm4v.py +196 -55
  101. sglang/srt/models/glm4v_moe.py +0 -1
  102. sglang/srt/models/gpt_oss.py +1 -10
  103. sglang/srt/models/kimi_linear.py +678 -0
  104. sglang/srt/models/llama4.py +1 -1
  105. sglang/srt/models/llama_eagle3.py +11 -1
  106. sglang/srt/models/longcat_flash.py +2 -2
  107. sglang/srt/models/minimax_m2.py +1 -1
  108. sglang/srt/models/qwen2.py +1 -1
  109. sglang/srt/models/qwen2_moe.py +30 -15
  110. sglang/srt/models/qwen3.py +1 -1
  111. sglang/srt/models/qwen3_moe.py +16 -8
  112. sglang/srt/models/qwen3_next.py +7 -0
  113. sglang/srt/multimodal/customized_mm_processor_utils.py +35 -0
  114. sglang/srt/multiplex/multiplexing_mixin.py +209 -0
  115. sglang/srt/multiplex/pdmux_context.py +164 -0
  116. sglang/srt/parser/conversation.py +7 -1
  117. sglang/srt/sampling/custom_logit_processor.py +67 -1
  118. sglang/srt/sampling/penaltylib/frequency_penalty.py +6 -8
  119. sglang/srt/sampling/penaltylib/min_new_tokens.py +7 -8
  120. sglang/srt/sampling/penaltylib/orchestrator.py +43 -3
  121. sglang/srt/sampling/penaltylib/presence_penalty.py +6 -8
  122. sglang/srt/server_args.py +103 -22
  123. sglang/srt/single_batch_overlap.py +4 -1
  124. sglang/srt/speculative/draft_utils.py +16 -0
  125. sglang/srt/speculative/eagle_info.py +42 -36
  126. sglang/srt/speculative/eagle_info_v2.py +68 -25
  127. sglang/srt/speculative/eagle_utils.py +261 -16
  128. sglang/srt/speculative/eagle_worker.py +11 -3
  129. sglang/srt/speculative/eagle_worker_v2.py +15 -9
  130. sglang/srt/speculative/spec_info.py +305 -31
  131. sglang/srt/speculative/spec_utils.py +44 -8
  132. sglang/srt/tracing/trace.py +121 -12
  133. sglang/srt/utils/common.py +55 -32
  134. sglang/srt/utils/hf_transformers_utils.py +38 -16
  135. sglang/srt/utils/torch_memory_saver_adapter.py +20 -0
  136. sglang/test/kits/radix_cache_server_kit.py +50 -0
  137. sglang/test/runners.py +31 -7
  138. sglang/test/simple_eval_common.py +5 -3
  139. sglang/test/simple_eval_humaneval.py +1 -0
  140. sglang/test/simple_eval_math.py +1 -0
  141. sglang/test/simple_eval_mmlu.py +1 -0
  142. sglang/test/simple_eval_mmmu_vlm.py +1 -0
  143. sglang/test/test_utils.py +7 -1
  144. sglang/version.py +1 -1
  145. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/METADATA +10 -24
  146. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/RECORD +150 -136
  147. /sglang/test/{kit_matched_stop.py → kits/matched_stop_kit.py} +0 -0
  148. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/WHEEL +0 -0
  149. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/licenses/LICENSE +0 -0
  150. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/top_level.txt +0 -0
--- a/sglang/srt/models/llama_eagle3.py
+++ b/sglang/srt/models/llama_eagle3.py
@@ -19,6 +19,7 @@ from sglang.srt.utils import add_prefix
 # https://github.com/SafeAILab/EAGLE/blob/main/eagle/model/cnets.py
 """Inference-only LLaMA-EAGLE model compatible with HuggingFace weights."""
 
+import copy
 from typing import Iterable, Optional, Tuple
 
 import torch
@@ -161,6 +162,10 @@ class LlamaModel(nn.Module):
         if hidden_states.shape[-1] != embeds.shape[-1]:
             hidden_states = self.fc(hidden_states)
 
+        # idle batch
+        if hidden_states.shape[0] == 0:
+            return hidden_states, [hidden_states]
+
         residual = None
         hidden_states, residual = self.midlayer(
             positions,
@@ -212,7 +217,12 @@ class LlamaForCausalLMEagle3(LlamaForCausalLM):
                 prefix=add_prefix("lm_head", prefix),
             )
 
-        self.logits_processor = LogitsProcessor(config)
+        config_ = copy.deepcopy(config)
+        config_.vocab_size = (
+            config_.draft_vocab_size
+        )  # draft logits processor has it's own vocab size
+        self.logits_processor = LogitsProcessor(config_)
+
         self.capture_aux_hidden_states = True
         self.hot_token_id = None
 
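The last hunk gives the EAGLE3 draft head a logits processor sized by draft_vocab_size instead of the target model's vocab_size. A minimal sketch of that pattern (illustrative numbers, with SimpleNamespace standing in for the HuggingFace config; not sglang code):

```python
# Illustrative sketch of the config override above; not sglang code.
import copy
from types import SimpleNamespace

target_config = SimpleNamespace(vocab_size=152064, draft_vocab_size=32000)

draft_config = copy.deepcopy(target_config)
draft_config.vocab_size = draft_config.draft_vocab_size

# A logits processor built from draft_config now matches a draft lm_head of
# shape [draft_vocab_size, hidden_size]; sizing it from the shared target
# config would not.
assert draft_config.vocab_size == 32000
assert target_config.vocab_size == 152064  # the target model's config is untouched
```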
--- a/sglang/srt/models/longcat_flash.py
+++ b/sglang/srt/models/longcat_flash.py
@@ -821,8 +821,8 @@ class LongcatFlashForCausalLM(nn.Module):
             experts = layer.mlp.experts
             if isinstance(experts, DeepEPMoE):
                 for w in [
-                    experts.w13_weight_fp8,
-                    experts.w2_weight_fp8,
+                    (experts.w13_weight, experts.w13_weight_scale_inv),
+                    (experts.w2_weight, experts.w2_weight_scale_inv),
                 ]:
                     requant_weight_ue8m0_inplace(w[0], w[1], weight_block_size)
 
--- a/sglang/srt/models/minimax_m2.py
+++ b/sglang/srt/models/minimax_m2.py
@@ -122,7 +122,7 @@ class MiniMaxM2RMSNormTP(nn.Module):
 
         # Normalize and apply local weight shard
         x = x * torch.rsqrt(variance + self.variance_epsilon)
-        x = x.to(orig_dtype) * self.weight
+        x = (x * self.weight).to(orig_dtype)
 
         return x
 
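The minimax_m2.py change above applies the RMSNorm weight before downcasting to the original dtype. A rough numeric illustration of why the ordering matters (made-up tensors, not the MiniMaxM2RMSNormTP module):

```python
# Made-up tensors; shows only that the two orderings round differently.
import torch

x = torch.randn(4, 8, dtype=torch.float32)   # normalized activations in fp32
w = torch.randn(8, dtype=torch.bfloat16)     # learned RMSNorm weight

old = x.to(torch.bfloat16) * w      # pre-fix: cast first, scale in bf16
new = (x * w).to(torch.bfloat16)    # post-fix: scale in fp32, cast once at the end

# Typically nonzero: the early cast throws away precision before the scale.
print((old.float() - new.float()).abs().max())
```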
--- a/sglang/srt/models/qwen2.py
+++ b/sglang/srt/models/qwen2.py
@@ -462,7 +462,7 @@ class Qwen2ForCausalLM(nn.Module):
                 self.pp_group.send(
                     self.model.embed_tokens.weight, dst=self.pp_group.last_rank
                 )
-            else:
+            elif self.pp_group.is_last_rank:
                 emb_token_weight = self.pp_group.recv(
                     size=(config.vocab_size, config.hidden_size),
                     dtype=next(self.model.parameters()).dtype,
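The qwen2.py hunk (and the matching qwen3.py hunk further down) narrows the tied-embedding exchange so that only the last pipeline rank receives what the first rank sends, while intermediate ranks do neither. A toy sketch of that intent, assuming the enclosing branch sends from the first PP rank, as the dst=self.pp_group.last_rank send above suggests:

```python
# Toy illustration only; real ranks come from the PP process group.
num_pp_ranks = 4
for rank in range(num_pp_ranks):
    is_first = rank == 0
    is_last = rank == num_pp_ranks - 1
    # Before the fix, every non-first rank fell into the recv branch;
    # after it, the middle ranks (1 and 2 here) simply skip the exchange.
    action = "send" if is_first else ("recv" if is_last else "skip")
    print(rank, action)
```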
--- a/sglang/srt/models/qwen2_moe.py
+++ b/sglang/srt/models/qwen2_moe.py
@@ -473,10 +473,16 @@ class Qwen2MoeDecoderLayer(nn.Module):
         hidden_states: torch.Tensor,
         forward_batch: ForwardBatch,
         residual: Optional[torch.Tensor],
+        captured_last_layer_outputs: Optional[List[torch.Tensor]] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
 
-        hidden_states, residual = self.layer_communicator.prepare_attn(
-            hidden_states, residual, forward_batch
+        hidden_states, residual = (
+            self.layer_communicator.prepare_attn_and_capture_last_layer_outputs(
+                hidden_states,
+                residual,
+                forward_batch,
+                captured_last_layer_outputs=captured_last_layer_outputs,
+            )
         )
 
         if hidden_states.shape[0] != 0:
@@ -553,6 +559,11 @@ class Qwen2MoeModel(nn.Module):
         # For EAGLE3 support
         self.layers_to_capture = []
 
+    def set_eagle3_layers_to_capture(self, layers_to_capture: List[int]):
+        self.layers_to_capture = layers_to_capture
+        for layer_id in self.layers_to_capture:
+            setattr(self.layers[layer_id], "_is_layer_to_capture", True)
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -585,12 +596,6 @@
             )
         else:
             for i in range(self.start_layer, self.end_layer):
-                if i in self.layers_to_capture:
-                    aux_hidden_states.append(
-                        hidden_states + residual
-                        if residual is not None
-                        else hidden_states
-                    )
                 ctx = (
                     nullcontext()
                     if get_global_server_args().enable_piecewise_cuda_graph
@@ -599,7 +604,15 @@
                 with ctx:
                     layer = self.layers[i]
                     hidden_states, residual = layer(
-                        positions, hidden_states, forward_batch, residual
+                        positions,
+                        hidden_states,
+                        forward_batch,
+                        residual,
+                        captured_last_layer_outputs=(
+                            aux_hidden_states
+                            if getattr(layer, "_is_layer_to_capture", False)
+                            else None
+                        ),
                     )
         if not self.pp_group.is_last_rank:
             return PPProxyTensors(
@@ -830,13 +843,15 @@ class Qwen2MoeForCausalLM(nn.Module):
         self.capture_aux_hidden_states = True
         if layer_ids is None:
             num_layers = self.config.num_hidden_layers
-            self.model.layers_to_capture = [
-                2,
-                num_layers // 2,
-                num_layers - 3,
-            ]  # Specific layers for EAGLE3 support
+            self.model.set_eagle3_layers_to_capture(
+                [
+                    2,
+                    num_layers // 2,
+                    num_layers - 3,
+                ]
+            )  # Specific layers for EAGLE3 support
         else:
-            self.model.layers_to_capture = [val + 1 for val in layer_ids]
+            self.model.set_eagle3_layers_to_capture([val + 1 for val in layer_ids])
 
 
 EntryClass = Qwen2MoeForCausalLM
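For orientation, the default capture layers chosen above, worked out for a hypothetical 48-layer target model; set_eagle3_layers_to_capture then tags those decoder layers so their outputs are appended to aux_hidden_states during the forward pass:

```python
# Hypothetical layer count; the real value comes from config.num_hidden_layers.
num_layers = 48
default_capture = [2, num_layers // 2, num_layers - 3]
print(default_capture)  # [2, 24, 45]

# When layer_ids are supplied explicitly, they are shifted by one before tagging:
layer_ids = [1, 23, 44]
print([val + 1 for val in layer_ids])  # [2, 24, 45]
```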
--- a/sglang/srt/models/qwen3.py
+++ b/sglang/srt/models/qwen3.py
@@ -361,7 +361,7 @@ class Qwen3ForCausalLM(nn.Module):
                 self.pp_group.send(
                     self.model.embed_tokens.weight, dst=self.pp_group.last_rank
                 )
-            else:
+            elif self.pp_group.is_last_rank:
                 emb_token_weight = self.pp_group.recv(
                     size=(config.vocab_size, config.hidden_size),
                     dtype=next(self.model.parameters()).dtype,
--- a/sglang/srt/models/qwen3_moe.py
+++ b/sglang/srt/models/qwen3_moe.py
@@ -537,10 +537,16 @@ class Qwen3MoeDecoderLayer(nn.Module):
         hidden_states: torch.Tensor,
         forward_batch: ForwardBatch,
         residual: Optional[torch.Tensor],
+        captured_last_layer_outputs: Optional[List[torch.Tensor]] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
 
-        hidden_states, residual = self.layer_communicator.prepare_attn(
-            hidden_states, residual, forward_batch
+        hidden_states, residual = (
+            self.layer_communicator.prepare_attn_and_capture_last_layer_outputs(
+                hidden_states,
+                residual,
+                forward_batch,
+                captured_last_layer_outputs=captured_last_layer_outputs,
+            )
         )
 
         if hidden_states.shape[0] != 0:
@@ -772,13 +778,15 @@ class Qwen3MoeForCausalLM(nn.Module):
         self.capture_aux_hidden_states = True
         if layer_ids is None:
             num_layers = self.config.num_hidden_layers
-            self.model.layers_to_capture = [
-                2,
-                num_layers // 2,
-                num_layers - 3,
-            ]  # Specific layers for EAGLE3 support
+            self.model.set_eagle3_layers_to_capture(
+                [
+                    2,
+                    num_layers // 2,
+                    num_layers - 3,
+                ]
+            )  # Specific layers for EAGLE3 support
         else:
-            self.model.layers_to_capture = [val + 1 for val in layer_ids]
+            self.model.set_eagle3_layers_to_capture([val + 1 for val in layer_ids])
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
--- a/sglang/srt/models/qwen3_next.py
+++ b/sglang/srt/models/qwen3_next.py
@@ -478,6 +478,13 @@ class Qwen3GatedDeltaNet(nn.Module):
         # reshape input data into 2D tensor
         core_attn_out = core_attn_out.reshape(-1, core_attn_out.shape[-1])
         z = z.reshape(-1, z.shape[-1])
+
+        # Add padding for DP-Attn
+        if is_dp_attention_enabled():
+            core_attn_out_pad = torch.zeros_like(z)
+            core_attn_out_pad[: core_attn_out.shape[0], :] = core_attn_out
+            core_attn_out = core_attn_out_pad
+
         core_attn_out = self.norm(core_attn_out, z)
         core_attn_out = core_attn_out.reshape(z_shape_og)
         core_attn_out = core_attn_out.reshape(*core_attn_out.shape[:-2], -1)
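A shape-only sketch of the qwen3_next.py padding above (made-up sizes): z already carries the padded per-rank row count under DP attention, so core_attn_out is zero-filled up to that length before the gated norm sees both tensors.

```python
# Made-up sizes, illustrating the zero-padding only.
import torch

padded_len, dim = 8, 16   # rows of z after DP-attention padding
real_len = 5              # rows actually produced by the linear-attention kernel

z = torch.randn(padded_len, dim)
core_attn_out = torch.randn(real_len, dim)

core_attn_out_pad = torch.zeros_like(z)
core_attn_out_pad[: core_attn_out.shape[0], :] = core_attn_out
core_attn_out = core_attn_out_pad

assert core_attn_out.shape == z.shape  # shapes now agree for self.norm(core_attn_out, z)
```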
--- /dev/null
+++ b/sglang/srt/multimodal/customized_mm_processor_utils.py
@@ -0,0 +1,35 @@
+from typing import Dict, Type
+
+from transformers import PretrainedConfig, ProcessorMixin
+
+# Useful for registering a custom processor different from Hugging Face's default.
+_CUSTOMIZED_MM_PROCESSOR: Dict[str, Type[ProcessorMixin]] = dict()
+
+
+def register_customized_processor(
+    processor_class: Type[ProcessorMixin],
+):
+    """Class decorator that maps a config class's model_type field to a customized processor class.
+
+    Args:
+        processor_class: A processor class that inherits from ProcessorMixin
+
+    Example:
+        ```python
+        @register_customized_processor(MyCustomProcessor)
+        class MyModelConfig(PretrainedConfig):
+            model_type = "my_model"
+
+        ```
+    """
+
+    def decorator(config_class: PretrainedConfig):
+        if not hasattr(config_class, "model_type"):
+            raise ValueError(
+                f"Class {config_class.__name__} with register_customized_processor should "
+                f"have a 'model_type' class attribute."
+            )
+        _CUSTOMIZED_MM_PROCESSOR[config_class.model_type] = processor_class
+        return config_class
+
+    return decorator
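A small usage sketch for the new registry; MyProcessor and MyConfig are placeholders rather than real classes, and the import path assumes the installed 0.5.4.post2 layout:

```python
from transformers import PretrainedConfig, ProcessorMixin

from sglang.srt.multimodal.customized_mm_processor_utils import (
    _CUSTOMIZED_MM_PROCESSOR,
    register_customized_processor,
)


class MyProcessor(ProcessorMixin):
    # Placeholder processor; a real one would declare tokenizer/image_processor attributes.
    attributes = []


@register_customized_processor(MyProcessor)
class MyConfig(PretrainedConfig):
    model_type = "my_model"


# The decorator keyed the registry by the config's model_type.
assert _CUSTOMIZED_MM_PROCESSOR["my_model"] is MyProcessor
```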
--- /dev/null
+++ b/sglang/srt/multiplex/multiplexing_mixin.py
@@ -0,0 +1,209 @@
+"""
+Mixin class providing multiplexing scheduling logic
+"""
+
+import logging
+
+import torch
+import torch.distributed as dist
+from torch.cuda.streams import ExternalStream
+
+from sglang.srt.distributed.parallel_state import set_pdmux_status
+from sglang.srt.model_executor.forward_batch_info import ForwardMode
+from sglang.srt.multiplex.pdmux_context import (
+    get_current_stream_idx,
+    get_sm_counts,
+    get_stream_groups,
+    initialize_stream_groups,
+    load_pdmux_config,
+    set_current_stream_idx,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class SchedulerMultiplexMixin:
+
+    def init_pdmux(self):
+        # for pd_multiplexing, Init stream_groups, exclude normal stream for prefill only and decode only
+        self.pdmux_config = load_pdmux_config(self.server_args.pdmux_config_path)
+        initialize_stream_groups(self.gpu_id, self.pdmux_config)
+        self.stream_groups = get_stream_groups()
+        self.sm_counts = get_sm_counts()
+        self.real_sm_group_num = len(self.stream_groups)
+        logger.info(
+            f"PD-Multiplexing enabled with {self.real_sm_group_num} stream groups, sm_counts (prefill_sm, decode_sm): {self.sm_counts}"
+        )
+
+    # TODO(jason-fxz): This is a temporary demo
+    def adjust_stream_groups(self) -> tuple[int, tuple[ExternalStream, ExternalStream]]:
+        if not self.running_batch.is_empty() and self.split_prefill_batch:
+            decode_bs = self.running_batch.batch_size()
+            manual_divisions = self.pdmux_config.manual_divisions
+            if manual_divisions:
+                for i in range(len(manual_divisions)):
+                    _, _, threshold = manual_divisions[i]
+                    if decode_bs >= threshold:
+                        stream_idx = i + 1
+            else:
+                stream_idx = max(
+                    1,
+                    min(
+                        self.real_sm_group_num - 2,
+                        decode_bs
+                        * (self.real_sm_group_num - 2)
+                        // self.pdmux_config.decode_bs_divisor,
+                    ),
+                )
+            set_current_stream_idx(stream_idx)
+        elif not self.running_batch.is_empty():
+            set_current_stream_idx(self.real_sm_group_num - 1)
+        else:
+            set_current_stream_idx(0)
+
+        stream_idx = get_current_stream_idx()
+
+        self.tp_worker.model_runner.update_decode_attn_backend(stream_idx)
+        return stream_idx, self.stream_groups[stream_idx]
+
+    def update_split_prefill_batch(self, sm_count: int) -> bool:
+        if self.split_prefill_batch:
+            return False
+
+        # add new request
+        batch = self.get_new_batch_prefill()
+        if batch and not batch.is_empty():
+            batch.forward_mode = (
+                ForwardMode.SPLIT_PREFILL
+            )  # Set forward mode for split prefill
+            self.split_prefill_batch = batch
+            return True
+        return False
+
+    @torch.inference_mode()
+    def event_loop_pdmux(self):
+        """A scheduler loop for pd multiplexing."""
+        decode_done = False
+        prefill_done = False
+        wait_prefill_kernel_done = False
+        adjust_stream_group = False
+        stream_idx = get_current_stream_idx()
+        stream_group = self.stream_groups[stream_idx]
+        prefill_stream = stream_group[0]
+        decode_stream = stream_group[1]
+        torch.cuda.empty_cache()
+
+        logger.debug("Starting event loop for pd multiplexing...")
+
+        while True:
+            with torch.cuda.stream(decode_stream):
+                set_pdmux_status(False)
+                recv_reqs = self.recv_requests()
+                self.process_input_requests(recv_reqs)
+
+            with torch.cuda.stream(prefill_stream):
+                set_pdmux_status(True)
+                sm_count = self.sm_counts[stream_idx][0]
+                if not wait_prefill_kernel_done:
+                    adjust_stream_group = (
+                        self.update_split_prefill_batch(sm_count) or adjust_stream_group
+                    )
+
+            with torch.cuda.stream(decode_stream):
+                set_pdmux_status(False)
+                self.running_batch = self.update_running_batch(self.running_batch)
+                adjust_stream_group = adjust_stream_group or (
+                    stream_idx > 0 and self.running_batch.is_empty()
+                )
+                if self.running_batch.is_empty() and self.split_prefill_batch is None:
+                    self.check_memory()
+                    self.check_tree_cache()
+                    self.new_token_ratio = self.init_new_token_ratio
+                    self.maybe_sleep_on_idle()
+
+            if adjust_stream_group:
+                prefill_stream.synchronize()
+                decode_stream.synchronize()
+                stream_idx, stream_group = self.adjust_stream_groups()
+                prefill_stream = stream_group[0]
+                decode_stream = stream_group[1]
+                adjust_stream_group = False
+                logger.debug(
+                    f"Adjusting stream groups: {stream_idx}, prefill sm: {self.sm_counts[stream_idx][0]}, decode sm: {self.sm_counts[stream_idx][1]}"
+                )
+
+            with torch.cuda.stream(decode_stream):
+                set_pdmux_status(False)
+                # process decode batch
+                if self.running_batch and not self.running_batch.is_empty():
+                    decode_result = self.run_batch(self.running_batch)
+                    decode_done = True
+                else:
+                    decode_done = False
+            with torch.cuda.stream(prefill_stream):
+                set_pdmux_status(True)
+                if (
+                    self.split_prefill_batch
+                    and not self.split_prefill_batch.is_empty()
+                    and not wait_prefill_kernel_done
+                ):
+                    prefill_done = True
+                    forward_count = (
+                        max(
+                            1,
+                            self.pdmux_config.split_forward_token_budget
+                            // self.split_prefill_batch.extend_num_tokens,
+                        )
+                        if self.split_prefill_batch.extend_num_tokens > 0
+                        else self.model_config.num_hidden_layers
+                    )
+                    next_split_index = min(
+                        self.split_prefill_batch.split_index + forward_count,
+                        self.model_config.num_hidden_layers,
+                    )
+                    forward_count = (
+                        next_split_index - self.split_prefill_batch.split_index
+                    )
+
+                    self.split_prefill_batch.split_forward_count = forward_count
+                    prefill_result = self.run_batch(self.split_prefill_batch)
+                    if next_split_index == self.model_config.num_hidden_layers:
+                        self.split_prefill_batch.split_prefill_finished = True
+                        prefill_exe_done = prefill_stream.record_event()
+                    self.split_prefill_batch.split_index = next_split_index
+
+                elif wait_prefill_kernel_done:
+                    prefill_done = True
+                else:
+                    prefill_done = False
+
+            with torch.cuda.stream(decode_stream):
+                set_pdmux_status(False)
+                decode_stream.synchronize()
+                if decode_done:
+                    self.process_batch_result(self.running_batch, decode_result)
+
+            with torch.cuda.stream(prefill_stream):
+                set_pdmux_status(True)
+                if prefill_done and self.split_prefill_batch.split_prefill_finished:
+                    wait_prefill_kernel_done = True
+                    prefill_exe_done_flag = prefill_exe_done.query()
+                    flags = (
+                        torch.ones(1, device="cpu", dtype=torch.int32)
+                        if prefill_exe_done_flag
+                        else torch.zeros(1, device="cpu", dtype=torch.int32)
+                    )
+
+                    self.tp_cpu_group.allreduce(flags, dist.ReduceOp.SUM).wait()
+                    if flags.item() == self.tp_size:
+                        self.process_batch_result(
+                            self.split_prefill_batch, prefill_result
+                        )
+                        if self.running_batch and not self.running_batch.is_empty():
+                            self.running_batch.merge_batch(self.split_prefill_batch)
+                        else:
+                            self.running_batch = self.split_prefill_batch
+
+                        self.split_prefill_batch = None
+                        wait_prefill_kernel_done = False
+                        adjust_stream_group = True
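A worked example (illustrative numbers) of the automatic partition choice in adjust_stream_groups above when no manual_divisions are configured: stream group 0 gives all SMs to prefill, the last group gives them all to decode, and a larger decode batch pushes the index toward the decode-heavy groups.

```python
# Illustrative values; real ones come from PDMuxConfig and the built stream groups.
real_sm_group_num = 8     # as built by initialize_stream_groups with sm_group_num = 8
decode_bs_divisor = 36    # PDMuxConfig default

for decode_bs in (1, 12, 24, 48, 96):
    stream_idx = max(
        1,
        min(
            real_sm_group_num - 2,
            decode_bs * (real_sm_group_num - 2) // decode_bs_divisor,
        ),
    )
    print(decode_bs, stream_idx)  # 1 -> 1, 12 -> 2, 24 -> 4, 48 -> 6, 96 -> 6
```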
--- /dev/null
+++ b/sglang/srt/multiplex/pdmux_context.py
@@ -0,0 +1,164 @@
+from dataclasses import dataclass, field
+from typing import List
+
+import torch
+import yaml
+
+STREAM_GROUPS = []
+SM_COUNTS = []
+SM_GROUP_NUM = 8  # Default number of SM groups
+CURRENT_STREAM_IDX = 0
+CURRENT_STREAM_GROUP = None
+
+
+@dataclass
+class PDMuxConfig:
+    sm_group_num: int = 8
+    manual_divisions: List[List[int]] = field(
+        default_factory=list
+    )  # [prefill_sm, decode_sm, decode_bs_threshold]
+    split_forward_token_budget: int = 65536
+    decode_bs_divisor: int = 36
+
+
+def load_pdmux_config(config_path: str) -> PDMuxConfig:
+    """Load pdmux configuration from YAML file into a dataclass."""
+    if not config_path:
+        return PDMuxConfig()
+
+    with open(config_path, "r") as f:
+        raw = yaml.safe_load(f)
+
+    if "sm_group_num" not in raw:
+        raise ValueError("Missing required field: sm_group_num")
+
+    if raw["sm_group_num"] < 3:
+        raise ValueError("sm_group_num must greater than 3")
+
+    manual_divisions = raw.get("manual_divisions", [])
+
+    expected = raw["sm_group_num"] - 2
+    if manual_divisions and len(manual_divisions) != expected:
+        raise ValueError(
+            f"manual_divisions must have {expected} entries, "
+            f"but got {len(manual_divisions)}"
+        )
+
+    return PDMuxConfig(
+        sm_group_num=raw["sm_group_num"],
+        manual_divisions=manual_divisions,
+        split_forward_token_budget=raw.get("split_forward_token_budget", 65536),
+        decode_bs_divisor=raw.get("decode_bs_divisor", 36),
+    )
+
+
+def get_arch_constraints(compute_capability):
+    major, minor = compute_capability
+    # green context constraints for different architectures
+    if major == 6:
+        return 1, 1  # min_per_part, multiple
+    elif major == 7:
+        return 2, 2
+    elif major == 8:
+        return 4, 2
+    elif major == 9 and minor >= 0:
+        return 8, 8
+    else:
+        raise ValueError(f"Unsupported compute capability: {major}.{minor}")
+
+
+def divide_sm(total_sms, compute_capability, groups):
+    """
+    :param total_sms: total sm count on a single GPU
+    :param compute_capability: (major, minor)
+    :return: SM partition group(prefill sm, decode sm)
+    """
+    min_per_part, multiple = get_arch_constraints(compute_capability)
+    possible_values = [
+        x
+        for x in range(min_per_part, total_sms - min_per_part + 1, multiple)
+        if x >= total_sms - x and total_sms - x >= 16
+    ]
+    if not possible_values:
+        raise ValueError(
+            f"No valid partitions found for total SMs {total_sms} "
+            f"with constraints (min per part: {min_per_part}, multiple: {multiple})"
+        )
+
+    if len(possible_values) >= groups:
+        step = max(1, len(possible_values) // groups)
+        selected_values = possible_values[::step][:groups]
+    else:
+        selected_values = possible_values
+
+    divisions = []
+    for part1 in selected_values:
+        part2 = total_sms - part1
+        divisions.append((part1, part2))
+
+    divisions.reverse()  # Reverse to have larger prefill SM first
+
+    return divisions
+
+
+def initialize_stream_groups(gpu_id: int, config: PDMuxConfig):
+    from sgl_kernel import spatial
+
+    global STREAM_GROUPS, SM_COUNTS, SM_GROUP_NUM, CURRENT_STREAM_IDX, CURRENT_STREAM_GROUP
+    # for pd_multiplexing, Init stream_groups
+    device = torch.cuda.current_device()
+    total_sm_count = spatial.get_sm_available(gpu_id)
+    # (prefill_sm_count, decode_sm_count)
+    if config.manual_divisions:
+        divisions = [
+            (prefill_sm, decode_sm)
+            for prefill_sm, decode_sm, _ in config.manual_divisions
+        ]
+    else:
+        divisions = divide_sm(
+            total_sm_count,
+            torch.cuda.get_device_capability(device),
+            config.sm_group_num - 2,
+        )
+
+    SM_COUNTS = []
+    SM_COUNTS.append((total_sm_count, 0))  # Normal stream for prefill
+    SM_COUNTS.extend(divisions)  # Add the divided SM counts
+    SM_COUNTS.append((0, total_sm_count))  # Normal stream for decode
+    STREAM_GROUPS = []
+    STREAM_GROUPS.append(
+        (torch.cuda.Stream(gpu_id), torch.cuda.Stream(gpu_id))
+    )  # Normal stream for prefill
+    for prefill_sm, decode_sm in divisions:
+        STREAM_GROUPS.append(
+            (spatial.create_greenctx_stream_by_value(prefill_sm, decode_sm, gpu_id))
+        )
+    STREAM_GROUPS.append(
+        (torch.cuda.Stream(gpu_id), torch.cuda.Stream(gpu_id))
+    )  # Normal stream for decode
+
+    CURRENT_STREAM_IDX = 0
+    CURRENT_STREAM_GROUP = STREAM_GROUPS[CURRENT_STREAM_IDX]
+
+
+def set_current_stream_idx(idx: int):
+    global CURRENT_STREAM_IDX, CURRENT_STREAM_GROUP
+    if idx < 0 or idx >= len(STREAM_GROUPS):
+        raise ValueError(f"Invalid stream index: {idx}")
+    CURRENT_STREAM_IDX = idx
+    CURRENT_STREAM_GROUP = STREAM_GROUPS[CURRENT_STREAM_IDX]
+
+
+def get_stream_groups() -> list[tuple[torch.cuda.Stream, torch.cuda.Stream]]:
+    """Get the stream groups."""
+    return STREAM_GROUPS
+
+
+def get_sm_counts() -> list[tuple[int, int]]:
+    """Get the SM counts."""
+    return SM_COUNTS
+
+
+def get_current_stream_idx() -> int:
+    """Get the current stream index."""
+    return CURRENT_STREAM_IDX
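For reference, an illustrative config file for load_pdmux_config above. The field names come from the loader; the numbers are made up, and manual_divisions, if present, must contain sm_group_num - 2 rows of [prefill_sm, decode_sm, decode_bs_threshold]. The scheduler reads the path from server_args.pdmux_config_path.

```python
# Illustrative only; values are not tuned recommendations.
import textwrap

example_pdmux_yaml = textwrap.dedent(
    """\
    sm_group_num: 5
    manual_divisions:          # optional; omit to let divide_sm() pick the splits
      - [100, 32, 4]           # [prefill_sm, decode_sm, decode_bs_threshold]
      - [84, 48, 16]
      - [68, 64, 32]
    split_forward_token_budget: 65536
    decode_bs_divisor: 36
    """
)

with open("/tmp/pdmux_config.yaml", "w") as f:
    f.write(example_pdmux_yaml)
# load_pdmux_config("/tmp/pdmux_config.yaml") returns a PDMuxConfig with these fields.
```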
--- a/sglang/srt/parser/conversation.py
+++ b/sglang/srt/parser/conversation.py
@@ -101,6 +101,7 @@ class Conversation:
     stop_token_ids: Optional[int] = None
 
     audio_data: Optional[List[str]] = None
+    image_token_at_prefix: bool = False
 
     def get_prompt(self) -> str:
         """Get the prompt for generation."""
@@ -445,6 +446,7 @@ class Conversation:
             image_token=self.image_token,
             video_token=self.video_token,
             audio_token=self.audio_token,
+            image_token_at_prefix=self.image_token_at_prefix,
         )
 
     def dict(self):
@@ -512,6 +514,7 @@ def generate_embedding_convs(
            image_token=conv_template.image_token,
            video_token=conv_template.video_token,
            audio_token=conv_template.audio_token,
+           image_token_at_prefix=conv_template.image_token_at_prefix,
        )
        real_content = ""
 
@@ -578,6 +581,7 @@ def generate_chat_conv(
         image_token=conv.image_token,
         audio_token=conv.audio_token,
         video_token=conv.video_token,
+        image_token_at_prefix=conv.image_token_at_prefix,
     )
 
     if isinstance(request.messages, str):
@@ -627,7 +631,7 @@
                     real_content += content.text
                 elif content.type == "image_url":
                     # NOTE: works for llava and intervl2_5
-                    if conv.name in ["internvl-2-5"]:
+                    if conv.image_token_at_prefix:
                         real_content = image_token + real_content
                     else:
                         real_content += image_token
@@ -820,6 +824,7 @@ register_conv_template(
         sep="<|im_end|>\n",
         stop_str=["<|im_end|>", "<|action_end|>"],
         image_token="<IMG_CONTEXT>",
+        image_token_at_prefix=True,
     )
 )
 
@@ -848,6 +853,7 @@ register_conv_template(
         sep_style=SeparatorStyle.NO_COLON_SINGLE,
         stop_str=["<|end▁of▁sentence|>"],
         image_token="<image>",
+        image_token_at_prefix=True,
     )
 )
 
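Finally, a toy sketch of what the new image_token_at_prefix flag controls (not the Conversation class itself): it replaces the hard-coded name check against "internvl-2-5" with a per-template switch that prepends the image token instead of appending it.

```python
# Toy illustration of the placement switch only.
def place_image_token(text: str, image_token: str, image_token_at_prefix: bool) -> str:
    if image_token_at_prefix:
        return image_token + text
    return text + image_token

print(place_image_token("Describe the photo.", "<IMG_CONTEXT>", True))   # prefix style
print(place_image_token("Describe the photo.", "<image>", False))        # default append
```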