sglang 0.4.9.post4__py3-none-any.whl → 0.4.9.post6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. sglang/lang/chat_template.py +21 -0
  2. sglang/srt/configs/internvl.py +3 -0
  3. sglang/srt/configs/model_config.py +7 -0
  4. sglang/srt/constrained/base_grammar_backend.py +10 -2
  5. sglang/srt/constrained/xgrammar_backend.py +7 -5
  6. sglang/srt/conversation.py +16 -1
  7. sglang/srt/debug_utils/__init__.py +0 -0
  8. sglang/srt/debug_utils/dump_comparator.py +131 -0
  9. sglang/srt/debug_utils/dumper.py +108 -0
  10. sglang/srt/debug_utils/text_comparator.py +172 -0
  11. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +13 -1
  12. sglang/srt/disaggregation/mooncake/conn.py +16 -0
  13. sglang/srt/disaggregation/prefill.py +13 -1
  14. sglang/srt/entrypoints/engine.py +4 -2
  15. sglang/srt/entrypoints/http_server.py +13 -1
  16. sglang/srt/entrypoints/openai/protocol.py +3 -1
  17. sglang/srt/entrypoints/openai/serving_base.py +5 -2
  18. sglang/srt/entrypoints/openai/serving_chat.py +132 -79
  19. sglang/srt/function_call/ebnf_composer.py +10 -3
  20. sglang/srt/function_call/function_call_parser.py +2 -0
  21. sglang/srt/function_call/glm4_moe_detector.py +164 -0
  22. sglang/srt/function_call/qwen3_coder_detector.py +1 -0
  23. sglang/srt/layers/attention/hybrid_attn_backend.py +100 -0
  24. sglang/srt/layers/attention/vision.py +56 -8
  25. sglang/srt/layers/layernorm.py +26 -1
  26. sglang/srt/layers/logits_processor.py +14 -3
  27. sglang/srt/layers/moe/ep_moe/layer.py +323 -242
  28. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +83 -118
  29. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  30. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  31. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  32. sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -48
  33. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +11 -8
  34. sglang/srt/layers/moe/token_dispatcher/__init__.py +0 -0
  35. sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +48 -0
  36. sglang/srt/layers/moe/token_dispatcher/standard.py +19 -0
  37. sglang/srt/layers/moe/topk.py +90 -24
  38. sglang/srt/layers/multimodal.py +11 -8
  39. sglang/srt/layers/quantization/fp8.py +25 -247
  40. sglang/srt/layers/quantization/fp8_kernel.py +78 -48
  41. sglang/srt/layers/quantization/modelopt_quant.py +27 -10
  42. sglang/srt/layers/quantization/unquant.py +24 -76
  43. sglang/srt/layers/quantization/w4afp8.py +68 -17
  44. sglang/srt/lora/lora_registry.py +93 -29
  45. sglang/srt/managers/cache_controller.py +9 -7
  46. sglang/srt/managers/data_parallel_controller.py +4 -0
  47. sglang/srt/managers/io_struct.py +12 -0
  48. sglang/srt/managers/mm_utils.py +154 -35
  49. sglang/srt/managers/multimodal_processor.py +3 -14
  50. sglang/srt/managers/schedule_batch.py +14 -8
  51. sglang/srt/managers/scheduler.py +64 -1
  52. sglang/srt/managers/scheduler_input_blocker.py +106 -0
  53. sglang/srt/managers/tokenizer_manager.py +80 -15
  54. sglang/srt/managers/tp_worker.py +8 -0
  55. sglang/srt/mem_cache/hiradix_cache.py +5 -2
  56. sglang/srt/model_executor/model_runner.py +83 -27
  57. sglang/srt/models/deepseek_v2.py +75 -84
  58. sglang/srt/models/glm4_moe.py +1035 -0
  59. sglang/srt/models/glm4_moe_nextn.py +167 -0
  60. sglang/srt/models/interns1.py +328 -0
  61. sglang/srt/models/internvl.py +143 -47
  62. sglang/srt/models/llava.py +9 -5
  63. sglang/srt/models/minicpmo.py +4 -1
  64. sglang/srt/models/qwen2_moe.py +2 -2
  65. sglang/srt/models/qwen3_moe.py +17 -71
  66. sglang/srt/multimodal/processors/base_processor.py +20 -6
  67. sglang/srt/multimodal/processors/clip.py +2 -2
  68. sglang/srt/multimodal/processors/deepseek_vl_v2.py +2 -2
  69. sglang/srt/multimodal/processors/gemma3.py +2 -2
  70. sglang/srt/multimodal/processors/gemma3n.py +2 -2
  71. sglang/srt/multimodal/processors/internvl.py +21 -8
  72. sglang/srt/multimodal/processors/janus_pro.py +2 -2
  73. sglang/srt/multimodal/processors/kimi_vl.py +2 -2
  74. sglang/srt/multimodal/processors/llava.py +4 -4
  75. sglang/srt/multimodal/processors/minicpm.py +2 -3
  76. sglang/srt/multimodal/processors/mlama.py +2 -2
  77. sglang/srt/multimodal/processors/mllama4.py +18 -111
  78. sglang/srt/multimodal/processors/phi4mm.py +2 -2
  79. sglang/srt/multimodal/processors/pixtral.py +2 -2
  80. sglang/srt/multimodal/processors/qwen_audio.py +2 -2
  81. sglang/srt/multimodal/processors/qwen_vl.py +2 -2
  82. sglang/srt/multimodal/processors/vila.py +3 -1
  83. sglang/srt/poll_based_barrier.py +31 -0
  84. sglang/srt/reasoning_parser.py +2 -1
  85. sglang/srt/server_args.py +65 -6
  86. sglang/srt/two_batch_overlap.py +8 -3
  87. sglang/srt/utils.py +96 -1
  88. sglang/srt/weight_sync/utils.py +119 -0
  89. sglang/test/runners.py +4 -0
  90. sglang/test/test_utils.py +118 -5
  91. sglang/utils.py +19 -0
  92. sglang/version.py +1 -1
  93. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post6.dist-info}/METADATA +5 -4
  94. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post6.dist-info}/RECORD +97 -80
  95. sglang/srt/debug_utils.py +0 -74
  96. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post6.dist-info}/WHEEL +0 -0
  97. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post6.dist-info}/licenses/LICENSE +0 -0
  98. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post6.dist-info}/top_level.txt +0 -0
sglang/srt/multimodal/processors/janus_pro.py CHANGED
@@ -11,8 +11,8 @@ from sglang.srt.multimodal.processors.base_processor import (
 class JanusProImageProcessor(BaseMultimodalProcessor):
     models = [MultiModalityCausalLM]
 
-    def __init__(self, hf_config, server_args, _processor):
-        super().__init__(hf_config, server_args, _processor)
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
 
         self.mm_tokens = MultimodalSpecialTokens(
             image_token=_processor.image_token,
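Note: the same `*args, **kwargs` forwarding is applied to every multimodal processor in this release. A minimal sketch of why it matters, not taken from sglang (the base class body and the `extra_option` parameter are purely hypothetical): subclasses that forward everything keep working when the caller starts passing new constructor arguments.

# Hypothetical illustration only; not the sglang BaseMultimodalProcessor.
class BaseMultimodalProcessor:
    def __init__(self, hf_config, server_args, _processor, extra_option=None):
        self.hf_config = hf_config
        self.server_args = server_args
        self._processor = _processor
        self.extra_option = extra_option


class MyImageProcessor(BaseMultimodalProcessor):
    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
        # Forward everything so new base-class parameters flow through unchanged.
        super().__init__(hf_config, server_args, _processor, *args, **kwargs)


proc = MyImageProcessor(None, None, None, extra_option="x")
assert proc.extra_option == "x"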
sglang/srt/multimodal/processors/kimi_vl.py CHANGED
@@ -12,8 +12,8 @@ from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTok
 class KimiVLImageProcessor(SGLangBaseProcessor):
     models = [KimiVLForConditionalGeneration]
 
-    def __init__(self, hf_config, server_args, _processor):
-        super().__init__(hf_config, server_args, _processor)
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
         self.mm_tokens = MultimodalSpecialTokens(
             image_token="<|media_pad|>",
             # TODO: could we convert in MultimodalSpecialTokens?
sglang/srt/multimodal/processors/llava.py CHANGED
@@ -30,8 +30,8 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
         LlavaMistralForCausalLM,
     ]
 
-    def __init__(self, hf_config, server_args, _processor):
-        super().__init__(hf_config, server_args, _processor)
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
 
     @staticmethod
     def _process_single_image_task(
@@ -187,7 +187,7 @@ class LlavaMultimodalProcessor(BaseMultimodalProcessor):
             f"Cannot find corresponding multimodal processor registered in sglang for model type `{model_type}`"
         )
 
-    def __init__(self, hf_config, server_args, _processor):
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
         assert hasattr(hf_config, "vision_config")
         assert hasattr(hf_config, "text_config")
         self.vision_config = hf_config.vision_config
@@ -196,7 +196,7 @@ class LlavaMultimodalProcessor(BaseMultimodalProcessor):
 
         if vision_type := getattr(self.vision_config, "model_type"):
             self.inner = self._get_sgl_processor_cls(vision_type)(
-                hf_config, server_args, _processor
+                hf_config, server_args, _processor, *args, **kwargs
             )
         else:
             raise ValueError(
sglang/srt/multimodal/processors/minicpm.py CHANGED
@@ -15,8 +15,8 @@ from sglang.srt.multimodal.processors.base_processor import (
 class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
     models = [MiniCPMV, MiniCPMO]
 
-    def __init__(self, hf_config, server_args, _processor):
-        super().__init__(hf_config, server_args, _processor)
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
         # Collect special token ids
         tokenizer = self._processor.tokenizer
         self.slice_start_id = getattr(tokenizer, "slice_start_id", None)
@@ -26,7 +26,6 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
         self.im_start_id = getattr(tokenizer, "im_start_id", None)
         self.im_end_id = getattr(tokenizer, "im_end_id", None)
         self.im_token_id = getattr(tokenizer, "unk_id", None)
-
         self.mm_tokens = MultimodalSpecialTokens(
             image_token="(<image>./</image>)",
             audio_token="(<audio>./</audio>)",
sglang/srt/multimodal/processors/mlama.py CHANGED
@@ -10,8 +10,8 @@ from sglang.srt.multimodal.processors.base_processor import (
 class MllamaImageProcessor(BaseMultimodalProcessor):
     models = [MllamaForConditionalGeneration]
 
-    def __init__(self, hf_config, server_args, _processor):
-        super().__init__(hf_config, server_args, _processor)
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
         self.mm_tokens = MultimodalSpecialTokens(
             image_token=self._processor.image_token,
             image_token_id=self._processor.image_token_id,
sglang/srt/multimodal/processors/mllama4.py CHANGED
@@ -18,16 +18,16 @@ from sglang.srt.multimodal.processors.base_processor import (
 class Mllama4ImageProcessor(BaseMultimodalProcessor):
     models = [Llama4ForConditionalGeneration]
 
-    def __init__(self, hf_config, server_args, _processor):
-        super().__init__(hf_config, server_args, _processor)
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
         self.vision_config = hf_config.vision_config
         self.text_config = hf_config.text_config
-        self.boi_token_index = hf_config.boi_token_index
-        self.eoi_token_index = hf_config.eoi_token_index
-        self.image_token_index = hf_config.image_token_index
-        self.multimodal_tokens = MultimodalSpecialTokens(
+        self.IM_START_TOKEN_ID = hf_config.boi_token_index
+        self.IM_END_TOKEN_ID = hf_config.eoi_token_index
+        self.IM_TOKEN_ID = hf_config.image_token_index
+        self.mm_tokens = MultimodalSpecialTokens(
             image_token=_processor.image_token,
-            image_token_id=self.image_token_index,
+            image_token_id=self.IM_TOKEN_ID,
         ).build(_processor)
 
     async def process_mm_data_async(
@@ -37,114 +37,21 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
         *args,
         **kwargs,
     ):
-        if isinstance(input_text, list):
-            assert len(input_text) and isinstance(input_text[0], int)
-            input_text = self._processor.tokenizer.decode(input_text)
-
-        # Process images and text using the base processor's load_mm_data method
-        processed_data = self.load_mm_data(
+        base_output = self.load_mm_data(
             prompt=input_text,
-            multimodal_tokens=self.multimodal_tokens,
             image_data=image_data,
-            return_text=True,
+            multimodal_tokens=self.mm_tokens,
         )
 
-        # Process the images using the processor
-        processor = self._processor
-
         # Process the prompt and images
-        processor_output = self.process_mm_data(
-            input_text=processed_data.input_text,
-            images=processed_data.images,
-        )
-
-        # Handle image resolutions and aspect ratios
-        if "pixel_values" not in processor_output:  # no image processed
-            return None
-
-        image_processor = processor.image_processor
-        tokenizer = self._processor.tokenizer
-
-        # Calculate tile size and find supported resolutions
-        tile_size = self.vision_config.image_size
-        max_num_tiles = getattr(self.vision_config, "max_patches", 1)
-
-        possible_resolutions = find_supported_resolutions(
-            max_num_chunks=max_num_tiles,
-            patch_size=SizeDict(height=tile_size, width=tile_size),
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
         )
 
-        # Find best fit for each image
-        best_fit_sizes = [
-            get_best_fit(
-                (image.size[1], image.size[0]),  # (height, width)
-                torch.tensor(possible_resolutions),
-                resize_to_max_canvas=image_processor.resize_to_max_canvas,
-            )
-            for image in processed_data.images
-        ]
-
-        # Calculate aspect ratios and patches per image
-        aspect_ratios = [
-            (image_size[0] // tile_size, image_size[1] // tile_size)
-            for image_size in best_fit_sizes
-        ]
-
-        patches_per_image = [
-            1 if r_h * r_w == 1 else 1 + r_h * r_w for (r_h, r_w) in aspect_ratios
-        ]
-
-        # Add to image_inputs
-        processor_output["aspect_ratios"] = aspect_ratios
-        processor_output["patches_per_image"] = torch.tensor(patches_per_image)
-
-        # Process embed_is_patch
-        vocab = tokenizer.get_vocab()
-        patch_id = vocab.get(processor.img_patch_token, -1)
-        image_end_id = vocab.get(processor.end_of_img_token, -1)
-
-        if patch_id != -1 and image_end_id != -1:
-            input_ids = processor_output["input_ids"].view(-1)
-
-            # Remove BOS token if present
-            if input_ids.size(0) > 0 and input_ids[0] == tokenizer.bos_token_id:
-                input_ids = input_ids[1:]
-
-            # Find image end indices and split input_ids
-            image_end_indices = (input_ids == image_end_id).nonzero().view(-1)
-
-            if image_end_indices.size(0) > 0:
-                # Split at image boundaries
-                split_indices = (image_end_indices + 1)[:-1]
-                split_input_ids = torch.tensor_split(input_ids, split_indices)
-                split_input_ids = [x for x in split_input_ids if x.numel() > 0]
-
-                # Create embed_is_patch for each image
-                embed_is_patch = []
-                for per_image_input_ids in split_input_ids:
-                    embed_is_patch.append(per_image_input_ids == patch_id)
-
-                processor_output["embed_is_patch"] = embed_is_patch
-
-        # Convert to the format expected by SGLang
-        processor_output["input_ids"] = processor_output["input_ids"].tolist()[0]
-
-        processor_output["im_start_id"] = self.boi_token_index
-        processor_output["im_end_id"] = self.eoi_token_index
-        processor_output["im_token_id"] = self.image_token_index
-
-        image_offsets = self.get_mm_items_offset(
-            input_ids=torch.tensor(processor_output["input_ids"]),
-            mm_token_id=self.image_token_index,
-        )
-
-        # Add metadata for image processing
-        processor_output["mm_items"] = [
-            MultimodalDataItem(
-                feature=processor_output["pixel_values"],
-                modality=Modality.IMAGE,
-                offsets=image_offsets,
-            )
-        ]
-
-        return processor_output
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+            "im_start_id": self.IM_START_TOKEN_ID,
+            "im_end_id": self.IM_END_TOKEN_ID,
+            "im_token_id": self.IM_TOKEN_ID,
+        }
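Note: the rewrite above collapses roughly a hundred lines of Llama-4-specific plumbing into the shared load_mm_data / process_and_combine_mm_data path (mm_utils.py also changes in this diff). A rough sketch of what that single helper call now stands in for, pieced together from the deleted code; this is only an approximation of its contract, not the actual implementation.

# Approximate sketch, assembled from the removed code above; not sglang source.
def process_and_combine_mm_data_sketch(self, base_output, mm_tokens):
    # Run the HF processor over the already-loaded prompt and images.
    processor_output = self.process_mm_data(
        input_text=base_output.input_text, images=base_output.images
    )
    input_ids = processor_output["input_ids"].view(-1)
    # Find where the image tokens sit and wrap the features as MultimodalDataItems.
    offsets = self.get_mm_items_offset(
        input_ids=input_ids, mm_token_id=mm_tokens.image_token_id
    )
    mm_items = [
        MultimodalDataItem(
            feature=processor_output["pixel_values"],
            modality=Modality.IMAGE,
            offsets=offsets,
        )
    ]
    return mm_items, input_ids, processor_output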
sglang/srt/multimodal/processors/phi4mm.py CHANGED
@@ -47,9 +47,9 @@ class Phi4MMProcessorAdapter(ProcessorMixin):
 class Phi4MMMultimodalProcessor(BaseMultimodalProcessor):
     models = [Phi4MMForCausalLM]
 
-    def __init__(self, hf_config, server_args, _processor):
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
         self.processor = Phi4MMProcessorAdapter(_processor)
-        super().__init__(hf_config, server_args, self.processor)
+        super().__init__(hf_config, server_args, self.processor, *args, **kwargs)
 
         # the following CONSTANTS come from hugging-face microsoft/Phi-4-multimodal-instruct's processing_phi4mm.py file
         # ref: https://huggingface.co/microsoft/Phi-4-multimodal-instruct/blob/main/processing_phi4mm.py
sglang/srt/multimodal/processors/pixtral.py CHANGED
@@ -42,8 +42,8 @@ class PixtralProcessor(BaseMultimodalProcessor):
 
         return ncols, nrows
 
-    def __init__(self, hf_config, server_args, _processor):
-        super().__init__(hf_config, server_args, _processor)
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
         self.IM_TOKEN_ID = getattr(
             hf_config, "image_token_index", PixtralVisionModel.DEFAULT_IMAGE_TOKEN_ID
         )
sglang/srt/multimodal/processors/qwen_audio.py CHANGED
@@ -11,8 +11,8 @@ from sglang.srt.multimodal.processors.base_processor import (
 class Qwen2AudioMultimodalProcessor(BaseMultimodalProcessor):
     models = [Qwen2AudioForConditionalGeneration]
 
-    def __init__(self, hf_config, server_args, _processor):
-        super().__init__(hf_config, server_args, _processor)
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
         self.AUDIO_TOKEN = "<|audio_bos|><|AUDIO|><|audio_eos|>"
         self.AUDIO_TOKEN_REGEX = re.compile(
             r"<\|audio_bos\|>(?:<\|AUDIO\|>)+<\|audio_eos\|>"
sglang/srt/multimodal/processors/qwen_vl.py CHANGED
@@ -201,8 +201,8 @@ async def preprocess_video(
 class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
     models = [Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration]
 
-    def __init__(self, hf_config, server_args, _processor):
-        super().__init__(hf_config, server_args, _processor)
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
         # The regex that matches expanded image tokens.
         self.IM_START_TOKEN_ID = hf_config.vision_start_token_id
         self.IM_END_TOKEN_ID = hf_config.vision_end_token_id
sglang/srt/multimodal/processors/vila.py CHANGED
@@ -34,8 +34,10 @@ class VILAMultimodalProcessor(BaseMultimodalProcessor):
         hf_config: PretrainedConfig,
         server_args: ServerArgs,
         _processor: VILAProcessor,
+        *args,
+        **kwargs,
     ) -> None:
-        super().__init__(hf_config, server_args, _processor)
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
         self.mm_tokens = MultimodalSpecialTokens(
             image_token=self._processor.tokenizer.image_token,
             image_token_id=hf_config.image_token_id,
sglang/srt/poll_based_barrier.py ADDED
@@ -0,0 +1,31 @@
+import torch
+
+from sglang.srt.distributed import get_world_group
+
+
+class PollBasedBarrier:
+    def __init__(self, noop: bool = False):
+        self._noop = noop
+        self._local_arrived = False
+
+    def local_arrive(self):
+        assert not self._local_arrived
+        self._local_arrived = True
+
+    def poll_global_arrived(self) -> bool:
+        global_arrived = self._compute_global_arrived()
+        output = self._local_arrived and global_arrived
+        if output:
+            self._local_arrived = False
+        return output
+
+    def _compute_global_arrived(self) -> bool:
+        local_arrived = self._noop or self._local_arrived
+        global_arrived = torch.tensor(local_arrived)
+        # Can optimize if bottleneck
+        torch.distributed.all_reduce(
+            global_arrived,
+            torch.distributed.ReduceOp.MIN,
+            group=get_world_group().cpu_group,
+        )
+        return global_arrived.item()
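Note: a hypothetical driver loop for the new barrier, for illustration only (not taken from sglang). Since _compute_global_arrived() issues an all_reduce on every call, every rank in the group must keep polling in step; the MIN reduction only reports arrival once all ranks have called local_arrive(), and the barrier then resets itself for the next cycle.

# Hypothetical usage sketch; poll_step and this_rank_ready are placeholders.
barrier = PollBasedBarrier()
arrived = False  # track locally so local_arrive() is called at most once per cycle

def poll_step(this_rank_ready: bool) -> bool:
    global arrived
    if this_rank_ready and not arrived:
        barrier.local_arrive()
        arrived = True
    done = barrier.poll_global_arrived()  # True exactly once, when all ranks arrived
    if done:
        arrived = False
    return done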
sglang/srt/reasoning_parser.py CHANGED
@@ -32,7 +32,7 @@ class BaseReasoningFormatDetector:
         One-time parsing: Detects and parses reasoning sections in the provided text.
         Returns both reasoning content and normal text separately.
         """
-        in_reasoning = self._in_reasoning or text.startswith(self.think_start_token)
+        in_reasoning = self._in_reasoning or self.think_start_token in text
 
         if not in_reasoning:
            return StreamingParseResult(normal_text=text)
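Note: the change above broadens one-shot detection so a think-start tag that is not at position 0 (for example, preceded by a newline) still counts as reasoning. A minimal, self-contained illustration; the "<think>" value mirrors the common DeepSeek-R1-style convention, while the real detectors configure their own tokens.

think_start_token = "<think>"

def old_rule(text: str) -> bool:
    return text.startswith(think_start_token)

def new_rule(text: str) -> bool:
    return think_start_token in text

text = "\n<think>reason about the task...</think>The answer is 42."
print(old_rule(text))  # False - the leading newline defeats startswith()
print(new_rule(text))  # True  - the reasoning block is still detected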
@@ -231,6 +231,7 @@ class ReasoningParser:
         "deepseek-r1": DeepSeekR1Detector,
         "qwen3": Qwen3Detector,
         "qwen3-thinking": Qwen3ThinkingDetector,
+        "glm45": Qwen3Detector,
         "kimi": KimiDetector,
     }
 
sglang/srt/server_args.py CHANGED
@@ -19,6 +19,7 @@ import json
 import logging
 import os
 import random
+import sys
 import tempfile
 from typing import List, Literal, Optional, Union
 
@@ -74,6 +75,7 @@ class ServerArgs:
     # Memory and scheduling
     mem_fraction_static: Optional[float] = None
     max_running_requests: Optional[int] = None
+    max_queued_requests: Optional[int] = sys.maxsize
     max_total_tokens: Optional[int] = None
     chunked_prefill_size: Optional[int] = None
     max_prefill_tokens: int = 16384
@@ -151,6 +153,8 @@ class ServerArgs:
 
     # Kernel backend
     attention_backend: Optional[str] = None
+    decode_attention_backend: Optional[str] = None
+    prefill_attention_backend: Optional[str] = None
     sampling_backend: Optional[str] = None
     grammar_backend: Optional[str] = None
     mm_attention_backend: Optional[str] = None
@@ -169,7 +173,8 @@ class ServerArgs:
     ep_size: int = 1
     enable_ep_moe: bool = False
     enable_deepep_moe: bool = False
-    enable_flashinfer_moe: bool = False
+    enable_flashinfer_cutlass_moe: bool = False
+    enable_flashinfer_trtllm_moe: bool = False
     enable_flashinfer_allreduce_fusion: bool = False
     deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
     ep_num_redundant_experts: int = 0
@@ -386,13 +391,19 @@ class ServerArgs:
             )
             self.page_size = 128
 
-        if self.attention_backend == "flashmla":
+        if (
+            self.attention_backend == "flashmla"
+            or self.decode_attention_backend == "flashmla"
+        ):
             logger.warning(
                 "FlashMLA only supports a page_size of 64, change page_size to 64."
             )
             self.page_size = 64
 
-        if self.attention_backend == "cutlass_mla":
+        if (
+            self.attention_backend == "cutlass_mla"
+            or self.decode_attention_backend == "cutlass_mla"
+        ):
             logger.warning(
                 "Cutlass MLA only supports a page_size of 128, change page_size to 128."
             )
@@ -428,12 +439,16 @@ class ServerArgs:
         ), "Please enable dp attention when setting enable_dp_lm_head. "
 
         # MoE kernel
-        if self.enable_flashinfer_moe:
+        if self.enable_flashinfer_cutlass_moe:
             assert (
                 self.quantization == "modelopt_fp4"
             ), "modelopt_fp4 quantization is required for Flashinfer MOE"
             os.environ["TRTLLM_ENABLE_PDL"] = "1"
 
+        if self.enable_flashinfer_trtllm_moe:
+            assert self.enable_ep_moe, "EP MoE is required for Flashinfer TRTLLM MOE"
+            logger.warning(f"Flashinfer TRTLLM MoE is enabled.")
+
         # DeepEP MoE
         if self.enable_deepep_moe:
             if self.deepep_mode == "normal":
@@ -458,6 +473,9 @@ class ServerArgs:
                 "EPLB is enabled or init_expert_location is provided. ep_dispatch_algorithm is configured."
             )
 
+        if self.enable_eplb:
+            assert self.enable_ep_moe or self.enable_deepep_moe
+
         if self.enable_expert_distribution_metrics and (
             self.expert_distribution_recorder_mode is None
         ):
@@ -497,7 +515,7 @@ class ServerArgs:
             )
 
         model_arch = self.get_hf_config().architectures[0]
-        if model_arch == "DeepseekV3ForCausalLM":
+        if model_arch in ["DeepseekV3ForCausalLM", "Glm4MoeForCausalLM"]:
             # Auto set draft_model_path DeepSeek-V3/R1
             if self.speculative_draft_model_path is None:
                 self.speculative_draft_model_path = self.model_path
@@ -789,6 +807,12 @@ class ServerArgs:
             default=ServerArgs.max_running_requests,
             help="The maximum number of running requests.",
         )
+        parser.add_argument(
+            "--max-queued-requests",
+            type=int,
+            default=ServerArgs.max_queued_requests,
+            help="The maximum number of queued requests. This option is ignored when using disaggregation-mode.",
+        )
         parser.add_argument(
             "--max-total-tokens",
             type=int,
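Note: --max-queued-requests backs the new ServerArgs.max_queued_requests field shown earlier; the sys.maxsize default keeps the previous behavior (no queue cap) unless the flag is passed. A quick check of the default, with an illustrative launch line in the comment (the model path and request limits are placeholders):

#   python -m sglang.launch_server --model-path <model> \
#       --max-running-requests 128 --max-queued-requests 256
import sys

from sglang.srt.server_args import ServerArgs

# The dataclass default is sys.maxsize, i.e. effectively unbounded queueing.
print(ServerArgs.max_queued_requests == sys.maxsize)  # True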
@@ -1092,6 +1116,7 @@
                 "pythonic",
                 "kimi_k2",
                 "qwen3_coder",
+                "glm45",
             ],
             default=ServerArgs.tool_call_parser,
             help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', 'kimi_k2', and 'qwen3_coder'.",
@@ -1205,6 +1230,35 @@ class ServerArgs:
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
         )
+        parser.add_argument(
+            "--decode-attention-backend",
+            type=str,
+            choices=[
+                "flashinfer",
+                "triton",
+                "torch_native",
+                "fa3",
+                "flashmla",
+                "cutlass_mla",
+            ],
+            default=ServerArgs.decode_attention_backend,
+            help="Choose the kernels for decode attention layers (have priority over --attention-backend).",
+        )
+
+        parser.add_argument(
+            "--prefill-attention-backend",
+            type=str,
+            choices=[
+                "flashinfer",
+                "triton",
+                "torch_native",
+                "fa3",
+                "flashmla",
+                "cutlass_mla",
+            ],
+            default=ServerArgs.prefill_attention_backend,
+            help="Choose the kernels for prefill attention layers (have priority over --attention-backend).",
+        )
         parser.add_argument(
             "--sampling-backend",
             type=str,
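Note: the two new flags override --attention-backend per phase, and unset (None) means "fall back to the global choice". Illustrative launch lines only; the model path is a placeholder, and whether a given prefill/decode pairing is supported depends on the model and hardware (the new hybrid_attn_backend.py in this diff wires the two backends together).

#   # one backend for both phases (unchanged behavior)
#   python -m sglang.launch_server --model-path <model> --attention-backend fa3
#
#   # per-phase backends; these take priority over --attention-backend
#   python -m sglang.launch_server --model-path <model> \
#       --prefill-attention-backend fa3 --decode-attention-backend flashinfer
from sglang.srt.server_args import ServerArgs

# Both default to None, meaning "use --attention-backend".
print(ServerArgs.decode_attention_backend, ServerArgs.prefill_attention_backend)  # None None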
@@ -1290,10 +1344,15 @@
             help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
         )
         parser.add_argument(
-            "--enable-flashinfer-moe",
+            "--enable-flashinfer-cutlass-moe",
             action="store_true",
             help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP with --enable-ep-moe",
         )
+        parser.add_argument(
+            "--enable-flashinfer-trtllm-moe",
+            action="store_true",
+            help="Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP with --enable-ep-moe",
+        )
         parser.add_argument(
             "--enable-flashinfer-allreduce-fusion",
             action="store_true",
sglang/srt/two_batch_overlap.py CHANGED
@@ -1,7 +1,9 @@
+from __future__ import annotations
+
 import dataclasses
 import logging
 from dataclasses import replace
-from typing import Dict, List, Optional, Sequence, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Sequence, Union
 
 import torch
 
@@ -20,6 +22,9 @@ from sglang.srt.operations_strategy import OperationsStrategy
 from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
 from sglang.srt.utils import BumpAllocator, DeepEPMode, get_bool_env_var
 
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.ep_moe.token_dispatcher import DispatchOutput
+
 _tbo_debug = get_bool_env_var("SGLANG_TBO_DEBUG")
 
 logger = logging.getLogger(__name__)
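Note: the import changes above are the standard PEP 563 pattern. With from __future__ import annotations, the DispatchOutput annotations added below stay unevaluated strings, so the token_dispatcher import is only needed by type checkers and never executes at runtime (avoiding import cost or cycles). A generic, self-contained illustration; the module and type names here are placeholders, not sglang code.

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only type checkers follow this import; it never runs, so a missing or
    # cyclic module costs nothing at runtime.
    from heavy_module import ExpensiveType


def build(**kwargs) -> ExpensiveType:
    # With postponed evaluation, the annotation above is kept as the string
    # "ExpensiveType" and is never resolved when this module is imported.
    ...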
@@ -802,7 +807,7 @@ class MaybeTboDeepEPDispatcher:
     def _execute(self, name, tbo_subbatch_index: Optional[int] = None, **kwargs):
         return getattr(self._inners[tbo_subbatch_index or 0], name)(**kwargs)
 
-    def dispatch(self, **kwargs):
+    def dispatch(self, **kwargs) -> DispatchOutput:
         return self._execute("dispatch", **kwargs)
 
     def dispatch_a(self, **kwargs):
@@ -811,7 +816,7 @@ class MaybeTboDeepEPDispatcher:
     def dispatch_b(self, **kwargs):
         return self._execute("dispatch_b", **kwargs)
 
-    def combine(self, **kwargs):
+    def combine(self, **kwargs) -> torch.Tensor:
         return self._execute("combine", **kwargs)
 
     def combine_a(self, **kwargs):