sglang 0.5.4__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- sglang/bench_serving.py +56 -12
 - sglang/launch_server.py +2 -0
 - sglang/srt/batch_invariant_ops/batch_invariant_ops.py +101 -4
 - sglang/srt/compilation/backend.py +1 -1
 - sglang/srt/configs/model_config.py +5 -5
 - sglang/srt/distributed/parallel_state.py +0 -7
 - sglang/srt/entrypoints/engine.py +18 -15
 - sglang/srt/entrypoints/grpc_server.py +0 -1
 - sglang/srt/entrypoints/http_server.py +75 -94
 - sglang/srt/environ.py +16 -2
 - sglang/srt/eplb/expert_distribution.py +30 -0
 - sglang/srt/function_call/function_call_parser.py +2 -0
 - sglang/srt/function_call/minimax_m2.py +367 -0
 - sglang/srt/layers/activation.py +6 -0
 - sglang/srt/layers/attention/flashattention_backend.py +12 -2
 - sglang/srt/layers/attention/flashinfer_backend.py +10 -1
 - sglang/srt/layers/attention/flashinfer_mla_backend.py +18 -10
 - sglang/srt/layers/attention/trtllm_mla_backend.py +1 -13
 - sglang/srt/layers/attention/utils.py +78 -0
 - sglang/srt/layers/communicator.py +1 -0
 - sglang/srt/layers/deep_gemm_wrapper/compile_utils.py +1 -1
 - sglang/srt/layers/layernorm.py +19 -4
 - sglang/srt/layers/logits_processor.py +5 -0
 - sglang/srt/layers/moe/cutlass_w4a8_moe.py +138 -0
 - sglang/srt/layers/moe/ep_moe/kernels.py +194 -0
 - sglang/srt/layers/moe/ep_moe/layer.py +79 -272
 - sglang/srt/layers/moe/fused_moe_triton/layer.py +3 -3
 - sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +7 -4
 - sglang/srt/layers/moe/moe_runner/deep_gemm.py +287 -22
 - sglang/srt/layers/moe/moe_runner/runner.py +3 -0
 - sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
 - sglang/srt/layers/moe/token_dispatcher/__init__.py +4 -4
 - sglang/srt/layers/moe/token_dispatcher/base.py +11 -5
 - sglang/srt/layers/moe/token_dispatcher/deepep.py +18 -14
 - sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
 - sglang/srt/layers/moe/topk.py +4 -4
 - sglang/srt/layers/moe/utils.py +3 -4
 - sglang/srt/layers/quantization/__init__.py +3 -5
 - sglang/srt/layers/quantization/awq.py +0 -3
 - sglang/srt/layers/quantization/base_config.py +7 -0
 - sglang/srt/layers/quantization/fp8.py +68 -63
 - sglang/srt/layers/quantization/gguf.py +566 -0
 - sglang/srt/layers/quantization/mxfp4.py +30 -38
 - sglang/srt/layers/quantization/unquant.py +23 -45
 - sglang/srt/layers/quantization/w4afp8.py +38 -2
 - sglang/srt/layers/radix_attention.py +5 -2
 - sglang/srt/layers/rotary_embedding.py +13 -1
 - sglang/srt/layers/sampler.py +12 -1
 - sglang/srt/managers/io_struct.py +3 -0
 - sglang/srt/managers/multi_tokenizer_mixin.py +17 -1
 - sglang/srt/managers/scheduler.py +21 -15
 - sglang/srt/managers/scheduler_metrics_mixin.py +22 -14
 - sglang/srt/managers/scheduler_profiler_mixin.py +3 -4
 - sglang/srt/managers/tokenizer_manager.py +11 -19
 - sglang/srt/mem_cache/hicache_storage.py +7 -1
 - sglang/srt/mem_cache/memory_pool.py +82 -0
 - sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
 - sglang/srt/model_executor/forward_batch_info.py +44 -3
 - sglang/srt/model_executor/model_runner.py +1 -149
 - sglang/srt/model_executor/piecewise_cuda_graph_runner.py +22 -12
 - sglang/srt/models/deepseek_v2.py +147 -44
 - sglang/srt/models/glm4_moe.py +322 -354
 - sglang/srt/models/glm4_moe_nextn.py +4 -14
 - sglang/srt/models/glm4v_moe.py +29 -196
 - sglang/srt/models/minimax_m2.py +922 -0
 - sglang/srt/models/nvila.py +355 -0
 - sglang/srt/models/nvila_lite.py +184 -0
 - sglang/srt/models/qwen2.py +22 -1
 - sglang/srt/models/qwen3.py +34 -4
 - sglang/srt/models/qwen3_moe.py +2 -4
 - sglang/srt/multimodal/processors/base_processor.py +1 -0
 - sglang/srt/multimodal/processors/glm4v.py +1 -1
 - sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
 - sglang/srt/multimodal/processors/points_v15_chat.py +2 -2
 - sglang/srt/parser/reasoning_parser.py +28 -1
 - sglang/srt/server_args.py +365 -186
 - sglang/srt/single_batch_overlap.py +2 -7
 - sglang/srt/utils/common.py +87 -42
 - sglang/srt/utils/hf_transformers_utils.py +7 -3
 - sglang/test/test_deterministic.py +235 -12
 - sglang/test/test_deterministic_utils.py +2 -1
 - sglang/version.py +1 -1
 - {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +7 -6
 - {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +87 -82
 - sglang/srt/models/vila.py +0 -306
 - {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
 - {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
 - {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
 
| 
         @@ -1,64 +1,72 @@ 
     | 
|
| 
       1 
     | 
    
         
            -
            from typing import Any 
     | 
| 
      
 1 
     | 
    
         
            +
            from typing import Any
         
     | 
| 
       2 
2 
     | 
    
         | 
| 
       3 
3 
     | 
    
         
             
            import torch.nn as nn
         
     | 
| 
       4 
4 
     | 
    
         
             
            from transformers.configuration_utils import PretrainedConfig
         
     | 
| 
       5 
5 
     | 
    
         
             
            from transformers.processing_utils import ProcessorMixin
         
     | 
| 
       6 
6 
     | 
    
         
             
            from transformers.tokenization_utils_base import PreTrainedTokenizerBase
         
     | 
| 
       7 
7 
     | 
    
         | 
| 
       8 
     | 
    
         
            -
            from sglang.srt.managers.io_struct import  
     | 
| 
       9 
     | 
    
         
            -
             
     | 
| 
       10 
     | 
    
         
            -
             
     | 
| 
       11 
     | 
    
         
            -
                ImageDataInputItem,
         
     | 
| 
       12 
     | 
    
         
            -
            )
         
     | 
| 
       13 
     | 
    
         
            -
            from sglang.srt.models.vila import VILAForConditionalGeneration
         
     | 
| 
      
 8 
     | 
    
         
            +
            from sglang.srt.managers.io_struct import GenerateReqInput
         
     | 
| 
      
 9 
     | 
    
         
            +
            from sglang.srt.models.nvila import NVILAForConditionalGeneration
         
     | 
| 
      
 10 
     | 
    
         
            +
            from sglang.srt.models.nvila_lite import NVILALiteForConditionalGeneration
         
     | 
| 
       14 
11 
     | 
    
         
             
            from sglang.srt.multimodal.processors.base_processor import (
         
     | 
| 
       15 
12 
     | 
    
         
             
                BaseMultimodalProcessor,
         
     | 
| 
       16 
13 
     | 
    
         
             
                MultimodalSpecialTokens,
         
     | 
| 
       17 
14 
     | 
    
         
             
            )
         
     | 
| 
       18 
15 
     | 
    
         
             
            from sglang.srt.server_args import ServerArgs
         
     | 
| 
       19 
16 
     | 
    
         | 
| 
      
 17 
     | 
    
         
            +
            NUM_VIDEO_FRAMES = 8
         
     | 
| 
       20 
18 
     | 
    
         | 
| 
       21 
     | 
    
         
            -
            class VILAProcessor(ProcessorMixin):
         
     | 
| 
       22 
     | 
    
         
            -
                """A stub class for the VILA processor."""
         
     | 
| 
       23 
     | 
    
         
            -
             
     | 
| 
       24 
     | 
    
         
            -
                tokenizer: PreTrainedTokenizerBase
         
     | 
| 
       25 
     | 
    
         
            -
             
     | 
| 
       26 
     | 
    
         
            -
             
     | 
| 
       27 
     | 
    
         
            -
            class VILAMultimodalProcessor(BaseMultimodalProcessor):
         
     | 
| 
       28 
     | 
    
         
            -
                models: List[Type[nn.Module]] = [VILAForConditionalGeneration]
         
     | 
| 
       29 
19 
     | 
    
         | 
| 
       30 
     | 
    
         
            -
             
     | 
| 
      
 20 
     | 
    
         
            +
            class NVILAMultimodalProcessor(BaseMultimodalProcessor):
         
     | 
| 
      
 21 
     | 
    
         
            +
                models: list[type[nn.Module]] = [
         
     | 
| 
      
 22 
     | 
    
         
            +
                    NVILAForConditionalGeneration,
         
     | 
| 
      
 23 
     | 
    
         
            +
                    NVILALiteForConditionalGeneration,
         
     | 
| 
      
 24 
     | 
    
         
            +
                ]
         
     | 
| 
       31 
25 
     | 
    
         | 
| 
       32 
26 
     | 
    
         
             
                def __init__(
         
     | 
| 
       33 
27 
     | 
    
         
             
                    self,
         
     | 
| 
       34 
28 
     | 
    
         
             
                    hf_config: PretrainedConfig,
         
     | 
| 
       35 
29 
     | 
    
         
             
                    server_args: ServerArgs,
         
     | 
| 
       36 
     | 
    
         
            -
                    _processor:  
     | 
| 
      
 30 
     | 
    
         
            +
                    _processor: ProcessorMixin,
         
     | 
| 
       37 
31 
     | 
    
         
             
                    *args,
         
     | 
| 
       38 
32 
     | 
    
         
             
                    **kwargs,
         
     | 
| 
       39 
33 
     | 
    
         
             
                ) -> None:
         
     | 
| 
       40 
34 
     | 
    
         
             
                    super().__init__(hf_config, server_args, _processor, *args, **kwargs)
         
     | 
| 
      
 35 
     | 
    
         
            +
             
     | 
| 
      
 36 
     | 
    
         
            +
                    self._processor: ProcessorMixin
         
     | 
| 
      
 37 
     | 
    
         
            +
             
     | 
| 
      
 38 
     | 
    
         
            +
                    tokenizer: PreTrainedTokenizerBase = getattr(self._processor, "tokenizer")
         
     | 
| 
      
 39 
     | 
    
         
            +
             
     | 
| 
       41 
40 
     | 
    
         
             
                    self.mm_tokens = MultimodalSpecialTokens(
         
     | 
| 
       42 
     | 
    
         
            -
                        image_token= 
     | 
| 
      
 41 
     | 
    
         
            +
                        image_token=tokenizer.image_token,
         
     | 
| 
       43 
42 
     | 
    
         
             
                        image_token_id=hf_config.image_token_id,
         
     | 
| 
      
 43 
     | 
    
         
            +
                        video_token=tokenizer.video_token,
         
     | 
| 
       44 
44 
     | 
    
         
             
                        video_token_id=hf_config.video_token_id,
         
     | 
| 
       45 
45 
     | 
    
         
             
                    ).build(_processor)
         
     | 
| 
       46 
46 
     | 
    
         | 
| 
       47 
47 
     | 
    
         
             
                async def process_mm_data_async(
         
     | 
| 
       48 
48 
     | 
    
         
             
                    self,
         
     | 
| 
       49 
     | 
    
         
            -
                    image_data 
     | 
| 
       50 
     | 
    
         
            -
                     
     | 
| 
       51 
     | 
    
         
            -
                     
     | 
| 
      
 49 
     | 
    
         
            +
                    image_data,
         
     | 
| 
      
 50 
     | 
    
         
            +
                    audio_data,
         
     | 
| 
      
 51 
     | 
    
         
            +
                    input_text,
         
     | 
| 
      
 52 
     | 
    
         
            +
                    request_obj: GenerateReqInput,
         
     | 
| 
       52 
53 
     | 
    
         
             
                    **kwargs,
         
     | 
| 
       53 
     | 
    
         
            -
                ) ->  
     | 
| 
      
 54 
     | 
    
         
            +
                ) -> dict[str, Any] | None:
         
     | 
| 
       54 
55 
     | 
    
         
             
                    base_output = self.load_mm_data(
         
     | 
| 
       55 
56 
     | 
    
         
             
                        prompt=input_text,
         
     | 
| 
       56 
57 
     | 
    
         
             
                        multimodal_tokens=self.mm_tokens,
         
     | 
| 
       57 
     | 
    
         
            -
                        image_data=image_data,
         
     | 
| 
      
 58 
     | 
    
         
            +
                        image_data=request_obj.image_data,  # type: ignore
         
     | 
| 
      
 59 
     | 
    
         
            +
                        video_data=request_obj.video_data,  # type: ignore
         
     | 
| 
       58 
60 
     | 
    
         
             
                    )
         
     | 
| 
       59 
61 
     | 
    
         | 
| 
      
 62 
     | 
    
         
            +
                    for i, video in enumerate(base_output.videos):  # type: ignore
         
     | 
| 
      
 63 
     | 
    
         
            +
                        base_output.videos[i] = [x.asnumpy() for x in video]  # type: ignore
         
     | 
| 
      
 64 
     | 
    
         
            +
             
     | 
| 
       60 
65 
     | 
    
         
             
                    mm_items, input_ids, _ = self.process_and_combine_mm_data(
         
     | 
| 
       61 
     | 
    
         
            -
                        base_output, 
     | 
| 
      
 66 
     | 
    
         
            +
                        base_output,
         
     | 
| 
      
 67 
     | 
    
         
            +
                        self.mm_tokens,
         
     | 
| 
      
 68 
     | 
    
         
            +
                        do_sample_frames=True,
         
     | 
| 
      
 69 
     | 
    
         
            +
                        num_frames=NUM_VIDEO_FRAMES,
         
     | 
| 
       62 
70 
     | 
    
         
             
                    )
         
     | 
| 
       63 
71 
     | 
    
         | 
| 
       64 
72 
     | 
    
         
             
                    return {
         
     | 
| 
         @@ -7,12 +7,12 @@ from PIL import Image 
     | 
|
| 
       7 
7 
     | 
    
         | 
| 
       8 
8 
     | 
    
         
             
            from sglang.srt.models.points_v15_chat import POINTSV15ChatModel
         
     | 
| 
       9 
9 
     | 
    
         
             
            from sglang.srt.multimodal.processors.qwen_vl import (
         
     | 
| 
       10 
     | 
    
         
            -
                 
     | 
| 
      
 10 
     | 
    
         
            +
                QwenVLImageProcessor,
         
     | 
| 
       11 
11 
     | 
    
         
             
                resize_image_async,
         
     | 
| 
       12 
12 
     | 
    
         
             
            )
         
     | 
| 
       13 
13 
     | 
    
         | 
| 
       14 
14 
     | 
    
         | 
| 
       15 
     | 
    
         
            -
            class POINTSV15ChatProcessor( 
     | 
| 
      
 15 
     | 
    
         
            +
            class POINTSV15ChatProcessor(QwenVLImageProcessor):
         
     | 
| 
       16 
16 
     | 
    
         
             
                models = [POINTSV15ChatModel]
         
     | 
| 
       17 
17 
     | 
    
         | 
| 
       18 
18 
     | 
    
         
             
                def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
         
     | 
| 
         @@ -249,6 +249,31 @@ class GptOssDetector(BaseReasoningFormatDetector): 
     | 
|
| 
       249 
249 
     | 
    
         
             
                    )
         
     | 
| 
       250 
250 
     | 
    
         | 
| 
       251 
251 
     | 
    
         | 
| 
      
 252 
     | 
    
         
            +
            class MiniMaxAppendThinkDetector(BaseReasoningFormatDetector):
         
     | 
| 
      
 253 
     | 
    
         
            +
                """
         
     | 
| 
      
 254 
     | 
    
         
            +
                Append `<think>` token to the beginning of the text.
         
     | 
| 
      
 255 
     | 
    
         
            +
                """
         
     | 
| 
      
 256 
     | 
    
         
            +
             
     | 
| 
      
 257 
     | 
    
         
            +
                def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = False):
         
     | 
| 
      
 258 
     | 
    
         
            +
                    # scheduler.py need `reasoning_parser.detector.think_end_token`
         
     | 
| 
      
 259 
     | 
    
         
            +
                    super().__init__(
         
     | 
| 
      
 260 
     | 
    
         
            +
                        "<think>",
         
     | 
| 
      
 261 
     | 
    
         
            +
                        "</think>",
         
     | 
| 
      
 262 
     | 
    
         
            +
                        force_reasoning=force_reasoning,
         
     | 
| 
      
 263 
     | 
    
         
            +
                        stream_reasoning=stream_reasoning,
         
     | 
| 
      
 264 
     | 
    
         
            +
                    )
         
     | 
| 
      
 265 
     | 
    
         
            +
                    self.is_first_chunk = False
         
     | 
| 
      
 266 
     | 
    
         
            +
             
     | 
| 
      
 267 
     | 
    
         
            +
                def parse_streaming_increment(self, new_text: str) -> StreamingParseResult:
         
     | 
| 
      
 268 
     | 
    
         
            +
                    if not self.is_first_chunk:
         
     | 
| 
      
 269 
     | 
    
         
            +
                        self.is_first_chunk = True
         
     | 
| 
      
 270 
     | 
    
         
            +
                        new_text = self.think_start_token + new_text
         
     | 
| 
      
 271 
     | 
    
         
            +
                    return StreamingParseResult(normal_text=new_text)
         
     | 
| 
      
 272 
     | 
    
         
            +
             
     | 
| 
      
 273 
     | 
    
         
            +
                def detect_and_parse(self, text: str) -> StreamingParseResult:
         
     | 
| 
      
 274 
     | 
    
         
            +
                    return StreamingParseResult(normal_text=self.think_start_token + text)
         
     | 
| 
      
 275 
     | 
    
         
            +
             
     | 
| 
      
 276 
     | 
    
         
            +
             
     | 
| 
       252 
277 
     | 
    
         
             
            class ReasoningParser:
         
     | 
| 
       253 
278 
     | 
    
         
             
                """
         
     | 
| 
       254 
279 
     | 
    
         
             
                Parser that handles both streaming and non-streaming scenarios for extracting
         
     | 
| 
         @@ -268,6 +293,8 @@ class ReasoningParser: 
     | 
|
| 
       268 
293 
     | 
    
         
             
                    "kimi": KimiDetector,
         
     | 
| 
       269 
294 
     | 
    
         
             
                    "qwen3": Qwen3Detector,
         
     | 
| 
       270 
295 
     | 
    
         
             
                    "qwen3-thinking": Qwen3Detector,
         
     | 
| 
      
 296 
     | 
    
         
            +
                    "minimax": Qwen3Detector,
         
     | 
| 
      
 297 
     | 
    
         
            +
                    "minimax-append-think": MiniMaxAppendThinkDetector,
         
     | 
| 
       271 
298 
     | 
    
         
             
                    "step3": DeepSeekR1Detector,
         
     | 
| 
       272 
299 
     | 
    
         
             
                }
         
     | 
| 
       273 
300 
     | 
    
         | 
| 
         @@ -285,7 +312,7 @@ class ReasoningParser: 
     | 
|
| 
       285 
312 
     | 
    
         
             
                        raise ValueError(f"Unsupported model type: {model_type}")
         
     | 
| 
       286 
313 
     | 
    
         | 
| 
       287 
314 
     | 
    
         
             
                    # Special cases where we override force_reasoning
         
     | 
| 
       288 
     | 
    
         
            -
                    if model_type.lower() in {"qwen3-thinking", "gpt-oss"}:
         
     | 
| 
      
 315 
     | 
    
         
            +
                    if model_type.lower() in {"qwen3-thinking", "gpt-oss", "minimax"}:
         
     | 
| 
       289 
316 
     | 
    
         
             
                        force_reasoning = True
         
     | 
| 
       290 
317 
     | 
    
         | 
| 
       291 
318 
     | 
    
         
             
                    # Only pass force_reasoning if explicitly set, let detectors use their defaults
         
     |