PyPI - sglang - Versions diffs - 0.5.4__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl - Mend

sglang 0.5.4__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (88)

sglang/bench_serving.py +56 -12
sglang/launch_server.py +2 -0
sglang/srt/batch_invariant_ops/batch_invariant_ops.py +101 -4
sglang/srt/compilation/backend.py +1 -1
sglang/srt/configs/model_config.py +5 -5
sglang/srt/distributed/parallel_state.py +0 -7
sglang/srt/entrypoints/engine.py +18 -15
sglang/srt/entrypoints/grpc_server.py +0 -1
sglang/srt/entrypoints/http_server.py +75 -94
sglang/srt/environ.py +16 -2
sglang/srt/eplb/expert_distribution.py +30 -0
sglang/srt/function_call/function_call_parser.py +2 -0
sglang/srt/function_call/minimax_m2.py +367 -0
sglang/srt/layers/activation.py +6 -0
sglang/srt/layers/attention/flashattention_backend.py +12 -2
sglang/srt/layers/attention/flashinfer_backend.py +10 -1
sglang/srt/layers/attention/flashinfer_mla_backend.py +18 -10
sglang/srt/layers/attention/trtllm_mla_backend.py +1 -13
sglang/srt/layers/attention/utils.py +78 -0
sglang/srt/layers/communicator.py +1 -0
sglang/srt/layers/deep_gemm_wrapper/compile_utils.py +1 -1
sglang/srt/layers/layernorm.py +19 -4
sglang/srt/layers/logits_processor.py +5 -0
sglang/srt/layers/moe/cutlass_w4a8_moe.py +138 -0
sglang/srt/layers/moe/ep_moe/kernels.py +194 -0
sglang/srt/layers/moe/ep_moe/layer.py +79 -272
sglang/srt/layers/moe/fused_moe_triton/layer.py +3 -3
sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +7 -4
sglang/srt/layers/moe/moe_runner/deep_gemm.py +287 -22
sglang/srt/layers/moe/moe_runner/runner.py +3 -0
sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
sglang/srt/layers/moe/token_dispatcher/__init__.py +4 -4
sglang/srt/layers/moe/token_dispatcher/base.py +11 -5
sglang/srt/layers/moe/token_dispatcher/deepep.py +18 -14
sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
sglang/srt/layers/moe/topk.py +4 -4
sglang/srt/layers/moe/utils.py +3 -4
sglang/srt/layers/quantization/__init__.py +3 -5
sglang/srt/layers/quantization/awq.py +0 -3
sglang/srt/layers/quantization/base_config.py +7 -0
sglang/srt/layers/quantization/fp8.py +68 -63
sglang/srt/layers/quantization/gguf.py +566 -0
sglang/srt/layers/quantization/mxfp4.py +30 -38
sglang/srt/layers/quantization/unquant.py +23 -45
sglang/srt/layers/quantization/w4afp8.py +38 -2
sglang/srt/layers/radix_attention.py +5 -2
sglang/srt/layers/rotary_embedding.py +13 -1
sglang/srt/layers/sampler.py +12 -1
sglang/srt/managers/io_struct.py +3 -0
sglang/srt/managers/multi_tokenizer_mixin.py +17 -1
sglang/srt/managers/scheduler.py +21 -15
sglang/srt/managers/scheduler_metrics_mixin.py +22 -14
sglang/srt/managers/scheduler_profiler_mixin.py +3 -4
sglang/srt/managers/tokenizer_manager.py +11 -19
sglang/srt/mem_cache/hicache_storage.py +7 -1
sglang/srt/mem_cache/memory_pool.py +82 -0
sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
sglang/srt/model_executor/forward_batch_info.py +44 -3
sglang/srt/model_executor/model_runner.py +1 -149
sglang/srt/model_executor/piecewise_cuda_graph_runner.py +22 -12
sglang/srt/models/deepseek_v2.py +147 -44
sglang/srt/models/glm4_moe.py +322 -354
sglang/srt/models/glm4_moe_nextn.py +4 -14
sglang/srt/models/glm4v_moe.py +29 -196
sglang/srt/models/minimax_m2.py +922 -0
sglang/srt/models/nvila.py +355 -0
sglang/srt/models/nvila_lite.py +184 -0
sglang/srt/models/qwen2.py +22 -1
sglang/srt/models/qwen3.py +34 -4
sglang/srt/models/qwen3_moe.py +2 -4
sglang/srt/multimodal/processors/base_processor.py +1 -0
sglang/srt/multimodal/processors/glm4v.py +1 -1
sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
sglang/srt/multimodal/processors/points_v15_chat.py +2 -2
sglang/srt/parser/reasoning_parser.py +28 -1
sglang/srt/server_args.py +365 -186
sglang/srt/single_batch_overlap.py +2 -7
sglang/srt/utils/common.py +87 -42
sglang/srt/utils/hf_transformers_utils.py +7 -3
sglang/test/test_deterministic.py +235 -12
sglang/test/test_deterministic_utils.py +2 -1
sglang/version.py +1 -1
{sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +7 -6
{sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +87 -82
sglang/srt/models/vila.py +0 -306
{sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
{sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
{sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0

sglang/srt/multimodal/processors/{vila.py → nvila.py} RENAMED Viewed

@@ -1,64 +1,72 @@
-from typing import Any, Dict, List, Optional, Type
+from typing import Any
 import torch.nn as nn
 from transformers.configuration_utils import PretrainedConfig
 from transformers.processing_utils import ProcessorMixin
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
-from sglang.srt.managers.io_struct import (
-    EmbeddingReqInput,
-    GenerateReqInput,
-    ImageDataInputItem,
-)
-from sglang.srt.models.vila import VILAForConditionalGeneration
+from sglang.srt.managers.io_struct import GenerateReqInput
+from sglang.srt.models.nvila import NVILAForConditionalGeneration
+from sglang.srt.models.nvila_lite import NVILALiteForConditionalGeneration
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
 from sglang.srt.server_args import ServerArgs
+NUM_VIDEO_FRAMES = 8
-class VILAProcessor(ProcessorMixin):
-    """A stub class for the VILA processor."""
-    tokenizer: PreTrainedTokenizerBase
-class VILAMultimodalProcessor(BaseMultimodalProcessor):
-    models: List[Type[nn.Module]] = [VILAForConditionalGeneration]
-    _processor: VILAProcessor
+class NVILAMultimodalProcessor(BaseMultimodalProcessor):
+    models: list[type[nn.Module]] = [
+        NVILAForConditionalGeneration,
+        NVILALiteForConditionalGeneration,
+    ]
     def __init__(
         self,
         hf_config: PretrainedConfig,
         server_args: ServerArgs,
-        _processor: VILAProcessor,
+        _processor: ProcessorMixin,
         *args,
         **kwargs,
     ) -> None:
         super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+        self._processor: ProcessorMixin
+        tokenizer: PreTrainedTokenizerBase = getattr(self._processor, "tokenizer")
         self.mm_tokens = MultimodalSpecialTokens(
-            image_token=self._processor.tokenizer.image_token,
+            image_token=tokenizer.image_token,
             image_token_id=hf_config.image_token_id,
+            video_token=tokenizer.video_token,
             video_token_id=hf_config.video_token_id,
         ).build(_processor)
     async def process_mm_data_async(
         self,
-        image_data: Optional[ImageDataInputItem | List[ImageDataInputItem]],
-        input_text: str | List[int],
-        request_obj: GenerateReqInput | EmbeddingReqInput,
+        image_data,
+        audio_data,
+        input_text,
+        request_obj: GenerateReqInput,
         **kwargs,
-    ) -> Optional[Dict[str, Any]]:
+    ) -> dict[str, Any] | None:
         base_output = self.load_mm_data(
             prompt=input_text,
             multimodal_tokens=self.mm_tokens,
-            image_data=image_data,
+            image_data=request_obj.image_data,  # type: ignore
+            video_data=request_obj.video_data,  # type: ignore
         )
+        for i, video in enumerate(base_output.videos):  # type: ignore
+            base_output.videos[i] = [x.asnumpy() for x in video]  # type: ignore
         mm_items, input_ids, _ = self.process_and_combine_mm_data(
-            base_output, self.mm_tokens
+            base_output,
+            self.mm_tokens,
+            do_sample_frames=True,
+            num_frames=NUM_VIDEO_FRAMES,
         )
         return {

sglang/srt/multimodal/processors/points_v15_chat.py CHANGED Viewed

@@ -7,12 +7,12 @@ from PIL import Image
 from sglang.srt.models.points_v15_chat import POINTSV15ChatModel
 from sglang.srt.multimodal.processors.qwen_vl import (
-    Qwen2_5VLImageProcessor,
+    QwenVLImageProcessor,
     resize_image_async,
 )
-class POINTSV15ChatProcessor(Qwen2_5VLImageProcessor):
+class POINTSV15ChatProcessor(QwenVLImageProcessor):
     models = [POINTSV15ChatModel]
     def __init__(self, hf_config, server_args, _processor, *args, **kwargs):

sglang/srt/parser/reasoning_parser.py CHANGED Viewed

@@ -249,6 +249,31 @@ class GptOssDetector(BaseReasoningFormatDetector):
         )
+class MiniMaxAppendThinkDetector(BaseReasoningFormatDetector):
+    """
+    Append `<think>` token to the beginning of the text.
+    """
+    def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = False):
+        # scheduler.py need `reasoning_parser.detector.think_end_token`
+        super().__init__(
+            "<think>",
+            "</think>",
+            force_reasoning=force_reasoning,
+            stream_reasoning=stream_reasoning,
+        )
+        self.is_first_chunk = False
+    def parse_streaming_increment(self, new_text: str) -> StreamingParseResult:
+        if not self.is_first_chunk:
+            self.is_first_chunk = True
+            new_text = self.think_start_token + new_text
+        return StreamingParseResult(normal_text=new_text)
+    def detect_and_parse(self, text: str) -> StreamingParseResult:
+        return StreamingParseResult(normal_text=self.think_start_token + text)
 class ReasoningParser:
     """
     Parser that handles both streaming and non-streaming scenarios for extracting
@@ -268,6 +293,8 @@ class ReasoningParser:
         "kimi": KimiDetector,
         "qwen3": Qwen3Detector,
         "qwen3-thinking": Qwen3Detector,
+        "minimax": Qwen3Detector,
+        "minimax-append-think": MiniMaxAppendThinkDetector,
         "step3": DeepSeekR1Detector,
     }
@@ -285,7 +312,7 @@ class ReasoningParser:
             raise ValueError(f"Unsupported model type: {model_type}")
         # Special cases where we override force_reasoning
-        if model_type.lower() in {"qwen3-thinking", "gpt-oss"}:
+        if model_type.lower() in {"qwen3-thinking", "gpt-oss", "minimax"}:
             force_reasoning = True
         # Only pass force_reasoning if explicitly set, let detectors use their defaults

sglang 0.5.4__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

sglang 0.5.4__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl