sglang 0.4.4.post3__py3-none-any.whl → 0.4.4.post4__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (81)
  1. sglang/bench_serving.py +49 -7
  2. sglang/srt/_custom_ops.py +59 -92
  3. sglang/srt/configs/model_config.py +1 -0
  4. sglang/srt/constrained/base_grammar_backend.py +5 -1
  5. sglang/srt/custom_op.py +5 -0
  6. sglang/srt/distributed/device_communicators/custom_all_reduce.py +27 -79
  7. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
  8. sglang/srt/entrypoints/engine.py +0 -5
  9. sglang/srt/layers/attention/flashattention_backend.py +394 -76
  10. sglang/srt/layers/attention/flashinfer_backend.py +5 -7
  11. sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -3
  12. sglang/srt/layers/attention/flashmla_backend.py +1 -1
  13. sglang/srt/layers/moe/ep_moe/kernels.py +142 -0
  14. sglang/srt/layers/moe/ep_moe/layer.py +79 -80
  15. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +382 -199
  16. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +146 -0
  17. sglang/srt/layers/moe/fused_moe_triton/configs/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  18. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  19. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +403 -47
  20. sglang/srt/layers/moe/topk.py +49 -3
  21. sglang/srt/layers/quantization/__init__.py +4 -1
  22. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +2 -1
  23. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +34 -10
  24. sglang/srt/layers/quantization/fp8_utils.py +1 -4
  25. sglang/srt/layers/quantization/moe_wna16.py +501 -0
  26. sglang/srt/layers/quantization/utils.py +1 -1
  27. sglang/srt/layers/rotary_embedding.py +0 -12
  28. sglang/srt/managers/cache_controller.py +34 -11
  29. sglang/srt/managers/mm_utils.py +202 -156
  30. sglang/srt/managers/multimodal_processor.py +0 -2
  31. sglang/srt/managers/multimodal_processors/base_processor.py +45 -77
  32. sglang/srt/managers/multimodal_processors/clip.py +7 -26
  33. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +17 -58
  34. sglang/srt/managers/multimodal_processors/gemma3.py +12 -27
  35. sglang/srt/managers/multimodal_processors/janus_pro.py +21 -47
  36. sglang/srt/managers/multimodal_processors/llava.py +34 -14
  37. sglang/srt/managers/multimodal_processors/minicpm.py +35 -38
  38. sglang/srt/managers/multimodal_processors/mlama.py +10 -23
  39. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -45
  40. sglang/srt/managers/schedule_batch.py +185 -128
  41. sglang/srt/managers/scheduler.py +4 -4
  42. sglang/srt/managers/tokenizer_manager.py +1 -1
  43. sglang/srt/managers/utils.py +1 -6
  44. sglang/srt/mem_cache/hiradix_cache.py +62 -52
  45. sglang/srt/mem_cache/memory_pool.py +72 -6
  46. sglang/srt/mem_cache/paged_allocator.py +39 -0
  47. sglang/srt/metrics/collector.py +23 -53
  48. sglang/srt/model_executor/cuda_graph_runner.py +8 -6
  49. sglang/srt/model_executor/forward_batch_info.py +10 -10
  50. sglang/srt/model_executor/model_runner.py +59 -57
  51. sglang/srt/model_loader/loader.py +8 -0
  52. sglang/srt/models/clip.py +12 -7
  53. sglang/srt/models/deepseek_janus_pro.py +10 -15
  54. sglang/srt/models/deepseek_v2.py +212 -121
  55. sglang/srt/models/deepseek_vl2.py +105 -104
  56. sglang/srt/models/gemma3_mm.py +14 -80
  57. sglang/srt/models/llama.py +4 -1
  58. sglang/srt/models/llava.py +31 -19
  59. sglang/srt/models/llavavid.py +16 -7
  60. sglang/srt/models/minicpmo.py +63 -147
  61. sglang/srt/models/minicpmv.py +17 -27
  62. sglang/srt/models/mllama.py +29 -14
  63. sglang/srt/models/qwen2.py +9 -6
  64. sglang/srt/models/qwen2_5_vl.py +21 -31
  65. sglang/srt/models/qwen2_vl.py +20 -21
  66. sglang/srt/openai_api/adapter.py +18 -6
  67. sglang/srt/platforms/interface.py +371 -0
  68. sglang/srt/server_args.py +99 -14
  69. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -5
  70. sglang/srt/speculative/eagle_utils.py +140 -28
  71. sglang/srt/speculative/eagle_worker.py +93 -24
  72. sglang/srt/utils.py +104 -51
  73. sglang/test/test_custom_ops.py +55 -0
  74. sglang/test/test_utils.py +13 -26
  75. sglang/utils.py +2 -2
  76. sglang/version.py +1 -1
  77. {sglang-0.4.4.post3.dist-info → sglang-0.4.4.post4.dist-info}/METADATA +4 -3
  78. {sglang-0.4.4.post3.dist-info → sglang-0.4.4.post4.dist-info}/RECORD +81 -76
  79. {sglang-0.4.4.post3.dist-info → sglang-0.4.4.post4.dist-info}/WHEEL +0 -0
  80. {sglang-0.4.4.post3.dist-info → sglang-0.4.4.post4.dist-info}/licenses/LICENSE +0 -0
  81. {sglang-0.4.4.post3.dist-info → sglang-0.4.4.post4.dist-info}/top_level.txt +0 -0
sglang/srt/model_executor/model_runner.py CHANGED
@@ -75,6 +75,7 @@ from sglang.srt.utils import (
     get_available_gpu_memory,
     init_custom_process_group,
     is_cuda,
+    is_flashinfer_available,
     is_hip,
     monkey_patch_p2p_access_check,
     monkey_patch_vllm_gguf_config,
@@ -123,6 +124,10 @@ class ModelRunner:
         self.page_size = server_args.page_size
         self.req_to_token_pool = req_to_token_pool
         self.token_to_kv_pool_allocator = token_to_kv_pool_allocator
+        self.use_mla_backend = (
+            self.model_config.attention_arch == AttentionArch.MLA
+            and not server_args.disable_mla
+        )
 
         # Model-specific adjustment
         self.model_specific_adjustment()
@@ -147,15 +152,18 @@ class ModelRunner:
                 "enable_dp_attention": server_args.enable_dp_attention,
                 "enable_ep_moe": server_args.enable_ep_moe,
                 "enable_deepep_moe": server_args.enable_deepep_moe,
+                "deepep_mode": server_args.deepep_mode,
                 "device": server_args.device,
                 "speculative_accept_threshold_single": server_args.speculative_accept_threshold_single,
                 "speculative_accept_threshold_acc": server_args.speculative_accept_threshold_acc,
-                "enable_flashinfer_mla": server_args.enable_flashinfer_mla,
                 "enable_flashmla": server_args.enable_flashmla,
                 "disable_radix_cache": server_args.disable_radix_cache,
                 "flashinfer_mla_disable_ragged": server_args.flashinfer_mla_disable_ragged,
                 "debug_tensor_dump_output_folder": server_args.debug_tensor_dump_output_folder,
                 "debug_tensor_dump_inject": server_args.debug_tensor_dump_inject,
+                "n_share_experts_fusion": server_args.n_share_experts_fusion,
+                "disable_shared_experts_fusion": server_args.disable_shared_experts_fusion,
+                "use_mla_backend": self.use_mla_backend,
             }
         )
 
@@ -216,27 +224,38 @@ class ModelRunner:
     def model_specific_adjustment(self):
         server_args = self.server_args
 
-        if (
-            self.model_config.attention_arch == AttentionArch.MLA
-            and not server_args.disable_mla
-        ):
+        if server_args.enable_flashinfer_mla:
+            # TODO: remove this branch after enable_flashinfer_mla is deprecated
+            logger.info("MLA optimization is turned on. Use flashinfer backend.")
+            server_args.attention_backend = "flashinfer"
+        elif server_args.enable_flashmla:
+            # TODO: remove this branch after enable_flashmla is deprecated
+            logger.info("MLA optimization is turned on. Use flashmla decode.")
+            server_args.attention_backend = "flashmla"
+        elif server_args.attention_backend is None:
+            # By default, use flashinfer for non-mla attention and triton for mla attention
+            if not self.use_mla_backend:
+                server_args.attention_backend = (
+                    "flashinfer" if is_flashinfer_available() else "triton"
+                )
+            else:
+                server_args.attention_backend = "triton"
+            logger.info(
+                f"Attention backend not set. Use {server_args.attention_backend} backend by default."
+            )
+        elif self.use_mla_backend:
             # TODO: add MLA optimization on CPU
             if server_args.device != "cpu":
-                if server_args.enable_flashinfer_mla:
-                    logger.info(
-                        "MLA optimization is turned on. Use flashinfer mla backend."
-                    )
-                    server_args.attention_backend = "flashinfer_mla"
-                elif server_args.enable_flashmla:
-                    logger.info("MLA optimization is turned on. Use flashmla decode.")
-                    server_args.attention_backend = "flashmla"
-                elif server_args.attention_backend == "fa3":
+                if server_args.attention_backend in ["flashinfer", "fa3", "triton"]:
                     logger.info(
-                        f"MLA optimization is turned on. Use flash attention 3 backend."
+                        f"MLA optimization is turned on. Use {server_args.attention_backend} backend."
                     )
                 else:
-                    logger.info("MLA optimization is turned on. Use triton backend.")
-                    server_args.attention_backend = "triton"
+                    raise ValueError(
+                        f"Invalid attention backend for MLA: {server_args.attention_backend}"
+                    )
+            else:
+                raise ValueError(f"MLA optimization not supported on CPU.")
 
         if server_args.enable_double_sparsity:
             logger.info(
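For readers tracking the behavior change rather than the code: attention backend selection is now driven by --attention-backend, with the two MLA-specific flags kept only as deprecated aliases, and an explicitly chosen backend is validated against the MLA requirements. A minimal standalone sketch of the new resolution order (the resolve_attention_backend helper below is hypothetical and simplified, not sglang code):

    # Hypothetical helper mirroring the new selection logic in model_specific_adjustment.
    def resolve_attention_backend(
        attention_backend, use_mla_backend, flashinfer_available, device="cuda"
    ):
        if attention_backend is None:
            # Default: flashinfer for non-MLA models when installed, triton otherwise.
            if not use_mla_backend:
                return "flashinfer" if flashinfer_available else "triton"
            return "triton"
        if use_mla_backend:
            if device == "cpu":
                raise ValueError("MLA optimization not supported on CPU.")
            if attention_backend not in ("flashinfer", "fa3", "triton"):
                raise ValueError(f"Invalid attention backend for MLA: {attention_backend}")
        return attention_backend

    # An MLA model with no explicit backend falls back to triton;
    # an explicit "flashinfer" is now accepted for MLA as well.
    assert resolve_attention_backend(None, True, True) == "triton"
    assert resolve_attention_backend("flashinfer", True, True) == "flashinfer"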
@@ -251,17 +270,16 @@
             self.init_double_sparsity_channel_config(server_args.ds_heavy_channel_type)
 
         if self.is_multimodal:
-            self.mem_fraction_static *= 0.95
+            self.mem_fraction_static *= 0.90
             logger.info(
                 f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static:.3f} "
                 f"because this is a multimodal model."
             )
 
-            if self.model_config.hf_config.architectures == [
-                "MllamaForConditionalGeneration"
-            ]:
-                logger.info("Automatically turn off --chunked-prefill-size for mllama.")
-                server_args.chunked_prefill_size = -1
+            logger.info(
+                "Automatically turn off --chunked-prefill-size for multimodal model."
+            )
+            server_args.chunked_prefill_size = -1
 
             if self.model_config.hf_config.architectures == [
                 "Qwen2VLForConditionalGeneration"
@@ -269,22 +287,11 @@
                 "Qwen2_5_VLForConditionalGeneration"
             ]:
                 # TODO: qwen2-vl series does not support radix cache now, set disable_radix_cache=True automatically
-                logger.info(
-                    "Automatically turn off --chunked-prefill-size and disable radix cache for qwen-vl series."
-                )
-                server_args.chunked_prefill_size = -1
-                server_args.disable_radix_cache = True
-
-            if self.model_config.hf_config.architectures == ["DeepseekVL2ForCausalLM"]:
-                # TODO: deepseek-vl2 does not support radix cache now, set disable_radix_cache=True automatically
-                logger.info(
-                    "Automatically turn off --chunked-prefill-size and disable radix cache for deepseek-vl2."
-                )
-                server_args.chunked_prefill_size = -1
+                logger.info("Automatically disable radix cache for qwen-vl series.")
                 server_args.disable_radix_cache = True
 
         if server_args.enable_deepep_moe:
-            logger.info("DeepEP is turned on.")
+            logger.info(f"DeepEP is turned on. DeepEP mode: {server_args.deepep_mode}")
 
     def init_torch_distributed(self):
         logger.info("Init torch distributed begin.")
@@ -646,10 +653,7 @@ class ModelRunner:
         available_gpu_memory = get_available_gpu_memory(
             self.device, self.gpu_id, distributed=self.tp_size > 1
         )
-        if (
-            self.model_config.attention_arch == AttentionArch.MLA
-            and not self.server_args.disable_mla
-        ):
+        if self.use_mla_backend:
             cell_size = (
                 (self.model_config.kv_lora_rank + self.model_config.qk_rope_head_dim)
                 * self.model_config.num_hidden_layers
@@ -760,10 +764,7 @@ class ModelRunner:
             # Draft worker shares req_to_token_pool with the target worker.
             assert self.is_draft_worker
 
-        if (
-            self.model_config.attention_arch == AttentionArch.MLA
-            and not self.server_args.disable_mla
-        ):
+        if self.use_mla_backend:
             self.token_to_kv_pool = MLATokenToKVPool(
                 self.max_total_num_tokens,
                 page_size=self.page_size,
@@ -834,14 +835,21 @@ class ModelRunner:
     def init_attention_backend(self):
         """Init attention kernel backend."""
         if self.server_args.attention_backend == "flashinfer":
-            from sglang.srt.layers.attention.flashinfer_backend import (
-                FlashInferAttnBackend,
-            )
+            if not self.use_mla_backend:
+                from sglang.srt.layers.attention.flashinfer_backend import (
+                    FlashInferAttnBackend,
+                )
 
-            # Init streams
-            if self.server_args.speculative_algorithm == "EAGLE":
-                self.plan_stream_for_flashinfer = torch.cuda.Stream()
-            self.attn_backend = FlashInferAttnBackend(self)
+                # Init streams
+                if self.server_args.speculative_algorithm == "EAGLE":
+                    self.plan_stream_for_flashinfer = torch.cuda.Stream()
+                self.attn_backend = FlashInferAttnBackend(self)
+            else:
+                from sglang.srt.layers.attention.flashinfer_mla_backend import (
+                    FlashInferMLAAttnBackend,
+                )
+
+                self.attn_backend = FlashInferMLAAttnBackend(self)
         elif self.server_args.attention_backend == "triton":
             assert self.sliding_window_size is None, (
                 "Window attention is not supported in the triton attention backend. "
@@ -867,12 +875,6 @@ class ModelRunner:
             )
 
             self.attn_backend = TorchNativeAttnBackend(self)
-        elif self.server_args.attention_backend == "flashinfer_mla":
-            from sglang.srt.layers.attention.flashinfer_mla_backend import (
-                FlashInferMLAAttnBackend,
-            )
-
-            self.attn_backend = FlashInferMLAAttnBackend(self)
         elif self.server_args.attention_backend == "flashmla":
            from sglang.srt.layers.attention.flashmla_backend import FlashMLABackend
 
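Taken together, the two hunks above mean the flashinfer backend value now serves both regular attention and MLA (dispatching to FlashInferMLAAttnBackend), and the separate flashinfer_mla value is removed. An MLA model previously launched with --enable-flashinfer-mla should now be launched with something along these lines (model path shown only as an example; the deprecated flag still works for the time being):

    python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3 --attention-backend flashinfer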
sglang/srt/model_loader/loader.py CHANGED
@@ -489,6 +489,14 @@ class DummyModelLoader(BaseModelLoader):
             # NOTE(woosuk): For accurate performance evaluation, we assign
             # random values to the weights.
             initialize_dummy_weights(model)
+
+            # Model weight loading consists of two stages:
+            # 1. Initial weight loading.
+            # 2. Post-processing of weights, including assigning specific member variables.
+            # For `dummy_init`, only the second stage is required.
+            if hasattr(model, "post_load_weights"):
+                model.post_load_weights()
+
         return model.eval()
 
 
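With the new hook, a model can rely on its weight post-processing step even when it is instantiated with dummy weights. A minimal sketch of a model that opts in (the class below is hypothetical; only the post_load_weights method name and the hasattr check come from this diff):

    import torch
    from torch import nn

    class TinyModel(nn.Module):
        # Hypothetical model: the dummy loader only checks hasattr(model, "post_load_weights").
        def __init__(self):
            super().__init__()
            self.proj = nn.Linear(8, 8)
            self.w_scale = None  # derived in stage 2

        def post_load_weights(self):
            # Stage 2: derive member variables from the (possibly dummy-initialized) weights.
            self.w_scale = self.proj.weight.detach().abs().max()

    model = TinyModel()
    if hasattr(model, "post_load_weights"):
        model.post_load_weights()
    assert model.w_scale is not None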
sglang/srt/models/clip.py CHANGED
@@ -17,7 +17,7 @@ from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.managers.schedule_batch import MultimodalInputs
 from sglang.srt.model_executor.model_runner import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
-from sglang.srt.utils import add_prefix
+from sglang.srt.utils import add_prefix, flatten_nested_list
 
 
 class CLIPVisionEmbeddings(nn.Module):
@@ -368,7 +368,6 @@ class CLIPVisionTransformer(nn.Module):
         self,
         pixel_values: torch.Tensor,
     ) -> torch.Tensor:
-
         hidden_states = self.embeddings(pixel_values.to(self.device))
         hidden_states = self.pre_layrnorm(hidden_states)
 
@@ -456,12 +455,18 @@ class CLIPModel(nn.Module):
         get_embedding: bool = True,
     ):
         assert get_embedding, "CLIPEmbeddingModel is only used for embedding"
-        image_inputs = None
+        mm_inputs = []
         if forward_batch.mm_inputs is not None:
-            image_inputs = forward_batch.mm_inputs
-
-        if image_inputs is not None and image_inputs[0] is not None:
-            vision_outputs = self.vision_model(image_inputs[0].pixel_values)
+            mm_inputs = forward_batch.mm_inputs
+        pixel_values_list = [
+            item.pixel_values
+            for item in flatten_nested_list(
+                [mm_input.mm_items for mm_input in mm_inputs if mm_input is not None]
+            )
+        ]
+        if len(pixel_values_list) != 0:
+            pixel_values = torch.concat(pixel_values_list)
+            vision_outputs = self.vision_model(pixel_values)
             pooled_output = vision_outputs[:, 0, :]
             image_embeds = self.visual_projection(pooled_output)
             image_embeds = nn.functional.normalize(image_embeds, p=2, dim=1)
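The rewritten CLIP embedding path now gathers pixel_values from every multimodal item across the whole batch and runs the vision tower once on the concatenated tensor, instead of reading only the first request's input. A rough shape-level sketch of that gathering step, using plain tensors in place of MultimodalDataItem objects and assuming flatten_nested_list simply flattens the nested per-request lists:

    import torch

    def flatten_one_level(nested):
        # Assumed behavior of sglang's flatten_nested_list for this illustration.
        return [item for sub in nested for item in sub]

    # Two requests in the batch, holding 2 and 1 image items respectively.
    mm_items_per_request = [
        [torch.randn(1, 3, 224, 224), torch.randn(1, 3, 224, 224)],
        [torch.randn(1, 3, 224, 224)],
    ]
    pixel_values_list = flatten_one_level(mm_items_per_request)
    pixel_values = torch.concat(pixel_values_list)  # a single (3, 3, 224, 224) batch
    assert pixel_values.shape[0] == 3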
sglang/srt/models/deepseek_janus_pro.py CHANGED
@@ -51,7 +51,7 @@ from sglang.srt.managers.mm_utils import (
     MultiModalityDataPaddingPatternTokenPairs,
     general_mm_embed_routine,
 )
-from sglang.srt.managers.schedule_batch import MultimodalInputs, global_server_args_dict
+from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.llama import LlamaForCausalLM
@@ -1959,8 +1959,8 @@ class MultiModalityCausalLM(MultiModalityPreTrainedModel):
         )
         self.logits_processor = LogitsProcessor(config)
 
-    def get_image_feature(self, image_input: MultimodalInputs) -> torch.Tensor:
-        pixel_values = image_input.pixel_values
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        pixel_values = torch.concat([item.pixel_values for item in items], dim=0)
         bs, n = pixel_values.shape[0:2]
         pixel_values = pixel_values.to(
             device=self.vision_model.device, dtype=self.vision_model.dtype
@@ -1976,7 +1976,7 @@ class MultiModalityCausalLM(MultiModalityPreTrainedModel):
         return images_embeds
 
     def get_input_embeddings(self) -> nn.Embedding:
-        return self.language_model.model.embed_tokens
+        return self.language_model.get_input_embeddings()
 
     @torch.no_grad()
     def forward(
@@ -1984,23 +1984,18 @@
         input_ids: torch.LongTensor,
         positions: torch.Tensor,
         forward_batch: ForwardBatch,
+        get_embedding: bool = False,
     ) -> torch.Tensor:
-
-        inputs_embeds = general_mm_embed_routine(
+        hidden_states = general_mm_embed_routine(
             input_ids=input_ids,
             forward_batch=forward_batch,
-            embed_tokens=self.get_input_embeddings(),
-            mm_data_embedding_func=self.get_image_feature,
-        )
-
-        return self.language_model(
-            input_ids=None,
+            image_data_embedding_func=self.get_image_feature,
+            language_model=self.language_model,
             positions=positions,
-            forward_batch=forward_batch,
-            input_embeds=inputs_embeds,
-            get_embedding=False,
         )
 
+        return hidden_states
+
     def prepare_gen_img_embeds(self, image_ids: torch.LongTensor):
         return self.gen_aligner(self.gen_embed(image_ids))