sglang 0.4.9.post3__py3-none-any.whl → 0.4.9.post5__py3-none-any.whl
This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- sglang/lang/chat_template.py +21 -0
- sglang/srt/_custom_ops.py +29 -1
- sglang/srt/configs/internvl.py +3 -0
- sglang/srt/configs/model_config.py +5 -1
- sglang/srt/constrained/base_grammar_backend.py +10 -2
- sglang/srt/constrained/xgrammar_backend.py +7 -5
- sglang/srt/conversation.py +17 -2
- sglang/srt/debug_utils/__init__.py +0 -0
- sglang/srt/debug_utils/dump_comparator.py +131 -0
- sglang/srt/debug_utils/dumper.py +108 -0
- sglang/srt/debug_utils/text_comparator.py +172 -0
- sglang/srt/disaggregation/common/conn.py +34 -6
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +13 -1
- sglang/srt/disaggregation/mini_lb.py +3 -2
- sglang/srt/disaggregation/mooncake/conn.py +65 -20
- sglang/srt/disaggregation/mooncake/transfer_engine.py +4 -2
- sglang/srt/disaggregation/nixl/conn.py +17 -13
- sglang/srt/disaggregation/prefill.py +13 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -91
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +96 -1
- sglang/srt/distributed/device_communicators/quick_all_reduce.py +273 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +12 -5
- sglang/srt/distributed/parallel_state.py +70 -15
- sglang/srt/entrypoints/engine.py +5 -9
- sglang/srt/entrypoints/http_server.py +20 -32
- sglang/srt/entrypoints/openai/protocol.py +3 -3
- sglang/srt/entrypoints/openai/serving_chat.py +148 -72
- sglang/srt/function_call/base_format_detector.py +74 -12
- sglang/srt/function_call/deepseekv3_detector.py +26 -11
- sglang/srt/function_call/ebnf_composer.py +105 -66
- sglang/srt/function_call/function_call_parser.py +6 -4
- sglang/srt/function_call/glm4_moe_detector.py +164 -0
- sglang/srt/function_call/kimik2_detector.py +41 -16
- sglang/srt/function_call/llama32_detector.py +6 -3
- sglang/srt/function_call/mistral_detector.py +11 -3
- sglang/srt/function_call/pythonic_detector.py +16 -14
- sglang/srt/function_call/qwen25_detector.py +12 -3
- sglang/srt/function_call/{qwen3_detector.py → qwen3_coder_detector.py} +11 -9
- sglang/srt/layers/activation.py +11 -3
- sglang/srt/layers/attention/base_attn_backend.py +3 -1
- sglang/srt/layers/attention/hybrid_attn_backend.py +100 -0
- sglang/srt/layers/attention/vision.py +56 -8
- sglang/srt/layers/communicator.py +12 -12
- sglang/srt/layers/dp_attention.py +72 -24
- sglang/srt/layers/layernorm.py +26 -1
- sglang/srt/layers/logits_processor.py +46 -25
- sglang/srt/layers/moe/ep_moe/layer.py +172 -206
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +25 -224
- sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -48
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +11 -8
- sglang/srt/layers/moe/topk.py +88 -34
- sglang/srt/layers/multimodal.py +11 -8
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -9
- sglang/srt/layers/quantization/fp8.py +25 -247
- sglang/srt/layers/quantization/fp8_kernel.py +78 -48
- sglang/srt/layers/quantization/modelopt_quant.py +33 -14
- sglang/srt/layers/quantization/unquant.py +24 -76
- sglang/srt/layers/quantization/utils.py +0 -9
- sglang/srt/layers/quantization/w4afp8.py +68 -17
- sglang/srt/layers/radix_attention.py +5 -3
- sglang/srt/lora/lora_manager.py +133 -169
- sglang/srt/lora/lora_registry.py +188 -0
- sglang/srt/lora/mem_pool.py +2 -2
- sglang/srt/managers/cache_controller.py +62 -13
- sglang/srt/managers/io_struct.py +19 -1
- sglang/srt/managers/mm_utils.py +154 -35
- sglang/srt/managers/multimodal_processor.py +3 -14
- sglang/srt/managers/schedule_batch.py +27 -11
- sglang/srt/managers/scheduler.py +48 -26
- sglang/srt/managers/tokenizer_manager.py +62 -28
- sglang/srt/managers/tp_worker.py +5 -4
- sglang/srt/mem_cache/allocator.py +67 -7
- sglang/srt/mem_cache/hicache_storage.py +17 -1
- sglang/srt/mem_cache/hiradix_cache.py +35 -18
- sglang/srt/mem_cache/memory_pool_host.py +3 -0
- sglang/srt/model_executor/cuda_graph_runner.py +61 -25
- sglang/srt/model_executor/forward_batch_info.py +201 -29
- sglang/srt/model_executor/model_runner.py +109 -37
- sglang/srt/models/deepseek_v2.py +63 -30
- sglang/srt/models/glm4_moe.py +1035 -0
- sglang/srt/models/glm4_moe_nextn.py +167 -0
- sglang/srt/models/interns1.py +328 -0
- sglang/srt/models/internvl.py +143 -47
- sglang/srt/models/llava.py +9 -5
- sglang/srt/models/minicpmo.py +4 -1
- sglang/srt/models/mllama4.py +10 -3
- sglang/srt/models/qwen2_moe.py +2 -6
- sglang/srt/models/qwen3_moe.py +6 -8
- sglang/srt/multimodal/processors/base_processor.py +20 -6
- sglang/srt/multimodal/processors/clip.py +2 -2
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +2 -2
- sglang/srt/multimodal/processors/gemma3.py +2 -2
- sglang/srt/multimodal/processors/gemma3n.py +2 -2
- sglang/srt/multimodal/processors/internvl.py +21 -8
- sglang/srt/multimodal/processors/janus_pro.py +2 -2
- sglang/srt/multimodal/processors/kimi_vl.py +2 -2
- sglang/srt/multimodal/processors/llava.py +4 -4
- sglang/srt/multimodal/processors/minicpm.py +2 -3
- sglang/srt/multimodal/processors/mlama.py +2 -2
- sglang/srt/multimodal/processors/mllama4.py +18 -111
- sglang/srt/multimodal/processors/phi4mm.py +2 -2
- sglang/srt/multimodal/processors/pixtral.py +2 -2
- sglang/srt/multimodal/processors/qwen_audio.py +2 -2
- sglang/srt/multimodal/processors/qwen_vl.py +2 -2
- sglang/srt/multimodal/processors/vila.py +3 -1
- sglang/srt/reasoning_parser.py +48 -5
- sglang/srt/sampling/sampling_batch_info.py +6 -5
- sglang/srt/server_args.py +132 -60
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +33 -28
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +37 -36
- sglang/srt/speculative/eagle_utils.py +51 -23
- sglang/srt/speculative/eagle_worker.py +59 -44
- sglang/srt/two_batch_overlap.py +9 -5
- sglang/srt/utils.py +113 -69
- sglang/srt/weight_sync/utils.py +119 -0
- sglang/test/runners.py +4 -0
- sglang/test/test_activation.py +50 -1
- sglang/test/test_utils.py +65 -5
- sglang/utils.py +19 -0
- sglang/version.py +1 -1
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/METADATA +6 -6
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/RECORD +127 -114
- sglang/srt/debug_utils.py +0 -74
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/top_level.txt +0 -0
**sglang/srt/layers/quantization/unquant.py**

```diff
@@ -24,6 +24,7 @@ from sglang.srt.utils import (
 )
 
 if TYPE_CHECKING:
+    from sglang.srt.layers.moe.ep_moe.layer import EPMoE
     from sglang.srt.layers.moe.topk import TopKOutput
 
 has_triton_kernels = importlib.util.find_spec("triton_kernels") is not None
```
```diff
@@ -129,6 +130,14 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         super().__init__()
         self.use_triton_kernels = use_triton_kernels
 
+        self.triton_kernel_moe_forward = None
+        if torch.cuda.is_available() and has_triton_kernels:
+            from sglang.srt.layers.moe.fused_moe_triton.triton_kernels_moe import (
+                triton_kernel_moe_forward as _tk_forward,
+            )
+
+            self.triton_kernel_moe_forward = _tk_forward
+
     def create_weights(
         self,
         layer: torch.nn.Module,
```
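The hunk above resolves the optional `triton_kernels` dependency once in `__init__` and caches the callable, rather than importing inside the hot forward path. A minimal, self-contained sketch of the same probe-and-cache pattern (module and attribute names below are illustrative, not sglang's):

```python
import importlib
import importlib.util

# Probe for the optional package once, at import time (same check the diff uses).
HAS_TRITON_KERNELS = importlib.util.find_spec("triton_kernels") is not None


class FusedForward:
    """Caches an optional fast path at construction and falls back otherwise."""

    def __init__(self) -> None:
        self.fast_forward = None
        if HAS_TRITON_KERNELS:
            # Deferred import: only runs when the package is actually present,
            # so this module still imports cleanly without the dependency.
            mod = importlib.import_module("triton_kernels")
            # "fused_forward" is a hypothetical attribute used for illustration.
            self.fast_forward = getattr(mod, "fused_forward", None)

    def __call__(self, x):
        if self.fast_forward is not None:
            return self.fast_forward(x)
        return x * 2  # stand-in for the non-Triton fallback path
```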
```diff
@@ -194,6 +203,15 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         no_combine: bool = False,
         routed_scaling_factor: Optional[float] = None,
     ) -> torch.Tensor:
+
+        from sglang.srt.layers.moe.ep_moe.layer import EPMoE
+
+        if isinstance(layer, EPMoE):
+            return layer.run_moe(
+                hidden_states=x,
+                topk_output=topk_output,
+            )
+
         return self.forward(
             x=x,
             layer=layer,
```
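`apply` imports `EPMoE` inside the function body rather than at module top level; together with the `TYPE_CHECKING` import added in the first hunk, this is the usual way to keep a runtime `isinstance` dispatch while avoiding a circular import at module load time. A self-contained sketch of the dispatch itself (both classes are stand-ins):

```python
class FusedMoELayer:
    """Stand-in for the generic fused MoE layer."""

    def forward(self, x: float) -> float:
        return x


class EPMoE(FusedMoELayer):
    """Stand-in for the expert-parallel variant with its own fused path."""

    def run_moe(self, x: float) -> float:
        return x * 2.0


def apply(layer: FusedMoELayer, x: float) -> float:
    # In the real code the EPMoE import sits here, inside the function, so it
    # is only evaluated at call time; that is what breaks the import cycle.
    if isinstance(layer, EPMoE):
        return layer.run_moe(x)  # expert-parallel path
    return layer.forward(x)  # default fused path


assert apply(EPMoE(), 3.0) == 6.0          # dispatched to run_moe
assert apply(FusedMoELayer(), 3.0) == 3.0  # default forward
```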
```diff
@@ -219,16 +237,12 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
     ) -> torch.Tensor:
 
         if self.use_triton_kernels:
-            return triton_kernel_moe_forward(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w2=layer.w2_weight,
-                topk_output=topk_output,
-            )
-            # gating_output=router_logits,
-            # topk=top_k,
-            # renormalize=renormalize,
-            # )
+            return self.triton_kernel_moe_forward(
+                hidden_states=x,
+                w1=layer.w13_weight,
+                w2=layer.w2_weight,
+                topk_output=topk_output,
+            )
         else:
             if _use_aiter:
                 assert not no_combine, "unsupported"
```
```diff
@@ -354,69 +368,3 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         raise NotImplementedError("The TPU backend currently does not support MoE.")
 
     forward_native = forward_cpu
-
-
-class UnquantizedEPMoEMethod(FusedMoEMethodBase, CustomOp):
-
-    def create_weights(
-        self,
-        layer: torch.nn.Module,
-        num_experts_per_partition: int,
-        hidden_size: int,
-        intermediate_size: int,
-        params_dtype: torch.dtype,
-        **extra_weight_attrs,
-    ):
-        # Fused gate_up_proj (column parallel)
-        w13_weight = torch.nn.Parameter(
-            torch.empty(
-                num_experts_per_partition,
-                2 * intermediate_size,
-                hidden_size,
-                dtype=params_dtype,
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w13_weight", w13_weight)
-        set_weight_attrs(w13_weight, extra_weight_attrs)
-
-        # down_proj (row parallel)
-        w2_weight = torch.nn.Parameter(
-            torch.empty(
-                num_experts_per_partition,
-                hidden_size,
-                intermediate_size,
-                dtype=params_dtype,
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w2_weight", w2_weight)
-        set_weight_attrs(w2_weight, extra_weight_attrs)
-
-        # scale
-        layer.register_parameter("w13_input_scale", None)
-        layer.register_parameter("w13_weight_scale", None)
-
-        ones_tensor = torch.ones(num_experts_per_partition, dtype=torch.float32)
-
-        w2_input_scale = torch.nn.Parameter(
-            ones_tensor,
-            requires_grad=False,
-        )
-        layer.register_parameter("w2_input_scale", w2_input_scale)
-        set_weight_attrs(w2_input_scale, extra_weight_attrs)
-
-        w2_weight_scale = torch.nn.Parameter(
-            ones_tensor,
-            requires_grad=False,
-        )
-        layer.register_parameter("w2_weight_scale", w2_weight_scale)
-        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
-
-    def apply(
-        self,
-        layer: torch.nn.Module,
-        hidden_states: torch.Tensor,
-        topk_output: TopKOutput,
-    ) -> torch.Tensor:
-        raise NotImplementedError
```
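The deleted `UnquantizedEPMoEMethod` followed the standard weight-creation protocol: allocate `requires_grad=False` parameters and attach them with `register_parameter` so checkpoint loaders can find them by name. For reference, a runnable sketch of that protocol with invented sizes:

```python
import torch


def create_weights(layer: torch.nn.Module, num_experts: int, hidden: int, inter: int) -> None:
    # Inference-only weights: requires_grad=False keeps autograd bookkeeping away.
    w13 = torch.nn.Parameter(
        torch.empty(num_experts, 2 * inter, hidden), requires_grad=False
    )
    # register_parameter exposes the tensor via state_dict()/named_parameters(),
    # which is how weight loaders locate it by name.
    layer.register_parameter("w13_weight", w13)


layer = torch.nn.Module()
create_weights(layer, num_experts=4, hidden=8, inter=16)
print(dict(layer.named_parameters())["w13_weight"].shape)  # torch.Size([4, 32, 8])
```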
**sglang/srt/layers/quantization/utils.py**

```diff
@@ -17,15 +17,6 @@ from sglang.srt.utils import cpu_has_amx_support, is_cpu, is_cuda, is_hip, is_npu
 if TYPE_CHECKING:
     from sglang.srt.layers.quantization.base_config import QuantizationConfig
 
-_is_cuda = is_cuda()
-_is_npu = is_npu()
-_is_cpu_amx_available = cpu_has_amx_support()
-_is_cpu = is_cpu()
-_is_hip = is_hip()
-
-if not (_is_cuda or _is_npu or (_is_cpu and _is_cpu_amx_available) or _is_hip):
-    from vllm._custom_ops import scaled_fp8_quant
-
 
 def is_layer_skipped(
     prefix: str,
```
**sglang/srt/layers/quantization/w4afp8.py**

```diff
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import logging
-from typing import Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
 
 import torch
 from torch.nn import Module
```
```diff
@@ -17,6 +17,9 @@ from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
 from sglang.srt.layers.quantization.utils import is_layer_skipped
 from sglang.srt.utils import set_weight_attrs
 
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.ep_moe.layer import EPMoE, TopKOutput
+
 ACTIVATION_SCHEMES = ["static", "dynamic"]
 
 logger = logging.getLogger(__name__)
```
```diff
@@ -84,13 +87,14 @@ class W4AFp8Config(QuantizationConfig):
         self, layer: torch.nn.Module, prefix: str
     ) -> Optional[QuantizeMethodBase]:
         from sglang.srt.layers.linear import LinearBase
+        from sglang.srt.layers.moe.ep_moe.layer import EPMoE
         from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
 
         if isinstance(layer, LinearBase):
             if is_layer_skipped(prefix, self.ignored_layers):
                 return UnquantizedLinearMethod()
             return Fp8LinearMethod(self)
-        elif isinstance(layer, FusedMoE):
+        elif isinstance(layer, EPMoE):
            return W4AFp8MoEMethod(self)
         return None
 
```
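`get_quant_method` acts as a factory: the config inspects the layer and returns the matching per-layer method object, or `None` for layers the scheme does not handle. A stripped-down sketch of that dispatch shape (all class names here are stand-ins, and the skip check is simplified to set membership):

```python
from typing import Optional


class LinearBase: ...
class EPMoE: ...
class UnquantizedLinearMethod: ...
class Fp8LinearMethod: ...
class W4AFp8MoEMethod: ...


class W4AFp8ConfigSketch:
    def __init__(self, ignored_layers: set):
        self.ignored_layers = ignored_layers

    def get_quant_method(self, layer: object, prefix: str) -> Optional[object]:
        # Dispatch on layer type, mirroring the LinearBase/EPMoE split above.
        if isinstance(layer, LinearBase):
            if prefix in self.ignored_layers:
                return UnquantizedLinearMethod()  # explicitly left unquantized
            return Fp8LinearMethod()
        if isinstance(layer, EPMoE):
            return W4AFp8MoEMethod()
        return None  # layer type not handled by this scheme


cfg = W4AFp8ConfigSketch(ignored_layers={"lm_head"})
assert isinstance(cfg.get_quant_method(LinearBase(), "layers.0.qkv"), Fp8LinearMethod)
assert isinstance(cfg.get_quant_method(LinearBase(), "lm_head"), UnquantizedLinearMethod)
```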
```diff
@@ -105,8 +109,8 @@ class W4AFp8MoEMethod(FusedMoEMethodBase):
 
     def create_weights(
         self,
-        layer: Module,
-        num_experts_per_partition: int,
+        layer: EPMoE,
+        num_experts: int,
         hidden_size: int,
         intermediate_size: int,
         params_dtype: torch.dtype,
```
```diff
@@ -117,7 +121,7 @@ class W4AFp8MoEMethod(FusedMoEMethodBase):
         # Fused gate_up_proj (column parallel)
         w13_weight = torch.nn.Parameter(
             torch.empty(
-                num_experts_per_partition,
+                num_experts,
                 intermediate_size * 2,
                 hidden_size // 2,
                 dtype=torch.int8,
```
```diff
@@ -130,7 +134,7 @@ class W4AFp8MoEMethod(FusedMoEMethodBase):
         # down_proj (row parallel)
         w2_weight = torch.nn.Parameter(
             torch.empty(
-                num_experts_per_partition,
+                num_experts,
                 hidden_size,
                 intermediate_size // 2,
                 dtype=torch.int8,
```
```diff
@@ -142,7 +146,7 @@ class W4AFp8MoEMethod(FusedMoEMethodBase):
 
         w13_weight_scale = torch.nn.Parameter(
             torch.zeros(
-                num_experts_per_partition,
+                num_experts,
                 2 * intermediate_size,
                 hidden_size // self.quant_config.group_size,
                 dtype=torch.float32,
```
```diff
@@ -154,7 +158,7 @@ class W4AFp8MoEMethod(FusedMoEMethodBase):
 
         w2_weight_scale = torch.nn.Parameter(
             torch.zeros(
-                num_experts_per_partition,
+                num_experts,
                 hidden_size,
                 intermediate_size // self.quant_config.group_size,
                 dtype=torch.float32,
```
```diff
@@ -166,14 +170,14 @@ class W4AFp8MoEMethod(FusedMoEMethodBase):
 
         # Input scales
         w13_input_scale = torch.nn.Parameter(
-            torch.ones((num_experts_per_partition, 2), dtype=torch.bfloat16),
+            torch.ones((num_experts, 2), dtype=torch.bfloat16),
             requires_grad=False,
         )
         layer.register_parameter("w13_input_scale", w13_input_scale)
         set_weight_attrs(w13_input_scale, extra_weight_attrs)
 
         w2_input_scale = torch.nn.Parameter(
-            torch.ones(num_experts_per_partition, dtype=torch.bfloat16),
+            torch.ones(num_experts, dtype=torch.bfloat16),
             requires_grad=False,
         )
         layer.register_parameter("w2_input_scale", w2_input_scale)
```
```diff
@@ -183,25 +187,25 @@ class W4AFp8MoEMethod(FusedMoEMethodBase):
         device = layer.w13_weight.device
 
         self.a_strides1 = torch.full(
-            (num_experts_per_partition, 3),
+            (num_experts, 3),
             hidden_size,
             device=device,
             dtype=torch.int64,
         )
         self.c_strides1 = torch.full(
-            (num_experts_per_partition, 3),
+            (num_experts, 3),
             2 * intermediate_size,
             device=device,
             dtype=torch.int64,
         )
         self.a_strides2 = torch.full(
-            (num_experts_per_partition, 3),
+            (num_experts, 3),
             intermediate_size,
             device=device,
             dtype=torch.int64,
         )
         self.c_strides2 = torch.full(
-            (num_experts_per_partition, 3),
+            (num_experts, 3),
             hidden_size,
             device=device,
             dtype=torch.int64,
```
```diff
@@ -212,13 +216,13 @@ class W4AFp8MoEMethod(FusedMoEMethodBase):
         self.s_strides2 = self.c_strides2
 
         self.expert_offsets = torch.empty(
-            (num_experts_per_partition + 1), dtype=torch.int32, device=device
+            (num_experts + 1), dtype=torch.int32, device=device
         )
         self.problem_sizes1 = torch.empty(
-            (num_experts_per_partition, 3), dtype=torch.int32, device=device
+            (num_experts, 3), dtype=torch.int32, device=device
         )
         self.problem_sizes2 = torch.empty(
-            (num_experts_per_partition, 3), dtype=torch.int32, device=device
+            (num_experts, 3), dtype=torch.int32, device=device
         )
 
         return
```
```diff
@@ -266,3 +270,50 @@ class W4AFp8MoEMethod(FusedMoEMethodBase):
             [w2_input_scale_max], dtype=dtype, device=device
         )
         layer.w2_input_scale = Parameter(new_w2_input_scale, requires_grad=False)
+
+    def apply(
+        self,
+        layer: EPMoE,
+        hidden_states: torch.Tensor,
+        topk_output: TopKOutput,
+    ) -> torch.Tensor:
+
+        # TODO(ch-wan): move it out of this class
+        from sglang.srt.layers.moe.cutlass_w4a8_moe import cutlass_w4a8_moe
+
+        topk_ids, topk_weights, _ = topk_output
+        local_topk_ids = topk_ids
+        if layer.expert_map is not None:
+            "Translate info from expert_map to topk_ids"
+            local_topk_ids = torch.where(
+                layer.expert_map[topk_ids] != layer.num_experts,
+                layer.expert_map[topk_ids],
+                layer.num_experts,
+            )
+
+        return cutlass_w4a8_moe(
+            layer.start_expert_id,
+            layer.end_expert_id,
+            layer.num_experts,
+            hidden_states,
+            layer.w13_weight,
+            layer.w2_weight,
+            layer.w13_weight_scale_inv,
+            layer.w2_weight_scale_inv,
+            topk_weights,
+            topk_ids,
+            local_topk_ids,
+            self.a_strides1,
+            self.b_strides1,
+            self.c_strides1,
+            self.a_strides2,
+            self.b_strides2,
+            self.c_strides2,
+            self.s_strides13,
+            self.s_strides2,
+            self.expert_offsets,
+            self.problem_sizes1,
+            self.problem_sizes2,
+            layer.w13_input_scale,
+            layer.w2_input_scale,
+        )
```
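In the new `apply`, `expert_map` translates globally numbered top-k expert ids into this rank's local expert slots before the grouped GEMM. A toy illustration of that translation (sizes invented; here −1 marks experts hosted on other ranks, a common convention, whereas the diff's variant compares against `num_experts` instead):

```python
import torch

num_global_experts = 8          # experts across all expert-parallel ranks
sentinel = num_global_experts   # value meaning "expert not on this rank"

# expert_map[g] = local slot of global expert g, or -1 if it lives elsewhere.
# This rank hosts global experts 4..7 in local slots 0..3.
expert_map = torch.tensor([-1, -1, -1, -1, 0, 1, 2, 3])

# Per-token top-k choices, in global expert ids.
topk_ids = torch.tensor([[4, 1],
                         [7, 5]])

local_topk_ids = torch.where(
    expert_map[topk_ids] != -1,  # is the expert hosted here?
    expert_map[topk_ids],        # yes: use its local slot
    sentinel,                    # no: sentinel so the kernel can skip it
)
print(local_topk_ids)
# tensor([[0, 8],
#         [3, 1]])
```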
**sglang/srt/layers/radix_attention.py**

```diff
@@ -12,14 +12,16 @@
 # limitations under the License.
 # ==============================================================================
 """Radix attention."""
+from __future__ import annotations
 
 from enum import Enum
-from typing import Optional
+from typing import TYPE_CHECKING, Optional
 
 from torch import nn
 
-from sglang.srt.layers.quantization.base_config import QuantizationConfig
-from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+if TYPE_CHECKING:
+    from sglang.srt.layers.quantization.base_config import QuantizationConfig
+    from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 
 
 class AttentionType(Enum):
```
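
This radix_attention.py hunk is the standard recipe for type-only imports: `from __future__ import annotations` turns every annotation into a lazily evaluated string, so names imported under `TYPE_CHECKING` can appear in signatures without ever being imported at runtime, which breaks import cycles and trims startup cost. A minimal self-contained demonstration (`Decimal` stands in for a heavyweight import such as `ForwardBatch`):

```python
from __future__ import annotations  # annotations become strings, never evaluated eagerly

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only static type checkers execute this branch; at runtime the import
    # never happens, so the module below is never actually loaded here.
    from decimal import Decimal


def describe(batch: Decimal) -> str:
    # At runtime the annotation is just the string "Decimal".
    return f"got {batch}"


print(describe.__annotations__)  # {'batch': 'Decimal', 'return': 'str'}
```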
|