PyPI - sglang - Versions diffs - 0.5.4__py3-none-any.whl → 0.5.4.post2__py3-none-any.whl - Mend

sglang 0.5.4__py3-none-any.whl → 0.5.4.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (195)

sglang/bench_one_batch.py +149 -34
sglang/bench_serving.py +73 -14
sglang/compile_deep_gemm.py +13 -7
sglang/launch_server.py +2 -0
sglang/srt/batch_invariant_ops/__init__.py +2 -0
sglang/srt/batch_invariant_ops/batch_invariant_ops.py +221 -4
sglang/srt/checkpoint_engine/__init__.py +9 -0
sglang/srt/checkpoint_engine/update.py +317 -0
sglang/srt/compilation/backend.py +1 -1
sglang/srt/configs/__init__.py +2 -0
sglang/srt/configs/deepseek_ocr.py +542 -10
sglang/srt/configs/deepseekvl2.py +95 -194
sglang/srt/configs/kimi_linear.py +160 -0
sglang/srt/configs/mamba_utils.py +66 -0
sglang/srt/configs/model_config.py +30 -7
sglang/srt/constants.py +7 -0
sglang/srt/debug_utils/tensor_dump_forward_hook.py +149 -0
sglang/srt/disaggregation/decode.py +34 -6
sglang/srt/disaggregation/nixl/conn.py +2 -2
sglang/srt/disaggregation/prefill.py +25 -3
sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -1
sglang/srt/distributed/parallel_state.py +9 -12
sglang/srt/entrypoints/engine.py +31 -20
sglang/srt/entrypoints/grpc_server.py +0 -1
sglang/srt/entrypoints/http_server.py +94 -94
sglang/srt/entrypoints/openai/protocol.py +7 -1
sglang/srt/entrypoints/openai/serving_chat.py +42 -0
sglang/srt/entrypoints/openai/serving_completions.py +10 -0
sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
sglang/srt/environ.py +23 -2
sglang/srt/eplb/expert_distribution.py +64 -1
sglang/srt/eplb/expert_location.py +106 -36
sglang/srt/function_call/function_call_parser.py +2 -0
sglang/srt/function_call/minimax_m2.py +367 -0
sglang/srt/grpc/compile_proto.py +3 -0
sglang/srt/layers/activation.py +6 -0
sglang/srt/layers/attention/ascend_backend.py +233 -5
sglang/srt/layers/attention/attention_registry.py +3 -0
sglang/srt/layers/attention/fla/chunk_delta_h.py +61 -32
sglang/srt/layers/attention/fla/fused_recurrent.py +17 -4
sglang/srt/layers/attention/fla/kda.py +1359 -0
sglang/srt/layers/attention/fla/layernorm_gated.py +7 -1
sglang/srt/layers/attention/flashattention_backend.py +19 -8
sglang/srt/layers/attention/flashinfer_backend.py +10 -1
sglang/srt/layers/attention/flashinfer_mla_backend.py +21 -11
sglang/srt/layers/attention/flashmla_backend.py +1 -1
sglang/srt/layers/attention/hybrid_linear_attn_backend.py +223 -0
sglang/srt/layers/attention/mamba/mamba.py +20 -11
sglang/srt/layers/attention/nsa/dequant_k_cache.py +138 -6
sglang/srt/layers/attention/nsa/nsa_indexer.py +45 -22
sglang/srt/layers/attention/nsa/quant_k_cache.py +44 -12
sglang/srt/layers/attention/nsa/transform_index.py +1 -1
sglang/srt/layers/attention/nsa_backend.py +157 -23
sglang/srt/layers/attention/triton_backend.py +4 -1
sglang/srt/layers/attention/trtllm_mha_backend.py +10 -4
sglang/srt/layers/attention/trtllm_mla_backend.py +11 -15
sglang/srt/layers/attention/utils.py +78 -0
sglang/srt/layers/communicator.py +24 -1
sglang/srt/layers/deep_gemm_wrapper/compile_utils.py +1 -1
sglang/srt/layers/layernorm.py +35 -6
sglang/srt/layers/logits_processor.py +9 -20
sglang/srt/layers/moe/cutlass_w4a8_moe.py +138 -0
sglang/srt/layers/moe/ep_moe/kernels.py +194 -0
sglang/srt/layers/moe/ep_moe/layer.py +78 -289
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json +164 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +68 -22
sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +43 -3
sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +106 -26
sglang/srt/layers/moe/fused_moe_triton/layer.py +3 -3
sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +7 -4
sglang/srt/layers/moe/moe_runner/deep_gemm.py +340 -55
sglang/srt/layers/moe/moe_runner/runner.py +3 -0
sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
sglang/srt/layers/moe/token_dispatcher/__init__.py +4 -4
sglang/srt/layers/moe/token_dispatcher/base.py +11 -5
sglang/srt/layers/moe/token_dispatcher/deepep.py +25 -18
sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
sglang/srt/layers/moe/topk.py +35 -10
sglang/srt/layers/moe/utils.py +3 -4
sglang/srt/layers/pooler.py +21 -2
sglang/srt/layers/quantization/__init__.py +13 -84
sglang/srt/layers/quantization/auto_round.py +394 -0
sglang/srt/layers/quantization/awq.py +0 -3
sglang/srt/layers/quantization/base_config.py +7 -0
sglang/srt/layers/quantization/fp8.py +68 -63
sglang/srt/layers/quantization/fp8_kernel.py +1 -1
sglang/srt/layers/quantization/fp8_utils.py +2 -2
sglang/srt/layers/quantization/gguf.py +566 -0
sglang/srt/layers/quantization/modelopt_quant.py +168 -11
sglang/srt/layers/quantization/mxfp4.py +30 -38
sglang/srt/layers/quantization/unquant.py +23 -45
sglang/srt/layers/quantization/w4afp8.py +38 -2
sglang/srt/layers/radix_attention.py +5 -2
sglang/srt/layers/rotary_embedding.py +130 -46
sglang/srt/layers/sampler.py +12 -1
sglang/srt/lora/lora_registry.py +9 -0
sglang/srt/managers/async_mm_data_processor.py +122 -0
sglang/srt/managers/data_parallel_controller.py +30 -3
sglang/srt/managers/detokenizer_manager.py +3 -0
sglang/srt/managers/io_struct.py +29 -4
sglang/srt/managers/multi_tokenizer_mixin.py +22 -1
sglang/srt/managers/schedule_batch.py +74 -15
sglang/srt/managers/scheduler.py +185 -144
sglang/srt/managers/scheduler_metrics_mixin.py +22 -14
sglang/srt/managers/scheduler_output_processor_mixin.py +40 -3
sglang/srt/managers/scheduler_pp_mixin.py +7 -2
sglang/srt/managers/scheduler_profiler_mixin.py +3 -4
sglang/srt/managers/scheduler_runtime_checker_mixin.py +45 -0
sglang/srt/managers/scheduler_update_weights_mixin.py +18 -3
sglang/srt/managers/session_controller.py +6 -5
sglang/srt/managers/tokenizer_manager.py +165 -78
sglang/srt/managers/tp_worker.py +24 -1
sglang/srt/mem_cache/base_prefix_cache.py +23 -4
sglang/srt/mem_cache/common.py +1 -0
sglang/srt/mem_cache/hicache_storage.py +7 -1
sglang/srt/mem_cache/memory_pool.py +253 -57
sglang/srt/mem_cache/memory_pool_host.py +12 -5
sglang/srt/mem_cache/radix_cache.py +4 -0
sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +1 -1
sglang/srt/metrics/collector.py +46 -3
sglang/srt/model_executor/cuda_graph_runner.py +15 -3
sglang/srt/model_executor/forward_batch_info.py +55 -14
sglang/srt/model_executor/model_runner.py +77 -170
sglang/srt/model_executor/npu_graph_runner.py +7 -3
sglang/srt/model_executor/piecewise_cuda_graph_runner.py +22 -12
sglang/srt/model_loader/weight_utils.py +1 -1
sglang/srt/models/bailing_moe.py +9 -2
sglang/srt/models/deepseek_nextn.py +11 -2
sglang/srt/models/deepseek_v2.py +296 -78
sglang/srt/models/glm4.py +391 -77
sglang/srt/models/glm4_moe.py +322 -354
sglang/srt/models/glm4_moe_nextn.py +4 -14
sglang/srt/models/glm4v.py +196 -55
sglang/srt/models/glm4v_moe.py +29 -197
sglang/srt/models/gpt_oss.py +1 -10
sglang/srt/models/kimi_linear.py +678 -0
sglang/srt/models/llama4.py +1 -1
sglang/srt/models/llama_eagle3.py +11 -1
sglang/srt/models/longcat_flash.py +2 -2
sglang/srt/models/minimax_m2.py +922 -0
sglang/srt/models/nvila.py +355 -0
sglang/srt/models/nvila_lite.py +184 -0
sglang/srt/models/qwen2.py +23 -2
sglang/srt/models/qwen2_moe.py +30 -15
sglang/srt/models/qwen3.py +35 -5
sglang/srt/models/qwen3_moe.py +18 -12
sglang/srt/models/qwen3_next.py +7 -0
sglang/srt/multimodal/customized_mm_processor_utils.py +35 -0
sglang/srt/multimodal/processors/base_processor.py +1 -0
sglang/srt/multimodal/processors/glm4v.py +1 -1
sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
sglang/srt/multimodal/processors/points_v15_chat.py +2 -2
sglang/srt/multiplex/multiplexing_mixin.py +209 -0
sglang/srt/multiplex/pdmux_context.py +164 -0
sglang/srt/parser/conversation.py +7 -1
sglang/srt/parser/reasoning_parser.py +28 -1
sglang/srt/sampling/custom_logit_processor.py +67 -1
sglang/srt/sampling/penaltylib/frequency_penalty.py +6 -8
sglang/srt/sampling/penaltylib/min_new_tokens.py +7 -8
sglang/srt/sampling/penaltylib/orchestrator.py +43 -3
sglang/srt/sampling/penaltylib/presence_penalty.py +6 -8
sglang/srt/server_args.py +459 -199
sglang/srt/single_batch_overlap.py +2 -4
sglang/srt/speculative/draft_utils.py +16 -0
sglang/srt/speculative/eagle_info.py +42 -36
sglang/srt/speculative/eagle_info_v2.py +68 -25
sglang/srt/speculative/eagle_utils.py +261 -16
sglang/srt/speculative/eagle_worker.py +11 -3
sglang/srt/speculative/eagle_worker_v2.py +15 -9
sglang/srt/speculative/spec_info.py +305 -31
sglang/srt/speculative/spec_utils.py +44 -8
sglang/srt/tracing/trace.py +121 -12
sglang/srt/utils/common.py +142 -74
sglang/srt/utils/hf_transformers_utils.py +38 -12
sglang/srt/utils/torch_memory_saver_adapter.py +20 -0
sglang/test/kits/radix_cache_server_kit.py +50 -0
sglang/test/runners.py +31 -7
sglang/test/simple_eval_common.py +5 -3
sglang/test/simple_eval_humaneval.py +1 -0
sglang/test/simple_eval_math.py +1 -0
sglang/test/simple_eval_mmlu.py +1 -0
sglang/test/simple_eval_mmmu_vlm.py +1 -0
sglang/test/test_deterministic.py +235 -12
sglang/test/test_deterministic_utils.py +2 -1
sglang/test/test_utils.py +7 -1
sglang/version.py +1 -1
{sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/METADATA +15 -28
{sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/RECORD +194 -175
sglang/srt/models/vila.py +0 -306
/sglang/test/{kit_matched_stop.py → kits/matched_stop_kit.py} +0 -0
{sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/WHEEL +0 -0
{sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/licenses/LICENSE +0 -0
{sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/top_level.txt +0 -0

sglang/srt/layers/moe/token_dispatcher/__init__.py CHANGED Viewed

@@ -12,9 +12,9 @@ from sglang.srt.layers.moe.token_dispatcher.deepep import (
     DeepEPConfig,
     DeepEPDispatcher,
     DeepEPLLCombineInput,
-    DeepEPLLOutput,
+    DeepEPLLDispatchOutput,
     DeepEPNormalCombineInput,
-    DeepEPNormalOutput,
+    DeepEPNormalDispatchOutput,
 )
 from sglang.srt.layers.moe.token_dispatcher.mooncake import (
     MooncakeCombineInput,
@@ -44,8 +44,8 @@ __all__ = [
     "StandardCombineInput",
     "DeepEPConfig",
     "DeepEPDispatcher",
-    "DeepEPNormalOutput",
-    "DeepEPLLOutput",
+    "DeepEPNormalDispatchOutput",
+    "DeepEPLLDispatchOutput",
     "DeepEPLLCombineInput",
     "DeepEPNormalCombineInput",
 ]

sglang/srt/layers/moe/token_dispatcher/base.py CHANGED Viewed

@@ -9,9 +9,9 @@ import torch
 if TYPE_CHECKING:
     from sglang.srt.layers.moe.token_dispatcher import (
         DeepEPLLCombineInput,
-        DeepEPLLOutput,
+        DeepEPLLDispatchOutput,
         DeepEPNormalCombineInput,
-        DeepEPNormalOutput,
+        DeepEPNormalDispatchOutput,
         StandardCombineInput,
         StandardDispatchOutput,
     )
@@ -28,22 +28,28 @@ class DispatchOutputChecker:
     ) -> TypeGuard[StandardDispatchOutput]:
         return dispatch_output.format.is_standard()
+    @staticmethod
+    def format_is_triton_kernels(
+        dispatch_output: DispatchOutput,
+    ) -> TypeGuard[StandardDispatchOutput]:
+        return dispatch_output.format.is_standard()
     @staticmethod
     def format_is_deepep_normal(
         dispatch_output: DispatchOutput,
-    ) -> TypeGuard[DeepEPNormalOutput]:
+    ) -> TypeGuard[DeepEPNormalDispatchOutput]:
         return dispatch_output.format.is_deepep_normal()
     @staticmethod
     def format_is_deepep_ll(
         dispatch_output: DispatchOutput,
-    ) -> TypeGuard[DeepEPLLOutput]:
+    ) -> TypeGuard[DeepEPLLDispatchOutput]:
         return dispatch_output.format.is_deepep_ll()
     @staticmethod
     def format_is_deepep(
         dispatch_output: DispatchOutput,
-    ) -> TypeGuard[Union[DeepEPNormalOutput, DeepEPLLOutput]]:
+    ) -> TypeGuard[Union[DeepEPNormalDispatchOutput, DeepEPLLDispatchOutput]]:
         return dispatch_output.format.is_deepep()

sglang/srt/layers/moe/token_dispatcher/deepep.py CHANGED Viewed

@@ -58,7 +58,7 @@ _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and is_hip()
 logger = logging.getLogger(__name__)
-class DeepEPNormalOutput(NamedTuple):
+class DeepEPNormalDispatchOutput(NamedTuple):
     """DeepEP normal dispatch output."""
     hidden_states: torch.Tensor
@@ -72,7 +72,7 @@ class DeepEPNormalOutput(NamedTuple):
         return DispatchOutputFormat.DEEPEP_NORMAL
-class DeepEPLLOutput(NamedTuple):
+class DeepEPLLDispatchOutput(NamedTuple):
     """DeepEP low latency dispatch output."""
     hidden_states: torch.Tensor
@@ -87,14 +87,16 @@ class DeepEPLLOutput(NamedTuple):
         return DispatchOutputFormat.DEEPEP_LL
-assert isinstance(DeepEPNormalOutput, DispatchOutput)
-assert isinstance(DeepEPLLOutput, DispatchOutput)
+assert isinstance(DeepEPNormalDispatchOutput, DispatchOutput)
+assert isinstance(DeepEPLLDispatchOutput, DispatchOutput)
 class DeepEPNormalCombineInput(NamedTuple):
     """DeepEP normal combine input."""
-    pass
+    hidden_states: torch.Tensor
+    topk_ids: torch.Tensor
+    topk_weights: torch.Tensor
     @property
     def format(self) -> CombineInputFormat:
@@ -104,7 +106,9 @@ class DeepEPNormalCombineInput(NamedTuple):
 class DeepEPLLCombineInput(NamedTuple):
     """DeepEP low latency combine input."""
-    pass
+    hidden_states: torch.Tensor
+    topk_ids: torch.Tensor
+    topk_weights: torch.Tensor
     @property
     def format(self) -> CombineInputFormat:
@@ -327,7 +331,7 @@ class _DeepEPDispatcherImplBase:
         hidden_states: torch.Tensor,
         topk_ids: torch.Tensor,
         topk_weights: torch.Tensor,
-        overlap_args: Optional["CombineOverlapArgs"],
+        overlap_args: Optional[CombineOverlapArgs] = None,
     ):
         raise NotImplementedError
@@ -383,7 +387,7 @@ class _DeepEPDispatcherImplNormal(_DeepEPDispatcherImplBase):
         else:
             hidden_states_scale = None
-        return DeepEPNormalOutput(
+        return DeepEPNormalDispatchOutput(
             hidden_states,
             hidden_states_scale,
             topk_ids,
@@ -457,7 +461,7 @@ class _DeepEPDispatcherImplNormal(_DeepEPDispatcherImplBase):
         hidden_states: torch.Tensor,
         topk_ids: torch.Tensor,
         topk_weights: torch.Tensor,
-        overlap_args: Optional["CombineOverlapArgs"],
+        overlap_args: Optional[CombineOverlapArgs] = None,
     ):
         if deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM or _use_aiter or _is_npu:
@@ -562,7 +566,7 @@ class _DeepEPDispatcherImplLowLatency(_DeepEPDispatcherImplBase):
         else:
             hidden_states_scale = None
-        deepep_output = DeepEPLLOutput(
+        deepep_output = DeepEPLLDispatchOutput(
             hidden_states,
             hidden_states_scale,
             topk_ids,
@@ -613,7 +617,7 @@ class _DeepEPDispatcherImplLowLatency(_DeepEPDispatcherImplBase):
         hidden_states: torch.Tensor,
         topk_ids: torch.Tensor,
         topk_weights: torch.Tensor,
-        overlap_args: Optional["CombineOverlapArgs"],
+        overlap_args: Optional[CombineOverlapArgs] = None,
     ):
         hidden_states, event, hook = self._combine_core(
             hidden_states,
@@ -639,7 +643,7 @@ class _DeepEPDispatcherImplLowLatency(_DeepEPDispatcherImplBase):
         hidden_states: torch.Tensor,
         topk_ids: torch.Tensor,
         topk_weights: torch.Tensor,
-        overlap_args: Optional["CombineOverlapArgs"],
+        overlap_args: Optional[CombineOverlapArgs] = None,
     ):
         buffer = self._get_buffer()
@@ -756,18 +760,21 @@ class DeepEPDispatcher(BaseDispatcher):
         del self._dispatch_intermediate_state
         return self._get_impl().dispatch_b(*inner_state)
-    def combine(self, *args, **kwargs) -> Tuple:
-        self.combine_a(*args, **kwargs)
+    def combine(
+        self,
+        combine_input: CombineInput,
+        overlap_args: Optional[CombineOverlapArgs] = None,
+    ) -> Tuple:
+        self.combine_a(combine_input, overlap_args)
         ret = self.combine_b()
         return ret
     def combine_a(
         self,
-        hidden_states: torch.Tensor,
-        topk_ids: torch.Tensor,
-        topk_weights: torch.Tensor,
-        overlap_args: Optional["CombineOverlapArgs"] = None,
+        combine_input: CombineInput,
+        overlap_args: Optional[CombineOverlapArgs] = None,
     ):
+        hidden_states, topk_ids, topk_weights = combine_input
         self._update_stage(_Stage.AFTER_DISPATCH_B, _Stage.AFTER_COMBINE_A)
         inner_state = self._get_impl().combine_a(
             hidden_states=hidden_states,

sglang/srt/layers/moe/token_dispatcher/standard.py CHANGED Viewed

@@ -88,7 +88,7 @@ class StandardDispatcher(BaseDispatcher):
                 topk_output = topk_output._replace(
                     topk_ids=self.local_expert_mapping[topk_output.topk_ids]
                 )
-            elif TopKOutputChecker.format_is_triton_kernel(topk_output):
+            elif TopKOutputChecker.format_is_triton_kernels(topk_output):
                 raise NotImplementedError()
         return StandardDispatchOutput(

sglang/srt/layers/moe/topk.py CHANGED Viewed

@@ -111,10 +111,10 @@ class TopKOutputChecker:
         return topk_output.format.is_standard()
     @staticmethod
-    def format_is_triton_kernel(
+    def format_is_triton_kernels(
         topk_output: TopKOutput,
     ) -> TypeGuard[TritonKernelTopKOutput]:
-        return topk_output.format.is_triton_kernel()
+        return topk_output.format.is_triton_kernels()
     @staticmethod
     def format_is_bypassed(topk_output: TopKOutput) -> TypeGuard[BypassedTopKOutput]:
@@ -129,7 +129,7 @@ class TopKOutputFormat(Enum):
     def is_standard(self) -> bool:
         return self == TopKOutputFormat.STANDARD
-    def is_triton_kernel(self) -> bool:
+    def is_triton_kernels(self) -> bool:
         return self == TopKOutputFormat.TRITON_KERNEL
     def is_bypassed(self) -> bool:
@@ -254,7 +254,7 @@ class TopK(CustomOp):
     ) -> TopKOutput:
         if self.topk_config.output_format is not None:
             output_format = self.topk_config.output_format
-        elif get_moe_runner_backend().is_triton_kernel():
+        elif get_moe_runner_backend().is_triton_kernels():
             output_format = TopKOutputFormat.TRITON_KERNEL
         elif (
             should_use_flashinfer_trtllm_moe()
@@ -314,16 +314,41 @@ class TopK(CustomOp):
         num_token_non_padded: Optional[torch.Tensor] = None,
         expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
     ) -> TopKOutput:
-        global_num_experts = router_logits.shape[-1]
-        # NOTE: now npu_moe_gating_top_k can only support `group_count=256` pattern
-        if global_num_experts == 256:
+        use_grouped_topk = self.topk_config.use_grouped_topk
+        torch_native = self.topk_config.torch_native
+        renormalize = self.topk_config.renormalize
+        if not use_grouped_topk and not torch_native:
+            topk_weights, topk_ids, _ = torch_npu.npu_moe_gating_top_k_softmax(
+                router_logits,
+                k=self.topk_config.top_k,
+            )
+            topk_weights = topk_weights.to(torch.float32)
+            if renormalize:
+                topk_weights_sum = (
+                    topk_weights.sum(dim=-1, keepdim=True)
+                    if self.topk_config.num_fused_shared_experts == 0
+                    else topk_weights[:, :-1].sum(dim=-1, keepdim=True)
+                )
+                topk_weights = topk_weights / topk_weights_sum
+            if expert_location_dispatch_info is not None:
+                topk_ids = topk_ids_logical_to_physical(
+                    topk_ids, expert_location_dispatch_info
+                )
+            get_global_expert_distribution_recorder().on_select_experts(
+                topk_ids=topk_ids
+            )
+            return StandardTopKOutput(topk_weights, topk_ids, _)
+        if use_grouped_topk and not torch_native and router_logits.shape[-1] == 256:
+            # NOTE: now npu_moe_gating_top_k can only support `group_count=256` pattern
             routed_scaling_factor = self.topk_config.routed_scaling_factor or 1
-            router_logits = router_logits.to(torch.float32)
             topk_weights, topk_ids, _ = torch_npu.npu_moe_gating_top_k(
-                router_logits,
+                router_logits.to(torch.float32),
                 k=self.topk_config.top_k,
                 bias=self.topk_config.correction_bias.to(torch.float32),
                 k_group=self.topk_config.topk_group,
@@ -335,7 +360,7 @@ class TopK(CustomOp):
                 eps=float(1e-20),
             )
-            if self.topk_config.renormalize:
+            if renormalize:
                 topk_weights_sum = (
                     topk_weights.sum(dim=-1, keepdim=True)
                     if self.topk_config.num_fused_shared_experts == 0

sglang/srt/layers/moe/utils.py CHANGED Viewed

@@ -51,7 +51,7 @@ class MoeRunnerBackend(Enum):
     AUTO = "auto"
     DEEP_GEMM = "deep_gemm"
     TRITON = "triton"
-    TRITON_KERNEL = "triton_kernel"
+    TRITON_KERNELS = "triton_kernel"
     FLASHINFER_TRTLLM = "flashinfer_trtllm"
     FLASHINFER_CUTLASS = "flashinfer_cutlass"
     FLASHINFER_MXFP4 = "flashinfer_mxfp4"
@@ -67,8 +67,8 @@ class MoeRunnerBackend(Enum):
     def is_triton(self):
         return self == MoeRunnerBackend.TRITON
-    def is_triton_kernel(self):
-        return self == MoeRunnerBackend.TRITON_KERNEL
+    def is_triton_kernels(self):
+        return self == MoeRunnerBackend.TRITON_KERNELS
     def is_flashinfer_trtllm(self):
         return self == MoeRunnerBackend.FLASHINFER_TRTLLM
@@ -152,7 +152,6 @@ def initialize_moe_config(server_args: ServerArgs):
 def get_moe_a2a_backend() -> MoeA2ABackend:
     global MOE_A2A_BACKEND
     if MOE_A2A_BACKEND is None:
-        logger.warning("MOE_A2A_BACKEND is not initialized, using default backend")
         MOE_A2A_BACKEND = MoeA2ABackend.NONE
     return MOE_A2A_BACKEND

sglang/srt/layers/pooler.py CHANGED Viewed

@@ -20,7 +20,9 @@ class PoolingType(IntEnum):
 @dataclass
 class EmbeddingPoolerOutput:
-    embeddings: torch.Tensor
+    # Pooler can return list[tensor] instead of tensor if the dimension of each tensor in the batch is different
+    # due to different per-request matryoshka dim truncation
+    embeddings: torch.Tensor | list[torch.Tensor]
 class Pooler(nn.Module):
@@ -42,6 +44,7 @@ class Pooler(nn.Module):
     def forward(
         self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
     ) -> EmbeddingPoolerOutput:
         if self.pooling_type == PoolingType.LAST:
             last_token_indices = torch.cumsum(forward_batch.extend_seq_lens, dim=0) - 1
             pooled_data = hidden_states[last_token_indices]
@@ -53,8 +56,24 @@ class Pooler(nn.Module):
         else:
             raise ValueError(f"Invalid pooling type: {self.pooling_type}")
+        if forward_batch.dimensions is not None:
+            all_same_dimensions = len(set(forward_batch.dimensions)) == 1
+            if all_same_dimensions:
+                pooled_data = pooled_data[..., : forward_batch.dimensions[0]]
+            else:
+                pooled_data = [
+                    tensor[..., :dim]
+                    for tensor, dim in zip(pooled_data, forward_batch.dimensions)
+                ]
         if self.normalize:
-            pooled_data = nn.functional.normalize(pooled_data, p=2, dim=1)
+            if isinstance(pooled_data, list):
+                pooled_data = [
+                    nn.functional.normalize(tensor, p=2, dim=-1)
+                    for tensor in pooled_data
+                ]
+            else:
+                pooled_data = nn.functional.normalize(pooled_data, p=2, dim=-1)
         return EmbeddingPoolerOutput(embeddings=pooled_data)

sglang/srt/layers/quantization/__init__.py CHANGED Viewed

@@ -7,36 +7,16 @@ from typing import TYPE_CHECKING, Dict, Optional, Type
 import torch
-try:
-    from vllm.model_executor.layers.quantization.aqlm import AQLMConfig
-    from vllm.model_executor.layers.quantization.bitsandbytes import BitsAndBytesConfig
-    from vllm.model_executor.layers.quantization.deepspeedfp import DeepSpeedFPConfig
-    from vllm.model_executor.layers.quantization.experts_int8 import ExpertsInt8Config
-    from vllm.model_executor.layers.quantization.gguf import GGUFConfig
-    from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
-        GPTQMarlin24Config,
-    )
-    from vllm.model_executor.layers.quantization.marlin import MarlinConfig
-    from vllm.model_executor.layers.quantization.qqq import QQQConfig
-    from vllm.model_executor.layers.quantization.tpu_int8 import Int8TpuConfig
-    VLLM_AVAILABLE = True
-except ImportError as e:
-    VLLM_AVAILABLE = False
-    VLLM_IMPORT_ERROR = e
-    # Define empty classes as placeholders when vllm is not available
-    class DummyConfig:
-        def override_quantization_method(self, *args, **kwargs):
-            return None
-    AQLMConfig = BitsAndBytesConfig = CompressedTensorsConfig = DeepSpeedFPConfig = (
-        ExpertsInt8Config
-    ) = GGUFConfig = GPTQMarlin24Config = MarlinConfig = QQQConfig = Int8TpuConfig = (
-        DummyConfig
-    )
+# Define empty classes as placeholders when vllm is not available
+class DummyConfig:
+    def override_quantization_method(self, *args, **kwargs):
+        return None
+CompressedTensorsConfig = DummyConfig
+from sglang.srt.layers.quantization.auto_round import AutoRoundConfig
 from sglang.srt.layers.quantization.awq import AWQConfig, AWQMarlinConfig
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.quantization.blockwise_int8 import BlockInt8Config
@@ -45,6 +25,7 @@ from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors import
 )
 from sglang.srt.layers.quantization.fp8 import Fp8Config
 from sglang.srt.layers.quantization.fpgemm_fp8 import FBGEMMFp8Config
+from sglang.srt.layers.quantization.gguf import GGUFConfig
 from sglang.srt.layers.quantization.gptq import GPTQConfig, GPTQMarlinConfig
 from sglang.srt.layers.quantization.modelopt_quant import (
     ModelOptFp4Config,
@@ -64,7 +45,7 @@ _is_mxfp_supported = mxfp_supported()
 if TYPE_CHECKING:
     from sglang.srt.layers.moe.topk import TopKOutput
-# Base quantization methods that don't depend on vllm
+# Base quantization methods
 BASE_QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
     "fp8": Fp8Config,
     "blockwise_int8": BlockInt8Config,
@@ -75,6 +56,7 @@ BASE_QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
     "w8a8_fp8": W8A8Fp8Config,
     "awq": AWQConfig,
     "awq_marlin": AWQMarlinConfig,
+    "gguf": GGUFConfig,
     "gptq": GPTQConfig,
     "gptq_marlin": GPTQMarlinConfig,
     "moe_wna16": MoeWNA16Config,
@@ -83,6 +65,7 @@ BASE_QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
     "w4afp8": W4AFp8Config,
     "petit_nvfp4": PetitNvFp4Config,
     "fbgemm_fp8": FBGEMMFp8Config,
+    "auto-round": AutoRoundConfig,
 }
@@ -102,20 +85,8 @@ elif _is_mxfp_supported and is_hip():
             "mxfp4": Mxfp4Config,
         }
     )
-# VLLM-dependent quantization methods
-VLLM_QUANTIZATION_METHODS = {
-    "aqlm": AQLMConfig,
-    "deepspeedfp": DeepSpeedFPConfig,
-    "tpu_int8": Int8TpuConfig,
-    "marlin": MarlinConfig,
-    "gguf": GGUFConfig,
-    "gptq_marlin_24": GPTQMarlin24Config,
-    "bitsandbytes": BitsAndBytesConfig,
-    "qqq": QQQConfig,
-    "experts_int8": ExpertsInt8Config,
-}
-QUANTIZATION_METHODS = {**BASE_QUANTIZATION_METHODS, **VLLM_QUANTIZATION_METHODS}
+QUANTIZATION_METHODS = {**BASE_QUANTIZATION_METHODS}
 def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
@@ -124,50 +95,8 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
             f"Invalid quantization method: {quantization}. "
             f"Available methods: {list(QUANTIZATION_METHODS.keys())}"
         )
-    if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
-        raise ValueError(
-            f"{quantization} quantization requires some operators from vllm. "
-            f"Please install vllm by `pip install vllm==0.9.0.1`\n"
-            f"Import error: {VLLM_IMPORT_ERROR}"
-        )
     return QUANTIZATION_METHODS[quantization]
 original_isinstance = builtins.isinstance
-def monkey_patch_isinstance_for_vllm_base_layer(reverse: bool = False):
-    """
-    Patch isinstance so that the `get_quant_method` in vllm's QuantizationConfig
-    can recognize sglang layers
-    """
-    if not VLLM_AVAILABLE:
-        return
-    if reverse:
-        builtins.isinstance = original_isinstance
-        return
-    from vllm.model_executor.layers.fused_moe import FusedMoE
-    from vllm.model_executor.layers.linear import LinearBase
-    from vllm.model_executor.layers.vocab_parallel_embedding import (
-        VocabParallelEmbedding,
-    )
-    from sglang.srt.layers.linear import LinearBase as PatchedLinearBase
-    from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE as PatchedFusedMoE
-    from sglang.srt.layers.vocab_parallel_embedding import (
-        VocabParallelEmbedding as PatchedVocabParallelEmbedding,
-    )
-    def patched_isinstance(obj, classinfo):
-        if classinfo is LinearBase:
-            return original_isinstance(obj, PatchedLinearBase)
-        if classinfo is FusedMoE:
-            return original_isinstance(obj, PatchedFusedMoE)
-        if classinfo is VocabParallelEmbedding:
-            return original_isinstance(obj, PatchedVocabParallelEmbedding)
-        return original_isinstance(obj, classinfo)
-    builtins.isinstance = patched_isinstance

sglang 0.5.4__py3-none-any.whl → 0.5.4.post2__py3-none-any.whl

sglang 0.5.4__py3-none-any.whl → 0.5.4.post2__py3-none-any.whl