sglang 0.4.4.post2__py3-none-any.whl → 0.4.4.post4__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- sglang/bench_serving.py +72 -10
- sglang/srt/_custom_ops.py +59 -92
- sglang/srt/configs/deepseekvl2.py +10 -1
- sglang/srt/configs/model_config.py +6 -16
- sglang/srt/constrained/base_grammar_backend.py +5 -1
- sglang/srt/custom_op.py +5 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +28 -80
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
- sglang/srt/distributed/parallel_state.py +32 -5
- sglang/srt/entrypoints/engine.py +0 -5
- sglang/srt/entrypoints/http_server.py +7 -1
- sglang/srt/entrypoints/verl_engine.py +2 -0
- sglang/srt/function_call_parser.py +0 -1
- sglang/srt/layers/attention/flashattention_backend.py +582 -125
- sglang/srt/layers/attention/flashinfer_backend.py +5 -7
- sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -3
- sglang/srt/layers/attention/flashmla_backend.py +1 -1
- sglang/srt/layers/dp_attention.py +12 -1
- sglang/srt/layers/moe/ep_moe/kernels.py +142 -0
- sglang/srt/layers/moe/ep_moe/layer.py +79 -80
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +382 -199
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +403 -47
- sglang/srt/layers/moe/topk.py +79 -6
- sglang/srt/layers/quantization/__init__.py +137 -165
- sglang/srt/layers/quantization/awq.py +200 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +2 -1
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +34 -10
- sglang/srt/layers/quantization/fp8_kernel.py +2 -1
- sglang/srt/layers/quantization/fp8_utils.py +1 -4
- sglang/srt/layers/quantization/gptq.py +30 -40
- sglang/srt/layers/quantization/moe_wna16.py +501 -0
- sglang/srt/layers/quantization/utils.py +1 -1
- sglang/srt/layers/quantization/w8a8_fp8.py +1 -1
- sglang/srt/lora/backend/base_backend.py +4 -4
- sglang/srt/lora/backend/flashinfer_backend.py +12 -9
- sglang/srt/lora/backend/triton_backend.py +5 -8
- sglang/srt/lora/layers.py +19 -33
- sglang/srt/lora/lora_manager.py +20 -7
- sglang/srt/lora/mem_pool.py +12 -6
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +10 -4
- sglang/srt/lora/triton_ops/qkv_lora_b.py +8 -3
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +16 -5
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +11 -6
- sglang/srt/lora/utils.py +6 -0
- sglang/srt/managers/cache_controller.py +34 -11
- sglang/srt/managers/io_struct.py +4 -2
- sglang/srt/managers/mm_utils.py +202 -156
- sglang/srt/managers/multimodal_processor.py +0 -2
- sglang/srt/managers/multimodal_processors/base_processor.py +45 -77
- sglang/srt/managers/multimodal_processors/clip.py +44 -0
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +17 -58
- sglang/srt/managers/multimodal_processors/gemma3.py +12 -27
- sglang/srt/managers/multimodal_processors/janus_pro.py +21 -47
- sglang/srt/managers/multimodal_processors/llava.py +34 -14
- sglang/srt/managers/multimodal_processors/minicpm.py +35 -38
- sglang/srt/managers/multimodal_processors/mlama.py +10 -23
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -45
- sglang/srt/managers/schedule_batch.py +185 -127
- sglang/srt/managers/scheduler.py +29 -23
- sglang/srt/managers/tokenizer_manager.py +1 -2
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/managers/utils.py +1 -6
- sglang/srt/mem_cache/hiradix_cache.py +62 -52
- sglang/srt/mem_cache/memory_pool.py +72 -6
- sglang/srt/mem_cache/paged_allocator.py +39 -0
- sglang/srt/metrics/collector.py +23 -53
- sglang/srt/model_executor/cuda_graph_runner.py +16 -13
- sglang/srt/model_executor/forward_batch_info.py +10 -10
- sglang/srt/model_executor/model_runner.py +64 -59
- sglang/srt/model_loader/loader.py +19 -1
- sglang/srt/model_loader/weight_utils.py +6 -3
- sglang/srt/models/clip.py +568 -0
- sglang/srt/models/deepseek_janus_pro.py +12 -17
- sglang/srt/models/deepseek_v2.py +339 -123
- sglang/srt/models/deepseek_vl2.py +105 -104
- sglang/srt/models/gemma3_causal.py +12 -2
- sglang/srt/models/gemma3_mm.py +20 -80
- sglang/srt/models/llama.py +4 -1
- sglang/srt/models/llava.py +31 -19
- sglang/srt/models/llavavid.py +16 -7
- sglang/srt/models/minicpmo.py +63 -147
- sglang/srt/models/minicpmv.py +17 -27
- sglang/srt/models/mllama.py +29 -14
- sglang/srt/models/qwen2.py +9 -6
- sglang/srt/models/qwen2_5_vl.py +21 -31
- sglang/srt/models/qwen2_vl.py +20 -21
- sglang/srt/openai_api/adapter.py +106 -93
- sglang/srt/openai_api/protocol.py +10 -5
- sglang/srt/patch_torch.py +71 -0
- sglang/srt/platforms/interface.py +371 -0
- sglang/srt/server_args.py +120 -25
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -5
- sglang/srt/speculative/eagle_utils.py +140 -28
- sglang/srt/speculative/eagle_worker.py +94 -25
- sglang/srt/utils.py +137 -51
- sglang/test/runners.py +27 -2
- sglang/test/test_custom_ops.py +55 -0
- sglang/test/test_utils.py +14 -27
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/METADATA +10 -5
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/RECORD +108 -99
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/WHEEL +0 -0
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/top_level.txt +0 -0
sglang/srt/layers/moe/topk.py
CHANGED
@@ -12,17 +12,19 @@
 # limitations under the License.
 # ==============================================================================

+import os
 from typing import Callable, Optional

 import torch
 import torch.nn.functional as F

+from sglang.srt.managers.expert_distribution import ExpertDistributionRecorder
+from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.utils import get_compiler_backend, is_cuda, is_hip

 _is_cuda = is_cuda()
 _is_hip = is_hip()

-from sglang.srt.managers.expert_distribution import ExpertDistributionRecorder

 expert_distribution_recorder = ExpertDistributionRecorder()

@@ -102,11 +104,13 @@ def grouped_topk(
     renormalize: bool,
     num_expert_group: int = 0,
     topk_group: int = 0,
+    n_share_experts_fusion: int = 0,
 ):
     assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"

     scores = torch.softmax(gating_output, dim=-1)
     num_token = scores.shape[0]
+    num_experts = scores.shape[1]
     group_scores = (
         scores.view(num_token, num_expert_group, -1).max(dim=-1).values
     )  # [n, n_group]
@@ -122,15 +126,30 @@ def grouped_topk(
     )  # [n, e]
     tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0)  # [n, e]
     topk_weights, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)
+    if n_share_experts_fusion:
+        topk_ids[:, -1] = torch.randint(
+            low=num_experts,
+            high=num_experts + n_share_experts_fusion,
+            size=(topk_ids.size(0),),
+            dtype=topk_ids.dtype,
+            device=topk_ids.device,
+        )
+        topk_weights[:, -1] = (
+            topk_weights[:, :-1].sum(dim=-1) / 2.5
+        )  # 2.5 is the routed_scaling_factor.

     if renormalize:
-        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
+        topk_weights_sum = (
+            topk_weights.sum(dim=-1, keepdim=True)
+            if n_share_experts_fusion == 0
+            else topk_weights[:, :-1].sum(dim=-1, keepdim=True)
+        )
+        topk_weights = topk_weights / topk_weights_sum

     return topk_weights.to(torch.float32), topk_ids.to(torch.int32)


-@torch.compile(dynamic=True, backend=get_compiler_backend())
-def biased_grouped_topk(
+def biased_grouped_topk_impl(
     hidden_states: torch.Tensor,
     gating_output: torch.Tensor,
     correction_bias: torch.Tensor,
@@ -138,11 +157,13 @@ def biased_grouped_topk(
     renormalize: bool,
     num_expert_group: int = 0,
     topk_group: int = 0,
+    n_share_experts_fusion: int = 0,
 ):
     assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"

     scores = gating_output.sigmoid()
     num_token = scores.shape[0]
+    num_experts = scores.shape[1]
     scores_for_choice = scores.view(num_token, -1) + correction_bias.unsqueeze(0)
     group_scores = (
         scores_for_choice.view(num_token, num_expert_group, -1)
@@ -165,12 +186,59 @@ def biased_grouped_topk(
     _, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)
     topk_weights = scores.gather(1, topk_ids)

+    if n_share_experts_fusion:
+        topk_ids[:, -1] = torch.randint(
+            low=num_experts,
+            high=num_experts + n_share_experts_fusion,
+            size=(topk_ids.size(0),),
+            dtype=topk_ids.dtype,
+            device=topk_ids.device,
+        )
+        topk_weights[:, -1] = (
+            topk_weights[:, :-1].sum(dim=-1) / 2.5
+        )  # 2.5 is the routed_scaling_factor.
+
     if renormalize:
-        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
+        topk_weights_sum = (
+            topk_weights.sum(dim=-1, keepdim=True)
+            if n_share_experts_fusion == 0
+            else topk_weights[:, :-1].sum(dim=-1, keepdim=True)
+        )
+        topk_weights = topk_weights / topk_weights_sum

     return topk_weights.to(torch.float32), topk_ids.to(torch.int32)


+def biased_grouped_topk(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    correction_bias: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    num_expert_group: int = 0,
+    topk_group: int = 0,
+    compiled: bool = True,
+    n_share_experts_fusion: int = 0,
+):
+    biased_grouped_topk_fn = (
+        torch.compile(
+            biased_grouped_topk_impl, dynamic=True, backend=get_compiler_backend()
+        )
+        if compiled
+        else biased_grouped_topk_impl
+    )
+    return biased_grouped_topk_fn(
+        hidden_states,
+        gating_output,
+        correction_bias,
+        topk,
+        renormalize,
+        num_expert_group,
+        topk_group,
+        n_share_experts_fusion=n_share_experts_fusion,
+    )
+
+
 def select_experts(
     hidden_states: torch.Tensor,
     router_logits: torch.Tensor,
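Note: in the hunk above, biased_grouped_topk becomes a thin wrapper that applies torch.compile to biased_grouped_topk_impl only when compiled=True. Below is a minimal, self-contained sketch of that opt-in compilation pattern; the function names and the toy routing kernel are illustrative, not taken from the package.

import torch


def _route_impl(scores: torch.Tensor, k: int) -> torch.Tensor:
    # Toy stand-in for the real routing kernel: plain top-k over expert scores.
    return torch.topk(scores, k=k, dim=-1).indices


def route(scores: torch.Tensor, k: int, compiled: bool = True) -> torch.Tensor:
    # Pick the compiled or eager implementation at call time instead of baking
    # torch.compile into a decorator on the implementation itself.
    # (A production version would typically cache the compiled callable.)
    fn = torch.compile(_route_impl, dynamic=True) if compiled else _route_impl
    return fn(scores, k)


if __name__ == "__main__":
    print(route(torch.randn(4, 8), k=2, compiled=False))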
@@ -183,7 +251,10 @@ def select_experts(
     correction_bias: Optional[torch.Tensor] = None,
     torch_native: bool = False,
 ):
-[1 removed line not rendered in this view]
+    n_share_experts_fusion = 0
+    if global_server_args_dict["n_share_experts_fusion"] is not None:
+        n_share_experts_fusion = global_server_args_dict["n_share_experts_fusion"]
+    # DeekSeek V2/V3/R1 serices models uses grouped_top_k
     if use_grouped_topk:
         assert topk_group is not None
         assert num_expert_group is not None
@@ -195,6 +266,7 @@ def select_experts(
                 renormalize=renormalize,
                 num_expert_group=num_expert_group,
                 topk_group=topk_group,
+                n_share_experts_fusion=n_share_experts_fusion,
             )
         else:
             topk_weights, topk_ids = biased_grouped_topk(
@@ -205,6 +277,7 @@ def select_experts(
                 renormalize=renormalize,
                 num_expert_group=num_expert_group,
                 topk_group=topk_group,
+                n_share_experts_fusion=n_share_experts_fusion,
             )
     elif torch_native and custom_routing_function is None:
         topk_weights, topk_ids = fused_topk_native(
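Note: taken together, the topk.py hunks above add an n_share_experts_fusion path: when it is non-zero, the last routed slot is overwritten with a randomly chosen fused shared expert (indexed after the regular experts), its weight is set to the sum of the other slots divided by the routed scaling factor (2.5 in the hunks above), and renormalization then excludes that slot. A minimal standalone sketch of the idea; the sizes and tensors below are made up for illustration and are not the package's API.

import torch

num_tokens, num_experts, top_k = 4, 16, 3
n_share_experts_fusion = 2     # shared experts appended after the routed ones
routed_scaling_factor = 2.5    # the constant hard-coded in the hunks above

scores = torch.rand(num_tokens, num_experts)
topk_weights, topk_ids = torch.topk(scores, k=top_k, dim=-1)

if n_share_experts_fusion:
    # Redirect the last slot to one of the fused shared experts ...
    topk_ids[:, -1] = torch.randint(
        low=num_experts,
        high=num_experts + n_share_experts_fusion,
        size=(num_tokens,),
        dtype=topk_ids.dtype,
    )
    # ... and give it a weight derived from the routed slots.
    topk_weights[:, -1] = topk_weights[:, :-1].sum(dim=-1) / routed_scaling_factor

# Renormalize over the routed slots only when fusion is active.
denom = (
    topk_weights.sum(dim=-1, keepdim=True)
    if n_share_experts_fusion == 0
    else topk_weights[:, :-1].sum(dim=-1, keepdim=True)
)
topk_weights = topk_weights / denom
print(topk_ids, topk_weights, sep="\n")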
sglang/srt/layers/quantization/__init__.py
CHANGED
@@ -9,13 +9,24 @@ import torch

 try:
     from vllm.model_executor.layers.quantization.aqlm import AQLMConfig
-    from vllm.model_executor.layers.quantization.
-[1 removed line not rendered in this view]
+    from vllm.model_executor.layers.quantization.awq_marlin import (
+        AWQMarlinConfig,
+        AWQMoEMethod,
+    )
     from vllm.model_executor.layers.quantization.bitsandbytes import BitsAndBytesConfig
+    from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe import (
+        CompressedTensorsW8A8Fp8MoEMethod,
+        CompressedTensorsWNA16MoEMethod,
+    )
     from vllm.model_executor.layers.quantization.deepspeedfp import DeepSpeedFPConfig
     from vllm.model_executor.layers.quantization.experts_int8 import ExpertsInt8Config
     from vllm.model_executor.layers.quantization.fbgemm_fp8 import FBGEMMFp8Config
     from vllm.model_executor.layers.quantization.gguf import GGUFConfig
+    from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
+    from vllm.model_executor.layers.quantization.gptq_marlin import (
+        GPTQMarlinLinearMethod,
+        GPTQMarlinMoEMethod,
+    )
     from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
         GPTQMarlin24Config,
     )
@@ -23,33 +34,39 @@ try:
     from vllm.model_executor.layers.quantization.qqq import QQQConfig
     from vllm.model_executor.layers.quantization.tpu_int8 import Int8TpuConfig

-    from sglang.srt.layers.quantization.gptq import GPTQConfig, GPTQMarlinConfig
-
     VLLM_AVAILABLE = True
 except ImportError:
     VLLM_AVAILABLE = False

     # Define empty classes as placeholders when vllm is not available
     class DummyConfig:
-[1 removed line not rendered in this view]
+        def override_quantization_method(self, *args, **kwargs):
+            return None
+
+    AQLMConfig = AWQMarlinConfig = BitsAndBytesConfig = CompressedTensorsConfig = (
+        DeepSpeedFPConfig
+    ) = ExpertsInt8Config = FBGEMMFp8Config = GGUFConfig = GPTQMarlin24Config = (
+        MarlinConfig
+    ) = QQQConfig = Int8TpuConfig = DummyConfig

-    AQLMConfig = AWQConfig = AWQMarlinConfig = BitsAndBytesConfig = (
-        CompressedTensorsConfig
-    ) = DummyConfig
-    DeepSpeedFPConfig = ExpertsInt8Config = FBGEMMFp8Config = GGUFConfig = (
-        GPTQMarlin24Config
-    ) = DummyConfig
-    MarlinConfig = QQQConfig = Int8TpuConfig = DummyConfig

+from sglang.srt.layers.linear import LinearBase, UnquantizedLinearMethod
+from sglang.srt.layers.quantization.awq import AWQConfig
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.quantization.blockwise_int8 import BlockInt8Config
 from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors import (
     CompressedTensorsConfig,
 )
 from sglang.srt.layers.quantization.fp8 import Fp8Config
+from sglang.srt.layers.quantization.gptq import GPTQConfig, GPTQMarlinConfig
 from sglang.srt.layers.quantization.modelopt_quant import ModelOptFp8Config
+from sglang.srt.layers.quantization.moe_wna16 import MoeWNA16Config
 from sglang.srt.layers.quantization.w8a8_fp8 import W8A8Fp8Config
 from sglang.srt.layers.quantization.w8a8_int8 import W8A8Int8Config
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    UnquantizedEmbeddingMethod,
+)

 # Base quantization methods that don't depend on vllm
 BASE_QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
@@ -58,29 +75,29 @@ BASE_QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
     "modelopt": ModelOptFp8Config,
     "w8a8_int8": W8A8Int8Config,
     "w8a8_fp8": W8A8Fp8Config,
+    "moe_wna16": MoeWNA16Config,
     "compressed-tensors": CompressedTensorsConfig,
 }

-#
-[18 removed lines not rendered in this view]
-QUANTIZATION_METHODS.update(VLLM_QUANTIZATION_METHODS)
+# VLLM-dependent quantization methods
+VLLM_QUANTIZATION_METHODS = {
+    "aqlm": AQLMConfig,
+    "awq": AWQConfig,
+    "deepspeedfp": DeepSpeedFPConfig,
+    "tpu_int8": Int8TpuConfig,
+    "fbgemm_fp8": FBGEMMFp8Config,
+    "marlin": MarlinConfig,
+    "gguf": GGUFConfig,
+    "gptq_marlin_24": GPTQMarlin24Config,
+    "awq_marlin": AWQMarlinConfig,
+    "bitsandbytes": BitsAndBytesConfig,
+    "qqq": QQQConfig,
+    "experts_int8": ExpertsInt8Config,
+    "gptq_marlin": GPTQMarlinConfig,
+    "gptq": GPTQConfig,
+}
+
+QUANTIZATION_METHODS = {**BASE_QUANTIZATION_METHODS, **VLLM_QUANTIZATION_METHODS}


 def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
@@ -89,6 +106,12 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
             f"Invalid quantization method: {quantization}. "
             f"Available methods: {list(QUANTIZATION_METHODS.keys())}"
         )
+    if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
+        raise ValueError(
+            f"{quantization} quantization requires some operators from vllm. "
+            "Pleaes install vllm by `pip install vllm==0.7.2`"
+        )
+
     return QUANTIZATION_METHODS[quantization]


@@ -153,13 +176,6 @@ def get_linear_quant_method(
     prefix: str,
     linear_method_cls: type,
 ):
-
-    from sglang.srt.layers.linear import LinearBase, UnquantizedLinearMethod
-    from sglang.srt.layers.vocab_parallel_embedding import (
-        ParallelLMHead,
-        UnquantizedEmbeddingMethod,
-    )
-
     cloned_config = deepcopy(config)
     parallel_lm_head_quantized = (
         isinstance(layer, ParallelLMHead) and cloned_config.lm_head_quantized
@@ -186,31 +202,19 @@


 def gptq_get_quant_method(self, layer, prefix):
-[1 removed line not rendered in this view]
-    return None
+    from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE

-[2 removed lines not rendered in this view]
-    from vllm.model_executor.layers.quantization.gptq_marlin import (
-        GPTQMarlinLinearMethod,
-        GPTQMarlinMoEMethod,
-    )
+    if isinstance(layer, FusedMoE):
+        return GPTQMarlinMoEMethod(self)

-[8 removed lines not rendered in this view]
-    )
-    elif isinstance(self, GPTQMarlinConfig):
-        return get_linear_quant_method(
-            self, layer, prefix=prefix, linear_method_cls=GPTQMarlinLinearMethod
-        )
-    except ImportError:
-        pass
+    if isinstance(self, GPTQConfig):
+        return get_linear_quant_method(
+            self, layer, prefix=prefix, linear_method_cls=GPTQLinearMethod
+        )
+    elif isinstance(self, GPTQMarlinConfig):
+        return get_linear_quant_method(
+            self, layer, prefix=prefix, linear_method_cls=GPTQMarlinLinearMethod
+        )
     return None


@@ -229,33 +233,28 @@ def monkey_patch_isinstance_for_vllm_base_layer(reverse: bool = False):
         builtins.isinstance = original_isinstance
         return

-[5 removed lines not rendered in this view]
-    )
+    from vllm.model_executor.layers.fused_moe import FusedMoE
+    from vllm.model_executor.layers.linear import LinearBase
+    from vllm.model_executor.layers.vocab_parallel_embedding import (
+        VocabParallelEmbedding,
+    )

-[5 removed lines not rendered in this view]
-        VocabParallelEmbedding as PatchedVocabParallelEmbedding,
-    )
+    from sglang.srt.layers.linear import LinearBase as PatchedLinearBase
+    from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE as PatchedFusedMoE
+    from sglang.srt.layers.vocab_parallel_embedding import (
+        VocabParallelEmbedding as PatchedVocabParallelEmbedding,
+    )

-[10 removed lines not rendered in this view]
-    except ImportError:
-        return
+    def patched_isinstance(obj, classinfo):
+        if classinfo is LinearBase:
+            return original_isinstance(obj, PatchedLinearBase)
+        if classinfo is FusedMoE:
+            return original_isinstance(obj, PatchedFusedMoE)
+        if classinfo is VocabParallelEmbedding:
+            return original_isinstance(obj, PatchedVocabParallelEmbedding)
+        return original_isinstance(obj, classinfo)
+
+    builtins.isinstance = patched_isinstance


 def monkey_patch_moe_apply(class_obj: "FusedMoEMethodBase"):
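Note: the monkey_patch_isinstance_for_vllm_base_layer hunk above swaps builtins.isinstance for a shim so that isinstance checks against vllm's base layer classes succeed for sglang's replacement classes. A stripped-down sketch of that mechanism with two stand-in classes; the class names here are illustrative, not the real ones.

import builtins


class UpstreamBase:      # stands in for a vllm base layer class
    pass


class LocalBase:         # stands in for the local replacement class
    pass


original_isinstance = builtins.isinstance


def patched_isinstance(obj, classinfo):
    # Checks against the upstream class are answered using the local class.
    if classinfo is UpstreamBase:
        return original_isinstance(obj, LocalBase)
    return original_isinstance(obj, classinfo)


builtins.isinstance = patched_isinstance
try:
    print(isinstance(LocalBase(), UpstreamBase))   # True while the patch is active
finally:
    builtins.isinstance = original_isinstance      # restore, mirroring reverse=True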
@@ -263,91 +262,64 @@
     Monkey patch the apply function of vllm's FusedMoEMethodBase.
     Convert sglang arguments to vllm arguments.
     """
-[45 removed lines not rendered in this view]
-            kwargs["e_score_correction_bias"] = correction_bias
-            return original_apply(**kwargs)
-
-        setattr(class_obj, "apply", new_apply)
-    except (ImportError, AttributeError):
-        return
+    original_apply = class_obj.apply
+    sig = inspect.signature(original_apply)
+    param_names = list(sig.parameters.keys())
+    has_correction_bias = "e_score_correction_bias" in param_names
+
+    def new_apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+        correction_bias: Optional[torch.Tensor] = None,
+        activation: str = "silu",
+        inplace: bool = True,
+        no_combine: bool = False,
+    ):
+        assert activation == "silu"
+        assert inplace and not no_combine
+
+        kwargs = {
+            "self": self,
+            "layer": layer,
+            "x": x,
+            "router_logits": router_logits,
+            "top_k": top_k,
+            "renormalize": renormalize,
+            "use_grouped_topk": use_grouped_topk,
+            "topk_group": topk_group,
+            "num_expert_group": num_expert_group,
+            "custom_routing_function": custom_routing_function,
+        }
+        if correction_bias is not None:
+            if not has_correction_bias:
+                raise ValueError(
+                    "Please increase the version of your vllm. Try `pip install vllm==0.7.2`"
+                )
+            kwargs["e_score_correction_bias"] = correction_bias
+        return original_apply(**kwargs)
+
+    setattr(class_obj, "apply", new_apply)


 def monkey_patch_quant_configs():
     """Apply all monkey patches in one place."""
-[2 removed lines not rendered in this view]
+    setattr(GPTQMarlinConfig, "get_quant_method", gptq_get_quant_method)
+    setattr(GPTQConfig, "get_quant_method", gptq_get_quant_method)

-[4 removed lines not rendered in this view]
-            CompressedTensorsWNA16MoEMethod,
-        )
-        from vllm.model_executor.layers.quantization.gptq_marlin import (
-            GPTQMarlinMoEMethod,
-        )
-
-        setattr(GPTQMarlinConfig, "get_quant_method", gptq_get_quant_method)
-        setattr(GPTQConfig, "get_quant_method", gptq_get_quant_method)
-
-        monkey_patch_moe_apply(AWQMoEMethod)
-        monkey_patch_moe_apply(GPTQMarlinMoEMethod)
-        monkey_patch_moe_apply(CompressedTensorsW8A8Fp8MoEMethod)
-        monkey_patch_moe_apply(CompressedTensorsWNA16MoEMethod)
-    except ImportError:
-        return
+    monkey_patch_moe_apply(AWQMoEMethod)
+    monkey_patch_moe_apply(GPTQMarlinMoEMethod)
+    monkey_patch_moe_apply(CompressedTensorsW8A8Fp8MoEMethod)
+    monkey_patch_moe_apply(CompressedTensorsWNA16MoEMethod)


 # Only apply monkey patches if vllm is available
 if VLLM_AVAILABLE:
     monkey_patch_quant_configs()
-
-
-__all__ = [
-    "get_quantization_config",
-    "QUANTIZATION_METHODS",
-]
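Note: the quantization __init__.py hunks split the registry into a base dict and a vllm-dependent dict, merge them into QUANTIZATION_METHODS, and make get_quantization_config fail early when a vllm-backed method is requested but vllm is not installed. A minimal sketch of that lookup pattern; the config classes below are placeholders standing in for the real ones.

from typing import Dict, Type


class QuantizationConfig:            # placeholder base class
    pass


class W8A8Int8Config(QuantizationConfig):
    pass


class AWQMarlinConfig(QuantizationConfig):
    pass


VLLM_AVAILABLE = False  # would normally be set by a try/except import of vllm

BASE_QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
    "w8a8_int8": W8A8Int8Config,
}
VLLM_QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
    "awq_marlin": AWQMarlinConfig,
}
QUANTIZATION_METHODS = {**BASE_QUANTIZATION_METHODS, **VLLM_QUANTIZATION_METHODS}


def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
    if quantization not in QUANTIZATION_METHODS:
        raise ValueError(f"Invalid quantization method: {quantization}")
    if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
        raise ValueError(f"{quantization} requires vllm, which is not installed")
    return QUANTIZATION_METHODS[quantization]


print(get_quantization_config("w8a8_int8"))  # resolvable without vllm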