sglang 0.4.4.post2__py3-none-any.whl → 0.4.4.post4__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- sglang/bench_serving.py +72 -10
- sglang/srt/_custom_ops.py +59 -92
- sglang/srt/configs/deepseekvl2.py +10 -1
- sglang/srt/configs/model_config.py +6 -16
- sglang/srt/constrained/base_grammar_backend.py +5 -1
- sglang/srt/custom_op.py +5 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +28 -80
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
- sglang/srt/distributed/parallel_state.py +32 -5
- sglang/srt/entrypoints/engine.py +0 -5
- sglang/srt/entrypoints/http_server.py +7 -1
- sglang/srt/entrypoints/verl_engine.py +2 -0
- sglang/srt/function_call_parser.py +0 -1
- sglang/srt/layers/attention/flashattention_backend.py +582 -125
- sglang/srt/layers/attention/flashinfer_backend.py +5 -7
- sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -3
- sglang/srt/layers/attention/flashmla_backend.py +1 -1
- sglang/srt/layers/dp_attention.py +12 -1
- sglang/srt/layers/moe/ep_moe/kernels.py +142 -0
- sglang/srt/layers/moe/ep_moe/layer.py +79 -80
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +382 -199
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +403 -47
- sglang/srt/layers/moe/topk.py +79 -6
- sglang/srt/layers/quantization/__init__.py +137 -165
- sglang/srt/layers/quantization/awq.py +200 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +2 -1
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +34 -10
- sglang/srt/layers/quantization/fp8_kernel.py +2 -1
- sglang/srt/layers/quantization/fp8_utils.py +1 -4
- sglang/srt/layers/quantization/gptq.py +30 -40
- sglang/srt/layers/quantization/moe_wna16.py +501 -0
- sglang/srt/layers/quantization/utils.py +1 -1
- sglang/srt/layers/quantization/w8a8_fp8.py +1 -1
- sglang/srt/lora/backend/base_backend.py +4 -4
- sglang/srt/lora/backend/flashinfer_backend.py +12 -9
- sglang/srt/lora/backend/triton_backend.py +5 -8
- sglang/srt/lora/layers.py +19 -33
- sglang/srt/lora/lora_manager.py +20 -7
- sglang/srt/lora/mem_pool.py +12 -6
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +10 -4
- sglang/srt/lora/triton_ops/qkv_lora_b.py +8 -3
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +16 -5
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +11 -6
- sglang/srt/lora/utils.py +6 -0
- sglang/srt/managers/cache_controller.py +34 -11
- sglang/srt/managers/io_struct.py +4 -2
- sglang/srt/managers/mm_utils.py +202 -156
- sglang/srt/managers/multimodal_processor.py +0 -2
- sglang/srt/managers/multimodal_processors/base_processor.py +45 -77
- sglang/srt/managers/multimodal_processors/clip.py +44 -0
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +17 -58
- sglang/srt/managers/multimodal_processors/gemma3.py +12 -27
- sglang/srt/managers/multimodal_processors/janus_pro.py +21 -47
- sglang/srt/managers/multimodal_processors/llava.py +34 -14
- sglang/srt/managers/multimodal_processors/minicpm.py +35 -38
- sglang/srt/managers/multimodal_processors/mlama.py +10 -23
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -45
- sglang/srt/managers/schedule_batch.py +185 -127
- sglang/srt/managers/scheduler.py +29 -23
- sglang/srt/managers/tokenizer_manager.py +1 -2
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/managers/utils.py +1 -6
- sglang/srt/mem_cache/hiradix_cache.py +62 -52
- sglang/srt/mem_cache/memory_pool.py +72 -6
- sglang/srt/mem_cache/paged_allocator.py +39 -0
- sglang/srt/metrics/collector.py +23 -53
- sglang/srt/model_executor/cuda_graph_runner.py +16 -13
- sglang/srt/model_executor/forward_batch_info.py +10 -10
- sglang/srt/model_executor/model_runner.py +64 -59
- sglang/srt/model_loader/loader.py +19 -1
- sglang/srt/model_loader/weight_utils.py +6 -3
- sglang/srt/models/clip.py +568 -0
- sglang/srt/models/deepseek_janus_pro.py +12 -17
- sglang/srt/models/deepseek_v2.py +339 -123
- sglang/srt/models/deepseek_vl2.py +105 -104
- sglang/srt/models/gemma3_causal.py +12 -2
- sglang/srt/models/gemma3_mm.py +20 -80
- sglang/srt/models/llama.py +4 -1
- sglang/srt/models/llava.py +31 -19
- sglang/srt/models/llavavid.py +16 -7
- sglang/srt/models/minicpmo.py +63 -147
- sglang/srt/models/minicpmv.py +17 -27
- sglang/srt/models/mllama.py +29 -14
- sglang/srt/models/qwen2.py +9 -6
- sglang/srt/models/qwen2_5_vl.py +21 -31
- sglang/srt/models/qwen2_vl.py +20 -21
- sglang/srt/openai_api/adapter.py +106 -93
- sglang/srt/openai_api/protocol.py +10 -5
- sglang/srt/patch_torch.py +71 -0
- sglang/srt/platforms/interface.py +371 -0
- sglang/srt/server_args.py +120 -25
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -5
- sglang/srt/speculative/eagle_utils.py +140 -28
- sglang/srt/speculative/eagle_worker.py +94 -25
- sglang/srt/utils.py +137 -51
- sglang/test/runners.py +27 -2
- sglang/test/test_custom_ops.py +55 -0
- sglang/test/test_utils.py +14 -27
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/METADATA +10 -5
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/RECORD +108 -99
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/WHEEL +0 -0
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/top_level.txt +0 -0
sglang/srt/metrics/collector.py
CHANGED
@@ -33,7 +33,7 @@ class SchedulerMetricsCollector:
 
     def __init__(self, labels: Dict[str, str]) -> None:
         # We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
-        from prometheus_client import Gauge
+        from prometheus_client import Gauge, Histogram
 
         self.labels = labels
         self.last_log_time = time.time()
@@ -139,10 +139,10 @@ class TokenizerMetricsCollector:
             labelnames=labels.keys(),
             buckets=[
                 0.1,
-                0.
-                0.
-                0.
-                0.
+                0.2,
+                0.4,
+                0.6,
+                0.8,
                 1,
                 2,
                 4,
@@ -153,36 +153,9 @@ class TokenizerMetricsCollector:
                 40,
                 60,
                 80,
-
-
-
-        )
-
-        self.histogram_time_per_output_token = Histogram(
-            name="sglang:time_per_output_token_seconds",
-            documentation="Histogram of time per output token in seconds.",
-            labelnames=labels.keys(),
-            buckets=[
-                0.002,
-                0.005,
-                0.010,
-                0.020,
-                0.030,
-                0.040,
-                0.050,
-                0.060,
-                0.070,
-                0.080,
-                0.090,
-                0.100,
-                0.150,
-                0.200,
-                0.300,
-                0.400,
-                0.600,
-                0.800,
-                1.000,
-                2.000,
+                100,
+                200,
+                400,
             ],
         )
 
@@ -202,17 +175,18 @@ class TokenizerMetricsCollector:
                 0.030,
                 0.035,
                 0.040,
-                0.
-                0.
+                0.060,
+                0.080,
                 0.100,
-                0.150,
                 0.200,
-                0.300,
                 0.400,
-                0.
-                0.
+                0.600,
+                0.800,
                 1.000,
                 2.000,
+                4.000,
+                6.000,
+                8.000,
             ],
         )
 
@@ -224,23 +198,22 @@ class TokenizerMetricsCollector:
                 0.1,
                 0.2,
                 0.4,
+                0.6,
                 0.8,
                 1,
                 2,
-
+                4,
+                6,
+                8,
                 10,
                 20,
                 40,
                 60,
                 80,
                 100,
-                150,
                 200,
-
-
-                350,
-                500,
-                1000,
+                400,
+                800,
             ],
         )
 
@@ -256,13 +229,10 @@ class TokenizerMetricsCollector:
     ):
         self.prompt_tokens_total.labels(**self.labels).inc(prompt_tokens)
         self.generation_tokens_total.labels(**self.labels).inc(generation_tokens)
-
+        if cached_tokens > 0:
+            self.cached_tokens_total.labels(**self.labels).inc(cached_tokens)
         self.num_requests_total.labels(**self.labels).inc(1)
         self._log_histogram(self.histogram_e2e_request_latency, e2e_latency)
-        if generation_tokens >= 1:
-            self.histogram_time_per_output_token.labels(**self.labels).observe(
-                e2e_latency / generation_tokens
-            )
 
     def observe_time_to_first_token(self, value: float):
         self.histogram_time_to_first_token.labels(**self.labels).observe(value)
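For context on the bucket changes above, the following is a minimal sketch of the prometheus_client Histogram pattern the collector relies on; the metric name, label, and bucket values here are illustrative, not taken from sglang.

from prometheus_client import Histogram

# Illustrative metric; the name, label, and buckets are examples only.
request_latency = Histogram(
    name="demo:e2e_request_latency_seconds",
    documentation="Histogram of end-to-end request latency in seconds.",
    labelnames=["model_name"],
    buckets=[0.1, 0.2, 0.4, 0.6, 0.8, 1, 2, 4, 10, 20, 40, 60, 80, 100, 200, 400],
)

# An observation increments every bucket whose upper bound is >= the value,
# so extending the top buckets (100/200/400 above) keeps very long requests
# distinguishable instead of collapsing them all into +Inf.
request_latency.labels(model_name="demo-model").observe(1.7)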
sglang/srt/model_executor/cuda_graph_runner.py
CHANGED
@@ -116,16 +116,18 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
     if capture_bs is None:
         if server_args.speculative_algorithm is None:
             if server_args.disable_cuda_graph_padding:
-                capture_bs = list(range(1, 33)) +
+                capture_bs = list(range(1, 33)) + range(40, 161, 16)
             else:
-                capture_bs = [1, 2, 4] +
+                capture_bs = [1, 2, 4, 8] + list(range(16, 161, 8))
         else:
             # Since speculative decoding requires more cuda graph memory, we
             # capture less.
-            capture_bs =
+            capture_bs = (
+                list(range(1, 9)) + list(range(10, 33, 2)) + list(range(40, 161, 16))
+            )
 
-
-
+    if _is_hip:
+        capture_bs += list(range(160, 257, 8))
 
     if max(capture_bs) > model_runner.req_to_token_pool.size:
         # In some case (e.g., with a small GPU or --max-running-requests), the #max-running-requests
@@ -174,6 +176,7 @@ class CudaGraphRunner:
         self.disable_padding = model_runner.server_args.disable_cuda_graph_padding
         self.is_encoder_decoder = model_runner.model_config.is_encoder_decoder
         self.enable_dp_attention = model_runner.server_args.enable_dp_attention
+        self.enable_sp_layernorm = model_runner.server_args.enable_sp_layernorm
         self.speculative_algorithm = model_runner.server_args.speculative_algorithm
         self.tp_size = model_runner.server_args.tp_size
         self.dp_size = model_runner.server_args.dp_size
@@ -245,8 +248,8 @@ class CudaGraphRunner:
             )
         else:
             self.encoder_lens = None
-
-
+        if self.enable_dp_attention or self.enable_sp_layernorm:
+            # TODO(ch-wan): SP layernorm should use a different logic to manage gathered_buffer
             self.gathered_buffer = torch.zeros(
                 (
                     self.max_bs * self.dp_size * self.num_tokens_per_bs,
@@ -288,7 +291,7 @@ class CudaGraphRunner:
         self.model_runner.token_to_kv_pool.capture_mode = False
 
     def can_run(self, forward_batch: ForwardBatch):
-        if self.enable_dp_attention:
+        if self.enable_dp_attention or self.enable_sp_layernorm:
            total_global_tokens = sum(forward_batch.global_num_tokens_cpu)
 
            is_bs_supported = forward_batch.can_run_dp_cuda_graph and (
@@ -369,7 +372,7 @@ class CudaGraphRunner:
        encoder_lens = None
        mrope_positions = self.mrope_positions[:, :bs]
 
-        if self.enable_dp_attention:
+        if self.enable_dp_attention or self.enable_sp_layernorm:
            self.global_num_tokens_gpu.copy_(
                torch.tensor(
                    [
@@ -471,7 +474,7 @@ class CudaGraphRunner:
        raw_num_token = raw_bs * self.num_tokens_per_bs
 
        # Pad
-        if self.enable_dp_attention:
+        if self.enable_dp_attention or self.enable_sp_layernorm:
            index = bisect.bisect_left(
                self.capture_bs, sum(forward_batch.global_num_tokens_cpu)
            )
@@ -488,16 +491,16 @@ class CudaGraphRunner:
        self.seq_lens[:raw_bs].copy_(forward_batch.seq_lens)
        self.out_cache_loc[:raw_num_token].copy_(forward_batch.out_cache_loc)
        self.positions[:raw_num_token].copy_(forward_batch.positions)
-        if forward_batch.
+        if forward_batch.seq_lens_cpu is not None:
            if bs != raw_bs:
                self.seq_lens_cpu.fill_(1)
-            self.seq_lens_cpu[:raw_bs].copy_(forward_batch.
+            self.seq_lens_cpu[:raw_bs].copy_(forward_batch.seq_lens_cpu)
 
        if self.is_encoder_decoder:
            self.encoder_lens[:raw_bs].copy_(forward_batch.encoder_lens)
        if forward_batch.mrope_positions is not None:
            self.mrope_positions[:, :raw_bs].copy_(forward_batch.mrope_positions)
-        if self.enable_dp_attention:
+        if self.enable_dp_attention or self.enable_sp_layernorm:
            self.global_num_tokens_gpu.copy_(forward_batch.global_num_tokens_gpu)
 
        if hasattr(forward_batch.spec_info, "hidden_states"):
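As a rough, standalone illustration of the capture-list change in get_batch_sizes_to_capture above (default non-speculative path), the sketch below recomputes the batch sizes that would be captured; it wraps every range in list(...) so it runs on its own and takes a plain boolean instead of the module-level _is_hip flag.

def default_capture_batch_sizes(is_hip: bool = False) -> list:
    # Dense coverage for small batches, then stride-8 coverage up to 160,
    # mirroring the new default branch in the diff.
    capture_bs = [1, 2, 4, 8] + list(range(16, 161, 8))
    if is_hip:
        # The ROCm path extends capture up to batch size 256.
        capture_bs += list(range(160, 257, 8))
    return sorted(set(capture_bs))

print(default_capture_batch_sizes()[:8])              # [1, 2, 4, 8, 16, 24, 32, 40]
print(max(default_capture_batch_sizes(is_hip=True)))  # 256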
sglang/srt/model_executor/forward_batch_info.py
CHANGED
@@ -104,6 +104,9 @@ class ForwardMode(IntEnum):
             or self == ForwardMode.IDLE
         )
 
+    def is_extend_or_draft_extend(self):
+        return self == ForwardMode.EXTEND or self == ForwardMode.DRAFT_EXTEND
+
     def is_dummy_first(self):
         return self == ForwardMode.DUMMY_FIRST
 
@@ -148,6 +151,9 @@ class ForwardBatch:
     # The sum of all sequence lengths
     seq_lens_sum: int
 
+    # Optional seq_lens on cpu
+    seq_lens_cpu: Optional[torch.Tensor] = None
+
     # For logprob
     return_logprob: bool = False
     top_logprobs_nums: Optional[List[int]] = None
@@ -162,9 +168,6 @@ class ForwardBatch:
     # Position information
     positions: torch.Tensor = None
 
-    # For decode
-    decode_seq_lens_cpu: Optional[torch.Tensor] = None
-
     # For extend
     extend_num_tokens: Optional[int] = None
     extend_seq_lens: Optional[torch.Tensor] = None
@@ -293,12 +296,14 @@ class ForwardBatch:
     ):
         ret.positions = ret.spec_info.positions
 
+        # Get seq_lens_cpu if needed
+        if ret.seq_lens_cpu is None:
+            ret.seq_lens_cpu = batch.seq_lens_cpu
+
         # Init position information
         if ret.forward_mode.is_decode():
             if ret.positions is None:
                 ret.positions = clamp_position(batch.seq_lens)
-            if ret.decode_seq_lens_cpu is None:
-                ret.decode_seq_lens_cpu = batch.decode_seq_lens
         else:
             ret.extend_seq_lens = torch.tensor(
                 batch.extend_seq_lens, dtype=torch.int32
@@ -353,11 +358,6 @@ class ForwardBatch:
         for mm_input in valid_inputs[1:]:
             merged.merge(mm_input)
 
-        if isinstance(merged.pixel_values, np.ndarray):
-            merged.pixel_values = torch.from_numpy(merged.pixel_values)
-        if isinstance(merged.audio_features, np.ndarray):
-            merged.audio_features = torch.from_numpy(merged.audio_features)
-
         return merged
 
     def contains_image_inputs(self) -> bool:
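The is_extend_or_draft_extend helper added above follows a common IntEnum-predicate pattern; here is a minimal self-contained sketch with an abbreviated set of members (not the real ForwardMode enum).

from enum import IntEnum, auto

class Mode(IntEnum):
    EXTEND = auto()
    DECODE = auto()
    DRAFT_EXTEND = auto()
    IDLE = auto()

    def is_extend_or_draft_extend(self) -> bool:
        # True for the normal prefill/extend pass and for the speculative
        # draft-extend pass, false for decode and idle.
        return self in (Mode.EXTEND, Mode.DRAFT_EXTEND)

assert Mode.DRAFT_EXTEND.is_extend_or_draft_extend()
assert not Mode.DECODE.is_extend_or_draft_extend()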
sglang/srt/model_executor/model_runner.py
CHANGED
@@ -64,6 +64,7 @@ from sglang.srt.model_loader.loader import (
 )
 from sglang.srt.model_loader.utils import set_default_torch_dtype
 from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.patch_torch import monkey_patch_torch_reductions
 from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
@@ -74,6 +75,7 @@ from sglang.srt.utils import (
     get_available_gpu_memory,
     init_custom_process_group,
     is_cuda,
+    is_flashinfer_available,
     is_hip,
     monkey_patch_p2p_access_check,
     monkey_patch_vllm_gguf_config,
@@ -122,6 +124,10 @@ class ModelRunner:
         self.page_size = server_args.page_size
         self.req_to_token_pool = req_to_token_pool
         self.token_to_kv_pool_allocator = token_to_kv_pool_allocator
+        self.use_mla_backend = (
+            self.model_config.attention_arch == AttentionArch.MLA
+            and not server_args.disable_mla
+        )
 
         # Model-specific adjustment
         self.model_specific_adjustment()
@@ -146,15 +152,18 @@ class ModelRunner:
                 "enable_dp_attention": server_args.enable_dp_attention,
                 "enable_ep_moe": server_args.enable_ep_moe,
                 "enable_deepep_moe": server_args.enable_deepep_moe,
+                "deepep_mode": server_args.deepep_mode,
                 "device": server_args.device,
                 "speculative_accept_threshold_single": server_args.speculative_accept_threshold_single,
                 "speculative_accept_threshold_acc": server_args.speculative_accept_threshold_acc,
-                "enable_flashinfer_mla": server_args.enable_flashinfer_mla,
                 "enable_flashmla": server_args.enable_flashmla,
                 "disable_radix_cache": server_args.disable_radix_cache,
                 "flashinfer_mla_disable_ragged": server_args.flashinfer_mla_disable_ragged,
                 "debug_tensor_dump_output_folder": server_args.debug_tensor_dump_output_folder,
                 "debug_tensor_dump_inject": server_args.debug_tensor_dump_inject,
+                "n_share_experts_fusion": server_args.n_share_experts_fusion,
+                "disable_shared_experts_fusion": server_args.disable_shared_experts_fusion,
+                "use_mla_backend": self.use_mla_backend,
             }
         )
 
@@ -215,23 +224,38 @@ class ModelRunner:
     def model_specific_adjustment(self):
         server_args = self.server_args
 
-        if
-
-
-
+        if server_args.enable_flashinfer_mla:
+            # TODO: remove this branch after enable_flashinfer_mla is deprecated
+            logger.info("MLA optimization is turned on. Use flashinfer backend.")
+            server_args.attention_backend = "flashinfer"
+        elif server_args.enable_flashmla:
+            # TODO: remove this branch after enable_flashmla is deprecated
+            logger.info("MLA optimization is turned on. Use flashmla decode.")
+            server_args.attention_backend = "flashmla"
+        elif server_args.attention_backend is None:
+            # By default, use flashinfer for non-mla attention and triton for mla attention
+            if not self.use_mla_backend:
+                server_args.attention_backend = (
+                    "flashinfer" if is_flashinfer_available() else "triton"
+                )
+            else:
+                server_args.attention_backend = "triton"
+            logger.info(
+                f"Attention backend not set. Use {server_args.attention_backend} backend by default."
+            )
+        elif self.use_mla_backend:
             # TODO: add MLA optimization on CPU
             if server_args.device != "cpu":
-                if server_args.
+                if server_args.attention_backend in ["flashinfer", "fa3", "triton"]:
                     logger.info(
-                        "MLA optimization is turned on. Use
+                        f"MLA optimization is turned on. Use {server_args.attention_backend} backend."
                     )
-                    server_args.attention_backend = "flashinfer_mla"
-                elif server_args.enable_flashmla:
-                    logger.info("MLA optimization is turned on. Use flashmla decode.")
-                    server_args.attention_backend = "flashmla"
                 else:
-
-
+                    raise ValueError(
+                        f"Invalid attention backend for MLA: {server_args.attention_backend}"
+                    )
+            else:
+                raise ValueError(f"MLA optimization not supported on CPU.")
 
         if server_args.enable_double_sparsity:
             logger.info(
@@ -246,17 +270,16 @@ class ModelRunner:
             self.init_double_sparsity_channel_config(server_args.ds_heavy_channel_type)
 
         if self.is_multimodal:
-            self.mem_fraction_static *= 0.
+            self.mem_fraction_static *= 0.90
             logger.info(
                 f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static:.3f} "
                 f"because this is a multimodal model."
             )
 
-
-            "
-
-
-            server_args.chunked_prefill_size = -1
+            logger.info(
+                "Automatically turn off --chunked-prefill-size for multimodal model."
+            )
+            server_args.chunked_prefill_size = -1
 
         if self.model_config.hf_config.architectures == [
             "Qwen2VLForConditionalGeneration"
@@ -264,25 +287,11 @@ class ModelRunner:
             "Qwen2_5_VLForConditionalGeneration"
         ]:
             # TODO: qwen2-vl series does not support radix cache now, set disable_radix_cache=True automatically
-            logger.info(
-                "Automatically turn off --chunked-prefill-size and disable radix cache for qwen-vl series."
-            )
-            server_args.chunked_prefill_size = -1
-            server_args.disable_radix_cache = True
-
-        if self.model_config.hf_config.architectures == ["DeepseekVL2ForCausalLM"]:
-            # TODO: deepseek-vl2 does not support radix cache now, set disable_radix_cache=True automatically
-            logger.info(
-                "Automatically turn off --chunked-prefill-size and disable radix cache for deepseek-vl2."
-            )
-            server_args.chunked_prefill_size = -1
+            logger.info("Automatically disable radix cache for qwen-vl series.")
             server_args.disable_radix_cache = True
 
         if server_args.enable_deepep_moe:
-            logger.info("DeepEP is turned on.")
-            assert (
-                server_args.enable_dp_attention == True
-            ), "Currently DeepEP is bind to Attention DP. Set '--enable-dp-attention --enable-deepep-moe'"
+            logger.info(f"DeepEP is turned on. DeepEP mode: {server_args.deepep_mode}")
 
     def init_torch_distributed(self):
         logger.info("Init torch distributed begin.")
@@ -644,10 +653,7 @@ class ModelRunner:
         available_gpu_memory = get_available_gpu_memory(
             self.device, self.gpu_id, distributed=self.tp_size > 1
         )
-        if
-            self.model_config.attention_arch == AttentionArch.MLA
-            and not self.server_args.disable_mla
-        ):
+        if self.use_mla_backend:
             cell_size = (
                 (self.model_config.kv_lora_rank + self.model_config.qk_rope_head_dim)
                 * self.model_config.num_hidden_layers
@@ -758,10 +764,7 @@ class ModelRunner:
         # Draft worker shares req_to_token_pool with the target worker.
         assert self.is_draft_worker
 
-        if
-            self.model_config.attention_arch == AttentionArch.MLA
-            and not self.server_args.disable_mla
-        ):
+        if self.use_mla_backend:
             self.token_to_kv_pool = MLATokenToKVPool(
                 self.max_total_num_tokens,
                 page_size=self.page_size,
@@ -832,14 +835,21 @@ class ModelRunner:
     def init_attention_backend(self):
         """Init attention kernel backend."""
         if self.server_args.attention_backend == "flashinfer":
-
-
-
+            if not self.use_mla_backend:
+                from sglang.srt.layers.attention.flashinfer_backend import (
+                    FlashInferAttnBackend,
+                )
 
-
-
-
-
+                # Init streams
+                if self.server_args.speculative_algorithm == "EAGLE":
+                    self.plan_stream_for_flashinfer = torch.cuda.Stream()
+                self.attn_backend = FlashInferAttnBackend(self)
+            else:
+                from sglang.srt.layers.attention.flashinfer_mla_backend import (
+                    FlashInferMLAAttnBackend,
+                )
+
+                self.attn_backend = FlashInferMLAAttnBackend(self)
         elif self.server_args.attention_backend == "triton":
             assert self.sliding_window_size is None, (
                 "Window attention is not supported in the triton attention backend. "
@@ -865,12 +875,6 @@ class ModelRunner:
             )
 
             self.attn_backend = TorchNativeAttnBackend(self)
-        elif self.server_args.attention_backend == "flashinfer_mla":
-            from sglang.srt.layers.attention.flashinfer_mla_backend import (
-                FlashInferMLAAttnBackend,
-            )
-
-            self.attn_backend = FlashInferMLAAttnBackend(self)
         elif self.server_args.attention_backend == "flashmla":
             from sglang.srt.layers.attention.flashmla_backend import FlashMLABackend
 
@@ -881,7 +885,7 @@ class ModelRunner:
                 "Please use `--attention-backend flashinfer`."
             )
             logger.warning(
-                "FlashAttention v3 Backend is in Beta. Multimodal,
+                "FlashAttention v3 Backend is in Beta. Multimodal, FP8, and Speculative Decoding are not supported."
             )
             from sglang.srt.layers.attention.flashattention_backend import (
                 FlashAttentionBackend,
@@ -1082,8 +1086,9 @@ def _model_load_weights_direct(model, named_tensors: List[Tuple[str, torch.Tenso
 
 def _unwrap_tensor(tensor, tp_rank):
     if isinstance(tensor, LocalSerializedTensor):
-
-
+        monkey_patch_torch_reductions()
+        tensor = tensor.get(tp_rank)
+    return tensor.to(torch.cuda.current_device())
 
 
 @dataclass
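The backend-selection changes in model_specific_adjustment above can be summarized as a pure function; the sketch below is a simplification (plain strings and booleans instead of ServerArgs/ModelRunner, CPU handling omitted) of the resolution order introduced in this release.

from typing import Optional

def resolve_attention_backend(
    requested: Optional[str],
    use_mla_backend: bool,
    flashinfer_available: bool,
    enable_flashinfer_mla: bool = False,
    enable_flashmla: bool = False,
) -> str:
    if enable_flashinfer_mla:      # deprecated flag, now mapped to "flashinfer"
        return "flashinfer"
    if enable_flashmla:            # deprecated flag, now mapped to "flashmla"
        return "flashmla"
    if requested is None:
        # Default: flashinfer when installed for non-MLA models, triton otherwise.
        if not use_mla_backend:
            return "flashinfer" if flashinfer_available else "triton"
        return "triton"
    if use_mla_backend and requested not in ("flashinfer", "fa3", "triton"):
        raise ValueError(f"Invalid attention backend for MLA: {requested}")
    return requested

assert resolve_attention_backend(None, use_mla_backend=False, flashinfer_available=True) == "flashinfer"
assert resolve_attention_backend(None, use_mla_backend=True, flashinfer_available=True) == "triton"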
sglang/srt/model_loader/loader.py
CHANGED
@@ -14,7 +14,6 @@ from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from typing import Any, Dict, Generator, Iterable, List, Optional, Tuple, cast
 
-import gguf
 import huggingface_hub
 import numpy as np
 import torch
@@ -490,6 +489,14 @@ class DummyModelLoader(BaseModelLoader):
             # NOTE(woosuk): For accurate performance evaluation, we assign
             # random values to the weights.
             initialize_dummy_weights(model)
+
+            # Model weight loading consists of two stages:
+            # 1. Initial weight loading.
+            # 2. Post-processing of weights, including assigning specific member variables.
+            # For `dummy_init`, only the second stage is required.
+            if hasattr(model, "post_load_weights"):
+                model.post_load_weights()
+
         return model.eval()
 
 
@@ -1155,6 +1162,17 @@ class GGUFModelLoader(BaseModelLoader):
         See "Standardized tensor names" in
         https://github.com/ggerganov/ggml/blob/master/docs/gguf.md for details.
         """
+
+        # only load the gguf module when needed
+        try:
+            import gguf
+
+            # FIXME: add version check for gguf
+        except ImportError as err:
+            raise ImportError(
+                "Please install gguf via `pip install gguf` to use gguf quantizer."
+            ) from err
+
         config = model_config.hf_config
         model_type = config.model_type
         # hack: ggufs have a different name than transformers
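The GGUFModelLoader hunk above turns a hard top-level dependency into a lazy one; a generic sketch of that deferred-import pattern, using a hypothetical helper name, looks like this.

def _require_gguf():
    # Import the optional dependency only when a GGUF checkpoint is actually
    # requested, and fail with an actionable message otherwise.
    try:
        import gguf
    except ImportError as err:
        raise ImportError(
            "Please install gguf via `pip install gguf` to load GGUF checkpoints."
        ) from err
    return gguf

def read_gguf_tensor_names(gguf_file: str) -> list:
    gguf = _require_gguf()
    reader = gguf.GGUFReader(gguf_file)   # same reader call used in weight_utils.py
    return [tensor.name for tensor in reader.tensors]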
sglang/srt/model_loader/weight_utils.py
CHANGED
@@ -22,7 +22,6 @@ from typing import (
 )
 
 import filelock
-import gguf
 import huggingface_hub.constants
 import numpy as np
 import safetensors.torch
@@ -93,7 +92,7 @@ def convert_bin_to_safetensor_file(
     pt_filename: str,
     sf_filename: str,
 ) -> None:
-    loaded = torch.load(pt_filename, map_location="cpu")
+    loaded = torch.load(pt_filename, map_location="cpu", weights_only=True)
     if "state_dict" in loaded:
         loaded = loaded["state_dict"]
     shared = _shared_pointers(loaded)
@@ -381,7 +380,7 @@ def np_cache_weights_iterator(
             disable=not enable_tqdm,
             bar_format=_BAR_FORMAT,
         ):
-            state = torch.load(bin_file, map_location="cpu")
+            state = torch.load(bin_file, map_location="cpu", weights_only=True)
             for name, param in state.items():
                 param_path = os.path.join(np_folder, name)
                 with open(param_path, "wb") as f:
@@ -464,6 +463,8 @@ def pt_weights_iterator(
 def get_gguf_extra_tensor_names(
     gguf_file: str, gguf_to_hf_name_map: Dict[str, str]
 ) -> List[str]:
+    import gguf
+
     reader = gguf.GGUFReader(gguf_file)
     expected_gguf_keys = set(gguf_to_hf_name_map.keys())
     exact_gguf_keys = set([tensor.name for tensor in reader.tensors])
@@ -479,6 +480,8 @@ def gguf_quant_weights_iterator(
     them to torch tensors
     """
 
+    import gguf
+
     reader = gguf.GGUFReader(gguf_file)
 
     for tensor in reader.tensors:
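The weights_only=True changes above harden checkpoint loading: torch.load then restricts unpickling to tensors and plain containers, so a crafted .bin file cannot run arbitrary code during deserialization. A minimal sketch of the pattern, with an illustrative file name:

import torch

def load_state_dict(path: str) -> dict:
    # weights_only=True (available since PyTorch 1.13) refuses to unpickle
    # arbitrary Python objects, mirroring the change in weight_utils.py.
    loaded = torch.load(path, map_location="cpu", weights_only=True)
    # Some checkpoints nest the parameters under a "state_dict" key.
    if "state_dict" in loaded:
        loaded = loaded["state_dict"]
    return loaded

# Usage (hypothetical path):
# state = load_state_dict("pytorch_model.bin")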