sglang 0.4.5.post1__py3-none-any.whl → 0.4.5.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -4
- sglang/bench_one_batch.py +2 -2
- sglang/bench_serving.py +0 -4
- sglang/lang/backend/anthropic.py +0 -4
- sglang/lang/backend/base_backend.py +1 -1
- sglang/lang/backend/openai.py +1 -1
- sglang/lang/backend/vertexai.py +0 -1
- sglang/lang/compiler.py +1 -7
- sglang/lang/tracer.py +3 -7
- sglang/srt/_custom_ops.py +0 -2
- sglang/srt/constrained/outlines_jump_forward.py +14 -1
- sglang/srt/constrained/triton_ops/bitmask_ops.py +141 -0
- sglang/srt/constrained/xgrammar_backend.py +26 -4
- sglang/srt/custom_op.py +0 -62
- sglang/srt/disaggregation/decode.py +62 -6
- sglang/srt/disaggregation/mini_lb.py +5 -1
- sglang/srt/disaggregation/mooncake/conn.py +32 -62
- sglang/srt/disaggregation/mooncake/transfer_engine.py +30 -61
- sglang/srt/disaggregation/prefill.py +40 -4
- sglang/srt/disaggregation/utils.py +15 -0
- sglang/srt/entrypoints/verl_engine.py +7 -5
- sglang/srt/layers/activation.py +6 -8
- sglang/srt/layers/attention/flashattention_backend.py +114 -71
- sglang/srt/layers/attention/flashinfer_backend.py +5 -2
- sglang/srt/layers/attention/torch_native_backend.py +6 -1
- sglang/srt/layers/attention/triton_backend.py +6 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +13 -2
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/linear.py +17 -3
- sglang/srt/layers/moe/ep_moe/layer.py +15 -29
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +14 -19
- sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
- sglang/srt/layers/moe/topk.py +27 -30
- sglang/srt/layers/parameter.py +0 -2
- sglang/srt/layers/quantization/__init__.py +1 -0
- sglang/srt/layers/quantization/blockwise_int8.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +8 -2
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +16 -44
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +4 -7
- sglang/srt/layers/quantization/fp8.py +115 -132
- sglang/srt/layers/quantization/fp8_kernel.py +213 -57
- sglang/srt/layers/quantization/fp8_utils.py +187 -262
- sglang/srt/layers/quantization/moe_wna16.py +2 -0
- sglang/srt/layers/quantization/utils.py +5 -11
- sglang/srt/layers/quantization/w8a8_fp8.py +2 -0
- sglang/srt/layers/quantization/w8a8_int8.py +7 -7
- sglang/srt/layers/radix_attention.py +15 -0
- sglang/srt/layers/rotary_embedding.py +3 -2
- sglang/srt/layers/sampler.py +5 -10
- sglang/srt/lora/backend/base_backend.py +18 -2
- sglang/srt/lora/backend/flashinfer_backend.py +1 -1
- sglang/srt/lora/backend/triton_backend.py +1 -1
- sglang/srt/lora/layers.py +1 -1
- sglang/srt/lora/lora.py +1 -1
- sglang/srt/lora/lora_manager.py +1 -1
- sglang/srt/managers/detokenizer_manager.py +0 -1
- sglang/srt/managers/io_struct.py +1 -0
- sglang/srt/managers/mm_utils.py +4 -3
- sglang/srt/managers/multimodal_processor.py +0 -2
- sglang/srt/managers/multimodal_processors/base_processor.py +3 -2
- sglang/srt/managers/schedule_batch.py +2 -4
- sglang/srt/managers/scheduler.py +12 -71
- sglang/srt/managers/tokenizer_manager.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +5 -1
- sglang/srt/mem_cache/memory_pool.py +7 -2
- sglang/srt/model_executor/cuda_graph_runner.py +2 -2
- sglang/srt/model_executor/model_runner.py +20 -27
- sglang/srt/models/bert.py +398 -0
- sglang/srt/models/deepseek.py +1 -1
- sglang/srt/models/deepseek_nextn.py +74 -70
- sglang/srt/models/deepseek_v2.py +289 -348
- sglang/srt/models/llama.py +5 -5
- sglang/srt/models/minicpm3.py +29 -201
- sglang/srt/models/qwen2.py +4 -1
- sglang/srt/models/qwen2_moe.py +14 -13
- sglang/srt/models/qwen3.py +335 -0
- sglang/srt/models/qwen3_moe.py +423 -0
- sglang/srt/reasoning_parser.py +0 -1
- sglang/srt/sampling/sampling_batch_info.py +2 -3
- sglang/srt/server_args.py +34 -32
- sglang/srt/speculative/eagle_worker.py +4 -7
- sglang/srt/utils.py +16 -1
- sglang/test/runners.py +5 -1
- sglang/test/test_block_fp8.py +167 -0
- sglang/test/test_custom_ops.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post2.dist-info}/METADATA +3 -3
- {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post2.dist-info}/RECORD +92 -91
- {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post2.dist-info}/WHEEL +1 -1
- sglang/lang/__init__.py +0 -0
- sglang/srt/lora/backend/__init__.py +0 -25
- sglang/srt/server.py +0 -18
- {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post2.dist-info}/top_level.txt +0 -0
sglang/srt/layers/radix_attention.py CHANGED
@@ -13,6 +13,7 @@
 # ==============================================================================
 """Radix attention."""
 
+from enum import Enum
 from typing import Optional
 
 from torch import nn
@@ -22,6 +23,18 @@ from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 
 
+class AttentionType(Enum):
+    """
+    Attention type.
+    Use string to be compatible with `torch.compile`.
+    """
+
+    # Decoder attention between previous layer Q/K/V
+    DECODER = "decoder"
+    # Encoder attention between previous layer Q/K/V
+    ENCODER_ONLY = "encoder_only"
+
+
 class RadixAttention(nn.Module):
     """
     The attention layer implementation.
@@ -39,6 +52,7 @@ class RadixAttention(nn.Module):
         sliding_window_size: int = -1,
         is_cross_attention: bool = False,
         quant_config: Optional[QuantizationConfig] = None,
+        attn_type=AttentionType.DECODER,
         prefix: str = "",
         use_irope: bool = False,
     ):
@@ -64,6 +78,7 @@ class RadixAttention(nn.Module):
         self.quant_method = quant_config.get_quant_method(self, prefix=prefix)
         if self.quant_method is not None:
             self.quant_method.create_weights(self)
+        self.attn_type = attn_type
 
     def forward(
         self,
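The new `attn_type` flag defaults to `AttentionType.DECODER`, so existing decoder call sites are unchanged, while encoder-style models (presumably the newly added `sglang/srt/models/bert.py`) can opt into `ENCODER_ONLY`. A minimal sketch of constructing a layer with the flag; the arguments other than `attn_type` (num_heads, head_dim, scaling, num_kv_heads, layer_id) are assumed from the existing RadixAttention signature and are not part of this hunk:

# Hedged sketch: all keyword arguments other than attn_type are assumptions,
# not shown in this diff.
from sglang.srt.layers.radix_attention import AttentionType, RadixAttention

attn = RadixAttention(
    num_heads=12,
    head_dim=64,
    scaling=64**-0.5,
    num_kv_heads=12,
    layer_id=0,
    attn_type=AttentionType.ENCODER_ONLY,  # default is AttentionType.DECODER
)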
sglang/srt/layers/rotary_embedding.py CHANGED
@@ -11,10 +11,11 @@ from sglang.srt.custom_op import CustomOp
 from sglang.srt.utils import is_cuda_available
 
 _is_cuda_available = is_cuda_available()
+
 if _is_cuda_available:
     from sgl_kernel import apply_rope_with_cos_sin_cache_inplace
 else:
-    from vllm import
+    from vllm._custom_ops import rotary_embedding as vllm_rotary_embedding
 
 
 def _rotate_neox(x: torch.Tensor) -> torch.Tensor:
@@ -159,7 +160,7 @@ class RotaryEmbedding(CustomOp):
             )
         else:
             self.cos_sin_cache = self.cos_sin_cache.to(query.device, dtype=query.dtype)
-
+            vllm_rotary_embedding(
                 positions,
                 query,
                 key,
sglang/srt/layers/sampler.py CHANGED
@@ -93,28 +93,23 @@ class Sampler(nn.Module):
             ).clamp(min=torch.finfo(probs.dtype).min)
 
             max_top_k_round, batch_size = 32, probs.shape[0]
-            uniform_samples = torch.rand(
-                (max_top_k_round, batch_size), device=probs.device
-            )
             if sampling_info.need_min_p_sampling:
                 probs = top_k_renorm_prob(probs, sampling_info.top_ks)
                 probs = top_p_renorm_prob(probs, sampling_info.top_ps)
                 batch_next_token_ids = min_p_sampling_from_probs(
-                    probs,
+                    probs, sampling_info.min_ps
                 )
             else:
-
+                # Check Nan will throw exception, only check when crash_on_warnings is True
+                check_nan = self.use_nan_detection and crash_on_warnings()
+                batch_next_token_ids = top_k_top_p_sampling_from_probs(
                     probs,
-                    uniform_samples,
                     sampling_info.top_ks,
                     sampling_info.top_ps,
                     filter_apply_order="joint",
+                    check_nan=check_nan,
                 )
 
-            if self.use_nan_detection and not torch.all(success):
-                logger.warning("Detected errors during sampling!")
-                batch_next_token_ids = torch.zeros_like(batch_next_token_ids)
-
         elif global_server_args_dict["sampling_backend"] == "pytorch":
             # A slower fallback implementation with torch native operations.
             batch_next_token_ids = top_k_top_p_min_p_sampling_from_probs_torch(
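The flashinfer path no longer pre-draws `uniform_samples`; the kernel draws its own randomness, and NaN checking moves into the call via `check_nan`. A hedged sketch of the new call shape, using only the arguments that appear in the hunk; it assumes a CUDA device and a flashinfer build whose sampling kernels no longer take `uniform_samples` (the tensors are placeholders, not sglang internals):

# Hedged sketch of the updated top-k/top-p sampling call.
import torch
from flashinfer.sampling import top_k_top_p_sampling_from_probs

probs = torch.softmax(torch.randn(4, 32000, device="cuda"), dim=-1)
top_ks = torch.full((4,), 20, dtype=torch.int32, device="cuda")
top_ps = torch.full((4,), 0.9, device="cuda")

next_token_ids = top_k_top_p_sampling_from_probs(
    probs,
    top_ks,
    top_ps,
    filter_apply_order="joint",
    check_nan=False,  # sglang enables this only when NaN detection and crash_on_warnings() are on
)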
sglang/srt/lora/backend/base_backend.py CHANGED
@@ -75,7 +75,7 @@ class BaseLoRABackend:
         qkv_lora_a: torch.Tensor,
         qkv_lora_b: Union[torch.Tensor, Tuple[torch.Tensor]],
         *args,
-        **kwargs
+        **kwargs,
     ) -> torch.Tensor:
         """Run the lora pass for QKV Layer.
 
@@ -98,7 +98,7 @@ class BaseLoRABackend:
         gate_up_lora_a: torch.Tensor,
         gate_up_lora_b: Union[torch.Tensor, Tuple[torch.Tensor]],
         *args,
-        **kwargs
+        **kwargs,
     ) -> torch.Tensor:
         """Run the lora pass for gate_up_proj, usually attached to MergedColumnParallelLayer.
 
@@ -115,3 +115,19 @@ class BaseLoRABackend:
 
     def set_batch_info(self, batch_info: LoRABatchInfo):
         self.batch_info = batch_info
+
+
+def get_backend_from_name(name: str) -> BaseLoRABackend:
+    """
+    Get corresponding backend class from backend's name
+    """
+    if name == "triton":
+        from sglang.srt.lora.backend.triton_backend import TritonLoRABackend
+
+        return TritonLoRABackend
+    elif name == "flashinfer":
+        from sglang.srt.lora.backend.flashinfer_backend import FlashInferLoRABackend
+
+        return FlashInferLoRABackend
+    else:
+        raise ValueError(f"Invalid backend: {name}")
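`get_backend_from_name` now lives alongside `BaseLoRABackend` (the old `sglang/srt/lora/backend/__init__.py` re-exports were removed, per the file list) and returns a backend class, not an instance. A brief usage sketch; the backend constructors are not shown in this diff, so instantiation is omitted:

# Hedged sketch: resolve a LoRA backend class by name.
from sglang.srt.lora.backend.base_backend import get_backend_from_name

backend_cls = get_backend_from_name("triton")  # -> TritonLoRABackend (class)

try:
    get_backend_from_name("unknown")
except ValueError as err:
    print(err)  # Invalid backend: unknown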
sglang/srt/lora/backend/flashinfer_backend.py CHANGED
@@ -2,7 +2,7 @@ from typing import Tuple
 
 import torch
 
-from sglang.srt.lora.backend import BaseLoRABackend
+from sglang.srt.lora.backend.base_backend import BaseLoRABackend
 from sglang.srt.lora.utils import LoRABatchInfo
 from sglang.srt.utils import is_flashinfer_available
 
sglang/srt/lora/layers.py CHANGED
@@ -16,7 +16,7 @@ from sglang.srt.layers.linear import (
     RowParallelLinear,
 )
 from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
-from sglang.srt.lora.backend import BaseLoRABackend
+from sglang.srt.lora.backend.base_backend import BaseLoRABackend
 
 
 class BaseLayerWithLoRA(nn.Module):
sglang/srt/lora/lora.py CHANGED
@@ -27,7 +27,7 @@ from torch import nn
 
 from sglang.srt.configs.load_config import LoadConfig
 from sglang.srt.hf_transformers_utils import AutoConfig
-from sglang.srt.lora.backend import BaseLoRABackend
+from sglang.srt.lora.backend.base_backend import BaseLoRABackend
 from sglang.srt.lora.lora_config import LoRAConfig
 from sglang.srt.model_loader.loader import DefaultModelLoader
 
sglang/srt/lora/lora_manager.py CHANGED
@@ -22,7 +22,7 @@ import torch
 
 from sglang.srt.configs.load_config import LoadConfig
 from sglang.srt.hf_transformers_utils import AutoConfig
-from sglang.srt.lora.backend import BaseLoRABackend, get_backend_from_name
+from sglang.srt.lora.backend.base_backend import BaseLoRABackend, get_backend_from_name
 from sglang.srt.lora.layers import BaseLayerWithLoRA, get_lora_layer
 from sglang.srt.lora.lora import LoRAAdapter
 from sglang.srt.lora.lora_config import LoRAConfig
sglang/srt/managers/io_struct.py CHANGED

sglang/srt/managers/mm_utils.py CHANGED
@@ -1,7 +1,8 @@
 """
-
+Multi-modality utils
 """
 
+import logging
 from abc import abstractmethod
 from typing import Callable, List, Optional, Tuple
 
@@ -12,11 +13,11 @@ from sglang.srt.managers.schedule_batch import (
     MultimodalDataItem,
     MultimodalInputs,
     global_server_args_dict,
-    logger,
 )
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.utils import print_warning_once
-
+
+logger = logging.getLogger(__name__)
 
 
 class MultiModalityDataPaddingPattern:
sglang/srt/managers/multimodal_processors/base_processor.py CHANGED
@@ -8,8 +8,6 @@ from typing import List, Optional
 
 import numpy as np
 import PIL
-from decord import VideoReader, cpu
-from PIL import Image
 from transformers import BaseImageProcessorFast
 
 from sglang.srt.managers.schedule_batch import Modality
@@ -102,6 +100,9 @@ class BaseMultimodalProcessor(ABC):
         """
         estimate the total frame count from all visual input
         """
+        # Lazy import because decord is not available on some arm platforms.
+        from decord import VideoReader, cpu
+
         # Before processing inputs
         estimated_frames_list = []
         for image in image_data:
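Importing `decord` inside the method means the dependency is only required when video frames are actually counted, which keeps the module importable on platforms where decord has no wheel. A minimal standalone sketch of the same lazy-import pattern; `count_video_frames` is a hypothetical helper, not an sglang API:

def count_video_frames(video_path: str) -> int:
    # Hypothetical helper illustrating the lazy-import pattern above: decord
    # is imported only when a video is actually read.
    from decord import VideoReader, cpu

    return len(VideoReader(video_path, ctx=cpu(0)))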
sglang/srt/managers/schedule_batch.py CHANGED
@@ -67,7 +67,6 @@ global_server_args_dict = {
     "attention_backend": ServerArgs.attention_backend,
     "sampling_backend": ServerArgs.sampling_backend,
     "triton_attention_reduce_in_fp32": ServerArgs.triton_attention_reduce_in_fp32,
-    "disable_mla": ServerArgs.disable_mla,
     "torchao_config": ServerArgs.torchao_config,
     "enable_nan_detection": ServerArgs.enable_nan_detection,
     "enable_dp_attention": ServerArgs.enable_dp_attention,
@@ -77,12 +76,11 @@ global_server_args_dict = {
     "device": ServerArgs.device,
     "speculative_accept_threshold_single": ServerArgs.speculative_accept_threshold_single,
     "speculative_accept_threshold_acc": ServerArgs.speculative_accept_threshold_acc,
-    "enable_flashmla": ServerArgs.enable_flashmla,
     "disable_radix_cache": ServerArgs.disable_radix_cache,
     "flashinfer_mla_disable_ragged": ServerArgs.flashinfer_mla_disable_ragged,
+    "moe_dense_tp_size": ServerArgs.moe_dense_tp_size,
     "chunked_prefill_size": ServerArgs.chunked_prefill_size,
     "n_share_experts_fusion": ServerArgs.n_share_experts_fusion,
-    "disable_shared_experts_fusion": ServerArgs.disable_shared_experts_fusion,
     "disable_chunked_prefix_cache": ServerArgs.disable_chunked_prefix_cache,
 }
 
@@ -1481,7 +1479,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
                 global_server_args_dict["use_mla_backend"]
                 and global_server_args_dict["attention_backend"] == "flashinfer"
             )
-            or global_server_args_dict["
+            or global_server_args_dict["attention_backend"] == "flashmla"
             or global_server_args_dict["attention_backend"] == "fa3"
         ):
             seq_lens_cpu = self.seq_lens.cpu()
sglang/srt/managers/scheduler.py CHANGED
@@ -391,6 +391,7 @@ class Scheduler(
         self.torch_profiler = None
         self.torch_profiler_output_dir: Optional[str] = None
         self.profiler_activities: Optional[List[str]] = None
+        self.profiler_id: Optional[str] = None
         self.profiler_target_forward_ct: Optional[int] = None
 
         # Init metrics stats
@@ -484,7 +485,7 @@ class Scheduler(
             self.tree_cache = HiRadixCache(
                 req_to_token_pool=self.req_to_token_pool,
                 token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
-                tp_cache_group=self.
+                tp_cache_group=self.tp_cpu_group,
                 page_size=self.page_size,
                 hicache_ratio=server_args.hicache_ratio,
             )
@@ -553,7 +554,7 @@ class Scheduler(
 
             # The decode requests polling kv cache
            self.disagg_decode_transfer_queue = DecodeTransferQueue(
-                gloo_group=self.
+                gloo_group=self.attn_tp_cpu_group,
                 req_to_metadata_buffer_idx_allocator=req_to_metadata_buffer_idx_allocator,
                 metadata_buffers=metadata_buffers,
             )
@@ -568,7 +569,7 @@ class Scheduler(
                 scheduler=self,
                 transfer_queue=self.disagg_decode_transfer_queue,
                 tree_cache=self.tree_cache,
-                gloo_group=self.
+                gloo_group=self.attn_tp_cpu_group,
                 tp_rank=self.tp_rank,
                 tp_size=self.tp_size,
                 bootstrap_port=self.server_args.disaggregation_bootstrap_port,
@@ -597,7 +598,7 @@ class Scheduler(
                 tp_rank=self.tp_rank,
                 tp_size=self.tp_size,
                 bootstrap_port=self.server_args.disaggregation_bootstrap_port,
-                gloo_group=self.
+                gloo_group=self.attn_tp_cpu_group,
                 transfer_backend=self.transfer_backend,
                 scheduler=self,
             )
@@ -664,70 +665,6 @@ class Scheduler(
 
             self.last_batch = batch
 
-    @torch.no_grad()
-    def event_loop_normal_disagg_prefill(self):
-        """A normal scheduler loop for prefill worker in disaggregation mode."""
-
-        while True:
-            recv_reqs = self.recv_requests()
-            self.process_input_requests(recv_reqs)
-            self.waiting_queue.extend(
-                self.disagg_prefill_pending_queue.pop_bootstrapped()
-            )
-            self.process_prefill_chunk()
-            batch = self.get_new_batch_prefill()
-            self.cur_batch = batch
-
-            if batch:
-                result = self.run_batch(batch)
-                self.process_batch_result_disagg_prefill(batch, result)
-
-            if len(self.disagg_prefill_inflight_queue) > 0:
-                self.process_disagg_prefill_inflight_queue()
-
-            if batch is None and len(self.disagg_prefill_inflight_queue) == 0:
-                self.check_memory()
-                self.new_token_ratio = self.init_new_token_ratio
-
-            self.last_batch = batch
-            # HACK (byronhsu): reset the batch_is_full flag because we never enter update_running_batch which resets it
-            # Otherwise, it hangs under high concurrency
-            self.running_batch.batch_is_full = False
-
-    @torch.no_grad()
-    def event_loop_normal_disagg_decode(self):
-        """A normal scheduler loop for decode worker in disaggregation mode."""
-
-        while True:
-            recv_reqs = self.recv_requests()
-            self.process_input_requests(recv_reqs)
-            # polling and allocating kv cache
-            self.process_decode_queue()
-            batch = self.get_next_disagg_decode_batch_to_run()
-            self.cur_batch = batch
-
-            if batch:
-                # Generate fake extend output.
-                if batch.forward_mode.is_extend():
-                    # Note: Logprobs should be handled on the prefill engine.
-                    self.stream_output(
-                        batch.reqs, [False for _ in range(len(batch.reqs))]
-                    )
-                else:
-                    result = self.run_batch(batch)
-                    self.process_batch_result(batch, result)
-
-            if batch is None and (
-                len(self.disagg_decode_transfer_queue.queue)
-                + len(self.disagg_decode_prealloc_queue.queue)
-                == 0
-            ):
-                # When the server is idle, do self-check and re-init some states
-                self.check_memory()
-                self.new_token_ratio = self.init_new_token_ratio
-
-            self.last_batch = batch
-
     def recv_requests(self) -> List[Req]:
         """Receive results at tp_rank = 0 and broadcast it to all other TP ranks."""
         if self.attn_tp_rank == 0:
@@ -1869,6 +1806,7 @@ class Scheduler(
                 recv_req.activities,
                 recv_req.with_stack,
                 recv_req.record_shapes,
+                recv_req.profile_id,
             )
         else:
             return self.stop_profile()
@@ -1880,6 +1818,7 @@ class Scheduler(
         activities: Optional[List[str]],
         with_stack: Optional[bool],
         record_shapes: Optional[bool],
+        profile_id: Optional[str],
     ) -> None:
         if self.profiler_activities:
             return ProfileReqOutput(
@@ -1894,9 +1833,11 @@ class Scheduler(
 
         self.torch_profiler_output_dir = output_dir
         self.profiler_activities = activities
+        self.profiler_id = profile_id
         logger.info(
-            "Profiling starts. Traces will be saved to: %s",
+            "Profiling starts. Traces will be saved to: %s (with id %s)",
             self.torch_profiler_output_dir,
+            self.profiler_id,
         )
 
         activity_map = {
@@ -1938,14 +1879,14 @@ class Scheduler(
             self.torch_profiler.export_chrome_trace(
                 os.path.join(
                     self.torch_profiler_output_dir,
-
+                    self.profiler_id + f"-TP-{self.tp_rank}" + ".trace.json.gz",
                 )
             )
 
         if "MEM" in self.profiler_activities:
             memory_profile_path = os.path.join(
                 self.torch_profiler_output_dir,
-
+                self.profiler_id + f"-TP-{self.tp_rank}-memory" + ".pickle",
             )
             torch.cuda.memory._dump_snapshot(memory_profile_path)
             torch.cuda.memory._record_memory_history(enabled=None)
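Profiler outputs are now named from the request-supplied `profile_id` plus the TP rank, which makes it easy to correlate traces from one profiling run across ranks. A small sketch of the resulting paths; the directory, id, and rank values are examples, not sglang defaults:

# Hedged sketch of the file names produced after this change.
import os

output_dir, profile_id, tp_rank = "/tmp/profiles", "req-1234", 0

trace_path = os.path.join(output_dir, profile_id + f"-TP-{tp_rank}" + ".trace.json.gz")
memory_path = os.path.join(output_dir, profile_id + f"-TP-{tp_rank}-memory" + ".pickle")

print(trace_path)   # /tmp/profiles/req-1234-TP-0.trace.json.gz
print(memory_path)  # /tmp/profiles/req-1234-TP-0-memory.pickle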
sglang/srt/mem_cache/hiradix_cache.py CHANGED
@@ -92,7 +92,7 @@ class HiRadixCache(RadixCache):
             self.ongoing_write_through[node.id] = node
             self.inc_lock_ref(node)
         else:
-            return
+            return 0
 
         return len(host_indices)
 
@@ -153,6 +153,7 @@ class HiRadixCache(RadixCache):
             if x.host_value is None:
                 if self.cache_controller.write_policy == "write_back":
                     num_evicted += self.write_backup(x)
+                    pending_nodes.append(x)
                 elif self.cache_controller.write_policy == "write_through_selective":
                     num_evicted += self._evict_write_through_selective(x)
                 else:
@@ -177,6 +178,9 @@ class HiRadixCache(RadixCache):
         while len(self.ongoing_write_through) > 0:
             self.writing_check()
             time.sleep(0.1)
+        for node in pending_nodes:
+            assert node.host_value is not None
+            self._evict_write_through(node)
 
     def _evict_write_through(self, node: TreeNode):
         # evict a node already written to host
sglang/srt/mem_cache/memory_pool.py CHANGED
@@ -286,8 +286,12 @@ class MHATokenToKVPool(KVCache):
             self.get_key_buffer(i).nbytes for i in range(self.layer_num)
         ] + [self.get_value_buffer(i).nbytes for i in range(self.layer_num)]
         kv_item_lens = [
-            self.get_key_buffer(i)[0].nbytes
-
+            self.get_key_buffer(i)[0].nbytes * self.page_size
+            for i in range(self.layer_num)
+        ] + [
+            self.get_value_buffer(i)[0].nbytes * self.page_size
+            for i in range(self.layer_num)
+        ]
         return kv_data_ptrs, kv_data_lens, kv_item_lens
 
     # Todo: different memory layout
@@ -414,6 +418,7 @@ class MLATokenToKVPool(KVCache):
         enable_memory_saver: bool,
     ):
         self.size = size
+        self.page_size = page_size
         self.dtype = dtype
         self.device = device
         if dtype in (torch.float8_e5m2, torch.float8_e4m3fn):
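`kv_item_lens` now reports bytes per page rather than bytes per token (and `MLATokenToKVPool` starts storing `page_size`), presumably so the downstream KV transfer path works at page granularity. A toy calculation under made-up dimensions; the exact buffer layout is not part of this diff:

# Hedged sketch: per-page KV item size for one layer's key buffer, with
# made-up dimensions; the real buffer shapes live in memory_pool.py.
import torch

page_size, num_kv_heads, head_dim = 16, 8, 128
key_buffer = torch.empty(1024, num_kv_heads, head_dim, dtype=torch.float16)

per_token_bytes = key_buffer[0].nbytes        # one token's K for this layer
per_page_bytes = per_token_bytes * page_size  # what kv_item_lens now reports
print(per_token_bytes, per_page_bytes)        # 2048 32768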
sglang/srt/model_executor/cuda_graph_runner.py CHANGED
@@ -37,11 +37,11 @@ from sglang.srt.model_executor.forward_batch_info import (
 from sglang.srt.patch_torch import monkey_patch_torch_compile
 from sglang.srt.utils import get_available_gpu_memory, is_hip
 
-_is_hip = is_hip()
-
 if TYPE_CHECKING:
     from sglang.srt.model_executor.model_runner import ModelRunner
 
+_is_hip = is_hip()
+
 
 
 def _to_torch(model: torch.nn.Module, reverse: bool, num_tokens: int):
sglang/srt/model_executor/model_runner.py CHANGED
@@ -73,6 +73,7 @@ from sglang.srt.utils import (
     MultiprocessingSerializer,
     enable_show_time_cost,
     get_available_gpu_memory,
+    get_bool_env_var,
     init_custom_process_group,
     is_cuda,
     is_fa3_default_architecture,
@@ -127,10 +128,7 @@ class ModelRunner:
         self.page_size = server_args.page_size
         self.req_to_token_pool = req_to_token_pool
         self.token_to_kv_pool_allocator = token_to_kv_pool_allocator
-        self.use_mla_backend = (
-            self.model_config.attention_arch == AttentionArch.MLA
-            and not server_args.disable_mla
-        )
+        self.use_mla_backend = self.model_config.attention_arch == AttentionArch.MLA
         self.attention_chunk_size = model_config.attention_chunk_size
 
         # Model-specific adjustment
@@ -139,18 +137,12 @@ class ModelRunner:
         if server_args.show_time_cost:
             enable_show_time_cost()
 
-        if server_args.disable_outlines_disk_cache:
-            from outlines.caching import disable_cache
-
-            disable_cache()
-
         # Global vars
         global_server_args_dict.update(
             {
                 "attention_backend": server_args.attention_backend,
                 "sampling_backend": server_args.sampling_backend,
                 "triton_attention_reduce_in_fp32": server_args.triton_attention_reduce_in_fp32,
-                "disable_mla": server_args.disable_mla,
                 "torchao_config": server_args.torchao_config,
                 "enable_nan_detection": server_args.enable_nan_detection,
                 "enable_dp_attention": server_args.enable_dp_attention,
@@ -160,13 +152,12 @@ class ModelRunner:
                 "device": server_args.device,
                 "speculative_accept_threshold_single": server_args.speculative_accept_threshold_single,
                 "speculative_accept_threshold_acc": server_args.speculative_accept_threshold_acc,
-                "enable_flashmla": server_args.enable_flashmla,
                 "disable_radix_cache": server_args.disable_radix_cache,
                 "flashinfer_mla_disable_ragged": server_args.flashinfer_mla_disable_ragged,
+                "moe_dense_tp_size": server_args.moe_dense_tp_size,
                 "debug_tensor_dump_output_folder": server_args.debug_tensor_dump_output_folder,
                 "debug_tensor_dump_inject": server_args.debug_tensor_dump_inject,
                 "n_share_experts_fusion": server_args.n_share_experts_fusion,
-                "disable_shared_experts_fusion": server_args.disable_shared_experts_fusion,
                 "disable_chunked_prefix_cache": server_args.disable_chunked_prefix_cache,
                 "use_mla_backend": self.use_mla_backend,
             }
@@ -229,15 +220,7 @@ class ModelRunner:
     def model_specific_adjustment(self):
         server_args = self.server_args
 
-        if server_args.enable_flashinfer_mla:
-            # TODO: remove this branch after enable_flashinfer_mla is deprecated
-            logger.info("MLA optimization is turned on. Use flashinfer backend.")
-            server_args.attention_backend = "flashinfer"
-        elif server_args.enable_flashmla:
-            # TODO: remove this branch after enable_flashmla is deprecated
-            logger.info("MLA optimization is turned on. Use flashmla decode.")
-            server_args.attention_backend = "flashmla"
-        elif server_args.attention_backend is None:
+        if server_args.attention_backend is None:
             # By default, use flashinfer for non-mla attention and triton for mla attention
             if not self.use_mla_backend:
                 if (
@@ -263,7 +246,12 @@ class ModelRunner:
         elif self.use_mla_backend:
             # TODO: add MLA optimization on CPU
             if server_args.device != "cpu":
-                if server_args.attention_backend in [
+                if server_args.attention_backend in [
+                    "flashinfer",
+                    "fa3",
+                    "triton",
+                    "flashmla",
+                ]:
                     logger.info(
                         f"MLA optimization is turned on. Use {server_args.attention_backend} backend."
                     )
@@ -320,7 +308,6 @@ class ModelRunner:
             logger.info(f"DeepEP is turned on. DeepEP mode: {server_args.deepep_mode}")
 
         if not self.use_mla_backend:
-            logger.info("Disable chunked prefix cache for non-MLA backend.")
             server_args.disable_chunked_prefix_cache = True
         elif self.page_size > 1:
             logger.info("Disable chunked prefix cache when page size > 1.")
@@ -387,10 +374,16 @@ class ModelRunner:
         local_gpu_memory = get_available_gpu_memory(self.device, self.gpu_id)
         if self.tp_size > 1:
             if min_per_gpu_memory < local_gpu_memory * 0.9:
-
-
-
-
+                if get_bool_env_var("SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK"):
+                    logger.warning(
+                        "The memory capacity is unbalanced. Some GPUs may be occupied by other processes. "
+                        f"{min_per_gpu_memory=}, {local_gpu_memory=}, {local_gpu_memory * 0.9=}"
+                    )
+                else:
+                    raise ValueError(
+                        "The memory capacity is unbalanced. Some GPUs may be occupied by other processes. "
+                        f"{min_per_gpu_memory=}, {local_gpu_memory=}, {local_gpu_memory * 0.9=}"
+                    )
 
         logger.info(
             f"Init torch distributed ends. mem usage={(before_avail_memory - local_gpu_memory):.2f} GB"
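The TP memory-imbalance check can now be downgraded from a hard ValueError to a warning through the SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK environment variable. A hedged example of setting it before starting the server; the accepted truthy values depend on get_bool_env_var, which is not shown here, so "1" is an assumption:

# Hedged sketch: opt out of the hard memory-imbalance check on uneven GPUs.
import os

os.environ["SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK"] = "1"  # assumed truthy value

# ...then launch as usual, e.g. (illustrative command):
#   python -m sglang.launch_server --model-path <model> --tp 2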