sglang 0.4.5.post1__py3-none-any.whl → 0.4.5.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -4
- sglang/bench_one_batch.py +2 -2
- sglang/bench_serving.py +3 -6
- sglang/compile_deep_gemm.py +136 -0
- sglang/lang/backend/anthropic.py +0 -4
- sglang/lang/backend/base_backend.py +1 -1
- sglang/lang/backend/openai.py +6 -2
- sglang/lang/backend/runtime_endpoint.py +5 -1
- sglang/lang/backend/vertexai.py +0 -1
- sglang/lang/compiler.py +1 -7
- sglang/lang/tracer.py +3 -7
- sglang/srt/_custom_ops.py +0 -2
- sglang/srt/configs/model_config.py +4 -1
- sglang/srt/constrained/outlines_jump_forward.py +14 -1
- sglang/srt/constrained/triton_ops/bitmask_ops.py +141 -0
- sglang/srt/constrained/xgrammar_backend.py +27 -4
- sglang/srt/custom_op.py +0 -62
- sglang/srt/disaggregation/decode.py +105 -6
- sglang/srt/disaggregation/mini_lb.py +74 -9
- sglang/srt/disaggregation/mooncake/conn.py +33 -63
- sglang/srt/disaggregation/mooncake/transfer_engine.py +30 -61
- sglang/srt/disaggregation/nixl/__init__.py +1 -0
- sglang/srt/disaggregation/nixl/conn.py +622 -0
- sglang/srt/disaggregation/prefill.py +137 -17
- sglang/srt/disaggregation/utils.py +32 -0
- sglang/srt/entrypoints/engine.py +4 -0
- sglang/srt/entrypoints/http_server.py +3 -7
- sglang/srt/entrypoints/verl_engine.py +7 -5
- sglang/srt/function_call_parser.py +60 -0
- sglang/srt/layers/activation.py +6 -8
- sglang/srt/layers/attention/flashattention_backend.py +883 -209
- sglang/srt/layers/attention/flashinfer_backend.py +5 -2
- sglang/srt/layers/attention/torch_native_backend.py +6 -1
- sglang/srt/layers/attention/triton_backend.py +6 -0
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +5 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +18 -7
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +7 -3
- sglang/srt/layers/dp_attention.py +1 -1
- sglang/srt/layers/layernorm.py +20 -5
- sglang/srt/layers/linear.py +17 -3
- sglang/srt/layers/moe/ep_moe/layer.py +17 -29
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +14 -19
- sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
- sglang/srt/layers/moe/topk.py +27 -30
- sglang/srt/layers/parameter.py +0 -2
- sglang/srt/layers/quantization/__init__.py +1 -0
- sglang/srt/layers/quantization/blockwise_int8.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +9 -2
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +16 -44
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +153 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +4 -7
- sglang/srt/layers/quantization/deep_gemm.py +378 -0
- sglang/srt/layers/quantization/fp8.py +115 -132
- sglang/srt/layers/quantization/fp8_kernel.py +213 -88
- sglang/srt/layers/quantization/fp8_utils.py +189 -264
- sglang/srt/layers/quantization/gptq.py +13 -7
- sglang/srt/layers/quantization/modelopt_quant.py +2 -2
- sglang/srt/layers/quantization/moe_wna16.py +2 -0
- sglang/srt/layers/quantization/utils.py +5 -11
- sglang/srt/layers/quantization/w8a8_fp8.py +2 -0
- sglang/srt/layers/quantization/w8a8_int8.py +7 -7
- sglang/srt/layers/radix_attention.py +15 -0
- sglang/srt/layers/rotary_embedding.py +9 -8
- sglang/srt/layers/sampler.py +7 -12
- sglang/srt/lora/backend/base_backend.py +18 -2
- sglang/srt/lora/backend/flashinfer_backend.py +1 -1
- sglang/srt/lora/backend/triton_backend.py +1 -1
- sglang/srt/lora/layers.py +1 -1
- sglang/srt/lora/lora.py +1 -1
- sglang/srt/lora/lora_manager.py +1 -1
- sglang/srt/managers/data_parallel_controller.py +7 -1
- sglang/srt/managers/detokenizer_manager.py +0 -1
- sglang/srt/managers/io_struct.py +15 -3
- sglang/srt/managers/mm_utils.py +4 -3
- sglang/srt/managers/multimodal_processor.py +0 -2
- sglang/srt/managers/multimodal_processors/base_processor.py +3 -2
- sglang/srt/managers/schedule_batch.py +15 -4
- sglang/srt/managers/scheduler.py +28 -77
- sglang/srt/managers/tokenizer_manager.py +116 -29
- sglang/srt/managers/tp_worker.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +41 -29
- sglang/srt/mem_cache/memory_pool.py +38 -15
- sglang/srt/model_executor/cuda_graph_runner.py +15 -10
- sglang/srt/model_executor/model_runner.py +39 -31
- sglang/srt/models/bert.py +398 -0
- sglang/srt/models/deepseek.py +1 -1
- sglang/srt/models/deepseek_nextn.py +74 -70
- sglang/srt/models/deepseek_v2.py +292 -348
- sglang/srt/models/llama.py +5 -5
- sglang/srt/models/minicpm3.py +31 -203
- sglang/srt/models/minicpmo.py +17 -6
- sglang/srt/models/qwen2.py +4 -1
- sglang/srt/models/qwen2_moe.py +14 -13
- sglang/srt/models/qwen3.py +335 -0
- sglang/srt/models/qwen3_moe.py +423 -0
- sglang/srt/openai_api/adapter.py +71 -4
- sglang/srt/openai_api/protocol.py +6 -1
- sglang/srt/reasoning_parser.py +0 -1
- sglang/srt/sampling/sampling_batch_info.py +2 -3
- sglang/srt/server_args.py +86 -72
- sglang/srt/speculative/build_eagle_tree.py +2 -2
- sglang/srt/speculative/eagle_utils.py +2 -2
- sglang/srt/speculative/eagle_worker.py +6 -14
- sglang/srt/utils.py +62 -6
- sglang/test/runners.py +5 -1
- sglang/test/test_block_fp8.py +167 -0
- sglang/test/test_custom_ops.py +1 -1
- sglang/test/test_utils.py +3 -1
- sglang/version.py +1 -1
- {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/METADATA +5 -5
- {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/RECORD +116 -110
- {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/WHEEL +1 -1
- sglang/lang/__init__.py +0 -0
- sglang/srt/lora/backend/__init__.py +0 -25
- sglang/srt/server.py +0 -18
- {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/w8a8_int8.py
CHANGED
@@ -1,13 +1,6 @@
 from typing import Any, Callable, Dict, List, Optional
 
 import torch
-
-from sglang.srt.utils import is_cuda_available, set_weight_attrs
-
-is_cuda = is_cuda_available()
-if is_cuda:
-    from sgl_kernel import int8_scaled_mm
-
 from torch.nn.parameter import Parameter
 
 from sglang.srt.distributed import get_tensor_model_parallel_world_size
@@ -18,6 +11,11 @@ from sglang.srt.layers.quantization.base_config import (
     QuantizeMethodBase,
 )
 from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8
+from sglang.srt.utils import is_cuda, set_weight_attrs
+
+_is_cuda = is_cuda()
+if _is_cuda:
+    from sgl_kernel import int8_scaled_mm
 
 
 class W8A8Int8Config(QuantizationConfig):
@@ -233,6 +231,7 @@ class W8A8Int8MoEMethod:
         apply_router_weight_on_input: bool = False,
         inplace: bool = True,
         no_combine: bool = False,
+        routed_scaling_factor: Optional[float] = None,
     ) -> torch.Tensor:
         from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
         from sglang.srt.layers.moe.topk import select_experts
@@ -248,6 +247,7 @@ class W8A8Int8MoEMethod:
             num_expert_group=num_expert_group,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
+            routed_scaling_factor=routed_scaling_factor,
        )
 
         return fused_experts(
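Note: the hunks above are part of a release-wide rename from `is_cuda_available()` to `is_cuda()`, with the optional `sgl_kernel` extension imported only when CUDA is usable, plus a new `routed_scaling_factor` argument threaded through to `select_experts`. A minimal, standalone sketch of the guarded-import idea (the try/except is an extra safety net added here for illustration; it is not in the sglang code):

```python
import torch


def is_cuda() -> bool:
    # Stand-in for sglang.srt.utils.is_cuda: decided once at import time.
    return torch.cuda.is_available()


_is_cuda = is_cuda()

HAS_FUSED_INT8_MM = False
if _is_cuda:
    try:
        # Optional fused kernel, only present in CUDA builds that ship sgl_kernel.
        from sgl_kernel import int8_scaled_mm  # noqa: F401

        HAS_FUSED_INT8_MM = True
    except ImportError:
        pass

print("fused int8 GEMM available:", HAS_FUSED_INT8_MM)
```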
sglang/srt/layers/radix_attention.py
CHANGED
@@ -13,6 +13,7 @@
 # ==============================================================================
 """Radix attention."""
 
+from enum import Enum
 from typing import Optional
 
 from torch import nn
@@ -22,6 +23,18 @@ from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 
 
+class AttentionType(Enum):
+    """
+    Attention type.
+    Use string to be compatible with `torch.compile`.
+    """
+
+    # Decoder attention between previous layer Q/K/V
+    DECODER = "decoder"
+    # Encoder attention between previous layer Q/K/V
+    ENCODER_ONLY = "encoder_only"
+
+
 class RadixAttention(nn.Module):
     """
     The attention layer implementation.
@@ -39,6 +52,7 @@ class RadixAttention(nn.Module):
         sliding_window_size: int = -1,
         is_cross_attention: bool = False,
         quant_config: Optional[QuantizationConfig] = None,
+        attn_type=AttentionType.DECODER,
         prefix: str = "",
         use_irope: bool = False,
     ):
@@ -64,6 +78,7 @@ class RadixAttention(nn.Module):
         self.quant_method = quant_config.get_quant_method(self, prefix=prefix)
         if self.quant_method is not None:
             self.quant_method.create_weights(self)
+        self.attn_type = attn_type
 
     def forward(
         self,
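The new `AttentionType` enum lets a model mark a layer as encoder-only so attention backends can skip the causal mask and decode-time KV bookkeeping. A standalone sketch of how the flag can be consumed (simplified class, not the real `RadixAttention`); an encoder-only model such as the BERT implementation added in this release would pass `ENCODER_ONLY`:

```python
from enum import Enum


class AttentionType(Enum):
    DECODER = "decoder"            # causal attention, uses the KV cache
    ENCODER_ONLY = "encoder_only"  # bidirectional attention, no causal mask


class TinyAttention:
    def __init__(self, attn_type: AttentionType = AttentionType.DECODER):
        self.attn_type = attn_type

    @property
    def is_causal(self) -> bool:
        return self.attn_type == AttentionType.DECODER


layer = TinyAttention(attn_type=AttentionType.ENCODER_ONLY)
assert not layer.is_causal
```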
sglang/srt/layers/rotary_embedding.py
CHANGED
@@ -8,13 +8,14 @@ import torch
 import torch.nn as nn
 
 from sglang.srt.custom_op import CustomOp
-from sglang.srt.utils import is_cuda_available
+from sglang.srt.utils import is_cuda
 
-_is_cuda_available = is_cuda_available()
-if _is_cuda_available:
+_is_cuda = is_cuda()
+
+if _is_cuda:
     from sgl_kernel import apply_rope_with_cos_sin_cache_inplace
 else:
-    from vllm import _custom_ops as ops
+    from vllm._custom_ops import rotary_embedding as vllm_rotary_embedding
 
 
 def _rotate_neox(x: torch.Tensor) -> torch.Tensor:
@@ -81,7 +82,7 @@ class RotaryEmbedding(CustomOp):
 
         cache = self._compute_cos_sin_cache()
         # NOTE(ByronHsu): cache needs to be in FP32 for numerical stability
-        if not _is_cuda_available:
+        if not _is_cuda:
             cache = cache.to(dtype)
         self.cos_sin_cache: torch.Tensor
         self.register_buffer("cos_sin_cache", cache, persistent=False)
@@ -148,7 +149,7 @@ class RotaryEmbedding(CustomOp):
         key: torch.Tensor,
         offsets: Optional[torch.Tensor] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
-        if _is_cuda_available and (self.head_size in [64, 128, 256, 512]):
+        if _is_cuda and (self.head_size in [64, 128, 256, 512]):
             apply_rope_with_cos_sin_cache_inplace(
                 positions=positions,
                 query=query,
@@ -159,7 +160,7 @@ class RotaryEmbedding(CustomOp):
             )
         else:
             self.cos_sin_cache = self.cos_sin_cache.to(query.device, dtype=query.dtype)
-            ops.rotary_embedding(
+            vllm_rotary_embedding(
                 positions,
                 query,
                 key,
@@ -651,7 +652,7 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
     def forward(self, *args, **kwargs):
        if torch.compiler.is_compiling():
             return self.forward_native(*args, **kwargs)
-        if _is_cuda_available:
+        if _is_cuda:
             return self.forward_cuda(*args, **kwargs)
         else:
             return self.forward_native(*args, **kwargs)
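The rotary-embedding hunks keep the same dispatch shape as before: the fused `apply_rope_with_cos_sin_cache_inplace` kernel is used only on CUDA and only for the head sizes it supports; everything else takes the vLLM fallback. A standalone sketch of that gate (the returned strings are just labels, not real calls):

```python
_SUPPORTED_HEAD_SIZES = (64, 128, 256, 512)


def pick_rope_impl(head_size: int, is_cuda: bool) -> str:
    # Mirrors `if _is_cuda and (self.head_size in [64, 128, 256, 512])`.
    if is_cuda and head_size in _SUPPORTED_HEAD_SIZES:
        return "sgl_kernel apply_rope_with_cos_sin_cache_inplace"
    return "vllm rotary_embedding fallback"


assert pick_rope_impl(128, is_cuda=True).startswith("sgl_kernel")
assert pick_rope_impl(96, is_cuda=True).endswith("fallback")
assert pick_rope_impl(128, is_cuda=False).endswith("fallback")
```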
sglang/srt/layers/sampler.py
CHANGED
@@ -10,9 +10,9 @@ from sglang.srt.layers.dp_attention import get_attention_tp_group
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
-from sglang.srt.utils import crash_on_warnings, get_bool_env_var, is_cuda_available
+from sglang.srt.utils import crash_on_warnings, get_bool_env_var, is_cuda
 
-if is_cuda_available():
+if is_cuda():
     from sgl_kernel import (
         min_p_sampling_from_probs,
         top_k_renorm_prob,
@@ -93,28 +93,23 @@ class Sampler(nn.Module):
             ).clamp(min=torch.finfo(probs.dtype).min)
 
             max_top_k_round, batch_size = 32, probs.shape[0]
-            uniform_samples = torch.rand(
-                (max_top_k_round, batch_size), device=probs.device
-            )
             if sampling_info.need_min_p_sampling:
                 probs = top_k_renorm_prob(probs, sampling_info.top_ks)
                 probs = top_p_renorm_prob(probs, sampling_info.top_ps)
                 batch_next_token_ids = min_p_sampling_from_probs(
-                    probs, uniform_samples, sampling_info.min_ps
+                    probs, sampling_info.min_ps
                 )
             else:
-                batch_next_token_ids, success = top_k_top_p_sampling_from_probs(
+                # Check Nan will throw exception, only check when crash_on_warnings is True
+                check_nan = self.use_nan_detection and crash_on_warnings()
+                batch_next_token_ids = top_k_top_p_sampling_from_probs(
                     probs,
-                    uniform_samples,
                     sampling_info.top_ks,
                     sampling_info.top_ps,
                     filter_apply_order="joint",
+                    check_nan=check_nan,
                 )
 
-            if self.use_nan_detection and not torch.all(success):
-                logger.warning("Detected errors during sampling!")
-                batch_next_token_ids = torch.zeros_like(batch_next_token_ids)
-
         elif global_server_args_dict["sampling_backend"] == "pytorch":
             # A slower fallback implementation with torch native operations.
             batch_next_token_ids = top_k_top_p_min_p_sampling_from_probs_torch(
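The sampler change drops the pre-generated `uniform_samples` tensor and the old `success`-flag check: the sgl_kernel samplers now draw their own randomness and can raise on NaN input when `check_nan` is set (only enabled together with crash-on-warnings). A pure-PyTorch sketch of top-k/top-p sampling with an optional NaN check, for readers who want the semantics without sgl_kernel (this is not the kernel implementation):

```python
import torch


def sample_top_k_top_p(
    probs: torch.Tensor, top_k: int, top_p: float, check_nan: bool = False
) -> torch.Tensor:
    if check_nan and torch.isnan(probs).any():
        raise ValueError("NaN detected in sampling probabilities")
    # Keep the top-k candidates, then apply a nucleus (top-p) cutoff.
    values, indices = torch.topk(probs, k=min(top_k, probs.shape[-1]), dim=-1)
    exclusive_cumsum = torch.cumsum(values, dim=-1) - values
    values = torch.where(exclusive_cumsum > top_p, torch.zeros_like(values), values)
    values = values / values.sum(dim=-1, keepdim=True)
    choice = torch.multinomial(values, num_samples=1)
    return indices.gather(-1, choice).squeeze(-1)


probs = torch.softmax(torch.randn(2, 8), dim=-1)
tokens = sample_top_k_top_p(probs, top_k=4, top_p=0.9, check_nan=True)
print(tokens.shape)  # torch.Size([2])
```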
sglang/srt/lora/backend/base_backend.py
CHANGED
@@ -75,7 +75,7 @@ class BaseLoRABackend:
         qkv_lora_a: torch.Tensor,
         qkv_lora_b: Union[torch.Tensor, Tuple[torch.Tensor]],
         *args,
-        **kwargs
+        **kwargs,
     ) -> torch.Tensor:
         """Run the lora pass for QKV Layer.
 
@@ -98,7 +98,7 @@ class BaseLoRABackend:
         gate_up_lora_a: torch.Tensor,
         gate_up_lora_b: Union[torch.Tensor, Tuple[torch.Tensor]],
         *args,
-        **kwargs
+        **kwargs,
     ) -> torch.Tensor:
         """Run the lora pass for gate_up_proj, usually attached to MergedColumnParallelLayer.
 
@@ -115,3 +115,19 @@ class BaseLoRABackend:
 
     def set_batch_info(self, batch_info: LoRABatchInfo):
         self.batch_info = batch_info
+
+
+def get_backend_from_name(name: str) -> BaseLoRABackend:
+    """
+    Get corresponding backend class from backend's name
+    """
+    if name == "triton":
+        from sglang.srt.lora.backend.triton_backend import TritonLoRABackend
+
+        return TritonLoRABackend
+    elif name == "flashinfer":
+        from sglang.srt.lora.backend.flashinfer_backend import FlashInferLoRABackend
+
+        return FlashInferLoRABackend
+    else:
+        raise ValueError(f"Invalid backend: {name}")
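`get_backend_from_name` now lives beside `BaseLoRABackend`; the old `sglang/srt/lora/backend/__init__.py` re-exports were removed (the +0 -25 entry in the file list), which is why the imports in the following files switch to the full `lora.backend.base_backend` path. A standalone sketch of the lookup it performs, with placeholder classes instead of the real backends:

```python
class TritonLoRABackend:  # placeholder for sglang's Triton LoRA backend
    pass


class FlashInferLoRABackend:  # placeholder for sglang's FlashInfer LoRA backend
    pass


def get_backend_from_name(name: str):
    """Map a backend name to its class, mirroring the helper added above."""
    if name == "triton":
        return TritonLoRABackend
    elif name == "flashinfer":
        return FlashInferLoRABackend
    raise ValueError(f"Invalid backend: {name}")


assert get_backend_from_name("triton") is TritonLoRABackend
```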
@@ -2,7 +2,7 @@ from typing import Tuple
 
 import torch
 
-from sglang.srt.lora.backend import BaseLoRABackend
+from sglang.srt.lora.backend.base_backend import BaseLoRABackend
 from sglang.srt.lora.utils import LoRABatchInfo
 from sglang.srt.utils import is_flashinfer_available
 
sglang/srt/lora/layers.py
CHANGED
@@ -16,7 +16,7 @@ from sglang.srt.layers.linear import (
     RowParallelLinear,
 )
 from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
-from sglang.srt.lora.backend import BaseLoRABackend
+from sglang.srt.lora.backend.base_backend import BaseLoRABackend
 
 
 class BaseLayerWithLoRA(nn.Module):
sglang/srt/lora/lora.py
CHANGED
@@ -27,7 +27,7 @@ from torch import nn
 
 from sglang.srt.configs.load_config import LoadConfig
 from sglang.srt.hf_transformers_utils import AutoConfig
-from sglang.srt.lora.backend import BaseLoRABackend
+from sglang.srt.lora.backend.base_backend import BaseLoRABackend
 from sglang.srt.lora.lora_config import LoRAConfig
 from sglang.srt.model_loader.loader import DefaultModelLoader
 
sglang/srt/lora/lora_manager.py
CHANGED
@@ -22,7 +22,7 @@ import torch
 
 from sglang.srt.configs.load_config import LoadConfig
 from sglang.srt.hf_transformers_utils import AutoConfig
-from sglang.srt.lora.backend import BaseLoRABackend, get_backend_from_name
+from sglang.srt.lora.backend.base_backend import BaseLoRABackend, get_backend_from_name
 from sglang.srt.lora.layers import BaseLayerWithLoRA, get_lora_layer
 from sglang.srt.lora.lora import LoRAAdapter
 from sglang.srt.lora.lora_config import LoRAConfig
sglang/srt/managers/data_parallel_controller.py
CHANGED
@@ -30,6 +30,7 @@ from sglang.srt.managers.io_struct import (
 )
 from sglang.srt.managers.scheduler import run_scheduler_process
 from sglang.srt.server_args import PortArgs, ServerArgs
+from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
 from sglang.srt.utils import bind_port, configure_logger, get_zmq_socket
 from sglang.utils import get_exception_traceback
 
@@ -174,6 +175,10 @@ class DataParallelController:
         if not server_args.enable_dp_attention:
             logger.info(f"Launch DP{dp_rank} starting at GPU #{base_gpu_id}.")
 
+        memory_saver_adapter = TorchMemorySaverAdapter.create(
+            enable=server_args.enable_memory_saver
+        )
+
         # Launch tensor parallel scheduler processes
         scheduler_pipe_readers = []
         tp_size_per_node = server_args.tp_size // server_args.nnodes
@@ -208,7 +213,8 @@ class DataParallelController:
                 target=run_scheduler_process,
                 args=(server_args, rank_port_args, gpu_id, tp_rank, dp_rank, writer),
             )
-            proc.start()
+            with memory_saver_adapter.configure_subprocess():
+                proc.start()
             self.scheduler_procs.append(proc)
             scheduler_pipe_readers.append(reader)
 
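Starting the scheduler subprocess inside `memory_saver_adapter.configure_subprocess()` lets the adapter set up torch-memory-saver hooks for the child only when `--enable-memory-saver` is active. A standalone sketch of a context manager that degrades to a no-op when the feature is off (this is not the real `TorchMemorySaverAdapter`):

```python
from contextlib import contextmanager, nullcontext


class MemorySaverAdapterSketch:
    def __init__(self, enable: bool):
        self.enable = enable

    def configure_subprocess(self):
        # Return a do-nothing context when the feature is disabled.
        return self._configured() if self.enable else nullcontext()

    @contextmanager
    def _configured(self):
        print("configuring memory-saver hooks for the child process")
        yield


adapter = MemorySaverAdapterSketch(enable=False)
with adapter.configure_subprocess():
    pass  # proc.start() would run here
```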
sglang/srt/managers/io_struct.py
CHANGED
@@ -96,8 +96,8 @@ class GenerateReqInput:
     return_hidden_states: bool = False
 
     # For disaggregated inference
-    bootstrap_host: Optional[str] = None
-    bootstrap_room: Optional[int] = None
+    bootstrap_host: Optional[Union[List[str], str]] = None
+    bootstrap_room: Optional[Union[List[int], int]] = None
 
     def normalize_batch_and_arguments(self):
         """
@@ -397,6 +397,12 @@ class GenerateReqInput:
                 else None
             ),
             return_hidden_states=self.return_hidden_states,
+            bootstrap_host=(
+                self.bootstrap_host[i] if self.bootstrap_host is not None else None
+            ),
+            bootstrap_room=(
+                self.bootstrap_room[i] if self.bootstrap_room is not None else None
+            ),
         )
 
 
@@ -665,10 +671,15 @@ class BatchEmbeddingOut:
 
 
 @dataclass
-class FlushCacheReq:
+class FlushCacheReqInput:
     pass
 
 
+@dataclass
+class FlushCacheReqOutput:
+    success: bool
+
+
 @dataclass
 class UpdateWeightFromDiskReqInput:
     # The model path with the new weights
@@ -834,6 +845,7 @@ class ProfileReq:
     activities: Optional[List[str]] = None
     with_stack: Optional[bool] = None
     record_shapes: Optional[bool] = None
+    profile_id: Optional[str] = None
 
 
 @dataclass
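With `bootstrap_host` and `bootstrap_room` now allowed to be per-request lists, splitting a batched `GenerateReqInput` indexes into those lists, as the second hunk above shows. A simplified standalone sketch of that slicing (`TinyReq` and `split_batch` are illustrative names, not the real classes):

```python
from dataclasses import dataclass
from typing import List, Optional, Union


@dataclass
class TinyReq:
    text: str
    bootstrap_host: Optional[str] = None
    bootstrap_room: Optional[int] = None


def split_batch(
    texts: List[str],
    bootstrap_host: Optional[Union[List[str], str]] = None,
    bootstrap_room: Optional[Union[List[int], int]] = None,
) -> List[TinyReq]:
    # Each request in the batch gets its own bootstrap endpoint and room.
    return [
        TinyReq(
            text=t,
            bootstrap_host=bootstrap_host[i] if bootstrap_host is not None else None,
            bootstrap_room=bootstrap_room[i] if bootstrap_room is not None else None,
        )
        for i, t in enumerate(texts)
    ]


reqs = split_batch(["a", "b"], bootstrap_host=["h0", "h1"], bootstrap_room=[0, 1])
assert reqs[1].bootstrap_host == "h1" and reqs[1].bootstrap_room == 1
```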
sglang/srt/managers/mm_utils.py
CHANGED
@@ -1,7 +1,8 @@
 """
-
+Multi-modality utils
 """
 
+import logging
 from abc import abstractmethod
 from typing import Callable, List, Optional, Tuple
 
@@ -12,11 +13,11 @@ from sglang.srt.managers.schedule_batch import (
     MultimodalDataItem,
     MultimodalInputs,
     global_server_args_dict,
-    logger,
 )
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.utils import print_warning_once
-
+
+logger = logging.getLogger(__name__)
 
 
 class MultiModalityDataPaddingPattern:
sglang/srt/managers/multimodal_processors/base_processor.py
CHANGED
@@ -8,8 +8,6 @@ from typing import List, Optional
 
 import numpy as np
 import PIL
-from decord import VideoReader, cpu
-from PIL import Image
 from transformers import BaseImageProcessorFast
 
 from sglang.srt.managers.schedule_batch import Modality
@@ -102,6 +100,9 @@ class BaseMultimodalProcessor(ABC):
         """
         estimate the total frame count from all visual input
         """
+        # Lazy import because decord is not available on some arm platforms.
+        from decord import VideoReader, cpu
+
         # Before processing inputs
         estimated_frames_list = []
         for image in image_data:
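Moving the `decord` import into the method makes it a lazy dependency: importing `base_processor` no longer fails on platforms without a decord wheel (for example some ARM builds), and the import cost is paid only when video frames are actually counted. A minimal sketch of the same pattern:

```python
def count_video_frames(video_path: str) -> int:
    # Lazy import: decord is only required when this function is called.
    from decord import VideoReader, cpu

    reader = VideoReader(video_path, ctx=cpu(0))
    return len(reader)
```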
sglang/srt/managers/schedule_batch.py
CHANGED
@@ -67,7 +67,6 @@ global_server_args_dict = {
     "attention_backend": ServerArgs.attention_backend,
     "sampling_backend": ServerArgs.sampling_backend,
     "triton_attention_reduce_in_fp32": ServerArgs.triton_attention_reduce_in_fp32,
-    "disable_mla": ServerArgs.disable_mla,
     "torchao_config": ServerArgs.torchao_config,
     "enable_nan_detection": ServerArgs.enable_nan_detection,
     "enable_dp_attention": ServerArgs.enable_dp_attention,
@@ -77,12 +76,11 @@ global_server_args_dict = {
     "device": ServerArgs.device,
     "speculative_accept_threshold_single": ServerArgs.speculative_accept_threshold_single,
     "speculative_accept_threshold_acc": ServerArgs.speculative_accept_threshold_acc,
-    "enable_flashmla": ServerArgs.enable_flashmla,
     "disable_radix_cache": ServerArgs.disable_radix_cache,
     "flashinfer_mla_disable_ragged": ServerArgs.flashinfer_mla_disable_ragged,
+    "moe_dense_tp_size": ServerArgs.moe_dense_tp_size,
     "chunked_prefill_size": ServerArgs.chunked_prefill_size,
     "n_share_experts_fusion": ServerArgs.n_share_experts_fusion,
-    "disable_shared_experts_fusion": ServerArgs.disable_shared_experts_fusion,
     "disable_chunked_prefix_cache": ServerArgs.disable_chunked_prefix_cache,
 }
 
@@ -541,6 +539,11 @@ class Req:
         # The first output_id transferred from prefill instance.
         self.transferred_output_id: Optional[int] = None
 
+        # For overlap schedule, we delay the kv transfer until `process_batch_result_disagg_prefill` rather than `process_prefill_chunk` in non-overlap
+        # This is because kv is not ready in `process_prefill_chunk`.
+        # We use `tmp_end_idx` to store the end index of the kv cache to send.
+        self.tmp_end_idx: int = -1
+
     @property
     def seqlen(self):
         return len(self.origin_input_ids) + len(self.output_ids)
@@ -573,6 +576,14 @@ class Req:
             self.prefix_indices, self.last_node = tree_cache.match_prefix(
                 rid=self.rid, key=self.adjust_max_prefix_ids()
             )
+        elif enable_hierarchical_cache:
+            # in case last_node is evicted during scheduling, we need to update the prefix_indices
+            while self.last_node.evicted:
+                self.prefix_indices = self.prefix_indices[
+                    : -len(self.last_node.host_value)
+                ]
+                self.last_node = self.last_node.parent
+
         self.extend_input_len = len(self.fill_ids) - len(self.prefix_indices)
 
     def adjust_max_prefix_ids(self):
@@ -1481,7 +1492,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
                 global_server_args_dict["use_mla_backend"]
                 and global_server_args_dict["attention_backend"] == "flashinfer"
             )
-            or global_server_args_dict["enable_flashmla"]
+            or global_server_args_dict["attention_backend"] == "flashmla"
             or global_server_args_dict["attention_backend"] == "fa3"
         ):
             seq_lens_cpu = self.seq_lens.cpu()
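The hierarchical-cache branch in the `Req` hunk above walks back up the radix tree when the matched node was evicted between prefix matching and scheduling, trimming the cached prefix by each evicted node's span. A toy standalone version of that loop (simplified node class, not the real hierarchical radix cache):

```python
class Node:
    def __init__(self, parent=None, host_value=(), evicted=False):
        self.parent = parent
        self.host_value = host_value
        self.evicted = evicted


root = Node()
child = Node(parent=root, host_value=(7, 8, 9), evicted=True)

prefix_indices = [1, 2, 3, 7, 8, 9]
last_node = child
while last_node.evicted:
    # Drop the evicted node's tokens from the usable prefix and retry its parent.
    prefix_indices = prefix_indices[: -len(last_node.host_value)]
    last_node = last_node.parent

assert prefix_indices == [1, 2, 3] and last_node is root
```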