sglang 0.4.10.post1__py3-none-any.whl → 0.5.0rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +113 -17
- sglang/compile_deep_gemm.py +8 -1
- sglang/global_config.py +5 -1
- sglang/srt/configs/model_config.py +35 -0
- sglang/srt/conversation.py +9 -117
- sglang/srt/disaggregation/base/conn.py +5 -2
- sglang/srt/disaggregation/decode.py +6 -1
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -0
- sglang/srt/disaggregation/mooncake/conn.py +243 -135
- sglang/srt/disaggregation/prefill.py +3 -0
- sglang/srt/distributed/device_communicators/pynccl.py +7 -0
- sglang/srt/distributed/device_communicators/pynccl_allocator.py +133 -0
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +42 -3
- sglang/srt/distributed/parallel_state.py +22 -9
- sglang/srt/entrypoints/context.py +244 -0
- sglang/srt/entrypoints/engine.py +8 -5
- sglang/srt/entrypoints/harmony_utils.py +370 -0
- sglang/srt/entrypoints/http_server.py +106 -15
- sglang/srt/entrypoints/openai/protocol.py +227 -1
- sglang/srt/entrypoints/openai/serving_chat.py +278 -42
- sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
- sglang/srt/entrypoints/openai/tool_server.py +174 -0
- sglang/srt/entrypoints/tool.py +87 -0
- sglang/srt/eplb/expert_distribution.py +4 -2
- sglang/srt/eplb/expert_location.py +5 -1
- sglang/srt/function_call/harmony_tool_parser.py +130 -0
- sglang/srt/hf_transformers_utils.py +55 -13
- sglang/srt/jinja_template_utils.py +8 -1
- sglang/srt/layers/attention/aiter_backend.py +5 -8
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
- sglang/srt/layers/attention/flashattention_backend.py +7 -11
- sglang/srt/layers/attention/triton_backend.py +85 -14
- sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
- sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
- sglang/srt/layers/attention/trtllm_mla_backend.py +6 -6
- sglang/srt/layers/attention/vision.py +40 -15
- sglang/srt/layers/communicator.py +35 -8
- sglang/srt/layers/dp_attention.py +12 -0
- sglang/srt/layers/linear.py +9 -8
- sglang/srt/layers/logits_processor.py +9 -1
- sglang/srt/layers/moe/cutlass_moe.py +20 -6
- sglang/srt/layers/moe/ep_moe/layer.py +87 -107
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
- sglang/srt/layers/moe/fused_moe_triton/layer.py +442 -58
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +169 -15
- sglang/srt/layers/moe/token_dispatcher/__init__.py +23 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +12 -1
- sglang/srt/layers/moe/{ep_moe/token_dispatcher.py → token_dispatcher/deepep.py} +8 -15
- sglang/srt/layers/moe/topk.py +12 -3
- sglang/srt/layers/moe/utils.py +59 -0
- sglang/srt/layers/quantization/__init__.py +22 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +3 -2
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +1 -1
- sglang/srt/layers/quantization/fp4.py +557 -0
- sglang/srt/layers/quantization/fp8.py +8 -7
- sglang/srt/layers/quantization/fp8_kernel.py +0 -4
- sglang/srt/layers/quantization/fp8_utils.py +29 -0
- sglang/srt/layers/quantization/modelopt_quant.py +259 -64
- sglang/srt/layers/quantization/mxfp4.py +651 -0
- sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
- sglang/srt/layers/quantization/quark/__init__.py +0 -0
- sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
- sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
- sglang/srt/layers/quantization/quark/utils.py +107 -0
- sglang/srt/layers/quantization/unquant.py +60 -6
- sglang/srt/layers/quantization/w4afp8.py +1 -1
- sglang/srt/layers/rotary_embedding.py +225 -1
- sglang/srt/layers/utils.py +9 -0
- sglang/srt/layers/vocab_parallel_embedding.py +15 -4
- sglang/srt/lora/lora_manager.py +70 -14
- sglang/srt/lora/lora_registry.py +10 -2
- sglang/srt/lora/mem_pool.py +43 -5
- sglang/srt/managers/cache_controller.py +61 -32
- sglang/srt/managers/data_parallel_controller.py +52 -2
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +21 -4
- sglang/srt/managers/mm_utils.py +5 -11
- sglang/srt/managers/schedule_batch.py +30 -8
- sglang/srt/managers/schedule_policy.py +3 -1
- sglang/srt/managers/scheduler.py +170 -18
- sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
- sglang/srt/managers/scheduler_recv_skipper.py +37 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
- sglang/srt/managers/template_manager.py +59 -22
- sglang/srt/managers/tokenizer_manager.py +137 -67
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
- sglang/srt/managers/utils.py +45 -1
- sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py +182 -0
- sglang/srt/mem_cache/hicache_storage.py +13 -21
- sglang/srt/mem_cache/hiradix_cache.py +53 -5
- sglang/srt/mem_cache/memory_pool_host.py +1 -1
- sglang/srt/mem_cache/multimodal_cache.py +33 -13
- sglang/srt/mem_cache/radix_cache_cpp.py +229 -0
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp +35 -0
- sglang/srt/model_executor/cuda_graph_runner.py +24 -9
- sglang/srt/model_executor/forward_batch_info.py +48 -17
- sglang/srt/model_executor/model_runner.py +24 -2
- sglang/srt/model_loader/weight_utils.py +10 -0
- sglang/srt/models/bailing_moe.py +425 -0
- sglang/srt/models/deepseek_v2.py +95 -50
- sglang/srt/models/ernie4.py +426 -0
- sglang/srt/models/ernie4_eagle.py +203 -0
- sglang/srt/models/gemma3n_mm.py +39 -0
- sglang/srt/models/glm4_moe.py +102 -27
- sglang/srt/models/gpt_oss.py +1134 -0
- sglang/srt/models/grok.py +3 -3
- sglang/srt/models/llama4.py +13 -2
- sglang/srt/models/mixtral.py +3 -3
- sglang/srt/models/mllama4.py +428 -19
- sglang/srt/models/qwen2.py +6 -0
- sglang/srt/models/qwen2_moe.py +7 -4
- sglang/srt/models/qwen3_moe.py +39 -14
- sglang/srt/models/step3_vl.py +10 -1
- sglang/srt/models/transformers.py +2 -5
- sglang/srt/multimodal/processors/base_processor.py +4 -3
- sglang/srt/multimodal/processors/gemma3n.py +0 -7
- sglang/srt/multimodal/processors/step3_vl.py +3 -1
- sglang/srt/operations_strategy.py +1 -1
- sglang/srt/reasoning_parser.py +18 -39
- sglang/srt/server_args.py +218 -23
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +18 -0
- sglang/srt/two_batch_overlap.py +163 -9
- sglang/srt/utils.py +41 -26
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/runners.py +4 -4
- sglang/test/test_utils.py +4 -4
- sglang/version.py +1 -1
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/METADATA +18 -15
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/RECORD +143 -116
- /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/mooncake_store.py +0 -0
- /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/unit_test.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/hicache_nixl.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/nixl_utils.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/test_hicache_nixl_storage.py +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/WHEEL +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/top_level.txt +0 -0
sglang/srt/managers/cache_controller.py CHANGED
@@ -16,6 +16,7 @@ limitations under the License.
 import logging
 import math
 import threading
+import time
 from queue import Empty, Full, PriorityQueue, Queue
 from typing import TYPE_CHECKING, List, Optional
 
@@ -195,6 +196,8 @@ class PrefetchOperation(StorageOperation):
         self._done_flag = False
         self._lock = threading.Lock()
 
+        self.start_time = time.monotonic()
+
         super().__init__(host_indices, token_ids, last_hash)
 
     def increment(self, num_tokens: int):
@@ -236,18 +239,19 @@ class HiCacheController:
         self.enable_storage = False
         # todo: move backend initialization to storage backend module
         if storage_backend is not None:
+            self.storage_backend_type = storage_backend
            from sglang.srt.mem_cache.hicache_storage import HiCacheFile, get_hash_str
 
            if storage_backend == "file":
                self.storage_backend = HiCacheFile()
                self.get_hash_str = get_hash_str
            elif storage_backend == "nixl":
-                from sglang.srt.mem_cache.nixl.hicache_nixl import HiCacheNixl
+                from sglang.srt.mem_cache.storage.nixl.hicache_nixl import HiCacheNixl
 
                self.storage_backend = HiCacheNixl()
                self.get_hash_str = get_hash_str
            elif storage_backend == "mooncake":
-                from sglang.srt.mem_cache.mooncake_store.mooncake_store import (
+                from sglang.srt.mem_cache.storage.mooncake_store.mooncake_store import (
                    MooncakeStore,
                    get_hash_str_mooncake,
                )
@@ -277,6 +281,12 @@ class HiCacheController:
            self.enable_storage = True
            # todo: threshold policy for prefetching
            self.prefetch_threshold = max(prefetch_threshold, self.page_size)
+            self.prefetch_capacity_limit = int(
+                0.8 * (self.mem_pool_host.size - self.mem_pool_device.size)
+            )
+            # tracking the number of tokens locked in prefetching, updated by the main scheduler thread
+            self.prefetch_tokens_occupied = 0
+
            # create a new communication group for synchronizing storage operations across TP workers
            self.tp_world_size = torch.distributed.get_world_size(group=tp_group)
            if self.tp_world_size > 1:
@@ -524,7 +534,7 @@ class HiCacheController:
        host_indices: torch.Tensor,
        new_input_tokens: List[int],
        last_hash: Optional[str] = None,
-    ) ->
+    ) -> PrefetchOperation:
        """
        Prefetch KV caches from storage backend to host memory.
        """
@@ -573,6 +583,9 @@ class HiCacheController:
        self.storage_backend.batch_get(key_strs, buffer_ptrs, buffer_sizes)
        operation.increment(len(operation.hash_value) * self.page_size)
 
+    def is_mooncake_backend(self):
+        return self.storage_backend_type == "mooncake"
+
    def prefetch_io_aux_func(self):
        """
        Auxiliary function conducting IO operations for prefetching.
@@ -580,13 +593,25 @@ class HiCacheController:
        while not self.stop_event.is_set():
            try:
                operation = self.prefetch_buffer.get(block=True, timeout=1)
-                if
+                if self.is_mooncake_backend():
                    self.mooncake_page_transfer(operation)
+                elif self.storage_backend_type == "hf3fs":
+                    self.generic_page_transfer(operation, batch_size=128)
                else:
                    self.generic_page_transfer(operation)
            except Empty:
                continue
 
+    def prefetch_rate_limit_check(self) -> bool:
+        """
+        Rate limit the prefetching operations to avoid overwhelming the storage backend.
+        """
+        # cancel prefetch if too much memory is occupied
+        if self.prefetch_tokens_occupied >= self.prefetch_capacity_limit:
+            return False
+        # todo: more sophisticated rate limiting based on storage backend performance
+        return True
+
    def prefetch_thread_func(self):
        """
        Manage prefetching operations from storage backend to host memory.
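
For orientation, the new rate limit amounts to a small capacity check: in-flight prefetches may lock at most 80% of the host-only KV cache headroom. The following standalone sketch (with made-up pool sizes rather than the real mem_pool_host / mem_pool_device objects) shows the same arithmetic outside the class:

# Illustrative sketch of the capacity gate above; not part of HiCacheController.
def prefetch_capacity_limit(host_pool_size: int, device_pool_size: int) -> int:
    # At most 80% of the host-only headroom may be locked by in-flight prefetches.
    return int(0.8 * (host_pool_size - device_pool_size))

def prefetch_rate_limit_check(tokens_occupied: int, capacity_limit: int) -> bool:
    # Mirrors the method above: cancel prefetch once the occupied-token counter
    # (updated by the main scheduler thread) reaches the limit.
    return tokens_occupied < capacity_limit

limit = prefetch_capacity_limit(host_pool_size=1_000_000, device_pool_size=200_000)  # 640_000
print(prefetch_rate_limit_check(500_000, limit))  # True  -> prefetch admitted
print(prefetch_rate_limit_check(640_000, limit))  # False -> prefetch skipped
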
@@ -600,34 +625,36 @@ class HiCacheController:
            if operation is None:
                continue
 
-            last_hash = operation.last_hash
-            tokens_to_fetch = operation.token_ids
            storage_hit_count = 0
+            if self.prefetch_rate_limit_check():
+                last_hash = operation.last_hash
+                tokens_to_fetch = operation.token_ids
+
+                remaining_tokens = len(tokens_to_fetch)
+                hash_value = []
+                while remaining_tokens >= self.page_size:
+                    last_hash = self.get_hash_str(
+                        tokens_to_fetch[
+                            storage_hit_count : storage_hit_count + self.page_size
+                        ],
+                        last_hash,
+                    )
+
+                    # todo, more unified interface
+                    if not self.is_mooncake_backend():
+                        if not self.storage_backend.exists(last_hash):
+                            break
+                    hash_value.append(last_hash)
+                    storage_hit_count += self.page_size
+                    remaining_tokens -= self.page_size
+
+                if self.is_mooncake_backend():
+                    # deferring to batch exists for mooncake store
+                    exist_result = self.storage_backend.exists(hash_value)
+                    storage_hit_count = (
+                        sum(1 for v in exist_result.values() if v != 0)
+                        * self.page_size
+                    )
 
            if self.tp_world_size > 1:
                storage_hit_count_tensor = torch.tensor(
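
With the Mooncake backend, the per-page exists probes are deferred to a single batched lookup, and the hit count becomes the number of pages reported present. A tiny self-contained illustration of that counting step (fake hash keys and a fake lookup result, not the real MooncakeStore API):

# Toy illustration of the batched-exists accounting used for the mooncake backend.
page_size = 64
hash_value = ["h0", "h1", "h2", "h3"]                 # one hash per page walked above
exist_result = {"h0": 1, "h1": 1, "h2": 0, "h3": 1}   # pretend reply from exists(hash_value)

# A page counts as a hit when its entry is non-zero, exactly as in the hunk above.
storage_hit_count = sum(1 for v in exist_result.values() if v != 0) * page_size
print(storage_hit_count)  # 192 tokens hit (3 pages * 64 tokens per page)
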
@@ -744,8 +771,10 @@ class HiCacheController:
                remaining_tokens -= self.page_size
            operation.hash_value = hash_value
 
-            if
+            if self.is_mooncake_backend():
                self.mooncake_page_backup(operation)
+            elif self.storage_backend_type == "hf3fs":
+                self.generic_page_backup(operation, batch_size=128)
            else:
                self.generic_page_backup(operation)
 

sglang/srt/managers/data_parallel_controller.py CHANGED
@@ -16,9 +16,13 @@
 import logging
 import multiprocessing as mp
 import signal
+import struct
+import sys
 import threading
 import time
 from enum import Enum, auto
+from multiprocessing import shared_memory
+from typing import Dict, List
 
 import psutil
 import setproctitle
@@ -32,6 +36,7 @@ from sglang.srt.managers.io_struct import (
 )
 from sglang.srt.managers.schedule_batch import Req
 from sglang.srt.managers.scheduler import run_scheduler_process
+from sglang.srt.managers.utils import DPBalanceMeta
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
 from sglang.srt.utils import bind_port, configure_logger, get_zmq_socket
@@ -45,6 +50,7 @@ class LoadBalanceMethod(Enum):
 
    ROUND_ROBIN = auto()
    SHORTEST_QUEUE = auto()
+    MINIMUM_TOKENS = auto()
 
    @classmethod
    def from_str(cls, method: str):
@@ -58,7 +64,16 @@ class LoadBalanceMethod(Enum):
 class DataParallelController:
    """A controller that dispatches requests to multiple data parallel workers."""
 
-    def __init__(
+    def __init__(
+        self,
+        server_args: ServerArgs,
+        port_args: PortArgs,
+        dp_balance_meta: DPBalanceMeta,
+    ) -> None:
+        # for dp balance
+        self.global_balance_id = 0
+        self.balance_meta = dp_balance_meta
+
        # Parse args
        self.max_total_num_tokens = None
        self.server_args = server_args
@@ -79,6 +94,7 @@ class DataParallelController:
        dispatch_lookup = {
            LoadBalanceMethod.ROUND_ROBIN: self.round_robin_scheduler,
            LoadBalanceMethod.SHORTEST_QUEUE: self.shortest_queue_scheduler,
+            LoadBalanceMethod.MINIMUM_TOKENS: self.minimum_tokens_scheduler,
        }
        self.dispatching = dispatch_lookup[self.load_balance_method]
 
@@ -234,6 +250,7 @@ class DataParallelController:
                    pp_rank,
                    dp_rank,
                    writer,
+                    self.balance_meta,
                ),
            )
            with memory_saver_adapter.configure_subprocess():
@@ -269,6 +286,33 @@ class DataParallelController:
    def shortest_queue_scheduler(self, input_requests):
        raise NotImplementedError()
 
+    def minimum_tokens_scheduler(self, req):
+        # This variable corresponds to the balance_id in TokenizedGenerateReqInput.
+        # We use it to control the number of onfly tokens (requests dispatched to workers but not yet received).
+        def get_next_global_balance_id() -> int:
+            INT32_MAX = 2147483647
+            current_id = self.global_balance_id
+            self.global_balance_id = (self.global_balance_id + 1) % INT32_MAX
+            return current_id
+
+        req.dp_balance_id = get_next_global_balance_id()
+        with self.balance_meta.mutex:
+            # 1. local_tokens represents the tokens currently inferring on the worker,
+            # while onfly refers to the requests dispatched by the dispatcher but not yet received by the scheduler.
+            onfly_info = self.balance_meta.get_shared_onfly()
+            local_tokens = self.balance_meta.get_shared_local_tokens()
+            total_tokens = [
+                local_token + sum(onfly_dict.values())
+                for local_token, onfly_dict in zip(local_tokens, onfly_info)
+            ]
+            target_worker = total_tokens.index(min(total_tokens))
+            onfly_info[target_worker][req.dp_balance_id] = len(req.input_ids)
+            # 2. write the new onfly info to the shm
+            self.balance_meta.set_shared_onfly_info(onfly_info)
+
+        # logger.info(f"dp workers {local_tokens=}, {onfly_info=}, {target_worker=}")
+        self.workers[target_worker].send_pyobj(req)
+
    def event_loop(self):
        while True:
            while True:
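
The dispatch rule itself is an argmin over per-worker load, where load is the tokens a worker is currently running plus the tokens of requests still in flight to it. A standalone walk-through with toy numbers (plain lists and dicts in place of the shared-memory state kept in DPBalanceMeta):

# Toy walk-through of the minimum-tokens dispatch decision shown above.
local_tokens = [1200, 800, 950]                    # tokens currently running per DP worker
onfly_info = [{7: 300}, {8: 100, 9: 50}, {}]       # dp_balance_id -> input length, not yet received

total_tokens = [
    local_token + sum(onfly_dict.values())
    for local_token, onfly_dict in zip(local_tokens, onfly_info)
]
# total_tokens == [1500, 950, 950]; index() picks the first minimum, worker 1.
target_worker = total_tokens.index(min(total_tokens))

# Before dispatch, the new request (balance_id 10, 400 input tokens) is recorded
# as on-the-fly for that worker, so later decisions see it immediately.
onfly_info[target_worker][10] = 400
print(target_worker, onfly_info[target_worker])  # 1 {8: 100, 9: 50, 10: 400}
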
@@ -302,9 +346,12 @@ def run_data_parallel_controller_process(
    setproctitle.setproctitle("sglang::data_parallel_controller")
    configure_logger(server_args)
    parent_process = psutil.Process().parent()
+    balance_meta = DPBalanceMeta(server_args.dp_size)
 
    try:
-        controller = DataParallelController(
+        controller = DataParallelController(
+            server_args, port_args, dp_balance_meta=balance_meta
+        )
        pipe_writer.send(
            {
                "status": "ready",
@@ -323,3 +370,6 @@ def run_data_parallel_controller_process(
        traceback = get_exception_traceback()
        logger.error(f"DataParallelController hit an exception: {traceback}")
        parent_process.send_signal(signal.SIGQUIT)
+    finally:
+        # we need to destruct mp.Manager() in balance_meta
+        balance_meta.destructor()

sglang/srt/managers/detokenizer_manager.py CHANGED
@@ -216,7 +216,7 @@ class DetokenizerManager:
                rids=recv_obj.rids,
                finished_reasons=recv_obj.finished_reasons,
                output_strs=output_strs,
-                output_ids=
+                output_ids=recv_obj.decode_ids,
                prompt_tokens=recv_obj.prompt_tokens,
                completion_tokens=recv_obj.completion_tokens,
                cached_tokens=recv_obj.cached_tokens,

sglang/srt/managers/io_struct.py CHANGED
@@ -26,6 +26,7 @@ from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.managers.schedule_batch import BaseFinishReason
 from sglang.srt.multimodal.mm_utils import has_valid_data
 from sglang.srt.sampling.sampling_params import SamplingParams
+from sglang.srt.utils import ImageData
 
 # Handle serialization of Image for pydantic
 if TYPE_CHECKING:
@@ -45,7 +46,7 @@ class SessionParams:
 
 # Type definitions for multimodal input data
 # Individual data item types for each modality
-ImageDataInputItem = Union[Image, str, Dict]
+ImageDataInputItem = Union[Image, str, ImageData, Dict]
 AudioDataInputItem = Union[str, Dict]
 VideoDataInputItem = Union[str, Dict]
 # Union type for any multimodal data item
@@ -101,8 +102,10 @@ class GenerateReqInput:
 
    # The modalities of the image data [image, multi-images, video]
    modalities: Optional[List[str]] = None
-    # The path to the LoRA
+    # The path to the LoRA adaptors
    lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
+    # The uid of LoRA adaptors, should be initialized by tokenizer manager
+    lora_id: Optional[Union[List[Optional[str]], Optional[str]]] = None
 
    # Session info for continual prompting
    session_params: Optional[Union[List[Dict], Dict]] = None
@@ -123,6 +126,9 @@ class GenerateReqInput:
    # For data parallel rank routing
    data_parallel_rank: Optional[int] = None
 
+    # For background responses (OpenAI responses API)
+    background: bool = False
+
    def contains_mm_input(self) -> bool:
        return (
            has_valid_data(self.image_data)
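
The background flag pairs with the new OpenAI Responses entrypoint listed above (sglang/srt/entrypoints/openai/serving_responses.py). As a hedged construction example, assuming every other GenerateReqInput field keeps its default:

from sglang.srt.managers.io_struct import GenerateReqInput

# Assumption: only the prompt text and the new background flag are set; all
# other GenerateReqInput fields are left at their defaults for illustration.
req = GenerateReqInput(
    text="Summarize the latest release notes.",
    background=True,  # run as a background response (OpenAI responses API)
)
print(req.background)  # True
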
@@ -500,7 +506,7 @@ class TokenizedGenerateReqInput:
    stream: bool
 
    # LoRA related
-
+    lora_id: Optional[str] = None  # None means just use the base model
    # The input embeds
    input_embeds: Optional[Union[List[List[List[float]]], List[List[float]]]] = None
 
@@ -523,6 +529,9 @@ class TokenizedGenerateReqInput:
    # For data parallel rank routing
    data_parallel_rank: Optional[int] = None
 
+    # For dp balance
+    dp_balance_id: int = -1
+
 
 @dataclass
 class EmbeddingReqInput:
@@ -554,6 +563,9 @@ class EmbeddingReqInput:
    # For cross-encoder requests
    is_cross_encoder_request: bool = False
 
+    # For background responses (OpenAI responses API)
+    background: bool = False
+
    def normalize_batch_and_arguments(self):
        # at least one of text, input_ids, or image should be provided
        if self.text is None and self.input_ids is None and self.image_data is None:
@@ -648,6 +660,8 @@ class TokenizedEmbeddingReqInput:
    token_type_ids: List[int]
    # Dummy sampling params for compatibility
    sampling_params: SamplingParams
+    # For dp balance
+    dp_balance_id: int = -1
 
 
 @dataclass
@@ -1068,6 +1082,8 @@ class LoadLoRAAdapterReqInput:
    lora_name: str
    # The path of loading.
    lora_path: str
+    # Whether to pin the LoRA adapter in memory.
+    pinned: bool = False
    # The unique identifier for the LoRA adapter, which automatically generated in the `TokenizerManager`.
    lora_id: Optional[str] = None
 
@@ -1076,6 +1092,7 @@ class LoadLoRAAdapterReqInput:
            lora_id=self.lora_id,
            lora_name=self.lora_name,
            lora_path=self.lora_path,
+            pinned=self.pinned,
        )
 
 
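
Taken together, the two hunks above add a pinned flag to the load request and forward it into the reference object the request builds. A hedged usage sketch; the to_ref() method name and the pinned attribute on the resulting LoRARef are assumptions inferred from the constructor call in the hunk:

from sglang.srt.managers.io_struct import LoadLoRAAdapterReqInput

# Assumption: lora_id stays unset here because the TokenizerManager generates it.
req = LoadLoRAAdapterReqInput(
    lora_name="my-adapter",
    lora_path="/models/loras/my-adapter",
    pinned=True,  # ask that the adapter stay pinned in memory
)
ref = req.to_ref()   # assumed helper that builds the LoRARef shown above
print(ref.pinned)    # True
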
@@ -1097,7 +1114,7 @@ class UnloadLoRAAdapterReqInput:
 class LoRAUpdateResult:
    success: bool
    error_message: Optional[str] = None
-    loaded_adapters: Dict[str, LoRARef] =
+    loaded_adapters: Optional[Dict[str, LoRARef]] = None
 
 
 LoadLoRAAdapterReqOutput = UnloadLoRAAdapterReqOutput = LoRAUpdateResult

sglang/srt/managers/mm_utils.py CHANGED
@@ -388,24 +388,18 @@ def _get_chunked_prefill_embedding(
        embedding_per_req = data_embedding_func(embedding_items_per_req)
        if not embedding_cache.put(embedding_items_hash, embedding_per_req):
            print_warning_once(
-                "Multimodal embedding cache is full.
-                "
+                "Multimodal embedding cache is full. This typically occurs when a single "
+                "embedding exceeds the cache size limit. Consider increasing the "
+                "`SGLANG_VLM_CACHE_SIZE_MB` environment variable or reducing the input "
+                "embedding size."
            )
 
-        embedding_per_req_chunk, _,
+        embedding_per_req_chunk, _, _ = get_embedding_chunk(
            embedding=embedding_per_req,
            extend_prefix_len=prefix_length[i],
            extend_seq_len=extend_length[i] if i < len(extend_length) else 0,
            items_offset=items_offset,
        )
-        # remove this item from cache if chunk reaches to the end
-        embedding_per_req_length = (
-            embedding_per_req.shape[0]
-            if embedding_per_req.dim() == 2
-            else embedding_per_req.shape[0] * embedding_per_req.shape[1]
-        )
-        if end_index == embedding_per_req_length:
-            embedding_cache.free(embedding_items_hash)
        embedding_list.append(embedding_per_req_chunk)
    if len(embedding_list) == 0:
        return None

sglang/srt/managers/schedule_batch.py CHANGED
@@ -84,10 +84,10 @@ GLOBAL_SERVER_ARGS_KEYS = [
    "disable_radix_cache",
    "enable_dp_attention",
    "enable_two_batch_overlap",
+    "tbo_token_distribution_threshold",
    "enable_dp_lm_head",
-    "
+    "moe_a2a_backend",
    "deepep_mode",
-    "enable_ep_moe",
    "enable_flashinfer_cutlass_moe",
    "enable_flashinfer_trtllm_moe",
    "enable_flashinfer_allreduce_fusion",
@@ -107,7 +107,10 @@ GLOBAL_SERVER_ARGS_KEYS = [
    "num_reserved_decode_tokens",
    "weight_loader_disable_mmap",
    "enable_triton_kernel_moe",
+    "enable_flashinfer_mxfp4_moe",
    "enable_multimodal",
+    "enable_symm_mem",
+    "quantization",
 ]
 
 # Put some global args for easy access
@@ -422,7 +425,7 @@ class Req:
        token_ids_logprob: List[int] = None,
        stream: bool = False,
        origin_input_ids_unpadded: Optional[Tuple[int]] = None,
-
+        lora_id: Optional[str] = None,
        input_embeds: Optional[List[List[float]]] = None,
        token_type_ids: List[int] = None,
        session_id: Optional[str] = None,
@@ -466,7 +469,7 @@ class Req:
        self.sampling_params = sampling_params
        self.custom_logit_processor = custom_logit_processor
        self.return_hidden_states = return_hidden_states
-        self.
+        self.lora_id = lora_id
 
        # Memory pool info
        self.req_pool_idx: Optional[int] = None
@@ -844,6 +847,8 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
 
    # The sum of all sequence lengths
    seq_lens_sum: int = None
+    # The original sequence lengths, Qwen-1M related
+    orig_seq_lens: torch.Tensor = None  # shape: [b], int32
 
    # For DP attention
    global_num_tokens: Optional[List[int]] = None
@@ -916,8 +921,10 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
 
        is_hybrid = False
        if isinstance(token_to_kv_pool_allocator, SWATokenToKVPoolAllocator):
-            assert
-            tree_cache
+            assert (
+                tree_cache is None
+                or isinstance(tree_cache, SWARadixCache)
+                or isinstance(tree_cache, SWAChunkCache)
            ), "SWARadixCache or SWAChunkCache is required for SWATokenToKVPoolAllocator"
            is_hybrid = True
 
@@ -1127,6 +1134,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
        input_ids = [r.fill_ids[len(r.prefix_indices) :] for r in reqs]
        extend_num_tokens = sum(len(ids) for ids in input_ids)
        seq_lens = [len(r.fill_ids) for r in reqs]
+        orig_seq_lens = [max(len(r.fill_ids), len(r.origin_input_ids)) for r in reqs]
        prefix_lens = [len(r.prefix_indices) for r in reqs]
        extend_lens = [r.extend_input_len for r in reqs]
 
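
orig_seq_lens keeps the full original prompt length visible (the "Qwen-1M" long-context case) even while only part of it has been prefilled, whereas seq_lens tracks just the tokens filled so far. A tiny worked example of the expression above:

# Toy request state midway through chunked prefill of a very long prompt.
origin_input_ids = list(range(1_000_000))   # the full original prompt
fill_ids = origin_input_ids[:8_192]         # only the first chunk is filled so far

seq_len = len(fill_ids)                                   # 8192
orig_seq_len = max(len(fill_ids), len(origin_input_ids))  # 1000000
print(seq_len, orig_seq_len)
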
@@ -1143,6 +1151,9 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
        seq_lens_tensor = torch.tensor(seq_lens, dtype=torch.int64).to(
            self.device, non_blocking=True
        )
+        orig_seq_lens_tensor = torch.tensor(orig_seq_lens, dtype=torch.int32).to(
+            self.device, non_blocking=True
+        )
        prefix_lens_tensor = torch.tensor(
            prefix_lens, dtype=torch.int64, device=self.device
        )
@@ -1256,6 +1267,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
        self.input_ids = input_ids_tensor
        self.req_pool_indices = req_pool_indices_tensor
        self.seq_lens = seq_lens_tensor
+        self.orig_seq_lens = orig_seq_lens_tensor
        self.out_cache_loc = out_cache_loc
        self.input_embeds = (
            torch.tensor(input_embeds).to(self.device, non_blocking=True)
@@ -1503,6 +1515,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
        self.forward_mode = ForwardMode.IDLE
        self.input_ids = torch.empty(0, dtype=torch.int64, device=self.device)
        self.seq_lens = torch.empty(0, dtype=torch.int64, device=self.device)
+        self.orig_seq_lens = torch.empty(0, dtype=torch.int32, device=self.device)
        self.out_cache_loc = torch.empty(0, dtype=torch.int64, device=self.device)
        self.req_pool_indices = torch.empty(0, dtype=torch.int32, device=self.device)
        self.seq_lens_sum = 0
@@ -1557,9 +1570,11 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
        if self.enable_overlap:
            # Do not use in-place operations in the overlap mode
            self.seq_lens = self.seq_lens + 1
+            self.orig_seq_lens = self.orig_seq_lens + 1
        else:
            # A faster in-place version
            self.seq_lens.add_(1)
+            self.orig_seq_lens.add_(1)
        self.seq_lens_sum += bs
 
        # free memory
@@ -1623,6 +1638,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
        self.multimodal_inputs = [self.multimodal_inputs[i] for i in keep_indices]
        self.req_pool_indices = self.req_pool_indices[keep_indices_device]
        self.seq_lens = self.seq_lens[keep_indices_device]
+        self.orig_seq_lens = self.orig_seq_lens[keep_indices_device]
        self.out_cache_loc = None
        self.seq_lens_sum = self.seq_lens.sum().item()
        self.output_ids = self.output_ids[keep_indices_device]
@@ -1655,6 +1671,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
            [self.req_pool_indices, other.req_pool_indices]
        )
        self.seq_lens = torch.cat([self.seq_lens, other.seq_lens])
+        self.orig_seq_lens = torch.cat([self.orig_seq_lens, other.orig_seq_lens])
        self.out_cache_loc = None
        self.seq_lens_sum += other.seq_lens_sum
        if self.output_ids is not None:
@@ -1704,6 +1721,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
            or attention_backend_str == "flashmla"
            or attention_backend_str == "cutlass_mla"
            or attention_backend_str == "ascend"
+            or attention_backend_str == "trtllm_mha"
            or global_server_args_dict["enable_two_batch_overlap"]
        ):
            seq_lens_cpu = (
@@ -1728,6 +1746,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
            input_ids=self.input_ids,
            req_pool_indices=self.req_pool_indices,
            seq_lens=self.seq_lens,
+            orig_seq_lens=self.orig_seq_lens,
            out_cache_loc=self.out_cache_loc,
            seq_lens_cpu=seq_lens_cpu,
            seq_lens_sum=self.seq_lens_sum,
@@ -1749,7 +1768,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
            encoder_lens=self.encoder_lens,
            encoder_lens_cpu=self.encoder_lens_cpu,
            encoder_out_cache_loc=self.encoder_out_cache_loc,
-
+            lora_ids=[req.lora_id for req in self.reqs],
            sampling_info=self.sampling_info,
            input_embeds=self.input_embeds,
            token_type_ids=self.token_type_ids,
@@ -1890,11 +1909,14 @@ class ModelWorkerBatch:
    encoder_out_cache_loc: Optional[torch.Tensor]
 
    # For LoRA
-
+    lora_ids: Optional[List[str]]
 
    # Sampling info
    sampling_info: SamplingBatchInfo
 
+    # The original sequence lengths, Qwen-1M related
+    orig_seq_lens: Optional[torch.Tensor] = None
+
    # The input Embeds
    input_embeds: Optional[torch.Tensor] = None
 

sglang/srt/managers/schedule_policy.py CHANGED
@@ -455,7 +455,9 @@ class PrefillAdder:
        if not self.is_hybrid:
            # Skip this logic for swa. The SWA has different memory management, and
            # this mechanism is underestimating the memory usage.
-            cur_rem_tokens = self.cur_rem_tokens -
+            cur_rem_tokens = self.cur_rem_tokens - self.ceil_paged_tokens(
+                req.extend_input_len
+            )
            tokens_freed = 0
            for i, (tokens_left, tokens_occupied) in enumerate(self.req_states):
                # tokens_left gives a reservative calculation as the last token is not stored
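
ceil_paged_tokens is not shown in this diff; judging from its use here it presumably rounds a token count up to a whole number of KV cache pages before it is subtracted from the remaining budget. A hedged sketch of that assumed behavior:

# Assumption: ceil_paged_tokens rounds a token count up to a multiple of the page size.
def ceil_paged_tokens(num_tokens: int, page_size: int) -> int:
    return ((num_tokens + page_size - 1) // page_size) * page_size

# With 64-token pages, a 100-token extend reserves two full pages (128 tokens).
print(ceil_paged_tokens(100, page_size=64))  # 128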
|