sglang 0.4.10.post1__py3-none-any.whl → 0.4.10.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/compile_deep_gemm.py +8 -1
- sglang/global_config.py +5 -1
- sglang/srt/conversation.py +0 -112
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +1 -0
- sglang/srt/disaggregation/prefill.py +1 -0
- sglang/srt/distributed/device_communicators/pynccl.py +7 -0
- sglang/srt/distributed/device_communicators/pynccl_allocator.py +133 -0
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +42 -3
- sglang/srt/distributed/parallel_state.py +11 -0
- sglang/srt/entrypoints/engine.py +4 -2
- sglang/srt/entrypoints/http_server.py +35 -15
- sglang/srt/eplb/expert_distribution.py +4 -2
- sglang/srt/hf_transformers_utils.py +25 -10
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/flashattention_backend.py +7 -11
- sglang/srt/layers/attention/trtllm_mla_backend.py +6 -6
- sglang/srt/layers/attention/vision.py +27 -10
- sglang/srt/layers/communicator.py +14 -4
- sglang/srt/layers/linear.py +7 -1
- sglang/srt/layers/logits_processor.py +9 -1
- sglang/srt/layers/moe/ep_moe/layer.py +11 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +26 -23
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +0 -31
- sglang/srt/layers/moe/token_dispatcher/__init__.py +23 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +12 -1
- sglang/srt/layers/moe/{ep_moe/token_dispatcher.py → token_dispatcher/deepep.py} +8 -15
- sglang/srt/layers/moe/utils.py +43 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +3 -2
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +1 -1
- sglang/srt/layers/quantization/fp8.py +5 -1
- sglang/srt/layers/quantization/fp8_kernel.py +0 -4
- sglang/srt/layers/vocab_parallel_embedding.py +7 -1
- sglang/srt/lora/lora_registry.py +7 -0
- sglang/srt/managers/cache_controller.py +8 -4
- sglang/srt/managers/data_parallel_controller.py +52 -2
- sglang/srt/managers/io_struct.py +6 -1
- sglang/srt/managers/schedule_batch.py +3 -2
- sglang/srt/managers/schedule_policy.py +3 -1
- sglang/srt/managers/scheduler.py +144 -6
- sglang/srt/managers/template_manager.py +25 -22
- sglang/srt/managers/tokenizer_manager.py +114 -62
- sglang/srt/managers/utils.py +45 -1
- sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py +182 -0
- sglang/srt/mem_cache/hicache_storage.py +13 -21
- sglang/srt/mem_cache/radix_cache_cpp.py +229 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp +35 -0
- sglang/srt/model_executor/cuda_graph_runner.py +17 -3
- sglang/srt/model_executor/forward_batch_info.py +13 -3
- sglang/srt/model_executor/model_runner.py +5 -0
- sglang/srt/models/deepseek_v2.py +23 -17
- sglang/srt/models/glm4_moe.py +82 -19
- sglang/srt/models/grok.py +3 -3
- sglang/srt/models/llama4.py +13 -2
- sglang/srt/models/mixtral.py +3 -3
- sglang/srt/models/mllama4.py +428 -19
- sglang/srt/models/qwen2_moe.py +1 -4
- sglang/srt/models/qwen3_moe.py +7 -8
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/multimodal/processors/base_processor.py +4 -3
- sglang/srt/multimodal/processors/gemma3n.py +0 -7
- sglang/srt/operations_strategy.py +1 -1
- sglang/srt/server_args.py +80 -20
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +18 -0
- sglang/srt/two_batch_overlap.py +6 -4
- sglang/srt/utils.py +3 -24
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/runners.py +2 -2
- sglang/test/test_utils.py +3 -3
- sglang/version.py +1 -1
- {sglang-0.4.10.post1.dist-info → sglang-0.4.10.post2.dist-info}/METADATA +3 -2
- {sglang-0.4.10.post1.dist-info → sglang-0.4.10.post2.dist-info}/RECORD +80 -74
- /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/mooncake_store.py +0 -0
- /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/unit_test.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/hicache_nixl.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/nixl_utils.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/test_hicache_nixl_storage.py +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.4.10.post2.dist-info}/WHEEL +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.4.10.post2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.4.10.post2.dist-info}/top_level.txt +0 -0
sglang/srt/managers/scheduler.py
CHANGED
@@ -64,6 +64,7 @@ from sglang.srt.hf_transformers_utils import (
 )
 from sglang.srt.layers.dp_attention import compute_dp_attention_world_info
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.layers.moe.utils import DeepEPMode, MoeA2ABackend
 from sglang.srt.managers.io_struct import (
     AbortReq,
     CloseSessionReqInput,
@@ -125,7 +126,7 @@ from sglang.srt.managers.scheduler_update_weights_mixin import (
 from sglang.srt.managers.session_controller import Session
 from sglang.srt.managers.tp_worker import TpModelWorker
 from sglang.srt.managers.tp_worker_overlap_thread import TpModelWorkerClient
-from sglang.srt.managers.utils import validate_input_length
+from sglang.srt.managers.utils import DPBalanceMeta, validate_input_length
 from sglang.srt.mem_cache.chunk_cache import ChunkCache, SWAChunkCache
 from sglang.srt.mem_cache.hiradix_cache import HiRadixCache
 from sglang.srt.mem_cache.radix_cache import RadixCache
@@ -137,7 +138,6 @@ from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
 from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
 from sglang.srt.two_batch_overlap import TboDPAttentionPreparer
 from sglang.srt.utils import (
-    DeepEPMode,
     DynamicGradMode,
     broadcast_pyobj,
     configure_gc_logger,
@@ -203,6 +203,7 @@ class Scheduler(
         moe_ep_rank: int,
         pp_rank: int,
         dp_rank: Optional[int],
+        dp_balance_meta: Optional[DPBalanceMeta] = None,
     ):
         # Parse args
         self.server_args = server_args
@@ -522,6 +523,15 @@ class Scheduler(
             ]
         )
 
+        self.balance_meta = dp_balance_meta
+        if (
+            server_args.enable_dp_attention
+            and server_args.load_balance_method == "minimum_tokens"
+        ):
+            assert dp_balance_meta is not None
+
+            self.recv_dp_balance_id_this_term = []
+
     def init_tokenizer(self):
         server_args = self.server_args
 
@@ -569,7 +579,23 @@ class Scheduler(
                 page_size=self.page_size,
             )
         else:
-            if self.enable_hierarchical_cache:
+            if os.environ.get("SGLANG_EXPERIMENTAL_CPP_RADIX_TREE") == "1":
+                # lazy import to avoid JIT overhead
+                from sglang.srt.mem_cache.radix_cache_cpp import RadixCacheCpp
+
+                self.tree_cache = RadixCacheCpp(
+                    disable=False,
+                    use_hicache=self.enable_hierarchical_cache,
+                    req_to_token_pool=self.req_to_token_pool,
+                    token_to_kv_pool=self.token_to_kv_pool_allocator,
+                    tp_cache_group=self.tp_cpu_group,
+                    page_size=self.page_size,
+                    hicache_ratio=server_args.hicache_ratio,
+                    hicache_size=server_args.hicache_size,
+                    hicache_write_policy=server_args.hicache_write_policy,
+                    enable_kv_cache_events=self.enable_kv_cache_events,
+                )
+            elif self.enable_hierarchical_cache:
                 self.tree_cache = HiRadixCache(
                     req_to_token_pool=self.req_to_token_pool,
                     token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
@@ -1033,6 +1059,12 @@ class Scheduler(
         self,
         recv_req: TokenizedGenerateReqInput,
     ):
+        if (
+            self.server_args.enable_dp_attention
+            and self.server_args.load_balance_method == "minimum_tokens"
+        ):
+            self.recv_dp_balance_id_this_term.append(recv_req.dp_balance_id)
+
         # Create a new request
         if (
             recv_req.session_params is None
@@ -1443,6 +1475,11 @@ class Scheduler(
 
         # Handle DP attention
         if need_dp_attn_preparation:
+            if (
+                self.server_args.load_balance_method == "minimum_tokens"
+                and self.forward_ct % 40 == 0
+            ):
+                self.handle_dp_balance_data(ret)
             ret = self.prepare_mlp_sync_batch(ret)
 
         return ret
@@ -1744,6 +1781,9 @@ class Scheduler(
         elif batch.forward_mode.is_dummy_first():
             self.set_next_batch_sampling_info_done(batch)
 
+        self.maybe_send_health_check_signal()
+
+    def maybe_send_health_check_signal(self):
         if self.return_health_check_ct:
             # Return some signal for the health check.
             # This is used to prevent the health check signal being blocked by long context prefill.
@@ -1762,12 +1802,94 @@ class Scheduler(
             spec_algorithm=self.spec_algorithm,
             speculative_num_draft_tokens=self.server_args.speculative_num_draft_tokens,
             enable_two_batch_overlap=self.server_args.enable_two_batch_overlap,
-            enable_deepep_moe=
-
+            enable_deepep_moe=MoeA2ABackend(
+                self.server_args.moe_a2a_backend
+            ).is_deepep(),
+            deepep_mode=DeepEPMode(self.server_args.deepep_mode),
             require_mlp_tp_gather=require_mlp_tp_gather(self.server_args),
             disable_overlap_schedule=self.server_args.disable_overlap_schedule,
         )
 
+    def handle_dp_balance_data(self, local_batch: ScheduleBatch):
+        def gather_dp_balance_info(holding_tokens_list) -> Union[None, List[List[int]]]:
+            """gather recv_dp_balance_id_this_term and holding tokens per worker for dp balance"""
+            recv_list = self.recv_dp_balance_id_this_term
+            assert len(recv_list) <= 511, (
+                "The number of requests received this round is too large. "
+                "Please increase gather_tensor_size and onfly_info_size."
+            )
+            # The maximum size of the tensor used for gathering data from all workers.
+            gather_tensor_size = 512
+
+            # recv_tensor: | holding_tokens | len(recv_dp_balance_id) | recv_dp_balance_ids
+            recv_tensor = torch.zeros(gather_tensor_size, dtype=torch.int32)
+            recv_tensor[0] = holding_tokens_list
+            recv_tensor[1] = len(
+                recv_list
+            )  # The first element is the length of the list.
+            recv_tensor[2 : len(recv_list) + 2] = torch.tensor(
+                recv_list, dtype=torch.int32
+            )
+
+            if self.tp_rank == 0:
+                gathered_list = [
+                    torch.zeros(gather_tensor_size, dtype=torch.int32)
+                    for _ in range(self.balance_meta.num_workers)
+                ]
+            else:
+                gathered_list = None
+
+            torch.distributed.gather(
+                recv_tensor, gathered_list, group=self.tp_cpu_group
+            )
+
+            gathered_id_list_per_worker = None
+            if self.tp_rank == 0:
+                gathered_id_list_per_worker = []
+                holding_tokens_list = []
+                for tensor in gathered_list:
+                    holding_tokens_list.append(tensor[0].item())
+                    list_length = tensor[1].item()
+                    gathered_id_list_per_worker.append(
+                        tensor[2 : list_length + 2].tolist()
+                    )
+
+            return gathered_id_list_per_worker, holding_tokens_list
+
+        def write_shared_dp_balance_info(new_recv_rid_lists, local_tokens):
+            meta = self.balance_meta
+
+            with meta.mutex:
+                onfly_list: List[Dict[int, int]] = meta.get_shared_onfly()
+                assert len(new_recv_rid_lists) == len(
+                    onfly_list
+                ), "num_worker not equal"
+                # 1.Check if the rid received by each worker this round is present in onfly.
+                # If it is, remove the corresponding onfly item.
+                worker_id = 0
+                for new_recv_rids, on_fly_reqs in zip(new_recv_rid_lists, onfly_list):
+                    for new_recv_rid in new_recv_rids:
+                        assert (
+                            new_recv_rid in on_fly_reqs
+                        ), f"{new_recv_rid=} not in {worker_id=} {on_fly_reqs=}, data consistency is wrong"
+                        del on_fly_reqs[new_recv_rid]
+                    worker_id += 1
+                # 2. Atomically write local_tokens and onfly into shm under the mutex
+                meta.set_shared_onfly_info(onfly_list)
+                meta.set_shared_local_tokens(local_tokens)
+
+        holding_tokens = self.get_load()
+
+        new_recv_dp_balance_id_list, holding_token_list = gather_dp_balance_info(
+            holding_tokens
+        )
+
+        self.recv_dp_balance_id_this_term.clear()
+        if self.tp_rank == 0:  # only first worker write info
+            write_shared_dp_balance_info(
+                new_recv_dp_balance_id_list, holding_token_list
+            )
+
     @staticmethod
     def prepare_mlp_sync_batch_raw(
         local_batch: ScheduleBatch,
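`gather_dp_balance_info` above packs each worker's state into a fixed-size int32 tensor before the `torch.distributed.gather` call: slot 0 carries the worker's held-token count, slot 1 the number of request ids received this term, and slots 2 onward the ids themselves. A self-contained sketch of that packing and the rank-0 unpacking, with the collective call left out (the helper names are illustrative; only the slot layout comes from the diff):

```python
import torch

GATHER_TENSOR_SIZE = 512  # same fixed capacity as in the diff (at most 511 request ids)


def pack_balance_info(holding_tokens: int, recv_ids: list) -> torch.Tensor:
    # Layout: | holding_tokens | len(recv_ids) | recv_ids ... |
    assert len(recv_ids) <= GATHER_TENSOR_SIZE - 2
    t = torch.zeros(GATHER_TENSOR_SIZE, dtype=torch.int32)
    t[0] = holding_tokens
    t[1] = len(recv_ids)
    if recv_ids:
        t[2 : len(recv_ids) + 2] = torch.tensor(recv_ids, dtype=torch.int32)
    return t


def unpack_balance_info(gathered):
    # Rank-0 side: recover per-worker id lists and held-token counts.
    id_lists, holding_tokens = [], []
    for t in gathered:
        holding_tokens.append(int(t[0]))
        n = int(t[1])
        id_lists.append(t[2 : n + 2].tolist())
    return id_lists, holding_tokens


# Two workers, standing in for torch.distributed.gather on the TP CPU group:
gathered = [pack_balance_info(100, [7, 8]), pack_balance_info(40, [9])]
print(unpack_balance_info(gathered))  # ([[7, 8], [9]], [100, 40])
```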
@@ -2344,11 +2466,19 @@ class IdleSleeper:
 
     def __init__(self, sockets):
         self.poller = zmq.Poller()
+        self.last_empty_time = time.time()
         for s in sockets:
             self.poller.register(s, zmq.POLLIN)
 
     def maybe_sleep(self):
         self.poller.poll(1000)
+        if (
+            global_config.torch_empty_cache_interval > 0
+            and time.time() - self.last_empty_time
+            > global_config.torch_empty_cache_interval
+        ):
+            self.last_empty_time = time.time()
+            torch.cuda.empty_cache()
 
 
 def is_health_check_generate_req(recv_req):
@@ -2368,6 +2498,7 @@ def run_scheduler_process(
     pp_rank: int,
     dp_rank: Optional[int],
     pipe_writer,
+    balance_meta: Optional[DPBalanceMeta] = None,
 ):
     # Generate the prefix
     prefix = ""
@@ -2401,7 +2532,14 @@ def run_scheduler_process(
     # Create a scheduler and run the event loop
     try:
         scheduler = Scheduler(
-            server_args,
+            server_args,
+            port_args,
+            gpu_id,
+            tp_rank,
+            moe_ep_rank,
+            pp_rank,
+            dp_rank,
+            dp_balance_meta=balance_meta,
         )
         pipe_writer.send(
             {
sglang/srt/managers/template_manager.py
CHANGED
@@ -84,26 +84,27 @@ class TemplateManager:
         if chat_template_arg:
             self._load_explicit_chat_template(tokenizer_manager, chat_template_arg)
         else:
-            #
-            hf_template = self._resolve_hf_chat_template(tokenizer_manager)
-            if hf_template:
-                self._jinja_template_content_format = (
-                    detect_jinja_template_content_format(hf_template)
-                )
-                logger.info(
-                    f"Using default HuggingFace chat template with detected content format: {self._jinja_template_content_format}"
-                )
-                return
-
-            # Fallback to SGLang template guessing
+            # Guess chat template from model path
             self.guess_chat_template_from_model_path(model_path)
 
-            #
+            # If no pre-defined template was found, fallback to HuggingFace template
             if self._chat_template_name is None:
-
-
-
-
+                # Try HuggingFace template first
+                hf_template = self._resolve_hf_chat_template(tokenizer_manager)
+                if hf_template:
+                    # override the chat template
+                    tokenizer_manager.tokenizer.chat_template = hf_template
+                    self._jinja_template_content_format = (
+                        detect_jinja_template_content_format(hf_template)
+                    )
+                    logger.info(
+                        f"Using default HuggingFace chat template with detected content format: {self._jinja_template_content_format}"
+                    )
+                    return
+
+                # Default to string content format if no template was found
+                self._jinja_template_content_format = "string"
+                logger.info("No chat template found, defaulting to 'string' content format")
 
     def _load_explicit_chat_template(
         self, tokenizer_manager, chat_template_arg: str
@@ -257,13 +258,15 @@ class TemplateManager:
 
         Returns the chat template string if found, None otherwise.
         """
-        tokenizer = tokenizer_manager.tokenizer
-
-        # Try to get AutoTokenizer chat template
         try:
-
+            if processor := tokenizer_manager.processor:
+                if hasattr(processor, "chat_template") and processor.chat_template:
+                    return processor.chat_template
+            if tokenizer := tokenizer_manager.tokenizer:
+                if hasattr(tokenizer, "chat_template") and tokenizer.chat_template:
+                    return tokenizer.chat_template
         except Exception as e:
-            logger.debug(f"Error getting chat template
+            logger.debug(f"Error getting chat template: {e}")
 
         logger.debug("No HuggingFace chat template found")
         return None
sglang/srt/managers/tokenizer_manager.py
CHANGED
@@ -29,6 +29,7 @@ import uuid
 from collections import deque
 from contextlib import nullcontext
 from datetime import datetime
+from enum import Enum
 from http import HTTPStatus
 from typing import (
     Any,
@@ -70,7 +71,6 @@ from sglang.srt.managers.io_struct import (
     BatchMultimodalOut,
     BatchStrOut,
     BatchTokenIDOut,
-    BlockReqType,
     CloseSessionReqInput,
     ConfigureLoggingReq,
     EmbeddingReqInput,
@@ -116,6 +116,7 @@ from sglang.srt.managers.io_struct import (
 )
 from sglang.srt.managers.mm_utils import TensorTransportMode
 from sglang.srt.managers.multimodal_processor import get_mm_processor, import_processors
+from sglang.srt.managers.scheduler import is_health_check_generate_req
 from sglang.srt.managers.scheduler_input_blocker import input_blocker_guard_region
 from sglang.srt.metrics.collector import TokenizerMetricsCollector
 from sglang.srt.sampling.sampling_params import SamplingParams
@@ -202,13 +203,29 @@ class TokenizerManager:
 
         if self.model_config.is_multimodal:
             import_processors()
-
-
-
-
-
-
-
+            try:
+                _processor = get_processor(
+                    server_args.tokenizer_path,
+                    tokenizer_mode=server_args.tokenizer_mode,
+                    trust_remote_code=server_args.trust_remote_code,
+                    revision=server_args.revision,
+                    use_fast=not server_args.disable_fast_image_processor,
+                )
+            except ValueError as e:
+                error_message = str(e)
+                if "does not have a slow version" in error_message:
+                    logger.info(
+                        f"Processor {server_args.tokenizer_path} does not have a slow version. Automatically use fast version"
+                    )
+                    _processor = get_processor(
+                        server_args.tokenizer_path,
+                        tokenizer_mode=server_args.tokenizer_mode,
+                        trust_remote_code=server_args.trust_remote_code,
+                        revision=server_args.revision,
+                        use_fast=True,
+                    )
+                else:
+                    raise e
             transport_mode = _determine_tensor_transport_mode(self.server_args)
 
             # We want to parallelize the image pre-processing so we create an executor for it
@@ -225,10 +242,10 @@ class TokenizerManager:
             self.tokenizer = get_tokenizer_from_processor(self.processor)
             os.environ["TOKENIZERS_PARALLELISM"] = "false"
         else:
-            self.mm_processor = None
+            self.mm_processor = self.processor = None
 
             if server_args.skip_tokenizer_init:
-                self.tokenizer =
+                self.tokenizer = None
             else:
                 self.tokenizer = get_tokenizer(
                     server_args.tokenizer_path,
@@ -255,6 +272,7 @@ class TokenizerManager:
         self.health_check_failed = False
         self.gracefully_exit = False
         self.last_receive_tstamp = 0
+        self.server_status = ServerStatus.Starting
 
         # Dumping
         self.dump_requests_folder = ""  # By default do not dump
@@ -1069,38 +1087,56 @@ class TokenizerManager:
         _: Optional[fastapi.Request] = None,
     ) -> LoadLoRAAdapterReqOutput:
         self.auto_create_handle_loop()
-        if not self.server_args.enable_lora:
-            raise ValueError(
-                "LoRA is not enabled. Please set `--enable-lora` to enable LoRA."
-            )
 
-
-
-
-
-
-        logger.info(
-            "Start load Lora adapter. Lora name=%s, path=%s",
-            obj.lora_name,
-            obj.lora_path,
-        )
+        try:
+            if not self.server_args.enable_lora:
+                raise ValueError(
+                    "LoRA is not enabled. Please set `--enable-lora` to enable LoRA."
+                )
 
-
-        #
-
-
-
+            # TODO (lifuhuang): Remove this after we verify that dynamic lora loading works
+            # with dp_size > 1.
+            assert (
+                self.server_args.dp_size == 1
+            ), "dp_size must be 1 for dynamic lora loading"
+            logger.info(
+                "Start load Lora adapter. Lora name=%s, path=%s",
+                obj.lora_name,
+                obj.lora_path,
             )
 
-
-
-
+            async with self.lora_update_lock:
+                if (
+                    self.server_args.max_loaded_loras is not None
+                    and self.lora_registry.num_registered_loras
+                    >= self.server_args.max_loaded_loras
+                ):
+                    raise ValueError(
+                        f"Cannot load LoRA adapter {obj.lora_name} at path {obj.lora_path}. "
+                        f"Maximum number of loaded LoRA adapters is {self.server_args.max_loaded_loras}. "
+                        "Please unload some LoRA adapters before loading new ones."
+                    )
 
-
-
-
+                # Generate new uniquely identifiable LoRARef object.
+                new_adapter = LoRARef(
+                    lora_name=obj.lora_name,
+                    lora_path=obj.lora_path,
+                )
 
-
+                # Trigger the actual loading operation at the backend processes.
+                obj.lora_id = new_adapter.lora_id
+                result = (await self.update_lora_adapter_communicator(obj))[0]
+
+                # Register the LoRA adapter only after loading is successful.
+                if result.success:
+                    await self.lora_registry.register(new_adapter)
+
+                return result
+        except ValueError as e:
+            return LoadLoRAAdapterReqOutput(
+                success=False,
+                error_message=str(e),
+            )
 
     async def unload_lora_adapter(
         self,
@@ -1108,37 +1144,41 @@ class TokenizerManager:
         _: Optional[fastapi.Request] = None,
     ) -> UnloadLoRAAdapterReqOutput:
         self.auto_create_handle_loop()
-        if not self.server_args.enable_lora:
-            raise ValueError(
-                "LoRA is not enabled. Please set `--enable-lora` to enable LoRA."
-            )
 
-
-
-
+        try:
+            if not self.server_args.enable_lora:
+                raise ValueError(
+                    "LoRA is not enabled. Please set `--enable-lora` to enable LoRA."
+                )
 
-
-
-
-
-
-
-
-
-
+            assert (
+                obj.lora_name is not None
+            ), "lora_name must be provided to unload LoRA adapter"
+
+            # TODO (lifuhuang): Remove this after we verify that dynamic lora loading works
+            # with dp_size > 1.
+            assert (
+                self.server_args.dp_size == 1
+            ), "dp_size must be 1 for dynamic lora loading"
+            logger.info(
+                "Start unload Lora adapter. Lora name=%s",
+                obj.lora_name,
+            )
 
-
-
-
-
-
+            async with self.lora_update_lock:
+                # Unregister the LoRA adapter from the registry to stop new requests for this adapter
+                # from being started.
+                lora_id = await self.lora_registry.unregister(obj.lora_name)
+                obj.lora_id = lora_id
 
-
-
-
-
+                # Initiate the actual unloading operation at the backend processes only after all
+                # ongoing requests using this LoRA adapter are finished.
+                await self.lora_registry.wait_for_unload(lora_id)
+                result = (await self.update_lora_adapter_communicator(obj))[0]
 
-
+                return result
+        except ValueError as e:
+            return UnloadLoRAAdapterReqOutput(success=False, error_message=str(e))
 
     async def get_weights_by_name(
         self, obj: GetWeightsByNameReqInput, request: Optional[fastapi.Request] = None
@@ -1767,6 +1807,8 @@ class TokenizerManager:
         asyncio.create_task(asyncio.to_thread(background_task))
 
     def _handle_abort_req(self, recv_obj):
+        if is_health_check_generate_req(recv_obj):
+            return
         state = self.rid_to_state[recv_obj.rid]
         state.finished = True
         if recv_obj.finished_reason:
@@ -1901,6 +1943,16 @@ class TokenizerManager:
         return scores
 
 
+class ServerStatus(Enum):
+    Up = "Up"
+    Starting = "Starting"
+    UnHealthy = "UnHealthy"
+    Crashed = "Crashed"
+
+    def is_healthy(self) -> bool:
+        return self == ServerStatus.Up
+
+
 def _determine_tensor_transport_mode(server_args: ServerArgs) -> TensorTransportMode:
     is_cross_node = server_args.dist_init_addr
 
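One behaviour from the TokenizerManager hunks worth calling out: multimodal processor loading now retries with `use_fast=True` when transformers reports that no slow processor exists. A hedged sketch of that retry shape around a generic loader (`load_processor` is a hypothetical stand-in, not the sglang `get_processor` helper):

```python
def load_with_fast_fallback(load_processor, path: str, prefer_fast: bool):
    """load_processor: any callable mirroring get_processor(path, use_fast=...)."""
    try:
        return load_processor(path, use_fast=prefer_fast)
    except ValueError as e:
        if "does not have a slow version" in str(e):
            # Retry with the fast implementation instead of failing outright.
            return load_processor(path, use_fast=True)
        raise
```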
sglang/srt/managers/utils.py
CHANGED
@@ -1,6 +1,7 @@
 import logging
+import multiprocessing as mp
 from http import HTTPStatus
-from typing import Optional
+from typing import Dict, List, Optional
 
 from sglang.srt.managers.schedule_batch import FINISH_ABORT, Req
 
@@ -38,3 +39,46 @@ def validate_input_length(
         return error_msg
 
     return None
+
+
+class DPBalanceMeta:
+    """
+    This class will be use in scheduler and dp controller
+    """
+
+    def __init__(self, num_workers: int):
+        self.num_workers = num_workers
+        self._manager = mp.Manager()
+        self.mutex = self._manager.Lock()
+
+        init_local_tokens = [0] * self.num_workers
+        init_onfly_info = [self._manager.dict() for _ in range(self.num_workers)]
+
+        self.shared_state = self._manager.Namespace()
+        self.shared_state.local_tokens = self._manager.list(init_local_tokens)
+        self.shared_state.onfly_info = self._manager.list(init_onfly_info)
+
+    def destructor(self):
+        # we must destructor this class manually
+        self._manager.shutdown()
+
+    def get_shared_onfly(self) -> List[Dict[int, int]]:
+        return [dict(d) for d in self.shared_state.onfly_info]
+
+    def set_shared_onfly_info(self, data: List[Dict[int, int]]):
+        self.shared_state.onfly_info = data
+
+    def get_shared_local_tokens(self) -> List[int]:
+        return list(self.shared_state.local_tokens)
+
+    def set_shared_local_tokens(self, data: List[int]):
+        self.shared_state.local_tokens = data
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        del state["_manager"]
+        return state
+
+    def __setstate__(self, state):
+        self.__dict__.update(state)
+        self._manager = None
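A short usage sketch for the new `DPBalanceMeta`, assuming the 0.4.10.post2 package is importable; it mirrors how the scheduler reads and rewrites the shared state under the mutex:

```python
from sglang.srt.managers.utils import DPBalanceMeta

if __name__ == "__main__":  # mp.Manager() spawns a helper process
    meta = DPBalanceMeta(num_workers=2)
    try:
        with meta.mutex:
            onfly = meta.get_shared_onfly()        # [{}, {}] initially
            onfly[0][123] = 16                     # request id 123 holds 16 tokens on worker 0
            meta.set_shared_onfly_info(onfly)
            meta.set_shared_local_tokens([16, 0])
        print(meta.get_shared_local_tokens())      # [16, 0]
    finally:
        meta.destructor()                          # shut down the mp.Manager explicitly
```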