sglang 0.4.10.post1__py3-none-any.whl → 0.4.10.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/compile_deep_gemm.py +8 -1
- sglang/global_config.py +5 -1
- sglang/srt/conversation.py +0 -112
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +1 -0
- sglang/srt/disaggregation/prefill.py +1 -0
- sglang/srt/distributed/device_communicators/pynccl.py +7 -0
- sglang/srt/distributed/device_communicators/pynccl_allocator.py +133 -0
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +42 -3
- sglang/srt/distributed/parallel_state.py +11 -0
- sglang/srt/entrypoints/engine.py +4 -2
- sglang/srt/entrypoints/http_server.py +35 -15
- sglang/srt/eplb/expert_distribution.py +4 -2
- sglang/srt/hf_transformers_utils.py +25 -10
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/flashattention_backend.py +7 -11
- sglang/srt/layers/attention/trtllm_mla_backend.py +6 -6
- sglang/srt/layers/attention/vision.py +27 -10
- sglang/srt/layers/communicator.py +14 -4
- sglang/srt/layers/linear.py +7 -1
- sglang/srt/layers/logits_processor.py +9 -1
- sglang/srt/layers/moe/ep_moe/layer.py +11 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +26 -23
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +0 -31
- sglang/srt/layers/moe/token_dispatcher/__init__.py +23 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +12 -1
- sglang/srt/layers/moe/{ep_moe/token_dispatcher.py → token_dispatcher/deepep.py} +8 -15
- sglang/srt/layers/moe/utils.py +43 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +3 -2
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +1 -1
- sglang/srt/layers/quantization/fp8.py +5 -1
- sglang/srt/layers/quantization/fp8_kernel.py +0 -4
- sglang/srt/layers/vocab_parallel_embedding.py +7 -1
- sglang/srt/lora/lora_registry.py +7 -0
- sglang/srt/managers/cache_controller.py +8 -4
- sglang/srt/managers/data_parallel_controller.py +52 -2
- sglang/srt/managers/io_struct.py +6 -1
- sglang/srt/managers/schedule_batch.py +3 -2
- sglang/srt/managers/schedule_policy.py +3 -1
- sglang/srt/managers/scheduler.py +144 -6
- sglang/srt/managers/template_manager.py +25 -22
- sglang/srt/managers/tokenizer_manager.py +114 -62
- sglang/srt/managers/utils.py +45 -1
- sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py +182 -0
- sglang/srt/mem_cache/hicache_storage.py +13 -21
- sglang/srt/mem_cache/radix_cache_cpp.py +229 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp +35 -0
- sglang/srt/model_executor/cuda_graph_runner.py +17 -3
- sglang/srt/model_executor/forward_batch_info.py +13 -3
- sglang/srt/model_executor/model_runner.py +5 -0
- sglang/srt/models/deepseek_v2.py +23 -17
- sglang/srt/models/glm4_moe.py +82 -19
- sglang/srt/models/grok.py +3 -3
- sglang/srt/models/llama4.py +13 -2
- sglang/srt/models/mixtral.py +3 -3
- sglang/srt/models/mllama4.py +428 -19
- sglang/srt/models/qwen2_moe.py +1 -4
- sglang/srt/models/qwen3_moe.py +7 -8
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/multimodal/processors/base_processor.py +4 -3
- sglang/srt/multimodal/processors/gemma3n.py +0 -7
- sglang/srt/operations_strategy.py +1 -1
- sglang/srt/server_args.py +80 -20
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +18 -0
- sglang/srt/two_batch_overlap.py +6 -4
- sglang/srt/utils.py +3 -24
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/runners.py +2 -2
- sglang/test/test_utils.py +3 -3
- sglang/version.py +1 -1
- {sglang-0.4.10.post1.dist-info → sglang-0.4.10.post2.dist-info}/METADATA +3 -2
- {sglang-0.4.10.post1.dist-info → sglang-0.4.10.post2.dist-info}/RECORD +80 -74
- /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/mooncake_store.py +0 -0
- /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/unit_test.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/hicache_nixl.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/nixl_utils.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/test_hicache_nixl_storage.py +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.4.10.post2.dist-info}/WHEEL +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.4.10.post2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.4.10.post2.dist-info}/top_level.txt +0 -0
sglang/compile_deep_gemm.py
CHANGED
@@ -17,6 +17,7 @@ import time
 
 import requests
 
+from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST
 from sglang.srt.entrypoints.http_server import launch_server
 from sglang.srt.managers.io_struct import GenerateReqInput
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
@@ -52,7 +53,9 @@ class CompileArgs:
 
 
 @warmup("compile-deep-gemm")
-async def warm_up_compile(tokenizer_manager: TokenizerManager):
+async def warm_up_compile(
+    disaggregation_mode: str, tokenizer_manager: TokenizerManager
+):
     print("\nGenerate warm up request for compiling DeepGEMM...\n")
     generate_req_input = GenerateReqInput(
         input_ids=[0, 1, 2, 3],
@@ -62,6 +65,10 @@ async def warm_up_compile(tokenizer_manager: TokenizerManager):
             "ignore_eos": True,
         },
     )
+    if disaggregation_mode != "null":
+        generate_req_input.bootstrap_room = 0
+        generate_req_input.bootstrap_host = FAKE_BOOTSTRAP_HOST
+
     await tokenizer_manager.generate_request(generate_req_input, None).__anext__()
 
 
sglang/global_config.py
CHANGED
@@ -30,7 +30,11 @@ class GlobalConfig:
         self.default_new_token_ratio_decay_steps = float(
             os.environ.get("SGLANG_NEW_TOKEN_RATIO_DECAY_STEPS", 600)
         )
-
+        self.torch_empty_cache_interval = float(
+            os.environ.get(
+                "SGLANG_EMPTY_CACHE_INTERVAL", -1
+            )  # in seconds. Set if you observe high memory accumulation over a long serving period.
+        )
         # Runtime constants: others
         self.retract_decode_steps = 20
         self.flashinfer_workspace_size = os.environ.get(
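The new SGLANG_EMPTY_CACHE_INTERVAL knob is read once when GlobalConfig is constructed; the default of -1 disables it, and a positive value is an interval in seconds, presumably used by the scheduler (whose diff is not rendered in this section) to periodically release cached CUDA memory. A minimal opt-in sketch from Python before launch (the 300-second value is purely illustrative; exporting the variable in the launching shell works just as well):

    import os

    # Assumption: must be set before sglang constructs its GlobalConfig.
    # Positive value = interval in seconds; -1 (the default) disables it.
    os.environ["SGLANG_EMPTY_CACHE_INTERVAL"] = "300"

    import sglang  # GlobalConfig reads the variable during initialization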
sglang/srt/conversation.py
CHANGED
@@ -954,20 +954,6 @@ register_conv_template(
     )
 )
 
-register_conv_template(
-    Conversation(
-        name="mimo-vl",
-        system_message="You are MiMo, an AI assistant developed by Xiaomi.",
-        system_template="<|im_start|>system\n{system_message}",
-        roles=("<|im_start|>user", "<|im_start|>assistant"),
-        sep="<|im_end|>\n",
-        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
-        stop_str=["<|im_end|>"],
-        image_token="<|vision_start|><|image_pad|><|vision_end|>",
-    )
-)
-
-
 register_conv_template(
     Conversation(
         name="qwen2-audio",
@@ -981,51 +967,11 @@ register_conv_template(
     )
 )
 
-register_conv_template(
-    Conversation(
-        name="llama_4_vision",
-        system_message="You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.",
-        system_template="<|header_start|>system<|header_end|>\n\n{system_message}<|eot|>",
-        roles=("user", "assistant"),
-        sep_style=SeparatorStyle.LLAMA4,
-        sep="",
-        stop_str="<|eot|>",
-        image_token="<|image|>",
-    )
-)
-
-register_conv_template(
-    Conversation(
-        name="step3-vl",
-        system_message="<|begin▁of▁sentence|>You are a helpful assistant",
-        system_template="{system_message}\n",
-        roles=(
-            "<|BOT|>user\n",
-            "<|BOT|>assistant\n<think>\n",
-        ),
-        sep="<|EOT|>",
-        sep_style=SeparatorStyle.NO_COLON_SINGLE,
-        stop_str="<|EOT|>",
-        image_token="<im_patch>",
-        # add_bos=True,
-    )
-)
-
 
 @register_conv_template_matching_function
 def match_internvl(model_path: str):
     if re.search(r"internvl", model_path, re.IGNORECASE):
         return "internvl-2-5"
-    if re.search(r"intern.*s1", model_path, re.IGNORECASE):
-        return "interns1"
-
-
-@register_conv_template_matching_function
-def match_llama_vision(model_path: str):
-    if re.search(r"llama.*3\.2.*vision", model_path, re.IGNORECASE):
-        return "llama_3_vision"
-    if re.search(r"llama.*4.*", model_path, re.IGNORECASE):
-        return "llama_4_vision"
 
 
 @register_conv_template_matching_function
@@ -1040,22 +986,6 @@ def match_vicuna(model_path: str):
         return "vicuna_v1.1"
 
 
-@register_conv_template_matching_function
-def match_llama2_chat(model_path: str):
-    if re.search(
-        r"llama-2.*chat|codellama.*instruct",
-        model_path,
-        re.IGNORECASE,
-    ):
-        return "llama-2"
-
-
-@register_conv_template_matching_function
-def match_mistral(model_path: str):
-    if re.search(r"pixtral|(mistral|mixtral).*instruct", model_path, re.IGNORECASE):
-        return "mistral"
-
-
 @register_conv_template_matching_function
 def match_deepseek_vl(model_path: str):
     if re.search(r"deepseek.*vl2", model_path, re.IGNORECASE):
@@ -1064,12 +994,6 @@ def match_deepseek_vl(model_path: str):
 
 @register_conv_template_matching_function
 def match_qwen_chat_ml(model_path: str):
-    if re.search(r"gme.*qwen.*vl", model_path, re.IGNORECASE):
-        return "gme-qwen2-vl"
-    if re.search(r"qwen.*vl", model_path, re.IGNORECASE):
-        return "qwen2-vl"
-    if re.search(r"qwen.*audio", model_path, re.IGNORECASE):
-        return "qwen2-audio"
     if re.search(
         r"llava-v1\.6-34b|llava-v1\.6-yi-34b|llava-next-video-34b|llava-onevision-qwen2",
         model_path,
@@ -1078,12 +1002,6 @@ def match_qwen_chat_ml(model_path: str):
         return "chatml-llava"
 
 
-@register_conv_template_matching_function
-def match_gemma3_instruct(model_path: str):
-    if re.search(r"gemma-3.*it", model_path, re.IGNORECASE):
-        return "gemma-it"
-
-
 @register_conv_template_matching_function
 def match_openbmb_minicpm(model_path: str):
     if re.search(r"minicpm-v", model_path, re.IGNORECASE):
@@ -1092,37 +1010,7 @@ def match_openbmb_minicpm(model_path: str):
         return "minicpmo"
 
 
-@register_conv_template_matching_function
-def match_moonshot_kimivl(model_path: str):
-    if re.search(r"kimi.*vl", model_path, re.IGNORECASE):
-        return "kimi-vl"
-
-
-@register_conv_template_matching_function
-def match_devstral(model_path: str):
-    if re.search(r"devstral", model_path, re.IGNORECASE):
-        return "devstral"
-
-
 @register_conv_template_matching_function
 def match_phi_4_mm(model_path: str):
     if "phi-4-multimodal" in model_path.lower():
         return "phi-4-mm"
-
-
-@register_conv_template_matching_function
-def match_vila(model_path: str):
-    if re.search(r"vila", model_path, re.IGNORECASE):
-        return "chatml"
-
-
-@register_conv_template_matching_function
-def match_mimo_vl(model_path: str):
-    if re.search(r"mimo.*vl", model_path, re.IGNORECASE):
-        return "mimo-vl"
-
-
-# @register_conv_template_matching_function
-# def match_step3(model_path: str):
-#     if re.search(r"step3", model_path, re.IGNORECASE):
-#         return "step3-vl"
sglang/srt/disaggregation/decode_schedule_batch_mixin.py
CHANGED
@@ -88,6 +88,7 @@ class ScheduleBatchDisaggregationDecodeMixin:
         self.extend_lens = [r.extend_input_len for r in reqs]
         self.extend_logprob_start_lens = [r.extend_logprob_start_len for r in reqs]
         self.extend_input_logprob_token_ids = extend_input_logprob_token_ids
+        self.multimodal_inputs = [r.multimodal_inputs for r in reqs]
 
         # Build sampling info
         self.sampling_info = SamplingBatchInfo.from_schedule_batch(
sglang/srt/disaggregation/prefill.py
CHANGED
@@ -460,6 +460,7 @@ class SchedulerDisaggregationPrefillMixin:
 
         # We need to remove the sync in the following function for overlap schedule.
         self.set_next_batch_sampling_info_done(batch)
+        self.maybe_send_health_check_signal()
 
     def process_disagg_prefill_inflight_queue(
         self: Scheduler, rids_to_check: Optional[List[str]] = None
sglang/srt/distributed/device_communicators/pynccl.py
CHANGED
@@ -75,6 +75,7 @@ class PyNcclCommunicator:
         self.available = True
         self.disabled = False
 
+        self.nccl_version = self.nccl.ncclGetRawVersion()
        if self.rank == 0:
             logger.info("sglang is using nccl==%s", self.nccl.ncclGetVersion())
 
@@ -259,6 +260,12 @@ class PyNcclCommunicator:
             cudaStream_t(stream.cuda_stream),
         )
 
+    def register_comm_window_raw(self, ptr: int, size: int):
+        return self.nccl.ncclCommWindowRegister(self.comm, buffer_type(ptr), size, 1)
+
+    def deregister_comm_window(self, window):
+        return self.nccl.ncclCommWindowDeregister(self.comm, window)
+
     @contextmanager
     def change_state(
         self, enable: Optional[bool] = None, stream: Optional[torch.cuda.Stream] = None
sglang/srt/distributed/device_communicators/pynccl_allocator.py
ADDED
@@ -0,0 +1,133 @@
+import tempfile
+
+import torch
+from packaging import version
+from torch.cuda.memory import CUDAPluggableAllocator
+
+from sglang.srt.distributed.parallel_state import GroupCoordinator
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+
+nccl_allocator_source = """
+#include <nccl.h>
+extern "C" {
+
+void* nccl_alloc_plug(size_t size, int device, void* stream) {
+  void* ptr;
+  ncclResult_t err = ncclMemAlloc(&ptr, size);
+  return ptr;
+
+}
+
+void nccl_free_plug(void* ptr, size_t size, int device, void* stream) {
+  ncclResult_t err = ncclMemFree(ptr);
+}
+
+}
+"""
+
+_allocator = None
+_mem_pool = None
+_registered_base_addrs = set()
+_graph_pool_id = None
+
+
+def is_symmetric_memory_enabled():
+    return global_server_args_dict["enable_symm_mem"]
+
+
+def set_graph_pool_id(graph_pool_id):
+    global _graph_pool_id
+    _graph_pool_id = graph_pool_id
+
+
+def get_nccl_mem_pool():
+    global _allocator, _mem_pool
+    if _mem_pool is None:
+        out_dir = tempfile.gettempdir()
+        nccl_allocator_libname = "nccl_allocator"
+        torch.utils.cpp_extension.load_inline(
+            name=nccl_allocator_libname,
+            cpp_sources=nccl_allocator_source,
+            with_cuda=True,
+            extra_ldflags=["-lnccl"],
+            verbose=True,
+            is_python_module=False,
+            build_directory=out_dir,
+        )
+        _allocator = CUDAPluggableAllocator(
+            f"{out_dir}/{nccl_allocator_libname}.so",
+            "nccl_alloc_plug",
+            "nccl_free_plug",
+        ).allocator()
+        _mem_pool = torch.cuda.MemPool(_allocator)
+    return _mem_pool
+
+
+class use_symmetric_memory:
+    def __init__(self, group_coordinator: GroupCoordinator):
+        if not is_symmetric_memory_enabled():
+            self.group_coordinator = None
+            self._mem_pool_ctx = None
+            self.is_graph_capture = None
+            self.device = None
+            self.pre_2_8_0 = None
+        else:
+            self.group_coordinator = group_coordinator
+            self._mem_pool_ctx = torch.cuda.use_mem_pool(get_nccl_mem_pool())
+            self.is_graph_capture = torch.cuda.is_current_stream_capturing()
+            self.device = torch.cuda.current_device()
+            self.pre_2_8_0 = version.parse(torch.__version__) < version.parse("2.8.0")
+
+    def __enter__(self):
+        if not is_symmetric_memory_enabled():
+            return self
+        assert (
+            self.group_coordinator.pynccl_comm is not None
+        ), f"Symmetric memory requires pynccl to be enabled in group '{self.group_coordinator.group_name}'"
+        assert (
+            self.group_coordinator.pynccl_comm.nccl_version >= 22703
+        ), "NCCL version 2.27.3 or higher is required for NCCL symmetric memory"
+        if self.is_graph_capture:
+            assert (
+                _graph_pool_id is not None
+            ), "graph_pool_id is not set under graph capture"
+            # Pause graph memory pool to use symmetric memory with cuda graph
+            if self.pre_2_8_0:
+                torch._C._cuda_endAllocateCurrentStreamToPool(
+                    self.device, _graph_pool_id
+                )
+            else:
+                torch._C._cuda_endAllocateToPool(self.device, _graph_pool_id)
+        self._mem_pool_ctx.__enter__()
+        return self
+
+    def tag(self, tensor: torch.Tensor):
+        if not is_symmetric_memory_enabled():
+            return
+        tensor.symmetric_memory = True
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if not is_symmetric_memory_enabled():
+            return
+        global _registered_base_addrs
+        self._mem_pool_ctx.__exit__(exc_type, exc_val, exc_tb)
+        for segment in get_nccl_mem_pool().snapshot():
+            if segment["address"] not in _registered_base_addrs:
+                if segment["stream"] == 0 and self.pre_2_8_0:
+                    # PyTorch version < 2.8.0 has a multi-thread MemPool bug
+                    # See https://github.com/pytorch/pytorch/issues/152861
+                    # Fixed at https://github.com/pytorch/pytorch/commit/f01e628e3b31852983ab30b25bf251f557ba9c0b
+                    # WAR is to skip allocations on the default stream since the forward_pass thread always runs on a custom stream
+                    continue
+                self.group_coordinator.pynccl_comm.register_comm_window_raw(
+                    segment["address"], segment["total_size"]
+                )
+                _registered_base_addrs.add(segment["address"])
+
+        if self.is_graph_capture:
+            if self.pre_2_8_0:
+                torch._C._cuda_beginAllocateToPool(self.device, _graph_pool_id)
+            else:
+                torch._C._cuda_beginAllocateCurrentThreadToPool(
+                    self.device, _graph_pool_id
+                )
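Read together with the parallel_state.py change further down, the intended flow is: allocate a tensor inside the use_symmetric_memory context (so it is served from the NCCL memory pool and its base address is registered as an NCCL window on exit), tag it, and let GroupCoordinator.all_reduce take the pynccl path for tagged tensors. A minimal usage sketch, not taken from the diff: get_tp_group is assumed to return the tensor-parallel GroupCoordinator, and the snippet presupposes the new enable_symm_mem server argument plus NCCL >= 2.27.3:

    import torch

    from sglang.srt.distributed import get_tp_group
    from sglang.srt.distributed.device_communicators.pynccl_allocator import (
        use_symmetric_memory,
    )

    tp_group = get_tp_group()
    with use_symmetric_memory(tp_group) as sm:
        # Allocations made inside the context come from the NCCL mem pool.
        hidden = torch.empty(4096, 4096, dtype=torch.bfloat16, device="cuda")
        sm.tag(hidden)  # sets hidden.symmetric_memory = True

    # all_reduce sees the tag and takes the pynccl symmetric-memory branch.
    tp_group.all_reduce(hidden)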
sglang/srt/distributed/device_communicators/pynccl_wrapper.py
CHANGED
@@ -67,6 +67,7 @@ def find_nccl_library() -> str:
 
 ncclResult_t = ctypes.c_int
 ncclComm_t = ctypes.c_void_p
+ncclWindow_t = ctypes.c_void_p
 
 
 class ncclUniqueId(ctypes.Structure):
@@ -279,6 +280,23 @@ class NCCLLibrary:
         Function("ncclCommDestroy", ncclResult_t, [ncclComm_t]),
     ]
 
+    exported_functions_symm_mem = [
+        # ncclResult_t ncclCommWindowRegister(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags);
+        Function(
+            "ncclCommWindowRegister",
+            ncclResult_t,
+            [
+                ncclComm_t,
+                buffer_type,
+                ctypes.c_size_t,
+                ctypes.POINTER(ncclWindow_t),
+                ctypes.c_int,
+            ],
+        ),
+        # ncclResult_t ncclCommWindowDeregister(ncclComm_t comm, ncclWindow_t win);
+        Function("ncclCommWindowDeregister", ncclResult_t, [ncclComm_t, ncclWindow_t]),
+    ]
+
     # class attribute to store the mapping from the path to the library
     # to avoid loading the same library multiple times
     path_to_library_cache: Dict[str, Any] = {}
@@ -312,7 +330,10 @@ class NCCLLibrary:
 
         if so_file not in NCCLLibrary.path_to_dict_mapping:
             _funcs: Dict[str, Any] = {}
-            for func in NCCLLibrary.exported_functions:
+            exported_functions = NCCLLibrary.exported_functions
+            if hasattr(self.lib, "ncclCommWindowRegister"):
+                exported_functions.extend(NCCLLibrary.exported_functions_symm_mem)
+            for func in exported_functions:
                 f = getattr(self.lib, func.name)
                 f.restype = func.restype
                 f.argtypes = func.argtypes
@@ -328,10 +349,14 @@ class NCCLLibrary:
             error_str = self.ncclGetErrorString(result)
             raise RuntimeError(f"NCCL error: {error_str}")
 
-    def ncclGetVersion(self) -> str:
+    def ncclGetRawVersion(self) -> int:
         version = ctypes.c_int()
         self.NCCL_CHECK(self._funcs["ncclGetVersion"](ctypes.byref(version)))
-        version_str = str(version.value)
+        # something like 21903
+        return version.value
+
+    def ncclGetVersion(self) -> str:
+        version_str = str(self.ncclGetRawVersion())
         # something like 21903 --> "2.19.3"
         major = version_str[0].lstrip("0")
         minor = version_str[1:3].lstrip("0")
@@ -460,6 +485,20 @@ class NCCLLibrary:
     def ncclCommDestroy(self, comm: ncclComm_t) -> None:
         self.NCCL_CHECK(self._funcs["ncclCommDestroy"](comm))
 
+    def ncclCommWindowRegister(
+        self, comm: ncclComm_t, buff: buffer_type, size: int, win_flags: int
+    ) -> ncclWindow_t:
+        window = ncclWindow_t()
+        self.NCCL_CHECK(
+            self._funcs["ncclCommWindowRegister"](
+                comm, buff, size, ctypes.byref(window), win_flags
+            )
+        )
+        return window
+
+    def ncclCommWindowDeregister(self, comm: ncclComm_t, window: ncclWindow_t) -> None:
+        self.NCCL_CHECK(self._funcs["ncclCommWindowDeregister"](comm, window))
+
 
 __all__ = [
     "NCCLLibrary",
sglang/srt/distributed/parallel_state.py
CHANGED
@@ -497,6 +497,17 @@ class GroupCoordinator:
         if self.npu_communicator is not None and not self.npu_communicator.disabled:
             return self.npu_communicator.all_reduce(input_)
 
+        if (
+            self.pynccl_comm is not None
+            and hasattr(input_, "symmetric_memory")
+            and input_.symmetric_memory
+        ):
+            with self.pynccl_comm.change_state(
+                enable=True, stream=torch.cuda.current_stream()
+            ):
+                self.pynccl_comm.all_reduce(input_)
+            return input_
+
         outplace_all_reduce_method = None
         if (
             self.qr_comm is not None
sglang/srt/entrypoints/engine.py
CHANGED
@@ -623,8 +623,9 @@ class Engine(EngineBase):
 def _set_envs_and_config(server_args: ServerArgs):
     # Set global environments
     os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
-    os.environ["NCCL_CUMEM_ENABLE"] = "0"
-    os.environ["NCCL_NVLS_ENABLE"] = str(int(server_args.enable_nccl_nvls))
+    os.environ["NCCL_CUMEM_ENABLE"] = str(int(server_args.enable_symm_mem))
+    if not server_args.enable_symm_mem:
+        os.environ["NCCL_NVLS_ENABLE"] = str(int(server_args.enable_nccl_nvls))
     os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
     os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4"
     os.environ["CUDA_MODULE_LOADING"] = "AUTO"
@@ -731,6 +732,7 @@ def _launch_subprocesses(
                 pp_rank,
                 None,
                 writer,
+                None,
             ),
         )
 
sglang/srt/entrypoints/http_server.py
CHANGED
@@ -45,6 +45,7 @@ from fastapi.responses import ORJSONResponse, Response, StreamingResponse
 
 from sglang.srt.disaggregation.utils import (
     FAKE_BOOTSTRAP_HOST,
+    DisaggregationMode,
     register_disaggregation_server,
 )
 from sglang.srt.entrypoints.engine import _launch_subprocesses
@@ -88,7 +89,7 @@ from sglang.srt.managers.io_struct import (
     VertexGenerateReqInput,
 )
 from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
+from sglang.srt.managers.tokenizer_manager import ServerStatus, TokenizerManager
 from sglang.srt.metrics.func_timer import enable_func_timer
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.server_args import ServerArgs
@@ -230,23 +231,28 @@ async def validate_json_request(raw_request: Request):
 
 
 @app.get("/health")
-async def health() -> Response:
-    """Check the health of the http server."""
-    return Response(status_code=200)
-
-
 @app.get("/health_generate")
 async def health_generate(request: Request) -> Response:
-    """
+    """
+    Check the health of the inference server by sending a special request to generate one token.
+
+    If the server is running something, this request will be ignored, so it creates zero overhead.
+    If the server is not running anything, this request will be run, so we know whether the server is healthy.
+    """
+
     if _global_state.tokenizer_manager.gracefully_exit:
         logger.info("Health check request received during shutdown. Returning 503.")
         return Response(status_code=503)
 
+    if not _global_state.tokenizer_manager.server_status.is_healthy():
+        return Response(status_code=503)
+
     sampling_params = {"max_new_tokens": 1, "temperature": 0.0}
     rid = f"HEALTH_CHECK_{time.time()}"
 
     if _global_state.tokenizer_manager.is_image_gen:
-        raise NotImplementedError()
+        # Keep this branch for some internal use cases.
+        raise NotImplementedError("Image generation is not supported yet.")
     elif _global_state.tokenizer_manager.is_generation:
         gri = GenerateReqInput(
             rid=rid,
@@ -254,6 +260,12 @@ async def health_generate(request: Request) -> Response:
             sampling_params=sampling_params,
             log_metrics=False,
         )
+        if (
+            _global_state.tokenizer_manager.server_args.disaggregation_mode
+            != DisaggregationMode.NULL
+        ):
+            gri.bootstrap_host = FAKE_BOOTSTRAP_HOST
+            gri.bootstrap_room = 0
     else:
         gri = EmbeddingReqInput(
             rid=rid, input_ids=[0], sampling_params=sampling_params, log_metrics=False
@@ -263,9 +275,6 @@ async def health_generate(request: Request) -> Response:
         async for _ in _global_state.tokenizer_manager.generate_request(gri, request):
             break
 
-    # This request is a special request.
-    # If the server already has something running, this request will be ignored, so it creates zero overhead.
-    # If the server is not running, this request will be run, so we know whether the server is healthy.
     task = asyncio.create_task(gen())
 
     # As long as we receive any response from the detokenizer/scheduler, we consider the server is healthy.
@@ -1032,8 +1041,10 @@ def _execute_server_warmup(
                 timeout=600,
             )
             assert res.status_code == 200, f"{res}"
+            _global_state.tokenizer_manager.server_status = ServerStatus.Up
+
         else:
-            logger.info(f"Start of
+            logger.info(f"Start of pd disaggregation warmup ...")
             json_data = {
                 "sampling_params": {
                     "temperature": 0.0,
@@ -1055,9 +1066,18 @@
                 headers=headers,
                 timeout=1800,  # because of deep gemm precache is very long if not precache.
            )
-            logger.info(
-                f"End of prefill disaggregation mode warmup with status {res.status_code}, resp: {res.json()}"
-            )
+            if res.status_code == 200:
+                logger.info(
+                    f"End of prefill disaggregation mode warmup with status {res.status_code}, resp: {res.json()}"
+                )
+                _global_state.tokenizer_manager.server_status = ServerStatus.Up
+            else:
+                logger.info(
+                    "Prefill disaggregation mode warm Up Failed, status code: {}".format(
+                        res.status_code
+                    )
+                )
+                _global_state.tokenizer_manager.server_status = ServerStatus.UnHealthy
 
     except Exception:
         last_traceback = get_exception_traceback()
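After this change the bare /health route no longer has its own handler; its decorator is stacked onto health_generate, and the endpoint returns 503 both while the server is gracefully exiting and whenever TokenizerManager.server_status is not healthy (for example, before the disaggregation warmup has marked it Up). A hedged readiness-probe sketch; the host and port are assumptions for a default local launch:

    import requests

    resp = requests.get("http://127.0.0.1:30000/health_generate", timeout=30)
    if resp.status_code == 200:
        print("server is warmed up and healthy")
    else:
        # 503 while shutting down or while server_status has not reached Up
        print(f"server not ready: HTTP {resp.status_code}")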
sglang/srt/eplb/expert_distribution.py
CHANGED
@@ -288,12 +288,14 @@ class _SinglePassGatherer(ABC):
         )
 
         if server_args.expert_distribution_recorder_mode == "stat_approx":
-            if server_args.enable_deepep_moe and (server_args.deepep_mode == "normal"):
+            if server_args.moe_a2a_backend is not None and (
+                server_args.deepep_mode == "normal"
+            ):
                 return _DeepepNormalSinglePassGatherer(expert_location_metadata, rank)
             else:
                 raise NotImplementedError
 
-        if server_args.enable_deepep_moe:
+        if server_args.moe_a2a_backend is not None:
             if server_args.deepep_mode == "normal":
                 return _SelectExpertsSinglePassGatherer(expert_location_metadata, rank)
             elif server_args.deepep_mode == "low_latency":
sglang/srt/hf_transformers_utils.py
CHANGED
@@ -14,7 +14,6 @@
 """Utilities for Huggingface Transformers."""
 
 import contextlib
-import logging
 import os
 import warnings
 from pathlib import Path
@@ -45,7 +44,7 @@ from sglang.srt.configs import (
 )
 from sglang.srt.configs.internvl import InternVLChatConfig
 from sglang.srt.connector import create_remote_connector
-from sglang.srt.utils import is_remote_url, lru_cache_frozenset
+from sglang.srt.utils import is_remote_url, logger, lru_cache_frozenset
 
 _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
     ChatGLMConfig.model_type: ChatGLMConfig,
@@ -317,15 +316,31 @@ def get_processor(
 
     if config.model_type not in {"llava", "clip"}:
         kwargs["use_fast"] = use_fast
+    try:
+        processor = AutoProcessor.from_pretrained(
+            tokenizer_name,
+            *args,
+            trust_remote_code=trust_remote_code,
+            revision=revision,
+            **kwargs,
+        )
 
-    processor = AutoProcessor.from_pretrained(
-        tokenizer_name,
-        *args,
-        trust_remote_code=trust_remote_code,
-        revision=revision,
-        **kwargs,
-    )
-
+    except ValueError as e:
+        error_message = str(e)
+        if "does not have a slow version" in error_message:
+            logger.info(
+                f"Processor {tokenizer_name} does not have a slow version. Automatically use fast version"
+            )
+            kwargs["use_fast"] = True
+            processor = AutoProcessor.from_pretrained(
+                tokenizer_name,
+                *args,
+                trust_remote_code=trust_remote_code,
+                revision=revision,
+                **kwargs,
+            )
+        else:
+            raise e
     tokenizer = get_tokenizer_from_processor(processor)
 
     attach_additional_stop_token_ids(tokenizer)