sglang 0.4.10.post1__py3-none-any.whl → 0.5.0rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +113 -17
- sglang/compile_deep_gemm.py +8 -1
- sglang/global_config.py +5 -1
- sglang/srt/configs/model_config.py +35 -0
- sglang/srt/conversation.py +9 -117
- sglang/srt/disaggregation/base/conn.py +5 -2
- sglang/srt/disaggregation/decode.py +6 -1
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -0
- sglang/srt/disaggregation/mooncake/conn.py +243 -135
- sglang/srt/disaggregation/prefill.py +3 -0
- sglang/srt/distributed/device_communicators/pynccl.py +7 -0
- sglang/srt/distributed/device_communicators/pynccl_allocator.py +133 -0
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +42 -3
- sglang/srt/distributed/parallel_state.py +22 -9
- sglang/srt/entrypoints/context.py +244 -0
- sglang/srt/entrypoints/engine.py +8 -5
- sglang/srt/entrypoints/harmony_utils.py +370 -0
- sglang/srt/entrypoints/http_server.py +106 -15
- sglang/srt/entrypoints/openai/protocol.py +227 -1
- sglang/srt/entrypoints/openai/serving_chat.py +278 -42
- sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
- sglang/srt/entrypoints/openai/tool_server.py +174 -0
- sglang/srt/entrypoints/tool.py +87 -0
- sglang/srt/eplb/expert_distribution.py +4 -2
- sglang/srt/eplb/expert_location.py +5 -1
- sglang/srt/function_call/harmony_tool_parser.py +130 -0
- sglang/srt/hf_transformers_utils.py +55 -13
- sglang/srt/jinja_template_utils.py +8 -1
- sglang/srt/layers/attention/aiter_backend.py +5 -8
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
- sglang/srt/layers/attention/flashattention_backend.py +7 -11
- sglang/srt/layers/attention/triton_backend.py +85 -14
- sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
- sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
- sglang/srt/layers/attention/trtllm_mla_backend.py +6 -6
- sglang/srt/layers/attention/vision.py +40 -15
- sglang/srt/layers/communicator.py +35 -8
- sglang/srt/layers/dp_attention.py +12 -0
- sglang/srt/layers/linear.py +9 -8
- sglang/srt/layers/logits_processor.py +9 -1
- sglang/srt/layers/moe/cutlass_moe.py +20 -6
- sglang/srt/layers/moe/ep_moe/layer.py +87 -107
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
- sglang/srt/layers/moe/fused_moe_triton/layer.py +442 -58
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +169 -15
- sglang/srt/layers/moe/token_dispatcher/__init__.py +23 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +12 -1
- sglang/srt/layers/moe/{ep_moe/token_dispatcher.py → token_dispatcher/deepep.py} +8 -15
- sglang/srt/layers/moe/topk.py +12 -3
- sglang/srt/layers/moe/utils.py +59 -0
- sglang/srt/layers/quantization/__init__.py +22 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +3 -2
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +1 -1
- sglang/srt/layers/quantization/fp4.py +557 -0
- sglang/srt/layers/quantization/fp8.py +8 -7
- sglang/srt/layers/quantization/fp8_kernel.py +0 -4
- sglang/srt/layers/quantization/fp8_utils.py +29 -0
- sglang/srt/layers/quantization/modelopt_quant.py +259 -64
- sglang/srt/layers/quantization/mxfp4.py +651 -0
- sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
- sglang/srt/layers/quantization/quark/__init__.py +0 -0
- sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
- sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
- sglang/srt/layers/quantization/quark/utils.py +107 -0
- sglang/srt/layers/quantization/unquant.py +60 -6
- sglang/srt/layers/quantization/w4afp8.py +1 -1
- sglang/srt/layers/rotary_embedding.py +225 -1
- sglang/srt/layers/utils.py +9 -0
- sglang/srt/layers/vocab_parallel_embedding.py +15 -4
- sglang/srt/lora/lora_manager.py +70 -14
- sglang/srt/lora/lora_registry.py +10 -2
- sglang/srt/lora/mem_pool.py +43 -5
- sglang/srt/managers/cache_controller.py +61 -32
- sglang/srt/managers/data_parallel_controller.py +52 -2
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +21 -4
- sglang/srt/managers/mm_utils.py +5 -11
- sglang/srt/managers/schedule_batch.py +30 -8
- sglang/srt/managers/schedule_policy.py +3 -1
- sglang/srt/managers/scheduler.py +170 -18
- sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
- sglang/srt/managers/scheduler_recv_skipper.py +37 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
- sglang/srt/managers/template_manager.py +59 -22
- sglang/srt/managers/tokenizer_manager.py +137 -67
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
- sglang/srt/managers/utils.py +45 -1
- sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py +182 -0
- sglang/srt/mem_cache/hicache_storage.py +13 -21
- sglang/srt/mem_cache/hiradix_cache.py +53 -5
- sglang/srt/mem_cache/memory_pool_host.py +1 -1
- sglang/srt/mem_cache/multimodal_cache.py +33 -13
- sglang/srt/mem_cache/radix_cache_cpp.py +229 -0
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp +35 -0
- sglang/srt/model_executor/cuda_graph_runner.py +24 -9
- sglang/srt/model_executor/forward_batch_info.py +48 -17
- sglang/srt/model_executor/model_runner.py +24 -2
- sglang/srt/model_loader/weight_utils.py +10 -0
- sglang/srt/models/bailing_moe.py +425 -0
- sglang/srt/models/deepseek_v2.py +95 -50
- sglang/srt/models/ernie4.py +426 -0
- sglang/srt/models/ernie4_eagle.py +203 -0
- sglang/srt/models/gemma3n_mm.py +39 -0
- sglang/srt/models/glm4_moe.py +102 -27
- sglang/srt/models/gpt_oss.py +1134 -0
- sglang/srt/models/grok.py +3 -3
- sglang/srt/models/llama4.py +13 -2
- sglang/srt/models/mixtral.py +3 -3
- sglang/srt/models/mllama4.py +428 -19
- sglang/srt/models/qwen2.py +6 -0
- sglang/srt/models/qwen2_moe.py +7 -4
- sglang/srt/models/qwen3_moe.py +39 -14
- sglang/srt/models/step3_vl.py +10 -1
- sglang/srt/models/transformers.py +2 -5
- sglang/srt/multimodal/processors/base_processor.py +4 -3
- sglang/srt/multimodal/processors/gemma3n.py +0 -7
- sglang/srt/multimodal/processors/step3_vl.py +3 -1
- sglang/srt/operations_strategy.py +1 -1
- sglang/srt/reasoning_parser.py +18 -39
- sglang/srt/server_args.py +218 -23
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +18 -0
- sglang/srt/two_batch_overlap.py +163 -9
- sglang/srt/utils.py +41 -26
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/runners.py +4 -4
- sglang/test/test_utils.py +4 -4
- sglang/version.py +1 -1
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/METADATA +18 -15
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/RECORD +143 -116
- /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/mooncake_store.py +0 -0
- /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/unit_test.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/hicache_nixl.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/nixl_utils.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/test_hicache_nixl_storage.py +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/WHEEL +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/top_level.txt +0 -0
sglang/srt/disaggregation/prefill.py
CHANGED
@@ -103,6 +103,8 @@ class PrefillBootstrapQueue:
         kv_args_class = get_kv_class(self.transfer_backend, KVClassType.KVARGS)
         kv_args = kv_args_class()
         kv_args.engine_rank = self.tp_rank
+        kv_args.pp_rank = self.pp_rank
+        kv_args.system_dp_rank = self.scheduler.dp_rank
         kv_args.decode_tp_size = self.decode_tp_size // self.decode_dp_size
         kv_args.prefill_pp_size = self.pp_size
         kv_data_ptrs, kv_data_lens, kv_item_lens = (
@@ -460,6 +462,7 @@ class SchedulerDisaggregationPrefillMixin:
 
         # We need to remove the sync in the following function for overlap schedule.
         self.set_next_batch_sampling_info_done(batch)
+        self.maybe_send_health_check_signal()
 
     def process_disagg_prefill_inflight_queue(
         self: Scheduler, rids_to_check: Optional[List[str]] = None
sglang/srt/distributed/device_communicators/pynccl.py
CHANGED
@@ -75,6 +75,7 @@ class PyNcclCommunicator:
         self.available = True
         self.disabled = False
 
+        self.nccl_version = self.nccl.ncclGetRawVersion()
         if self.rank == 0:
             logger.info("sglang is using nccl==%s", self.nccl.ncclGetVersion())
 
@@ -259,6 +260,12 @@ class PyNcclCommunicator:
             cudaStream_t(stream.cuda_stream),
         )
 
+    def register_comm_window_raw(self, ptr: int, size: int):
+        return self.nccl.ncclCommWindowRegister(self.comm, buffer_type(ptr), size, 1)
+
+    def deregister_comm_window(self, window):
+        return self.nccl.ncclCommWindowDeregister(self.comm, window)
+
     @contextmanager
     def change_state(
         self, enable: Optional[bool] = None, stream: Optional[torch.cuda.Stream] = None
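
The two wrappers above expose NCCL 2.27+ window (de)registration to Python; register_comm_window_raw hard-codes winFlags=1. A minimal usage sketch, not taken from the diff (comm is an existing PyNcclCommunicator, and ptr/size are assumed to describe a buffer obtained through ncclMemAlloc, e.g. from the pluggable allocator added below):

    window = comm.register_comm_window_raw(ptr, size)  # returns an ncclWindow_t handle
    # ... run collectives that can exploit the registered (symmetric) buffer ...
    comm.deregister_comm_window(window)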
sglang/srt/distributed/device_communicators/pynccl_allocator.py
ADDED
@@ -0,0 +1,133 @@
+import tempfile
+
+import torch
+from packaging import version
+from torch.cuda.memory import CUDAPluggableAllocator
+
+from sglang.srt.distributed.parallel_state import GroupCoordinator
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+
+nccl_allocator_source = """
+#include <nccl.h>
+extern "C" {
+
+void* nccl_alloc_plug(size_t size, int device, void* stream) {
+  void* ptr;
+  ncclResult_t err = ncclMemAlloc(&ptr, size);
+  return ptr;
+
+}
+
+void nccl_free_plug(void* ptr, size_t size, int device, void* stream) {
+  ncclResult_t err = ncclMemFree(ptr);
+}
+
+}
+"""
+
+_allocator = None
+_mem_pool = None
+_registered_base_addrs = set()
+_graph_pool_id = None
+
+
+def is_symmetric_memory_enabled():
+    return global_server_args_dict["enable_symm_mem"]
+
+
+def set_graph_pool_id(graph_pool_id):
+    global _graph_pool_id
+    _graph_pool_id = graph_pool_id
+
+
+def get_nccl_mem_pool():
+    global _allocator, _mem_pool
+    if _mem_pool is None:
+        out_dir = tempfile.gettempdir()
+        nccl_allocator_libname = "nccl_allocator"
+        torch.utils.cpp_extension.load_inline(
+            name=nccl_allocator_libname,
+            cpp_sources=nccl_allocator_source,
+            with_cuda=True,
+            extra_ldflags=["-lnccl"],
+            verbose=True,
+            is_python_module=False,
+            build_directory=out_dir,
+        )
+        _allocator = CUDAPluggableAllocator(
+            f"{out_dir}/{nccl_allocator_libname}.so",
+            "nccl_alloc_plug",
+            "nccl_free_plug",
+        ).allocator()
+        _mem_pool = torch.cuda.MemPool(_allocator)
+    return _mem_pool
+
+
+class use_symmetric_memory:
+    def __init__(self, group_coordinator: GroupCoordinator):
+        if not is_symmetric_memory_enabled():
+            self.group_coordinator = None
+            self._mem_pool_ctx = None
+            self.is_graph_capture = None
+            self.device = None
+            self.pre_2_8_0 = None
+        else:
+            self.group_coordinator = group_coordinator
+            self._mem_pool_ctx = torch.cuda.use_mem_pool(get_nccl_mem_pool())
+            self.is_graph_capture = torch.cuda.is_current_stream_capturing()
+            self.device = torch.cuda.current_device()
+            self.pre_2_8_0 = version.parse(torch.__version__) < version.parse("2.8.0")
+
+    def __enter__(self):
+        if not is_symmetric_memory_enabled():
+            return self
+        assert (
+            self.group_coordinator.pynccl_comm is not None
+        ), f"Symmetric memory requires pynccl to be enabled in group '{self.group_coordinator.group_name}'"
+        assert (
+            self.group_coordinator.pynccl_comm.nccl_version >= 22703
+        ), "NCCL version 2.27.3 or higher is required for NCCL symmetric memory"
+        if self.is_graph_capture:
+            assert (
+                _graph_pool_id is not None
+            ), "graph_pool_id is not set under graph capture"
+            # Pause graph memory pool to use symmetric memory with cuda graph
+            if self.pre_2_8_0:
+                torch._C._cuda_endAllocateCurrentStreamToPool(
+                    self.device, _graph_pool_id
+                )
+            else:
+                torch._C._cuda_endAllocateToPool(self.device, _graph_pool_id)
+        self._mem_pool_ctx.__enter__()
+        return self
+
+    def tag(self, tensor: torch.Tensor):
+        if not is_symmetric_memory_enabled():
+            return
+        tensor.symmetric_memory = True
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if not is_symmetric_memory_enabled():
+            return
+        global _registered_base_addrs
+        self._mem_pool_ctx.__exit__(exc_type, exc_val, exc_tb)
+        for segment in get_nccl_mem_pool().snapshot():
+            if segment["address"] not in _registered_base_addrs:
+                if segment["stream"] == 0 and self.pre_2_8_0:
+                    # PyTorch version < 2.8.0 has a multi-thread MemPool bug
+                    # See https://github.com/pytorch/pytorch/issues/152861
+                    # Fixed at https://github.com/pytorch/pytorch/commit/f01e628e3b31852983ab30b25bf251f557ba9c0b
+                    # WAR is to skip allocations on the default stream since the forward_pass thread always runs on a custom stream
+                    continue
+                self.group_coordinator.pynccl_comm.register_comm_window_raw(
+                    segment["address"], segment["total_size"]
+                )
+                _registered_base_addrs.add(segment["address"])
+
+        if self.is_graph_capture:
+            if self.pre_2_8_0:
+                torch._C._cuda_beginAllocateToPool(self.device, _graph_pool_id)
+            else:
+                torch._C._cuda_beginAllocateCurrentThreadToPool(
+                    self.device, _graph_pool_id
+                )
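
Rough usage sketch for the new allocator (not from the diff; assumes the server was started with symmetric memory enabled and that tp_group is the tensor-parallel GroupCoordinator):

    import torch
    from sglang.srt.distributed.device_communicators.pynccl_allocator import (
        use_symmetric_memory,
    )

    with use_symmetric_memory(tp_group) as sm:
        out = torch.empty(4096, dtype=torch.bfloat16, device="cuda")  # allocation comes from the NCCL mem pool
        sm.tag(out)  # sets out.symmetric_memory = True
    out = tp_group.all_reduce(out)  # tagged tensors take the pynccl symmetric-memory path (parallel_state.py below)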
sglang/srt/distributed/device_communicators/pynccl_wrapper.py
CHANGED
@@ -67,6 +67,7 @@ def find_nccl_library() -> str:
 
 ncclResult_t = ctypes.c_int
 ncclComm_t = ctypes.c_void_p
+ncclWindow_t = ctypes.c_void_p
 
 
 class ncclUniqueId(ctypes.Structure):
@@ -279,6 +280,23 @@ class NCCLLibrary:
         Function("ncclCommDestroy", ncclResult_t, [ncclComm_t]),
     ]
 
+    exported_functions_symm_mem = [
+        # ncclResult_t ncclCommWindowRegister(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags);
+        Function(
+            "ncclCommWindowRegister",
+            ncclResult_t,
+            [
+                ncclComm_t,
+                buffer_type,
+                ctypes.c_size_t,
+                ctypes.POINTER(ncclWindow_t),
+                ctypes.c_int,
+            ],
+        ),
+        # ncclResult_t ncclCommWindowDeregister(ncclComm_t comm, ncclWindow_t win);
+        Function("ncclCommWindowDeregister", ncclResult_t, [ncclComm_t, ncclWindow_t]),
+    ]
+
     # class attribute to store the mapping from the path to the library
     # to avoid loading the same library multiple times
     path_to_library_cache: Dict[str, Any] = {}
@@ -312,7 +330,10 @@
 
         if so_file not in NCCLLibrary.path_to_dict_mapping:
             _funcs: Dict[str, Any] = {}
-            for func in NCCLLibrary.exported_functions:
+            exported_functions = NCCLLibrary.exported_functions
+            if hasattr(self.lib, "ncclCommWindowRegister"):
+                exported_functions.extend(NCCLLibrary.exported_functions_symm_mem)
+            for func in exported_functions:
                 f = getattr(self.lib, func.name)
                 f.restype = func.restype
                 f.argtypes = func.argtypes
@@ -328,10 +349,14 @@
             error_str = self.ncclGetErrorString(result)
             raise RuntimeError(f"NCCL error: {error_str}")
 
-    def ncclGetVersion(self) -> str:
+    def ncclGetRawVersion(self) -> int:
         version = ctypes.c_int()
         self.NCCL_CHECK(self._funcs["ncclGetVersion"](ctypes.byref(version)))
-        version_str = str(version.value)
+        # something like 21903
+        return version.value
+
+    def ncclGetVersion(self) -> str:
+        version_str = str(self.ncclGetRawVersion())
         # something like 21903 --> "2.19.3"
         major = version_str[0].lstrip("0")
         minor = version_str[1:3].lstrip("0")
@@ -460,6 +485,20 @@
     def ncclCommDestroy(self, comm: ncclComm_t) -> None:
         self.NCCL_CHECK(self._funcs["ncclCommDestroy"](comm))
 
+    def ncclCommWindowRegister(
+        self, comm: ncclComm_t, buff: buffer_type, size: int, win_flags: int
+    ) -> ncclWindow_t:
+        window = ncclWindow_t()
+        self.NCCL_CHECK(
+            self._funcs["ncclCommWindowRegister"](
+                comm, buff, size, ctypes.byref(window), win_flags
+            )
+        )
+        return window
+
+    def ncclCommWindowDeregister(self, comm: ncclComm_t, window: ncclWindow_t) -> None:
+        self.NCCL_CHECK(self._funcs["ncclCommWindowDeregister"](comm, window))
+
 
 __all__ = [
     "NCCLLibrary",
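
ncclGetRawVersion() returns NCCL's integer version code, which is what the allocator's nccl_version >= 22703 check above compares against. A small illustration of that encoding (my sketch, not part of the package):

    def decode_nccl_version(raw: int) -> str:
        # e.g. 21903 -> "2.19.3", 22703 -> "2.27.3"
        major, rest = divmod(raw, 10000)
        minor, patch = divmod(rest, 100)
        return f"{major}.{minor}.{patch}"

    assert decode_nccl_version(22703) == "2.27.3"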
sglang/srt/distributed/parallel_state.py
CHANGED
@@ -497,6 +497,17 @@ class GroupCoordinator:
         if self.npu_communicator is not None and not self.npu_communicator.disabled:
             return self.npu_communicator.all_reduce(input_)
 
+        if (
+            self.pynccl_comm is not None
+            and hasattr(input_, "symmetric_memory")
+            and input_.symmetric_memory
+        ):
+            with self.pynccl_comm.change_state(
+                enable=True, stream=torch.cuda.current_stream()
+            ):
+                self.pynccl_comm.all_reduce(input_)
+            return input_
+
         outplace_all_reduce_method = None
         if (
             self.qr_comm is not None
@@ -639,17 +650,19 @@ class GroupCoordinator:
             output_size, dtype=input_.dtype, device=input_.device
         )
 
+        # All-gather.
+        if input_.is_cpu and is_shm_available(
+            input_.dtype, self.world_size, self.local_size
+        ):
+            return torch.ops.sgl_kernel.shm_allgather(input_, dim)
+
         if input_.is_cpu:
-
-
-
-
-
-            )
-            return output_tensor
+            torch.distributed.all_gather_into_tensor(
+                output_tensor, input_, group=self.device_group
+            )
+        else:
+            self.all_gather_into_tensor(output_tensor, input_)
 
-        # All-gather.
-        self.all_gather_into_tensor(output_tensor, input_)
         # Reshape
         output_tensor = output_tensor.reshape((world_size,) + input_size)
         output_tensor = output_tensor.movedim(0, dim)
sglang/srt/entrypoints/context.py
ADDED
@@ -0,0 +1,244 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copied from vLLM
+import json
+import logging
+from abc import ABC, abstractmethod
+from typing import Union
+
+logger = logging.getLogger(__name__)
+
+try:
+    from mcp import ClientSession
+except ImportError:
+    logger.warning("Ignoring mcp import error")
+
+from openai_harmony import Author, Message, Role, StreamState, TextContent
+
+from sglang.srt.entrypoints.harmony_utils import (
+    get_encoding,
+    get_streamable_parser_for_assistant,
+    render_for_completion,
+)
+from sglang.srt.entrypoints.tool import Tool
+
+
+class ConversationContext(ABC):
+
+    @abstractmethod
+    def append_output(self, output) -> None:
+        pass
+
+    @abstractmethod
+    async def call_tool(self) -> list[Message]:
+        pass
+
+    @abstractmethod
+    def need_builtin_tool_call(self) -> bool:
+        pass
+
+    @abstractmethod
+    def render_for_completion(self) -> list[int]:
+        pass
+
+
+class SimpleContext(ConversationContext):
+
+    def __init__(self):
+        self.last_output = None
+
+    def append_output(self, output) -> None:
+        self.last_output = output
+
+    def need_builtin_tool_call(self) -> bool:
+        return False
+
+    async def call_tool(self) -> list[Message]:
+        raise NotImplementedError("Should not be called.")
+
+    def render_for_completion(self) -> list[int]:
+        raise NotImplementedError("Should not be called.")
+
+
+class HarmonyContext(ConversationContext):
+
+    def __init__(
+        self,
+        messages: list,
+        tool_sessions: dict[str, Union["ClientSession", Tool]],
+    ):
+        # TODO: Remove the hack of Union[ClientSession, Tool] by using MCP
+        # when demo.
+        self._messages = messages
+        self.tool_sessions = tool_sessions
+
+        self.parser = get_streamable_parser_for_assistant()
+        self.num_init_messages = len(messages)
+        # TODO
+        self.num_prompt_tokens = 0
+        self.num_cached_tokens = 0
+        self.num_output_tokens = 0
+        self.num_reasoning_tokens = 0
+
+    def append_output(self, output) -> None:
+        if isinstance(output, dict) and "output_ids" in output:
+            output_token_ids = output["output_ids"]
+
+            # TODO: REMOVE here:
+            # Very hacky, find the first occurrence of token 200006 and cut from there
+            try:
+                start_index = output_token_ids.index(200006)
+                output_token_ids = output_token_ids[start_index:]
+            except ValueError:
+                pass
+
+            for token_id in output_token_ids:
+                self.parser.process(token_id)
+            output_msgs = self.parser.messages
+
+            meta_info = output["meta_info"]
+
+            if isinstance(meta_info, dict):
+                if "prompt_token_ids" in meta_info:
+                    self.num_prompt_tokens = meta_info["prompt_tokens"]
+                if "cached_tokens" in meta_info:
+                    self.num_cached_tokens = meta_info["cached_tokens"]
+                if "completion_tokens" in meta_info:
+                    self.num_output_tokens += meta_info["completion_tokens"]
+
+        else:
+            output_msgs = output
+
+        self._messages.extend(output_msgs)
+
+    @property
+    def messages(self) -> list:
+        return self._messages
+
+    def need_builtin_tool_call(self) -> bool:
+        last_msg = self.messages[-1]
+        recipient = last_msg.recipient
+        return recipient is not None and (
+            recipient.startswith("browser.") or recipient.startswith("python")
+        )
+
+    async def call_tool(self) -> list[Message]:
+        if not self.messages:
+            return []
+        last_msg = self.messages[-1]
+        recipient = last_msg.recipient
+        if recipient is not None:
+            if recipient.startswith("browser."):
+                return await self.call_search_tool(
+                    self.tool_sessions["browser"], last_msg
+                )
+            elif recipient.startswith("python"):
+                return await self.call_python_tool(
+                    self.tool_sessions["python"], last_msg
+                )
+        raise ValueError("No tool call found")
+
+    def render_for_completion(self) -> list[int]:
+        return render_for_completion(self.messages)
+
+    async def call_search_tool(
+        self, tool_session: Union["ClientSession", Tool], last_msg: Message
+    ) -> list[Message]:
+        if isinstance(tool_session, Tool):
+            return await tool_session.get_result(self)
+        tool_name = last_msg.recipient.split(".")[1]
+        args = json.loads(last_msg.content[0].text)
+        result = await tool_session.call_tool(tool_name, args)
+        result_str = result.content[0].text
+        content = TextContent(text=result_str)
+        author = Author(role=Role.TOOL, name=last_msg.recipient)
+        return [Message(author=author, content=[content], recipient=Role.ASSISTANT)]
+
+    async def call_python_tool(
+        self, tool_session: Union["ClientSession", Tool], last_msg: Message
+    ) -> list[Message]:
+        if isinstance(tool_session, Tool):
+            return await tool_session.get_result(self)
+        param = {
+            "code": last_msg.content[0].text,
+        }
+        result = await tool_session.call_tool("python", param)
+        result_str = result.content[0].text
+
+        content = TextContent(text=result_str)
+        author = Author(role=Role.TOOL, name="python")
+
+        return [
+            Message(
+                author=author,
+                content=[content],
+                channel=last_msg.channel,
+                recipient=Role.ASSISTANT,
+            )
+        ]
+
+
+class StreamingHarmonyContext(HarmonyContext):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.last_output = None
+
+        self.parser = get_streamable_parser_for_assistant()
+        self.encoding = get_encoding()
+        self.last_tok = None
+
+    @property
+    def messages(self) -> list:
+        return self.parser.messages
+
+    def append_output(self, output) -> None:
+        if isinstance(output, dict) and "output_ids" in output:
+            # RequestOutput from SGLang with outputs
+            output_token_ids = output["output_ids"]
+
+            # TODO: REMOVE here:
+            # Very hacky, find the first occurrence of token 200006 and cut from there
+            # Find the first occurrence of token 200006 and cut from there
+            try:
+                start_index = output_token_ids.index(200006)
+                output_token_ids = output_token_ids[start_index:]
+            except ValueError:
+                pass
+
+            for token_id in output_token_ids:
+                self.parser.process(token_id)
+
+        else:
+            # Handle the case of tool output in direct message format
+            assert len(output) == 1, "Tool output should be a single message"
+            msg = output[0]
+            # Sometimes the recipient is not set for tool messages,
+            # so we set it to "assistant"
+            if msg.author.role == Role.TOOL and msg.recipient is None:
+                msg.recipient = "assistant"
+            toks = self.encoding.render(msg)
+            for tok in toks:
+                self.parser.process(tok)
+            self.last_tok = toks[-1]
+
+    def is_expecting_start(self) -> bool:
+        return self.parser.state == StreamState.EXPECT_START
+
+    def is_assistant_action_turn(self) -> bool:
+        return self.last_tok in self.encoding.stop_tokens_for_assistant_actions()
+
+    def render_for_completion(self) -> list[int]:
+        # now this list of tokens as next turn's starting tokens
+        # `<|start|>assistant``,
+        # we need to process them in parser.
+        rendered_tokens = super().render_for_completion()
+
+        last_n = -1
+        to_process = []
+        while rendered_tokens[last_n] != self.last_tok:
+            to_process.append(rendered_tokens[last_n])
+            last_n -= 1
+        for tok in reversed(to_process):
+            self.parser.process(tok)
+
+        return rendered_tokens
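
These context classes back the new Responses serving path (serving_responses.py, harmony_utils.py, tool_server.py in the file list). A rough sketch of the generate-then-tool-call loop they are designed for (names assumed for illustration; the real loop lives in serving_responses.py):

    async def run_agentic_turns(ctx: HarmonyContext, generate) -> list:
        # `generate` is an assumed async callable that takes rendered token ids
        # and returns a dict with "output_ids" and "meta_info".
        while True:
            output = await generate(input_ids=ctx.render_for_completion())
            ctx.append_output(output)
            if not ctx.need_builtin_tool_call():
                break
            tool_msgs = await ctx.call_tool()  # dispatches to the browser.* or python tool session
            ctx.append_output(tool_msgs)
        return ctx.messages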
sglang/srt/entrypoints/engine.py
CHANGED
@@ -492,12 +492,13 @@ class Engine(EngineBase):
             self.tokenizer_manager.get_weights_by_name(obj, None)
         )
 
-    def load_lora_adapter(self, lora_name: str, lora_path: str):
+    def load_lora_adapter(self, lora_name: str, lora_path: str, pinned: bool = False):
         """Load a new LoRA adapter without re-launching the engine."""
 
         obj = LoadLoRAAdapterReqInput(
             lora_name=lora_name,
             lora_path=lora_path,
+            pinned=pinned,
         )
 
         loop = asyncio.get_event_loop()
@@ -623,8 +624,9 @@
 def _set_envs_and_config(server_args: ServerArgs):
     # Set global environments
     os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
-    os.environ["NCCL_CUMEM_ENABLE"] =
-
+    os.environ["NCCL_CUMEM_ENABLE"] = str(int(server_args.enable_symm_mem))
+    if not server_args.enable_symm_mem:
+        os.environ["NCCL_NVLS_ENABLE"] = str(int(server_args.enable_nccl_nvls))
     os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
     os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4"
     os.environ["CUDA_MODULE_LOADING"] = "AUTO"
@@ -640,7 +642,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.2.
+            "0.2.10",
             "Please uninstall the old version and "
            "reinstall the latest version by following the instructions "
            "at https://docs.flashinfer.ai/installation.html.",
@@ -648,7 +650,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda:
         assert_pkg_version(
             "sgl-kernel",
-            "0.2
+            "0.3.2",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )
 
@@ -731,6 +733,7 @@
                 pp_rank,
                 None,
                 writer,
+                None,
             ),
         )
 
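
The new pinned flag is threaded through LoadLoRAAdapterReqInput down to the LoRA manager and memory pool (both touched in this release, per the file list). A usage sketch against the offline Engine API (adapter name and path are placeholders; enable_lora=True is an assumed launch option for dynamic loading):

    import sglang as sgl

    engine = sgl.Engine(model_path="meta-llama/Llama-3.1-8B-Instruct", enable_lora=True)
    engine.load_lora_adapter(lora_name="demo-adapter", lora_path="/path/to/adapter", pinned=True)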