sglang 0.4.10.post2__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +8 -3
- sglang/bench_one_batch.py +119 -17
- sglang/lang/chat_template.py +18 -0
- sglang/srt/bench_utils.py +137 -0
- sglang/srt/configs/model_config.py +42 -7
- sglang/srt/conversation.py +9 -5
- sglang/srt/disaggregation/base/conn.py +5 -2
- sglang/srt/disaggregation/decode.py +14 -4
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +3 -0
- sglang/srt/disaggregation/mooncake/conn.py +286 -160
- sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/distributed/parallel_state.py +15 -11
- sglang/srt/entrypoints/context.py +227 -0
- sglang/srt/entrypoints/engine.py +15 -9
- sglang/srt/entrypoints/harmony_utils.py +372 -0
- sglang/srt/entrypoints/http_server.py +74 -4
- sglang/srt/entrypoints/openai/protocol.py +218 -1
- sglang/srt/entrypoints/openai/serving_chat.py +41 -11
- sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
- sglang/srt/entrypoints/openai/tool_server.py +175 -0
- sglang/srt/entrypoints/tool.py +87 -0
- sglang/srt/eplb/expert_location.py +5 -1
- sglang/srt/function_call/ebnf_composer.py +1 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +331 -0
- sglang/srt/function_call/kimik2_detector.py +3 -3
- sglang/srt/function_call/qwen3_coder_detector.py +219 -9
- sglang/srt/hf_transformers_utils.py +30 -3
- sglang/srt/jinja_template_utils.py +14 -1
- sglang/srt/layers/attention/aiter_backend.py +375 -115
- sglang/srt/layers/attention/ascend_backend.py +3 -0
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
- sglang/srt/layers/attention/flashattention_backend.py +18 -0
- sglang/srt/layers/attention/flashinfer_backend.py +52 -13
- sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +85 -14
- sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
- sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
- sglang/srt/layers/attention/trtllm_mla_backend.py +119 -22
- sglang/srt/layers/attention/vision.py +22 -6
- sglang/srt/layers/attention/wave_backend.py +627 -0
- sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
- sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
- sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
- sglang/srt/layers/communicator.py +29 -14
- sglang/srt/layers/dp_attention.py +12 -0
- sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
- sglang/srt/layers/linear.py +3 -7
- sglang/srt/layers/moe/cutlass_moe.py +12 -3
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
- sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
- sglang/srt/layers/moe/ep_moe/layer.py +135 -73
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
- sglang/srt/layers/moe/fused_moe_triton/layer.py +412 -33
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +188 -3
- sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
- sglang/srt/layers/moe/topk.py +16 -4
- sglang/srt/layers/moe/utils.py +16 -0
- sglang/srt/layers/quantization/__init__.py +27 -3
- sglang/srt/layers/quantization/fp4.py +557 -0
- sglang/srt/layers/quantization/fp8.py +3 -6
- sglang/srt/layers/quantization/fp8_kernel.py +277 -0
- sglang/srt/layers/quantization/fp8_utils.py +51 -10
- sglang/srt/layers/quantization/modelopt_quant.py +258 -68
- sglang/srt/layers/quantization/mxfp4.py +654 -0
- sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
- sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
- sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
- sglang/srt/layers/quantization/quark/utils.py +107 -0
- sglang/srt/layers/quantization/unquant.py +60 -6
- sglang/srt/layers/quantization/w4afp8.py +21 -12
- sglang/srt/layers/quantization/w8a8_int8.py +48 -34
- sglang/srt/layers/rotary_embedding.py +506 -3
- sglang/srt/layers/utils.py +9 -0
- sglang/srt/layers/vocab_parallel_embedding.py +8 -3
- sglang/srt/lora/backend/base_backend.py +3 -23
- sglang/srt/lora/layers.py +60 -114
- sglang/srt/lora/lora.py +17 -62
- sglang/srt/lora/lora_manager.py +82 -62
- sglang/srt/lora/lora_registry.py +23 -11
- sglang/srt/lora/mem_pool.py +63 -68
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/utils.py +25 -58
- sglang/srt/managers/cache_controller.py +75 -58
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +20 -8
- sglang/srt/managers/mm_utils.py +6 -13
- sglang/srt/managers/multimodal_processor.py +1 -1
- sglang/srt/managers/schedule_batch.py +61 -25
- sglang/srt/managers/schedule_policy.py +6 -6
- sglang/srt/managers/scheduler.py +41 -19
- sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
- sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
- sglang/srt/managers/scheduler_recv_skipper.py +37 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
- sglang/srt/managers/template_manager.py +35 -1
- sglang/srt/managers/tokenizer_manager.py +47 -30
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
- sglang/srt/mem_cache/allocator.py +61 -87
- sglang/srt/mem_cache/hicache_storage.py +1 -1
- sglang/srt/mem_cache/hiradix_cache.py +80 -22
- sglang/srt/mem_cache/lora_radix_cache.py +421 -0
- sglang/srt/mem_cache/memory_pool_host.py +34 -36
- sglang/srt/mem_cache/multimodal_cache.py +33 -13
- sglang/srt/mem_cache/radix_cache.py +2 -5
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
- sglang/srt/model_executor/cuda_graph_runner.py +29 -9
- sglang/srt/model_executor/forward_batch_info.py +61 -19
- sglang/srt/model_executor/model_runner.py +148 -37
- sglang/srt/model_loader/loader.py +18 -6
- sglang/srt/model_loader/weight_utils.py +10 -0
- sglang/srt/models/bailing_moe.py +425 -0
- sglang/srt/models/deepseek_v2.py +137 -59
- sglang/srt/models/ernie4.py +426 -0
- sglang/srt/models/ernie4_eagle.py +203 -0
- sglang/srt/models/gemma2.py +0 -34
- sglang/srt/models/gemma3n_mm.py +38 -0
- sglang/srt/models/glm4.py +6 -0
- sglang/srt/models/glm4_moe.py +28 -16
- sglang/srt/models/glm4v.py +589 -0
- sglang/srt/models/glm4v_moe.py +400 -0
- sglang/srt/models/gpt_oss.py +1251 -0
- sglang/srt/models/granite.py +0 -25
- sglang/srt/models/llama.py +0 -25
- sglang/srt/models/llama4.py +1 -1
- sglang/srt/models/qwen2.py +6 -0
- sglang/srt/models/qwen2_5_vl.py +7 -3
- sglang/srt/models/qwen2_audio.py +10 -9
- sglang/srt/models/qwen2_moe.py +6 -0
- sglang/srt/models/qwen3.py +0 -24
- sglang/srt/models/qwen3_moe.py +32 -6
- sglang/srt/models/registry.py +1 -1
- sglang/srt/models/step3_vl.py +9 -0
- sglang/srt/models/torch_native_llama.py +0 -24
- sglang/srt/models/transformers.py +2 -5
- sglang/srt/multimodal/processors/base_processor.py +23 -13
- sglang/srt/multimodal/processors/glm4v.py +132 -0
- sglang/srt/multimodal/processors/qwen_audio.py +4 -2
- sglang/srt/multimodal/processors/step3_vl.py +3 -1
- sglang/srt/reasoning_parser.py +332 -37
- sglang/srt/server_args.py +186 -75
- sglang/srt/speculative/eagle_worker.py +16 -0
- sglang/srt/two_batch_overlap.py +169 -9
- sglang/srt/utils.py +41 -5
- sglang/srt/weight_sync/tensor_bucket.py +106 -0
- sglang/test/attention/test_trtllm_mla_backend.py +186 -36
- sglang/test/doc_patch.py +59 -0
- sglang/test/few_shot_gsm8k.py +1 -1
- sglang/test/few_shot_gsm8k_engine.py +1 -1
- sglang/test/run_eval.py +4 -1
- sglang/test/runners.py +2 -2
- sglang/test/simple_eval_common.py +6 -0
- sglang/test/simple_eval_gpqa.py +2 -0
- sglang/test/test_fp4_moe.py +118 -36
- sglang/test/test_utils.py +1 -1
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/METADATA +36 -38
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/RECORD +174 -141
- sglang/srt/lora/backend/flashinfer_backend.py +0 -131
- /sglang/{api.py → lang/api.py} +0 -0
- /sglang/{lang/backend → srt/layers/quantization/quark}/__init__.py +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/WHEEL +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/top_level.txt +0 -0
sglang/srt/disaggregation/mooncake/transfer_engine.py
CHANGED
@@ -51,6 +51,35 @@ class MooncakeTransferEngine:
         if ret_value != 0:
             logger.debug("Mooncake memory deregistration %s failed.", ptr)
 
+    def batch_register(self, ptrs: List[int], lengths: List[int]) -> int:
+        """Batch register multiple memory regions."""
+        try:
+            ret_value = self.engine.batch_register_memory(ptrs, lengths)
+        except Exception:
+            # Mark batch register as failed
+            ret_value = -1
+            if not hasattr(self.engine, "batch_register_memory"):
+                raise RuntimeError(
+                    "Mooncake's batch register requires a newer version of mooncake-transfer-engine. "
+                    "Please upgrade Mooncake."
+                )
+
+        if ret_value != 0:
+            logger.debug("Mooncake batch memory registration failed.")
+        return ret_value
+
+    def batch_deregister(self, ptrs: List[int]) -> int:
+        """Batch deregister multiple memory regions."""
+        try:
+            ret_value = self.engine.batch_unregister_memory(ptrs)
+        except Exception:
+            # Mark batch deregister as failed
+            ret_value = -1
+
+        if ret_value != 0:
+            logger.debug("Mooncake batch memory deregistration failed.")
+        return ret_value
+
     def initialize(
         self,
         hostname: str,
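With these additions, callers that previously registered each KV buffer pointer one at a time can hand the whole list to the engine in a single call. Below is a minimal caller-side sketch, not part of the diff: it assumes an initialized MooncakeTransferEngine and falls back to the class's pre-existing per-region register(ptr, length) helper (the fallback helper is an assumption based on the surrounding class, not shown in this hunk).

# Illustrative only, not part of the diff.
def register_kv_buffers(engine, kv_data_ptrs, kv_data_lens):
    try:
        # One engine call for all regions (new in this release).
        return engine.batch_register(kv_data_ptrs, kv_data_lens)
    except RuntimeError:
        # Older mooncake-transfer-engine without batch_register_memory:
        # fall back to per-region registration.
        for ptr, length in zip(kv_data_ptrs, kv_data_lens):
            engine.register(ptr, length)
        return 0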
sglang/srt/disaggregation/prefill.py
CHANGED
@@ -103,6 +103,8 @@ class PrefillBootstrapQueue:
         kv_args_class = get_kv_class(self.transfer_backend, KVClassType.KVARGS)
         kv_args = kv_args_class()
         kv_args.engine_rank = self.tp_rank
+        kv_args.pp_rank = self.pp_rank
+        kv_args.system_dp_rank = self.scheduler.dp_rank
         kv_args.decode_tp_size = self.decode_tp_size // self.decode_dp_size
         kv_args.prefill_pp_size = self.pp_size
         kv_data_ptrs, kv_data_lens, kv_item_lens = (
sglang/srt/distributed/parallel_state.py
CHANGED
@@ -50,6 +50,8 @@ from sglang.srt.utils import (
     supports_custom_op,
 )
 
+_is_npu = is_npu()
+
 
 @dataclass
 class GraphCaptureContext:
@@ -591,7 +593,7 @@ class GroupCoordinator:
         )
 
     def all_gather_into_tensor(self, output: torch.Tensor, input: torch.Tensor):
-        if not supports_custom_op():
+        if _is_npu or not supports_custom_op():
             self._all_gather_into_tensor(output, input)
         else:
             torch.ops.sglang.reg_all_gather_into_tensor(
@@ -650,17 +652,19 @@ class GroupCoordinator:
             output_size, dtype=input_.dtype, device=input_.device
         )
 
+        # All-gather.
+        if input_.is_cpu and is_shm_available(
+            input_.dtype, self.world_size, self.local_size
+        ):
+            return torch.ops.sgl_kernel.shm_allgather(input_, dim)
+
         if input_.is_cpu:
-            if is_shm_available(input_.dtype, self.world_size, self.local_size):
-                return torch.ops.sgl_kernel.shm_allgather(input_, dim)
-            else:
-                torch.distributed.all_gather_into_tensor(
-                    output_tensor, input_, group=self.device_group
-                )
-            return output_tensor
+            torch.distributed.all_gather_into_tensor(
+                output_tensor, input_, group=self.device_group
+            )
+        else:
+            self.all_gather_into_tensor(output_tensor, input_)
 
-        # All-gather.
-        self.all_gather_into_tensor(output_tensor, input_)
         # Reshape
         output_tensor = output_tensor.reshape((world_size,) + input_size)
         output_tensor = output_tensor.movedim(0, dim)
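The rewritten method makes the dispatch order explicit: a CPU tensor takes the fused shared-memory all-gather when available, a plain CPU tensor falls back to torch.distributed on the CPU group, and device tensors keep using the custom-op wrapper. A simplified standalone restatement of that decision order follows; it is a sketch, not the full method, and it assumes is_shm_available comes from sglang.srt.utils as used in the hunk above.

# Illustrative only, not part of the diff.
import torch
from sglang.srt.utils import is_shm_available  # assumed import location

def all_gather_dispatch(group, input_, output_tensor, dim):
    # 1) CPU fast path: fused shm all-gather from sgl-kernel.
    if input_.is_cpu and is_shm_available(
        input_.dtype, group.world_size, group.local_size
    ):
        return torch.ops.sgl_kernel.shm_allgather(input_, dim)
    # 2) CPU without shm support: plain torch.distributed all-gather.
    if input_.is_cpu:
        torch.distributed.all_gather_into_tensor(
            output_tensor, input_, group=group.device_group
        )
    # 3) Device tensor: custom-op wrapper (skipped on NPU, see the hunk above).
    else:
        group.all_gather_into_tensor(output_tensor, input_)
    return output_tensor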
@@ -1125,7 +1129,7 @@ def init_model_parallel_group(
         group_ranks=group_ranks,
         local_rank=local_rank,
         torch_distributed_backend=backend,
-        use_pynccl=not is_npu(),
+        use_pynccl=not _is_npu,
        use_pymscclpp=use_mscclpp_allreduce,
        use_custom_allreduce=use_custom_allreduce,
        use_hpu_communicator=True,
sglang/srt/entrypoints/context.py
ADDED
@@ -0,0 +1,227 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copied from vLLM: https://github.com/zyongye/vllm/blob/6a70830065701b163e36a86fd331b41b5feac401/vllm/entrypoints/context.py
+import json
+import logging
+from abc import ABC, abstractmethod
+from typing import Union
+
+logger = logging.getLogger(__name__)
+
+try:
+    from mcp import ClientSession
+except ImportError as e:
+    mcp = e
+
+from openai_harmony import Author, Message, Role, StreamState, TextContent
+
+from sglang.srt.entrypoints.harmony_utils import (
+    get_encoding,
+    get_streamable_parser_for_assistant,
+    render_for_completion,
+)
+from sglang.srt.entrypoints.tool import Tool
+
+
+class ConversationContext(ABC):
+
+    @abstractmethod
+    def append_output(self, output) -> None:
+        pass
+
+    @abstractmethod
+    async def call_tool(self) -> list[Message]:
+        pass
+
+    @abstractmethod
+    def need_builtin_tool_call(self) -> bool:
+        pass
+
+    @abstractmethod
+    def render_for_completion(self) -> list[int]:
+        pass
+
+
+class SimpleContext(ConversationContext):
+
+    def __init__(self):
+        self.last_output = None
+
+    def append_output(self, output) -> None:
+        self.last_output = output
+
+    def need_builtin_tool_call(self) -> bool:
+        return False
+
+    async def call_tool(self) -> list[Message]:
+        raise NotImplementedError("Should not be called.")
+
+    def render_for_completion(self) -> list[int]:
+        raise NotImplementedError("Should not be called.")
+
+
+class HarmonyContext(ConversationContext):
+
+    def __init__(
+        self,
+        messages: list,
+        tool_sessions: dict[str, Union["ClientSession", Tool]],
+    ):
+        # TODO: Remove the hack of Union[ClientSession, Tool] by using MCP
+        # when demo.
+        self._messages = messages
+        self.tool_sessions = tool_sessions
+
+        self.parser = get_streamable_parser_for_assistant()
+        self.num_init_messages = len(messages)
+        # TODO
+        self.num_prompt_tokens = 0
+        self.num_cached_tokens = 0
+        self.num_output_tokens = 0
+        self.num_reasoning_tokens = 0
+
+    def append_output(self, output) -> None:
+        if isinstance(output, dict) and "output_ids" in output:
+            output_token_ids = output["output_ids"]
+
+            for token_id in output_token_ids:
+                self.parser.process(token_id)
+            output_msgs = self.parser.messages
+
+            meta_info = output["meta_info"]
+
+            if isinstance(meta_info, dict):
+                if "prompt_token_ids" in meta_info:
+                    self.num_prompt_tokens = meta_info["prompt_tokens"]
+                if "cached_tokens" in meta_info:
+                    self.num_cached_tokens = meta_info["cached_tokens"]
+                if "completion_tokens" in meta_info:
+                    self.num_output_tokens += meta_info["completion_tokens"]
+
+        else:
+            output_msgs = output
+
+        self._messages.extend(output_msgs)
+
+    @property
+    def messages(self) -> list:
+        return self._messages
+
+    def need_builtin_tool_call(self) -> bool:
+        last_msg = self.messages[-1]
+        recipient = last_msg.recipient
+        return recipient is not None and (
+            recipient.startswith("browser.") or recipient.startswith("python")
+        )
+
+    async def call_tool(self) -> list[Message]:
+        if not self.messages:
+            return []
+        last_msg = self.messages[-1]
+        recipient = last_msg.recipient
+        if recipient is not None:
+            if recipient.startswith("browser."):
+                return await self.call_search_tool(
+                    self.tool_sessions["browser"], last_msg
+                )
+            elif recipient.startswith("python"):
+                return await self.call_python_tool(
+                    self.tool_sessions["python"], last_msg
+                )
+        raise ValueError("No tool call found")
+
+    def render_for_completion(self) -> list[int]:
+        return render_for_completion(self.messages)
+
+    async def call_search_tool(
+        self, tool_session: Union["ClientSession", Tool], last_msg: Message
+    ) -> list[Message]:
+        if isinstance(tool_session, Tool):
+            return await tool_session.get_result(self)
+        tool_name = last_msg.recipient.split(".")[1]
+        args = json.loads(last_msg.content[0].text)
+        result = await tool_session.call_tool(tool_name, args)
+        result_str = result.content[0].text
+        content = TextContent(text=result_str)
+        author = Author(role=Role.TOOL, name=last_msg.recipient)
+        return [Message(author=author, content=[content], recipient=Role.ASSISTANT)]
+
+    async def call_python_tool(
+        self, tool_session: Union["ClientSession", Tool], last_msg: Message
+    ) -> list[Message]:
+        if isinstance(tool_session, Tool):
+            return await tool_session.get_result(self)
+        param = {
+            "code": last_msg.content[0].text,
+        }
+        result = await tool_session.call_tool("python", param)
+        result_str = result.content[0].text
+
+        content = TextContent(text=result_str)
+        author = Author(role=Role.TOOL, name="python")
+
+        return [
+            Message(
+                author=author,
+                content=[content],
+                channel=last_msg.channel,
+                recipient=Role.ASSISTANT,
+            )
+        ]
+
+
+class StreamingHarmonyContext(HarmonyContext):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.last_output = None
+
+        self.parser = get_streamable_parser_for_assistant()
+        self.encoding = get_encoding()
+        self.last_tok = None
+
+    @property
+    def messages(self) -> list:
+        return self.parser.messages
+
+    def append_output(self, output) -> None:
+        if isinstance(output, dict) and "output_ids" in output:
+            # RequestOutput from SGLang with outputs
+            output_token_ids = output["output_ids"]
+
+            for token_id in output_token_ids:
+                self.parser.process(token_id)
+
+        else:
+            # Handle the case of tool output in direct message format
+            assert len(output) == 1, "Tool output should be a single message"
+            msg = output[0]
+            # Sometimes the recipient is not set for tool messages,
+            # so we set it to "assistant"
+            if msg.author.role == Role.TOOL and msg.recipient is None:
+                msg.recipient = "assistant"
+            toks = self.encoding.render(msg)
+            for tok in toks:
+                self.parser.process(tok)
+            self.last_tok = toks[-1]
+
+    def is_expecting_start(self) -> bool:
+        return self.parser.state == StreamState.EXPECT_START
+
+    def is_assistant_action_turn(self) -> bool:
+        return self.last_tok in self.encoding.stop_tokens_for_assistant_actions()
+
+    def render_for_completion(self) -> list[int]:
+        # Rendering adds next turn's starting tokens
+        # (e.g. `<|start|>assistant`),
+        # so we need to process them in the parser.
+        rendered_tokens = super().render_for_completion()
+
+        last_n = -1
+        to_process = []
+        while rendered_tokens[last_n] != self.last_tok:
+            to_process.append(rendered_tokens[last_n])
+            last_n -= 1
+        for tok in reversed(to_process):
+            self.parser.process(tok)
+
+        return rendered_tokens
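To see how these pieces fit together, here is a hedged sketch of a serving loop built on HarmonyContext. generate_step is a hypothetical stand-in for the engine call, and the dict it returns follows the {"output_ids": ..., "meta_info": ...} shape that append_output parses above; the real serving path lives in the new serving_responses.py.

# Illustrative only, not part of the diff.
async def run_turns(context: HarmonyContext, generate_step, max_turns: int = 8):
    for _ in range(max_turns):
        token_ids = context.render_for_completion()  # token ids for the next turn
        output = await generate_step(token_ids)      # hypothetical engine call
        context.append_output(output)                # parses output["output_ids"]
        if not context.need_builtin_tool_call():
            break                                    # model produced a final answer
        tool_msgs = await context.call_tool()        # browser.* or python tool
        context.append_output(tool_msgs)             # feed tool result back
    return context.messages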
sglang/srt/entrypoints/engine.py
CHANGED
@@ -67,6 +67,7 @@ from sglang.srt.utils import (
     MultiprocessingSerializer,
     assert_pkg_version,
     configure_logger,
+    get_bool_env_var,
     get_zmq_socket,
     is_cuda,
     kill_process_tree,
@@ -259,7 +260,7 @@ class Engine(EngineBase):
             f"data_parallel_rank must be in range [0, {self.server_args.dp_size-1}]"
         )
 
-        logger.
+        logger.debug(f"data_parallel_rank: {data_parallel_rank}")
         obj = GenerateReqInput(
             text=prompt,
             input_ids=input_ids,
@@ -450,15 +451,20 @@ class Engine(EngineBase):
     ):
         """Update weights from distributed source. If there are going to be more updates, set `flush_cache` to be false
         to avoid duplicated cache cleaning operation."""
-        obj = UpdateWeightsFromTensorReqInput(
-            serialized_named_tensors=[
+        if load_format == "flattened_bucket":
+            serialized_named_tensors = named_tensors
+        else:
+            serialized_named_tensors = [
                 MultiprocessingSerializer.serialize(named_tensors)
                 for _ in range(self.server_args.tp_size)
-            ],
+            ]
+        obj = UpdateWeightsFromTensorReqInput(
+            serialized_named_tensors=serialized_named_tensors,
             load_format=load_format,
             flush_cache=flush_cache,
         )
         loop = asyncio.get_event_loop()
+
         return loop.run_until_complete(
             self.tokenizer_manager.update_weights_from_tensor(obj, None)
        )
@@ -492,12 +498,13 @@ class Engine(EngineBase):
             self.tokenizer_manager.get_weights_by_name(obj, None)
         )
 
-    def load_lora_adapter(self, lora_name: str, lora_path: str):
+    def load_lora_adapter(self, lora_name: str, lora_path: str, pinned: bool = False):
         """Load a new LoRA adapter without re-launching the engine."""
 
         obj = LoadLoRAAdapterReqInput(
             lora_name=lora_name,
             lora_path=lora_path,
+            pinned=pinned,
         )
 
         loop = asyncio.get_event_loop()
@@ -626,7 +633,6 @@ def _set_envs_and_config(server_args: ServerArgs):
     os.environ["NCCL_CUMEM_ENABLE"] = str(int(server_args.enable_symm_mem))
     if not server_args.enable_symm_mem:
         os.environ["NCCL_NVLS_ENABLE"] = str(int(server_args.enable_nccl_nvls))
-    os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
     os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4"
     os.environ["CUDA_MODULE_LOADING"] = "AUTO"
 
@@ -641,15 +647,15 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.2.
+            "0.2.11.post1",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
         )
-    if _is_cuda:
+    if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
         assert_pkg_version(
             "sgl-kernel",
-            "0.
+            "0.3.4",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )
 
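The version pins were bumped, and the sgl-kernel check can now be bypassed with the new SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK switch (read via get_bool_env_var, so "1" or "true" enables it), which is useful when running a locally built kernel. A minimal sketch:

# Illustrative only, not part of the diff.
import os

# Must be set before the engine performs its startup checks.
os.environ["SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"] = "1"

import sglang as sgl

engine = sgl.Engine(model_path="...")  # placeholder; skips the sgl-kernel pin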