sglang 0.4.6.post1__py3-none-any.whl → 0.4.6.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +3 -11
- sglang/bench_serving.py +149 -1
- sglang/check_env.py +3 -3
- sglang/lang/chat_template.py +44 -0
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/deepseekvl2.py +3 -0
- sglang/srt/configs/device_config.py +1 -1
- sglang/srt/configs/internvl.py +696 -0
- sglang/srt/configs/janus_pro.py +3 -0
- sglang/srt/configs/kimi_vl.py +38 -0
- sglang/srt/configs/kimi_vl_moonvit.py +32 -0
- sglang/srt/configs/model_config.py +32 -0
- sglang/srt/constrained/xgrammar_backend.py +11 -19
- sglang/srt/conversation.py +151 -3
- sglang/srt/disaggregation/decode.py +4 -1
- sglang/srt/disaggregation/mini_lb.py +74 -23
- sglang/srt/disaggregation/mooncake/conn.py +9 -18
- sglang/srt/disaggregation/nixl/conn.py +241 -71
- sglang/srt/disaggregation/utils.py +44 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -8
- sglang/srt/distributed/device_communicators/npu_communicator.py +39 -0
- sglang/srt/distributed/device_communicators/pynccl.py +2 -1
- sglang/srt/distributed/device_communicators/shm_broadcast.py +2 -1
- sglang/srt/distributed/parallel_state.py +22 -1
- sglang/srt/entrypoints/engine.py +58 -24
- sglang/srt/entrypoints/http_server.py +28 -1
- sglang/srt/entrypoints/verl_engine.py +3 -2
- sglang/srt/function_call_parser.py +97 -0
- sglang/srt/hf_transformers_utils.py +22 -1
- sglang/srt/layers/attention/cutlass_mla_backend.py +1 -1
- sglang/srt/layers/attention/flashattention_backend.py +146 -50
- sglang/srt/layers/attention/flashinfer_backend.py +129 -94
- sglang/srt/layers/attention/flashinfer_mla_backend.py +88 -30
- sglang/srt/layers/attention/flashmla_backend.py +3 -0
- sglang/srt/layers/attention/merge_state.py +46 -0
- sglang/srt/layers/attention/triton_ops/merge_state.py +96 -0
- sglang/srt/layers/attention/vision.py +290 -163
- sglang/srt/layers/dp_attention.py +5 -2
- sglang/srt/layers/moe/ep_moe/kernels.py +342 -7
- sglang/srt/layers/moe/ep_moe/layer.py +120 -1
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +98 -57
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +10 -5
- sglang/srt/layers/quantization/__init__.py +2 -2
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -4
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +2 -1
- sglang/srt/layers/quantization/deep_gemm.py +6 -1
- sglang/srt/layers/quantization/fp8.py +108 -95
- sglang/srt/layers/quantization/fp8_kernel.py +79 -60
- sglang/srt/layers/quantization/fp8_utils.py +71 -23
- sglang/srt/layers/quantization/kv_cache.py +3 -10
- sglang/srt/layers/quantization/utils.py +0 -5
- sglang/srt/layers/quantization/w8a8_fp8.py +8 -10
- sglang/srt/layers/utils.py +35 -0
- sglang/srt/lora/layers.py +35 -9
- sglang/srt/lora/lora_manager.py +81 -35
- sglang/srt/managers/cache_controller.py +115 -119
- sglang/srt/managers/data_parallel_controller.py +52 -34
- sglang/srt/managers/io_struct.py +10 -0
- sglang/srt/managers/multimodal_processors/base_processor.py +5 -0
- sglang/srt/managers/multimodal_processors/internvl.py +232 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +73 -0
- sglang/srt/managers/schedule_batch.py +44 -16
- sglang/srt/managers/schedule_policy.py +11 -5
- sglang/srt/managers/scheduler.py +291 -72
- sglang/srt/managers/scheduler_output_processor_mixin.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +24 -13
- sglang/srt/managers/tp_worker.py +60 -28
- sglang/srt/managers/tp_worker_overlap_thread.py +9 -3
- sglang/srt/mem_cache/chunk_cache.py +2 -0
- sglang/srt/mem_cache/memory_pool.py +70 -36
- sglang/srt/model_executor/cuda_graph_runner.py +82 -19
- sglang/srt/model_executor/forward_batch_info.py +31 -1
- sglang/srt/model_executor/model_runner.py +159 -90
- sglang/srt/model_loader/loader.py +18 -11
- sglang/srt/models/clip.py +4 -4
- sglang/srt/models/deepseek_janus_pro.py +1 -1
- sglang/srt/models/deepseek_nextn.py +2 -277
- sglang/srt/models/deepseek_v2.py +132 -37
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/internlm2.py +3 -0
- sglang/srt/models/internvl.py +670 -0
- sglang/srt/models/kimi_vl.py +308 -0
- sglang/srt/models/kimi_vl_moonvit.py +639 -0
- sglang/srt/models/llama.py +93 -31
- sglang/srt/models/llama4.py +54 -7
- sglang/srt/models/llama_eagle.py +4 -1
- sglang/srt/models/llama_eagle3.py +4 -1
- sglang/srt/models/minicpmv.py +1 -1
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/phi3_small.py +16 -2
- sglang/srt/models/qwen2_5_vl.py +8 -4
- sglang/srt/models/qwen2_moe.py +8 -3
- sglang/srt/models/qwen2_vl.py +4 -16
- sglang/srt/models/qwen3_moe.py +8 -3
- sglang/srt/models/xiaomi_mimo.py +171 -0
- sglang/srt/openai_api/adapter.py +58 -62
- sglang/srt/openai_api/protocol.py +38 -16
- sglang/srt/reasoning_parser.py +2 -2
- sglang/srt/sampling/sampling_batch_info.py +54 -2
- sglang/srt/sampling/sampling_params.py +2 -0
- sglang/srt/server_args.py +93 -24
- sglang/srt/speculative/eagle_worker.py +3 -2
- sglang/srt/utils.py +123 -10
- sglang/test/runners.py +4 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_deepep_utils.py +219 -0
- sglang/test/test_utils.py +32 -1
- sglang/version.py +1 -1
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post3.dist-info}/METADATA +18 -9
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post3.dist-info}/RECORD +119 -99
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post3.dist-info}/WHEEL +1 -1
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post3.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -22,7 +22,7 @@ import random
 import tempfile
 from typing import List, Literal, Optional

-from sglang.srt.hf_transformers_utils import check_gguf_file
+from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     configure_ipv6,
@@ -78,6 +78,8 @@ class ServerArgs:

     # Other runtime options
     tp_size: int = 1
+    pp_size: int = 1
+    max_micro_batch_size: Optional[int] = None
     stream_interval: int = 1
     stream_output: bool = False
     random_seed: Optional[int] = None
@@ -185,6 +187,7 @@ class ServerArgs:
     n_share_experts_fusion: int = 0
     disable_chunked_prefix_cache: bool = False
     disable_fast_image_processor: bool = False
+    mm_attention_backend: Optional[str] = None

     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
@@ -196,6 +199,7 @@ class ServerArgs:
     disaggregation_bootstrap_port: int = 8998
     disaggregation_transfer_backend: str = "mooncake"
     disaggregation_ib_device: Optional[str] = None
+    pdlb_url: Optional[str] = None

     def __post_init__(self):
         # Expert parallelism
@@ -222,25 +226,34 @@ class ServerArgs:

         # Set mem fraction static, which depends on the tensor parallelism size
         if self.mem_fraction_static is None:
+            parallel_size = self.tp_size * self.pp_size
             if gpu_mem <= 81920:
-                if
+                if parallel_size >= 16:
                     self.mem_fraction_static = 0.79
-                elif
+                elif parallel_size >= 8:
                     self.mem_fraction_static = 0.81
-                elif
+                elif parallel_size >= 4:
                     self.mem_fraction_static = 0.85
-                elif
+                elif parallel_size >= 2:
                     self.mem_fraction_static = 0.87
                 else:
                     self.mem_fraction_static = 0.88
             else:
-
-
+                self.mem_fraction_static = 0.88
+                if gpu_mem > 96 * 1024:
+                    mem_fraction = self.mem_fraction_static
+                    self.mem_fraction_static = min(
+                        mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem,
+                        (gpu_mem - 1024 * 18)
+                        / gpu_mem,  # 15 GB + additional 3GB for cuda graph
+                    )

         # Set chunked prefill size, which depends on the gpu memory capacity
         if self.chunked_prefill_size is None:
             if gpu_mem is not None and gpu_mem < 25_000:
                 self.chunked_prefill_size = 2048
+            elif self.disaggregation_mode != "null":
+                self.chunked_prefill_size = 16384
             else:
                 self.chunked_prefill_size = 8192
         assert self.chunked_prefill_size % self.page_size == 0
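The rewritten branch above folds `pp_size` into the parallel-size buckets and, for GPUs above 96 GB, caps the fraction so that roughly 18 GB of headroom stays free (the 15 GB plus 3 GB for CUDA graphs noted in the inline comment). A minimal sketch of that heuristic; the standalone function is illustrative only, not sglang API, and `gpu_mem` is in MiB:

```python
# Illustrative sketch of the mem_fraction_static heuristic shown in the hunk above.
def pick_mem_fraction(gpu_mem: float, tp_size: int, pp_size: int = 1) -> float:
    parallel_size = tp_size * pp_size
    if gpu_mem <= 81920:  # 80 GB-class GPUs and smaller
        if parallel_size >= 16:
            return 0.79
        elif parallel_size >= 8:
            return 0.81
        elif parallel_size >= 4:
            return 0.85
        elif parallel_size >= 2:
            return 0.87
        return 0.88
    mem_fraction = 0.88
    if gpu_mem > 96 * 1024:
        # Relax the fraction, but always leave ~15 GB + 3 GB (CUDA graphs) unreserved.
        mem_fraction = min(
            mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem,
            (gpu_mem - 1024 * 18) / gpu_mem,
        )
    return mem_fraction


# Example: a 141 GB (H200-class) card at tp_size=8 lands near 0.872,
# i.e. about 18 GB is kept unreserved.
print(round(pick_mem_fraction(141 * 1024, tp_size=8), 3))
```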
@@ -311,6 +324,9 @@ class ServerArgs:
                 assert (
                     not self.enable_dp_attention
                 ), "DeepEP MoE `auto` mode is not supported with DP Attention."
+            if self.deepep_mode == "normal":
+                logger.warning("Cuda graph is disabled because deepep_mode=`normal`")
+                self.disable_cuda_graph = True
             self.ep_size = self.tp_size
             self.enable_sp_layernorm = (
                 self.dp_size < self.tp_size if self.enable_dp_attention else True
@@ -333,6 +349,17 @@ class ServerArgs:
                 "eagle speculative decoding."
             )

+            model_arch = get_model_arch(self)
+
+            # Auto set draft_model_path DeepSeek-V3/R1
+            if model_arch == "DeepseekV3ForCausalLM":
+                if self.speculative_draft_model_path is None:
+                    self.speculative_draft_model_path = self.model_path
+                else:
+                    logger.warning(
+                        "DeepSeek MTP does not require setting speculative_draft_model_path."
+                    )
+
             # Auto choose parameters
             if self.speculative_num_steps is None:
                 assert (
@@ -343,7 +370,7 @@ class ServerArgs:
                     self.speculative_num_steps,
                     self.speculative_eagle_topk,
                     self.speculative_num_draft_tokens,
-                ) = auto_choose_speculative_params(
+                ) = auto_choose_speculative_params(model_arch)

             if self.page_size > 1 and self.speculative_eagle_topk > 1:
                 self.speculative_eagle_topk = 1
@@ -532,7 +559,7 @@ class ServerArgs:
             "--device",
             type=str,
             default=ServerArgs.device,
-            help="The device to use ('cuda', 'xpu', 'hpu', 'cpu'). Defaults to auto-detection if not specified.",
+            help="The device to use ('cuda', 'xpu', 'hpu', 'npu', 'cpu'). Defaults to auto-detection if not specified.",
         )
         parser.add_argument(
             "--served-model-name",
@@ -632,6 +659,19 @@ class ServerArgs:
             default=ServerArgs.tp_size,
             help="The tensor parallelism size.",
         )
+        parser.add_argument(
+            "--pipeline-parallel-size",
+            "--pp-size",
+            type=int,
+            default=ServerArgs.pp_size,
+            help="The pipeline parallelism size.",
+        )
+        parser.add_argument(
+            "--max-micro-batch-size",
+            type=int,
+            default=ServerArgs.max_micro_batch_size,
+            help="The maximum micro batch size in pipeline parallelism.",
+        )
         parser.add_argument(
             "--stream-interval",
             type=int,
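The new `--pipeline-parallel-size` flag also accepts `--pp-size` as an alias. With argparse, two option strings on a single `add_argument` call share one destination attribute named after the first long option, which is why `from_cli_args` later copies `args.pipeline_parallel_size` into `pp_size`. A small standalone sketch of that behavior (a hypothetical parser, not the sglang CLI):

```python
# Standalone argparse sketch showing why both flags write to one destination.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--pipeline-parallel-size",
    "--pp-size",  # alias: both option strings share the same destination
    type=int,
    default=1,
    help="The pipeline parallelism size.",
)

args = parser.parse_args(["--pp-size", "2"])
# The destination name is derived from the first long option string.
print(args.pipeline_parallel_size)  # -> 2
```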
@@ -1096,9 +1136,9 @@ class ServerArgs:
         parser.add_argument(
             "--tool-call-parser",
             type=str,
-            choices=["qwen25", "mistral", "llama3", "deepseekv3"],
+            choices=["qwen25", "mistral", "llama3", "deepseekv3", "pythonic"],
             default=ServerArgs.tool_call_parser,
-            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', and '
+            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and 'pythonic'.",
         )
         parser.add_argument(
             "--enable-hierarchical-cache",
@@ -1215,12 +1255,29 @@ class ServerArgs:
             "--disaggregation-ib-device",
             type=str,
             default=ServerArgs.disaggregation_ib_device,
-            help="The
+            help="The InfiniBand devices for disaggregation transfer, accepts single device (e.g., --disaggregation-ib-device mlx5_0) "
+            "or multiple comma-separated devices (e.g., --disaggregation-ib-device mlx5_0,mlx5_1). "
+            "Default is None, which triggers automatic device detection when mooncake backend is enabled.",
+        )
+        parser.add_argument(
+            "--pdlb-url",
+            type=str,
+            default=None,
+            help="The URL of the PD disaggregation load balancer. If set, the prefill/decode server will register with the load balancer.",
+        )
+
+        parser.add_argument(
+            "--mm-attention-backend",
+            type=str,
+            choices=["sdpa", "fa3", "triton_attn"],
+            default=ServerArgs.mm_attention_backend,
+            help="Set multimodal attention backend.",
         )

     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size
+        args.pp_size = args.pipeline_parallel_size
         args.dp_size = args.data_parallel_size
         args.ep_size = args.expert_parallel_size
         attrs = [attr.name for attr in dataclasses.fields(cls)]
@@ -1234,15 +1291,25 @@ class ServerArgs:

     def check_server_args(self):
         assert (
-            self.tp_size
-        ), "tp_size must be divisible by number of nodes"
+            self.tp_size * self.pp_size
+        ) % self.nnodes == 0, "tp_size must be divisible by number of nodes"
+
+        # FIXME pp constraints
+        if self.pp_size > 1:
+            logger.warning(f"Turn off overlap scheule for pipeline parallelism.")
+            self.disable_overlap_schedule = True
+        assert (
+            self.disable_overlap_schedule
+            and self.speculative_algorithm is None
+            and not self.enable_mixed_chunk
+        ), "Pipeline parallelism is not compatible with overlap schedule, speculative decoding, mixed chunked prefill."
+
         assert not (
             self.dp_size > 1 and self.nnodes != 1 and not self.enable_dp_attention
         ), "multi-node data parallel is not supported unless dp attention!"
         assert (
             self.max_loras_per_batch > 0
             # FIXME
-            and (self.lora_paths is None or self.disable_cuda_graph)
             and (self.lora_paths is None or self.disable_radix_cache)
         ), "compatibility of lora and cuda graph and radix attention is in progress"
         assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"
@@ -1368,20 +1435,22 @@ class DeprecatedAction(argparse.Action):
         raise ValueError(self.help)


-def
+def get_model_arch(args: ServerArgs):
+    hf_config = get_config(
+        args.model_path,
+        trust_remote_code=args.trust_remote_code,
+        revision=args.revision,
+        model_override_args=json.loads(args.json_model_override_args),
+    )
+    return hf_config.architectures[0]
+
+
+def auto_choose_speculative_params(arch: str):
     """
     Automatically choose the parameters for speculative decoding.

    You can tune them on your own models and prompts with scripts/playground/bench_speculative.py
    """
-    config_path = os.path.join(self.model_path, "config.json")
-    if not os.path.exists(config_path):
-        raise ValueError(f"{config_path} is not found.")
-
-    config = json.load(open(config_path))
-
-    arch = config.get("architectures", ["Unknown"])[0]
-
     if arch in ["LlamaForCausalLM"]:
         # The default value for llama
         return (5, 4, 8)
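With this change, the speculative-decoding defaults are keyed off the Hugging Face architecture string instead of reading `config.json` by hand. A hedged sketch of how the two helpers compose; sglang routes the lookup through its own `get_config`, so `AutoConfig` and the model id below are stand-ins kept only to make the sketch self-contained:

```python
# Illustrative composition of get_model_arch + auto_choose_speculative_params.
from transformers import AutoConfig


def model_arch_of(model_path: str, trust_remote_code: bool = False) -> str:
    cfg = AutoConfig.from_pretrained(model_path, trust_remote_code=trust_remote_code)
    return cfg.architectures[0]


arch = model_arch_of("TinyLlama/TinyLlama-1.1B-Chat-v1.0")  # -> "LlamaForCausalLM"
if arch in ["LlamaForCausalLM"]:
    steps, topk, draft_tokens = (5, 4, 8)  # the Llama defaults from the hunk above
```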
sglang/srt/speculative/eagle_worker.py
CHANGED
@@ -106,11 +106,12 @@ class EAGLEWorker(TpModelWorker):
         # Init draft worker
         with empty_context():
             super().__init__(
+                server_args=server_args,
                 gpu_id=gpu_id,
                 tp_rank=tp_rank,
-
-                nccl_port=nccl_port,
+                pp_rank=0,  # FIXME
                 dp_rank=dp_rank,
+                nccl_port=nccl_port,
                 is_draft_worker=True,
                 req_to_token_pool=self.req_to_token_pool,
                 token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
sglang/srt/utils.py
CHANGED
@@ -12,6 +12,7 @@
 # limitations under the License.
 # ==============================================================================
 """Common utilities."""
+
 import base64
 import builtins
 import ctypes
@@ -144,6 +145,10 @@ def is_xpu() -> bool:
     return hasattr(torch, "xpu") and torch.xpu.is_available()


+def is_npu() -> bool:
+    return hasattr(torch, "npu") and torch.npu.is_available()
+
+
 def is_flashinfer_available():
     """
     Check whether flashinfer is available.
@@ -327,6 +332,16 @@ def get_available_gpu_memory(device, gpu_id, distributed=False, empty_cache=True
     elif device == "cpu":
         # TODO: rename the variables in the current function to be not GPU specific
         free_gpu_memory = psutil.virtual_memory().available
+    elif device == "npu":
+        num_gpus = torch.npu.device_count()
+        assert gpu_id < num_gpus
+
+        if torch.npu.current_device() != gpu_id:
+            print(
+                f"WARNING: current device is not {gpu_id}, but {torch.npu.current_device()}, ",
+                "which may cause useless memory allocation for torch NPU context.",
+            )
+        free_gpu_memory, total_gpu_memory = torch.npu.mem_get_info()

     if distributed:
         tensor = torch.tensor(free_gpu_memory, dtype=torch.float32).to(
@@ -414,16 +429,40 @@ class LayerFn(Protocol):
 def make_layers(
     num_hidden_layers: int,
     layer_fn: LayerFn,
+    pp_rank: Optional[int] = None,
+    pp_size: Optional[int] = None,
     prefix: str = "",
+    return_tuple: bool = False,
 ) -> Tuple[int, int, torch.nn.ModuleList]:
     """Make a list of layers with the given layer function"""
+    # circula imports
+    from sglang.srt.distributed import get_pp_indices
+    from sglang.srt.layers.utils import PPMissingLayer
+
+    assert not pp_size or num_hidden_layers >= pp_size
+    start_layer, end_layer = (
+        get_pp_indices(
+            num_hidden_layers,
+            pp_rank,
+            pp_size,
+        )
+        if pp_rank is not None and pp_size is not None
+        else (0, num_hidden_layers)
+    )
     modules = torch.nn.ModuleList(
-        [
+        [PPMissingLayer(return_tuple=return_tuple) for _ in range(start_layer)]
+        + [
             maybe_offload_to_cpu(layer_fn(idx=idx, prefix=add_prefix(idx, prefix)))
-            for idx in range(
+            for idx in range(start_layer, end_layer)
+        ]
+        + [
+            PPMissingLayer(return_tuple=return_tuple)
+            for _ in range(end_layer, num_hidden_layers)
         ]
     )
-
+    if pp_rank is None or pp_size is None:
+        return modules
+    return modules, start_layer, end_layer


 def set_random_seed(seed: int) -> None:
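The reworked `make_layers` only materializes the contiguous slice of layers owned by the current pipeline stage and fills the rest of the `ModuleList` with `PPMissingLayer` placeholders, so layer indices and checkpoint names stay stable across stages. A minimal sketch of the partitioning idea; the even-split helper and the `_MissingLayer` stand-in are assumptions used for illustration, and sglang's `get_pp_indices` may split differently:

```python
# Sketch of per-stage layer partitioning for pipeline parallelism.
import torch
from torch import nn


def even_pp_indices(num_layers: int, pp_rank: int, pp_size: int) -> tuple[int, int]:
    per_stage = num_layers // pp_size
    start = pp_rank * per_stage
    end = num_layers if pp_rank == pp_size - 1 else start + per_stage
    return start, end


class _MissingLayer(nn.Identity):
    """Placeholder standing in for PPMissingLayer: keeps layer indexing intact."""


def make_stage_layers(num_layers: int, pp_rank: int, pp_size: int) -> nn.ModuleList:
    start, end = even_pp_indices(num_layers, pp_rank, pp_size)
    return nn.ModuleList(
        [_MissingLayer() for _ in range(start)]
        + [nn.Linear(16, 16) for _ in range(start, end)]  # stand-in for real blocks
        + [_MissingLayer() for _ in range(end, num_layers)]
    )


layers = make_stage_layers(num_layers=10, pp_rank=1, pp_size=4)
print([type(m).__name__ for m in layers])
# rank 1 of 4 owns indices 2 and 3; the other slots are placeholders
```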
@@ -872,12 +911,15 @@ def broadcast_pyobj(
     src: int = 0,
     force_cpu_device: bool = True,
 ):
-    """Broadcast inputs from rank
+    """Broadcast inputs from src rank to all other ranks with torch.dist backend.
+    The `rank` here refer to the source rank on global process group (regardless
+    of dist_group argument).
+    """
     device = torch.device(
         "cuda" if torch.cuda.is_available() and not force_cpu_device else "cpu"
     )

-    if rank ==
+    if rank == src:
         if len(data) == 0:
             tensor_size = torch.tensor([0], dtype=torch.long, device=device)
             dist.broadcast(tensor_size, src=src, group=dist_group)
@@ -909,6 +951,50 @@ def broadcast_pyobj(
         return data


+def point_to_point_pyobj(
+    data: List[Any],
+    rank: int,
+    group: Optional[torch.distributed.ProcessGroup] = None,
+    src: int = 0,
+    dst: int = 1,
+):
+    """Send data from src to dst in group."""
+
+    if rank == src:
+        if len(data) == 0:
+            tensor_size = torch.tensor([0], dtype=torch.long)
+            dist.send(tensor_size, dst=dst, group=group)
+        else:
+            serialized_data = pickle.dumps(data)
+            size = len(serialized_data)
+            tensor_data = torch.ByteTensor(
+                np.frombuffer(serialized_data, dtype=np.uint8)
+            )
+            tensor_size = torch.tensor([size], dtype=torch.long)
+
+            dist.send(tensor_size, dst=dst, group=group)
+            dist.send(tensor_data, dst=dst, group=group)
+        return data
+
+    elif rank == dst:
+        tensor_size = torch.tensor([0], dtype=torch.long)
+        dist.recv(tensor_size, src=src, group=group)
+        size = tensor_size.item()
+
+        if size == 0:
+            return []
+
+        tensor_data = torch.empty(size, dtype=torch.uint8)
+        dist.recv(tensor_data, src=src, group=group)
+
+        serialized_data = bytes(tensor_data.cpu().numpy())
+        data = pickle.loads(serialized_data)
+        return data
+
+    # Other ranks in pp_group do nothing
+    return []
+
+
 step_counter = 0

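`point_to_point_pyobj` is the pipeline-parallel counterpart of `broadcast_pyobj`: it pickles arbitrary Python objects and sends the byte length first, then the payload, between exactly two ranks. A hedged two-process usage sketch of the same length-prefixed protocol; the launch command, payload, and helper name are placeholders, and the gloo backend is used so it runs on CPU:

```python
# Run with: torchrun --nproc_per_node=2 this_script.py
import pickle

import numpy as np
import torch
import torch.distributed as dist


def send_pyobj(data, rank, src=0, dst=1, group=None):
    # Length-prefixed pickle transfer, mirroring point_to_point_pyobj above.
    if rank == src:
        payload = pickle.dumps(data)
        dist.send(torch.tensor([len(payload)], dtype=torch.long), dst=dst, group=group)
        buf = torch.from_numpy(np.frombuffer(payload, dtype=np.uint8).copy())
        dist.send(buf, dst=dst, group=group)
        return data
    if rank == dst:
        size = torch.zeros(1, dtype=torch.long)
        dist.recv(size, src=src, group=group)
        buf = torch.empty(int(size.item()), dtype=torch.uint8)
        dist.recv(buf, src=src, group=group)
        return pickle.loads(bytes(buf.numpy()))
    return []  # other ranks do nothing


if __name__ == "__main__":
    dist.init_process_group(backend="gloo")  # CPU-friendly backend for the sketch
    rank = dist.get_rank()
    received = send_pyobj({"batch_id": 7, "tokens": [1, 2, 3]}, rank)
    if rank == 1:
        print(received)
```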
@@ -1276,6 +1362,9 @@ def get_device_name(device_id: int = 0) -> str:
     if hasattr(torch, "hpu") and torch.hpu.is_available():
         return torch.hpu.get_device_name(device_id)

+    if hasattr(torch, "npu") and torch.npu.is_available():
+        return torch.npu.get_device_name(device_id)
+

 @lru_cache(maxsize=1)
 def is_habana_available() -> bool:
@@ -1372,6 +1461,13 @@ def get_compiler_backend() -> str:
     if hasattr(torch, "hpu") and torch.hpu.is_available():
         return "hpu_backend"

+    if hasattr(torch, "npu") and torch.npu.is_available():
+        import torchair
+
+        config = torchair.CompilerConfig()
+        npu_backend = torchair.get_npu_backend(compiler_config=config)
+        return npu_backend
+
     return "inductor"


@@ -1732,6 +1828,13 @@ def configure_ipv6(dist_init_addr):
     return port, host


+def rank0_log(msg: str):
+    from sglang.srt.distributed import get_tensor_model_parallel_rank
+
+    if get_tensor_model_parallel_rank() == 0:
+        logger.info(msg)
+
+
 def rank0_print(msg: str):
     from sglang.srt.distributed import get_tensor_model_parallel_rank

@@ -1905,13 +2008,16 @@ def fast_topk(values, topk, dim):
     return torch.topk(values, topk, dim=dim)


-def
+def _check(cc_major):
     if not is_cuda():
         return False
-
-
-
-
+    return torch.cuda.get_device_capability()[0] == cc_major and tuple(
+        map(int, torch.version.cuda.split(".")[:2])
+    ) >= (12, 3)
+
+
+is_ampere_with_cuda_12_3 = lambda: _check(8)
+is_hopper_with_cuda_12_3 = lambda: _check(9)


 def get_free_port():
@@ -1990,3 +2096,10 @@ class BumpAllocator:
         output = self._buffer[self._pointer : self._pointer + size]
         self._pointer += size
         return output
+
+
+def log_info_on_rank0(logger, msg):
+    from sglang.srt.distributed import get_tensor_model_parallel_rank
+
+    if get_tensor_model_parallel_rank() == 0:
+        logger.info(msg)
sglang/test/runners.py
CHANGED
@@ -423,6 +423,10 @@ class HFRunner:
             )
             del input_logits

+            if lora_paths is not None and lora_paths[i] is not None:
+                # Unload the LoRA adapter if it is used
+                model.unload()
+
         return ModelOutput(
             output_strs=output_strs,
             top_input_logprobs=top_input_logprobs,
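The `HFRunner` change unloads the LoRA adapter after a prompt that used one, so the next iteration starts from clean base weights. With the PEFT library this pattern looks roughly like the sketch below; the model id and adapter path are placeholders, and `unload()` is assumed to be PEFT's adapter-removal call that returns the base model:

```python
# Hedged sketch of the load/unload-per-adapter pattern, using PEFT directly.
from peft import PeftModel
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

# Attach an adapter for one evaluation pass (path is a placeholder).
model = PeftModel.from_pretrained(base, "path/to/lora_adapter")
# ... run generation / logprob collection with the adapter active ...

# Drop the adapter so the next pass sees the plain base model.
model = model.unload()
```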
sglang/test/test_block_fp8.py
CHANGED
@@ -7,9 +7,9 @@ import torch
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
 from sglang.srt.layers.quantization.fp8_kernel import (
-    per_tensor_quant_mla_deep_gemm_masked_fp8,
     per_tensor_quant_mla_fp8,
     per_token_group_quant_fp8,
+    per_token_group_quant_mla_deep_gemm_masked_fp8,
     static_quant_fp8,
     w8a8_block_fp8_matmul,
 )
@@ -236,7 +236,7 @@ class TestPerTokenGroupQuantMlaDeepGemmMaskedFP8(CustomTestCase):

         with torch.inference_mode():
             ref_out, ref_scale = native_per_token_group_quant_fp8(x, group_size, 1e-12)
-            out, scale, _, _, _ =
+            out, scale, _, _, _ = per_token_group_quant_mla_deep_gemm_masked_fp8(
                 x, group_size
             )
             out = out[:, :num_tokens, :]