sglang 0.3.6.post3__py3-none-any.whl → 0.4.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +1 -1
- sglang/bench_one_batch.py +4 -0
- sglang/bench_serving.py +13 -0
- sglang/check_env.py +1 -1
- sglang/srt/_custom_ops.py +118 -0
- sglang/srt/configs/device_config.py +17 -0
- sglang/srt/configs/load_config.py +84 -0
- sglang/srt/configs/model_config.py +161 -4
- sglang/srt/configs/qwen2vl.py +5 -8
- sglang/srt/constrained/outlines_backend.py +11 -1
- sglang/srt/constrained/outlines_jump_forward.py +8 -1
- sglang/srt/constrained/xgrammar_backend.py +5 -5
- sglang/srt/distributed/__init__.py +3 -0
- sglang/srt/distributed/communication_op.py +34 -0
- sglang/srt/distributed/device_communicators/__init__.py +0 -0
- sglang/srt/distributed/device_communicators/cuda_wrapper.py +182 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +352 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +291 -0
- sglang/srt/distributed/device_communicators/hpu_communicator.py +48 -0
- sglang/srt/distributed/device_communicators/pynccl.py +204 -0
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +362 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +568 -0
- sglang/srt/distributed/device_communicators/xpu_communicator.py +47 -0
- sglang/srt/distributed/parallel_state.py +1275 -0
- sglang/srt/distributed/utils.py +223 -0
- sglang/srt/hf_transformers_utils.py +37 -1
- sglang/srt/layers/attention/__init__.py +5 -2
- sglang/srt/layers/attention/double_sparsity_backend.py +22 -8
- sglang/srt/layers/attention/flashinfer_backend.py +33 -20
- sglang/srt/layers/attention/torch_native_backend.py +299 -0
- sglang/srt/layers/attention/triton_backend.py +22 -8
- sglang/srt/layers/attention/triton_ops/extend_attention.py +3 -0
- sglang/srt/layers/ep_moe/__init__.py +0 -0
- sglang/srt/layers/ep_moe/kernels.py +349 -0
- sglang/srt/layers/ep_moe/layer.py +661 -0
- sglang/srt/layers/fused_moe_patch.py +20 -11
- sglang/srt/layers/linear.py +1 -0
- sglang/srt/layers/logits_processor.py +17 -3
- sglang/srt/layers/quantization/__init__.py +36 -2
- sglang/srt/layers/quantization/fp8.py +559 -0
- sglang/srt/layers/quantization/fp8_utils.py +27 -0
- sglang/srt/layers/radix_attention.py +4 -2
- sglang/srt/layers/sampler.py +2 -0
- sglang/srt/layers/torchao_utils.py +23 -45
- sglang/srt/layers/vocab_parallel_embedding.py +1 -0
- sglang/srt/lora/lora.py +1 -1
- sglang/srt/managers/io_struct.py +48 -2
- sglang/srt/managers/schedule_batch.py +19 -14
- sglang/srt/managers/schedule_policy.py +7 -4
- sglang/srt/managers/scheduler.py +145 -85
- sglang/srt/managers/tokenizer_manager.py +166 -68
- sglang/srt/managers/tp_worker.py +36 -3
- sglang/srt/managers/tp_worker_overlap_thread.py +28 -8
- sglang/srt/mem_cache/memory_pool.py +5 -1
- sglang/srt/model_executor/cuda_graph_runner.py +30 -7
- sglang/srt/model_executor/forward_batch_info.py +9 -4
- sglang/srt/model_executor/model_runner.py +146 -153
- sglang/srt/model_loader/__init__.py +34 -0
- sglang/srt/model_loader/loader.py +1139 -0
- sglang/srt/model_loader/utils.py +41 -0
- sglang/srt/model_loader/weight_utils.py +640 -0
- sglang/srt/model_parallel.py +1 -5
- sglang/srt/models/baichuan.py +9 -10
- sglang/srt/models/chatglm.py +6 -15
- sglang/srt/models/commandr.py +4 -5
- sglang/srt/models/dbrx.py +2 -3
- sglang/srt/models/deepseek.py +4 -11
- sglang/srt/models/deepseek_v2.py +90 -18
- sglang/srt/models/exaone.py +2 -3
- sglang/srt/models/gemma.py +2 -6
- sglang/srt/models/gemma2.py +3 -14
- sglang/srt/models/gemma2_reward.py +0 -1
- sglang/srt/models/gpt2.py +5 -12
- sglang/srt/models/gpt_bigcode.py +6 -22
- sglang/srt/models/grok.py +3 -8
- sglang/srt/models/internlm2.py +2 -3
- sglang/srt/models/internlm2_reward.py +0 -1
- sglang/srt/models/llama.py +96 -31
- sglang/srt/models/llama_classification.py +1 -2
- sglang/srt/models/llama_embedding.py +1 -2
- sglang/srt/models/llama_reward.py +2 -3
- sglang/srt/models/llava.py +1 -4
- sglang/srt/models/llavavid.py +1 -2
- sglang/srt/models/minicpm.py +4 -7
- sglang/srt/models/minicpm3.py +6 -19
- sglang/srt/models/mixtral.py +24 -14
- sglang/srt/models/mixtral_quant.py +2 -3
- sglang/srt/models/mllama.py +3 -7
- sglang/srt/models/olmo.py +2 -8
- sglang/srt/models/olmo2.py +0 -1
- sglang/srt/models/olmoe.py +3 -5
- sglang/srt/models/phi3_small.py +8 -13
- sglang/srt/models/qwen.py +2 -3
- sglang/srt/models/qwen2.py +10 -9
- sglang/srt/models/qwen2_moe.py +4 -16
- sglang/srt/models/qwen2_vl.py +2 -6
- sglang/srt/models/registry.py +99 -0
- sglang/srt/models/stablelm.py +2 -3
- sglang/srt/models/torch_native_llama.py +6 -17
- sglang/srt/models/xverse.py +2 -4
- sglang/srt/models/xverse_moe.py +4 -11
- sglang/srt/models/yivl.py +2 -3
- sglang/srt/openai_api/adapter.py +9 -5
- sglang/srt/openai_api/protocol.py +1 -0
- sglang/srt/sampling/sampling_batch_info.py +9 -8
- sglang/srt/server.py +270 -173
- sglang/srt/server_args.py +102 -29
- sglang/srt/utils.py +295 -28
- sglang/test/test_utils.py +7 -0
- sglang/version.py +1 -1
- {sglang-0.3.6.post3.dist-info → sglang-0.4.0.post1.dist-info}/METADATA +5 -4
- sglang-0.4.0.post1.dist-info/RECORD +189 -0
- sglang-0.3.6.post3.dist-info/RECORD +0 -162
- {sglang-0.3.6.post3.dist-info → sglang-0.4.0.post1.dist-info}/LICENSE +0 -0
- {sglang-0.3.6.post3.dist-info → sglang-0.4.0.post1.dist-info}/WHEEL +0 -0
- {sglang-0.3.6.post3.dist-info → sglang-0.4.0.post1.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -20,8 +20,12 @@ import random
 import tempfile
 from typing import List, Optional
 
+import torch
+
+from sglang.srt.hf_transformers_utils import check_gguf_file
 from sglang.srt.utils import (
     get_amdgpu_memory_capacity,
+    get_hpu_memory_capacity,
     get_nvgpu_memory_capacity,
     is_flashinfer_available,
     is_hip,
@@ -49,6 +53,7 @@ class ServerArgs:
     served_model_name: Optional[str] = None
     chat_template: Optional[str] = None
     is_embedding: bool = False
+    revision: Optional[str] = None
 
     # Port
     host: str = "127.0.0.1"
@@ -58,7 +63,7 @@ class ServerArgs:
     mem_fraction_static: Optional[float] = None
     max_running_requests: Optional[int] = None
     max_total_tokens: Optional[int] = None
-    chunked_prefill_size: int =
+    chunked_prefill_size: Optional[int] = None
     max_prefill_tokens: int = 16384
     schedule_policy: str = "lpm"
     schedule_conservativeness: float = 1.0
@@ -89,6 +94,8 @@ class ServerArgs:
     # Data parallelism
     dp_size: int = 1
     load_balance_method: str = "round_robin"
+    # Expert parallelism
+    ep_size: int = 1
 
     # Multi-node distributed serving
     dist_init_addr: Optional[str] = None
@@ -120,15 +127,16 @@ class ServerArgs:
     disable_jump_forward: bool = False
     disable_cuda_graph: bool = False
     disable_cuda_graph_padding: bool = False
-
+    disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
     disable_mla: bool = False
     disable_overlap_schedule: bool = False
     enable_mixed_chunk: bool = False
     enable_dp_attention: bool = False
+    enable_ep_moe: bool = False
     enable_torch_compile: bool = False
     torch_compile_max_bs: int = 32
-    cuda_graph_max_bs: int =
+    cuda_graph_max_bs: Optional[int] = None
     torchao_config: str = ""
     enable_nan_detection: bool = False
     enable_p2p_check: bool = False
@@ -144,19 +152,25 @@ class ServerArgs:
         if self.served_model_name is None:
             self.served_model_name = self.model_path
 
-        if self.chunked_prefill_size is not None and self.chunked_prefill_size <= 0:
-            # Disable chunked prefill
-            self.chunked_prefill_size = None
-
         if self.random_seed is None:
             self.random_seed = random.randint(0, 1 << 30)
 
-
+        if is_hip():
+            gpu_mem = get_amdgpu_memory_capacity()
+        elif torch.cuda.is_available():
+            gpu_mem = get_nvgpu_memory_capacity()
+        elif self.device == "hpu":
+            gpu_mem = get_hpu_memory_capacity()
+        else:
+            # GPU memory is not known yet or no GPU is available.
+            gpu_mem = None
+
+        # Set mem fraction static, which depends on the tensor parallelism size
         if self.mem_fraction_static is None:
             if self.tp_size >= 16:
                 self.mem_fraction_static = 0.79
             elif self.tp_size >= 8:
-                self.mem_fraction_static = 0.
+                self.mem_fraction_static = 0.81
             elif self.tp_size >= 4:
                 self.mem_fraction_static = 0.85
             elif self.tp_size >= 2:
@@ -164,25 +178,43 @@ class ServerArgs:
         else:
             self.mem_fraction_static = 0.88
 
-        #
-        if
-            gpu_mem
-
-
-
-
-
-
+        # Set chunked prefill size, which depends on the gpu memory capacity
+        if self.chunked_prefill_size is None:
+            if gpu_mem is not None and gpu_mem < 25_000:
+                self.chunked_prefill_size = 2048
+            else:
+                self.chunked_prefill_size = 8192
+
+        # Set cuda graph max batch size
+        if self.cuda_graph_max_bs is None:
+            # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
+            if gpu_mem is not None and gpu_mem < 25_000:
+                if self.tp_size < 4:
+                    self.cuda_graph_max_bs = 8
+                else:
+                    self.cuda_graph_max_bs = 80
+            else:
+                self.cuda_graph_max_bs = 160
 
         # Choose kernel backends
-        if
-            self.attention_backend = "
+        if self.device == "hpu":
+            self.attention_backend = "torch_native"
             self.sampling_backend = "pytorch"
 
         if self.attention_backend is None:
-            self.attention_backend =
+            self.attention_backend = (
+                "flashinfer" if is_flashinfer_available() else "triton"
+            )
         if self.sampling_backend is None:
-            self.sampling_backend =
+            self.sampling_backend = (
+                "flashinfer" if is_flashinfer_available() else "pytorch"
+            )
+
+        if self.attention_backend == "torch_native":
+            logger.warning(
+                "Cuda graph is disabled because of using torch native attention backend"
+            )
+            self.disable_cuda_graph = True
 
         # Others
         if self.enable_dp_attention:
@@ -191,13 +223,25 @@ class ServerArgs:
             self.cuda_graph_max_bs = min(self.cuda_graph_max_bs, 96)
             self.schedule_conservativeness = self.schedule_conservativeness * 0.3
             self.disable_overlap_schedule = True
-            logger.
+            logger.warning(
                 f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
                 f"The CUDA graph max batch size is adjusted to {self.cuda_graph_max_bs}. "
                 f"The schedule conservativeness is adjusted to {self.schedule_conservativeness}. "
                 "Data parallel size is adjusted to be the same as tensor parallel size. "
-                "Overlap
+                "Overlap scheduler is disabled."
             )
+        # Expert parallelism
+        if self.enable_ep_moe:
+            self.ep_size = self.tp_size
+            logger.info(
+                f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
+            )
+
+        # GGUF
+        if (
+            self.load_format == "auto" or self.load_format == "gguf"
+        ) and check_gguf_file(self.model_path):
+            self.quantization = self.load_format = "gguf"
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -238,7 +282,7 @@ class ServerArgs:
             "--load-format",
             type=str,
             default=ServerArgs.load_format,
-            choices=["auto", "pt", "safetensors", "npcache", "dummy"],
+            choices=["auto", "pt", "safetensors", "npcache", "dummy", "gguf"],
             help="The format of the model weights to load. "
             '"auto" will try to load the weights in the safetensors format '
             "and fall back to the pytorch bin format if safetensors format "
@@ -248,7 +292,8 @@ class ServerArgs:
             '"npcache" will load the weights in pytorch format and store '
             "a numpy cache to speed up the loading. "
             '"dummy" will initialize the weights with random values, '
-            "which is mainly for profiling."
+            "which is mainly for profiling."
+            '"gguf" will load the weights in the gguf format. ',
         )
         parser.add_argument(
             "--trust-remote-code",
@@ -288,6 +333,7 @@ class ServerArgs:
                 "gptq_marlin",
                 "awq_marlin",
                 "bitsandbytes",
+                "gguf",
             ],
             help="The quantization method.",
         )
@@ -321,6 +367,14 @@ class ServerArgs:
             action="store_true",
             help="Whether to use a CausalLM as an embedding model.",
         )
+        parser.add_argument(
+            "--revision",
+            type=str,
+            default=None,
+            help="The specific model version to use. It can be a branch "
+            "name, a tag name, or a commit id. If unspecified, will use "
+            "the default version.",
+        )
 
         # Memory and scheduling
         parser.add_argument(
@@ -492,6 +546,14 @@ class ServerArgs:
                 "shortest_queue",
             ],
         )
+        # Expert parallelism
+        parser.add_argument(
+            "--expert-parallel-size",
+            "--ep-size",
+            type=int,
+            default=ServerArgs.ep_size,
+            help="The expert parallelism size.",
+        )
 
         # Multi-node distributed serving
         parser.add_argument(
@@ -572,7 +634,7 @@ class ServerArgs:
         parser.add_argument(
             "--attention-backend",
             type=str,
-            choices=["flashinfer", "triton"],
+            choices=["flashinfer", "triton", "torch_native"],
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
         )
@@ -613,9 +675,9 @@ class ServerArgs:
             help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.",
         )
         parser.add_argument(
-            "--disable-disk-cache",
+            "--disable-outlines-disk-cache",
             action="store_true",
-            help="Disable disk cache to avoid possible crashes related to file system or high concurrency.",
+            help="Disable disk cache of outlines to avoid possible crashes related to file system or high concurrency.",
         )
         parser.add_argument(
             "--disable-custom-all-reduce",
@@ -647,6 +709,11 @@ class ServerArgs:
             action="store_true",
             help="Enabling data parallelism for attention and tensor parallelism for FFN. The dp size should be equal to the tp size. Currently only DeepSeek-V2 is supported.",
         )
+        parser.add_argument(
+            "--enable-ep-moe",
+            action="store_true",
+            help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
+        )
         parser.add_argument(
             "--enable-torch-compile",
             action="store_true",
@@ -716,11 +783,17 @@ class ServerArgs:
             action=DeprecatedAction,
             help="'--disable-flashinfer-sampling' is deprecated. Please use '--sampling-backend pytroch' instead.",
         )
+        parser.add_argument(
+            "--disable-disk-cache",
+            action=DeprecatedAction,
+            help="'--disable-disk-cache' is deprecated. Please use '--disable-outlines-disk-cache' instead.",
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size
         args.dp_size = args.data_parallel_size
+        args.ep_size = args.expert_parallel_size
         attrs = [attr.name for attr in dataclasses.fields(cls)]
         return cls(**{attr: getattr(args, attr) for attr in attrs})
 
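Note that the auto-tuned defaults for `chunked_prefill_size` and `cuda_graph_max_bs` only apply when the field is left at `None`, so explicit CLI values still win, and the old `--disable-disk-cache` flag is kept only as a `DeprecatedAction`. Below is a minimal, hypothetical usage sketch of the new options; the flag names come from the hunks above, while `--model-path` and its placeholder value are assumed from the rest of sglang and are not part of this diff.

```python
# Hypothetical sketch: parse the new 0.4.0 flags and let __post_init__ fill in
# the auto-tuned defaults. "--model-path" and its value are placeholders.
import argparse

from sglang.srt.server_args import ServerArgs

parser = argparse.ArgumentParser()
ServerArgs.add_cli_args(parser)
args = parser.parse_args(
    [
        "--model-path", "placeholder/model",    # assumed flag, placeholder value
        "--tensor-parallel-size", "4",
        "--enable-ep-moe",                      # new: expert parallelism for MoE
        "--revision", "main",                   # new: pin a model revision
        "--attention-backend", "torch_native",  # new backend choice (disables cuda graph)
    ]
)
server_args = ServerArgs.from_cli_args(args)

# ep_size is forced to tp_size when EP MoE is on; chunked_prefill_size and
# cuda_graph_max_bs are derived from the detected GPU memory (see above).
print(server_args.ep_size, server_args.chunked_prefill_size, server_args.cuda_graph_max_bs)
```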
sglang/srt/utils.py
CHANGED
@@ -30,6 +30,7 @@ import subprocess
 import tempfile
 import time
 import warnings
+from functools import lru_cache
 from importlib.metadata import PackageNotFoundError, version
 from io import BytesIO
 from typing import Any, Callable, Dict, List, Optional, Protocol, Tuple, Union
@@ -38,6 +39,7 @@ import numpy as np
 import psutil
 import requests
 import torch
+import torch.distributed
 import torch.distributed as dist
 import triton
 import zmq
@@ -67,6 +69,22 @@ def is_hip() -> bool:
     return torch.version.hip is not None
 
 
+def is_cuda():
+    return hasattr(torch, "cuda") and torch.cuda.is_available()
+
+
+def is_cuda_alike():
+    return is_cuda() or is_hip()
+
+
+def is_hpu() -> bool:
+    return hasattr(torch, "hpu") and torch.hpu.is_available()
+
+
+def is_xpu() -> bool:
+    return hasattr(torch, "xpu") and torch.xpu.is_available()
+
+
 def is_flashinfer_available():
     """
     Check whether flashinfer is available.
@@ -183,6 +201,18 @@ def get_available_gpu_memory(device, gpu_id, distributed=False):
         total_gpu_memory = torch.xpu.get_device_properties(gpu_id).total_memory
         free_gpu_memory = total_gpu_memory - used_memory
 
+    elif device == "hpu":
+        num_gpus = torch.hpu.device_count()
+        assert gpu_id < num_gpus
+
+        if torch.hpu.current_device() != gpu_id:
+            print(
+                f"WARNING: current device is not {gpu_id}, but {torch.hpu.current_device()}, ",
+                "which may cause useless memory allocation for torch HPU context.",
+            )
+
+        free_gpu_memory, total_gpu_memory = torch.hpu.mem_get_info()
+
     if distributed:
         tensor = torch.tensor(free_gpu_memory, dtype=torch.float32).to(
             torch.device(device, gpu_id)
@@ -412,16 +442,12 @@ def suppress_other_loggers():
     from vllm.logger import logger as vllm_default_logger
 
     vllm_default_logger.setLevel(logging.WARN)
-    logging.getLogger("vllm.config").setLevel(logging.ERROR)
     logging.getLogger("vllm.distributed.device_communicators.pynccl").setLevel(
         logging.WARN
     )
     logging.getLogger("vllm.distributed.device_communicators.shm_broadcast").setLevel(
         logging.WARN
     )
-    logging.getLogger("vllm.selector").setLevel(logging.WARN)
-    logging.getLogger("vllm.utils").setLevel(logging.ERROR)
-    logging.getLogger("vllm.model_executor.model_loader.loader").setLevel(logging.ERROR)
 
     warnings.filterwarnings(
         "ignore", category=UserWarning, message="The given NumPy array is not writable"
@@ -474,27 +500,6 @@ def kill_process_tree(parent_pid, include_parent: bool = True, skip_pid: int = N
         pass
 
 
-def monkey_patch_vllm_model_config():
-    from vllm.config import ModelConfig
-
-    if not hasattr(ModelConfig, "_resolve_task"):
-        return
-
-    def _resolve_task(
-        self,
-        task_option,
-        hf_config,
-    ):
-        supported_tasks = {
-            "generate": True,
-            "embedding": False,
-        }
-        selected_task = "generate"
-        return supported_tasks, selected_task
-
-    setattr(ModelConfig, "_resolve_task", _resolve_task)
-
-
 def monkey_patch_vllm_p2p_access_check(gpu_id: int):
     """
     Monkey patch the slow p2p access check in vllm.
@@ -557,6 +562,29 @@ def monkey_patch_vllm_all_gather(reverse: bool = False):
     setattr(GroupCoordinator, "all_gather", all_gather)
 
 
+def monkey_patch_vllm_gguf_config():
+    from vllm.model_executor.layers.linear import LinearBase
+    from vllm.model_executor.layers.quantization.gguf import (
+        GGUFConfig,
+        GGUFEmbeddingMethod,
+        GGUFLinearMethod,
+    )
+
+    from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+
+    def get_quant_method_with_embedding_replaced(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
+        if isinstance(layer, LinearBase):
+            return GGUFLinearMethod(self)
+        elif isinstance(layer, VocabParallelEmbedding):
+            # patch to own VocabParallelEmbedding
+            return GGUFEmbeddingMethod(self)
+        return None
+
+    setattr(GGUFConfig, "get_quant_method", get_quant_method_with_embedding_replaced)
+
+
 def maybe_set_triton_cache_manager() -> None:
     """Set environment variable to tell Triton to use a
     custom cache manager"""
@@ -862,7 +890,9 @@ def get_amdgpu_memory_capacity():
     try:
         # Run rocm-smi and capture the output
         result = subprocess.run(
-            [
+            [
+                "rocminfo | grep 'gfx' -A 100 | grep 'Pool 1' -A 5 | grep 'Size:' | awk '{print $2}'"
+            ],
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
             shell=True,
@@ -873,9 +903,8 @@ def get_amdgpu_memory_capacity():
 
         # Parse the output to extract memory values in MiB
         memory_values = [
-            float(mem) / 1024
+            float(mem.split("(")[0].strip()) / 1024
             for mem in result.stdout.strip().split("\n")
-            if re.match(r"^\d+(\.\d+)?$", mem.strip())
         ]
 
         if not memory_values:
@@ -922,11 +951,119 @@ def get_nvgpu_memory_capacity():
         )
 
 
+def get_hpu_memory_capacity():
+    try:
+        # Run hl-smi and capture the output
+        result = subprocess.run(
+            ["hl-smi --query | grep 'Total'"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            shell=True,
+            text=True,
+        )
+
+        if result.returncode != 0:
+            raise RuntimeError(f"hl-smi error: {result.stderr.strip()}")
+
+        # Parse the output to extract memory values in MiB
+        memory_values = [
+            float(mem.split(" ")[-2]) for mem in result.stdout.strip().split("\n")
+        ]
+
+        if not memory_values:
+            raise ValueError("No GPU memory values found.")
+
+        # Return the minimum memory value
+        return min(memory_values)
+
+    except FileNotFoundError:
+        raise RuntimeError(
+            "hl-smi not found. Ensure Habana drivers are installed and accessible."
+        )
+
+
+# Copy from pytorch and OpenRLHF to allow creating multiple main groups.
+# https://github.com/pytorch/pytorch/blob/main/torch/distributed/distributed_c10d.py
+# https://github.com/OpenRLHF/OpenRLHF/blob/main/openrlhf/utils/distributed_util.py
+def init_custom_process_group(
+    backend=None,
+    init_method=None,
+    timeout=None,
+    world_size=-1,
+    rank=-1,
+    store=None,
+    group_name=None,
+    pg_options=None,
+):
+    from torch.distributed.distributed_c10d import (
+        Backend,
+        PrefixStore,
+        _new_process_group_helper,
+        _world,
+        default_pg_timeout,
+        rendezvous,
+    )
+
+    assert (store is None) or (
+        init_method is None
+    ), "Cannot specify both init_method and store."
+
+    if store is not None:
+        assert world_size > 0, "world_size must be positive if using store"
+        assert rank >= 0, "rank must be non-negative if using store"
+    elif init_method is None:
+        init_method = "env://"
+
+    if backend:
+        backend = Backend(backend)
+    else:
+        backend = Backend("undefined")
+
+    if timeout is None:
+        timeout = default_pg_timeout
+
+    # backward compatible API
+    if store is None:
+        rendezvous_iterator = rendezvous(init_method, rank, world_size, timeout=timeout)
+        store, rank, world_size = next(rendezvous_iterator)
+        store.set_timeout(timeout)
+
+        # Use a PrefixStore to avoid accidental overrides of keys used by
+        # different systems (e.g. RPC) in case the store is multi-tenant.
+        store = PrefixStore(group_name, store)
+
+    # NOTE: The pg_options parameter was renamed into backend_options in PyTorch 2.6.0
+    # https://github.com/pytorch/pytorch/commit/a0c7029a75628cd5fa8df83c0de0ea98ee7fd844
+    # We need to determine the appropriate parameter name based on PyTorch version
+    pg_options_param_name = (
+        "backend_options" if str(torch.__version__) >= "2.6" else "pg_options"
+    )
+    pg, _ = _new_process_group_helper(
+        world_size,
+        rank,
+        [],
+        backend,
+        store,
+        group_name=group_name,
+        **{pg_options_param_name: pg_options},
+        timeout=timeout,
+    )
+
+    _world.pg_group_ranks[pg] = {i: i for i in range(world_size)}
+
+    return pg
+
+
 def crash_on_warnings():
     # Crash on warning if we are running CI tests
     return get_bool_env_var("SGLANG_IS_IN_CI")
 
 
+def print_warning_once(msg: str) -> None:
+    # Set the stacklevel to 2 to print the caller's line info
+    logger.warning(msg, stacklevel=2)
+
+
 def get_device_name(device_id: int = 0) -> str:
     if hasattr(torch, "cuda") and torch.cuda.is_available():
         return torch.cuda.get_device_name(device_id)
@@ -941,9 +1078,49 @@ def get_device_name(device_id: int = 0) -> str:
         return torch.hpu.get_device_name(device_id)
 
 
+def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
+    major, minor = None, None
+    if hasattr(torch, "cuda") and torch.cuda.is_available():
+        major, minor = torch.cuda.get_device_capability(device_id)
+
+    if hasattr(torch, "hip") and torch.hip.is_available():
+        major, minor = torch.cuda.get_device_capability(device_id)
+
+    if hasattr(torch, "xpu") and torch.xpu.is_available():
+        major, minor, *_ = torch.xpu.get_device_capability(device_id)["version"].split(
+            "."
+        )
+        major, minor = int(major), int(minor)
+
+    # TODO(HandH1998): `get_device_capability` is not supported by `torch.hpu` for now.
+    # Update this once the support is available.
+    if hasattr(torch, "hpu") and torch.hpu.is_available():
+        try:
+            major, minor = torch.hpu.get_device_capability(device_id)
+        except Exception as e:
+            raise RuntimeError(
+                f"An error occurred while getting device capability of hpu: {e}."
+            ) from e
+
+    return major, minor
+
+
+def get_compiler_backend() -> str:
+    if hasattr(torch, "hpu") and torch.hpu.is_available():
+        return "hpu_backend"
+
+    return "inductor"
+
+
 sglang_lib = Library("sglang", "FRAGMENT") # noqa
 
 
+# Some backends use pytorch version < 2.4.0 which doesn't
+# support `torch.library.custom_op`.
+def supports_custom_op() -> bool:
+    return hasattr(torch.library, "custom_op")
+
+
 def direct_register_custom_op(
     op_name: str,
     op_func: Callable,
@@ -1020,3 +1197,93 @@ def set_gpu_proc_affinity(
 def get_bool_env_var(name: str, default: str = "false") -> bool:
     value = os.getenv(name, default)
     return value.lower() in ("true", "1")
+
+
+@lru_cache(maxsize=8)
+def _cuda_device_count_stateless(cuda_visible_devices: Optional[str] = None) -> int:
+    # Note: cuda_visible_devices is not used, but we keep it as an argument for
+    # LRU Cache purposes.
+
+    # Code below is based on
+    # https://github.com/pytorch/pytorch/blob/
+    # c1cd946818442aca8c7f812b16d187ce1586c3bc/
+    # torch/cuda/__init__.py#L831C1-L831C17
+    import torch.cuda
+    import torch.version
+
+    if not torch.cuda._is_compiled():
+        return 0
+    if is_hip():
+        # ROCm uses amdsmi instead of nvml for stateless device count
+        # This requires a sufficiently modern version of Torch 2.4.0
+        raw_count = (
+            torch.cuda._device_count_amdsmi()
+            if (hasattr(torch.cuda, "_device_count_amdsmi"))
+            else -1
+        )
+    else:
+        raw_count = torch.cuda._device_count_nvml()
+    r = torch._C._cuda_getDeviceCount() if raw_count < 0 else raw_count
+    return r
+
+
+# Adapted from https://github.com/vllm-project/vllm/blob/a6221a144af772fd1a68fe7e627935dc53e81738/vllm/utils.py
+def cuda_device_count_stateless() -> int:
+    """Get number of CUDA devices, caching based on the value of
+    CUDA_VISIBLE_DEVICES at the time of call.
+
+    This should be used instead of torch.cuda.device_count()
+    unless CUDA_VISIBLE_DEVICES has already been set to the desired
+    value."""
+
+    # This can be removed and simply replaced with torch.cuda.get_device_count
+    # after https://github.com/pytorch/pytorch/pull/122815 is released.
+    return _cuda_device_count_stateless(os.environ.get("CUDA_VISIBLE_DEVICES", None))
+
+
+def should_use_tensor_core(
+    kv_cache_dtype: torch.dtype,
+    num_attention_heads: int,
+    num_kv_heads: int,
+) -> bool:
+    """
+    Determine whether to use tensor cores for attention computation.
+
+    Args:
+        kv_cache_dtype: Data type of the KV cache
+        num_attention_heads: Number of attention heads
+        num_kv_heads: Number of key/value heads
+
+    Returns:
+        bool: Whether to use tensor cores
+    """
+    # Try to use environment variable first
+    env_override = os.environ.get("SGLANG_FLASHINFER_USE_TENSOR_CORE")
+    if env_override is not None:
+        return env_override.lower() == "true"
+
+    # Try to use _grouped_size_compiled_for_decode_kernels if available
+    # This is for flashinfer <=0.1.6. Otherwise, there is an accuracy bug
+    try:
+        from flashinfer.decode import _grouped_size_compiled_for_decode_kernels
+
+        if not _grouped_size_compiled_for_decode_kernels(
+            num_attention_heads,
+            num_kv_heads,
+        ):
+            return True
+        else:
+            return False
+    except (ImportError, AttributeError):
+        pass
+
+    # Calculate GQA group size
+    gqa_group_size = num_attention_heads // num_kv_heads
+
+    # Determine based on dtype and GQA group size
+    if kv_cache_dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
+        return True
+    elif kv_cache_dtype in (torch.float16, torch.half, torch.bfloat16):
+        return gqa_group_size > 4
+    else:
+        return False