sglang 0.4.5.post1__py3-none-any.whl → 0.4.5.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. sglang/__init__.py +2 -4
  2. sglang/bench_one_batch.py +2 -2
  3. sglang/bench_serving.py +3 -6
  4. sglang/compile_deep_gemm.py +136 -0
  5. sglang/lang/backend/anthropic.py +0 -4
  6. sglang/lang/backend/base_backend.py +1 -1
  7. sglang/lang/backend/openai.py +6 -2
  8. sglang/lang/backend/runtime_endpoint.py +5 -1
  9. sglang/lang/backend/vertexai.py +0 -1
  10. sglang/lang/compiler.py +1 -7
  11. sglang/lang/tracer.py +3 -7
  12. sglang/srt/_custom_ops.py +0 -2
  13. sglang/srt/configs/model_config.py +4 -1
  14. sglang/srt/constrained/outlines_jump_forward.py +14 -1
  15. sglang/srt/constrained/triton_ops/bitmask_ops.py +141 -0
  16. sglang/srt/constrained/xgrammar_backend.py +27 -4
  17. sglang/srt/custom_op.py +0 -62
  18. sglang/srt/disaggregation/decode.py +105 -6
  19. sglang/srt/disaggregation/mini_lb.py +74 -9
  20. sglang/srt/disaggregation/mooncake/conn.py +33 -63
  21. sglang/srt/disaggregation/mooncake/transfer_engine.py +30 -61
  22. sglang/srt/disaggregation/nixl/__init__.py +1 -0
  23. sglang/srt/disaggregation/nixl/conn.py +622 -0
  24. sglang/srt/disaggregation/prefill.py +137 -17
  25. sglang/srt/disaggregation/utils.py +32 -0
  26. sglang/srt/entrypoints/engine.py +4 -0
  27. sglang/srt/entrypoints/http_server.py +3 -7
  28. sglang/srt/entrypoints/verl_engine.py +7 -5
  29. sglang/srt/function_call_parser.py +60 -0
  30. sglang/srt/layers/activation.py +6 -8
  31. sglang/srt/layers/attention/flashattention_backend.py +883 -209
  32. sglang/srt/layers/attention/flashinfer_backend.py +5 -2
  33. sglang/srt/layers/attention/torch_native_backend.py +6 -1
  34. sglang/srt/layers/attention/triton_backend.py +6 -0
  35. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +5 -5
  36. sglang/srt/layers/attention/triton_ops/extend_attention.py +18 -7
  37. sglang/srt/layers/attention/triton_ops/prefill_attention.py +7 -3
  38. sglang/srt/layers/dp_attention.py +1 -1
  39. sglang/srt/layers/layernorm.py +20 -5
  40. sglang/srt/layers/linear.py +17 -3
  41. sglang/srt/layers/moe/ep_moe/layer.py +17 -29
  42. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  43. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +14 -19
  44. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
  45. sglang/srt/layers/moe/topk.py +27 -30
  46. sglang/srt/layers/parameter.py +0 -2
  47. sglang/srt/layers/quantization/__init__.py +1 -0
  48. sglang/srt/layers/quantization/blockwise_int8.py +2 -0
  49. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +9 -2
  50. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +16 -44
  51. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  52. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +153 -0
  53. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +4 -7
  54. sglang/srt/layers/quantization/deep_gemm.py +378 -0
  55. sglang/srt/layers/quantization/fp8.py +115 -132
  56. sglang/srt/layers/quantization/fp8_kernel.py +213 -88
  57. sglang/srt/layers/quantization/fp8_utils.py +189 -264
  58. sglang/srt/layers/quantization/gptq.py +13 -7
  59. sglang/srt/layers/quantization/modelopt_quant.py +2 -2
  60. sglang/srt/layers/quantization/moe_wna16.py +2 -0
  61. sglang/srt/layers/quantization/utils.py +5 -11
  62. sglang/srt/layers/quantization/w8a8_fp8.py +2 -0
  63. sglang/srt/layers/quantization/w8a8_int8.py +7 -7
  64. sglang/srt/layers/radix_attention.py +15 -0
  65. sglang/srt/layers/rotary_embedding.py +9 -8
  66. sglang/srt/layers/sampler.py +7 -12
  67. sglang/srt/lora/backend/base_backend.py +18 -2
  68. sglang/srt/lora/backend/flashinfer_backend.py +1 -1
  69. sglang/srt/lora/backend/triton_backend.py +1 -1
  70. sglang/srt/lora/layers.py +1 -1
  71. sglang/srt/lora/lora.py +1 -1
  72. sglang/srt/lora/lora_manager.py +1 -1
  73. sglang/srt/managers/data_parallel_controller.py +7 -1
  74. sglang/srt/managers/detokenizer_manager.py +0 -1
  75. sglang/srt/managers/io_struct.py +15 -3
  76. sglang/srt/managers/mm_utils.py +4 -3
  77. sglang/srt/managers/multimodal_processor.py +0 -2
  78. sglang/srt/managers/multimodal_processors/base_processor.py +3 -2
  79. sglang/srt/managers/schedule_batch.py +15 -4
  80. sglang/srt/managers/scheduler.py +28 -77
  81. sglang/srt/managers/tokenizer_manager.py +116 -29
  82. sglang/srt/managers/tp_worker.py +1 -0
  83. sglang/srt/mem_cache/hiradix_cache.py +41 -29
  84. sglang/srt/mem_cache/memory_pool.py +38 -15
  85. sglang/srt/model_executor/cuda_graph_runner.py +15 -10
  86. sglang/srt/model_executor/model_runner.py +39 -31
  87. sglang/srt/models/bert.py +398 -0
  88. sglang/srt/models/deepseek.py +1 -1
  89. sglang/srt/models/deepseek_nextn.py +74 -70
  90. sglang/srt/models/deepseek_v2.py +292 -348
  91. sglang/srt/models/llama.py +5 -5
  92. sglang/srt/models/minicpm3.py +31 -203
  93. sglang/srt/models/minicpmo.py +17 -6
  94. sglang/srt/models/qwen2.py +4 -1
  95. sglang/srt/models/qwen2_moe.py +14 -13
  96. sglang/srt/models/qwen3.py +335 -0
  97. sglang/srt/models/qwen3_moe.py +423 -0
  98. sglang/srt/openai_api/adapter.py +71 -4
  99. sglang/srt/openai_api/protocol.py +6 -1
  100. sglang/srt/reasoning_parser.py +0 -1
  101. sglang/srt/sampling/sampling_batch_info.py +2 -3
  102. sglang/srt/server_args.py +86 -72
  103. sglang/srt/speculative/build_eagle_tree.py +2 -2
  104. sglang/srt/speculative/eagle_utils.py +2 -2
  105. sglang/srt/speculative/eagle_worker.py +6 -14
  106. sglang/srt/utils.py +62 -6
  107. sglang/test/runners.py +5 -1
  108. sglang/test/test_block_fp8.py +167 -0
  109. sglang/test/test_custom_ops.py +1 -1
  110. sglang/test/test_utils.py +3 -1
  111. sglang/version.py +1 -1
  112. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/METADATA +5 -5
  113. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/RECORD +116 -110
  114. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/WHEEL +1 -1
  115. sglang/lang/__init__.py +0 -0
  116. sglang/srt/lora/backend/__init__.py +0 -25
  117. sglang/srt/server.py +0 -18
  118. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/licenses/LICENSE +0 -0
  119. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -26,11 +26,8 @@ from sglang.srt.hf_transformers_utils import check_gguf_file
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     configure_ipv6,
-    get_amdgpu_memory_capacity,
     get_device,
-    get_hpu_memory_capacity,
-    get_nvgpu_memory_capacity,
-    is_cuda,
+    get_device_memory_capacity,
     is_flashinfer_available,
     is_hip,
     is_port_available,
@@ -49,6 +46,7 @@ class ServerArgs:
     tokenizer_path: Optional[str] = None
     tokenizer_mode: str = "auto"
     skip_tokenizer_init: bool = False
+    enable_tokenizer_batch_encode: bool = False
     load_format: str = "auto"
     trust_remote_code: bool = False
     dtype: str = "auto"
@@ -155,7 +153,6 @@ class ServerArgs:
     enable_nccl_nvls: bool = False
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
-    disable_mla: bool = False
     enable_llama4_multimodal: Optional[bool] = None
     disable_overlap_schedule: bool = False
     enable_mixed_chunk: bool = False
@@ -180,13 +177,14 @@ class ServerArgs:
     tool_call_parser: Optional[str] = None
     enable_hierarchical_cache: bool = False
     hicache_ratio: float = 2.0
-    enable_flashinfer_mla: bool = False  # TODO: remove this argument
-    enable_flashmla: bool = False
+    hicache_size: int = 0
+    hicache_write_policy: str = "write_through_selective"
     flashinfer_mla_disable_ragged: bool = False
     warmups: Optional[str] = None
+    moe_dense_tp_size: Optional[int] = None
     n_share_experts_fusion: int = 0
-    disable_shared_experts_fusion: bool = False
     disable_chunked_prefix_cache: bool = False
+    disable_fast_image_processor: bool = False

     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
@@ -197,9 +195,7 @@ class ServerArgs:
     disaggregation_mode: str = "null"
     disaggregation_bootstrap_port: int = 8998
     disaggregation_transfer_backend: str = "mooncake"
-
-    # multimodal
-    disable_fast_image_processor: bool = False
+    disaggregation_ib_device: Optional[str] = None

     def __post_init__(self):
         # Expert parallelism
@@ -222,31 +218,24 @@ class ServerArgs:
         if self.random_seed is None:
             self.random_seed = random.randint(0, 1 << 30)

-        if is_cuda():
-            gpu_mem = get_nvgpu_memory_capacity()
-        elif is_hip():
-            gpu_mem = get_amdgpu_memory_capacity()
-        elif self.device == "hpu":
-            gpu_mem = get_hpu_memory_capacity()
-        else:
-            # GPU memory is not known yet or no GPU is available.
-            gpu_mem = None
-
-        if is_hip():
-            self.disable_shared_experts_fusion = True
+        gpu_mem = get_device_memory_capacity(self.device)

         # Set mem fraction static, which depends on the tensor parallelism size
         if self.mem_fraction_static is None:
-            if self.tp_size >= 16:
-                self.mem_fraction_static = 0.79
-            elif self.tp_size >= 8:
-                self.mem_fraction_static = 0.81
-            elif self.tp_size >= 4:
-                self.mem_fraction_static = 0.85
-            elif self.tp_size >= 2:
-                self.mem_fraction_static = 0.87
+            if gpu_mem <= 81920:
+                if self.tp_size >= 16:
+                    self.mem_fraction_static = 0.79
+                elif self.tp_size >= 8:
+                    self.mem_fraction_static = 0.81
+                elif self.tp_size >= 4:
+                    self.mem_fraction_static = 0.85
+                elif self.tp_size >= 2:
+                    self.mem_fraction_static = 0.87
+                else:
+                    self.mem_fraction_static = 0.88
             else:
-                self.mem_fraction_static = 0.88
+                # FIXME: more fine grained auto-selection polices
+                self.mem_fraction_static = (gpu_mem - 1024 * 13) / gpu_mem

         # Set chunked prefill size, which depends on the gpu memory capacity
         if self.chunked_prefill_size is None:
@@ -257,7 +246,12 @@

         assert self.chunked_prefill_size % self.page_size == 0

-        if self.enable_flashmla is True:
+        assert self.moe_dense_tp_size in {
+            1,
+            None,
+        }, f"moe_dense_tp_size only support 1 and None currently"
+
+        if self.attention_backend == "flashmla":
             logger.warning(
                 "FlashMLA only supports a page_size of 64, change page_size to 64."
             )
@@ -270,8 +264,6 @@
                     self.cuda_graph_max_bs = 8
                 else:
                     self.cuda_graph_max_bs = 80
-            else:
-                self.cuda_graph_max_bs = 160

         # Set kernel backends for hpu device
         if self.device == "hpu":
@@ -293,13 +285,6 @@ class ServerArgs:
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"

-        # Expert parallelism
-        if self.enable_ep_moe:
-            self.ep_size = self.tp_size
-            logger.info(
-                f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
-            )
-
         self.enable_multimodal: Optional[bool] = self.enable_llama4_multimodal

         # Data parallelism attention
@@ -360,7 +345,18 @@ class ServerArgs:

         if self.page_size > 1 and self.speculative_eagle_topk > 1:
             self.speculative_eagle_topk = 1
-            logger.info("speculative_eagle_topk is changed to 1 when page_size > 1")
+            logger.info(
+                "speculative_eagle_topk is adjusted to 1 when page_size > 1"
+            )
+
+        if (
+            self.speculative_eagle_topk == 1
+            and self.speculative_num_draft_tokens != self.speculative_num_steps + 1
+        ):
+            logger.info(
+                "speculative_num_draft_tokens is adjusted to speculative_num_steps + 1 when speculative_eagle_topk == 1"
+            )
+            self.speculative_num_draft_tokens = self.speculative_num_steps + 1

         # The token generated from the verify step is counted.
         # If sepculative_num_steps >= speculative_num_draft_tokens, the additional tokens will definitely be discarded.
@@ -382,18 +378,18 @@ class ServerArgs:
         # PD disaggregation
         if self.disaggregation_mode == "prefill":
             self.disable_cuda_graph = True
-            logger.warning("KV cache is forced as chunk cache for decode server")
-            self.disable_overlap_schedule = True
-            logger.warning("Overlap scheduler is disabled for prefill server")
+            logger.warning("Cuda graph is disabled for prefill server")
         elif self.disaggregation_mode == "decode":
             self.disable_radix_cache = True
-            logger.warning("Cuda graph is disabled for prefill server")
-            self.disable_overlap_schedule = True
-            logger.warning("Overlap scheduler is disabled for decode server")
+            logger.warning("KV cache is forced as chunk cache for decode server")

         os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
             "1" if self.enable_torch_compile else "0"
         )
+        # Set env var before grammar backends init
+        os.environ["SGLANG_DISABLE_OUTLINES_DISK_CACHE"] = (
+            "1" if self.disable_outlines_disk_cache else "0"
+        )

     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -430,6 +426,11 @@ class ServerArgs:
             action="store_true",
             help="If set, skip init tokenizer and pass input_ids in generate request",
         )
+        parser.add_argument(
+            "--enable-tokenizer-batch-encode",
+            action="store_true",
+            help="Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds.",
+        )
         parser.add_argument(
             "--load-format",
             type=str,
@@ -826,7 +827,7 @@ class ServerArgs:
         parser.add_argument(
             "--attention-backend",
             type=str,
-            choices=["flashinfer", "triton", "torch_native", "fa3"],
+            choices=["flashinfer", "triton", "torch_native", "fa3", "flashmla"],
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
         )
@@ -846,13 +847,13 @@ class ServerArgs:
         )
         parser.add_argument(
             "--enable-flashinfer-mla",
-            action="store_true",
-            help="Enable FlashInfer MLA optimization. This argument will be deprecated soon! Please use '--attention-backend flashinfer' instead for switching on flashfiner mla!",
+            action=DeprecatedAction,
+            help="--enable-flashinfer-mla is deprecated. Please use '--attention-backend flashinfer' instead.",
         )
         parser.add_argument(
             "--enable-flashmla",
-            action="store_true",
-            help="Enable FlashMLA decode optimization",
+            action=DeprecatedAction,
+            help="--enable-flashmla is deprecated. Please use '--attention-backend flashmla' instead.",
         )
         parser.add_argument(
             "--flashinfer-mla-disable-ragged",
@@ -977,11 +978,6 @@ class ServerArgs:
             action="store_true",
             help="Disable the custom all-reduce kernel and fall back to NCCL.",
         )
-        parser.add_argument(
-            "--disable-mla",
-            action="store_true",
-            help="Disable Multi-head Latent Attention (MLA) for DeepSeek V2/V3/R1 series models.",
-        )
         parser.add_argument(
             "--enable-llama4-multimodal",
             default=ServerArgs.enable_llama4_multimodal,
@@ -1090,7 +1086,7 @@ class ServerArgs:
         parser.add_argument(
             "--tool-call-parser",
             type=str,
-            choices=["qwen25", "mistral", "llama3"],
+            choices=["qwen25", "mistral", "llama3", "deepseekv3"],
             default=ServerArgs.tool_call_parser,
             help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', and 'llama3'.",
         )
@@ -1102,15 +1098,33 @@ class ServerArgs:
         parser.add_argument(
             "--hicache-ratio",
             type=float,
-            required=False,
             default=ServerArgs.hicache_ratio,
             help="The ratio of the size of host KV cache memory pool to the size of device pool.",
         )
+        parser.add_argument(
+            "--hicache-size",
+            type=int,
+            default=ServerArgs.hicache_size,
+            help="The size of host KV cache memory pool in gigabytes, which will override the hicache_ratio if set.",
+        )
+        parser.add_argument(
+            "--hicache-write-policy",
+            type=str,
+            choices=["write_back", "write_through", "write_through_selective"],
+            default=ServerArgs.hicache_write_policy,
+            help="The write policy of hierarchical cache.",
+        )
         parser.add_argument(
             "--enable-deepep-moe",
             action="store_true",
             help="Enabling DeepEP MoE implementation for EP MoE.",
         )
+        parser.add_argument(
+            "--moe-dense-tp-size",
+            type=int,
+            default=ServerArgs.moe_dense_tp_size,
+            help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
+        )
         parser.add_argument(
             "--deepep-mode",
             type=str,
@@ -1123,18 +1137,18 @@ class ServerArgs:
             "--n-share-experts-fusion",
             type=int,
             default=0,
-            help="The number of shared_experts need to be replica to fuse with normal experts in deepseek v3/r1 "
-            "we use tp_size by default.",
+            help="The number of shared_experts need to be replicated to fuse with normal experts in deepseek v3/r1, "
+            "set it to tp_size can get best optimized performace.",
         )
         parser.add_argument(
-            "--disable-shared-experts-fusion",
+            "--disable-chunked-prefix-cache",
             action="store_true",
-            help="Disable shared experts fusion by setting n_share_experts_fusion to 0.",
+            help="Disable chunked prefix cache feature for deepseek, which should save overhead for short sequences.",
        )
         parser.add_argument(
-            "--disable-chunked-prefix-cache",
+            "--disable-fast-image-processor",
             action="store_true",
-            help="Disable chunked prefix cache feature for deepseek, which should save overhead for short sequences.",
+            help="Adopt base image processor instead of fast image processor.",
         )

         # Server warmups
@@ -1184,14 +1198,14 @@ class ServerArgs:
             "--disaggregation-transfer-backend",
             type=str,
             default=ServerArgs.disaggregation_transfer_backend,
+            choices=["mooncake", "nixl"],
             help="The backend for disaggregation transfer. Default is mooncake.",
         )
-
-        # Multimodal
         parser.add_argument(
-            "--disable-fast-image-processor",
-            action="store_true",
-            help="Adopt base image processor instead of fast image processor.",
+            "--disaggregation-ib-device",
+            type=str,
+            default=ServerArgs.disaggregation_ib_device,
+            help="The ib device for disaggregation transfer. Default is None, it will be detected automatically if using the mooncake backend.",
         )

     @classmethod
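
Note on the hunk above: the mem_fraction_static auto-selection now switches from the pure tp_size table to a memory-headroom formula on devices reporting more than 81920 MiB (80 GiB). A minimal Python sketch of that arithmetic follows; the 144384 MiB device size is an illustrative assumption, not a value from this diff.

# Sketch of the mem_fraction_static fallback added in __post_init__ above.
# gpu_mem is in MiB (81920 == 80 GiB); 144384 is an assumed example value.
def auto_mem_fraction_static(gpu_mem: float, tp_size: int) -> float:
    if gpu_mem <= 81920:
        # Smaller devices keep the original tp_size-based table.
        if tp_size >= 16:
            return 0.79
        elif tp_size >= 8:
            return 0.81
        elif tp_size >= 4:
            return 0.85
        elif tp_size >= 2:
            return 0.87
        return 0.88
    # Larger devices reserve roughly 13 GiB and use the rest for static allocation.
    return (gpu_mem - 1024 * 13) / gpu_mem


print(round(auto_mem_fraction_static(144384, tp_size=8), 3))  # prints 0.908
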
sglang/srt/speculative/build_eagle_tree.py CHANGED
@@ -4,9 +4,9 @@ from typing import List

 import torch

-from sglang.srt.utils import is_cuda_available, is_hip
+from sglang.srt.utils import is_cuda, is_hip

-if is_cuda_available() or is_hip():
+if is_cuda() or is_hip():
     from sgl_kernel import (
         build_tree_kernel_efficient as sgl_build_tree_kernel_efficient,
     )
sglang/srt/speculative/eagle_utils.py CHANGED
@@ -19,9 +19,9 @@ from sglang.srt.managers.schedule_batch import (
 from sglang.srt.mem_cache.memory_pool import TokenToKVPoolAllocator
 from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode
 from sglang.srt.speculative.build_eagle_tree import build_tree_kernel_efficient
-from sglang.srt.utils import fast_topk, is_cuda_available, is_hip, next_power_of_2
+from sglang.srt.utils import fast_topk, is_cuda, is_hip, next_power_of_2

-if is_cuda_available():
+if is_cuda():
     from sgl_kernel import (
         top_k_renorm_prob,
         top_p_renorm_prob,
sglang/srt/speculative/eagle_worker.py CHANGED
@@ -34,14 +34,9 @@ from sglang.srt.speculative.eagle_utils import (
     select_top_k_tokens,
 )
 from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
-from sglang.srt.utils import (
-    empty_context,
-    fast_topk,
-    get_available_gpu_memory,
-    is_cuda_available,
-)
+from sglang.srt.utils import empty_context, fast_topk, get_available_gpu_memory, is_cuda

-if is_cuda_available():
+if is_cuda():
     from sgl_kernel import segment_packbits

 logger = logging.getLogger(__name__)
@@ -271,14 +266,11 @@ class EAGLEWorker(TpModelWorker):
             )
         elif batch.forward_mode.is_idle():
             model_worker_batch = batch.get_model_worker_batch()
-            logits_output, next_token_ids, _ = (
-                self.target_worker.forward_batch_generation(
-                    ForwardBatch.init_new(
-                        model_worker_batch, self.target_worker.model_runner
-                    )
-                )
+            logits_output, next_token_ids = self.target_worker.forward_batch_generation(
+                model_worker_batch
             )
-            return logits_output, next_token_ids, model_worker_batch.bid, 0, False
+
+            return logits_output, next_token_ids, model_worker_batch.bid, 0
         else:
             logits_output, next_token_ids, bid = self.forward_target_extend(batch)
             with self.draft_tp_context(self.draft_model_runner.tp_group):
sglang/srt/utils.py CHANGED
@@ -55,7 +55,6 @@ import torch.distributed
 import torch.distributed as dist
 import triton
 import zmq
-from decord import VideoReader, cpu
 from fastapi.responses import ORJSONResponse
 from packaging import version as pkg_version
 from PIL import Image
@@ -79,10 +78,34 @@ time_infos = {}

 HIP_FP8_E4M3_FNUZ_MAX = 224.0

+_warned_bool_env_var_keys = set()
+

 def get_bool_env_var(name: str, default: str = "false") -> bool:
     value = os.getenv(name, default)
-    return value.lower() in ("true", "1")
+    value = value.lower()
+
+    truthy_values = ("true", "1")
+    falsy_values = ("false", "0")
+
+    if (value not in truthy_values) and (value not in falsy_values):
+        if value not in _warned_bool_env_var_keys:
+            logger.warning(
+                f"get_bool_env_var({name}) see non-understandable value={value} and treat as false"
+            )
+        _warned_bool_env_var_keys.add(value)
+
+    return value in truthy_values
+
+
+def get_int_env_var(name: str, default: int = 0) -> int:
+    value = os.getenv(name)
+    if value is None or not value.strip():
+        return default
+    try:
+        return int(value)
+    except ValueError:
+        return default


 # https://pytorch.org/docs/stable/notes/hip.html#checking-for-hip
@@ -131,10 +154,6 @@ def is_flashinfer_available():
     return importlib.util.find_spec("flashinfer") is not None and is_cuda()


-def is_cuda_available():
-    return is_cuda()
-
-
 _ENABLE_TORCH_INFERENCE_MODE = get_bool_env_var(
     "SGLANG_ENABLE_TORCH_INFERENCE_MODE", "false"
 )
@@ -545,6 +564,9 @@ def load_audio(audio_file: str, sr: int = 16000, mono: bool = True) -> np.ndarra


 def encode_video(video_path, frame_count_limit=None):
+    # Lazy import because decord is not available on some arm platforms.
+    from decord import VideoReader, cpu
+
     if not os.path.exists(video_path):
         logger.error(f"Video {video_path} does not exist")
         return []
@@ -772,6 +794,8 @@ def add_api_key_middleware(app, api_key: str):
             return await call_next(request)
         if request.url.path.startswith("/health"):
             return await call_next(request)
+        if request.url.path.startswith("/metrics"):
+            return await call_next(request)
         if request.headers.get("Authorization") != "Bearer " + api_key:
             return ORJSONResponse(content={"error": "Unauthorized"}, status_code=401)
         return await call_next(request)
@@ -928,6 +952,8 @@ def get_zmq_socket(
         buf_size = -1

     socket = context.socket(socket_type)
+    if endpoint.find("[") != -1:
+        socket.setsockopt(zmq.IPV6, 1)

     def set_send_opt():
         socket.setsockopt(zmq.SNDHWM, 0)
@@ -1144,6 +1170,20 @@ def get_hpu_memory_capacity():
     )


+def get_device_memory_capacity(device: str = None):
+    if is_cuda():
+        gpu_mem = get_nvgpu_memory_capacity()
+    elif is_hip():
+        gpu_mem = get_amdgpu_memory_capacity()
+    elif device == "hpu":
+        gpu_mem = get_hpu_memory_capacity()
+    else:
+        # GPU memory is not known yet or no GPU is available.
+        gpu_mem = None
+
+    return gpu_mem
+
+
 # Copy from pytorch and OpenRLHF to allow creating multiple main groups.
 # https://github.com/pytorch/pytorch/blob/main/torch/distributed/distributed_c10d.py
 # https://github.com/OpenRLHF/OpenRLHF/blob/main/openrlhf/utils/distributed_util.py
@@ -1911,6 +1951,8 @@ def is_page_size_one(server_args):
     return server_args.page_size == 1


+# TODO(hebiao064): Accelerate FA3 Spec Decode with topk > 1.
+# TODO(hebiao064): Improve the acc rate for FA3 Spec Decode with topk == 1 and page_size > 1.
 def is_no_spec_infer_or_topk_one(server_args):
     return server_args.speculative_eagle_topk is None or (
         server_args.speculative_eagle_topk is not None
@@ -1928,5 +1970,19 @@ def is_fa3_default_architecture(hf_config):
         "Llama4ForConditionalGeneration",
         "LlamaForCausalLM",
         "MistralForCausalLM",
+        "Gemma2ForCausalLM",
     }
     return architectures[0] in default_archs
+
+
+# Can be more general if it is used in multiple places (keep it simple and thus not general now)
+class BumpAllocator:
+    def __init__(self, buffer_size: int, dtype, device):
+        self._buffer = torch.zeros((buffer_size,), dtype=dtype, device=device)
+        self._pointer = 0
+
+    def allocate(self, size: int):
+        assert self._pointer + size <= len(self._buffer)
+        output = self._buffer[self._pointer : self._pointer + size]
+        self._pointer += size
+        return output
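
The helpers added above can be exercised directly. The following is a minimal, hedged sketch, assuming sglang 0.4.5.post3 and its dependencies are importable; the environment-variable name is hypothetical and chosen only for the example.

import os

import torch

from sglang.srt.utils import BumpAllocator, get_bool_env_var

# Unrecognized values fall back to false and log a one-time warning;
# "SGLANG_EXAMPLE_FLAG" is a made-up name used only for illustration.
os.environ["SGLANG_EXAMPLE_FLAG"] = "maybe"
assert get_bool_env_var("SGLANG_EXAMPLE_FLAG") is False

# BumpAllocator hands out consecutive views of one preallocated buffer.
allocator = BumpAllocator(buffer_size=16, dtype=torch.int64, device="cpu")
a = allocator.allocate(4)  # elements [0:4]
b = allocator.allocate(8)  # elements [4:12]; no extra allocation happens
assert a.numel() == 4 and b.numel() == 8
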
sglang/test/runners.py CHANGED
@@ -26,8 +26,8 @@ from transformers import (
     AutoProcessor,
 )

+from sglang.srt.entrypoints.engine import Engine
 from sglang.srt.hf_transformers_utils import get_tokenizer
-from sglang.srt.server import Engine
 from sglang.srt.utils import load_image
 from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER, calculate_rouge_l

@@ -51,6 +51,8 @@ NUM_TOP_LOGPROBS = 5
 def get_dtype_str(torch_dtype):
     if torch_dtype is torch.float16:
         return "float16"
+    if torch_dtype is torch.float32:
+        return "float32"
     else:
         raise NotImplementedError()

@@ -447,6 +449,7 @@ class SRTRunner:
         port: int = DEFAULT_PORT_FOR_SRT_TEST_RUNNER,
         lora_paths: List[str] = None,
         max_loras_per_batch: int = 4,
+        attention_backend: Optional[str] = None,
         lora_backend: str = "triton",
         disable_cuda_graph: bool = False,
         disable_radix_cache: bool = False,
@@ -487,6 +490,7 @@
             lora_paths=lora_paths,
             max_loras_per_batch=max_loras_per_batch,
             lora_backend=lora_backend,
+            attention_backend=attention_backend,
             disable_cuda_graph=disable_cuda_graph,
             disable_radix_cache=disable_radix_cache,
             chunked_prefill_size=chunked_prefill_size,
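
As a small, hedged check of the get_dtype_str change in sglang/test/runners.py above (assuming the package and its test dependencies are importable in the current environment):

import torch

from sglang.test.runners import get_dtype_str

# float16 was already handled; float32 support is what this diff adds.
assert get_dtype_str(torch.float16) == "float16"
assert get_dtype_str(torch.float32) == "float32"
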