sglang 0.4.5.post2__py3-none-any.whl → 0.4.5.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. sglang/bench_serving.py +3 -2
  2. sglang/compile_deep_gemm.py +136 -0
  3. sglang/lang/backend/openai.py +5 -1
  4. sglang/lang/backend/runtime_endpoint.py +5 -1
  5. sglang/srt/configs/model_config.py +4 -1
  6. sglang/srt/constrained/xgrammar_backend.py +1 -0
  7. sglang/srt/disaggregation/decode.py +43 -0
  8. sglang/srt/disaggregation/mini_lb.py +69 -8
  9. sglang/srt/disaggregation/mooncake/conn.py +1 -1
  10. sglang/srt/disaggregation/nixl/__init__.py +1 -0
  11. sglang/srt/disaggregation/nixl/conn.py +622 -0
  12. sglang/srt/disaggregation/prefill.py +100 -16
  13. sglang/srt/disaggregation/utils.py +17 -0
  14. sglang/srt/entrypoints/engine.py +4 -0
  15. sglang/srt/entrypoints/http_server.py +3 -7
  16. sglang/srt/function_call_parser.py +60 -0
  17. sglang/srt/layers/activation.py +2 -2
  18. sglang/srt/layers/attention/flashattention_backend.py +781 -150
  19. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +5 -5
  20. sglang/srt/layers/attention/triton_ops/extend_attention.py +5 -5
  21. sglang/srt/layers/attention/triton_ops/prefill_attention.py +7 -3
  22. sglang/srt/layers/dp_attention.py +1 -1
  23. sglang/srt/layers/layernorm.py +19 -4
  24. sglang/srt/layers/moe/ep_moe/layer.py +2 -0
  25. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  26. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  27. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +153 -0
  28. sglang/srt/layers/quantization/deep_gemm.py +378 -0
  29. sglang/srt/layers/quantization/fp8_kernel.py +7 -38
  30. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  31. sglang/srt/layers/quantization/gptq.py +13 -7
  32. sglang/srt/layers/quantization/modelopt_quant.py +2 -2
  33. sglang/srt/layers/quantization/w8a8_int8.py +3 -3
  34. sglang/srt/layers/rotary_embedding.py +6 -6
  35. sglang/srt/layers/sampler.py +2 -2
  36. sglang/srt/managers/data_parallel_controller.py +7 -1
  37. sglang/srt/managers/io_struct.py +14 -3
  38. sglang/srt/managers/schedule_batch.py +13 -0
  39. sglang/srt/managers/scheduler.py +16 -6
  40. sglang/srt/managers/tokenizer_manager.py +115 -29
  41. sglang/srt/managers/tp_worker.py +1 -0
  42. sglang/srt/mem_cache/hiradix_cache.py +40 -32
  43. sglang/srt/mem_cache/memory_pool.py +31 -13
  44. sglang/srt/model_executor/cuda_graph_runner.py +13 -8
  45. sglang/srt/model_executor/model_runner.py +19 -4
  46. sglang/srt/models/deepseek_v2.py +9 -6
  47. sglang/srt/models/minicpm3.py +2 -2
  48. sglang/srt/models/minicpmo.py +17 -6
  49. sglang/srt/openai_api/adapter.py +71 -4
  50. sglang/srt/openai_api/protocol.py +6 -1
  51. sglang/srt/server_args.py +52 -40
  52. sglang/srt/speculative/build_eagle_tree.py +2 -2
  53. sglang/srt/speculative/eagle_utils.py +2 -2
  54. sglang/srt/speculative/eagle_worker.py +2 -7
  55. sglang/srt/utils.py +46 -5
  56. sglang/test/test_utils.py +3 -1
  57. sglang/version.py +1 -1
  58. {sglang-0.4.5.post2.dist-info → sglang-0.4.5.post3.dist-info}/METADATA +3 -3
  59. {sglang-0.4.5.post2.dist-info → sglang-0.4.5.post3.dist-info}/RECORD +62 -57
  60. {sglang-0.4.5.post2.dist-info → sglang-0.4.5.post3.dist-info}/WHEEL +0 -0
  61. {sglang-0.4.5.post2.dist-info → sglang-0.4.5.post3.dist-info}/licenses/LICENSE +0 -0
  62. {sglang-0.4.5.post2.dist-info → sglang-0.4.5.post3.dist-info}/top_level.txt +0 -0
sglang/srt/model_executor/model_runner.py CHANGED
@@ -42,6 +42,10 @@ from sglang.srt.layers.dp_attention import (
 )
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.layers.quantization import monkey_patch_isinstance_for_vllm_base_layer
+from sglang.srt.layers.quantization.deep_gemm import (
+    _ENABLE_JIT_DEEPGEMM,
+    update_deep_gemm_config,
+)
 from sglang.srt.layers.sampler import Sampler
 from sglang.srt.layers.torchao_utils import apply_torchao_config_to_model
 from sglang.srt.lora.lora_manager import LoRAManager
@@ -169,6 +173,10 @@ class ModelRunner:
         # Get memory before model loading
         min_per_gpu_memory = self.init_torch_distributed()
 
+        # Update deep gemm configure
+        if _ENABLE_JIT_DEEPGEMM:
+            update_deep_gemm_config(gpu_id, server_args)
+
         # If it is a draft model tp_group can be different.
         self.initialize(min_per_gpu_memory)
 
@@ -221,7 +229,16 @@ class ModelRunner:
         server_args = self.server_args
 
         if server_args.attention_backend is None:
-            # By default, use flashinfer for non-mla attention and triton for mla attention
+            """
+            We auto select the fastest attention backend according to the current offering
+            1. Models with MHA Architecture (e.g: Llama, QWen)
+                1.1 We will turn on FA3 on hopper unless user use spec decode with topk > 1 or page_size > 1.
+                1.2 In other cases, we will use flashinfer if available, otherwise use triton.
+            2. Models with MLA Architecture and using FA3
+                2.1 We will use FA3 backend on hopper.
+                2.2 Otherwise, we will use triton backend.
+            """
+
            if not self.use_mla_backend:
                 if (
                     is_hopper_with_cuda_12_3()
@@ -234,9 +251,7 @@ class ModelRunner:
                     "flashinfer" if is_flashinfer_available() else "triton"
                 )
             else:
-                if is_hopper_with_cuda_12_3() and is_no_spec_infer_or_topk_one(
-                    server_args
-                ):
+                if is_hopper_with_cuda_12_3():
                     server_args.attention_backend = "fa3"
                 else:
                     server_args.attention_backend = "triton"
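
Note: the docstring added above documents the new attention-backend auto-selection. The sketch below is a standalone paraphrase of that decision tree, not the exact condition in model_runner.py (the real check also consults the model architecture and speculative-decoding settings through helpers such as is_fa3_default_architecture and is_no_spec_infer_or_topk_one); the boolean arguments stand in for those helpers.

    # Hedged sketch of the backend selection described in the docstring above.
    def pick_attention_backend(
        use_mla_backend: bool,
        on_hopper_with_cuda_12_3: bool,
        no_spec_or_topk_one: bool,
        page_size: int,
        flashinfer_available: bool,
    ) -> str:
        if not use_mla_backend:
            # MHA models (e.g. Llama, Qwen): FA3 on Hopper unless spec decode
            # uses topk > 1 or page_size > 1; otherwise flashinfer, then triton.
            if on_hopper_with_cuda_12_3 and no_spec_or_topk_one and page_size == 1:
                return "fa3"
            return "flashinfer" if flashinfer_available else "triton"
        # MLA models: FA3 on Hopper, triton elsewhere.
        return "fa3" if on_hopper_with_cuda_12_3 else "triton"
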
sglang/srt/models/deepseek_v2.py CHANGED
@@ -57,8 +57,8 @@ from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher
 from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
 from sglang.srt.layers.moe.topk import select_experts
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.quantization.deep_gemm import _ENABLE_JIT_DEEPGEMM
 from sglang.srt.layers.quantization.fp8_kernel import (
-    _enable_jit_deepgemm_bmm,
     per_tensor_quant_mla_deep_gemm_masked_fp8,
     per_tensor_quant_mla_fp8,
 )
@@ -86,8 +86,11 @@ _is_hip = is_hip()
 _is_cuda = is_cuda()
 
 if _is_cuda:
-    from deep_gemm import m_grouped_gemm_fp8_fp8_bf16_nt_masked
     from sgl_kernel import awq_dequantize, bmm_fp8, merge_state_v2
+
+    from sglang.srt.layers.quantization.deep_gemm import (
+        grouped_gemm_nt_f8f8bf16_masked as deep_gemm_grouped_gemm_nt_f8f8bf16_masked,
+    )
 else:
     from vllm._custom_ops import awq_dequantize
 
@@ -702,7 +705,7 @@ class DeepseekV2AttentionMLA(nn.Module):
            q_nope_out = q_nope.new_empty(
                (self.num_local_heads, aligned_m, self.kv_lora_rank)
            )
-            m_grouped_gemm_fp8_fp8_bf16_nt_masked(
+            deep_gemm_grouped_gemm_nt_f8f8bf16_masked(
                (q_nope_val, q_nope_scale),
                (self.w_kc, self.w_scale_k),
                q_nope_out,
@@ -751,7 +754,7 @@ class DeepseekV2AttentionMLA(nn.Module):
            attn_bmm_output = attn_output.new_empty(
                (self.num_local_heads, aligned_m, self.v_head_dim)
            )
-            m_grouped_gemm_fp8_fp8_bf16_nt_masked(
+            deep_gemm_grouped_gemm_nt_f8f8bf16_masked(
                (attn_output_val, attn_output_scale),
                (self.w_vc, self.w_scale_v),
                attn_bmm_output,
@@ -1520,7 +1523,7 @@ class DeepseekV2ForCausalLM(nn.Module):
 
         if (
             _is_cuda
-            and _enable_jit_deepgemm_bmm
+            and _ENABLE_JIT_DEEPGEMM
             and weight_block_size[0] == 128
             and weight_block_size[1] == 128
             and model_dtype == torch.bfloat16
@@ -1628,7 +1631,7 @@ class DeepseekV2ForCausalLM(nn.Module):
                             f"mlp.experts."
                             f"{self.config.n_routed_experts + num_repeat}"
                             f".{suffix}",
-                            weights_dict[shared_expert_weight_name].clone(),
+                            weights_dict[shared_expert_weight_name],
                         )
                     )
                    names_to_remove += [shared_expert_weight_name]
sglang/srt/models/minicpm3.py CHANGED
@@ -40,9 +40,9 @@ from sglang.srt.layers.vocab_parallel_embedding import (
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
-from sglang.srt.utils import add_prefix, is_cuda_available
+from sglang.srt.utils import add_prefix, is_cuda
 
-if is_cuda_available():
+if is_cuda():
     from sgl_kernel import bmm_fp8
 
 
sglang/srt/models/minicpmo.py CHANGED
@@ -25,7 +25,7 @@ import torch.nn.functional as F
 import torch.nn.utils.parametrize as P
 import torch.types
 from torch import nn
-from torch.nn.utils import weight_norm
+from torch.nn.utils import parametrizations
 from tqdm import tqdm
 from transformers import LlamaConfig, LlamaModel, PretrainedConfig, PreTrainedModel
 from transformers.activations import ACT2FN
@@ -585,7 +585,7 @@ class ConditionalChatTTS(PreTrainedModel):
         self.emb_text = nn.Embedding(config.num_text_tokens, config.hidden_size)
         self.head_code = nn.ModuleList(
             [
-                weight_norm(
+                parametrizations.weight_norm(
                     nn.Linear(config.hidden_size, config.num_audio_tokens, bias=False),
                     name="weight",
                 )
@@ -1859,11 +1859,22 @@ class MiniCPMO(MiniCPMBaseModel):
                 # the checkpoint. Skip them.
                 continue
 
-            # adapt to parametrization
+            # For weight_norm parametrization, handle both old and new formats
             if self.config.init_tts and "tts" in name:
-                name = name.replace(".parametrizations", "")
-                name = name.replace(".weight.original0", ".weight_g")
-                name = name.replace(".weight.original1", ".weight_v")
+                # Handle loading from older checkpoints with weight_g/weight_v format
+                if ".weight_g" in name or ".weight_v" in name:
+                    name = name.replace(
+                        ".weight_g", ".parametrizations.weight.original0"
+                    )
+                    name = name.replace(
+                        ".weight_v", ".parametrizations.weight.original1"
+                    )
+                elif ".weight" in name and name not in params_dict:
+                    param_name = name.replace(
+                        ".weight", ".parametrizations.weight.original0"
+                    )
+                    if param_name in params_dict:
+                        name = param_name
 
             # adapt to VisionAttention
             if "vpm" in name:
sglang/srt/openai_api/adapter.py CHANGED
@@ -938,6 +938,35 @@ def v1_chat_generate_request(
 
         if chat_template_name is None:
             openai_compatible_messages = []
+            if (
+                tools
+                and tokenizer_manager.server_args.tool_call_parser == "deepseekv3"
+            ):
+                # add function call prompt to deepseekv3
+                openai_compatible_messages.append(
+                    {
+                        "role": "system",
+                        "content": """You are a helpful Assistant.
+## Tools
+### Function
+You have the following functions available:
+"""
+                        + "".join(
+                            [
+                                f"""
+- `{tool['name']}`:
+```json
+{json.dumps(tool)}
+```
+"""
+                                for tool in tools
+                            ]
+                        ),
+                    }
+                )
+                # TODO fix the compatible issues with xgrammar
+                strict_tag = None
+
             for message in request.messages:
                 if isinstance(message.content, str):
                     openai_compatible_messages.append(
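
Note: when --tool-call-parser deepseekv3 is active and tools are present, the block above injects a system message listing every tool as an inline JSON snippet. A rough reproduction of that prompt construction for a single made-up tool (the real `tools` list is built from the request's function specs earlier in v1_chat_generate_request):

    import json

    # Hypothetical tool definition for illustration only.
    tools = [{"name": "get_weather", "parameters": {"type": "object", "properties": {"city": {"type": "string"}}}}]

    header = "You are a helpful Assistant.\n## Tools\n### Function\nYou have the following functions available:\n"
    system_content = header + "".join(
        f"\n- `{tool['name']}`:\n```json\n{json.dumps(tool)}\n```\n" for tool in tools
    )
    print(system_content)
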
@@ -950,9 +979,16 @@
                         openai_compatible_messages.append(
                             {"role": message.role, "content": content["text"]}
                         )
-            if openai_compatible_messages[-1]["role"] == "assistant":
-                assistant_prefix = openai_compatible_messages[-1]["content"]
-                openai_compatible_messages = openai_compatible_messages[:-1]
+            if (
+                openai_compatible_messages
+                and openai_compatible_messages[-1]["role"] == "assistant"
+            ):
+                if request.continue_final_message:
+                    # Remove the final assistant message so its content can be continued.
+                    assistant_prefix = openai_compatible_messages[-1]["content"]
+                    openai_compatible_messages = openai_compatible_messages[:-1]
+                else:
+                    assistant_prefix = None
             else:
                 assistant_prefix = None
 
@@ -991,7 +1027,33 @@
             modalities = []
         else:
             conv = generate_chat_conv(request, chat_template_name)
-            prompt = conv.get_prompt()
+            # If we should continue the final assistant message, adjust the conversation.
+            if (
+                request.continue_final_message
+                and request.messages
+                and request.messages[-1].role == "assistant"
+            ):
+                # Remove the auto-added blank assistant turn, if present.
+                if conv.messages and conv.messages[-1][1] is None:
+                    conv.messages.pop()
+                # Rebuild the prompt from the conversation.
+                prompt = conv.get_prompt()
+                # Strip any trailing stop tokens or separators that indicate end-of-assistant.
+                if isinstance(conv.stop_str, list):
+                    for stop_token in conv.stop_str:
+                        if prompt.endswith(stop_token):
+                            prompt = prompt[: -len(stop_token)]
+                elif isinstance(conv.stop_str, str) and prompt.endswith(
+                    conv.stop_str
+                ):
+                    prompt = prompt[: -len(conv.stop_str)]
+                if conv.sep and prompt.endswith(conv.sep):
+                    prompt = prompt[: -len(conv.sep)]
+                if getattr(conv, "sep2", None) and prompt.endswith(conv.sep2):
+                    prompt = prompt[: -len(conv.sep2)]
+            else:
+                prompt = conv.get_prompt()
+
             image_data = conv.image_data
             audio_data = conv.audio_data
             modalities = conv.modalities
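
Note: together with the new continue_final_message request field (see the protocol.py change below), the block above lets a client resume a trailing assistant message instead of opening a new assistant turn. A hedged usage sketch against an OpenAI-compatible sglang endpoint (URL and model name are placeholders):

    import requests

    resp = requests.post(
        "http://localhost:30000/v1/chat/completions",  # placeholder endpoint
        json={
            "model": "default",  # placeholder model name
            "messages": [
                {"role": "user", "content": "Write a haiku about the sea."},
                {"role": "assistant", "content": "Waves fold into foam,"},
            ],
            # Continue the partial assistant text above rather than
            # starting a fresh assistant turn.
            "continue_final_message": True,
        },
    )
    print(resp.json()["choices"][0]["message"]["content"])
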
@@ -1003,6 +1065,7 @@
             else:
                 stop.extend(request.stop)
             prompt_ids = tokenizer_manager.tokenizer.encode(prompt)
+
         else:
             # Use the raw prompt and stop strings if the messages is already a string.
             prompt_ids = request.messages
@@ -1042,6 +1105,8 @@
             sampling_params["json_schema"] = convert_json_schema_to_str(
                 request.response_format.json_schema.schema_
             )
+        elif request.response_format and request.response_format.type == "json_object":
+            sampling_params["json_schema"] = '{"type": "object"}'
         elif (
             request.response_format and request.response_format.type == "structural_tag"
         ):
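
Note: the json_object branch above maps OpenAI's response_format={"type": "json_object"} onto the permissive schema '{"type": "object"}', so the grammar backend only constrains the output to be a JSON object. A minimal request sketch (placeholder endpoint and model):

    import requests

    resp = requests.post(
        "http://localhost:30000/v1/chat/completions",  # placeholder endpoint
        json={
            "model": "default",  # placeholder model name
            "messages": [{"role": "user", "content": "Return a JSON object with keys name and age."}],
            # Internally rewritten to sampling_params["json_schema"] = '{"type": "object"}'.
            "response_format": {"type": "json_object"},
        },
    )
    print(resp.json()["choices"][0]["message"]["content"])
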
@@ -1109,6 +1174,8 @@
            rid=request_ids,
            modalities=modalities_list,
            lora_path=lora_paths,
+            bootstrap_host=all_requests[0].bootstrap_host,
+            bootstrap_room=all_requests[0].bootstrap_room,
        )
 
    return adapted_request, all_requests if len(all_requests) > 1 else all_requests[0]
sglang/srt/openai_api/protocol.py CHANGED
@@ -252,7 +252,7 @@ ChatCompletionMessageContentPart = Union[
 
 class ChatCompletionMessageGenericParam(BaseModel):
     role: Literal["system", "assistant", "tool"]
-    content: Union[str, List[ChatCompletionMessageContentTextPart]]
+    content: Union[str, List[ChatCompletionMessageContentTextPart], None]
 
 
 class ChatCompletionMessageUserParam(BaseModel):
@@ -355,12 +355,17 @@ class ChatCompletionRequest(BaseModel):
     stop_token_ids: Optional[List[int]] = None
     no_stop_trim: bool = False
     ignore_eos: bool = False
+    continue_final_message: bool = False
     skip_special_tokens: bool = True
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
     session_params: Optional[Dict] = None
     separate_reasoning: bool = True
     stream_reasoning: bool = True
 
+    # For PD disaggregation
+    bootstrap_host: Optional[str] = None
+    bootstrap_room: Optional[int] = None
+
 
 class FunctionResponse(BaseModel):
     """Function response."""
sglang/srt/server_args.py CHANGED
@@ -26,11 +26,8 @@ from sglang.srt.hf_transformers_utils import check_gguf_file
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     configure_ipv6,
-    get_amdgpu_memory_capacity,
     get_device,
-    get_hpu_memory_capacity,
-    get_nvgpu_memory_capacity,
-    is_cuda,
+    get_device_memory_capacity,
     is_flashinfer_available,
     is_hip,
     is_port_available,
@@ -49,6 +46,7 @@ class ServerArgs:
     tokenizer_path: Optional[str] = None
     tokenizer_mode: str = "auto"
     skip_tokenizer_init: bool = False
+    enable_tokenizer_batch_encode: bool = False
     load_format: str = "auto"
     trust_remote_code: bool = False
     dtype: str = "auto"
@@ -179,6 +177,8 @@ class ServerArgs:
     tool_call_parser: Optional[str] = None
     enable_hierarchical_cache: bool = False
     hicache_ratio: float = 2.0
+    hicache_size: int = 0
+    hicache_write_policy: str = "write_through_selective"
     flashinfer_mla_disable_ragged: bool = False
     warmups: Optional[str] = None
     moe_dense_tp_size: Optional[int] = None
@@ -218,28 +218,24 @@ class ServerArgs:
         if self.random_seed is None:
             self.random_seed = random.randint(0, 1 << 30)
 
-        if is_cuda():
-            gpu_mem = get_nvgpu_memory_capacity()
-        elif is_hip():
-            gpu_mem = get_amdgpu_memory_capacity()
-        elif self.device == "hpu":
-            gpu_mem = get_hpu_memory_capacity()
-        else:
-            # GPU memory is not known yet or no GPU is available.
-            gpu_mem = None
+        gpu_mem = get_device_memory_capacity(self.device)
 
         # Set mem fraction static, which depends on the tensor parallelism size
         if self.mem_fraction_static is None:
-            if self.tp_size >= 16:
-                self.mem_fraction_static = 0.79
-            elif self.tp_size >= 8:
-                self.mem_fraction_static = 0.81
-            elif self.tp_size >= 4:
-                self.mem_fraction_static = 0.85
-            elif self.tp_size >= 2:
-                self.mem_fraction_static = 0.87
+            if gpu_mem <= 81920:
+                if self.tp_size >= 16:
+                    self.mem_fraction_static = 0.79
+                elif self.tp_size >= 8:
+                    self.mem_fraction_static = 0.81
+                elif self.tp_size >= 4:
+                    self.mem_fraction_static = 0.85
+                elif self.tp_size >= 2:
+                    self.mem_fraction_static = 0.87
+                else:
+                    self.mem_fraction_static = 0.88
             else:
-                self.mem_fraction_static = 0.88
+                # FIXME: more fine grained auto-selection polices
+                self.mem_fraction_static = (gpu_mem - 1024 * 13) / gpu_mem
 
         # Set chunked prefill size, which depends on the gpu memory capacity
         if self.chunked_prefill_size is None:
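
Note: for devices reporting more than 80 GiB, mem_fraction_static is now derived from the reported capacity instead of the TP size. A worked example, assuming gpu_mem is reported in MiB as in get_nvgpu_memory_capacity (the 96 GiB figure is hypothetical):

    gpu_mem = 96 * 1024                                      # 98304 MiB, a hypothetical >80 GiB device
    mem_fraction_static = (gpu_mem - 1024 * 13) / gpu_mem    # reserve ~13 GiB of headroom
    print(round(mem_fraction_static, 4))                     # 0.8646
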
@@ -268,8 +264,6 @@
                 self.cuda_graph_max_bs = 8
             else:
                 self.cuda_graph_max_bs = 80
-        else:
-            self.cuda_graph_max_bs = 160
 
         # Set kernel backends for hpu device
         if self.device == "hpu":
@@ -291,13 +285,6 @@
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"
 
-        # Expert parallelism
-        if self.enable_ep_moe:
-            self.ep_size = self.tp_size
-            logger.info(
-                f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
-            )
-
         self.enable_multimodal: Optional[bool] = self.enable_llama4_multimodal
 
         # Data parallelism attention
@@ -358,7 +345,18 @@
 
         if self.page_size > 1 and self.speculative_eagle_topk > 1:
             self.speculative_eagle_topk = 1
-            logger.info("speculative_eagle_topk is changed to 1 when page_size > 1")
+            logger.info(
+                "speculative_eagle_topk is adjusted to 1 when page_size > 1"
+            )
+
+        if (
+            self.speculative_eagle_topk == 1
+            and self.speculative_num_draft_tokens != self.speculative_num_steps + 1
+        ):
+            logger.info(
+                "speculative_num_draft_tokens is adjusted to speculative_num_steps + 1 when speculative_eagle_topk == 1"
+            )
+            self.speculative_num_draft_tokens = self.speculative_num_steps + 1
 
         # The token generated from the verify step is counted.
         # If sepculative_num_steps >= speculative_num_draft_tokens, the additional tokens will definitely be discarded.
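
Note: the new check above pins speculative_num_draft_tokens to speculative_num_steps + 1 whenever speculative_eagle_topk == 1, since (per the context comment) the token from the verify step is also counted. A tiny worked example of the adjustment:

    speculative_num_steps = 5
    speculative_eagle_topk = 1
    speculative_num_draft_tokens = 4       # inconsistent user-supplied value
    if (
        speculative_eagle_topk == 1
        and speculative_num_draft_tokens != speculative_num_steps + 1
    ):
        speculative_num_draft_tokens = speculative_num_steps + 1
    print(speculative_num_draft_tokens)    # 6
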
@@ -380,14 +378,10 @@
         # PD disaggregation
         if self.disaggregation_mode == "prefill":
             self.disable_cuda_graph = True
-            logger.warning("KV cache is forced as chunk cache for decode server")
-            self.disable_overlap_schedule = True
-            logger.warning("Overlap scheduler is disabled for prefill server")
+            logger.warning("Cuda graph is disabled for prefill server")
         elif self.disaggregation_mode == "decode":
             self.disable_radix_cache = True
-            logger.warning("Cuda graph is disabled for prefill server")
-            self.disable_overlap_schedule = True
-            logger.warning("Overlap scheduler is disabled for decode server")
+            logger.warning("KV cache is forced as chunk cache for decode server")
 
         os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
             "1" if self.enable_torch_compile else "0"
@@ -432,6 +426,11 @@
             action="store_true",
             help="If set, skip init tokenizer and pass input_ids in generate request",
         )
+        parser.add_argument(
+            "--enable-tokenizer-batch-encode",
+            action="store_true",
+            help="Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds.",
+        )
         parser.add_argument(
             "--load-format",
             type=str,
@@ -1087,7 +1086,7 @@
         parser.add_argument(
             "--tool-call-parser",
             type=str,
-            choices=["qwen25", "mistral", "llama3"],
+            choices=["qwen25", "mistral", "llama3", "deepseekv3"],
             default=ServerArgs.tool_call_parser,
             help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', and 'llama3'.",
         )
@@ -1099,10 +1098,22 @@
         parser.add_argument(
             "--hicache-ratio",
             type=float,
-            required=False,
             default=ServerArgs.hicache_ratio,
             help="The ratio of the size of host KV cache memory pool to the size of device pool.",
         )
+        parser.add_argument(
+            "--hicache-size",
+            type=int,
+            default=ServerArgs.hicache_size,
+            help="The size of host KV cache memory pool in gigabytes, which will override the hicache_ratio if set.",
+        )
+        parser.add_argument(
+            "--hicache-write-policy",
+            type=str,
+            choices=["write_back", "write_through", "write_through_selective"],
+            default=ServerArgs.hicache_write_policy,
+            help="The write policy of hierarchical cache.",
+        )
         parser.add_argument(
             "--enable-deepep-moe",
             action="store_true",
@@ -1187,6 +1198,7 @@
             "--disaggregation-transfer-backend",
             type=str,
             default=ServerArgs.disaggregation_transfer_backend,
+            choices=["mooncake", "nixl"],
             help="The backend for disaggregation transfer. Default is mooncake.",
         )
         parser.add_argument(
sglang/srt/speculative/build_eagle_tree.py CHANGED
@@ -4,9 +4,9 @@ from typing import List
 
 import torch
 
-from sglang.srt.utils import is_cuda_available, is_hip
+from sglang.srt.utils import is_cuda, is_hip
 
-if is_cuda_available() or is_hip():
+if is_cuda() or is_hip():
     from sgl_kernel import (
         build_tree_kernel_efficient as sgl_build_tree_kernel_efficient,
     )
sglang/srt/speculative/eagle_utils.py CHANGED
@@ -19,9 +19,9 @@ from sglang.srt.managers.schedule_batch import (
 from sglang.srt.mem_cache.memory_pool import TokenToKVPoolAllocator
 from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode
 from sglang.srt.speculative.build_eagle_tree import build_tree_kernel_efficient
-from sglang.srt.utils import fast_topk, is_cuda_available, is_hip, next_power_of_2
+from sglang.srt.utils import fast_topk, is_cuda, is_hip, next_power_of_2
 
-if is_cuda_available():
+if is_cuda():
     from sgl_kernel import (
         top_k_renorm_prob,
         top_p_renorm_prob,
sglang/srt/speculative/eagle_worker.py CHANGED
@@ -34,14 +34,9 @@ from sglang.srt.speculative.eagle_utils import (
     select_top_k_tokens,
 )
 from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
-from sglang.srt.utils import (
-    empty_context,
-    fast_topk,
-    get_available_gpu_memory,
-    is_cuda_available,
-)
+from sglang.srt.utils import empty_context, fast_topk, get_available_gpu_memory, is_cuda
 
-if is_cuda_available():
+if is_cuda():
     from sgl_kernel import segment_packbits
 
 logger = logging.getLogger(__name__)
sglang/srt/utils.py CHANGED
@@ -78,10 +78,34 @@ time_infos = {}
 
 HIP_FP8_E4M3_FNUZ_MAX = 224.0
 
+_warned_bool_env_var_keys = set()
+
 
 def get_bool_env_var(name: str, default: str = "false") -> bool:
     value = os.getenv(name, default)
-    return value.lower() in ("true", "1")
+    value = value.lower()
+
+    truthy_values = ("true", "1")
+    falsy_values = ("false", "0")
+
+    if (value not in truthy_values) and (value not in falsy_values):
+        if value not in _warned_bool_env_var_keys:
+            logger.warning(
+                f"get_bool_env_var({name}) see non-understandable value={value} and treat as false"
+            )
+            _warned_bool_env_var_keys.add(value)
+
+    return value in truthy_values
+
+
+def get_int_env_var(name: str, default: int = 0) -> int:
+    value = os.getenv(name)
+    if value is None or not value.strip():
+        return default
+    try:
+        return int(value)
+    except ValueError:
+        return default
 
 
 # https://pytorch.org/docs/stable/notes/hip.html#checking-for-hip
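
Note: get_bool_env_var now warns once per unrecognized value and treats it as false, and the new get_int_env_var falls back to its default when the variable is unset, blank, or not an integer. A quick usage sketch (the SGLANG_DEMO_* variable names are made up for illustration):

    import os

    from sglang.srt.utils import get_bool_env_var, get_int_env_var

    os.environ["SGLANG_DEMO_FLAG"] = "yes"      # unrecognized boolean spelling
    os.environ["SGLANG_DEMO_LIMIT"] = "forty"   # not an integer

    get_bool_env_var("SGLANG_DEMO_FLAG")        # warns once, returns False
    get_int_env_var("SGLANG_DEMO_LIMIT", 8)     # parse failure -> default 8
    get_int_env_var("SGLANG_DEMO_MISSING", 8)   # unset -> default 8
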
@@ -130,10 +154,6 @@ def is_flashinfer_available():
     return importlib.util.find_spec("flashinfer") is not None and is_cuda()
 
 
-def is_cuda_available():
-    return is_cuda()
-
-
 _ENABLE_TORCH_INFERENCE_MODE = get_bool_env_var(
     "SGLANG_ENABLE_TORCH_INFERENCE_MODE", "false"
 )
@@ -774,6 +794,8 @@ def add_api_key_middleware(app, api_key: str):
             return await call_next(request)
         if request.url.path.startswith("/health"):
             return await call_next(request)
+        if request.url.path.startswith("/metrics"):
+            return await call_next(request)
         if request.headers.get("Authorization") != "Bearer " + api_key:
             return ORJSONResponse(content={"error": "Unauthorized"}, status_code=401)
         return await call_next(request)
@@ -930,6 +952,8 @@ def get_zmq_socket(
         buf_size = -1
 
     socket = context.socket(socket_type)
+    if endpoint.find("[") != -1:
+        socket.setsockopt(zmq.IPV6, 1)
 
     def set_send_opt():
         socket.setsockopt(zmq.SNDHWM, 0)
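
Note: get_zmq_socket now turns on the ZMQ IPV6 socket option whenever the endpoint contains a bracketed IPv6 literal. A tiny standalone illustration of the same check (binding may still fail if the host has no IPv6 loopback):

    import zmq

    context = zmq.Context()
    socket = context.socket(zmq.PUSH)
    endpoint = "tcp://[::1]:5555"          # bracketed IPv6 literal
    if endpoint.find("[") != -1:
        # Plain sockets are IPv4-only by default; enable IPv6 before bind/connect.
        socket.setsockopt(zmq.IPV6, 1)
    socket.bind(endpoint)
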
@@ -1146,6 +1170,20 @@ def get_hpu_memory_capacity():
     )
 
 
+def get_device_memory_capacity(device: str = None):
+    if is_cuda():
+        gpu_mem = get_nvgpu_memory_capacity()
+    elif is_hip():
+        gpu_mem = get_amdgpu_memory_capacity()
+    elif device == "hpu":
+        gpu_mem = get_hpu_memory_capacity()
+    else:
+        # GPU memory is not known yet or no GPU is available.
+        gpu_mem = None
+
+    return gpu_mem
+
+
 # Copy from pytorch and OpenRLHF to allow creating multiple main groups.
 # https://github.com/pytorch/pytorch/blob/main/torch/distributed/distributed_c10d.py
 # https://github.com/OpenRLHF/OpenRLHF/blob/main/openrlhf/utils/distributed_util.py
@@ -1913,6 +1951,8 @@ def is_page_size_one(server_args):
     return server_args.page_size == 1
 
 
+# TODO(hebiao064): Accelerate FA3 Spec Decode with topk > 1.
+# TODO(hebiao064): Improve the acc rate for FA3 Spec Decode with topk == 1 and page_size > 1.
 def is_no_spec_infer_or_topk_one(server_args):
     return server_args.speculative_eagle_topk is None or (
         server_args.speculative_eagle_topk is not None
@@ -1930,6 +1970,7 @@ def is_fa3_default_architecture(hf_config):
         "Llama4ForConditionalGeneration",
         "LlamaForCausalLM",
         "MistralForCausalLM",
+        "Gemma2ForCausalLM",
     }
     return architectures[0] in default_archs
 
sglang/test/test_utils.py CHANGED
@@ -450,7 +450,9 @@ def popen_launch_server(
 
         return_code = process.poll()
         if return_code is not None:
-            raise Exception(f"Server unexpectedly exits ({return_code=}).")
+            raise Exception(
+                f"Server unexpectedly exits ({return_code=}). Usually there will be error logs describing the cause far above this line."
+            )
 
         time.sleep(10)
 
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.4.5.post2"
+__version__ = "0.4.5.post3"