sglang-0.4.2.post4-py3-none-any.whl → sglang-0.4.3.post1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/global_config.py +2 -0
- sglang/lang/backend/openai.py +5 -0
- sglang/lang/chat_template.py +22 -7
- sglang/lang/ir.py +1 -0
- sglang/srt/configs/__init__.py +6 -3
- sglang/srt/configs/model_config.py +2 -0
- sglang/srt/configs/qwen2_5_vl_config.py +1003 -0
- sglang/srt/entrypoints/engine.py +18 -3
- sglang/srt/hf_transformers_utils.py +2 -3
- sglang/srt/layers/attention/flashinfer_backend.py +235 -110
- sglang/srt/layers/attention/triton_backend.py +358 -72
- sglang/srt/layers/attention/triton_ops/extend_attention.py +4 -4
- sglang/srt/layers/linear.py +12 -5
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +2 -2
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +2 -2
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +178 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +175 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -2
- sglang/srt/layers/moe/fused_moe_triton/layer.py +2 -0
- sglang/srt/layers/moe/topk.py +1 -1
- sglang/srt/layers/quantization/__init__.py +51 -5
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +30 -30
- sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +29 -29
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +33 -33
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +31 -31
- sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +27 -27
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +31 -31
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +24 -24
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +30 -30
- sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +42 -42
- sglang/srt/layers/quantization/fp8_kernel.py +123 -17
- sglang/srt/layers/quantization/fp8_utils.py +33 -4
- sglang/srt/managers/detokenizer_manager.py +1 -0
- sglang/srt/managers/image_processor.py +217 -122
- sglang/srt/managers/io_struct.py +4 -0
- sglang/srt/managers/schedule_batch.py +16 -3
- sglang/srt/managers/scheduler.py +29 -0
- sglang/srt/managers/tokenizer_manager.py +6 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +4 -0
- sglang/srt/model_executor/cuda_graph_runner.py +12 -1
- sglang/srt/model_executor/forward_batch_info.py +4 -1
- sglang/srt/model_executor/model_runner.py +12 -2
- sglang/srt/models/deepseek_nextn.py +295 -0
- sglang/srt/models/deepseek_v2.py +21 -8
- sglang/srt/models/llava.py +2 -1
- sglang/srt/models/qwen2_5_vl.py +722 -0
- sglang/srt/models/qwen2_vl.py +2 -1
- sglang/srt/openai_api/adapter.py +17 -3
- sglang/srt/server_args.py +26 -4
- sglang/srt/speculative/eagle_worker.py +35 -10
- sglang/srt/speculative/spec_info.py +11 -1
- sglang/srt/utils.py +7 -0
- sglang/utils.py +99 -19
- sglang/version.py +1 -1
- {sglang-0.4.2.post4.dist-info → sglang-0.4.3.post1.dist-info}/METADATA +5 -4
- {sglang-0.4.2.post4.dist-info → sglang-0.4.3.post1.dist-info}/RECORD +73 -55
- sglang/srt/configs/qwen2vl.py +0 -130
- {sglang-0.4.2.post4.dist-info → sglang-0.4.3.post1.dist-info}/LICENSE +0 -0
- {sglang-0.4.2.post4.dist-info → sglang-0.4.3.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.2.post4.dist-info → sglang-0.4.3.post1.dist-info}/top_level.txt +0 -0
sglang/srt/models/qwen2_vl.py
CHANGED
@@ -31,8 +31,9 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange
+from transformers import Qwen2VLConfig
+from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLVisionConfig

-from sglang.srt.configs import Qwen2VLConfig, Qwen2VLVisionConfig
 from sglang.srt.hf_transformers_utils import get_processor
 from sglang.srt.layers.activation import QuickGELU
 from sglang.srt.layers.attention.vision import VisionAttention
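The model now imports the Qwen2-VL config classes from transformers instead of sglang's vendored copy, which is deleted in this release (see the sglang/srt/configs/qwen2vl.py +0 -130 entry above). A minimal sketch of the new resolution path, assuming a transformers release that ships Qwen2-VL:

    from transformers import Qwen2VLConfig

    cfg = Qwen2VLConfig()  # resolves from transformers, not sglang's removed copy
    print(type(cfg).__module__)  # transformers.models.qwen2_vl.configuration_qwen2_vl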
sglang/srt/openai_api/adapter.py
CHANGED
@@ -20,12 +20,14 @@ import os
 import time
 import uuid
 from http import HTTPStatus
-from typing import Dict, List
+from typing import Dict, List

 from fastapi import HTTPException, Request, UploadFile
 from fastapi.responses import ORJSONResponse, StreamingResponse
 from pydantic import ValidationError

+from sglang.lang.chat_template import get_chat_template_by_model_path
+
 try:
     from outlines.fsm.json_schema import convert_json_schema_to_str
 except ImportError:

@@ -92,7 +94,6 @@ file_id_response: Dict[str, FileResponse] = {}
 # map file id to file path in SGLang backend
 file_id_storage: Dict[str, str] = {}

-
 # backend storage directory
 storage_dir = None

@@ -116,12 +117,13 @@ def create_streaming_error_response(
     return json_str


-def load_chat_template_for_openai_api(tokenizer_manager, chat_template_arg):
+def load_chat_template_for_openai_api(tokenizer_manager, chat_template_arg, model_path):
     global chat_template_name

     logger.info(
         f"Use chat template for the OpenAI-compatible API server: {chat_template_arg}"
     )
+
     if not chat_template_exists(chat_template_arg):
         if not os.path.exists(chat_template_arg):
             raise RuntimeError(

@@ -163,6 +165,18 @@ def load_chat_template_for_openai_api(tokenizer_manager, chat_template_arg):
     else:
         chat_template_name = chat_template_arg

+    # check chat-template
+    chat_template = get_chat_template_by_model_path(model_path)
+    if chat_template is not None:
+        official_chat_template = chat_template.name
+        used_chat_template = chat_template_name
+        if official_chat_template != used_chat_template:
+            logger.warning(
+                f"Using a chat_template: '{used_chat_template}', "
+                f"which is different from official chat template: '{official_chat_template}', "
+                f"This discrepancy may lead to performance degradation."
+            )
+

 async def v1_files_create(file: UploadFile, purpose: str, file_storage_pth: str = None):
     try:
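The server now compares the user-supplied chat template against the one registered for the model path and warns on mismatch. A small, hedged illustration of the lookup it relies on (the call signature comes from the diff; the model path below is a made-up example):

    from sglang.lang.chat_template import get_chat_template_by_model_path

    tmpl = get_chat_template_by_model_path("meta-llama/Llama-3.1-8B-Instruct")
    if tmpl is not None:
        print(tmpl.name)  # the "official" template name the new warning compares against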
sglang/srt/server_args.py
CHANGED
@@ -140,6 +140,7 @@ class ServerArgs:
     disable_jump_forward: bool = False
     disable_cuda_graph: bool = False
     disable_cuda_graph_padding: bool = False
+    enable_nccl_nvls: bool = False
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
     disable_mla: bool = False

@@ -160,12 +161,15 @@ class ServerArgs:
     delete_ckpt_after_loading: bool = False
     enable_memory_saver: bool = False
     allow_auto_truncate: bool = False
+    return_hidden_states: bool = False

     # Custom logit processor
     enable_custom_logit_processor: bool = False
     tool_call_parser: str = None
     enable_hierarchical_cache: bool = False

+    enable_flashinfer_mla: bool = False
+
     def __post_init__(self):
         # Set missing default values
         if self.tokenizer_path is None:

@@ -258,14 +262,17 @@ class ServerArgs:
         )

         # Speculative Decoding
-        if self.speculative_algorithm == "EAGLE":
+        if (
+            self.speculative_algorithm == "EAGLE"
+            or self.speculative_algorithm == "NEXTN"
+        ):
             self.prefill_only_one_req = True
             self.disable_cuda_graph_padding = True
             self.disable_radix_cache = True
             self.disable_overlap_schedule = True
             self.chunked_prefill_size = -1
             logger.info(
-                "The radix cache, chunked prefill, and overlap scheduler are disabled because of using eagle speculative decoding."
+                f"The radix cache, chunked prefill, and overlap scheduler are disabled because of using {self.speculative_algorithm} speculative decoding."
             )

         # GGUF

@@ -691,12 +698,17 @@ class ServerArgs:
             default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
+        parser.add_argument(
+            "--enable-flashinfer-mla",
+            action="store_true",
+            help="Enable FlashInfer MLA optimization",
+        )

         # Speculative decoding
         parser.add_argument(
             "--speculative-algorithm",
             type=str,
-            choices=["EAGLE"],
+            choices=["EAGLE", "NEXTN"],
             help="Speculative algorithm.",
         )
         parser.add_argument(

@@ -782,6 +794,11 @@ class ServerArgs:
             action="store_true",
             help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.",
         )
+        parser.add_argument(
+            "--enable-nccl-nvls",
+            action="store_true",
+            help="Enable NCCL NVLS for prefill heavy requests when available.",
+        )
         parser.add_argument(
             "--disable-outlines-disk-cache",
             action="store_true",

@@ -795,7 +812,7 @@ class ServerArgs:
         parser.add_argument(
             "--disable-mla",
             action="store_true",
-            help="Disable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
+            help="Disable Multi-head Latent Attention (MLA) for DeepSeek V2/V3/R1 series models.",
         )
         parser.add_argument(
             "--disable-overlap-schedule",

@@ -896,6 +913,11 @@ class ServerArgs:
             action="store_true",
             help="Enable users to pass custom logit processors to the server (disabled by default for security)",
         )
+        parser.add_argument(
+            "--return-hidden-states",
+            action="store_true",
+            help="Return hidden states in the response.",
+        )
         # Function Calling
         parser.add_argument(
             "--tool-call-parser",
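Taken together, this release adds four user-facing options: the NEXTN speculative algorithm, --enable-flashinfer-mla, --enable-nccl-nvls, and --return-hidden-states. A hedged sketch of setting them programmatically (field names are from the diff above; the model path and values are illustrative, not a recommended configuration):

    from sglang.srt.server_args import ServerArgs

    args = ServerArgs(
        model_path="deepseek-ai/DeepSeek-V3",  # illustrative model
        speculative_algorithm="NEXTN",         # new choice alongside EAGLE
        enable_flashinfer_mla=True,            # new flag
        enable_nccl_nvls=False,                # new flag
        return_hidden_states=False,            # new flag
    )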
sglang/srt/speculative/eagle_worker.py
CHANGED
@@ -24,6 +24,7 @@ from sglang.srt.speculative.eagle_utils import (
     fast_topk,
     select_top_k_tokens,
 )
+from sglang.srt.speculative.spec_info import SpeculativeAlgorithm

 logger = logging.getLogger(__name__)

@@ -57,23 +58,43 @@ class EAGLEWorker(TpModelWorker):
         # Parse arguments
         self.topk = server_args.speculative_eagle_topk
         self.speculative_num_steps = server_args.speculative_num_steps
+        self.speculative_algorithm = SpeculativeAlgorithm.from_string(
+            server_args.speculative_algorithm
+        )
         self.server_args = server_args

         # Share the embedding and lm_head
-        embed, head = self.target_worker.model_runner.model.get_embed_and_head()
-        self.model_runner.model.set_embed_and_head(embed, head)
+        if not self.speculative_algorithm.is_nextn():
+            embed, head = self.target_worker.model_runner.model.get_embed_and_head()
+            self.model_runner.model.set_embed_and_head(embed, head)
         self.model_runner.server_args.disable_cuda_graph = backup_disable_cuda_graph

         # Create multi-step attn backends and cuda graph runners
-        from sglang.srt.layers.attention.flashinfer_backend import (
-            FlashInferMultiStepDraftBackend,
-        )
-        self.draft_attn_backend = FlashInferMultiStepDraftBackend(
-            self.model_runner,
-            self.topk,
-            self.speculative_num_steps,
-        )
+        if server_args.attention_backend == "flashinfer":
+            from sglang.srt.layers.attention.flashinfer_backend import (
+                FlashInferMultiStepDraftBackend,
+            )
+
+            self.draft_attn_backend = FlashInferMultiStepDraftBackend(
+                self.model_runner,
+                self.topk,
+                self.speculative_num_steps,
+            )
+        elif server_args.attention_backend == "triton":
+            from sglang.srt.layers.attention.triton_backend import (
+                TritonMultiStepDraftBackend,
+            )
+
+            self.draft_attn_backend = TritonMultiStepDraftBackend(
+                self.model_runner,
+                self.topk,
+                self.speculative_num_steps,
+            )
+        else:
+            raise ValueError(
+                f"EAGLE is not supportted in attention backend {server_args.attention_backend}"
+            )

         self.model_runner.draft_attn_backend = self.draft_attn_backend
         self.init_cuda_graphs()

@@ -218,6 +239,10 @@ class EAGLEWorker(TpModelWorker):
             token_list.append(tree_info[1])
             parents_list.append(tree_info[2])

+            # we don't need to run the last forward. we get 1 token from draft prefill and (#spec steps - 1) tokens here
+            if i == self.speculative_num_steps - 1:
+                break
+
             # Set inputs
             forward_batch.input_ids = input_ids
             forward_batch.out_cache_loc = out_cache_loc[
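The draft model's multi-step attention backend now follows --attention-backend, so EAGLE-style drafting is no longer tied to FlashInfer. A hedged configuration sketch (field names from the diff; model path illustrative, and a real EAGLE run would also need a draft model path):

    from sglang.srt.server_args import ServerArgs

    args = ServerArgs(
        model_path="meta-llama/Llama-2-7b-chat-hf",  # illustrative target model
        speculative_algorithm="EAGLE",
        attention_backend="triton",  # now dispatches to TritonMultiStepDraftBackend
    )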
sglang/srt/speculative/spec_info.py
CHANGED
@@ -5,18 +5,28 @@ class SpeculativeAlgorithm(IntEnum):
     NONE = auto()
     EAGLE = auto()

+    # NEXTN spec decoding is for DeepSeek V3/R1
+    # currently it's implemented based on EAGLE
+    NEXTN = auto()
+
     def is_none(self):
         return self == SpeculativeAlgorithm.NONE

     def is_eagle(self):
-        return self == SpeculativeAlgorithm.EAGLE
+        return self == SpeculativeAlgorithm.EAGLE or self == SpeculativeAlgorithm.NEXTN
+
+    def is_nextn(self):
+        return self == SpeculativeAlgorithm.NEXTN

     @staticmethod
     def from_string(name: str):
         name_map = {
             "EAGLE": SpeculativeAlgorithm.EAGLE,
+            "NEXTN": SpeculativeAlgorithm.NEXTN,
             None: SpeculativeAlgorithm.NONE,
         }
+        if name is not None:
+            name = name.upper()
         return name_map[name]
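A quick sketch of the resulting behavior (the calls match the diff above):

    from sglang.srt.speculative.spec_info import SpeculativeAlgorithm

    alg = SpeculativeAlgorithm.from_string("nextn")  # lookup is now case-insensitive
    assert alg.is_nextn()
    assert alg.is_eagle()  # NEXTN is treated as EAGLE-based throughout the worker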
sglang/srt/utils.py
CHANGED
@@ -1444,3 +1444,10 @@ def launch_dummy_health_check_server(host, port):
         timeout_keep_alive=5,
         loop="uvloop",
     )
+
+
+def set_cuda_arch():
+    if is_flashinfer_available():
+        capability = torch.cuda.get_device_capability()
+        arch = f"{capability[0]}.{capability[1]}"
+        os.environ["TORCH_CUDA_ARCH_LIST"] = f"{arch}{'+PTX' if arch == '9.0' else ''}"
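The new helper pins TORCH_CUDA_ARCH_LIST to the local GPU's compute capability, appending +PTX only on 9.0. A hedged usage sketch (behavior read off the diff; the example outputs assume specific GPUs):

    import os
    from sglang.srt.utils import set_cuda_arch

    set_cuda_arch()  # no-op unless FlashInfer is available
    print(os.environ.get("TORCH_CUDA_ARCH_LIST"))  # e.g. "9.0+PTX" on H100, "8.0" on A100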
sglang/utils.py
CHANGED
@@ -306,22 +306,112 @@ def download_and_cache_file(url: str, filename: Optional[str] = None):
     return filename


+import fcntl
+
+
+def is_in_ci():
+    from sglang.test.test_utils import is_in_ci
+
+    return is_in_ci()
+
+
+LOCKFILE = os.path.expanduser("~/.sglang_port_lock")
+PORT_REGISTRY = os.path.expanduser("~/.sglang_port_registry.json")
+
+if not os.path.exists(LOCKFILE):
+    with open(LOCKFILE, "w") as f:
+        pass
+
+if not os.path.exists(PORT_REGISTRY):
+    with open(PORT_REGISTRY, "w") as f:
+        json.dump([], f)
+
+
+def print_highlight(html_content: str):
+    if is_in_ci():
+        html_content = str(html_content).replace("\n", "<br>")
+        display(HTML(f"<strong style='color: #00008B;'>{html_content}</strong>"))
+    else:
+        print(html_content)
+
+
+def init_port_registry():
+    """Initialize the port registry file if it doesn't exist."""
+    if not os.path.exists(PORT_REGISTRY):
+        with open(PORT_REGISTRY, "w") as f:
+            json.dump([], f)
+
+
+def reserve_port(start=30000, end=40000):
+    """
+    Reserve an available port using a file lock and a registry.
+    Returns the allocated port.
+    """
+    init_port_registry()
+    with open(LOCKFILE, "w") as lock:
+        fcntl.flock(lock, fcntl.LOCK_EX)
+        try:
+            with open(PORT_REGISTRY, "r") as f:
+                used = json.load(f)
+        except Exception:
+            used = []
+        for port in range(start, end):
+            if port not in used:
+                used.append(port)
+                with open(PORT_REGISTRY, "w") as f:
+                    json.dump(used, f)
+                return port
+        raise RuntimeError("No free port available")
+
+
+def release_port(port):
+    """Release the reserved port by removing it from the registry."""
+    with open(LOCKFILE, "w") as lock:
+        fcntl.flock(lock, fcntl.LOCK_EX)
+        try:
+            with open(PORT_REGISTRY, "r") as f:
+                used = json.load(f)
+        except Exception:
+            used = []
+        if port in used:
+            used.remove(port)
+            with open(PORT_REGISTRY, "w") as f:
+                json.dump(used, f)
+
+
 def execute_shell_command(command: str) -> subprocess.Popen:
     """
-
-    Returns:
-        subprocess.Popen: Process handle
+    Execute a shell command and return its process handle.
     """
+    # Replace newline continuations and split the command string.
     command = command.replace("\\\n", " ").replace("\\", " ")
     parts = command.split()
-
     return subprocess.Popen(parts, text=True, stderr=subprocess.STDOUT)


+def launch_server_cmd(command: str, host: str = "0.0.0.0", port: int = None):
+    """
+    Launch the server using the given command.
+    If no port is specified, a free port is reserved.
+    """
+    if port is None:
+        port = reserve_port()
+    full_command = f"{command} --port {port}"
+    process = execute_shell_command(full_command)
+    return process, port
+
+
+def terminate_process(process, port=None):
+    """
+    Terminate the process and, if a port was reserved, release it.
+    """
+    from sglang.srt.utils import kill_process_tree
+
+    kill_process_tree(process.pid)
+    if port is not None:
+        release_port(port)
+
+
 def wait_for_server(base_url: str, timeout: int = None) -> None:
     """Wait for the server to be ready by polling the /v1/models endpoint.

@@ -343,6 +433,7 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
                 NOTE: Typically, the server runs in a separate terminal.
                 In this notebook, we run the server and notebook code together, so their outputs are combined.
                 To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
+                We are running those notebooks in a CI parallel environment, so the throughput is not representative of the actual performance.
                 """
             )
             break

@@ -353,17 +444,6 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
         time.sleep(1)


-def terminate_process(process):
-    from sglang.srt.utils import kill_process_tree
-
-    kill_process_tree(process.pid)
-
-
-def print_highlight(html_content: str):
-    html_content = str(html_content).replace("\n", "<br>")
-    display(HTML(f"<strong style='color: #00008B;'>{html_content}</strong>"))
-
-
 class TypeBasedDispatcher:
     def __init__(self, mapping: List[Tuple[Type, Callable]]):
         self._mapping = mapping
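These helpers let the documentation notebooks run one server per notebook without port collisions: reserve_port picks a free port under an fcntl file lock, and terminate_process now releases it. A hedged end-to-end sketch (the model path is illustrative):

    from sglang.utils import launch_server_cmd, terminate_process, wait_for_server

    server_process, port = launch_server_cmd(
        "python -m sglang.launch_server --model-path Qwen/Qwen2.5-0.5B-Instruct"  # illustrative model
    )
    wait_for_server(f"http://localhost:{port}")
    # ... issue requests against the OpenAI-compatible endpoint ...
    terminate_process(server_process, port)  # also releases the reserved port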
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.2.post4"
+__version__ = "0.4.3.post1"
{sglang-0.4.2.post4.dist-info → sglang-0.4.3.post1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: sglang
-Version: 0.4.2.post4
+Version: 0.4.3.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
         Version 2.0, January 2004

@@ -235,14 +235,15 @@ Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar
+Requires-Dist: xgrammar==0.1.10; extra == "runtime-common"
+Requires-Dist: ninja; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist: sgl-kernel>=0.0.3.
+Requires-Dist: sgl-kernel>=0.0.3.post6; extra == "srt"
 Requires-Dist: torch; extra == "srt"
 Requires-Dist: vllm<=0.7.2,>=0.6.4.post1; extra == "srt"
-Requires-Dist: flashinfer_python>=0.2.
+Requires-Dist: flashinfer_python>=0.2.1.post1; extra == "srt"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"