sglang 0.3.4.post1__py3-none-any.whl → 0.3.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/api.py +1 -1
- sglang/bench_latency.py +3 -3
- sglang/bench_server_latency.py +2 -3
- sglang/bench_serving.py +92 -0
- sglang/global_config.py +9 -3
- sglang/lang/chat_template.py +50 -25
- sglang/lang/interpreter.py +9 -1
- sglang/lang/ir.py +11 -2
- sglang/launch_server.py +1 -1
- sglang/srt/configs/model_config.py +76 -15
- sglang/srt/constrained/__init__.py +18 -0
- sglang/srt/constrained/bnf_cache.py +61 -0
- sglang/srt/constrained/fsm_cache.py +10 -3
- sglang/srt/constrained/grammar.py +190 -0
- sglang/srt/hf_transformers_utils.py +20 -5
- sglang/srt/layers/attention/flashinfer_backend.py +5 -5
- sglang/srt/layers/attention/triton_ops/decode_attention.py +110 -30
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +1 -1
- sglang/srt/layers/fused_moe/fused_moe.py +4 -3
- sglang/srt/layers/fused_moe/layer.py +28 -0
- sglang/srt/layers/logits_processor.py +5 -5
- sglang/srt/layers/quantization/base_config.py +16 -1
- sglang/srt/layers/rotary_embedding.py +15 -48
- sglang/srt/layers/sampler.py +51 -39
- sglang/srt/layers/vocab_parallel_embedding.py +486 -0
- sglang/srt/managers/data_parallel_controller.py +8 -7
- sglang/srt/managers/detokenizer_manager.py +11 -9
- sglang/srt/managers/image_processor.py +4 -3
- sglang/srt/managers/io_struct.py +80 -78
- sglang/srt/managers/schedule_batch.py +46 -52
- sglang/srt/managers/schedule_policy.py +24 -13
- sglang/srt/managers/scheduler.py +145 -82
- sglang/srt/managers/tokenizer_manager.py +236 -334
- sglang/srt/managers/tp_worker.py +5 -5
- sglang/srt/managers/tp_worker_overlap_thread.py +58 -21
- sglang/srt/mem_cache/flush_cache.py +1 -1
- sglang/srt/mem_cache/memory_pool.py +10 -3
- sglang/srt/model_executor/cuda_graph_runner.py +34 -23
- sglang/srt/model_executor/forward_batch_info.py +6 -9
- sglang/srt/model_executor/model_runner.py +10 -19
- sglang/srt/models/baichuan.py +4 -4
- sglang/srt/models/chatglm.py +4 -4
- sglang/srt/models/commandr.py +1 -1
- sglang/srt/models/dbrx.py +5 -5
- sglang/srt/models/deepseek.py +4 -4
- sglang/srt/models/deepseek_v2.py +4 -4
- sglang/srt/models/exaone.py +4 -4
- sglang/srt/models/gemma.py +1 -1
- sglang/srt/models/gemma2.py +1 -1
- sglang/srt/models/gpt2.py +287 -0
- sglang/srt/models/gpt_bigcode.py +1 -1
- sglang/srt/models/grok.py +4 -4
- sglang/srt/models/internlm2.py +4 -4
- sglang/srt/models/llama.py +15 -7
- sglang/srt/models/llama_embedding.py +2 -10
- sglang/srt/models/llama_reward.py +5 -0
- sglang/srt/models/minicpm.py +4 -4
- sglang/srt/models/minicpm3.py +4 -4
- sglang/srt/models/mixtral.py +7 -5
- sglang/srt/models/mixtral_quant.py +4 -4
- sglang/srt/models/mllama.py +5 -5
- sglang/srt/models/olmo.py +4 -4
- sglang/srt/models/olmoe.py +4 -4
- sglang/srt/models/qwen.py +4 -4
- sglang/srt/models/qwen2.py +4 -4
- sglang/srt/models/qwen2_moe.py +4 -4
- sglang/srt/models/qwen2_vl.py +4 -8
- sglang/srt/models/stablelm.py +4 -4
- sglang/srt/models/torch_native_llama.py +4 -4
- sglang/srt/models/xverse.py +4 -4
- sglang/srt/models/xverse_moe.py +4 -4
- sglang/srt/openai_api/adapter.py +52 -66
- sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +6 -3
- sglang/srt/sampling/sampling_batch_info.py +7 -13
- sglang/srt/sampling/sampling_params.py +5 -7
- sglang/srt/server.py +41 -33
- sglang/srt/server_args.py +34 -5
- sglang/srt/utils.py +40 -56
- sglang/test/run_eval.py +2 -0
- sglang/test/runners.py +2 -1
- sglang/test/srt/sampling/penaltylib/utils.py +1 -0
- sglang/test/test_utils.py +151 -6
- sglang/utils.py +62 -1
- sglang/version.py +1 -1
- sglang-0.3.5.dist-info/METADATA +344 -0
- sglang-0.3.5.dist-info/RECORD +152 -0
- {sglang-0.3.4.post1.dist-info → sglang-0.3.5.dist-info}/WHEEL +1 -1
- sglang-0.3.4.post1.dist-info/METADATA +0 -900
- sglang-0.3.4.post1.dist-info/RECORD +0 -148
- {sglang-0.3.4.post1.dist-info → sglang-0.3.5.dist-info}/LICENSE +0 -0
- {sglang-0.3.4.post1.dist-info → sglang-0.3.5.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -63,6 +63,7 @@ class ServerArgs:
     stream_interval: int = 1
     random_seed: Optional[int] = None
     constrained_json_whitespace_pattern: Optional[str] = None
+    decode_log_interval: int = 40

     # Logging
     log_level: str = "info"
@@ -74,6 +75,7 @@ class ServerArgs:
     api_key: Optional[str] = None
     file_storage_pth: str = "SGLang_storage"
     enable_cache_report: bool = False
+    watchdog_timeout: float = 600

     # Data parallelism
     dp_size: int = 1
@@ -102,6 +104,7 @@ class ServerArgs:
     # Kernel backend
     attention_backend: Optional[str] = None
     sampling_backend: Optional[str] = None
+    grammar_backend: Optional[str] = "outlines"

     # Optimization/debug options
     disable_flashinfer: bool = False
@@ -118,7 +121,8 @@ class ServerArgs:
     enable_overlap_schedule: bool = False
     enable_mixed_chunk: bool = False
     enable_torch_compile: bool = False
-
+    torch_compile_max_bs: int = 32
+    cuda_graph_max_bs: int = 160
     torchao_config: str = ""
     enable_p2p_check: bool = False
     triton_attention_reduce_in_fp32: bool = False
@@ -427,6 +431,18 @@ class ServerArgs:
             action="store_true",
             help="Return number of cached tokens in usage.prompt_tokens_details for each openai request.",
         )
+        parser.add_argument(
+            "--watchdog-timeout",
+            type=float,
+            default=ServerArgs.watchdog_timeout,
+            help="Set watchdog timeout in seconds. If a forward batch takes longer than this, the server will crash to prevent hanging.",
+        )
+        parser.add_argument(
+            "--decode-log-interval",
+            type=int,
+            default=ServerArgs.decode_log_interval,
+            help="The log interval of decode batch",
+        )

         # Data parallelism
         parser.add_argument(
@@ -537,6 +553,13 @@ class ServerArgs:
             default=ServerArgs.sampling_backend,
             help="Choose the kernels for sampling layers.",
         )
+        parser.add_argument(
+            "--grammar-backend",
+            type=str,
+            choices=["xgrammar", "outlines"],
+            default=ServerArgs.grammar_backend,
+            help="Choose the backend for constrained decoding.",
+        )

         # Optimization/debug options
         parser.add_argument(
@@ -611,11 +634,17 @@ class ServerArgs:
             help="Optimize the model with torch.compile. Experimental feature.",
         )
         parser.add_argument(
-            "--
+            "--torch-compile-max-bs",
             type=int,
-            default=ServerArgs.
+            default=ServerArgs.torch_compile_max_bs,
             help="Set the maximum batch size when using torch compile.",
         )
+        parser.add_argument(
+            "--cuda-graph-max-bs",
+            type=int,
+            default=ServerArgs.cuda_graph_max_bs,
+            help="Set the maximum batch size for cuda graph.",
+        )
         parser.add_argument(
             "--torchao-config",
             type=str,
@@ -712,11 +741,11 @@ class PortArgs:

     @staticmethod
     def init_new(server_args) -> "PortArgs":
-        port = server_args.port +
+        port = server_args.port + 42
         while True:
             if is_port_available(port):
                 break
-            port +=
+            port += 42

         return PortArgs(
             tokenizer_ipc_name=tempfile.NamedTemporaryFile(delete=False).name,
sglang/srt/utils.py
CHANGED
@@ -35,6 +35,7 @@ import psutil
 import requests
 import torch
 import torch.distributed as dist
+import zmq
 from fastapi.responses import ORJSONResponse
 from packaging import version as pkg_version
 from torch import nn
@@ -203,56 +204,6 @@ def is_port_available(port):
     return False


-def is_multimodal_model(model_architectures):
-    if (
-        "LlavaLlamaForCausalLM" in model_architectures
-        or "LlavaQwenForCausalLM" in model_architectures
-        or "LlavaMistralForCausalLM" in model_architectures
-        or "LlavaVidForCausalLM" in model_architectures
-        or "MllamaForConditionalGeneration" in model_architectures
-        or "Qwen2VLForConditionalGeneration" in model_architectures
-    ):
-        return True
-    else:
-        return False
-
-
-def is_attention_free_model(model_architectures):
-    return False
-
-
-def model_has_inner_state(model_architectures):
-    return False
-
-
-def is_embedding_model(model_architectures):
-    if (
-        "LlamaEmbeddingModel" in model_architectures
-        or "MistralModel" in model_architectures
-        or "LlamaForSequenceClassification" in model_architectures
-        or "LlamaForSequenceClassificationWithNormal_Weights" in model_architectures
-    ):
-        return True
-    else:
-        return False
-
-
-def is_generation_model(model_architectures, is_embedding: bool = False):
-    # We have two ways to determine whether a model is a generative model.
-    # 1. Check the model architectue
-    # 2. check the `is_embedding` server args
-
-    if (
-        "LlamaEmbeddingModel" in model_architectures
-        or "MistralModel" in model_architectures
-        or "LlamaForSequenceClassification" in model_architectures
-        or "LlamaForSequenceClassificationWithNormal_Weights" in model_architectures
-    ):
-        return False
-    else:
-        return not is_embedding
-
-
 def decode_video_base64(video_base64):
     from PIL import Image

@@ -397,17 +348,26 @@ def kill_parent_process():
     """Kill the parent process and all children of the parent process."""
     current_process = psutil.Process()
     parent_process = current_process.parent()
-    kill_child_process(
+    kill_child_process(
+        parent_process.pid, include_self=True, skip_pid=current_process.pid
+    )
+    try:
+        current_process.kill()
+    except psutil.NoSuchProcess:
+        pass


-def kill_child_process(pid,
+def kill_child_process(pid=None, include_self=False, skip_pid=None):
     """Kill the process and all its children process."""
+    if pid is None:
+        pid = os.getpid()
+
     try:
-
+        itself = psutil.Process(pid)
     except psutil.NoSuchProcess:
         return

-    children =
+    children = itself.children(recursive=True)
     for child in children:
         if child.pid == skip_pid:
             continue
@@ -416,9 +376,9 @@ def kill_child_process(pid, including_parent=True, skip_pid=None):
         except psutil.NoSuchProcess:
             pass

-    if
+    if include_self:
         try:
-
+            itself.kill()
         except psutil.NoSuchProcess:
             pass

@@ -720,3 +680,27 @@ def first_rank_print(*args, **kwargs):
         print(*args, **kwargs)
     else:
         pass
+
+
+def get_zmq_socket(context: zmq.Context, socket_type: zmq.SocketType, endpoint: str):
+    mem = psutil.virtual_memory()
+    total_mem = mem.total / 1024**3
+    available_mem = mem.available / 1024**3
+    if total_mem > 32 and available_mem > 16:
+        buf_size = int(0.5 * 1024**3)
+    else:
+        buf_size = -1
+
+    socket = context.socket(socket_type)
+    if socket_type == zmq.PUSH:
+        socket.setsockopt(zmq.SNDHWM, 0)
+        socket.setsockopt(zmq.SNDBUF, buf_size)
+        socket.connect(f"ipc://{endpoint}")
+    elif socket_type == zmq.PULL:
+        socket.setsockopt(zmq.RCVHWM, 0)
+        socket.setsockopt(zmq.RCVBUF, buf_size)
+        socket.bind(f"ipc://{endpoint}")
+    else:
+        raise ValueError(f"Unsupported socket type: {socket_type}")
+
+    return socket
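
A small sketch of how the new get_zmq_socket helper and the reworked kill_child_process signature might be used together. The IPC endpoint path below is made up for illustration; the socket options reflect the code above.

    import os
    import tempfile

    import zmq

    from sglang.srt.utils import get_zmq_socket, kill_child_process

    ctx = zmq.Context()
    ipc_path = os.path.join(tempfile.gettempdir(), "sglang_demo_ipc")  # illustrative endpoint

    receiver = get_zmq_socket(ctx, zmq.PULL, ipc_path)  # binds, RCVHWM set to 0
    sender = get_zmq_socket(ctx, zmq.PUSH, ipc_path)    # connects, SNDHWM set to 0

    sender.send_pyobj({"msg": "hello"})
    print(receiver.recv_pyobj())

    # The old `including_parent` keyword is gone: `include_self` now decides whether the
    # target process itself is killed, and pid=None falls back to the current process.
    kill_child_process(pid=None, include_self=False)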
sglang/test/run_eval.py
CHANGED
@@ -67,6 +67,7 @@ def run_eval(args):
         model=args.model,
         max_tokens=2048,
         base_url=base_url,
+        temperature=getattr(args, "temperature", 0.0),
     )

     # Run eval
@@ -119,6 +120,7 @@ if __name__ == "__main__":
     parser.add_argument("--eval-name", type=str, default="mmlu")
     parser.add_argument("--num-examples", type=int)
     parser.add_argument("--num-threads", type=int, default=512)
+    parser.add_argument("--temperature", type=float, default=0.0)
     args = parser.parse_args()

     run_eval(args)
sglang/test/runners.py
CHANGED
@@ -273,6 +273,7 @@ class SRTRunner:
             disable_cuda_graph=disable_cuda_graph,
             disable_radix_cache=disable_radix_cache,
         )
+        self.tokenizer = get_tokenizer(model_path)

     def forward(
         self,
@@ -366,7 +367,7 @@ class SRTRunner:
                 return ModelOutput(embed_logits=logits)
             else:
                 scores = [x["embedding"][0] for x in response]
-                return ModelOutput(scores=
+                return ModelOutput(scores=scores)

     def __enter__(self):
         return self
sglang/test/test_utils.py
CHANGED
@@ -3,9 +3,11 @@
 import argparse
 import asyncio
 import os
+import random
 import subprocess
 import threading
 import time
+from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from types import SimpleNamespace
 from typing import Callable, List, Optional
@@ -20,6 +22,7 @@ from sglang.global_config import global_config
 from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.srt.utils import kill_child_process
+from sglang.test.run_eval import run_eval
 from sglang.utils import get_exception_traceback

 DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
@@ -400,7 +403,7 @@ def popen_launch_server(
     api_key: Optional[str] = None,
     other_args: tuple = (),
     env: Optional[dict] = None,
-    return_stdout_stderr:
+    return_stdout_stderr: Optional[tuple] = None,
 ):
     _, host, port = base_url.split(":")
     host = host[2:]
@@ -423,8 +426,8 @@ def popen_launch_server(
     if return_stdout_stderr:
         process = subprocess.Popen(
             command,
-            stdout=
-            stderr=
+            stdout=return_stdout_stderr[0],
+            stderr=return_stdout_stderr[1],
             env=env,
             text=True,
         )
@@ -493,7 +496,7 @@ def run_unittest_files(files: List[str], timeout_per_file: float):
             )
             assert ret_code == 0
         except TimeoutError:
-            kill_child_process(process.pid)
+            kill_child_process(process.pid, include_self=True)
             time.sleep(5)
             print(
                 f"\nTimeout after {timeout_per_file} seconds when running {filename}\n",
@@ -561,7 +564,7 @@ def run_bench_serving(
     try:
         res = run_benchmark(args)
     finally:
-        kill_child_process(process.pid)
+        kill_child_process(process.pid, include_self=True)

     assert res["completed"] == num_prompts
     return res
@@ -594,7 +597,7 @@ def run_bench_latency(model, other_args):
         lastline = output.split("\n")[-3]
         output_throughput = float(lastline.split(" ")[-2])
     finally:
-        kill_child_process(process.pid)
+        kill_child_process(process.pid, include_self=True)

     return output_throughput

@@ -631,3 +634,145 @@ def calculate_rouge_l(output_strs_list1, output_strs_list2):
         rouge_l_scores.append(fmeasure)

     return rouge_l_scores
+
+
+STDOUT_FILENAME = "stdout.txt"
+STDERR_FILENAME = "stderr.txt"
+
+
+def read_output(output_lines):
+    """Print the output in real time with another thread."""
+    while not os.path.exists(STDERR_FILENAME):
+        time.sleep(1)
+
+    pt = 0
+    while pt >= 0:
+        if pt > 0 and not os.path.exists(STDERR_FILENAME):
+            break
+        lines = open(STDERR_FILENAME).readlines()
+        for line in lines[pt:]:
+            print(line, end="", flush=True)
+            output_lines.append(line)
+            pt += 1
+        time.sleep(0.1)
+
+
+def run_and_check_memory_leak(
+    workload_func,
+    disable_radix_cache,
+    enable_mixed_chunk,
+    enable_overlap,
+    chunked_prefill_size,
+):
+    other_args = ["--chunked-prefill-size", str(chunked_prefill_size)]
+    if disable_radix_cache:
+        other_args += ["--disable-radix-cache"]
+    if enable_mixed_chunk:
+        other_args += ["--enable-mixed-chunk"]
+    if enable_overlap:
+        other_args += ["--enable-overlap-scheduler"]
+
+    model = DEFAULT_MODEL_NAME_FOR_TEST
+    port = random.randint(4000, 5000)
+    base_url = f"http://127.0.0.1:{port}"
+
+    # Create files and launch the server
+    stdout = open(STDOUT_FILENAME, "w")
+    stderr = open(STDERR_FILENAME, "w")
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_args,
+        return_stdout_stderr=(stdout, stderr),
+    )
+
+    # Launch a thread to stream the output
+    output_lines = []
+    t = threading.Thread(target=read_output, args=(output_lines,))
+    t.start()
+
+    # Run the workload
+    workload_func(base_url, model)
+
+    # Clean up everything
+    kill_child_process(process.pid, include_self=True)
+    kill_child_process(process.pid, include_self=True)
+    stdout.close()
+    stderr.close()
+    if os.path.exists(STDOUT_FILENAME):
+        os.remove(STDOUT_FILENAME)
+    if os.path.exists(STDERR_FILENAME):
+        os.remove(STDERR_FILENAME)
+    t.join()
+
+    # Assert success
+    has_new_server = False
+    has_leak = False
+    for line in output_lines:
+        if "The server is fired" in line:
+            has_new_server = True
+        if "leak" in line:
+            has_leak = True
+
+    assert has_new_server
+    assert not has_leak
+
+
+def run_mmlu_test(
+    disable_radix_cache=False,
+    enable_mixed_chunk=False,
+    enable_overlap=False,
+    chunked_prefill_size=32,
+):
+    def workload_func(base_url, model):
+        # Run the eval
+        args = SimpleNamespace(
+            base_url=base_url,
+            model=model,
+            eval_name="mmlu",
+            num_examples=128,
+            num_threads=128,
+        )
+
+        try:
+            metrics = run_eval(args)
+            print(f"{metrics=}")
+            assert metrics["score"] >= 0.65
+        finally:
+            pass
+
+    run_and_check_memory_leak(workload_func, disable_radix_cache, enable_mixed_chunk, enable_overlap, chunked_prefill_size)
+
+
+def run_mulit_request_test(
+    disable_radix_cache=False,
+    enable_mixed_chunk=False,
+    enable_overlap=False,
+    chunked_prefill_size=32,
+):
+
+    def workload_func(base_url, model):
+        def run_one(_):
+            prompt = """
+            System: You are a helpful assistant.
+            User: What is the capital of France?
+            Assistant: The capital of France is
+            """
+
+            response = requests.post(
+                f"{base_url}/generate",
+                json={
+                    "text": prompt,
+                    "sampling_params": {
+                        "temperature": 0,
+                        "max_new_tokens": 8,
+                    },
+                },
+            )
+            ret = response.json()
+
+        with ThreadPoolExecutor(2) as executor:
+            list(executor.map(run_one, list(range(4))))
+
+    run_and_check_memory_leak(workload_func, disable_radix_cache, enable_mixed_chunk, enable_overlap, chunked_prefill_size)
sglang/utils.py
CHANGED
@@ -1,12 +1,15 @@
 """Common utilities."""

 import base64
+import gc
 import importlib
 import json
 import logging
 import os
 import signal
+import subprocess
 import sys
+import time
 import traceback
 import urllib.request
 from concurrent.futures import ThreadPoolExecutor
@@ -16,6 +19,7 @@ from typing import Optional, Union

 import numpy as np
 import requests
+from IPython.display import HTML, display
 from tqdm import tqdm

 logger = logging.getLogger(__name__)
@@ -151,7 +155,7 @@ def encode_video_base64(video_path: str, num_frames: int = 16):
     frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)

     frames = []
-    for
+    for _ in range(total_frames):
         ret, frame = cap.read()
         if ret:
             frames.append(frame)
@@ -294,3 +298,60 @@ def download_and_cache_file(url: str, filename: Optional[str] = None):
             bar.update(len(chunk))

     return filename
+
+
+def execute_shell_command(command: str) -> subprocess.Popen:
+    """
+    Execute a shell command and return the process handle
+
+    Args:
+        command: Shell command as a string (can include \\ line continuations)
+    Returns:
+        subprocess.Popen: Process handle
+    """
+    # Replace \ newline with space and split
+    command = command.replace("\\\n", " ").replace("\\", " ")
+    parts = command.split()
+
+    return subprocess.Popen(parts, text=True, stderr=subprocess.STDOUT)
+
+
+def wait_for_server(base_url: str, timeout: int = None) -> None:
+    """Wait for the server to be ready by polling the /v1/models endpoint.
+
+    Args:
+        base_url: The base URL of the server
+        timeout: Maximum time to wait in seconds. None means wait forever.
+    """
+    start_time = time.time()
+    while True:
+        try:
+            response = requests.get(
+                f"{base_url}/v1/models",
+                headers={"Authorization": "Bearer None"},
+            )
+            if response.status_code == 200:
+                time.sleep(5)
+                print_highlight(
+                    """\n
+                    NOTE: Typically, the server runs in a separate terminal.
+                    In this notebook, we run the server and notebook code together, so their outputs are combined.
+                    To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
+                    """
+                )
+                break
+
+            if timeout and time.time() - start_time > timeout:
+                raise TimeoutError("Server did not become ready within timeout period")
+        except requests.exceptions.RequestException:
+            time.sleep(1)
+
+
+def terminate_process(process):
+    from sglang.srt.utils import kill_child_process
+
+    kill_child_process(process.pid, include_self=True)
+
+
+def print_highlight(html_content: str):
+    html_content = str(html_content).replace("\n", "<br>")
+    display(HTML(f"<strong style='color: #00008B;'>{html_content}</strong>"))
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.3.4.post1"
+__version__ = "0.3.5"