sglang 0.1.14__py3-none-any.whl → 0.1.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. sglang/__init__.py +57 -2
  2. sglang/api.py +8 -5
  3. sglang/backend/anthropic.py +18 -4
  4. sglang/backend/openai.py +2 -1
  5. sglang/backend/runtime_endpoint.py +18 -5
  6. sglang/backend/vertexai.py +1 -0
  7. sglang/global_config.py +5 -1
  8. sglang/lang/chat_template.py +83 -2
  9. sglang/lang/interpreter.py +92 -35
  10. sglang/lang/ir.py +12 -9
  11. sglang/lang/tracer.py +6 -4
  12. sglang/launch_server_llavavid.py +31 -0
  13. sglang/srt/constrained/fsm_cache.py +1 -0
  14. sglang/srt/constrained/jump_forward.py +1 -0
  15. sglang/srt/conversation.py +2 -2
  16. sglang/srt/flush_cache.py +16 -0
  17. sglang/srt/hf_transformers_utils.py +10 -2
  18. sglang/srt/layers/context_flashattention_nopad.py +1 -0
  19. sglang/srt/layers/extend_attention.py +1 -0
  20. sglang/srt/layers/logits_processor.py +114 -54
  21. sglang/srt/layers/radix_attention.py +2 -1
  22. sglang/srt/layers/token_attention.py +1 -0
  23. sglang/srt/managers/detokenizer_manager.py +5 -1
  24. sglang/srt/managers/io_struct.py +27 -3
  25. sglang/srt/managers/router/infer_batch.py +97 -48
  26. sglang/srt/managers/router/manager.py +11 -8
  27. sglang/srt/managers/router/model_rpc.py +169 -90
  28. sglang/srt/managers/router/model_runner.py +110 -166
  29. sglang/srt/managers/router/radix_cache.py +89 -51
  30. sglang/srt/managers/router/scheduler.py +17 -28
  31. sglang/srt/managers/tokenizer_manager.py +110 -33
  32. sglang/srt/memory_pool.py +5 -14
  33. sglang/srt/model_config.py +11 -0
  34. sglang/srt/models/commandr.py +372 -0
  35. sglang/srt/models/dbrx.py +412 -0
  36. sglang/srt/models/dbrx_config.py +281 -0
  37. sglang/srt/models/gemma.py +24 -25
  38. sglang/srt/models/llama2.py +25 -26
  39. sglang/srt/models/llava.py +8 -10
  40. sglang/srt/models/llavavid.py +307 -0
  41. sglang/srt/models/mixtral.py +29 -33
  42. sglang/srt/models/qwen.py +34 -25
  43. sglang/srt/models/qwen2.py +25 -26
  44. sglang/srt/models/stablelm.py +26 -26
  45. sglang/srt/models/yivl.py +3 -5
  46. sglang/srt/openai_api_adapter.py +356 -0
  47. sglang/srt/{managers/openai_protocol.py → openai_protocol.py} +36 -20
  48. sglang/srt/sampling_params.py +2 -0
  49. sglang/srt/server.py +91 -456
  50. sglang/srt/server_args.py +79 -49
  51. sglang/srt/utils.py +212 -47
  52. sglang/srt/weight_utils.py +417 -0
  53. sglang/test/test_programs.py +8 -7
  54. sglang/test/test_utils.py +195 -7
  55. sglang/utils.py +77 -26
  56. {sglang-0.1.14.dist-info → sglang-0.1.16.dist-info}/METADATA +20 -18
  57. sglang-0.1.16.dist-info/RECORD +72 -0
  58. sglang-0.1.14.dist-info/RECORD +0 -64
  59. {sglang-0.1.14.dist-info → sglang-0.1.16.dist-info}/LICENSE +0 -0
  60. {sglang-0.1.14.dist-info → sglang-0.1.16.dist-info}/WHEEL +0 -0
  61. {sglang-0.1.14.dist-info → sglang-0.1.16.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -1,3 +1,5 @@
+"""The arguments of the server."""
+
 import argparse
 import dataclasses
 from typing import List, Optional, Union
@@ -5,34 +7,47 @@ from typing import List, Optional, Union
 
 @dataclasses.dataclass
 class ServerArgs:
+    # Model and tokenizer
     model_path: str
     tokenizer_path: Optional[str] = None
-    host: str = "127.0.0.1"
-    port: int = 30000
-    additional_ports: Optional[Union[List[int], int]] = None
     load_format: str = "auto"
     tokenizer_mode: str = "auto"
     chat_template: Optional[str] = None
     trust_remote_code: bool = True
+    context_length: Optional[int] = None
+
+    # Port
+    host: str = "127.0.0.1"
+    port: int = 30000
+    additional_ports: Optional[Union[List[int], int]] = None
+
+    # Memory and scheduling
     mem_fraction_static: Optional[float] = None
     max_prefill_num_token: Optional[int] = None
-    context_length: Optional[int] = None
-    tp_size: int = 1
     schedule_heuristic: str = "lpm"
     schedule_conservativeness: float = 1.0
-    attention_reduce_in_fp32: bool = False
-    random_seed: int = 42
+
+    # Other runtime options
+    tp_size: int = 1
     stream_interval: int = 8
+    random_seed: int = 42
+
+    # Logging
+    log_level: str = "info"
+    log_requests: bool = False
     disable_log_stats: bool = False
     log_stats_interval: int = 10
-    log_level: str = "info"
+    show_time_cost: bool = False
 
-    # optional modes
-    disable_radix_cache: bool = False
+    # Other
+    api_key: str = ""
+
+    # Optimization/debug options
     enable_flashinfer: bool = False
+    attention_reduce_in_fp32: bool = False
+    disable_radix_cache: bool = False
     disable_regex_jump_forward: bool = False
     disable_disk_cache: bool = False
-    api_key: str = ""
 
     def __post_init__(self):
         if self.tokenizer_path is None:
@@ -65,15 +80,18 @@ class ServerArgs:
             default=ServerArgs.tokenizer_path,
             help="The path of the tokenizer.",
         )
-        parser.add_argument("--host", type=str, default=ServerArgs.host)
-        parser.add_argument("--port", type=int, default=ServerArgs.port)
-        # we want to be able to pass a list of ports
+        parser.add_argument(
+            "--host", type=str, default=ServerArgs.host, help="The host of the server."
+        )
+        parser.add_argument(
+            "--port", type=int, default=ServerArgs.port, help="The port of the server."
+        )
         parser.add_argument(
             "--additional-ports",
             type=int,
             nargs="*",
             default=[],
-            help="Additional ports specified for launching server.",
+            help="Additional ports specified for the server.",
         )
         parser.add_argument(
             "--load-format",
@@ -111,6 +129,12 @@ class ServerArgs:
             action="store_true",
             help="Whether or not to allow for custom models defined on the Hub in their own modeling files.",
         )
+        parser.add_argument(
+            "--context-length",
+            type=int,
+            default=ServerArgs.context_length,
+            help="The model's maximum context length. Defaults to None (will use the value from the model's config.json instead).",
+        )
         parser.add_argument(
             "--mem-fraction-static",
             type=float,
@@ -123,23 +147,12 @@ class ServerArgs:
             default=ServerArgs.max_prefill_num_token,
             help="The maximum number of tokens in a prefill batch. The real bound will be the maximum of this value and the model's maximum context length.",
        )
-        parser.add_argument(
-            "--context-length",
-            type=int,
-            default=ServerArgs.context_length,
-            help="The model's maximum context length. Use this to reduce the context length to save memory. Defaults to None (will use the value from the model's config.json instead).",
-        )
-        parser.add_argument(
-            "--tp-size",
-            type=int,
-            default=ServerArgs.tp_size,
-            help="Tensor parallelism degree.",
-        )
         parser.add_argument(
             "--schedule-heuristic",
             type=str,
             default=ServerArgs.schedule_heuristic,
-            help="Schudule mode: [lpm, weight, random, fcfs]",
+            choices=["lpm", "random", "fcfs", "dfs-weight"],
+            help="Scheduling Heuristic.",
         )
         parser.add_argument(
             "--schedule-conservativeness",
@@ -148,15 +161,10 @@ class ServerArgs:
             help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
         )
         parser.add_argument(
-            "--random-seed",
+            "--tp-size",
             type=int,
-            default=ServerArgs.random_seed,
-            help="Random seed.",
-        )
-        parser.add_argument(
-            "--attention-reduce-in-fp32",
-            action="store_true",
-            help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16.",
+            default=ServerArgs.tp_size,
+            help="Tensor parallelism size.",
         )
         parser.add_argument(
             "--stream-interval",
@@ -164,11 +172,22 @@ class ServerArgs:
             default=ServerArgs.stream_interval,
             help="The interval (or buffer size) for streaming in terms of the token length. A smaller value makes streaming smoother, while a larger value makes the throughput higher",
         )
+        parser.add_argument(
+            "--random-seed",
+            type=int,
+            default=ServerArgs.random_seed,
+            help="Random seed.",
+        )
         parser.add_argument(
             "--log-level",
             type=str,
             default=ServerArgs.log_level,
-            help="Log level",
+            help="Logging level",
+        )
+        parser.add_argument(
+            "--log-requests",
+            action="store_true",
+            help="Log all requests",
         )
         parser.add_argument(
             "--disable-log-stats",
@@ -181,17 +200,34 @@ class ServerArgs:
             default=ServerArgs.log_stats_interval,
             help="Log stats interval in second.",
         )
-        # optional modes
         parser.add_argument(
-            "--disable-radix-cache",
+            "--show-time-cost",
             action="store_true",
-            help="Disable RadixAttention",
+            help="Show time cost of custom marks",
+        )
+        parser.add_argument(
+            "--api-key",
+            type=str,
+            default=ServerArgs.api_key,
+            help="Set API key of the server",
         )
+
+        # Optimization/debug options
         parser.add_argument(
             "--enable-flashinfer",
             action="store_true",
             help="Enable flashinfer inference kernels",
         )
+        parser.add_argument(
+            "--attention-reduce-in-fp32",
+            action="store_true",
+            help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16.",
+        )
+        parser.add_argument(
+            "--disable-radix-cache",
+            action="store_true",
+            help="Disable RadixAttention",
+        )
         parser.add_argument(
             "--disable-regex-jump-forward",
             action="store_true",
@@ -202,12 +238,6 @@ class ServerArgs:
             action="store_true",
             help="Disable disk cache to avoid possible crashes related to file system or high concurrency.",
         )
-        parser.add_argument(
-            "--api-key",
-            type=str,
-            default=ServerArgs.api_key,
-            help="Set API Key",
-        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -217,13 +247,13 @@ class ServerArgs:
     def url(self):
         return f"http://{self.host}:{self.port}"
 
-    def get_optional_modes_logging(self):
+    def print_mode_args(self):
         return (
-            f"disable_radix_cache={self.disable_radix_cache}, "
             f"enable_flashinfer={self.enable_flashinfer}, "
+            f"attention_reduce_in_fp32={self.attention_reduce_in_fp32}, "
+            f"disable_radix_cache={self.disable_radix_cache}, "
             f"disable_regex_jump_forward={self.disable_regex_jump_forward}, "
            f"disable_disk_cache={self.disable_disk_cache}, "
-            f"attention_reduce_in_fp32={self.attention_reduce_in_fp32}"
         )
sglang/srt/utils.py CHANGED
@@ -1,58 +1,74 @@
+"""Common utilities."""
+
 import base64
 import os
 import random
 import socket
-import sys
 import time
-import traceback
+from importlib.metadata import PackageNotFoundError, version
 from io import BytesIO
 from typing import List, Optional
 
 import numpy as np
+import pydantic
 import requests
 import torch
-import torch.distributed as dist
+from fastapi.responses import JSONResponse
+from packaging import version as pkg_version
+from pydantic import BaseModel
+from starlette.middleware.base import BaseHTTPMiddleware
 
-is_show_cost_time = False
+from sglang.utils import get_exception_traceback
 
+show_time_cost = False
+time_infos = {}
 
-def mark_cost_time(func_name):
-    def inner_func(func):
-        def time_func(*args, **kwargs):
-            if dist.get_rank() in [0, 1] and is_show_cost_time:
-                torch.cuda.synchronize()
-                start_time = time.time()
-                ans = func(*args, **kwargs)
-                torch.cuda.synchronize()
-                print(func_name, "cost time:", (time.time() - start_time) * 1000)
-                return ans
-            else:
-                torch.cuda.synchronize()
-                ans = func(*args, **kwargs)
-                torch.cuda.synchronize()
-                return ans
 
-        return time_func
+def enable_show_time_cost():
+    global show_time_cost
+    show_time_cost = True
 
-    return inner_func
 
+class TimeInfo:
+    def __init__(self, name, interval=0.1, color=0, indent=0):
+        self.name = name
+        self.interval = interval
+        self.color = color
+        self.indent = indent
+
+        self.acc_time = 0
+        self.last_acc_time = 0
+
+    def check(self):
+        if self.acc_time - self.last_acc_time > self.interval:
+            self.last_acc_time = self.acc_time
+            return True
+        return False
 
-time_mark = {}
+    def pretty_print(self):
+        print(f"\x1b[{self.color}m", end="")
+        print("-" * self.indent * 2, end="")
+        print(f"{self.name}: {self.acc_time:.3f}s\x1b[0m")
 
 
-def mark_start(key):
+def mark_start(name, interval=0.1, color=0, indent=0):
+    global time_infos, show_time_cost
+    if not show_time_cost:
+        return
     torch.cuda.synchronize()
-    global time_mark
-    time_mark[key] = time.time()
-    return
+    if time_infos.get(name, None) is None:
+        time_infos[name] = TimeInfo(name, interval, color, indent)
+    time_infos[name].acc_time -= time.time()
 
 
-def mark_end(key, print_min_cost=0.0):
+def mark_end(name):
+    global time_infos, show_time_cost
+    if not show_time_cost:
+        return
     torch.cuda.synchronize()
-    global time_mark
-    cost_time = (time.time() - time_mark[key]) * 1000
-    if cost_time > print_min_cost:
-        print(f"cost {key}:", cost_time)
+    time_infos[name].acc_time += time.time()
+    if time_infos[name].check():
+        time_infos[name].pretty_print()
 
 
 def calculate_time(show=False, min_cost_ms=0.0):
@@ -74,6 +90,32 @@ def calculate_time(show=False, min_cost_ms=0.0):
     return wrapper
 
 
+def get_available_gpu_memory(gpu_id, distributed=True):
+    """
+    Get available memory for cuda:gpu_id device.
+    When distributed is True, the available memory is the minimum available memory of all GPUs.
+    """
+    num_gpus = torch.cuda.device_count()
+    assert gpu_id < num_gpus
+
+    if torch.cuda.current_device() != gpu_id:
+        print(
+            f"WARNING: current device is not {gpu_id}, but {torch.cuda.current_device()}, ",
+            "which may cause useless memory allocation for torch CUDA context.",
+        )
+
+    free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)
+
+    if distributed:
+        tensor = torch.tensor(free_gpu_memory, dtype=torch.float32).to(
+            torch.device("cuda", gpu_id)
+        )
+        torch.distributed.all_reduce(tensor, op=torch.distributed.ReduceOp.MIN)
+        free_gpu_memory = tensor.item()
+
+    return free_gpu_memory / (1 << 30)
+
+
 def set_random_seed(seed: int) -> None:
     random.seed(seed)
 
@@ -89,11 +131,13 @@ def alloc_usable_network_port(num, used_list=()):
             continue
 
         with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
             try:
                 s.bind(("", port))
+                s.listen(1)  # Attempt to listen on the port
                 port_list.append(port)
             except socket.error:
-                pass
+                pass  # If any error occurs, this port is not usable
 
         if len(port_list) == num:
             return port_list
@@ -110,7 +154,7 @@ def check_port(port):
         return False
 
 
-def handle_port_init(
+def allocate_init_ports(
     port: Optional[int] = None,
     additional_ports: Optional[List[int]] = None,
     tp_size: int = 1,
@@ -142,15 +186,7 @@
     return port, additional_ports
 
 
-def get_exception_traceback():
-    etype, value, tb = sys.exc_info()
-    err_str = "".join(traceback.format_exception(etype, value, tb))
-    return err_str
-
-
 def get_int_token_logit_bias(tokenizer, vocab_size):
-    from transformers import LlamaTokenizer, LlamaTokenizerFast
-
     # a bug when model's vocab size > tokenizer.vocab_size
     vocab_size = tokenizer.vocab_size
     logit_bias = np.zeros(vocab_size, dtype=np.float32)
@@ -231,20 +267,102 @@ def wrap_kernel_launcher(kernel):
 
 
 def is_multimodal_model(model):
-    if isinstance(model, str):
-        return "llava" in model or "yi-vl" in model
     from sglang.srt.model_config import ModelConfig
 
+    if isinstance(model, str):
+        model = model.lower()
+        return "llava" in model or "yi-vl" in model or "llava-next" in model
+
     if isinstance(model, ModelConfig):
         model_path = model.path.lower()
-        return "llava" in model_path or "yi-vl" in model_path
-    raise Exception("unrecognized type")
+        return "llava" in model_path or "yi-vl" in model_path or "llava-next" in model_path
+
+    raise ValueError("unrecognized type")
+
+
+def decode_video_base64(video_base64):
+    from PIL import Image
+
+    # Decode the base64 string
+    video_bytes = base64.b64decode(video_base64)
+
+    # Placeholder for the start indices of each PNG image
+    img_starts = []
+
+    frame_format = "PNG"  # str(os.getenv('FRAME_FORMAT', "JPEG"))
+
+    assert frame_format in [
+        "PNG",
+        "JPEG",
+    ], "FRAME_FORMAT must be either 'PNG' or 'JPEG'"
+
+    if frame_format == "PNG":
+        # Find each PNG start signature to isolate images
+        i = 0
+        while i < len(video_bytes) - 7:  # Adjusted for the length of the PNG signature
+            # Check if we found the start of a PNG file
+            if (
+                video_bytes[i] == 0x89
+                and video_bytes[i + 1] == 0x50
+                and video_bytes[i + 2] == 0x4E
+                and video_bytes[i + 3] == 0x47
+                and video_bytes[i + 4] == 0x0D
+                and video_bytes[i + 5] == 0x0A
+                and video_bytes[i + 6] == 0x1A
+                and video_bytes[i + 7] == 0x0A
+            ):
+                img_starts.append(i)
+                i += 8  # Skip the PNG signature
+            else:
+                i += 1
+    else:
+        # Find each JPEG start (0xFFD8) to isolate images
+        i = 0
+        while (
+            i < len(video_bytes) - 1
+        ):  # Adjusted for the length of the JPEG SOI signature
+            # Check if we found the start of a JPEG file
+            if video_bytes[i] == 0xFF and video_bytes[i + 1] == 0xD8:
+                img_starts.append(i)
+                # Move to the next byte to continue searching for the next image start
+                i += 2
+            else:
+                i += 1
+
+    frames = []
+    for start_idx in img_starts:
+        # Assuming each image is back-to-back, the end of one image is the start of another
+        # The last image goes until the end of the byte string
+        end_idx = (
+            img_starts[img_starts.index(start_idx) + 1]
+            if img_starts.index(start_idx) + 1 < len(img_starts)
+            else len(video_bytes)
+        )
+        img_bytes = video_bytes[start_idx:end_idx]
+
+        # Convert bytes to a PIL Image
+        img = Image.open(BytesIO(img_bytes))
+
+        # Convert PIL Image to a NumPy array
+        frame = np.array(img)
+
+        # Append the frame to the list of frames
+        frames.append(frame)
+
+    # Ensure there's at least one frame to avoid errors with np.stack
+    if frames:
+        return np.stack(frames, axis=0), img.size
+    else:
+        return np.array([]), (
+            0,
+            0,
+        )  # Return an empty array and size tuple if no frames were found
 
 
 def load_image(image_file):
     from PIL import Image
 
-    image = None
+    image = image_size = None
 
     if image_file.startswith("http://") or image_file.startswith("https://"):
         timeout = int(os.getenv("REQUEST_TIMEOUT", "3"))
@@ -255,7 +373,54 @@ def load_image(image_file):
     elif image_file.startswith("data:"):
         image_file = image_file.split(",")[1]
         image = Image.open(BytesIO(base64.b64decode(image_file)))
+    elif image_file.startswith("video:"):
+        image_file = image_file.replace("video:", "")
+        image, image_size = decode_video_base64(image_file)
     else:
         image = Image.open(BytesIO(base64.b64decode(image_file)))
 
-    return image
+    return image, image_size
+
+
+def assert_pkg_version(pkg: str, min_version: str):
+    try:
+        installed_version = version(pkg)
+        if pkg_version.parse(installed_version) < pkg_version.parse(min_version):
+            raise Exception(
+                f"{pkg} is installed with version {installed_version} which "
+                f"is less than the minimum required version {min_version}"
+            )
+    except PackageNotFoundError:
+        raise Exception(
+            f"{pkg} with minimum required version {min_version} is not installed"
+        )
+
+
+API_KEY_HEADER_NAME = "X-API-Key"
+
+
+class APIKeyValidatorMiddleware(BaseHTTPMiddleware):
+    def __init__(self, app, api_key: str):
+        super().__init__(app)
+        self.api_key = api_key
+
+    async def dispatch(self, request, call_next):
+        # extract API key from the request headers
+        api_key_header = request.headers.get(API_KEY_HEADER_NAME)
+        if not api_key_header or api_key_header != self.api_key:
+            return JSONResponse(
+                status_code=403,
+                content={"detail": "Invalid API Key"},
+            )
+        response = await call_next(request)
+        return response
+
+
+# FIXME: Remove this once we drop support for pydantic 1.x
+IS_PYDANTIC_1 = int(pydantic.VERSION.split(".")[0]) == 1
+
+
+def jsonify_pydantic_model(obj: BaseModel):
+    if IS_PYDANTIC_1:
+        return obj.json(ensure_ascii=False)
+    return obj.model_dump_json()
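
Taken together, the removed mark_cost_time/time_mark machinery is replaced by opt-in, accumulating marks gated on enable_show_time_cost (wired to the new --show-time-cost server flag). A minimal sketch of the intended usage; the loop body is an illustrative placeholder, and a CUDA device is required because both marks call torch.cuda.synchronize():

# Sketch: accumulate the time spent in a labeled region across iterations.
# The workload below is a placeholder, not taken from this diff.
import torch

from sglang.srt.utils import enable_show_time_cost, mark_end, mark_start

enable_show_time_cost()  # same effect as launching with --show-time-cost

x = torch.randn(1024, 1024, device="cuda")
for _ in range(100):
    mark_start("matmul", interval=0.1, color=32, indent=1)  # opens/resumes the timer
    x = (x @ x.T).clamp(-1, 1)
    mark_end("matmul")  # closes it; prints whenever another 0.1s has accumulated

The design trades the old per-call millisecond printout for an accumulator: mark_start subtracts time.time() from acc_time and mark_end adds it back, so only complete start/end pairs contribute, while TimeInfo.check throttles output to once per interval seconds of accumulated time. Separately, note that clients of a server launched with --api-key must now send the key in the X-API-Key header (API_KEY_HEADER_NAME); any other request is rejected with a 403 by APIKeyValidatorMiddleware.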