PyPI - sglang - Versions diffs - 0.4.4.post2__tar.gz → 0.4.4.post4__tar.gz - Mend

sglang 0.4.4.post2__tar.gz → 0.4.4.post4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (587) hide show

{sglang-0.4.4.post2/sglang.egg-info → sglang-0.4.4.post4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.4.post2
+Version: 0.4.4.post4
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License:                                  Apache License
                                    Version 2.0, January 2004
@@ -218,6 +218,7 @@ Requires-Dist: numpy
 Requires-Dist: IPython
 Requires-Dist: setproctitle
 Provides-Extra: runtime-common
+Requires-Dist: compressed-tensors; extra == "runtime-common"
 Requires-Dist: datasets; extra == "runtime-common"
 Requires-Dist: decord; extra == "runtime-common"
 Requires-Dist: fastapi; extra == "runtime-common"
@@ -233,21 +234,25 @@ Requires-Dist: pillow; extra == "runtime-common"
 Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
 Requires-Dist: psutil; extra == "runtime-common"
 Requires-Dist: pydantic; extra == "runtime-common"
+Requires-Dist: pynvml; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
 Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
-Requires-Dist: transformers==4.50.0; extra == "runtime-common"
+Requires-Dist: transformers==4.51.0; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar==0.1.16; extra == "runtime-common"
+Requires-Dist: compressed-tensors; extra == "runtime-common"
+Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.0.5.post3; extra == "srt"
+Requires-Dist: sgl-kernel==0.0.8; extra == "srt"
 Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
 Requires-Dist: torch==2.5.1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
+Requires-Dist: partial_json_parser; extra == "srt"
+Requires-Dist: einops; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
@@ -271,7 +276,7 @@ Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
 Provides-Extra: litellm
 Requires-Dist: litellm>=1.0.0; extra == "litellm"
 Provides-Extra: torch-memory-saver
-Requires-Dist: torch_memory_saver>=0.0.3; extra == "torch-memory-saver"
+Requires-Dist: torch_memory_saver>=0.0.4; extra == "torch-memory-saver"
 Provides-Extra: test
 Requires-Dist: jsonlines; extra == "test"
 Requires-Dist: matplotlib; extra == "test"

{sglang-0.4.4.post2 → sglang-0.4.4.post4}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "sglang"
-version = "0.4.4.post2"
+version = "0.4.4.post4"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -17,6 +17,7 @@ dependencies = ["aiohttp", "requests", "tqdm", "numpy", "IPython", "setproctitle
 [project.optional-dependencies]
 runtime_common = [
+    "compressed-tensors",
     "datasets",
     "decord",
     "fastapi",
@@ -32,28 +33,37 @@ runtime_common = [
     "prometheus-client>=0.20.0",
     "psutil",
     "pydantic",
+    "pynvml",
     "python-multipart",
     "pyzmq>=25.1.2",
     "soundfile==0.13.1",
     "torchao>=0.7.0",
-    "transformers==4.50.0",
+    "transformers==4.51.0",
     "uvicorn",
     "uvloop",
-    "xgrammar==0.1.16",
+    "compressed-tensors",
+    "xgrammar==0.1.17",
 ]
 srt = [
     "sglang[runtime_common]",
-    "sgl-kernel==0.0.5.post3",
+    "sgl-kernel==0.0.8",
     "flashinfer_python==0.2.3",
     "torch==2.5.1",
     "cuda-python",
     "outlines>=0.0.44,<=0.1.11",
+    "partial_json_parser",
+    "einops",
 ]
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
 # => base docker rocm/vllm-dev:20250114, not from public vllm whl
-srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.7.dev2", "outlines==0.1.11"]
+srt_hip = [
+    "sglang[runtime_common]",
+    "torch",
+    "vllm==0.6.7.dev2",
+    "outlines==0.1.11"
+]
 # xpu is not enabled in public vllm and torch whl,
 # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.htmlinstall vllm
@@ -71,7 +81,7 @@ srt_cpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11", "torch"]
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
-torch_memory_saver = ["torch_memory_saver>=0.0.3"]
+torch_memory_saver = ["torch_memory_saver>=0.0.4"]
 test = [
     "jsonlines",
     "matplotlib",

{sglang-0.4.4.post2 → sglang-0.4.4.post4}/sglang/bench_serving.py RENAMED Viewed

@@ -44,6 +44,12 @@ ASSISTANT_SUFFIX = "Assistant:"
 global args
+# don't want to import sglang package here
+def _get_bool_env_var(name: str, default: str = "false") -> bool:
+    value = os.getenv(name, default)
+    return value.lower() in ("true", "1")
 @dataclass
 class RequestFuncInput:
     prompt: str
@@ -965,10 +971,11 @@ async def benchmark(
     request_rate: float,
     max_concurrency: Optional[int],
     disable_tqdm: bool,
-    lora_name: str,
+    lora_names: List[str],
     extra_request_body: Dict[str, Any],
     profile: bool,
     pd_seperated: bool = False,
+    flush_cache: bool = False,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -986,8 +993,16 @@ async def benchmark(
             return await request_func(request_func_input=request_func_input, pbar=pbar)
     # Warmup
-    print("Starting initial single prompt test run...")
+    print(f"Starting warmup with {args.warmup_requests} sequences...")
+    # Use the first request for all warmup iterations
     test_prompt, test_prompt_len, test_output_len = input_requests[0]
+    if lora_names != None and len(lora_names) != 0:
+        lora_name = lora_names[0]
+    else:
+        lora_name = None
+    # Create the test input once
     test_input = RequestFuncInput(
         model=model_id,
         prompt=test_prompt,
@@ -997,17 +1012,29 @@ async def benchmark(
         lora_name=lora_name,
         extra_request_body=extra_request_body,
     )
-    test_output = await request_func(request_func_input=test_input)
-    if not test_output.success:
+    # Run warmup requests
+    warmup_tasks = []
+    for _ in range(args.warmup_requests):
+        warmup_tasks.append(
+            asyncio.create_task(request_func(request_func_input=test_input))
+        )
+    warmup_outputs = await asyncio.gather(*warmup_tasks)
+    # Check if at least one warmup request succeeded
+    if not any(output.success for output in warmup_outputs):
         raise ValueError(
-            "Initial test run failed - Please make sure benchmark arguments "
-            f"are correctly specified. Error: {test_output.error}"
+            "Warmup failed - Please make sure benchmark arguments "
+            f"are correctly specified. Error: {warmup_outputs[0].error}"
         )
     else:
-        print("Initial test run completed. Starting main benchmark run...")
+        print(
+            f"Warmup completed with {args.warmup_requests} sequences. Starting main benchmark run..."
+        )
     # Flush cache
-    if "sglang" in backend:
+    if ("sglang" in backend and _get_bool_env_var("SGLANG_IS_IN_CI")) or flush_cache:
         requests.post(base_url + "/flush_cache", headers=get_auth_headers())
     time.sleep(1.0)
@@ -1028,6 +1055,12 @@ async def benchmark(
     tasks: List[asyncio.Task] = []
     async for request in get_request(input_requests, request_rate):
         prompt, prompt_len, output_len = request
+        if lora_names != None and len(lora_names) != 0:
+            idx = random.randint(0, len(lora_names) - 1)
+            lora_name = lora_names[idx]
+        else:
+            lora_name = None
         request_func_input = RequestFuncInput(
             model=model_id,
             prompt=prompt,
@@ -1235,6 +1268,10 @@ def run_benchmark(args_: argparse.Namespace):
     if not hasattr(args, "max_concurrency"):
         args.max_concurrency = None
+    # Set default value for warmup_requests if not present
+    if not hasattr(args, "warmup_requests"):
+        args.warmup_requests = 1
     print(f"benchmark_args={args}")
     # Set global environments
@@ -1336,6 +1373,10 @@ def run_benchmark(args_: argparse.Namespace):
     tokenizer = get_tokenizer(tokenizer_id)
     input_requests = get_dataset(args, tokenizer)
+    # compatible with SimpleNamespace
+    if not hasattr(args, "flush_cache"):
+        args.flush_cache = False
     return asyncio.run(
         benchmark(
             backend=backend,
@@ -1347,10 +1388,11 @@ def run_benchmark(args_: argparse.Namespace):
             request_rate=args.request_rate,
             max_concurrency=args.max_concurrency,
             disable_tqdm=args.disable_tqdm,
-            lora_name=args.lora_name,
+            lora_names=args.lora_name,
             extra_request_body=extra_request_body,
             profile=args.profile,
             pd_seperated=args.pd_seperated,
+            flush_cache=args.flush_cache,
         )
     )
@@ -1366,6 +1408,13 @@ def set_ulimit(target_soft_limit=65535):
             print(f"Fail to set RLIMIT_NOFILE: {e}")
+class LoRAPathAction(argparse.Action):
+    def __call__(self, parser, namespace, values, option_string=None):
+        setattr(namespace, self.dest, [])
+        for lora_name in values:
+            getattr(namespace, self.dest).append(lora_name)
 if __name__ == "__main__":
     parser = ArgumentParser(description="Benchmark the online serving throughput.")
     parser.add_argument(
@@ -1509,8 +1558,10 @@ if __name__ == "__main__":
     parser.add_argument(
         "--lora-name",
         type=str,
+        nargs="*",
         default=None,
-        help="The name of LoRA adapter",
+        action=LoRAPathAction,
+        help="The names of LoRA adapters. You can provide a list of names in the format {name} {name} {name}...",
     )
     parser.add_argument(
         "--prompt-suffix",
@@ -1523,6 +1574,17 @@ if __name__ == "__main__":
         action="store_true",
         help="Benchmark PD disaggregation server",
     )
+    parser.add_argument(
+        "--flush-cache",
+        action="store_true",
+        help="Flush the cache before running the benchmark",
+    )
+    parser.add_argument(
+        "--warmup-requests",
+        type=int,
+        default=1,
+        help="Number of warmup requests to run before the benchmark",
+    )
     group = parser.add_argument_group("generated-shared-prefix dataset arguments")
     group.add_argument(

sglang-0.4.4.post4/sglang/srt/_custom_ops.py ADDED Viewed

@@ -0,0 +1,117 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/_custom_ops.py
+import logging
+import os
+from typing import List, Tuple
+import torch
+import torch.library
+from sglang.srt.utils import get_bool_env_var, is_hip, is_hpu
+logger = logging.getLogger(__name__)
+use_vllm_custom_allreduce = get_bool_env_var(
+    "USE_VLLM_CUSTOM_ALLREDUCE", default="false"
+)
+if not is_hpu():
+    # ROCm does not use vllm custom allreduce
+    if use_vllm_custom_allreduce and not is_hip():
+        try:
+            import vllm._C
+        except ImportError as e:
+            logger.warning("Failed to import from vllm._C with %r", e)
+    else:
+        try:
+            import sgl_kernel
+        except ImportError as e:
+            logger.warning("Failed to import from custom_ar with %r", e)
+if not is_hip():
+    if use_vllm_custom_allreduce:
+        custom_op = torch.ops._C_custom_ar
+    else:
+        custom_op = sgl_kernel.allreduce
+    # custom allreduce
+    def init_custom_ar(
+        ipc_tensors: List[torch.Tensor],
+        rank_data: torch.Tensor,
+        rank: int,
+        full_nvlink: bool,
+    ) -> int:
+        return custom_op.init_custom_ar(ipc_tensors, rank_data, rank, full_nvlink)
+    def all_reduce(
+        fa: int,
+        inp: torch.Tensor,
+        out: torch.Tensor,
+        reg_buffer: int,
+        reg_buffer_sz_bytes: int,
+    ) -> None:
+        custom_op.all_reduce(fa, inp, out, reg_buffer, reg_buffer_sz_bytes)
+    def dispose(fa: int) -> None:
+        custom_op.dispose(fa)
+    def meta_size() -> int:
+        return custom_op.meta_size()
+    def register_buffer(fa: int, ipc_tensors: List[int]) -> None:
+        return custom_op.register_buffer(fa, ipc_tensors)
+    def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[int], List[int]]:
+        return custom_op.get_graph_buffer_ipc_meta(fa)
+    def register_graph_buffers(
+        fa: int, handles: List[List[int]], offsets: List[List[int]]
+    ) -> None:
+        custom_op.register_graph_buffers(fa, handles, offsets)
+else:
+    # ROCM custom allreduce
+    def init_custom_ar(
+        meta: torch.Tensor,
+        rank_data: torch.Tensor,
+        handles: List[str],
+        offsets: List[int],
+        rank: int,
+        full_nvlink: bool,
+    ) -> int:
+        return sgl_kernel.allreduce.init_custom_ar(
+            meta, rank_data, handles, offsets, rank, full_nvlink
+        )
+    def all_reduce_reg(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
+        sgl_kernel.allreduce.all_reduce_reg(fa, inp, out)
+    def all_reduce_unreg(
+        fa: int, inp: torch.Tensor, reg_buffer: torch.Tensor, out: torch.Tensor
+    ) -> None:
+        sgl_kernel.allreduce.all_reduce_unreg(fa, inp, reg_buffer, out)
+    def dispose(fa: int) -> None:
+        sgl_kernel.allreduce.dispose(fa)
+    def meta_size() -> int:
+        return sgl_kernel.allreduce.meta_size()
+    def register_buffer(
+        fa: int, t: torch.Tensor, handles: List[str], offsets: List[int]
+    ) -> None:
+        return sgl_kernel.allreduce.register_buffer(fa, t, handles, offsets)
+    def get_graph_buffer_ipc_meta(fa: int) -> Tuple[torch.Tensor, List[int]]:
+        return sgl_kernel.allreduce.get_graph_buffer_ipc_meta(fa)
+    def register_graph_buffers(
+        fa: int, handles: List[str], offsets: List[List[int]]
+    ) -> None:
+        sgl_kernel.allreduce.register_graph_buffers(fa, handles, offsets)
+    def allocate_meta_buffer(size: int) -> torch.Tensor:
+        return sgl_kernel.allreduce.allocate_meta_buffer(size)
+    def get_meta_buffer_ipc_handle(inp: torch.Tensor) -> torch.Tensor:
+        return sgl_kernel.allreduce.get_meta_buffer_ipc_handle(inp)

{sglang-0.4.4.post2 → sglang-0.4.4.post4}/sglang/srt/configs/deepseekvl2.py RENAMED Viewed

@@ -4,7 +4,6 @@ from dataclasses import dataclass
 from typing import Dict, List, Optional, Tuple
 import torch
-import torchvision.transforms as T
 from PIL import Image, ImageOps
 from transformers import (
     AutoProcessor,
@@ -76,6 +75,16 @@ class ImageTransform(object):
         self.std = std
         self.normalize = normalize
+        # only load torchvision.transforms when needed
+        try:
+            import torchvision.transforms as T
+            # FIXME: add version check for gguf
+        except ImportError as err:
+            raise ImportError(
+                "Please install torchvision via `pip install torchvision` to use Deepseek-VL2."
+            ) from err
         transform_pipelines = [T.ToTensor()]
         if normalize:

{sglang-0.4.4.post2 → sglang-0.4.4.post4}/sglang/srt/configs/model_config.py RENAMED Viewed

@@ -22,11 +22,7 @@ import torch
 from transformers import PretrainedConfig
 from sglang.srt.hf_transformers_utils import get_config, get_context_length
-from sglang.srt.layers.quantization import (
-    BASE_QUANTIZATION_METHODS,
-    QUANTIZATION_METHODS,
-    VLLM_AVAILABLE,
-)
+from sglang.srt.layers.quantization import QUANTIZATION_METHODS
 from sglang.srt.utils import get_bool_env_var, is_hip
 logger = logging.getLogger(__name__)
@@ -239,12 +235,7 @@ class ModelConfig:
     # adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
     def _verify_quantization(self) -> None:
-        # Select supported quantization methods based on vllm availability
-        if VLLM_AVAILABLE:
-            supported_quantization = [*QUANTIZATION_METHODS]
-        else:
-            supported_quantization = [*BASE_QUANTIZATION_METHODS]
+        supported_quantization = [*QUANTIZATION_METHODS]
         rocm_supported_quantization = [
             "awq",
             "gptq",
@@ -267,6 +258,7 @@ class ModelConfig:
             "experts_int8",
             "w8a8_int8",
             "w8a8_fp8",
+            "moe_wna16",
         ]
         compatible_quantization_methods = {
             "w8a8_int8": ["compressed-tensors", "compressed_tensors"],
@@ -282,11 +274,7 @@ class ModelConfig:
             quant_method = quant_cfg.get("quant_method", "").lower()
             # Detect which checkpoint is it
-            # Only iterate through currently available quantization methods
-            available_methods = (
-                QUANTIZATION_METHODS if VLLM_AVAILABLE else BASE_QUANTIZATION_METHODS
-            )
-            for _, method in available_methods.items():
+            for _, method in QUANTIZATION_METHODS.items():
                 quantization_override = method.override_quantization_method(
                     quant_cfg, self.quantization
                 )
@@ -467,6 +455,7 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal
         or "InternLM2ForRewardModel" in model_architectures
         or "Qwen2ForRewardModel" in model_architectures
         or "Qwen2ForSequenceClassification" in model_architectures
+        or "CLIPModel" in model_architectures
     ):
         return False
     else:
@@ -488,6 +477,7 @@ multimodal_model_archs = [
     "MllamaForConditionalGeneration",
     "Qwen2VLForConditionalGeneration",
     "Qwen2_5_VLForConditionalGeneration",
+    "CLIPModel",
 ]

{sglang-0.4.4.post2 → sglang-0.4.4.post4}/sglang/srt/constrained/base_grammar_backend.py RENAMED Viewed

@@ -169,7 +169,9 @@ class BaseGrammarBackend(ABC):
             self.cache.clear()
-def create_grammar_backend(server_args: ServerArgs, tokenizer, vocab_size):
+def create_grammar_backend(
+    server_args: ServerArgs, tokenizer, vocab_size: int
+) -> Optional[BaseGrammarBackend]:
     if server_args.grammar_backend == "outlines":
         from sglang.srt.constrained.outlines_backend import OutlinesGrammarBackend
@@ -188,6 +190,8 @@ def create_grammar_backend(server_args: ServerArgs, tokenizer, vocab_size):
             tokenizer=tokenizer,
             whitespace_pattern=server_args.constrained_json_whitespace_pattern,
         )
+    elif server_args.grammar_backend == "none":
+        return None
     else:
         raise ValueError(f"Invalid grammar backend: {server_args.grammar_backend}")

{sglang-0.4.4.post2 → sglang-0.4.4.post4}/sglang/srt/custom_op.py RENAMED Viewed

@@ -50,6 +50,7 @@ if _is_cuda:
     def scaled_fp8_quant(
         input: torch.Tensor,
         scale: Optional[torch.Tensor] = None,
+        num_token_padding: Optional[int] = None,
         use_per_token_if_dynamic: bool = False,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """
@@ -59,6 +60,8 @@ if _is_cuda:
             input (torch.Tensor): Input tensor to be quantized
             scale (Optional[torch.Tensor]): Pre-computed scaling factor for static quantization.
                 If None, scales will be computed dynamically.
+            num_token_padding (Optional[int]): If specified, pad the first dimension
+                of the output to at least this value.
             use_per_token_if_dynamic (bool): When using dynamic scaling (scale=None),
                 determines the quantization granularity:
                 - True: compute scale per token
@@ -75,6 +78,8 @@ if _is_cuda:
         assert input.ndim == 2, f"Expected 2D input tensor, got {input.ndim}D"
         shape = input.shape
         out_dtype = torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn
+        if num_token_padding:
+            shape = (max(num_token_padding, input.shape[0]), shape[1])
         output = torch.empty(shape, device=input.device, dtype=out_dtype)
         if scale is None:

sglang 0.4.4.post2__tar.gz → 0.4.4.post4__tar.gz

sglang 0.4.4.post2__tar.gz → 0.4.4.post4__tar.gz