sglang 0.3.6.post2.tar.gz → 0.4.0.tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in the public registry. It is provided for informational purposes only.
- {sglang-0.3.6.post2 → sglang-0.4.0}/PKG-INFO +2 -1
- {sglang-0.3.6.post2 → sglang-0.4.0}/pyproject.toml +2 -2
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/bench_offline_throughput.py +55 -2
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/bench_one_batch.py +7 -6
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/bench_one_batch_server.py +4 -3
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/bench_serving.py +13 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/check_env.py +1 -1
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/launch_server.py +3 -2
- sglang-0.4.0/sglang/srt/_custom_ops.py +118 -0
- sglang-0.4.0/sglang/srt/configs/device_config.py +17 -0
- sglang-0.4.0/sglang/srt/configs/load_config.py +84 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/configs/model_config.py +161 -4
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/configs/qwen2vl.py +5 -8
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/constrained/outlines_backend.py +6 -1
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/constrained/outlines_jump_forward.py +8 -1
- sglang-0.4.0/sglang/srt/distributed/__init__.py +3 -0
- sglang-0.4.0/sglang/srt/distributed/communication_op.py +34 -0
- sglang-0.4.0/sglang/srt/distributed/device_communicators/__init__.py +0 -0
- sglang-0.4.0/sglang/srt/distributed/device_communicators/cuda_wrapper.py +182 -0
- sglang-0.4.0/sglang/srt/distributed/device_communicators/custom_all_reduce.py +352 -0
- sglang-0.4.0/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +291 -0
- sglang-0.4.0/sglang/srt/distributed/device_communicators/hpu_communicator.py +48 -0
- sglang-0.4.0/sglang/srt/distributed/device_communicators/pynccl.py +204 -0
- sglang-0.4.0/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +362 -0
- sglang-0.4.0/sglang/srt/distributed/device_communicators/shm_broadcast.py +568 -0
- sglang-0.4.0/sglang/srt/distributed/device_communicators/xpu_communicator.py +47 -0
- sglang-0.4.0/sglang/srt/distributed/parallel_state.py +1275 -0
- sglang-0.4.0/sglang/srt/distributed/utils.py +223 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/hf_transformers_utils.py +37 -1
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/attention/flashinfer_backend.py +13 -15
- sglang-0.4.0/sglang/srt/layers/attention/torch_native_backend.py +285 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/fused_moe_patch.py +20 -11
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/linear.py +1 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/logits_processor.py +17 -3
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/quantization/__init__.py +34 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/vocab_parallel_embedding.py +1 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/lora/lora.py +1 -1
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/data_parallel_controller.py +7 -11
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/detokenizer_manager.py +7 -4
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/image_processor.py +1 -1
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/io_struct.py +48 -12
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/schedule_batch.py +42 -36
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/schedule_policy.py +7 -4
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/scheduler.py +111 -46
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/session_controller.py +0 -3
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/tokenizer_manager.py +169 -100
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/tp_worker.py +36 -3
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/managers/tp_worker_overlap_thread.py +32 -5
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/model_executor/cuda_graph_runner.py +16 -7
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/model_executor/forward_batch_info.py +9 -4
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/model_executor/model_runner.py +136 -150
- sglang-0.4.0/sglang/srt/model_loader/__init__.py +34 -0
- sglang-0.4.0/sglang/srt/model_loader/loader.py +1139 -0
- sglang-0.4.0/sglang/srt/model_loader/utils.py +41 -0
- sglang-0.4.0/sglang/srt/model_loader/weight_utils.py +640 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/baichuan.py +9 -10
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/chatglm.py +6 -15
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/commandr.py +2 -3
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/dbrx.py +2 -3
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/deepseek.py +4 -11
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/deepseek_v2.py +3 -11
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/exaone.py +2 -3
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/gemma.py +2 -6
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/gemma2.py +3 -14
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/gemma2_reward.py +0 -1
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/gpt2.py +5 -12
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/gpt_bigcode.py +6 -22
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/grok.py +14 -51
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/internlm2.py +2 -3
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/internlm2_reward.py +0 -1
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/llama.py +97 -27
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/llama_classification.py +1 -2
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/llama_embedding.py +1 -2
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/llama_reward.py +2 -3
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/llava.py +10 -12
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/llavavid.py +1 -2
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/minicpm.py +4 -7
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/minicpm3.py +6 -19
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/mixtral.py +12 -5
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/mixtral_quant.py +2 -3
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/mllama.py +3 -7
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/olmo.py +2 -8
- sglang-0.4.0/sglang/srt/models/olmo2.py +391 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/olmoe.py +3 -5
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/phi3_small.py +8 -8
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/qwen.py +2 -3
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/qwen2.py +10 -9
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/qwen2_moe.py +4 -11
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/qwen2_vl.py +12 -9
- sglang-0.4.0/sglang/srt/models/registry.py +99 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/stablelm.py +2 -3
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/torch_native_llama.py +6 -12
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/xverse.py +2 -4
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/xverse_moe.py +4 -11
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/yivl.py +2 -3
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/openai_api/adapter.py +10 -6
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/openai_api/protocol.py +1 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/server.py +303 -204
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/server_args.py +65 -31
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/utils.py +253 -48
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/test_utils.py +27 -7
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/utils.py +2 -2
- sglang-0.4.0/sglang/version.py +1 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang.egg-info/PKG-INFO +2 -1
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang.egg-info/SOURCES.txt +23 -3
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang.egg-info/requires.txt +1 -0
- sglang-0.3.6.post2/sglang/srt/layers/fused_moe_grok/__init__.py +0 -1
- sglang-0.3.6.post2/sglang/srt/layers/fused_moe_grok/fused_moe.py +0 -692
- sglang-0.3.6.post2/sglang/srt/layers/fused_moe_grok/layer.py +0 -630
- sglang-0.3.6.post2/sglang/version.py +0 -1
- {sglang-0.3.6.post2 → sglang-0.4.0}/LICENSE +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/README.md +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/setup.cfg +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/__init__.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/api.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/bench_latency.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/global_config.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/__init__.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/chat_template.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/choices.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/compiler.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/interpreter.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/ir.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/lang/tracer.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/configs/__init__.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/constrained/base_grammar_backend.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/constrained/xgrammar_backend.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/conversation.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/activation.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/attention/__init__.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/attention/triton_backend.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/custom_op_util.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/fused_moe_triton/__init__.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/fused_moe_triton/fused_moe.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/fused_moe_triton/layer.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/layernorm.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/quantization/base_config.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/radix_attention.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/rotary_embedding.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/sampler.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/layers/torchao_utils.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/lora/lora_config.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/lora/lora_manager.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/mem_cache/memory_pool.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/metrics/collector.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/metrics/func_timer.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/model_parallel.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/sampling/sampling_batch_info.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/srt/sampling/sampling_params.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/few_shot_gsm8k.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/few_shot_gsm8k_engine.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/run_eval.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/runners.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/test_activation.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang/test/test_programs.py +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.3.6.post2 → sglang-0.4.0}/sglang.egg-info/top_level.txt +0 -0
```diff
--- sglang-0.3.6.post2/PKG-INFO
+++ sglang-0.4.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.6.post2
+Version: 0.4.0
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
         Version 2.0, January 2004
@@ -241,6 +241,7 @@ Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: torch; extra == "srt"
 Requires-Dist: vllm>=0.6.3.post1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
+Requires-Dist: flashinfer>=0.1.6; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
```
```diff
--- sglang-0.3.6.post2/pyproject.toml
+++ sglang-0.4.0/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.3.6.post2"
+version = "0.4.0"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -23,7 +23,7 @@ runtime_common = ["aiohttp", "decord", "fastapi",
     "psutil", "pydantic", "python-multipart",
     "pyzmq>=25.1.2", "torchao", "uvicorn", "uvloop",
     "xgrammar>=0.1.4"]
-srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1", "cuda-python"]
+srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1", "cuda-python", "flashinfer>=0.1.6"]
 
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
 # => base docker rocm/vllm-dev:20241022, not from public vllm whl
```
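The practical effect of this change is that the `srt` extra now pulls in `flashinfer>=0.1.6`. One way to confirm the new pin in an environment where sglang 0.4.0 is installed (a sketch; only standard-library metadata inspection is used):

```python
# Confirm the new flashinfer pin in the installed package metadata.
# Assumes sglang 0.4.0 is installed in the current environment.
from importlib.metadata import requires

srt_requirements = [r for r in requires("sglang") if 'extra == "srt"' in r]
print(srt_requirements)  # expected to include 'flashinfer>=0.1.6; extra == "srt"'
```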
```diff
--- sglang-0.3.6.post2/sglang/bench_offline_throughput.py
+++ sglang-0.4.0/sglang/bench_offline_throughput.py
@@ -14,20 +14,20 @@ import argparse
 import dataclasses
 import json
 import logging
+import os
 import random
 import time
 from typing import Dict, List, Optional, Tuple
 
 import numpy as np
 
-from sglang.api import Engine
 from sglang.bench_serving import (
     get_dataset,
     get_tokenizer,
     sample_random_requests,
     set_ulimit,
 )
-from sglang.srt.server import Runtime
+from sglang.srt.server import Engine, Runtime
 from sglang.srt.server_args import ServerArgs
 
 
@@ -52,6 +52,7 @@ class BenchArgs:
     seed: int = 1
     skip_warmup: bool = False
     do_not_exit: bool = False
+    profile: bool = False
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -156,6 +157,12 @@ class BenchArgs:
             action="store_true",
             help="Do not exit the program. This is useful for nsys profile with --duration and --delay.",
         )
+        parser.add_argument(
+            "--profile",
+            action="store_true",
+            help="Use Torch Profiler. The endpoint must be launched with "
+            "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -169,6 +176,7 @@ def throughput_test_once(
     reqs: List[Tuple[str, int, int]],
     ignore_eos: bool,
     extra_request_body: Dict,
+    profile: bool,
 ):
     measurement_results = {
         "backend": backend_name,
@@ -194,7 +202,15 @@
     ]
 
     st = time.perf_counter()
+    if profile:
+        backend.start_profile()
+
     gen_out = backend.generate(prompt=prompt, sampling_params=sampling_params)
+
+    if profile:
+        backend.stop_profile()
+        monitor_trace_file(os.getenv("SGLANG_TORCH_PROFILER_DIR"))
+
     latency = time.perf_counter() - st
 
     if backend_name == "runtime":
@@ -221,6 +237,41 @@
     return measurement_results
 
 
+def monitor_trace_file(directory, interval=1):
+
+    print(f"Monitoring {directory} for new trace files...")
+
+    known_files = set(os.listdir(directory))
+
+    while True:
+        flag = False
+        time.sleep(interval)
+        current_files = set(os.listdir(directory))
+
+        new_files = current_files - known_files
+        for new_file in new_files:
+            new_file_path = os.path.join(directory, new_file)
+            print(f"New file detected: {new_file}")
+
+            previous_size = 0
+            while True:
+                try:
+                    current_size = os.path.getsize(new_file_path)
+                except FileNotFoundError:
+                    print(f"File {new_file} is no longer accessible.")
+                    break
+
+                if current_size > previous_size:
+                    previous_size = current_size
+                else:
+                    flag = True
+                    break
+
+                time.sleep(interval)
+        if flag:
+            break
+
+
 def throughput_test(
     server_args: ServerArgs,
     bench_args: BenchArgs,
@@ -268,6 +319,7 @@
         reqs=warmup_requests,
         ignore_eos=not bench_args.disable_ignore_eos,
         extra_request_body=extra_request_body,
+        profile=False,
     )
 
     logging.info("\nBenchmark...")
@@ -277,6 +329,7 @@
         reqs=input_requests,
         ignore_eos=not bench_args.disable_ignore_eos,
         extra_request_body=extra_request_body,
+        profile=bench_args.profile,
     )
 
     if bench_args.result_filename:
```
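The new `--profile` flag wires the Torch Profiler into the offline throughput run: the engine writes a trace into `SGLANG_TORCH_PROFILER_DIR`, and `monitor_trace_file` polls that directory until the written trace file stops growing. A sketch of driving it (the model path and trace directory below are examples, not part of this diff):

```python
# Sketch of invoking the new --profile path. The model path and trace
# directory are example values; bench_offline_throughput will poll the
# trace dir via monitor_trace_file until the trace file stops growing.
import os
import subprocess

os.makedirs("/tmp/sglang_traces", exist_ok=True)
env = dict(os.environ, SGLANG_TORCH_PROFILER_DIR="/tmp/sglang_traces")
subprocess.run(
    [
        "python", "-m", "sglang.bench_offline_throughput",
        "--model-path", "meta-llama/Llama-3.1-8B-Instruct",  # example model
        "--profile",
    ],
    env=env,
    check=True,
)
```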
```diff
--- sglang-0.3.6.post2/sglang/bench_one_batch.py
+++ sglang-0.4.0/sglang/bench_one_batch.py
@@ -47,6 +47,7 @@ import itertools
 import json
 import logging
 import multiprocessing
+import os
 import time
 from typing import Tuple
 
@@ -62,11 +63,7 @@ from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server import _set_envs_and_config
 from sglang.srt.server_args import PortArgs, ServerArgs
-from sglang.srt.utils import (
-    configure_logger,
-    kill_child_process,
-    suppress_other_loggers,
-)
+from sglang.srt.utils import configure_logger, kill_process_tree, suppress_other_loggers
 
 
 @dataclasses.dataclass
@@ -114,8 +111,12 @@ def load_model(server_args, port_args, tp_rank):
     model_config = ModelConfig(
         server_args.model_path,
         trust_remote_code=server_args.trust_remote_code,
+        revision=server_args.revision,
         context_length=server_args.context_length,
         model_override_args=server_args.json_model_override_args,
+        is_embedding=server_args.is_embedding,
+        dtype=server_args.dtype,
+        quantization=server_args.quantization,
     )
     model_runner = ModelRunner(
         model_config=model_config,
@@ -468,4 +469,4 @@ if __name__ == "__main__":
         main(server_args, bench_args)
     finally:
         if server_args.tp_size != 1:
-            kill_child_process()
+            kill_process_tree(os.getpid(), include_parent=False)
```
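Several entry points in this release switch from `kill_child_process` to a new `kill_process_tree(pid, include_parent=...)` helper. Its implementation lives in `sglang/srt/utils.py`, which this section does not show; a rough sketch of the semantics these call sites assume, using `psutil` (an assumption — the shipped helper may differ in detail):

```python
# Rough sketch of the kill_process_tree semantics assumed by the call
# sites above. The shipped helper is in sglang/srt/utils.py and is not
# shown in this diff; treat this as an illustration only.
import psutil


def kill_process_tree(pid: int, include_parent: bool = True) -> None:
    """Kill all descendants of `pid`, and optionally `pid` itself."""
    try:
        parent = psutil.Process(pid)
    except psutil.NoSuchProcess:
        return
    for child in parent.children(recursive=True):
        try:
            child.kill()
        except psutil.NoSuchProcess:
            pass
    if include_parent:
        try:
            parent.kill()
        except psutil.NoSuchProcess:
            pass
```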
```diff
--- sglang-0.3.6.post2/sglang/bench_one_batch_server.py
+++ sglang-0.4.0/sglang/bench_one_batch_server.py
@@ -15,6 +15,7 @@ import dataclasses
 import itertools
 import json
 import multiprocessing
+import os
 import time
 from typing import Tuple
 
@@ -23,7 +24,7 @@ import requests
 
 from sglang.srt.server import launch_server
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import kill_child_process
+from sglang.srt.utils import kill_process_tree
 
 
 @dataclasses.dataclass
@@ -69,7 +70,7 @@ def launch_server_internal(server_args):
     except Exception as e:
         raise e
     finally:
-        kill_child_process()
+        kill_process_tree(os.getpid(), include_parent=False)
 
 
 def launch_server_process(server_args: ServerArgs):
@@ -175,7 +176,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
         )
     finally:
         if proc:
-            kill_child_process(proc.pid)
+            kill_process_tree(proc.pid)
 
     print(f"\nResults are saved to {bench_args.result_filename}")
 
```
```diff
--- sglang-0.3.6.post2/sglang/bench_serving.py
+++ sglang-0.4.0/sglang/bench_serving.py
@@ -51,6 +51,7 @@ class RequestFuncInput:
     prompt_len: int
     output_len: int
     model: str
+    lora_name: str
    extra_request_body: Dict[str, Any]
 
 
@@ -319,6 +320,7 @@ async def async_request_sglang_generate(
             "ignore_eos": not args.disable_ignore_eos,
         },
         "stream": not args.disable_stream,
+        "lora_path": request_func_input.lora_name,
         **request_func_input.extra_request_body,
     }
     headers = {}
@@ -884,6 +886,7 @@ async def benchmark(
     request_rate: float,
     max_concurrency: Optional[int],
     disable_tqdm: bool,
+    lora_name: str,
     extra_request_body: Dict[str, Any],
     profile: bool,
 ):
@@ -909,6 +912,7 @@
         api_url=api_url,
         prompt_len=test_prompt_len,
         output_len=test_output_len,
+        lora_name=lora_name,
         extra_request_body=extra_request_body,
     )
     test_output = await request_func(request_func_input=test_input)
@@ -942,6 +946,7 @@
             api_url=api_url,
             prompt_len=prompt_len,
             output_len=output_len,
+            lora_name=lora_name,
             extra_request_body=extra_request_body,
         )
         tasks.append(
@@ -1247,6 +1252,7 @@ def run_benchmark(args_: argparse.Namespace):
             request_rate=args.request_rate,
             max_concurrency=args.max_concurrency,
             disable_tqdm=args.disable_tqdm,
+            lora_name=args.lora_name,
             extra_request_body=extra_request_body,
             profile=args.profile,
         )
@@ -1267,6 +1273,7 @@ def run_benchmark(args_: argparse.Namespace):
             request_rate=rate,
             max_concurrency=args.max_concurrency,
             disable_tqdm=args.disable_tqdm,
+            lora_name=args.lora_name,
             extra_request_body=extra_request_body,
             profile=args.profile,
         )
@@ -1451,5 +1458,11 @@ if __name__ == "__main__":
         help="Use Torch Profiler. The endpoint must be launched with "
         "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
     )
+    parser.add_argument(
+        "--lora-name",
+        type=str,
+        default=None,
+        help="The name of LoRA adapter",
+    )
     args = parser.parse_args()
     run_benchmark(args)
```
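The new `--lora-name` flag threads an adapter name through `RequestFuncInput` into the `/generate` payload as `lora_path`. A sketch of the request body the benchmark now sends (the adapter name and port are example values; the name must match an adapter registered on the server, e.g. via `--lora-paths`):

```python
# Sketch of the payload async_request_sglang_generate builds when
# --lora-name is set. "my-adapter" and the port are examples only.
import requests

payload = {
    "text": "The capital of France is",
    "sampling_params": {"temperature": 0.0, "max_new_tokens": 32},
    "stream": False,
    "lora_path": "my-adapter",  # hypothetical adapter name
}
print(requests.post("http://127.0.0.1:30000/generate", json=payload).json())
```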
```diff
--- sglang-0.3.6.post2/sglang/launch_server.py
+++ sglang-0.4.0/sglang/launch_server.py
@@ -1,10 +1,11 @@
 """Launch the inference server."""
 
+import os
 import sys
 
 from sglang.srt.server import launch_server
 from sglang.srt.server_args import prepare_server_args
-from sglang.srt.utils import kill_child_process
+from sglang.srt.utils import kill_process_tree
 
 if __name__ == "__main__":
     server_args = prepare_server_args(sys.argv[1:])
@@ -12,4 +13,4 @@ if __name__ == "__main__":
     try:
         launch_server(server_args)
     finally:
-        kill_child_process()
+        kill_process_tree(os.getpid(), include_parent=False)
```
```diff
--- /dev/null
+++ sglang-0.4.0/sglang/srt/_custom_ops.py
@@ -0,0 +1,118 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/a6221a144af772fd1a68fe7e627935dc53e81738/vllm/_custom_ops.py
+import contextlib
+import functools
+import importlib
+import logging
+from typing import TYPE_CHECKING, List, Optional, Tuple, Union
+
+import torch
+import torch.library
+
+from sglang.srt.utils import is_hpu
+
+logger = logging.getLogger(__name__)
+
+if not is_hpu():
+    try:
+        import custom_ar
+    except ImportError as e:
+        logger.warning("Failed to import from custom_ar with %r", e)
+
+
+def hint_on_error(fn):
+
+    @functools.wraps(fn)
+    def wrapper(*args, **kwargs):
+        try:
+            return fn(*args, **kwargs)
+
+        except NotImplementedError as e:
+            msg = (
+                "Error in calling custom op %s: %s\n"
+                "Not implemented or built, mostly likely because the current current device "
+                "does not support this kernel (less likely TORCH_CUDA_ARCH_LIST was set "
+                "incorrectly while building)"
+            )
+            logger.error(msg, fn.__name__, e)
+            raise NotImplementedError(msg % (fn.__name__, e)) from e
+        except AttributeError as e:
+            msg = (
+                "Error in calling custom op %s: %s\n"
+                "Possibly you have built or installed an obsolete version of vllm.\n"
+                "Please try a clean build and install of vllm,"
+                "or remove old built files such as vllm/*cpython*.so and build/ ."
+            )
+            logger.error(msg, fn.__name__, e)
+            raise e
+
+    return wrapper
+
+
+# custom ar
+def init_custom_ar(
+    ipc_tensors: List[torch.Tensor],
+    rank_data: torch.Tensor,
+    rank: int,
+    full_nvlink: bool,
+) -> int:
+    return torch.ops._C_vllm_ar.init_custom_ar(
+        ipc_tensors, rank_data, rank, full_nvlink
+    )
+
+
+def all_reduce(
+    fa: int,
+    inp: torch.Tensor,
+    out: torch.Tensor,
+    reg_buffer: int,
+    reg_buffer_sz_bytes: int,
+) -> None:
+    torch.ops._C_vllm_ar.all_reduce(fa, inp, out, reg_buffer, reg_buffer_sz_bytes)
+
+
+def dispose(fa: int) -> None:
+    torch.ops._C_vllm_ar.dispose(fa)
+
+
+def meta_size() -> int:
+    return torch.ops._C_vllm_ar.meta_size()
+
+
+def register_buffer(fa: int, ipc_tensors: List[int]) -> None:
+    return torch.ops._C_vllm_ar.register_buffer(fa, ipc_tensors)
+
+
+def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[int], List[int]]:
+    return torch.ops._C_vllm_ar.get_graph_buffer_ipc_meta(fa)
+
+
+def register_graph_buffers(
+    fa: int, handles: List[List[int]], offsets: List[List[int]]
+) -> None:
+    torch.ops._C_vllm_ar.register_graph_buffers(fa, handles, offsets)
+
+
+# temporary fix for https://github.com/vllm-project/vllm/issues/5456
+# TODO: remove this in v0.6.0
+names_and_values = globals()
+names_and_values_to_update = {}
+# prepare variables to avoid dict size change during iteration
+k, v, arg = None, None, None
+fn_type = type(lambda x: x)
+for k, v in names_and_values.items():
+    # find functions that are defined in this file and have torch.Tensor
+    # in their annotations. `arg == "torch.Tensor"` is used to handle
+    # the case when users use `import __annotations__` to turn type
+    # hints into strings.
+    if (
+        isinstance(v, fn_type)
+        and v.__code__.co_filename == __file__
+        and any(
+            arg is torch.Tensor or arg == "torch.Tensor"
+            for arg in v.__annotations__.values()
+        )
+    ):
+        names_and_values_to_update[k] = hint_on_error(v)
+
+names_and_values.update(names_and_values_to_update)
+del names_and_values_to_update, names_and_values, v, k, fn_type
```
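The module ends with an unusual idiom: at import time it rewrites its own `globals()`, wrapping every function whose annotations mention `torch.Tensor` with `hint_on_error`, so kernel-dispatch failures surface with a build/arch hint instead of a bare `NotImplementedError`. A toy reproduction of the idiom (all names here are illustrative, not sglang code):

```python
# Toy reproduction of the decorate-by-annotation idiom: wrap every
# module-level function whose annotations mention torch.Tensor.
# All names here are illustrative; this is not sglang code.
import torch


def scale(x: torch.Tensor, s: float) -> torch.Tensor:
    return x * s


def plain(n: int) -> int:  # no tensor annotation, left untouched
    return n + 1


def traced(fn):
    def wrapper(*args, **kwargs):
        print(f"calling {fn.__name__}")
        return fn(*args, **kwargs)
    return wrapper


_globals = globals()
_to_update = {}
# iterate over a snapshot so assigning loop variables cannot resize
# the dict mid-iteration (the original pre-binds k, v for this reason)
for _name, _obj in list(_globals.items()):
    if (
        callable(_obj)
        and getattr(_obj, "__annotations__", None)
        and any(a is torch.Tensor for a in _obj.__annotations__.values())
    ):
        _to_update[_name] = traced(_obj)
_globals.update(_to_update)

scale(torch.ones(2), 2.0)  # prints "calling scale"
plain(3)                   # silent
```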
```diff
--- /dev/null
+++ sglang-0.4.0/sglang/srt/configs/device_config.py
@@ -0,0 +1,17 @@
+import logging
+from typing import Optional
+
+import torch
+
+logger = logging.getLogger(__name__)
+
+
+class DeviceConfig:
+    device: Optional[torch.device]
+
+    def __init__(self, device: str = "cuda") -> None:
+        if device in ["cuda", "xpu", "hpu"]:
+            self.device_type = device
+        else:
+            raise RuntimeError(f"Not supported device type: {device}")
+        self.device = torch.device(self.device_type)
```
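A usage sketch for the new config class (the failing call is included only to show the validation path):

```python
# Usage sketch: DeviceConfig validates the device string and keeps a
# torch.device handle for it.
from sglang.srt.configs.device_config import DeviceConfig

cfg = DeviceConfig("cuda")
print(cfg.device_type)  # "cuda"
print(cfg.device)       # device(type='cuda')

DeviceConfig("cpu")  # raises RuntimeError: Not supported device type: cpu
```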
```diff
--- /dev/null
+++ sglang-0.4.0/sglang/srt/configs/load_config.py
@@ -0,0 +1,84 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
+import enum
+import json
+import logging
+from dataclasses import dataclass, field
+from typing import List, Optional, Union
+
+from sglang.srt.utils import is_hip
+
+logger = logging.getLogger(__name__)
+
+
+class LoadFormat(str, enum.Enum):
+    AUTO = "auto"
+    PT = "pt"
+    SAFETENSORS = "safetensors"
+    NPCACHE = "npcache"
+    DUMMY = "dummy"
+    SHARDED_STATE = "sharded_state"
+    GGUF = "gguf"
+    BITSANDBYTES = "bitsandbytes"
+    MISTRAL = "mistral"
+
+
+@dataclass
+class LoadConfig:
+    """
+    download_dir: Directory to download and load the weights, default to the
+        default cache directory of huggingface.
+    load_format: The format of the model weights to load:
+        "auto" will try to load the weights in the safetensors format and
+            fall back to the pytorch bin format if safetensors format is
+            not available.
+        "pt" will load the weights in the pytorch bin format.
+        "safetensors" will load the weights in the safetensors format.
+        "npcache" will load the weights in pytorch format and store
+            a numpy cache to speed up the loading.
+        "dummy" will initialize the weights with random values, which is
+            mainly for profiling.
+        "bitsandbytes" will load nf4 type weights.
+    ignore_patterns: The list of patterns to ignore when loading the model.
+        Default to "original/**/*" to avoid repeated loading of llama's
+        checkpoints.
+
+    """
+
+    load_format: Union[str, LoadFormat] = LoadFormat.AUTO
+    download_dir: Optional[str] = None
+    model_loader_extra_config: Optional[Union[str, dict]] = field(default_factory=dict)
+    ignore_patterns: Optional[Union[List[str], str]] = None
+
+    def __post_init__(self):
+        model_loader_extra_config = self.model_loader_extra_config or {}
+        if isinstance(model_loader_extra_config, str):
+            self.model_loader_extra_config = json.loads(model_loader_extra_config)
+        self._verify_load_format()
+
+        if self.ignore_patterns is not None and len(self.ignore_patterns) > 0:
+            logger.info(
+                "Ignoring the following patterns when downloading weights: %s",
+                self.ignore_patterns,
+            )
+        else:
+            self.ignore_patterns = ["original/**/*"]
+
+    def _verify_load_format(self) -> None:
+        if not isinstance(self.load_format, str):
+            return
+
+        load_format = self.load_format.lower()
+        self.load_format = LoadFormat(load_format)
+
+        rocm_not_supported_load_format: List[str] = []
+        if is_hip() and load_format in rocm_not_supported_load_format:
+            rocm_supported_load_format = [
+                f
+                for f in LoadFormat.__members__
+                if (f not in rocm_not_supported_load_format)
+            ]
+            raise ValueError(
+                f"load format '{load_format}' is not supported in ROCm. "
+                f"Supported load formats are "
+                f"{rocm_supported_load_format}"
+            )
```
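A usage sketch for `LoadConfig`: string load formats are normalized into the `LoadFormat` enum and a JSON string passed as extra config is parsed in `__post_init__` (the `"some_key"` below is a made-up example key):

```python
# Usage sketch for LoadConfig; "some_key" is a made-up example key.
from sglang.srt.configs.load_config import LoadConfig, LoadFormat

cfg = LoadConfig(
    load_format="safetensors",
    model_loader_extra_config='{"some_key": 1}',
)
assert cfg.load_format is LoadFormat.SAFETENSORS
assert cfg.model_loader_extra_config == {"some_key": 1}
assert cfg.ignore_patterns == ["original/**/*"]

LoadConfig(load_format="not-a-format")  # raises ValueError via LoadFormat()
```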