PyPI - sglang - Versions diffs - 0.3.6.post2.tar.gz → 0.3.6.post3.tar.gz - Mend

sglang 0.3.6.post2.tar.gz → 0.3.6.post3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (171) hide show

{sglang-0.3.6.post2 → sglang-0.3.6.post3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.6.post2
+Version: 0.3.6.post3
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License:                                  Apache License
                                    Version 2.0, January 2004
@@ -241,6 +241,7 @@ Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: torch; extra == "srt"
 Requires-Dist: vllm>=0.6.3.post1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
+Requires-Dist: flashinfer>=0.1.6; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"

{sglang-0.3.6.post2 → sglang-0.3.6.post3}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "sglang"
-version = "0.3.6.post2"
+version = "0.3.6.post3"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -23,7 +23,7 @@ runtime_common = ["aiohttp", "decord", "fastapi",
     "psutil", "pydantic", "python-multipart",
     "pyzmq>=25.1.2", "torchao", "uvicorn", "uvloop",
     "xgrammar>=0.1.4"]
-srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1", "cuda-python"]
+srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1", "cuda-python", "flashinfer>=0.1.6"]
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
 # => base docker rocm/vllm-dev:20241022, not from public vllm whl

{sglang-0.3.6.post2 → sglang-0.3.6.post3}/sglang/bench_offline_throughput.py RENAMED Viewed

@@ -14,20 +14,20 @@ import argparse
 import dataclasses
 import json
 import logging
+import os
 import random
 import time
 from typing import Dict, List, Optional, Tuple
 import numpy as np
-from sglang.api import Engine
 from sglang.bench_serving import (
     get_dataset,
     get_tokenizer,
     sample_random_requests,
     set_ulimit,
 )
-from sglang.srt.server import Runtime
+from sglang.srt.server import Engine, Runtime
 from sglang.srt.server_args import ServerArgs
@@ -52,6 +52,7 @@ class BenchArgs:
     seed: int = 1
     skip_warmup: bool = False
     do_not_exit: bool = False
+    profile: bool = False
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -156,6 +157,12 @@ class BenchArgs:
             action="store_true",
             help="Do not exit the program. This is useful for nsys profile with --duration and --delay.",
         )
+        parser.add_argument(
+            "--profile",
+            action="store_true",
+            help="Use Torch Profiler. The endpoint must be launched with "
+            "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
+        )
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -169,6 +176,7 @@ def throughput_test_once(
     reqs: List[Tuple[str, int, int]],
     ignore_eos: bool,
     extra_request_body: Dict,
+    profile: bool,
 ):
     measurement_results = {
         "backend": backend_name,
@@ -194,7 +202,15 @@ def throughput_test_once(
     ]
     st = time.perf_counter()
+    if profile:
+        backend.start_profile()
     gen_out = backend.generate(prompt=prompt, sampling_params=sampling_params)
+    if profile:
+        backend.stop_profile()
+        monitor_trace_file(os.getenv("SGLANG_TORCH_PROFILER_DIR"))
     latency = time.perf_counter() - st
     if backend_name == "runtime":
@@ -221,6 +237,41 @@ def throughput_test_once(
     return measurement_results
+def monitor_trace_file(directory, interval=1):
+    print(f"Monitoring {directory} for new trace files...")
+    known_files = set(os.listdir(directory))
+    while True:
+        flag = False
+        time.sleep(interval)
+        current_files = set(os.listdir(directory))
+        new_files = current_files - known_files
+        for new_file in new_files:
+            new_file_path = os.path.join(directory, new_file)
+            print(f"New file detected: {new_file}")
+            previous_size = 0
+            while True:
+                try:
+                    current_size = os.path.getsize(new_file_path)
+                except FileNotFoundError:
+                    print(f"File {new_file} is no longer accessible.")
+                    break
+                if current_size > previous_size:
+                    previous_size = current_size
+                else:
+                    flag = True
+                    break
+                time.sleep(interval)
+        if flag:
+            break
 def throughput_test(
     server_args: ServerArgs,
     bench_args: BenchArgs,
@@ -268,6 +319,7 @@ def throughput_test(
             reqs=warmup_requests,
             ignore_eos=not bench_args.disable_ignore_eos,
             extra_request_body=extra_request_body,
+            profile=False,
         )
     logging.info("\nBenchmark...")
@@ -277,6 +329,7 @@ def throughput_test(
         reqs=input_requests,
         ignore_eos=not bench_args.disable_ignore_eos,
         extra_request_body=extra_request_body,
+        profile=bench_args.profile,
     )
     if bench_args.result_filename:

{sglang-0.3.6.post2 → sglang-0.3.6.post3}/sglang/bench_one_batch.py RENAMED Viewed

@@ -47,6 +47,7 @@ import itertools
 import json
 import logging
 import multiprocessing
+import os
 import time
 from typing import Tuple
@@ -62,11 +63,7 @@ from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server import _set_envs_and_config
 from sglang.srt.server_args import PortArgs, ServerArgs
-from sglang.srt.utils import (
-    configure_logger,
-    kill_child_process,
-    suppress_other_loggers,
-)
+from sglang.srt.utils import configure_logger, kill_process_tree, suppress_other_loggers
 @dataclasses.dataclass
@@ -468,4 +465,4 @@ if __name__ == "__main__":
         main(server_args, bench_args)
     finally:
         if server_args.tp_size != 1:
-            kill_child_process()
+            kill_process_tree(os.getpid(), include_parent=False)

{sglang-0.3.6.post2 → sglang-0.3.6.post3}/sglang/bench_one_batch_server.py RENAMED Viewed

@@ -15,6 +15,7 @@ import dataclasses
 import itertools
 import json
 import multiprocessing
+import os
 import time
 from typing import Tuple
@@ -23,7 +24,7 @@ import requests
 from sglang.srt.server import launch_server
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import kill_child_process
+from sglang.srt.utils import kill_process_tree
 @dataclasses.dataclass
@@ -69,7 +70,7 @@ def launch_server_internal(server_args):
     except Exception as e:
         raise e
     finally:
-        kill_child_process()
+        kill_process_tree(os.getpid(), include_parent=False)
 def launch_server_process(server_args: ServerArgs):
@@ -175,7 +176,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
             )
     finally:
         if proc:
-            kill_child_process(proc.pid, include_self=True)
+            kill_process_tree(proc.pid)
     print(f"\nResults are saved to {bench_args.result_filename}")

{sglang-0.3.6.post2 → sglang-0.3.6.post3}/sglang/launch_server.py RENAMED Viewed

@@ -1,10 +1,11 @@
 """Launch the inference server."""
+import os
 import sys
 from sglang.srt.server import launch_server
 from sglang.srt.server_args import prepare_server_args
-from sglang.srt.utils import kill_child_process
+from sglang.srt.utils import kill_process_tree
 if __name__ == "__main__":
     server_args = prepare_server_args(sys.argv[1:])
@@ -12,4 +13,4 @@ if __name__ == "__main__":
     try:
         launch_server(server_args)
     finally:
-        kill_child_process()
+        kill_process_tree(os.getpid(), include_parent=False)

{sglang-0.3.6.post2 → sglang-0.3.6.post3}/sglang/srt/managers/data_parallel_controller.py RENAMED Viewed

@@ -15,9 +15,11 @@
 import logging
 import multiprocessing as mp
+import signal
 import threading
 from enum import Enum, auto
+import psutil
 import zmq
 from sglang.srt.managers.io_struct import (
@@ -26,13 +28,7 @@ from sglang.srt.managers.io_struct import (
 )
 from sglang.srt.managers.scheduler import run_scheduler_process
 from sglang.srt.server_args import PortArgs, ServerArgs
-from sglang.srt.utils import (
-    bind_port,
-    configure_logger,
-    get_zmq_socket,
-    kill_parent_process,
-    suppress_other_loggers,
-)
+from sglang.srt.utils import bind_port, configure_logger, get_zmq_socket
 from sglang.utils import get_exception_traceback
 logger = logging.getLogger(__name__)
@@ -235,7 +231,7 @@ def run_data_parallel_controller_process(
     pipe_writer,
 ):
     configure_logger(server_args)
-    suppress_other_loggers()
+    parent_process = psutil.Process().parent()
     try:
         controller = DataParallelController(server_args, port_args)
@@ -244,6 +240,6 @@ def run_data_parallel_controller_process(
         )
         controller.event_loop()
     except Exception:
-        msg = get_exception_traceback()
-        logger.error(msg)
-        kill_parent_process()
+        traceback = get_exception_traceback()
+        logger.error(f"DataParallelController hit an exception: {traceback}")
+        parent_process.send_signal(signal.SIGQUIT)

{sglang-0.3.6.post2 → sglang-0.3.6.post3}/sglang/srt/managers/detokenizer_manager.py RENAMED Viewed

@@ -15,9 +15,11 @@
 import dataclasses
 import logging
+import signal
 from collections import OrderedDict
 from typing import List, Union
+import psutil
 import zmq
 from sglang.srt.hf_transformers_utils import get_tokenizer
@@ -28,7 +30,7 @@ from sglang.srt.managers.io_struct import (
 )
 from sglang.srt.managers.schedule_batch import FINISH_MATCHED_STR, FINISH_MATCHED_TOKEN
 from sglang.srt.server_args import PortArgs, ServerArgs
-from sglang.srt.utils import configure_logger, get_zmq_socket, kill_parent_process
+from sglang.srt.utils import configure_logger, get_zmq_socket
 from sglang.utils import find_printable_text, get_exception_traceback
 logger = logging.getLogger(__name__)
@@ -193,11 +195,12 @@ def run_detokenizer_process(
     port_args: PortArgs,
 ):
     configure_logger(server_args)
+    parent_process = psutil.Process().parent()
     try:
         manager = DetokenizerManager(server_args, port_args)
         manager.event_loop()
     except Exception:
-        msg = get_exception_traceback()
-        logger.error(msg)
-        kill_parent_process()
+        traceback = get_exception_traceback()
+        logger.error(f"DetokenizerManager hit an exception: {traceback}")
+        parent_process.send_signal(signal.SIGQUIT)

{sglang-0.3.6.post2 → sglang-0.3.6.post3}/sglang/srt/managers/image_processor.py RENAMED Viewed

@@ -338,7 +338,7 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
             "pixel_values": pixel_values,
             "image_hashes": image_hashes,
             "image_sizes": image_sizes,
-            "modalities": request_obj.modalities,
+            "modalities": request_obj.modalities or ["image"],
             "image_grid_thws": image_grid_thws,
         }

{sglang-0.3.6.post2 → sglang-0.3.6.post3}/sglang/srt/managers/io_struct.py RENAMED Viewed

@@ -376,16 +376,6 @@ class ProfileReq(Enum):
     STOP_PROFILE = 2
-@dataclass
-class GetMemPoolSizeReq:
-    pass
-@dataclass
-class GetMemPoolSizeReqOutput:
-    size: int
 @dataclass
 class OpenSessionReqInput:
     capacity_of_str_len: int

{sglang-0.3.6.post2 → sglang-0.3.6.post3}/sglang/srt/managers/schedule_batch.py RENAMED Viewed

@@ -124,7 +124,7 @@ class FINISH_ABORT(BaseFinishReason):
 class ImageInputs:
     """The image related inputs."""
-    pixel_values: torch.Tensor
+    pixel_values: Union[torch.Tensor, np.array]
     image_hashes: Optional[list] = None
     image_sizes: Optional[list] = None
     image_offsets: Optional[list] = None
@@ -132,7 +132,7 @@ class ImageInputs:
     modalities: Optional[list] = None
     num_image_tokens: Optional[int] = None
-    image_embeds: Optional[List[torch.Tensor]] = None
+    # Llava related
     aspect_ratio_ids: Optional[List[torch.Tensor]] = None
     aspect_ratio_mask: Optional[List[torch.Tensor]] = None
@@ -141,19 +141,17 @@ class ImageInputs:
     mrope_position_delta: Optional[torch.Tensor] = None
     @staticmethod
-    def from_dict(obj, vocab_size):
-        # Use image hash as fake token_ids, which is then used for prefix matching
+    def from_dict(obj: dict):
         ret = ImageInputs(
             pixel_values=obj["pixel_values"],
-            image_hashes=hash(tuple(obj["image_hashes"])),
+            image_hashes=obj["image_hashes"],
         )
-        image_hash = ret.image_hashes
-        ret.pad_values = [
-            (image_hash) % vocab_size,
-            (image_hash >> 16) % vocab_size,
-            (image_hash >> 32) % vocab_size,
-            (image_hash >> 64) % vocab_size,
-        ]
+        # Use image hash as fake token_ids. We use this as the key for prefix matching in the radix cache.
+        # Please note that if the `input_ids` is later used in the model forward,
+        # you also need to clamp the values within the range of [0, vocab_size) to avoid out-of-bound
+        # errors in cuda kernels. See also llava.py for example.
+        ret.pad_values = [x % (1 << 30) for x in ret.image_hashes]
         optional_args = [
             "image_sizes",
@@ -168,17 +166,16 @@ class ImageInputs:
         return ret
-    def merge(self, other, vocab_size):
+    def merge(self, other):
         assert self.pixel_values.shape[1:] == other.pixel_values.shape[1:]
         self.pixel_values = np.concatenate([self.pixel_values, other.pixel_values])
-        self.image_hashes += other.image_hashes
-        self.pad_values = [
-            (self.image_hashes) % vocab_size,
-            (self.image_hashes >> 16) % vocab_size,
-            (self.image_hashes >> 32) % vocab_size,
-            (self.image_hashes >> 64) % vocab_size,
-        ]
+        # Use image hash as fake token_ids. We use this as the key for prefix matching in the radix cache.
+        # Please note that if the `input_ids` is later used in the model forward,
+        # you also need to clamp the values within the range of [0, vocab_size) to avoid out-of-bound
+        # errors in cuda kernels. See also llava.py for example.
+        self.image_hashes += other.image_hashes
+        self.pad_values = [x % (1 << 30) for x in self.image_hashes]
         optional_args = [
             "image_sizes",
@@ -231,6 +228,7 @@ class Req:
         self.tokenizer = None
         self.finished_reason = None
         self.stream = False
+        self.to_abort = False
         # For incremental decoding
         # ----- | --------- read_ids -------|
@@ -290,11 +288,11 @@ class Req:
         # The number of cached tokens, that were already cached in the KV cache
         self.cached_tokens = 0
-    def extend_image_inputs(self, image_inputs, vocab_size):
+    def extend_image_inputs(self, image_inputs):
         if self.image_inputs is None:
             self.image_inputs = image_inputs
         else:
-            self.image_inputs.merge(image_inputs, vocab_size)
+            self.image_inputs.merge(image_inputs)
     # whether request reached finished condition
     def finished(self) -> bool:
@@ -368,6 +366,10 @@ class Req:
         if self.finished():
             return
+        if self.to_abort:
+            self.finished_reason = FINISH_ABORT()
+            return
         if len(self.output_ids) >= self.sampling_params.max_new_tokens:
             self.finished_reason = FINISH_LENGTH(
                 length=self.sampling_params.max_new_tokens

{sglang-0.3.6.post2 → sglang-0.3.6.post3}/sglang/srt/managers/scheduler.py RENAMED Viewed

@@ -15,6 +15,7 @@
 import logging
 import os
+import signal
 import threading
 import time
 import warnings
@@ -23,6 +24,7 @@ from concurrent import futures
 from types import SimpleNamespace
 from typing import List, Optional
+import psutil
 import torch
 import zmq
@@ -36,8 +38,6 @@ from sglang.srt.managers.io_struct import (
     BatchTokenIDOut,
     CloseSessionReqInput,
     FlushCacheReq,
-    GetMemPoolSizeReq,
-    GetMemPoolSizeReqOutput,
     OpenSessionReqInput,
     OpenSessionReqOutput,
     ProfileReq,
@@ -73,7 +73,6 @@ from sglang.srt.utils import (
     crash_on_warnings,
     get_bool_env_var,
     get_zmq_socket,
-    kill_parent_process,
     set_gpu_proc_affinity,
     set_random_seed,
     suppress_other_loggers,
@@ -170,6 +169,10 @@ class Scheduler:
             self.enable_overlap = False
             logger.info("Overlap scheduler is disabled for embedding models.")
+        if self.model_config.is_multimodal:
+            self.enable_overlap = False
+            logger.info("Overlap scheduler is disabled for multimodal models.")
         if self.enable_overlap:
             self.disable_jump_forward = True
@@ -312,6 +315,7 @@ class Scheduler:
         self.watchdog_timeout = server_args.watchdog_timeout
         t = threading.Thread(target=self.watchdog_thread, daemon=True)
         t.start()
+        self.parent_process = psutil.Process().parent()
         # Init profiler
         if os.getenv("SGLANG_TORCH_PROFILER_DIR", "") == "":
@@ -355,7 +359,7 @@ class Scheduler:
                     self.watchdog_last_time = time.time()
             time.sleep(self.watchdog_timeout / 2)
-        kill_parent_process()
+        self.parent_process.send_signal(signal.SIGQUIT)
     @torch.no_grad()
     def event_loop_normal(self):
@@ -515,10 +519,6 @@ class Scheduler:
                 self.send_to_tokenizer.send_pyobj(OpenSessionReqOutput(session_id))
             elif isinstance(recv_req, CloseSessionReqInput):
                 self.close_session(recv_req)
-            elif isinstance(recv_req, GetMemPoolSizeReq):
-                self.send_to_tokenizer.send_pyobj(
-                    GetMemPoolSizeReqOutput(self.max_total_num_tokens)
-                )
             else:
                 raise ValueError(f"Invalid request: {recv_req}")
@@ -526,8 +526,9 @@ class Scheduler:
         self,
         recv_req: TokenizedGenerateReqInput,
     ):
+        # Create a new request
         if recv_req.session_id is None or recv_req.session_id not in self.sessions:
-            # Create a new request
             if recv_req.input_embeds is not None:
                 # Generate fake input_ids based on the length of input_embeds
                 seq_length = len(recv_req.input_embeds)
@@ -558,25 +559,30 @@ class Scheduler:
                 self.waiting_queue.append(req)
                 return
-        # Image inputs
+        # Handle image inputs
         if recv_req.image_inputs is not None:
-            image_inputs = ImageInputs.from_dict(
-                recv_req.image_inputs, self.model_config.vocab_size
-            )
+            image_inputs = ImageInputs.from_dict(recv_req.image_inputs)
+            # Expand a single image token into multiple dummy tokens for receiving image embeddings
             req.origin_input_ids = self.pad_input_ids_func(
                 req.origin_input_ids, image_inputs
             )
-            req.extend_image_inputs(image_inputs, self.model_config.vocab_size)
+            req.extend_image_inputs(image_inputs)
-            if len(req.origin_input_ids) > self.max_req_input_len:
-                req.finished_reason = FINISH_ABORT(
-                    "Image request length is longer than the KV cache pool size or "
-                    "the max context length aborting because you cannot truncate the image embeds"
+            if len(req.origin_input_ids) >= self.max_req_input_len:
+                logger.error(
+                    "Multimodal prompt is too long after expanding multimodal tokens. "
+                    f"After expanding {len(req.origin_input_ids_unpadded)=} => {len(req.origin_input_ids)} >= {self.max_req_input_len}. "
                 )
+                req.origin_input_ids = [0]
+                req.image_inputs = None
                 req.sampling_params.max_new_tokens = 0
+                req.finished_reason = FINISH_ABORT(
+                    "Multimodal prompt is too long. Check server logs for details."
+                )
                 self.waiting_queue.append(req)
                 return
+        # Copy more attributes
         req.return_logprob = recv_req.return_logprob
         req.top_logprobs_num = recv_req.top_logprobs_num
         req.stream = recv_req.stream
@@ -1344,13 +1350,15 @@ class Scheduler:
         if to_del is not None:
             del self.waiting_queue[to_del]
+            logger.debug(f"Abort queued request. {req.rid=}")
+            return
         # Delete requests in the running batch
         if self.running_batch:
             for req in self.running_batch.reqs:
                 if req.rid == recv_req.rid and not req.finished():
-                    req.finished_reason = FINISH_ABORT()
-                    self.tree_cache.cache_finished_req(req)
+                    logger.debug(f"Abort running request. {req.rid=}")
+                    req.to_abort = True
                     break
     def update_weights(self, recv_req: UpdateWeightReqInput):
@@ -1409,9 +1417,9 @@ def run_scheduler_process(
     if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
         set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, gpu_id)
-    # [For Router] if env var "DP_RANK" exist, set dp_rank to the value of the env var
-    if dp_rank is None and "DP_RANK" in os.environ:
-        dp_rank = int(os.environ["DP_RANK"])
+    # [For Router] if env var "SGLANG_DP_RANK" exist, set dp_rank to the value of the env var
+    if dp_rank is None and "SGLANG_DP_RANK" in os.environ:
+        dp_rank = int(os.environ["SGLANG_DP_RANK"])
     if dp_rank is None:
         configure_logger(server_args, prefix=f" TP{tp_rank}")
@@ -1419,6 +1427,7 @@ def run_scheduler_process(
         configure_logger(server_args, prefix=f" DP{dp_rank} TP{tp_rank}")
     suppress_other_loggers()
+    parent_process = psutil.Process().parent()
     try:
         scheduler = Scheduler(server_args, port_args, gpu_id, tp_rank, dp_rank)
@@ -1430,6 +1439,6 @@ def run_scheduler_process(
         else:
             scheduler.event_loop_normal()
     except Exception:
-        msg = get_exception_traceback()
-        logger.error(msg)
-        kill_parent_process()
+        traceback = get_exception_traceback()
+        logger.error(f"Scheduler hit an exception: {traceback}")
+        parent_process.send_signal(signal.SIGQUIT)

{sglang-0.3.6.post2 → sglang-0.3.6.post3}/sglang/srt/managers/session_controller.py RENAMED Viewed

@@ -10,10 +10,7 @@
 # limitations under the License.
 # ==============================================================================
-import copy
 import uuid
-from dataclasses import dataclass
-from typing import Optional
 from sglang.srt.managers.io_struct import TokenizedGenerateReqInput
 from sglang.srt.managers.schedule_batch import FINISH_ABORT, List, Req

sglang 0.3.6.post2.tar.gz → 0.3.6.post3.tar.gz

sglang 0.3.6.post2.tar.gz → 0.3.6.post3.tar.gz