sglang 0.3.6.post1__py3-none-any.whl → 0.3.6.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +55 -2
- sglang/bench_one_batch.py +4 -8
- sglang/bench_one_batch_server.py +6 -5
- sglang/check_env.py +7 -1
- sglang/lang/tracer.py +1 -1
- sglang/launch_server.py +2 -4
- sglang/srt/configs/model_config.py +2 -6
- sglang/srt/layers/attention/flashinfer_backend.py +3 -3
- sglang/srt/layers/sampler.py +1 -1
- sglang/srt/managers/data_parallel_controller.py +7 -11
- sglang/srt/managers/detokenizer_manager.py +7 -6
- sglang/srt/managers/image_processor.py +7 -10
- sglang/srt/managers/io_struct.py +0 -10
- sglang/srt/managers/schedule_batch.py +51 -13
- sglang/srt/managers/scheduler.py +41 -29
- sglang/srt/managers/session_controller.py +15 -7
- sglang/srt/managers/tokenizer_manager.py +4 -33
- sglang/srt/managers/tp_worker_overlap_thread.py +11 -2
- sglang/srt/models/grok.py +11 -48
- sglang/srt/models/llava.py +16 -9
- sglang/srt/models/olmo2.py +392 -0
- sglang/srt/models/qwen2_vl.py +10 -3
- sglang/srt/openai_api/adapter.py +1 -1
- sglang/srt/server.py +48 -45
- sglang/srt/server_args.py +1 -1
- sglang/srt/utils.py +22 -24
- sglang/test/test_utils.py +21 -8
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.3.6.post1.dist-info → sglang-0.3.6.post3.dist-info}/METADATA +4 -2
- {sglang-0.3.6.post1.dist-info → sglang-0.3.6.post3.dist-info}/RECORD +34 -36
- sglang/srt/layers/fused_moe_grok/__init__.py +0 -1
- sglang/srt/layers/fused_moe_grok/fused_moe.py +0 -692
- sglang/srt/layers/fused_moe_grok/layer.py +0 -630
- {sglang-0.3.6.post1.dist-info → sglang-0.3.6.post3.dist-info}/LICENSE +0 -0
- {sglang-0.3.6.post1.dist-info → sglang-0.3.6.post3.dist-info}/WHEEL +0 -0
- {sglang-0.3.6.post1.dist-info → sglang-0.3.6.post3.dist-info}/top_level.txt +0 -0
sglang/bench_offline_throughput.py
CHANGED
@@ -14,20 +14,20 @@ import argparse
 import dataclasses
 import json
 import logging
+import os
 import random
 import time
 from typing import Dict, List, Optional, Tuple

 import numpy as np

-from sglang.api import Engine
 from sglang.bench_serving import (
     get_dataset,
     get_tokenizer,
     sample_random_requests,
     set_ulimit,
 )
-from sglang.srt.server import Runtime
+from sglang.srt.server import Engine, Runtime
 from sglang.srt.server_args import ServerArgs


@@ -52,6 +52,7 @@ class BenchArgs:
     seed: int = 1
     skip_warmup: bool = False
     do_not_exit: bool = False
+    profile: bool = False

     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -156,6 +157,12 @@ class BenchArgs:
             action="store_true",
             help="Do not exit the program. This is useful for nsys profile with --duration and --delay.",
         )
+        parser.add_argument(
+            "--profile",
+            action="store_true",
+            help="Use Torch Profiler. The endpoint must be launched with "
+            "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
+        )

     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -169,6 +176,7 @@ def throughput_test_once(
     reqs: List[Tuple[str, int, int]],
     ignore_eos: bool,
     extra_request_body: Dict,
+    profile: bool,
 ):
     measurement_results = {
         "backend": backend_name,
@@ -194,7 +202,15 @@ def throughput_test_once(
     ]

     st = time.perf_counter()
+    if profile:
+        backend.start_profile()
+
     gen_out = backend.generate(prompt=prompt, sampling_params=sampling_params)
+
+    if profile:
+        backend.stop_profile()
+        monitor_trace_file(os.getenv("SGLANG_TORCH_PROFILER_DIR"))
+
     latency = time.perf_counter() - st

     if backend_name == "runtime":
@@ -221,6 +237,41 @@ def throughput_test_once(
     return measurement_results


+def monitor_trace_file(directory, interval=1):
+
+    print(f"Monitoring {directory} for new trace files...")
+
+    known_files = set(os.listdir(directory))
+
+    while True:
+        flag = False
+        time.sleep(interval)
+        current_files = set(os.listdir(directory))
+
+        new_files = current_files - known_files
+        for new_file in new_files:
+            new_file_path = os.path.join(directory, new_file)
+            print(f"New file detected: {new_file}")
+
+            previous_size = 0
+            while True:
+                try:
+                    current_size = os.path.getsize(new_file_path)
+                except FileNotFoundError:
+                    print(f"File {new_file} is no longer accessible.")
+                    break
+
+                if current_size > previous_size:
+                    previous_size = current_size
+                else:
+                    flag = True
+                    break
+
+                time.sleep(interval)
+        if flag:
+            break
+
+
 def throughput_test(
     server_args: ServerArgs,
     bench_args: BenchArgs,
@@ -268,6 +319,7 @@ def throughput_test(
         reqs=warmup_requests,
         ignore_eos=not bench_args.disable_ignore_eos,
         extra_request_body=extra_request_body,
+        profile=False,
     )

     logging.info("\nBenchmark...")
@@ -277,6 +329,7 @@ def throughput_test(
         reqs=input_requests,
         ignore_eos=not bench_args.disable_ignore_eos,
         extra_request_body=extra_request_body,
+        profile=bench_args.profile,
     )

     if bench_args.result_filename:
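Note: with --profile, throughput_test_once brackets the generate call with backend.start_profile()/backend.stop_profile() and then calls monitor_trace_file, which polls SGLANG_TORCH_PROFILER_DIR until a newly written trace stops growing. A standalone sketch of that size-stabilization idiom (illustrative only, not the sglang code itself):

    import os
    import time

    def wait_until_stable(path, interval=1):
        # Poll the file size until two consecutive reads match, i.e. the writer
        # has most likely finished flushing the trace to disk.
        previous_size = -1
        while True:
            try:
                current_size = os.path.getsize(path)
            except FileNotFoundError:
                return False  # the file disappeared before it stabilized
            if current_size == previous_size:
                return True
            previous_size = current_size
            time.sleep(interval)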
sglang/bench_one_batch.py
CHANGED
@@ -47,6 +47,7 @@ import itertools
 import json
 import logging
 import multiprocessing
+import os
 import time
 from typing import Tuple

@@ -62,11 +63,7 @@ from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server import _set_envs_and_config
 from sglang.srt.server_args import PortArgs, ServerArgs
-from sglang.srt.utils import (
-    configure_logger,
-    kill_child_process,
-    suppress_other_loggers,
-)
+from sglang.srt.utils import configure_logger, kill_process_tree, suppress_other_loggers


 @dataclasses.dataclass
@@ -466,7 +463,6 @@ if __name__ == "__main__":

     try:
         main(server_args, bench_args)
-    except Exception as e:
-        raise e
     finally:
-        kill_child_process()
+        if server_args.tp_size != 1:
+            kill_process_tree(os.getpid(), include_parent=False)
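Note: kill_child_process was reworked into kill_process_tree across these entry points, with the pid and an include_parent flag now passed explicitly. A rough psutil-based sketch of what such a helper can look like (an illustrative assumption; sglang's actual implementation may differ):

    import psutil

    def kill_process_tree(parent_pid, include_parent=True):
        # Terminate every descendant of parent_pid, optionally the parent too.
        try:
            parent = psutil.Process(parent_pid)
        except psutil.NoSuchProcess:
            return
        for child in parent.children(recursive=True):
            try:
                child.kill()
            except psutil.NoSuchProcess:
                pass
        if include_parent:
            try:
                parent.kill()
            except psutil.NoSuchProcess:
                pass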
sglang/bench_one_batch_server.py
CHANGED
@@ -5,9 +5,9 @@ This script launches a server and uses the HTTP interface.
 It accepts server arguments (the same as launch_server.py) and benchmark arguments (e.g., batch size, input lengths).

 Usage:
-python3 -m sglang.bench_server_latency --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
+python3 -m sglang.bench_one_batch_server --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8

-python3 -m sglang.bench_server_latency --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
+python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
 """

 import argparse
@@ -15,6 +15,7 @@ import dataclasses
 import itertools
 import json
 import multiprocessing
+import os
 import time
 from typing import Tuple

@@ -23,7 +24,7 @@ import requests

 from sglang.srt.server import launch_server
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import kill_child_process
+from sglang.srt.utils import kill_process_tree


 @dataclasses.dataclass
@@ -69,7 +70,7 @@ def launch_server_internal(server_args):
     except Exception as e:
         raise e
     finally:
-        kill_child_process()
+        kill_process_tree(os.getpid(), include_parent=False)


 def launch_server_process(server_args: ServerArgs):
@@ -175,7 +176,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
         )
     finally:
         if proc:
-            kill_child_process(proc.pid, include_self=True)
+            kill_process_tree(proc.pid)

     print(f"\nResults are saved to {bench_args.result_filename}")
sglang/check_env.py
CHANGED
@@ -22,18 +22,24 @@ PACKAGE_LIST = [
     "hf_transfer",
     "huggingface_hub",
     "interegular",
+    "modelscope",
+    "orjson",
+    "outlines",
+    "packaging",
     "psutil",
     "pydantic",
     "multipart",
     "zmq",
+    "torchao",
     "uvicorn",
     "uvloop",
     "vllm",
-    "outlines",
+    "xgrammar",
     "openai",
     "tiktoken",
     "anthropic",
     "litellm",
+    "decord",
 ]

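Note: the list above is what check_env.py probes when it reports the local environment. A minimal, hypothetical way to produce a similar version report with only the standard library (the package names are just examples taken from the list):

    from importlib.metadata import PackageNotFoundError, version

    for pkg in ("torchao", "xgrammar", "decord"):
        try:
            print(f"{pkg}: {version(pkg)}")
        except PackageNotFoundError:
            print(f"{pkg}: not installed")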
sglang/lang/tracer.py
CHANGED
sglang/launch_server.py
CHANGED
@@ -5,14 +5,12 @@ import sys

 from sglang.srt.server import launch_server
 from sglang.srt.server_args import prepare_server_args
-from sglang.srt.utils import kill_child_process
+from sglang.srt.utils import kill_process_tree

 if __name__ == "__main__":
     server_args = prepare_server_args(sys.argv[1:])

     try:
         launch_server(server_args)
-    except Exception as e:
-        raise e
     finally:
-        kill_child_process()
+        kill_process_tree(os.getpid(), include_parent=False)
sglang/srt/configs/model_config.py
CHANGED
@@ -14,13 +14,13 @@

 import json
 import logging
-import os
 from enum import IntEnum, auto
 from typing import List, Optional

 from transformers import PretrainedConfig

 from sglang.srt.hf_transformers_utils import get_config, get_context_length
+from sglang.srt.utils import get_bool_env_var

 logger = logging.getLogger(__name__)

@@ -59,13 +59,9 @@ class ModelConfig:

         # Derive context length
         derived_context_len = get_context_length(self.hf_text_config)
-        allow_long_context = os.environ.get(
-            "SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN", None
-        )
-
         if context_length is not None:
             if context_length > derived_context_len:
-                if allow_long_context:
+                if get_bool_env_var("SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN"):
                     logger.warning(
                         f"Warning: User-specified context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
                         f"This may lead to incorrect model outputs or CUDA errors."
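Note: this file and flashinfer_backend.py below replace ad-hoc os.environ parsing with a shared get_bool_env_var helper. A plausible shape for such a helper, shown only as an assumption about its semantics rather than sglang's exact code:

    import os

    def get_bool_env_var(name: str, default: str = "false") -> bool:
        # Treat "true"/"1" (case-insensitive) as enabled; anything else is disabled.
        return os.getenv(name, default).lower() in ("true", "1")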
sglang/srt/layers/attention/flashinfer_backend.py
CHANGED
@@ -18,7 +18,7 @@ import triton.language as tl
 from sglang.global_config import global_config
 from sglang.srt.layers.attention import AttentionBackend
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
-from sglang.srt.utils import is_flashinfer_available
+from sglang.srt.utils import get_bool_env_var, is_flashinfer_available

 if TYPE_CHECKING:
     from sglang.srt.layers.radix_attention import RadixAttention
@@ -47,8 +47,8 @@ class FlashInferAttnBackend(AttentionBackend):

         # Parse constants
         if "SGLANG_FLASHINFER_USE_TENSOR_CORE" in os.environ:
-            self.decode_use_tensor_cores = (
-                os.environ["SGLANG_FLASHINFER_USE_TENSOR_CORE"].lower() == "true"
+            self.decode_use_tensor_cores = get_bool_env_var(
+                "SGLANG_FLASHINFER_USE_TENSOR_CORE"
             )
         else:
             if not _grouped_size_compiled_for_decode_kernels(
sglang/srt/layers/sampler.py
CHANGED
@@ -74,7 +74,7 @@ class Sampler(nn.Module):
                 filter_apply_order="joint",
             )

-            if not torch.all(success):
+            if self.use_nan_detectioin and not torch.all(success):
                 logger.warning("Detected errors during sampling!")
                 batch_next_token_ids = torch.zeros_like(batch_next_token_ids)
         elif global_server_args_dict["sampling_backend"] == "pytorch":
sglang/srt/managers/data_parallel_controller.py
CHANGED
@@ -15,9 +15,11 @@

 import logging
 import multiprocessing as mp
+import signal
 import threading
 from enum import Enum, auto

+import psutil
 import zmq

 from sglang.srt.managers.io_struct import (
@@ -26,13 +28,7 @@ from sglang.srt.managers.io_struct import (
 )
 from sglang.srt.managers.scheduler import run_scheduler_process
 from sglang.srt.server_args import PortArgs, ServerArgs
-from sglang.srt.utils import (
-    bind_port,
-    configure_logger,
-    get_zmq_socket,
-    kill_parent_process,
-    suppress_other_loggers,
-)
+from sglang.srt.utils import bind_port, configure_logger, get_zmq_socket
 from sglang.utils import get_exception_traceback

 logger = logging.getLogger(__name__)
@@ -235,7 +231,7 @@ def run_data_parallel_controller_process(
     pipe_writer,
 ):
     configure_logger(server_args)
-    suppress_other_loggers()
+    parent_process = psutil.Process().parent()

     try:
         controller = DataParallelController(server_args, port_args)
@@ -244,6 +240,6 @@ def run_data_parallel_controller_process(
         )
         controller.event_loop()
     except Exception:
-        msg = get_exception_traceback()
-        logger.error(msg)
-        kill_parent_process()
+        traceback = get_exception_traceback()
+        logger.error(f"DataParallelController hit an exception: {traceback}")
+        parent_process.send_signal(signal.SIGQUIT)
sglang/srt/managers/detokenizer_manager.py
CHANGED
@@ -15,9 +15,11 @@

 import dataclasses
 import logging
+import signal
 from collections import OrderedDict
 from typing import List, Union

+import psutil
 import zmq

 from sglang.srt.hf_transformers_utils import get_tokenizer
@@ -25,12 +27,10 @@ from sglang.srt.managers.io_struct import (
     BatchEmbeddingOut,
     BatchStrOut,
     BatchTokenIDOut,
-    GetMemPoolSizeReqOutput,
-    UpdateWeightReqOutput,
 )
 from sglang.srt.managers.schedule_batch import FINISH_MATCHED_STR, FINISH_MATCHED_TOKEN
 from sglang.srt.server_args import PortArgs, ServerArgs
-from sglang.srt.utils import configure_logger, get_zmq_socket, kill_parent_process
+from sglang.srt.utils import configure_logger, get_zmq_socket
 from sglang.utils import find_printable_text, get_exception_traceback

 logger = logging.getLogger(__name__)
@@ -195,11 +195,12 @@ def run_detokenizer_process(
     port_args: PortArgs,
 ):
     configure_logger(server_args)
+    parent_process = psutil.Process().parent()

     try:
         manager = DetokenizerManager(server_args, port_args)
         manager.event_loop()
     except Exception:
-        msg = get_exception_traceback()
-        logger.error(msg)
-        kill_parent_process()
+        traceback = get_exception_traceback()
+        logger.error(f"DetokenizerManager hit an exception: {traceback}")
+        parent_process.send_signal(signal.SIGQUIT)
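Note: run_data_parallel_controller_process and run_detokenizer_process now share the same crash-reporting pattern: resolve the parent process once via psutil at startup, then notify it with SIGQUIT on a fatal error instead of killing it directly. The shared pattern as a standalone sketch (the run_worker wrapper is hypothetical):

    import signal

    import psutil

    def run_worker(event_loop):
        parent_process = psutil.Process().parent()
        try:
            event_loop()
        except Exception:
            # Leave the shutdown decision to the parent process.
            parent_process.send_signal(signal.SIGQUIT)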
sglang/srt/managers/image_processor.py
CHANGED
@@ -131,6 +131,7 @@ class LlavaImageProcessor(BaseImageProcessor):
         if not image_data:
             return None

+        modalities = request_obj.modalities or ["image"]
         aspect_ratio = getattr(self.hf_config, "image_aspect_ratio", None)
         grid_pinpoints = (
             self.hf_config.image_grid_pinpoints
@@ -139,9 +140,12 @@ class LlavaImageProcessor(BaseImageProcessor):
             else None
         )

+        if isinstance(image_data, str):
+            image_data = [image_data]
+
         if isinstance(image_data, list) and len(image_data) > 0:
-            # Multiple images
-            if len(image_data) > 1:
+            if "multi-images" in modalities or "video" in modalities:
+                # Multiple images
                 aspect_ratio = "pad"  # LLaVA OneVision Handling: more than one image --> interleaved image mode or video mode. We do not use anyres
                 pixel_values, image_hashes, image_sizes = [], [], []
                 res = []
@@ -166,13 +170,6 @@ class LlavaImageProcessor(BaseImageProcessor):
                 )
                 image_hashes = [image_hash]
                 image_sizes = [image_size]
-        elif isinstance(image_data, str):
-            # A single image
-            pixel_values, image_hash, image_size = await self._process_single_image(
-                image_data, aspect_ratio, grid_pinpoints
-            )
-            image_hashes = [image_hash]
-            image_sizes = [image_size]
         else:
             raise ValueError(f"Invalid image data: {image_data}")

@@ -341,7 +338,7 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
             "pixel_values": pixel_values,
             "image_hashes": image_hashes,
             "image_sizes": image_sizes,
-            "modalities": request_obj.modalities,
+            "modalities": request_obj.modalities or ["image"],
             "image_grid_thws": image_grid_thws,
         }
sglang/srt/managers/io_struct.py
CHANGED
@@ -376,16 +376,6 @@ class ProfileReq(Enum):
     STOP_PROFILE = 2


-@dataclass
-class GetMemPoolSizeReq:
-    pass
-
-
-@dataclass
-class GetMemPoolSizeReqOutput:
-    size: int
-
-
 @dataclass
 class OpenSessionReqInput:
     capacity_of_str_len: int
sglang/srt/managers/schedule_batch.py
CHANGED
@@ -31,6 +31,7 @@ import dataclasses
 import logging
 from typing import List, Optional, Tuple, Union

+import numpy as np
 import torch
 import triton
 import triton.language as tl
@@ -123,7 +124,7 @@ class FINISH_ABORT(BaseFinishReason):
 class ImageInputs:
     """The image related inputs."""

-    pixel_values: torch.Tensor
+    pixel_values: Union[torch.Tensor, np.array]
     image_hashes: Optional[list] = None
     image_sizes: Optional[list] = None
     image_offsets: Optional[list] = None
@@ -131,7 +132,7 @@ class ImageInputs:
     modalities: Optional[list] = None
     num_image_tokens: Optional[int] = None

-    image_embeds: Optional[List[torch.Tensor]] = None
+    # Llava related
     aspect_ratio_ids: Optional[List[torch.Tensor]] = None
     aspect_ratio_mask: Optional[List[torch.Tensor]] = None

@@ -140,19 +141,17 @@ class ImageInputs:
     mrope_position_delta: Optional[torch.Tensor] = None

     @staticmethod
-    def from_dict(obj, vocab_size):
-        # Use image hash as fake token_ids, which is then used for prefix matching
+    def from_dict(obj: dict):
         ret = ImageInputs(
             pixel_values=obj["pixel_values"],
-            image_hashes=hash(tuple(obj["image_hashes"])),
+            image_hashes=obj["image_hashes"],
         )
-        image_hash = ret.image_hashes
-        ret.pad_values = [
-            (image_hash) % vocab_size,
-            (image_hash >> 16) % vocab_size,
-            (image_hash >> 32) % vocab_size,
-            (image_hash >> 64) % vocab_size,
-        ]
+
+        # Use image hash as fake token_ids. We use this as the key for prefix matching in the radix cache.
+        # Please note that if the `input_ids` is later used in the model forward,
+        # you also need to clamp the values within the range of [0, vocab_size) to avoid out-of-bound
+        # errors in cuda kernels. See also llava.py for example.
+        ret.pad_values = [x % (1 << 30) for x in ret.image_hashes]

         optional_args = [
             "image_sizes",
@@ -167,6 +166,29 @@ class ImageInputs:

         return ret

+    def merge(self, other):
+        assert self.pixel_values.shape[1:] == other.pixel_values.shape[1:]
+        self.pixel_values = np.concatenate([self.pixel_values, other.pixel_values])
+
+        # Use image hash as fake token_ids. We use this as the key for prefix matching in the radix cache.
+        # Please note that if the `input_ids` is later used in the model forward,
+        # you also need to clamp the values within the range of [0, vocab_size) to avoid out-of-bound
+        # errors in cuda kernels. See also llava.py for example.
+        self.image_hashes += other.image_hashes
+        self.pad_values = [x % (1 << 30) for x in self.image_hashes]
+
+        optional_args = [
+            "image_sizes",
+            "image_offsets",
+            # "modalities", # modalities should be ["multi-images"] (one entry) even for multiple images
+            "aspect_ratio_ids",
+            "aspect_ratio_mask",
+            "image_grid_thws",
+        ]
+        for arg in optional_args:
+            if getattr(self, arg, None) is not None:
+                setattr(self, arg, getattr(self, arg) + getattr(other, arg))
+

 class Req:
     """The input and output status of a request."""
@@ -177,6 +199,7 @@ class Req:
         origin_input_text: str,
         origin_input_ids: Tuple[int],
         sampling_params: SamplingParams,
+        origin_input_ids_unpadded: Optional[Tuple[int]] = None,
         lora_path: Optional[str] = None,
         input_embeds: Optional[List[List[float]]] = None,
         session_id: Optional[str] = None,
@@ -184,7 +207,11 @@ class Req:
         # Input and output info
         self.rid = rid
         self.origin_input_text = origin_input_text
-        self.origin_input_ids_unpadded = origin_input_ids  # Before image padding
+        self.origin_input_ids_unpadded = (
+            origin_input_ids_unpadded
+            if origin_input_ids_unpadded
+            else origin_input_ids  # Before image padding
+        )
         self.origin_input_ids = origin_input_ids
         self.output_ids = []  # Each decode stage's output ids
         self.fill_ids = None  # fill_ids = origin_input_ids + output_ids
@@ -201,6 +228,7 @@ class Req:
         self.tokenizer = None
         self.finished_reason = None
         self.stream = False
+        self.to_abort = False

         # For incremental decoding
         # ----- | --------- read_ids -------|
@@ -260,6 +288,12 @@ class Req:
         # The number of cached tokens, that were already cached in the KV cache
         self.cached_tokens = 0

+    def extend_image_inputs(self, image_inputs):
+        if self.image_inputs is None:
+            self.image_inputs = image_inputs
+        else:
+            self.image_inputs.merge(image_inputs)
+
     # whether request reached finished condition
     def finished(self) -> bool:
         return self.finished_reason is not None
@@ -332,6 +366,10 @@ class Req:
         if self.finished():
             return

+        if self.to_abort:
+            self.finished_reason = FINISH_ABORT()
+            return
+
         if len(self.output_ids) >= self.sampling_params.max_new_tokens:
             self.finished_reason = FINISH_LENGTH(
                 length=self.sampling_params.max_new_tokens