sglang 0.2.10.tar.gz → 0.2.11.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.2.10/sglang.egg-info → sglang-0.2.11}/PKG-INFO +34 -24
- {sglang-0.2.10 → sglang-0.2.11}/README.md +25 -21
- {sglang-0.2.10 → sglang-0.2.11}/pyproject.toml +5 -3
- {sglang-0.2.10 → sglang-0.2.11}/sglang/__init__.py +8 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/api.py +10 -2
- {sglang-0.2.10 → sglang-0.2.11}/sglang/bench_latency.py +145 -36
- {sglang-0.2.10 → sglang-0.2.11}/sglang/check_env.py +24 -2
- {sglang-0.2.10 → sglang-0.2.11}/sglang/global_config.py +0 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/backend/base_backend.py +3 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/backend/openai.py +8 -3
- {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/backend/runtime_endpoint.py +46 -29
- sglang-0.2.11/sglang/lang/choices.py +164 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/interpreter.py +6 -13
- {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/ir.py +11 -2
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/layers/logits_processor.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/layers/radix_attention.py +2 -5
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/managers/schedule_batch.py +95 -324
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/managers/tokenizer_manager.py +6 -3
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/managers/tp_worker.py +20 -22
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/mem_cache/memory_pool.py +9 -14
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/model_executor/cuda_graph_runner.py +3 -3
- sglang-0.2.11/sglang/srt/model_executor/forward_batch_info.py +256 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/model_executor/model_runner.py +6 -10
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/chatglm.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/commandr.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/dbrx.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/deepseek.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/deepseek_v2.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/gemma.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/gemma2.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/gpt_bigcode.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/grok.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/internlm2.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/llama2.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/llama_classification.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/llava.py +1 -2
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/llavavid.py +1 -2
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/minicpm.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/mixtral.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/mixtral_quant.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/qwen.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/qwen2.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/qwen2_moe.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/stablelm.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/openai_api/adapter.py +34 -12
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/openai_api/protocol.py +6 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/server.py +24 -6
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/server_args.py +4 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/test/test_utils.py +1 -1
- sglang-0.2.11/sglang/version.py +1 -0
- {sglang-0.2.10 → sglang-0.2.11/sglang.egg-info}/PKG-INFO +34 -24
- {sglang-0.2.10 → sglang-0.2.11}/sglang.egg-info/SOURCES.txt +2 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang.egg-info/requires.txt +10 -2
- sglang-0.2.10/sglang/version.py +0 -1
- {sglang-0.2.10 → sglang-0.2.11}/LICENSE +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/setup.cfg +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/bench_serving.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/__init__.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/chat_template.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/compiler.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/tracer.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/launch_server.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/constrained/base_tool_cache.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/constrained/fsm_cache.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/constrained/jump_forward.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/conversation.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/hf_transformers_utils.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/layers/context_flashattention_nopad.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/layers/extend_attention.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/layers/fused_moe.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/layers/linear.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/layers/quantization/__init__.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/layers/quantization/fp8.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/layers/token_attention.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/managers/controller_multi.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/managers/controller_single.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/managers/detokenizer_manager.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/managers/io_struct.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/managers/policy_scheduler.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/mem_cache/base_cache.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/model_config.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/model_loader/model_loader.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/model_loader/utils.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/sampling_params.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/utils.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/test/run_eval.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/test/runners.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/test/test_programs.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/utils.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.2.10/sglang.egg-info → sglang-0.2.11}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.10
+Version: 0.2.11
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                            Version 2.0, January 2004
@@ -220,7 +220,6 @@ Requires-Dist: fastapi; extra == "srt"
 Requires-Dist: hf_transfer; extra == "srt"
 Requires-Dist: huggingface_hub; extra == "srt"
 Requires-Dist: interegular; extra == "srt"
-Requires-Dist: jsonlines; extra == "srt"
 Requires-Dist: packaging; extra == "srt"
 Requires-Dist: pillow; extra == "srt"
 Requires-Dist: psutil; extra == "srt"
@@ -230,7 +229,7 @@ Requires-Dist: torch; extra == "srt"
 Requires-Dist: uvicorn; extra == "srt"
 Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: zmq; extra == "srt"
-Requires-Dist: vllm==0.5.
+Requires-Dist: vllm==0.5.4; extra == "srt"
 Requires-Dist: outlines>=0.0.44; extra == "srt"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
@@ -239,11 +238,18 @@ Provides-Extra: anthropic
 Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
 Provides-Extra: litellm
 Requires-Dist: litellm>=1.0.0; extra == "litellm"
+Provides-Extra: test
+Requires-Dist: jsonlines; extra == "test"
+Requires-Dist: matplotlib; extra == "test"
+Requires-Dist: pandas; extra == "test"
 Provides-Extra: all
 Requires-Dist: sglang[srt]; extra == "all"
 Requires-Dist: sglang[openai]; extra == "all"
 Requires-Dist: sglang[anthropic]; extra == "all"
 Requires-Dist: sglang[litellm]; extra == "all"
+Provides-Extra: dev
+Requires-Dist: sglang[all]; extra == "dev"
+Requires-Dist: sglang[test]; extra == "dev"
 
 <div align="center">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
@@ -296,20 +302,20 @@ pip install --upgrade pip
 pip install "sglang[all]"
 
 # Install FlashInfer CUDA kernels
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```
 
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.2.
+git clone -b v0.2.11 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
 pip install -e "python[all]"
 
 # Install FlashInfer CUDA kernels
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```
 
 ### Method 3: Using docker
@@ -383,7 +389,7 @@ response = client.chat.completions.create(
 print(response)
 ```
 
-It supports streaming, vision, and most features of the Chat/Completions/Models endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
 
 ### Additional Server Arguments
 - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
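For context, the new Batch mention above corresponds to the `sglang/srt/openai_api/adapter.py` and `protocol.py` changes in the file list. Below is a minimal sketch of driving it through the official `openai` client; the base URL, model name, file name, and JSONL payload are illustrative assumptions that follow the OpenAI Batch reference, not something spelled out in this diff.

```python
# Hedged sketch: assumes a local SGLang server on port 30000 exposing
# OpenAI-compatible /v1/files and /v1/batches routes, per the note above.
import json

from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

# One chat-completion request per line, in the OpenAI batch input format.
with open("batch_input.jsonl", "w") as f:
    f.write(
        json.dumps(
            {
                "custom_id": "req-1",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": "meta-llama/Meta-Llama-3-8B-Instruct",
                    "messages": [{"role": "user", "content": "Say hello."}],
                },
            }
        )
        + "\n"
    )

# Upload the file, create the batch job, then poll its status.
uploaded = client.files.create(file=open("batch_input.jsonl", "rb"), purpose="batch")
batch = client.batches.create(
    input_file_id=uploaded.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
)
print(client.batches.retrieve(batch.id).status)
```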
@@ -394,10 +400,14 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
 ```
-- If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9
+- If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
 ```
+- If you see out-of-memory errors during prefill for long prompts on a model that supports long context, consider using chunked prefill.
+```
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --chunked-prefill-size 8192
+```
 - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
 - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```
@@ -411,22 +421,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
 
-### Run Llama 3.1 405B
-
-```bash
-## Run 405B (fp8) on a single node
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
-
-## Run 405B (fp16) on two nodes
-# replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
-
-# on the first node
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
-
-# on the second
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
-```
-
 ### Supported Models
 
 - Llama / Llama 2 / Llama 3 / Llama 3.1
@@ -452,6 +446,22 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
 
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).
 
+### Run Llama 3.1 405B
+
+```bash
+## Run 405B (fp8) on a single node
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
+
+## Run 405B (fp16) on two nodes
+# replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
+
+# on the first node
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+
+# on the second
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+```
+
 ### Benchmark Performance
 
 - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
{sglang-0.2.10 → sglang-0.2.11}/README.md

@@ -49,20 +49,20 @@ pip install --upgrade pip
 pip install "sglang[all]"
 
 # Install FlashInfer CUDA kernels
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```
 
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.2.
+git clone -b v0.2.11 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
 pip install -e "python[all]"
 
 # Install FlashInfer CUDA kernels
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```
 
 ### Method 3: Using docker
@@ -136,7 +136,7 @@ response = client.chat.completions.create(
 print(response)
 ```
 
-It supports streaming, vision, and most features of the Chat/Completions/Models endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
 
 ### Additional Server Arguments
 - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
@@ -147,10 +147,14 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
 ```
-- If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9
+- If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
 ```
+- If you see out-of-memory errors during prefill for long prompts on a model that supports long context, consider using chunked prefill.
+```
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --chunked-prefill-size 8192
+```
 - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
 - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```
@@ -164,22 +168,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
 
-### Run Llama 3.1 405B
-
-```bash
-## Run 405B (fp8) on a single node
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
-
-## Run 405B (fp16) on two nodes
-# replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
-
-# on the first node
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
-
-# on the second
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
-```
-
 ### Supported Models
 
 - Llama / Llama 2 / Llama 3 / Llama 3.1
@@ -205,6 +193,22 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
 
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).
 
+### Run Llama 3.1 405B
+
+```bash
+## Run 405B (fp8) on a single node
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
+
+## Run 405B (fp16) on two nodes
+# replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
+
+# on the first node
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+
+# on the second
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+```
+
 ### Benchmark Performance
 
 - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
{sglang-0.2.10 → sglang-0.2.11}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.2.10"
+version = "0.2.11"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -20,14 +20,16 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
-srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
+srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
        "packaging", "pillow", "psutil", "pydantic", "python-multipart",
        "torch", "uvicorn", "uvloop", "zmq",
-       "vllm==0.5.
+       "vllm==0.5.4", "outlines>=0.0.44"]
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
+test = ["jsonlines", "matplotlib", "pandas"]
 all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
+dev = ["sglang[all]", "sglang[test]"]
 
 [project.urls]
 "Homepage" = "https://github.com/sgl-project/sglang"
{sglang-0.2.10 → sglang-0.2.11}/sglang/__init__.py

@@ -22,6 +22,11 @@ from sglang.api import (
     user_end,
     video,
 )
+from sglang.lang.choices import (
+    greedy_token_selection,
+    token_length_normalized,
+    unconditional_likelihood_normalized,
+)
 
 # SGLang DSL APIs
 __all__ = [
@@ -45,6 +50,9 @@ __all__ = [
     "user_begin",
     "user_end",
     "video",
+    "greedy_token_selection",
+    "token_length_normalized",
+    "unconditional_likelihood_normalized",
 ]
 
 # Global Configurations
{sglang-0.2.10 → sglang-0.2.11}/sglang/api.py

@@ -6,6 +6,7 @@ from typing import Callable, List, Optional, Union
 
 from sglang.global_config import global_config
 from sglang.lang.backend.base_backend import BaseBackend
+from sglang.lang.choices import ChoicesSamplingMethod, token_length_normalized
 from sglang.lang.ir import (
     SglExpr,
     SglExprList,
@@ -73,12 +74,18 @@ def gen(
     return_text_in_logprobs: Optional[bool] = None,
     dtype: Optional[type] = None,
     choices: Optional[List[str]] = None,
+    choices_method: Optional[ChoicesSamplingMethod] = None,
     regex: Optional[str] = None,
 ):
     """Call the model to generate. See the meaning of the arguments in docs/en/sampling_params.md"""
 
     if choices:
-        return SglSelect(
+        return SglSelect(
+            name,
+            choices,
+            0.0 if temperature is None else temperature,
+            token_length_normalized if choices_method is None else choices_method,
+        )
 
     # check regex is valid
     if regex is not None:
@@ -186,9 +193,10 @@ def select(
     name: Optional[str] = None,
     choices: Optional[List[str]] = None,
     temperature: float = 0.0,
+    choices_method: ChoicesSamplingMethod = token_length_normalized,
 ):
     assert choices is not None
-    return SglSelect(name, choices, temperature)
+    return SglSelect(name, choices, temperature, choices_method)
 
 
 def _role_common(name: str, expr: Optional[SglExpr] = None):
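For context, the `choices_method` parameter added to `gen()` and `select()` above is backed by the new `sglang/lang/choices.py` module from the file list, which exports `greedy_token_selection`, `token_length_normalized`, and `unconditional_likelihood_normalized`. A minimal sketch of picking one from the frontend DSL follows; the endpoint URL and prompt are placeholders, and omitting `choices_method` keeps the previous token-length-normalized behavior, as the `gen()` change shows.

```python
# Hedged sketch: assumes an SGLang runtime is already serving at localhost:30000.
import sglang as sgl
from sglang import greedy_token_selection


@sgl.function
def yes_no(s, question):
    s += "Question: " + question + "\n"
    s += "Answer: " + sgl.select(
        "answer",
        choices=["Yes", "No", "Maybe"],
        # New in 0.2.11; leave it out to keep the default token_length_normalized scoring.
        choices_method=greedy_token_selection,
    )


sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
state = yes_no.run(question="Is the sky blue on a clear day?")
print(state["answer"])
```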
{sglang-0.2.10 → sglang-0.2.11}/sglang/bench_latency.py

@@ -1,13 +1,21 @@
 """
 Benchmark the latency of a given model. It accepts arguments similar to those of launch_server.py.
 
-# Usage (latency test)
+# Usage (latency test)
+## with dummy weights:
 python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --load-format dummy
+## sweep through multiple data points and store (append) the results in a jsonl file:
+python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --result-filename out.jsonl
+## do some changes, and store the results under a different run_name:
+python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --result-filename out.jsonl --run-name after
+## plot the results in series of lines:
+python -m sglang.bench_latency --result-filename out.jsonl --graph-sql="select run_name, batch_size, prefill_throughput from results"
+
 
 # Usage (correctness test):
 python -m sglang.bench_latency --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
 
-
+## Reference output (of the correctness test above, can be gpu dependent):
 prefill logits (first half) tensor([[-10.0312, -9.5000, 0.8936, ..., -4.9414, -3.2402, -3.3633],
         [-10.0312, -9.5000, 0.8936, ..., -4.9414, -3.2402, -3.3633],
         [ -9.1875, -10.2500, 2.7109, ..., -4.3359, -4.0664, -4.1328]],
@@ -28,19 +36,23 @@ I'm going to the park
 
 import argparse
 import dataclasses
+import itertools
 import logging
 import multiprocessing
+import os
+import sqlite3
 import time
 from typing import Tuple
 
-import jsonlines
 import numpy as np
+import pandas as pd
 import torch
 import torch.distributed as dist
 
 from sglang.srt.hf_transformers_utils import get_tokenizer
-from sglang.srt.managers.schedule_batch import
+from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
 from sglang.srt.model_config import ModelConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardMode
 from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.sampling_params import SamplingParams
 from sglang.srt.server_args import ServerArgs
@@ -49,26 +61,42 @@ from sglang.srt.utils import suppress_other_loggers
 
 @dataclasses.dataclass
 class BenchArgs:
+    run_name: str = "before"
     batch_size: Tuple[int] = (1,)
-    input_len: int = 1024
-    output_len: int = 4
+    input_len: Tuple[int] = (1024,)
+    output_len: Tuple[int] = (4,)
     result_filename: str = ""
     correctness_test: bool = False
     # This is only used for correctness test
     cut_len: int = 4
+    # Plotting args
+    graph_sql: str = (
+        "select run_name, batch_size, prefill_throughput from results where run_name='before'"
+    )
+    graph_filename: str = "out.png"
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
+        parser.add_argument("--run-name", type=str, default=BenchArgs.run_name)
         parser.add_argument(
             "--batch-size", type=int, nargs="+", default=BenchArgs.batch_size
         )
-        parser.add_argument(
-
+        parser.add_argument(
+            "--input-len", type=int, nargs="+", default=BenchArgs.input_len
+        )
+        parser.add_argument(
+            "--output-len", type=int, nargs="+", default=BenchArgs.output_len
+        )
         parser.add_argument(
             "--result-filename", type=str, default=BenchArgs.result_filename
         )
         parser.add_argument("--correctness-test", action="store_true")
         parser.add_argument("--cut-len", type=int, default=BenchArgs.cut_len)
+        # graphing
+        parser.add_argument("--graph-sql", type=str, default=BenchArgs.graph_sql)
+        parser.add_argument(
+            "--graph-filename", type=str, default=BenchArgs.graph_filename
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
|
|
161
189
|
|
162
190
|
|
163
191
|
def extend(reqs, model_runner):
|
164
|
-
batch =
|
192
|
+
batch = ScheduleBatch.init_new(
|
165
193
|
reqs=reqs,
|
166
194
|
req_to_token_pool=model_runner.req_to_token_pool,
|
167
195
|
token_to_kv_pool=model_runner.token_to_kv_pool,
|
@@ -222,15 +250,21 @@ def correctness_test(
|
|
222
250
|
|
223
251
|
@torch.inference_mode()
|
224
252
|
def latency_test_run_once(
|
225
|
-
model_runner, rank_print, reqs, batch_size, input_len, output_len
|
253
|
+
run_name, model_runner, rank_print, reqs, batch_size, input_len, output_len
|
226
254
|
):
|
255
|
+
max_batch_size = model_runner.max_total_num_tokens // (input_len + output_len)
|
256
|
+
if batch_size > max_batch_size:
|
257
|
+
rank_print(
|
258
|
+
f"skipping ({batch_size}, {input_len}, {output_len}) due to max batch size limit"
|
259
|
+
)
|
260
|
+
return
|
227
261
|
|
228
262
|
# Clear the pools.
|
229
263
|
model_runner.req_to_token_pool.clear()
|
230
264
|
model_runner.token_to_kv_pool.clear()
|
231
265
|
|
232
266
|
measurement_results = {
|
233
|
-
"run_name":
|
267
|
+
"run_name": run_name,
|
234
268
|
"batch_size": batch_size,
|
235
269
|
"input_len": input_len,
|
236
270
|
"output_len": output_len,
|
@@ -291,49 +325,119 @@ def latency_test(
 
     # Load the model
     model_runner, tokenizer = load_model(server_args, tp_rank)
-    rank_print(
-        f"max_batch_size={model_runner.max_total_num_tokens // (bench_args.input_len + bench_args.output_len)}"
-    )
 
-    #
-    bench_args.batch_size = bench_args.batch_size[0]
-
-    # Prepare inputs
+    # Prepare inputs for warm up
     reqs = prepare_synthetic_inputs_for_latency_test(
-        bench_args.batch_size, bench_args.input_len
+        bench_args.batch_size[0], bench_args.input_len[0]
     )
 
     # Warm up
     latency_test_run_once(
-
+        bench_args.run_name,
+        model_runner,
+        rank_print,
+        reqs,
+        bench_args.batch_size[0],
+        bench_args.input_len[0],
+        4,  # shorter decoding to speed up the warmup
     )
 
-    # Run
+    # Run the sweep
     result_list = []
-
-
-
-
-
-        bench_args.
-        bench_args.input_len,
-        bench_args.output_len,
+    for bs, il, ol in itertools.product(
+        bench_args.batch_size, bench_args.input_len, bench_args.output_len
+    ):
+        req = prepare_synthetic_inputs_for_latency_test(bs, il)
+        ret = latency_test_run_once(
+            bench_args.run_name, model_runner, rank_print, reqs, bs, il, ol
        )
-
+        if ret is not None:
+            result_list.append(ret)
+
+    # Write results in jsonlines format on rank 0.
+    if tp_rank == 0 and bench_args.result_filename:
+        import jsonlines
 
-    # Write results in jsonlines format.
-    if bench_args.result_filename:
         with jsonlines.open(bench_args.result_filename, "a") as f:
             f.write_all(result_list)
 
 
+def plot_latency_test(
+    server_args,
+    bench_args,
+    tp_rank,
+):
+    assert tp_rank == 0
+
+    # read the jsonl file and put in sqlite
+    df = pd.read_json(bench_args.result_filename, lines=True)
+    conn = sqlite3.connect(":memory:")
+    cur = conn.cursor()
+
+    # get the columns and their types
+    column_names = list(df.iloc[0].keys())
+    type_dict = {
+        str: "TEXT",
+        np.int64: "INTEGER",
+        np.float64: "FLOAT",
+    }
+    column_types = [type_dict[type(i)] for i in list(df.iloc[0])]
+
+    # create the table
+    cur.execute(
+        f"""
+        CREATE TABLE IF NOT EXISTS results (
+            {", ".join([f"{name} {type}" for name, type in zip(column_names, column_types)])}
+        )
+        """
+    )
+    conn.commit()
+
+    # write the results to DB
+    df.to_sql("results", conn, if_exists="replace", index=False)
+    conn.commit()
+
+    # read it back using sql
+    df = pd.read_sql_query(bench_args.graph_sql, conn)
+    conn.close()
+
+    # plot it and save to a file
+    import matplotlib.pyplot as plt
+
+    assert (
+        len(df.columns) == 3
+    ), f"The sql should have fetched <series, x, y> columns, not {df.columns}"
+    for label in df[df.columns[0]].unique():
+        q = f"{df.columns[0]}=='{label}'"
+        series = df.query(q)
+        plt.plot(series[df.columns[1]], series[df.columns[2]], label=q, marker="o")
+    plt.xlabel(df.columns[1])
+    plt.ylabel(df.columns[2])
+    plt.legend()
+    plt.savefig(bench_args.graph_filename, dpi=300)
+
+    # if in kitty, just dump it to the terminal
+    if os.environ["TERM"] == "xterm-kitty":
+        os.system(
+            f"kitty icat --use-window-size 1,1,600,600 {bench_args.graph_filename}"
+        )
+
+
 def main(server_args, bench_args):
-    print(bench_args)
 
-    if
-
+    if server_args.model_path:
+        if bench_args.correctness_test:
+            work_func = correctness_test
+        else:
+            work_func = latency_test
+    elif os.path.isfile(bench_args.result_filename):
+        assert bench_args.graph_filename, "please provide a filename for the graph"
+        work_func = plot_latency_test
     else:
-
+        raise ValueError(
+            "Provide --model-path for running the tests or "
+            "provide --result-filename for plotting the results"
+        )
 
     if server_args.tp_size == 1:
         work_func(server_args, bench_args, 0)
@@ -361,6 +465,11 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     ServerArgs.add_cli_args(parser)
     BenchArgs.add_cli_args(parser)
+    # For this script, model-path is not required
+    assert (
+        parser._actions[1].option_strings[0] == "--model-path"
+    ), "options changed, this code need to be updated"
+    parser._actions[1].required = False
     args = parser.parse_args()
 
     server_args = ServerArgs.from_cli_args(args)
{sglang-0.2.10 → sglang-0.2.11}/sglang/check_env.py

@@ -14,6 +14,7 @@ PACKAGE_LIST = [
     "sglang",
     "flashinfer",
     "triton",
+    "transformers",
     "requests",
     "tqdm",
     "numpy",
@@ -73,10 +74,26 @@ def _get_gpu_info():
     Get information about available GPUs.
     """
     devices = defaultdict(list)
+    capabilities = defaultdict(list)
     for k in range(torch.cuda.device_count()):
         devices[torch.cuda.get_device_name(k)].append(str(k))
+        capability = torch.cuda.get_device_capability(k)
+        capabilities[f"{capability[0]}.{capability[1]}"].append(str(k))
 
-
+    gpu_info = {}
+    for name, device_ids in devices.items():
+        gpu_info[f"GPU {','.join(device_ids)}"] = name
+
+    if len(capabilities) == 1:
+        # All GPUs have the same compute capability
+        cap, gpu_ids = list(capabilities.items())[0]
+        gpu_info[f"GPU {','.join(gpu_ids)} Compute Capability"] = cap
+    else:
+        # GPUs have different compute capabilities
+        for cap, gpu_ids in capabilities.items():
+            gpu_info[f"GPU {','.join(gpu_ids)} Compute Capability"] = cap
+
+    return gpu_info
 
 
 def _get_cuda_version_info():
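For reference, the reworked helper above now groups GPUs by name and adds their compute capability as extra entries. The sketch below only illustrates the rough shape of the returned dict on a hypothetical machine with two identical GPUs; the device name and capability value are made up.

```python
# Hedged illustration: _get_gpu_info is an internal helper of sglang.check_env,
# imported here only to make the new output shape concrete.
from sglang.check_env import _get_gpu_info

print(_get_gpu_info())
# Possible output on a box with two identical GPUs (values are hypothetical):
# {'GPU 0,1': 'NVIDIA A100-SXM4-80GB', 'GPU 0,1 Compute Capability': '8.0'}
```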
@@ -118,6 +135,7 @@ def _get_cuda_driver_version():
     """
     Get CUDA driver version.
     """
+    versions = set()
     try:
         output = subprocess.check_output(
             [
@@ -126,7 +144,11 @@ def _get_cuda_driver_version():
                 "--format=csv,noheader,nounits",
             ]
         )
-
+        versions = set(output.decode().strip().split("\n"))
+        if len(versions) == 1:
+            return {"CUDA Driver Version": versions.pop()}
+        else:
+            return {"CUDA Driver Versions": ", ".join(sorted(versions))}
     except subprocess.SubprocessError:
         return {"CUDA Driver Version": "Not Available"}
 
{sglang-0.2.10 → sglang-0.2.11}/sglang/global_config.py

@@ -19,7 +19,6 @@ class GlobalConfig:
         self.init_new_token_ratio = 0.7
         self.base_min_new_token_ratio = 0.1
         self.new_token_ratio_decay = 0.001
-        self.new_token_ratio_recovery = 0.05
 
         # Runtime constants: The threshold (number of tokens) to trigger layer-wise cuda sync.
         # This can improve the speed for large batch sizes during prefill.