sglang 0.3.5.post2__tar.gz → 0.3.6.post1__tar.gz

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as published.
Files changed (173)
  1. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/LICENSE +1 -1
  2. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/PKG-INFO +28 -19
  3. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/README.md +11 -13
  4. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/pyproject.toml +14 -6
  5. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/__init__.py +2 -2
  6. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/api.py +2 -2
  7. sglang-0.3.6.post1/sglang/bench_latency.py +1 -0
  8. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/bench_offline_throughput.py +48 -20
  9. sglang-0.3.5.post2/sglang/bench_latency.py → sglang-0.3.6.post1/sglang/bench_one_batch.py +21 -102
  10. sglang-0.3.5.post2/sglang/bench_server_latency.py → sglang-0.3.6.post1/sglang/bench_one_batch_server.py +3 -3
  11. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/bench_serving.py +125 -6
  12. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/check_env.py +3 -6
  13. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/backend/base_backend.py +1 -1
  14. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/backend/runtime_endpoint.py +2 -2
  15. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/configs/model_config.py +13 -14
  16. sglang-0.3.6.post1/sglang/srt/constrained/__init__.py +16 -0
  17. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/constrained/base_grammar_backend.py +13 -15
  18. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/constrained/outlines_backend.py +28 -17
  19. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/constrained/outlines_jump_forward.py +13 -15
  20. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/constrained/xgrammar_backend.py +47 -58
  21. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/conversation.py +13 -15
  22. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/hf_transformers_utils.py +13 -15
  23. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/activation.py +16 -13
  24. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/attention/flashinfer_backend.py +106 -54
  25. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/attention/triton_backend.py +9 -7
  26. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/attention/triton_ops/decode_attention.py +51 -55
  27. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py +16 -16
  28. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +13 -15
  29. sglang-0.3.6.post1/sglang/srt/layers/custom_op_util.py +25 -0
  30. sglang-0.3.6.post1/sglang/srt/layers/fused_moe_grok/__init__.py +1 -0
  31. {sglang-0.3.5.post2/sglang/srt/layers/fused_moe → sglang-0.3.6.post1/sglang/srt/layers/fused_moe_grok}/fused_moe.py +11 -4
  32. {sglang-0.3.5.post2/sglang/srt/layers/fused_moe → sglang-0.3.6.post1/sglang/srt/layers/fused_moe_grok}/layer.py +4 -9
  33. sglang-0.3.5.post2/sglang/srt/layers/fused_moe/patch.py → sglang-0.3.6.post1/sglang/srt/layers/fused_moe_patch.py +5 -0
  34. sglang-0.3.6.post1/sglang/srt/layers/fused_moe_triton/__init__.py +44 -0
  35. sglang-0.3.6.post1/sglang/srt/layers/fused_moe_triton/fused_moe.py +861 -0
  36. sglang-0.3.6.post1/sglang/srt/layers/fused_moe_triton/layer.py +633 -0
  37. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/layernorm.py +17 -15
  38. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/logits_processor.py +23 -25
  39. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/quantization/__init__.py +77 -17
  40. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/radix_attention.py +13 -15
  41. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/rotary_embedding.py +13 -13
  42. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/sampler.py +4 -8
  43. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/torchao_utils.py +2 -0
  44. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/lora/lora.py +13 -14
  45. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/lora/lora_config.py +13 -14
  46. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/lora/lora_manager.py +22 -24
  47. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/managers/data_parallel_controller.py +98 -27
  48. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/managers/detokenizer_manager.py +13 -15
  49. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/managers/io_struct.py +63 -21
  50. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/managers/schedule_batch.py +154 -59
  51. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/managers/schedule_policy.py +18 -16
  52. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/managers/scheduler.py +278 -109
  53. sglang-0.3.6.post1/sglang/srt/managers/session_controller.py +61 -0
  54. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/managers/tokenizer_manager.py +63 -18
  55. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/managers/tp_worker.py +25 -16
  56. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/managers/tp_worker_overlap_thread.py +62 -67
  57. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/metrics/collector.py +13 -15
  58. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/metrics/func_timer.py +13 -15
  59. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/mm_utils.py +13 -14
  60. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/model_executor/cuda_graph_runner.py +63 -25
  61. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/model_executor/forward_batch_info.py +128 -32
  62. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/model_executor/model_runner.py +132 -64
  63. sglang-0.3.6.post1/sglang/srt/model_parallel.py +98 -0
  64. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/chatglm.py +15 -16
  65. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/commandr.py +15 -16
  66. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/dbrx.py +15 -16
  67. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/deepseek.py +15 -15
  68. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/deepseek_v2.py +162 -59
  69. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/exaone.py +14 -15
  70. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/gemma.py +14 -14
  71. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/gemma2.py +31 -25
  72. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/gemma2_reward.py +13 -14
  73. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/gpt_bigcode.py +14 -14
  74. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/grok.py +15 -15
  75. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/internlm2.py +13 -15
  76. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/internlm2_reward.py +13 -14
  77. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/llama.py +21 -21
  78. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/llama_classification.py +13 -14
  79. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/llama_reward.py +13 -14
  80. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/llava.py +14 -16
  81. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/llavavid.py +14 -16
  82. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/minicpm.py +13 -15
  83. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/minicpm3.py +13 -15
  84. sglang-0.3.6.post1/sglang/srt/models/mistral.py +23 -0
  85. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/mixtral.py +15 -15
  86. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/mixtral_quant.py +14 -14
  87. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/olmo.py +22 -20
  88. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/olmoe.py +23 -20
  89. sglang-0.3.6.post1/sglang/srt/models/phi3_small.py +447 -0
  90. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/qwen.py +14 -14
  91. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/qwen2.py +22 -19
  92. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/qwen2_moe.py +17 -18
  93. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/qwen2_vl.py +13 -6
  94. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/stablelm.py +18 -16
  95. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/torch_native_llama.py +107 -93
  96. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/xverse.py +13 -14
  97. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/xverse_moe.py +15 -16
  98. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/yivl.py +13 -15
  99. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/openai_api/adapter.py +19 -17
  100. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/openai_api/protocol.py +14 -16
  101. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +49 -79
  102. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +3 -8
  103. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +3 -9
  104. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +3 -8
  105. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +3 -8
  106. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/sampling/sampling_batch_info.py +61 -57
  107. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/sampling/sampling_params.py +14 -16
  108. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/server.py +86 -35
  109. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/server_args.py +96 -80
  110. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/utils.py +266 -68
  111. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/few_shot_gsm8k.py +8 -4
  112. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/runners.py +38 -20
  113. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/srt/sampling/penaltylib/utils.py +23 -21
  114. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/test_utils.py +31 -20
  115. sglang-0.3.6.post1/sglang/version.py +1 -0
  116. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang.egg-info/PKG-INFO +28 -19
  117. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang.egg-info/SOURCES.txt +13 -5
  118. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang.egg-info/requires.txt +18 -4
  119. sglang-0.3.5.post2/sglang/srt/constrained/__init__.py +0 -17
  120. sglang-0.3.5.post2/sglang/srt/layers/fused_moe/__init__.py +0 -1
  121. sglang-0.3.5.post2/sglang/srt/models/mistral.py +0 -25
  122. sglang-0.3.5.post2/sglang/version.py +0 -1
  123. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/setup.cfg +0 -0
  124. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/global_config.py +0 -0
  125. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/__init__.py +0 -0
  126. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/backend/__init__.py +0 -0
  127. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/backend/anthropic.py +0 -0
  128. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/backend/litellm.py +0 -0
  129. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/backend/openai.py +0 -0
  130. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/backend/vertexai.py +0 -0
  131. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/chat_template.py +0 -0
  132. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/choices.py +0 -0
  133. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/compiler.py +0 -0
  134. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/interpreter.py +0 -0
  135. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/ir.py +0 -0
  136. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/tracer.py +0 -0
  137. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/launch_server.py +0 -0
  138. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/launch_server_llavavid.py +0 -0
  139. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/configs/__init__.py +0 -0
  140. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/configs/exaone.py +0 -0
  141. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/configs/qwen2vl.py +0 -0
  142. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/attention/__init__.py +0 -0
  143. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
  144. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  145. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/linear.py +0 -0
  146. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/pooler.py +0 -0
  147. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/quantization/base_config.py +0 -0
  148. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
  149. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/managers/image_processor.py +0 -0
  150. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  151. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  152. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
  153. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/mem_cache/memory_pool.py +0 -0
  154. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/mem_cache/radix_cache.py +0 -0
  155. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/baichuan.py +0 -0
  156. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/gpt2.py +0 -0
  157. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/llama_embedding.py +0 -0
  158. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/mllama.py +0 -0
  159. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  160. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  161. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/run_eval.py +0 -0
  162. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/simple_eval_common.py +0 -0
  163. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/simple_eval_gpqa.py +0 -0
  164. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/simple_eval_humaneval.py +0 -0
  165. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/simple_eval_math.py +0 -0
  166. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/simple_eval_mgsm.py +0 -0
  167. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/simple_eval_mmlu.py +0 -0
  168. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/test_activation.py +0 -0
  169. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/test_layernorm.py +0 -0
  170. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/test_programs.py +0 -0
  171. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/utils.py +0 -0
  172. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang.egg-info/dependency_links.txt +0 -0
  173. {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang.egg-info/top_level.txt +0 -0
--- sglang-0.3.5.post2/LICENSE
+++ sglang-0.3.6.post1/LICENSE
@@ -186,7 +186,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright [yyyy] [name of copyright owner]
+   Copyright 2023-2024 SGLang Team
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
--- sglang-0.3.5.post2/PKG-INFO
+++ sglang-0.3.6.post1/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.5.post2
+Version: 0.3.6.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                Version 2.0, January 2004
@@ -190,7 +190,7 @@ License: Apache License
           same "printed page" as the copyright notice for easier
           identification within third-party archives.
 
-       Copyright [yyyy] [name of copyright owner]
+       Copyright 2023-2024 SGLang Team
 
        Licensed under the Apache License, Version 2.0 (the "License");
        you may not use this file except in compliance with the License.
@@ -222,29 +222,32 @@ Requires-Dist: fastapi; extra == "runtime-common"
 Requires-Dist: hf_transfer; extra == "runtime-common"
 Requires-Dist: huggingface_hub; extra == "runtime-common"
 Requires-Dist: interegular; extra == "runtime-common"
+Requires-Dist: modelscope; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
+Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "runtime-common"
 Requires-Dist: packaging; extra == "runtime-common"
 Requires-Dist: pillow; extra == "runtime-common"
 Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
 Requires-Dist: psutil; extra == "runtime-common"
 Requires-Dist: pydantic; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
+Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: torchao; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
-Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "runtime-common"
-Requires-Dist: modelscope; extra == "runtime-common"
+Requires-Dist: xgrammar>=0.1.4; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: torch; extra == "srt"
-Requires-Dist: vllm==0.6.3.post1; extra == "srt"
+Requires-Dist: vllm>=0.6.3.post1; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
 Requires-Dist: vllm==0.6.3.dev13; extra == "srt-hip"
 Provides-Extra: srt-xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
+Provides-Extra: srt-hpu
+Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -274,6 +277,11 @@ Requires-Dist: sglang[srt_xpu]; extra == "all-xpu"
 Requires-Dist: sglang[openai]; extra == "all-xpu"
 Requires-Dist: sglang[anthropic]; extra == "all-xpu"
 Requires-Dist: sglang[litellm]; extra == "all-xpu"
+Provides-Extra: all-hpu
+Requires-Dist: sglang[srt_hpu]; extra == "all-hpu"
+Requires-Dist: sglang[openai]; extra == "all-hpu"
+Requires-Dist: sglang[anthropic]; extra == "all-hpu"
+Requires-Dist: sglang[litellm]; extra == "all-hpu"
 Provides-Extra: dev
 Requires-Dist: sglang[all]; extra == "dev"
 Requires-Dist: sglang[test]; extra == "dev"
@@ -283,6 +291,9 @@ Requires-Dist: sglang[test]; extra == "dev-hip"
 Provides-Extra: dev-xpu
 Requires-Dist: sglang[all_xpu]; extra == "dev-xpu"
 Requires-Dist: sglang[test]; extra == "dev-xpu"
+Provides-Extra: dev-hpu
+Requires-Dist: sglang[all_hpu]; extra == "dev-hpu"
+Requires-Dist: sglang[test]; extra == "dev-hpu"
 
 <div align="center" id="sglangtop">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
@@ -321,21 +332,16 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
 
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, overhead-free CPU scheduler, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (FP8/INT4/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte) and reward models (Skywork), with easy extensibility for integrating new models.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 
 ## Getting Started
-Install SGLang: See [https://sgl-project.github.io/start/install.html](https://sgl-project.github.io/start/install.html)
-
-Send requests: See [https://sgl-project.github.io/start/send_request.html](https://sgl-project.github.io/start/send_request.html)
-
-## Backend: SGLang Runtime (SRT)
-See [https://sgl-project.github.io/backend/backend.html](https://sgl-project.github.io/backend/backend.html)
-
-## Frontend: Structured Generation Language (SGLang)
-See [https://sgl-project.github.io/frontend/frontend.html](https://sgl-project.github.io/frontend/frontend.html)
+- [Install SGLang](https://sgl-project.github.io/start/install.html)
+- [Send requests](https://sgl-project.github.io/start/send_request.html)
+- [Backend: SGLang Runtime (SRT)](https://sgl-project.github.io/backend/backend.html)
+- [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)
 
 ## Benchmark And Performance
 Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)
@@ -343,6 +349,9 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 ## Roadmap
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
-## Citation And Acknowledgment
+## Adoption and Sponsorship
+The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, NVIDIA, RunPod, Stanford, UC Berkeley, and xAI.
+
+## Acknowledgment and Citation
+We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
 Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
-We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
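
The new srt-hpu / all-hpu / dev-hpu extras above follow the same install pattern as the existing extras; a sketch (it assumes the Gaudi-specific vLLM setup referenced in pyproject.toml below is already in place):

    pip install "sglang[all_hpu]"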
--- sglang-0.3.5.post2/README.md
+++ sglang-0.3.6.post1/README.md
@@ -35,21 +35,16 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
 
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, overhead-free CPU scheduler, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (FP8/INT4/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte) and reward models (Skywork), with easy extensibility for integrating new models.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 
 ## Getting Started
-Install SGLang: See [https://sgl-project.github.io/start/install.html](https://sgl-project.github.io/start/install.html)
-
-Send requests: See [https://sgl-project.github.io/start/send_request.html](https://sgl-project.github.io/start/send_request.html)
-
-## Backend: SGLang Runtime (SRT)
-See [https://sgl-project.github.io/backend/backend.html](https://sgl-project.github.io/backend/backend.html)
-
-## Frontend: Structured Generation Language (SGLang)
-See [https://sgl-project.github.io/frontend/frontend.html](https://sgl-project.github.io/frontend/frontend.html)
+- [Install SGLang](https://sgl-project.github.io/start/install.html)
+- [Send requests](https://sgl-project.github.io/start/send_request.html)
+- [Backend: SGLang Runtime (SRT)](https://sgl-project.github.io/backend/backend.html)
+- [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)
 
 ## Benchmark And Performance
 Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)
@@ -57,6 +52,9 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 ## Roadmap
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
-## Citation And Acknowledgment
+## Adoption and Sponsorship
+The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, NVIDIA, RunPod, Stanford, UC Berkeley, and xAI.
+
+## Acknowledgment and Citation
+We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
 Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
-We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
--- sglang-0.3.5.post2/pyproject.toml
+++ sglang-0.3.6.post1/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.3.5.post2"
+version = "0.3.6.post1"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -16,11 +16,14 @@ classifiers = [
 dependencies = ["requests", "tqdm", "numpy", "IPython"]
 
 [project.optional-dependencies]
-runtime_common = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
-    "orjson", "packaging", "pillow", "prometheus-client>=0.20.0", "psutil", "pydantic", "python-multipart",
-    "torchao", "uvicorn", "uvloop", "pyzmq>=25.1.2",
-    "outlines>=0.0.44,<0.1.0", "modelscope"]
-srt = ["sglang[runtime_common]", "torch", "vllm==0.6.3.post1"]
+runtime_common = ["aiohttp", "decord", "fastapi",
+    "hf_transfer", "huggingface_hub", "interegular", "modelscope",
+    "orjson", "outlines>=0.0.44,<0.1.0",
+    "packaging", "pillow", "prometheus-client>=0.20.0",
+    "psutil", "pydantic", "python-multipart",
+    "pyzmq>=25.1.2", "torchao", "uvicorn", "uvloop",
+    "xgrammar>=0.1.4"]
+srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1"]
 
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
 # => base docker rocm/vllm-dev:20241022, not from public vllm whl
@@ -28,6 +31,9 @@ srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.dev13"]
 # xpu is not enabled in public vllm and torch whl,
 # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.htmlinstall vllm
 srt_xpu = ["sglang[runtime_common]"]
+#For Intel Gaudi(device : hpu) follow the installation guide
+#https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
+srt_hpu = ["sglang[runtime_common]"]
 
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
@@ -43,9 +49,11 @@ test = [
 all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
+all_hpu = ["sglang[srt_hpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 dev = ["sglang[all]", "sglang[test]"]
 dev_hip = ["sglang[all_hip]", "sglang[test]"]
 dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
+dev_hpu = ["sglang[all_hpu]", "sglang[test]"]
 
 [project.urls]
 "Homepage" = "https://github.com/sgl-project/sglang"
--- sglang-0.3.5.post2/sglang/__init__.py
+++ sglang-0.3.6.post1/sglang/__init__.py
@@ -11,7 +11,7 @@ from sglang.api import (
     gen,
     gen_int,
     gen_string,
-    get_server_args,
+    get_server_info,
     image,
     select,
     set_default_backend,
@@ -41,7 +41,7 @@ __all__ = [
     "gen",
     "gen_int",
     "gen_string",
-    "get_server_args",
+    "get_server_info",
     "image",
     "select",
     "set_default_backend",
--- sglang-0.3.5.post2/sglang/api.py
+++ sglang-0.3.6.post1/sglang/api.py
@@ -65,7 +65,7 @@ def flush_cache(backend: Optional[BaseBackend] = None):
     return backend.flush_cache()
 
 
-def get_server_args(backend: Optional[BaseBackend] = None):
+def get_server_info(backend: Optional[BaseBackend] = None):
     backend = backend or global_config.default_backend
     if backend is None:
         return None
@@ -73,7 +73,7 @@ def get_server_args(backend: Optional[BaseBackend] = None):
     # If backend is Runtime
     if hasattr(backend, "endpoint"):
         backend = backend.endpoint
-    return backend.get_server_args()
+    return backend.get_server_info()
 
 
 def gen(
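
This rename pairs with the export change in sglang/__init__.py above. A minimal usage sketch of the renamed call from the frontend (assumes a locally running SGLang server; the URL is illustrative):

    import sglang as sgl

    # Point the frontend at a running SGLang server.
    sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

    # Previously sgl.get_server_args(); renamed in 0.3.6:
    print(sgl.get_server_info())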
--- /dev/null
+++ sglang-0.3.6.post1/sglang/bench_latency.py
@@ -0,0 +1 @@
+raise ValueError("bench_latency.py has been renamed to bench_one_batch.py")
--- sglang-0.3.5.post2/sglang/bench_offline_throughput.py
+++ sglang-0.3.6.post1/sglang/bench_offline_throughput.py
@@ -1,20 +1,13 @@
 """
-Benchmark the throughput of using the offline LLM engine.
-This script does not launch a server.
+Benchmark the throughput in the offline mode.
 It accepts server arguments (the same as launch_server.py) and benchmark arguments (the same as bench_serving.py).
 
 # Usage
 ## Sharegpt dataset with default args
-python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct
+python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --num-prompts 10
 
 ## Random dataset with default args
-python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --dataset-name random
-
-## Shared prefix dataset with default args
-python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --dataset-name generated-shared-prefix
-
-## Sharegpt dataset on runtime backend
-python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --backend runtime
+python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024
 """
 
 import argparse
@@ -23,7 +16,7 @@ import json
 import logging
 import random
 import time
-from typing import List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple
 
 import numpy as np
 
@@ -55,7 +48,10 @@ class BenchArgs:
     gen_question_len: int = 128
     gen_output_len: int = 256
     disable_ignore_eos: bool = False
+    extra_request_body: Optional[str] = None
     seed: int = 1
+    skip_warmup: bool = False
+    do_not_exit: bool = False
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -142,7 +138,24 @@
             default=BenchArgs.disable_ignore_eos,
             help="Disable ignore EOS token",
         )
+        parser.add_argument(
+            "--extra-request-body",
+            metavar='{"key1": "value1", "key2": "value2"}',
+            type=str,
+            help="Append given JSON object to the request payload. You can use this to specify"
+            "additional generate params like sampling params.",
+        )
         parser.add_argument("--seed", type=int, default=1, help="The random seed.")
+        parser.add_argument(
+            "--skip-warmup",
+            action="store_true",
+            help="Skip the warmup batches.",
+        )
+        parser.add_argument(
+            "--do-not-exit",
+            action="store_true",
+            help="Do not exit the program. This is useful for nsys profile with --duration and --delay.",
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
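
Taken together, the new flags compose in the usual command-line way; a sketch of one possible invocation (model path copied from the docstring above, the JSON payload is illustrative rather than a default):

    python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --num-prompts 10 --skip-warmup --extra-request-body '{"top_p": 0.9}'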
@@ -155,6 +168,7 @@ def throughput_test_once(
     backend,
     reqs: List[Tuple[str, int, int]],
     ignore_eos: bool,
+    extra_request_body: Dict,
 ):
     measurement_results = {
         "backend": backend_name,
@@ -174,6 +188,7 @@
             "temperature": 0,
             "max_new_tokens": r[2],
             "ignore_eos": ignore_eos,
+            **extra_request_body,
         }
         for r in reqs
     ]
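
Because the extra body is spliced in with `**`, any key supplied via --extra-request-body overrides the defaults built just above it. A minimal Python sketch of the same semantics (values illustrative):

    import json

    extra_request_body = json.loads('{"top_p": 0.9, "temperature": 0.7}')
    payload = {
        "temperature": 0,
        "max_new_tokens": 32,
        "ignore_eos": True,
        **extra_request_body,  # later keys win, so temperature ends up 0.7
    }
    assert payload["temperature"] == 0.7 and payload["top_p"] == 0.9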
@@ -227,31 +242,41 @@
     random.seed(bench_args.seed)
     np.random.seed(bench_args.seed)
 
+    # Parse args
+    extra_request_body = {}
+    if bench_args.extra_request_body:
+        extra_request_body = json.loads(args.extra_request_body)
+
     # Read dataset
     input_requests = get_dataset(bench_args, tokenizer)
 
     warmup_requests = sample_random_requests(
-        input_len=20,
-        output_len=4,
-        num_prompts=2,
+        input_len=256,
+        output_len=16,
+        num_prompts=16,
         range_ratio=0.8,
         tokenizer=tokenizer,
         dataset_path=bench_args.dataset_path,
     )
 
     # Warm up
-    throughput_test_once(
-        backend_name=bench_args.backend,
-        backend=backend,
-        reqs=warmup_requests,
-        ignore_eos=not bench_args.disable_ignore_eos,
-    )
+    if not bench_args.skip_warmup:
+        logging.info("\nWarmup...")
+        throughput_test_once(
+            backend_name=bench_args.backend,
+            backend=backend,
+            reqs=warmup_requests,
+            ignore_eos=not bench_args.disable_ignore_eos,
+            extra_request_body=extra_request_body,
+        )
 
+    logging.info("\nBenchmark...")
     result = throughput_test_once(
         backend_name=bench_args.backend,
         backend=backend,
         reqs=input_requests,
         ignore_eos=not bench_args.disable_ignore_eos,
+        extra_request_body=extra_request_body,
     )
 
     if bench_args.result_filename:
@@ -307,3 +332,6 @@ if __name__ == "__main__":
     )
 
     throughput_test(server_args, bench_args)
+
+    while bench_args.do_not_exit:
+        pass
--- sglang-0.3.5.post2/sglang/bench_latency.py
+++ sglang-0.3.6.post1/sglang/bench_one_batch.py
@@ -1,20 +1,17 @@
 """
-Benchmark the latency of running a single static batch.
+Benchmark the latency of running a single static batch without a server.
+
 This script does not launch a server and uses the low-level APIs.
-It accepts arguments similar to those of launch_server.py.
+It accepts server arguments (the same as launch_server.py) and benchmark arguments (e.g., batch size, input lengths).
 
 # Usage (latency test)
 ## with dummy weights:
-python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --load-format dummy
+python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --load-format dummy
 ## sweep through multiple data points and store (append) the results in a jsonl file:
-python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --result-filename out.jsonl
-## do some changes, and store the results under a different run_name:
-python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --result-filename out.jsonl --run-name after
-## plot the results in series of lines:
-python -m sglang.bench_latency --result-filename out.jsonl --graph-sql="select run_name, batch_size, prefill_throughput from results"
+python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --run-name test_run
 
 # Usage (correctness test):
-python -m sglang.bench_latency --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
+python -m sglang.bench_one_batch --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
 
 ## Reference output (of the correctness test above, can be gpu dependent):
 input_ids=[[1, 450, 7483, 310, 3444, 338], [1, 450, 7483, 310, 278, 3303, 13187, 290, 338], [1, 20628, 338, 263, 6575, 1460, 2462, 322, 306, 763]]
@@ -50,13 +47,10 @@ import itertools
 import json
 import logging
 import multiprocessing
-import os
-import sqlite3
 import time
 from typing import Tuple
 
 import numpy as np
-import pandas as pd
 import torch
 import torch.distributed as dist
 
@@ -77,19 +71,14 @@ from sglang.srt.utils import (
 
 @dataclasses.dataclass
 class BenchArgs:
-    run_name: str = "before"
+    run_name: str = "default"
     batch_size: Tuple[int] = (1,)
     input_len: Tuple[int] = (1024,)
     output_len: Tuple[int] = (16,)
-    result_filename: str = ""
+    result_filename: str = "result.jsonl"
     correctness_test: bool = False
     # This is only used for correctness test
    cut_len: int = 4
-    # Plotting args
-    graph_sql: str = (
-        "select run_name, batch_size, prefill_throughput from results where run_name='before'"
-    )
-    graph_filename: str = "out.png"
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -108,11 +97,6 @@ class BenchArgs:
         )
         parser.add_argument("--correctness-test", action="store_true")
         parser.add_argument("--cut-len", type=int, default=BenchArgs.cut_len)
-        # graphing
-        parser.add_argument("--graph-sql", type=str, default=BenchArgs.graph_sql)
-        parser.add_argument(
-            "--graph-filename", type=str, default=BenchArgs.graph_filename
-        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -220,7 +204,7 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
     return reqs
 
 
-@torch.inference_mode()
+@torch.no_grad
 def extend(reqs, model_runner):
     batch = ScheduleBatch.init_new(
         reqs=reqs,
@@ -228,6 +212,7 @@
         token_to_kv_pool=model_runner.token_to_kv_pool,
         tree_cache=None,
         model_config=model_runner.model_config,
+        enable_overlap=False,
     )
     batch.prepare_for_extend()
     model_worker_batch = batch.get_model_worker_batch()
@@ -237,7 +222,7 @@
     return next_token_ids, logits_output.next_token_logits, batch
 
 
-@torch.inference_mode()
+@torch.no_grad
 def decode(input_token_ids, batch, model_runner):
     batch.output_ids = input_token_ids
     batch.prepare_for_decode()
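
Both `extend` and `decode` switch from `@torch.inference_mode()` to the bare `@torch.no_grad` decorator form, which recent PyTorch accepts without parentheses; unlike inference_mode, tensors produced under no_grad are not marked inference-only and can still be used with autograd later. A small sketch of the decorator's effect (illustrative function, not from the package):

    import torch

    @torch.no_grad  # equivalent to @torch.no_grad() in recent PyTorch
    def step(x: torch.Tensor) -> torch.Tensor:
        return x * 2

    y = step(torch.ones(4, requires_grad=True))
    assert not y.requires_grad  # no autograd graph was recorded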
@@ -254,6 +239,7 @@ def correctness_test(
     bench_args,
     tp_rank,
 ):
+    # Configure the logger
     configure_logger(server_args, prefix=f" TP{tp_rank}")
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
 
@@ -274,7 +260,7 @@
         bench_args, input_ids, reqs, model_runner
     )
 
-    # Extend
+    # Extend (prefill w/ KV cache)
     next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
     rank_print(f"prefill logits (final): {next_token_logits} \n")
 
@@ -286,17 +272,14 @@
     for i in range(len(reqs)):
         output_ids[i].append(next_token_ids_list[i])
 
-    # Print
+    # Print output texts
     for i in range(len(reqs)):
         rank_print(f"========== Prompt {i} ==========")
         rank_print(tokenizer.decode(output_ids[i]), "\n")
 
 
 def synchronize(device):
-    if device == "cuda":
-        torch.cuda.synchronize()
-    elif device == "xpu":
-        torch.xpu.synchronize()
+    torch.get_device_module(device).synchronize()
 
 
 def latency_test_run_once(
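
The rewritten `synchronize` leans on `torch.get_device_module`, which maps a device string to its backend module and thus covers new device types without per-device branches; the helper is available in newer PyTorch releases. A quick sketch:

    import torch

    def synchronize(device: str) -> None:
        # torch.get_device_module("cuda") is torch.cuda, "xpu" -> torch.xpu, etc.
        torch.get_device_module(device).synchronize()

    if torch.cuda.is_available():
        synchronize("cuda")  # same effect as torch.cuda.synchronize()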
@@ -352,7 +335,7 @@
         f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
     )
 
-    # record decode timing from 2nd output
+    # Record decode timing from 2nd output
     if output_len > 1:
         med_decode_latency = np.median(decode_latencies)
         med_decode_throughput = batch_size / med_decode_latency
@@ -367,7 +350,7 @@
         f"Total. latency: {tot_latency:6.3f} s, throughput: {throughput:9.2f} token/s"
     )
     measurement_results["total_latency"] = tot_latency
-    measurement_results["total_throughput"] = throughput
+    measurement_results["overall_throughput"] = throughput
     return measurement_results
 
 
@@ -377,6 +360,7 @@ def latency_test(
     bench_args,
     tp_rank,
 ):
+    # Configure the logger
     configure_logger(server_args, prefix=f" TP{tp_rank}")
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
 
@@ -423,71 +407,9 @@
 
     # Write results in jsonlines format on rank 0.
     if tp_rank == 0 and bench_args.result_filename:
-        import jsonlines
-
-        with jsonlines.open(bench_args.result_filename, "a") as f:
-            f.write_all(result_list)
-
-
-def plot_latency_test(
-    server_args,
-    bench_args,
-    tp_rank,
-):
-    assert tp_rank == 0
-
-    # read the jsonl file and put in sqlite
-    df = pd.read_json(bench_args.result_filename, lines=True)
-    conn = sqlite3.connect(":memory:")
-    cur = conn.cursor()
-
-    # get the columns and their types
-    column_names = list(df.iloc[0].keys())
-    type_dict = {
-        str: "TEXT",
-        np.int64: "INTEGER",
-        np.float64: "FLOAT",
-    }
-    column_types = [type_dict[type(i)] for i in list(df.iloc[0])]
-
-    # create the table
-    cur.execute(
-        f"""
-        CREATE TABLE IF NOT EXISTS results (
-            {", ".join([f"{name} {type}" for name, type in zip(column_names, column_types)])}
-        )
-        """
-    )
-    conn.commit()
-
-    # write the results to DB
-    df.to_sql("results", conn, if_exists="replace", index=False)
-    conn.commit()
-
-    # read it back using sql
-    df = pd.read_sql_query(bench_args.graph_sql, conn)
-    conn.close()
-
-    # plot it and save to a file
-    import matplotlib.pyplot as plt
-
-    assert (
-        len(df.columns) == 3
-    ), f"The sql should have fetched <series, x, y> columns, not {df.columns}"
-    for label in df[df.columns[0]].unique():
-        q = f"{df.columns[0]}=='{label}'"
-        series = df.query(q)
-        plt.plot(series[df.columns[1]], series[df.columns[2]], label=q, marker="o")
-    plt.xlabel(df.columns[1])
-    plt.ylabel(df.columns[2])
-    plt.legend()
-    plt.savefig(bench_args.graph_filename, dpi=300)
-
-    # if in kitty, just dump it to the terminal
-    if os.environ["TERM"] == "xterm-kitty":
-        os.system(
-            f"kitty icat --use-window-size 1,1,600,600 {bench_args.graph_filename}"
-        )
+        with open(bench_args.result_filename, "a") as fout:
+            for result in result_list:
+                fout.write(json.dumps(result) + "\n")
 
 
 def main(server_args, bench_args):
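
With the jsonlines/pandas/sqlite plotting path removed, results are appended as plain JSON Lines, so they can be read back with just the standard library. A sketch using the new result_filename default (filename illustrative):

    import json

    with open("result.jsonl") as fin:
        results = [json.loads(line) for line in fin if line.strip()]
    print(results[0]["overall_throughput"])  # key renamed from total_throughput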
@@ -498,9 +420,6 @@ def main(server_args, bench_args):
             work_func = correctness_test
         else:
             work_func = latency_test
-    elif os.path.isfile(bench_args.result_filename):
-        assert bench_args.graph_filename, "please provide a filename for the graph"
-        work_func = plot_latency_test
     else:
         raise ValueError(
             "Provide --model-path for running the tests or "
--- sglang-0.3.5.post2/sglang/bench_server_latency.py
+++ sglang-0.3.6.post1/sglang/bench_one_batch_server.py
@@ -1,10 +1,10 @@
 """
-Benchmark the latency of serving a single batch with a real server.
+Benchmark the latency of running a single batch with a server.
+
 This script launches a server and uses the HTTP interface.
-It accepts arguments similar to those of launch_server.py.
+It accepts server arguments (the same as launch_server.py) and benchmark arguments (e.g., batch size, input lengths).
 
 Usage:
-
 python3 -m sglang.bench_server_latency --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
 
 python3 -m sglang.bench_server_latency --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8