sglang 0.3.6__tar.gz → 0.3.6.post2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.3.6 → sglang-0.3.6.post2}/LICENSE +1 -1
- {sglang-0.3.6 → sglang-0.3.6.post2}/PKG-INFO +25 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/README.md +10 -12
- {sglang-0.3.6 → sglang-0.3.6.post2}/pyproject.toml +9 -4
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/__init__.py +2 -2
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/api.py +2 -2
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/bench_one_batch.py +4 -7
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/bench_one_batch_server.py +2 -2
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/bench_serving.py +75 -26
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/check_env.py +7 -1
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/backend/base_backend.py +1 -1
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/backend/runtime_endpoint.py +2 -2
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/tracer.py +1 -1
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/launch_server.py +0 -3
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/configs/model_config.py +15 -20
- sglang-0.3.6.post2/sglang/srt/constrained/__init__.py +16 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/constrained/base_grammar_backend.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/constrained/outlines_backend.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/constrained/outlines_jump_forward.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/constrained/xgrammar_backend.py +38 -57
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/conversation.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/hf_transformers_utils.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/activation.py +13 -13
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/attention/flashinfer_backend.py +14 -7
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/attention/triton_ops/decode_attention.py +51 -55
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/attention/triton_ops/extend_attention.py +16 -16
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +13 -15
- sglang-0.3.6.post2/sglang/srt/layers/custom_op_util.py +25 -0
- sglang-0.3.6.post2/sglang/srt/layers/fused_moe_grok/__init__.py +1 -0
- {sglang-0.3.6/sglang/srt/layers/fused_moe → sglang-0.3.6.post2/sglang/srt/layers/fused_moe_grok}/layer.py +4 -9
- sglang-0.3.6/sglang/srt/layers/fused_moe/patch.py → sglang-0.3.6.post2/sglang/srt/layers/fused_moe_patch.py +5 -0
- sglang-0.3.6.post2/sglang/srt/layers/fused_moe_triton/__init__.py +44 -0
- sglang-0.3.6.post2/sglang/srt/layers/fused_moe_triton/fused_moe.py +861 -0
- sglang-0.3.6.post2/sglang/srt/layers/fused_moe_triton/layer.py +633 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/layernorm.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/logits_processor.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/quantization/__init__.py +77 -17
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/radix_attention.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/rotary_embedding.py +13 -13
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/sampler.py +1 -1
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/lora/lora.py +13 -14
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/lora/lora_config.py +13 -14
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/lora/lora_manager.py +22 -24
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/data_parallel_controller.py +25 -19
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/detokenizer_manager.py +13 -18
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/image_processor.py +6 -9
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/io_struct.py +43 -28
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/schedule_batch.py +92 -27
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/schedule_policy.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/scheduler.py +94 -72
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/session_controller.py +29 -19
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/tokenizer_manager.py +29 -22
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/tp_worker.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/tp_worker_overlap_thread.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/metrics/collector.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/metrics/func_timer.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/mm_utils.py +13 -14
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/model_executor/cuda_graph_runner.py +20 -19
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/model_executor/forward_batch_info.py +19 -17
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/model_executor/model_runner.py +42 -30
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/chatglm.py +15 -16
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/commandr.py +15 -16
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/dbrx.py +15 -16
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/deepseek.py +15 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/deepseek_v2.py +15 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/exaone.py +14 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/gemma.py +14 -14
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/gemma2.py +24 -19
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/gemma2_reward.py +13 -14
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/gpt_bigcode.py +14 -14
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/grok.py +15 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/internlm2.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/internlm2_reward.py +13 -14
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/llama.py +21 -21
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/llama_classification.py +13 -14
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/llama_reward.py +13 -14
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/llava.py +20 -16
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/llavavid.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/minicpm.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/minicpm3.py +13 -15
- sglang-0.3.6.post2/sglang/srt/models/mistral.py +23 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/mixtral.py +15 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/mixtral_quant.py +14 -14
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/olmo.py +21 -19
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/olmoe.py +23 -20
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/qwen.py +14 -14
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/qwen2.py +22 -19
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/qwen2_moe.py +17 -18
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/stablelm.py +18 -16
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/torch_native_llama.py +15 -17
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/xverse.py +13 -14
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/xverse_moe.py +15 -16
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/yivl.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/openai_api/adapter.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/openai_api/protocol.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/sampling/sampling_batch_info.py +4 -1
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/sampling/sampling_params.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/server.py +60 -34
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/server_args.py +22 -22
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/utils.py +208 -19
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/few_shot_gsm8k.py +8 -4
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/runners.py +13 -14
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/test_utils.py +2 -2
- sglang-0.3.6.post2/sglang/version.py +1 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang.egg-info/PKG-INFO +25 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang.egg-info/SOURCES.txt +7 -4
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang.egg-info/requires.txt +16 -1
- sglang-0.3.6/sglang/srt/constrained/__init__.py +0 -17
- sglang-0.3.6/sglang/srt/layers/custom_op_util.py +0 -26
- sglang-0.3.6/sglang/srt/layers/fused_moe/__init__.py +0 -1
- sglang-0.3.6/sglang/srt/models/mistral.py +0 -25
- sglang-0.3.6/sglang/version.py +0 -1
- {sglang-0.3.6 → sglang-0.3.6.post2}/setup.cfg +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/bench_latency.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/bench_offline_throughput.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/global_config.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/__init__.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/chat_template.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/choices.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/compiler.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/interpreter.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/ir.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/configs/__init__.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/configs/qwen2vl.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/attention/__init__.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/attention/triton_backend.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
- {sglang-0.3.6/sglang/srt/layers/fused_moe → sglang-0.3.6.post2/sglang/srt/layers/fused_moe_grok}/fused_moe.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/linear.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/quantization/base_config.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/torchao_utils.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/mem_cache/memory_pool.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/model_parallel.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/baichuan.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/gpt2.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/llama_embedding.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/mllama.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/phi3_small.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/qwen2_vl.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/few_shot_gsm8k_engine.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/run_eval.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/test_activation.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/test_programs.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/utils.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.3.6 → sglang-0.3.6.post2}/LICENSE

```diff
@@ -186,7 +186,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright
+   Copyright 2023-2024 SGLang Team
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
```
{sglang-0.3.6 → sglang-0.3.6.post2}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.6
+Version: 0.3.6.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
         Version 2.0, January 2004
@@ -190,7 +190,7 @@ License: Apache License
        same "printed page" as the copyright notice for easier
        identification within third-party archives.
 
-       Copyright
+       Copyright 2023-2024 SGLang Team
 
        Licensed under the Apache License, Version 2.0 (the "License");
        you may not use this file except in compliance with the License.
@@ -222,6 +222,7 @@ Requires-Dist: fastapi; extra == "runtime-common"
 Requires-Dist: hf_transfer; extra == "runtime-common"
 Requires-Dist: huggingface_hub; extra == "runtime-common"
 Requires-Dist: interegular; extra == "runtime-common"
+Requires-Dist: modelscope; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
 Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "runtime-common"
 Requires-Dist: packaging; extra == "runtime-common"
@@ -234,17 +235,20 @@ Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: torchao; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist:
+Requires-Dist: xgrammar>=0.1.4; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: torch; extra == "srt"
 Requires-Dist: vllm>=0.6.3.post1; extra == "srt"
+Requires-Dist: cuda-python; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
 Requires-Dist: vllm==0.6.3.dev13; extra == "srt-hip"
 Provides-Extra: srt-xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
+Provides-Extra: srt-hpu
+Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -274,6 +278,11 @@ Requires-Dist: sglang[srt_xpu]; extra == "all-xpu"
 Requires-Dist: sglang[openai]; extra == "all-xpu"
 Requires-Dist: sglang[anthropic]; extra == "all-xpu"
 Requires-Dist: sglang[litellm]; extra == "all-xpu"
+Provides-Extra: all-hpu
+Requires-Dist: sglang[srt_hpu]; extra == "all-hpu"
+Requires-Dist: sglang[openai]; extra == "all-hpu"
+Requires-Dist: sglang[anthropic]; extra == "all-hpu"
+Requires-Dist: sglang[litellm]; extra == "all-hpu"
 Provides-Extra: dev
 Requires-Dist: sglang[all]; extra == "dev"
 Requires-Dist: sglang[test]; extra == "dev"
@@ -283,6 +292,9 @@ Requires-Dist: sglang[test]; extra == "dev-hip"
 Provides-Extra: dev-xpu
 Requires-Dist: sglang[all_xpu]; extra == "dev-xpu"
 Requires-Dist: sglang[test]; extra == "dev-xpu"
+Provides-Extra: dev-hpu
+Requires-Dist: sglang[all_hpu]; extra == "dev-hpu"
+Requires-Dist: sglang[test]; extra == "dev-hpu"
 
 <div align="center" id="sglangtop">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
@@ -321,21 +333,16 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
 
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, overhead-free CPU scheduler, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (FP8/INT4/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
 - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 
 ## Getting Started
-Install SGLang
-
-
-
-## Backend: SGLang Runtime (SRT)
-See [https://sgl-project.github.io/backend/backend.html](https://sgl-project.github.io/backend/backend.html)
-
-## Frontend: Structured Generation Language (SGLang)
-See [https://sgl-project.github.io/frontend/frontend.html](https://sgl-project.github.io/frontend/frontend.html)
+- [Install SGLang](https://sgl-project.github.io/start/install.html)
+- [Send requests](https://sgl-project.github.io/start/send_request.html)
+- [Backend: SGLang Runtime (SRT)](https://sgl-project.github.io/backend/backend.html)
+- [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)
 
 ## Benchmark And Performance
 Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)
@@ -343,6 +350,9 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 ## Roadmap
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
-##
+## Adoption and Sponsorship
+The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
+
+## Acknowledgment and Citation
+We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
 Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
-We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
```
{sglang-0.3.6 → sglang-0.3.6.post2}/README.md

```diff
@@ -35,21 +35,16 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
 
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, overhead-free CPU scheduler, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (FP8/INT4/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
 - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 
 ## Getting Started
-Install SGLang
-
-
-
-## Backend: SGLang Runtime (SRT)
-See [https://sgl-project.github.io/backend/backend.html](https://sgl-project.github.io/backend/backend.html)
-
-## Frontend: Structured Generation Language (SGLang)
-See [https://sgl-project.github.io/frontend/frontend.html](https://sgl-project.github.io/frontend/frontend.html)
+- [Install SGLang](https://sgl-project.github.io/start/install.html)
+- [Send requests](https://sgl-project.github.io/start/send_request.html)
+- [Backend: SGLang Runtime (SRT)](https://sgl-project.github.io/backend/backend.html)
+- [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)
 
 ## Benchmark And Performance
 Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)
@@ -57,6 +52,9 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 ## Roadmap
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
-##
+## Adoption and Sponsorship
+The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
+
+## Acknowledgment and Citation
+We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
 Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
-We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
```
{sglang-0.3.6 → sglang-0.3.6.post2}/pyproject.toml

```diff
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.3.6"
+version = "0.3.6.post2"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -17,13 +17,13 @@ dependencies = ["requests", "tqdm", "numpy", "IPython"]
 
 [project.optional-dependencies]
 runtime_common = ["aiohttp", "decord", "fastapi",
-    "hf_transfer", "huggingface_hub", "interegular",
+    "hf_transfer", "huggingface_hub", "interegular", "modelscope",
     "orjson", "outlines>=0.0.44,<0.1.0",
     "packaging", "pillow", "prometheus-client>=0.20.0",
     "psutil", "pydantic", "python-multipart",
     "pyzmq>=25.1.2", "torchao", "uvicorn", "uvloop",
-    "
-srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1"]
+    "xgrammar>=0.1.4"]
+srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1", "cuda-python"]
 
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
 # => base docker rocm/vllm-dev:20241022, not from public vllm whl
@@ -31,6 +31,9 @@ srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.dev13"]
 # xpu is not enabled in public vllm and torch whl,
 # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.html install vllm
 srt_xpu = ["sglang[runtime_common]"]
+#For Intel Gaudi(device : hpu) follow the installation guide
+#https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
+srt_hpu = ["sglang[runtime_common]"]
 
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
@@ -46,9 +49,11 @@ test = [
 all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
+all_hpu = ["sglang[srt_hpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 dev = ["sglang[all]", "sglang[test]"]
 dev_hip = ["sglang[all_hip]", "sglang[test]"]
 dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
+dev_hpu = ["sglang[all_hpu]", "sglang[test]"]
 
 [project.urls]
 "Homepage" = "https://github.com/sgl-project/sglang"
```
{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/__init__.py

```diff
@@ -11,7 +11,7 @@ from sglang.api import (
     gen,
     gen_int,
     gen_string,
-
+    get_server_info,
     image,
     select,
     set_default_backend,
@@ -41,7 +41,7 @@ __all__ = [
     "gen",
     "gen_int",
     "gen_string",
-    "
+    "get_server_info",
     "image",
     "select",
     "set_default_backend",
```
{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/api.py

```diff
@@ -65,7 +65,7 @@ def flush_cache(backend: Optional[BaseBackend] = None):
     return backend.flush_cache()
 
 
-def
+def get_server_info(backend: Optional[BaseBackend] = None):
     backend = backend or global_config.default_backend
     if backend is None:
         return None
@@ -73,7 +73,7 @@ def get_server_args(backend: Optional[BaseBackend] = None):
     # If backend is Runtime
     if hasattr(backend, "endpoint"):
         backend = backend.endpoint
-    return backend.
+    return backend.get_server_info()
 
 
 def gen(
```
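Together with the `__init__.py` hunks above, this renames the frontend helper `get_server_args` to `get_server_info`, matching the endpoint name used by `RuntimeEndpoint` further below. A minimal usage sketch; the local URL and an already-running server are assumptions, not part of the diff:

```python
# Hypothetical usage of the renamed helper against a locally running server
# (e.g. started with `python3 -m sglang.launch_server --model-path <model>`).
import sglang as sgl

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

# 0.3.6 exposed this as sgl.get_server_args(); 0.3.6.post2 renames it.
info = sgl.get_server_info()
print(info)
```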
{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/bench_one_batch.py

```diff
@@ -212,6 +212,7 @@ def extend(reqs, model_runner):
         token_to_kv_pool=model_runner.token_to_kv_pool,
         tree_cache=None,
         model_config=model_runner.model_config,
+        enable_overlap=False,
     )
     batch.prepare_for_extend()
     model_worker_batch = batch.get_model_worker_batch()
@@ -278,10 +279,7 @@ def correctness_test(
 
 
 def synchronize(device):
-
-        torch.cuda.synchronize()
-    elif device == "xpu":
-        torch.xpu.synchronize()
+    torch.get_device_module(device).synchronize()
 
 
 def latency_test_run_once(
@@ -468,7 +466,6 @@ if __name__ == "__main__":
 
     try:
         main(server_args, bench_args)
-    except Exception as e:
-        raise e
     finally:
-
+        if server_args.tp_size != 1:
+            kill_child_process()
```
{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/bench_one_batch_server.py

```diff
@@ -5,9 +5,9 @@ This script launches a server and uses the HTTP interface.
 It accepts server arguments (the same as launch_server.py) and benchmark arguments (e.g., batch size, input lengths).
 
 Usage:
-python3 -m sglang.
+python3 -m sglang.bench_one_batch_server --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
 
-python3 -m sglang.
+python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
 """
 
 import argparse
```
{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/bench_serving.py

```diff
@@ -25,6 +25,7 @@ import warnings
 from argparse import ArgumentParser
 from dataclasses import dataclass, field
 from datetime import datetime
+from pathlib import Path
 from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
 
 import aiohttp
@@ -407,7 +408,7 @@ async def async_request_profile(api_url: str) -> RequestFuncOutput:
 
 
 def get_model(pretrained_model_name_or_path: str) -> str:
-    if os.getenv("SGLANG_USE_MODELSCOPE", "
+    if os.getenv("SGLANG_USE_MODELSCOPE", "false").lower() == "true":
         import huggingface_hub.constants
         from modelscope import snapshot_download
 
```
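With `modelscope` now a `runtime_common` dependency, `get_model` reads `SGLANG_USE_MODELSCOPE` as a case-insensitive boolean string. A hedged sketch of how the switch might be exercised; the model id below is an illustrative assumption and triggers a real download when the flag is set:

```python
# Illustrative only: opt into ModelScope model resolution before calling get_model.
import os

os.environ["SGLANG_USE_MODELSCOPE"] = "true"  # compared with .lower() == "true"

from sglang.bench_serving import get_model

# Example model id (assumption); with the flag set this resolves to a local
# snapshot path, otherwise get_model presumably returns the name unchanged.
local_path = get_model("qwen/Qwen2-7B-Instruct")
print(local_path)
```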
{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/bench_serving.py (continued)

```diff
@@ -693,6 +694,19 @@ def gen_prompt(tokenizer, token_num):
     return tokenizer.decode(selected_tokens)
 
 
+def get_gen_prefix_cache_path(args, tokenizer):
+    """Create cache directory under ~/.cache/sglang/benchmark"""
+    cache_dir = Path.home() / ".cache" / "sglang" / "benchmark"
+
+    # Create a unique cache filename based on the generation parameters
+    cache_key = (
+        f"gen_prefix_{args.gen_num_groups}_{args.gen_prompts_per_group}_"
+        f"{args.gen_system_prompt_len}_{args.gen_question_len}_{args.gen_output_len}_"
+        f"{tokenizer.__class__.__name__}.pkl"
+    )
+    return cache_dir / cache_key
+
+
 def sample_generated_shared_prefix_requests(
     num_groups: int,
     prompts_per_group: int,
@@ -701,12 +715,17 @@ def sample_generated_shared_prefix_requests(
     output_len: int,
     tokenizer: PreTrainedTokenizerBase,
 ) -> List[Tuple[str, int, int]]:
-
-
-
+    """Generate benchmark requests with shared system prompts using random tokens and caching."""
+    cache_path = get_gen_prefix_cache_path(args, tokenizer)
+
+    # Try to load from cache first
+    if cache_path.exists():
+        print(f"\nLoading cached generated input data from {cache_path}")
+        with open(cache_path, "rb") as f:
             return pickle.load(f)
 
-    "
+    print("\nGenerating new input data...")
+
     # Generate system prompts for each group
     system_prompts = []
     for _ in range(num_groups):
@@ -719,17 +738,16 @@
         question = gen_prompt(tokenizer, question_len)
         questions.append(question)
 
-    # Shuffle questions
-    random.shuffle(questions)
-
     # Combine system prompts with questions
     input_requests = []
     total_input_tokens = 0
     total_output_tokens = 0
 
-    for group_idx in range(num_groups):
+    for group_idx in tqdm(range(num_groups), desc="Generating system prompt"):
         system_prompt = system_prompts[group_idx]
-        for prompt_idx in
+        for prompt_idx in tqdm(
+            range(prompts_per_group), desc="Generating questions", leave=False
+        ):
             question = questions[group_idx * prompts_per_group + prompt_idx]
             full_prompt = f"{system_prompt}\n\n{question}"
             prompt_len = len(tokenizer.encode(full_prompt))
@@ -738,6 +756,10 @@
             total_input_tokens += prompt_len
             total_output_tokens += output_len
 
+    # Shuffle questions
+    random.shuffle(input_requests)
+
+    # Print statistics
     print(f"\nGenerated shared prefix dataset statistics:")
     print(f"Number of groups: {num_groups}")
     print(f"Prompts per group: {prompts_per_group}")
@@ -750,11 +772,12 @@
     print(
         f"Average question length: {sum(len(tokenizer.encode(q)) for q in questions) / len(questions):.1f} tokens\n"
    )
-
-
-
-
-
+
+    # Save to cache
+    cache_path.parent.mkdir(parents=True, exist_ok=True)
+    print(f"Caching generated input data to {cache_path}")
+    with open(cache_path, "wb") as f:
+        pickle.dump(input_requests, f)
 
     return input_requests
 
@@ -859,6 +882,7 @@ async def benchmark(
     tokenizer: PreTrainedTokenizerBase,
     input_requests: List[Tuple[str, int, int]],
     request_rate: float,
+    max_concurrency: Optional[int],
     disable_tqdm: bool,
     extra_request_body: Dict[str, Any],
     profile: bool,
@@ -868,6 +892,15 @@
     else:
         raise ValueError(f"Unknown backend: {backend}")
 
+    # From https://github.com/vllm-project/vllm/pull/9390
+    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
+
+    async def limited_request_func(request_func_input, pbar):
+        if semaphore is None:
+            return await request_func(request_func_input=request_func_input, pbar=pbar)
+        async with semaphore:
+            return await request_func(request_func_input=request_func_input, pbar=pbar)
+
     print("Starting initial single prompt test run...")
     test_prompt, test_prompt_len, test_output_len = input_requests[0]
     test_input = RequestFuncInput(
```
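The new `--max-concurrency` option is implemented as an optional `asyncio.Semaphore` wrapped around each request coroutine, so request arrival still follows `--request-rate` while the number of in-flight requests is capped. A self-contained sketch of the same limiter pattern; the worker coroutine and numbers are illustrative, not taken from the benchmark:

```python
# Minimal, self-contained sketch of the semaphore-based concurrency cap used above.
import asyncio
from typing import List, Optional


async def fake_request(i: int) -> int:
    # Stand-in for an HTTP request; sleeps instead of talking to a server.
    await asyncio.sleep(0.1)
    return i


async def run_all(n: int, max_concurrency: Optional[int]) -> List[int]:
    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None

    async def limited(i: int) -> int:
        if semaphore is None:
            return await fake_request(i)
        async with semaphore:  # at most max_concurrency requests in flight
            return await fake_request(i)

    tasks = [asyncio.create_task(limited(i)) for i in range(n)]
    return await asyncio.gather(*tasks)


if __name__ == "__main__":
    print(len(asyncio.run(run_all(100, max_concurrency=8))))
```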
{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/bench_serving.py (continued)

```diff
@@ -913,7 +946,7 @@
         )
         tasks.append(
             asyncio.create_task(
-
+                limited_request_func(request_func_input=request_func_input, pbar=pbar)
             )
         )
     outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
@@ -940,6 +973,12 @@
     print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
     print("{:<40} {:<10}".format("Backend:", backend))
     print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
+    print(
+        "{:<40} {:<10}".format(
+            "Max reqeuest concurrency:",
+            max_concurrency if max_concurrency else "not set",
+        )
+    )
     print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
     print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
     print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
@@ -1003,6 +1042,7 @@
         "backend": args.backend,
         "dataset_name": args.dataset_name,
         "request_rate": request_rate,
+        "max_concurrency": max_concurrency,
         "total_input_tokens": metrics.total_input,
         "total_output_tokens": metrics.total_output,
         "total_output_tokens_retokenized": metrics.total_output_retokenized,
@@ -1090,6 +1130,10 @@ def run_benchmark(args_: argparse.Namespace):
     global args
     args = args_
 
+    # Set default value for max_concurrency if not present
+    if not hasattr(args, "max_concurrency"):
+        args.max_concurrency = None
+
     # Set global environments
     set_ulimit()
     random.seed(args.seed)
@@ -1201,6 +1245,7 @@
             tokenizer=tokenizer,
             input_requests=input_requests,
             request_rate=args.request_rate,
+            max_concurrency=args.max_concurrency,
             disable_tqdm=args.disable_tqdm,
             extra_request_body=extra_request_body,
             profile=args.profile,
@@ -1220,6 +1265,7 @@
             tokenizer=tokenizer,
             input_requests=input_requests,
             request_rate=rate,
+            max_concurrency=args.max_concurrency,
             disable_tqdm=args.disable_tqdm,
             extra_request_body=extra_request_body,
             profile=args.profile,
@@ -1319,6 +1365,19 @@
         help="Number of requests per second. If this is inf, then all the requests are sent at time 0. "
         "Otherwise, we use Poisson process to synthesize the request arrival times. Default is inf.",
     )
+    parser.add_argument(
+        "--max-concurrency",
+        type=int,
+        default=None,
+        help="Maximum number of concurrent requests. This can be used "
+        "to help simulate an environment where a higher level component "
+        "is enforcing a maximum number of concurrent requests. While the "
+        "--request-rate argument controls the rate at which requests are "
+        "initiated, this argument will control how many are actually allowed "
+        "to execute at a time. This means that when used in combination, the "
+        "actual request rate may be lower than specified with --request-rate, "
+        "if the server is not processing requests fast enough to keep up.",
+    )
     parser.add_argument("--seed", type=int, default=1, help="The random seed.")
     parser.add_argument(
         "--multi",
@@ -1386,16 +1445,6 @@
         default=256,
         help="Target length in tokens for outputs in generated-shared-prefix dataset",
     )
-    parser.add_argument(
-        "--generated-input-save-path",
-        type=str,
-        help="Path to save generated input data",
-    )
-    parser.add_argument(
-        "--generated-input-path",
-        type=str,
-        help="Path to load previously generated input data",
-    )
     parser.add_argument(
         "--profile",
         action="store_true",
```
|
|
22
22
|
"hf_transfer",
|
23
23
|
"huggingface_hub",
|
24
24
|
"interegular",
|
25
|
+
"modelscope",
|
26
|
+
"orjson",
|
27
|
+
"outlines",
|
28
|
+
"packaging",
|
25
29
|
"psutil",
|
26
30
|
"pydantic",
|
27
31
|
"multipart",
|
28
32
|
"zmq",
|
33
|
+
"torchao",
|
29
34
|
"uvicorn",
|
30
35
|
"uvloop",
|
31
36
|
"vllm",
|
32
|
-
"
|
37
|
+
"xgrammar",
|
33
38
|
"openai",
|
34
39
|
"tiktoken",
|
35
40
|
"anthropic",
|
36
41
|
"litellm",
|
42
|
+
"decord",
|
37
43
|
]
|
38
44
|
|
39
45
|
|
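`PACKAGE_LIST` gains the new runtime dependencies (`modelscope`, `orjson`, `outlines`, `packaging`, `torchao`, `xgrammar`, `decord`) so their installed versions show up in the environment report. A rough sketch of how such a version report can be produced; this is not the actual `sglang/check_env.py` implementation, which collects considerably more (CUDA, GPU topology, etc.):

```python
# Hedged sketch: print installed versions for a list of packages.
from importlib import metadata

PACKAGE_LIST = ["torch", "modelscope", "orjson", "outlines", "xgrammar", "decord"]

for name in PACKAGE_LIST:
    try:
        print(f"{name}: {metadata.version(name)}")
    except metadata.PackageNotFoundError:
        print(f"{name}: not installed")
```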
{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/backend/runtime_endpoint.py

```diff
@@ -58,9 +58,9 @@ class RuntimeEndpoint(BaseBackend):
         )
         self._assert_success(res)
 
-    def
+    def get_server_info(self):
         res = http_request(
-            self.base_url + "/
+            self.base_url + "/get_server_info",
             api_key=self.api_key,
             verify=self.verify,
         )
```
{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/launch_server.py

```diff
@@ -1,6 +1,5 @@
 """Launch the inference server."""
 
-import os
 import sys
 
 from sglang.srt.server import launch_server
@@ -12,7 +11,5 @@ if __name__ == "__main__":
 
     try:
         launch_server(server_args)
-    except Exception as e:
-        raise e
     finally:
         kill_child_process()
```
{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/configs/model_config.py

```diff
@@ -1,27 +1,26 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 
 import json
 import logging
-import os
 from enum import IntEnum, auto
 from typing import List, Optional
 
 from transformers import PretrainedConfig
 
 from sglang.srt.hf_transformers_utils import get_config, get_context_length
+from sglang.srt.utils import get_bool_env_var
 
 logger = logging.getLogger(__name__)
 
```
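`model_config.py` now imports `get_bool_env_var` from `sglang.srt.utils` (part of the +208 lines there) and uses it in the context-length check below. A sketch of what such a helper typically looks like; the actual signature and accepted values in `sglang/srt/utils.py` may differ:

```python
# Hedged sketch of a boolean environment-variable helper; not copied from sglang.
import os


def get_bool_env_var(name: str, default: str = "false") -> bool:
    return os.getenv(name, default).lower() in ("true", "1")


# Mirrors how the hunk below gates the context-length override.
if get_bool_env_var("SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN"):
    print("context_length override allowed")
```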
{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/configs/model_config.py (continued)

```diff
@@ -60,13 +59,9 @@ class ModelConfig:
 
         # Derive context length
         derived_context_len = get_context_length(self.hf_text_config)
-        allow_long_context = os.environ.get(
-            "SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN", None
-        )
-
         if context_length is not None:
             if context_length > derived_context_len:
-                if
+                if get_bool_env_var("SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN"):
                     logger.warning(
                         f"Warning: User-specified context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
                         f"This may lead to incorrect model outputs or CUDA errors."
```
|