PyPI - sglang - Versions diffs - 0.5.4.post2__tar.gz → 0.5.4.post3__tar.gz - Mend

sglang 0.5.4.post2__tar.gz → 0.5.4.post3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of sglang might be problematic. Click here for more details.

Files changed (1108) hide show

{sglang-0.5.4.post2/sglang.egg-info → sglang-0.5.4.post3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.5.4.post2
+Version: 0.5.4.post3
 Summary: SGLang is a fast serving framework for large language models and vision language models.
 License:                                  Apache License
                                    Version 2.0, January 2004
@@ -222,7 +222,8 @@ Requires-Dist: decord2
 Requires-Dist: datasets
 Requires-Dist: einops
 Requires-Dist: fastapi
-Requires-Dist: flashinfer_python==0.4.1
+Requires-Dist: flashinfer_python==0.5.0
+Requires-Dist: flashinfer_cubin==0.5.0
 Requires-Dist: gguf
 Requires-Dist: hf_transfer
 Requires-Dist: huggingface_hub

{sglang-0.5.4.post2 → sglang-0.5.4.post3}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "sglang"
-version = "0.5.4.post2"
+version = "0.5.4.post3"
 description = "SGLang is a fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.10"
@@ -26,7 +26,8 @@ dependencies = [
   "datasets",
   "einops",
   "fastapi",
-  "flashinfer_python==0.4.1",
+  "flashinfer_python==0.5.0",
+  "flashinfer_cubin==0.5.0",
   "gguf",
   "hf_transfer",
   "huggingface_hub",

{sglang-0.5.4.post2 → sglang-0.5.4.post3}/sglang/bench_offline_throughput.py RENAMED Viewed

@@ -60,6 +60,8 @@ class BenchArgs:
     skip_warmup: bool = False
     do_not_exit: bool = False
     prompt_suffix: str = ""
+    return_logprob: bool = False
+    logprob_start_len: int = -1
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -187,6 +189,17 @@ class BenchArgs:
             default="",
             help="Suffix applied to the end of all user prompts, followed by assistant prompt suffix.",
         )
+        parser.add_argument(
+            "--return-logprob",
+            action="store_true",
+            help="Enable returning log probabilities.",
+        )
+        parser.add_argument(
+            "--logprob-start-len",
+            type=int,
+            default=-1,
+            help="Start length for logprob. -1 means only return logprobs for output tokens (default). 0 means return logprobs for all tokens including input.",
+        )
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -201,6 +214,8 @@ def throughput_test_once(
     ignore_eos: bool,
     extra_request_body: Dict,
     profile: bool,
+    return_logprob: bool = False,
+    logprob_start_len: int = -1,
 ):
     measurement_results = {
         "backend": backend_name,
@@ -233,7 +248,12 @@ def throughput_test_once(
         backend.start_profile()
     st = time.perf_counter()
-    gen_out = backend.generate(prompt=prompt, sampling_params=sampling_params)
+    gen_out = backend.generate(
+        prompt=prompt,
+        sampling_params=sampling_params,
+        return_logprob=return_logprob,
+        logprob_start_len=logprob_start_len,
+    )
     latency = time.perf_counter() - st
     if profile:
@@ -355,6 +375,8 @@ def throughput_test(
             ignore_eos=not bench_args.disable_ignore_eos,
             extra_request_body=extra_request_body,
             profile=False,
+            return_logprob=bench_args.return_logprob,
+            logprob_start_len=bench_args.logprob_start_len,
         )
         time.sleep(0.5)
@@ -366,6 +388,8 @@ def throughput_test(
         ignore_eos=not bench_args.disable_ignore_eos,
         extra_request_body=extra_request_body,
         profile=bench_args.profile,
+        return_logprob=bench_args.return_logprob,
+        logprob_start_len=bench_args.logprob_start_len,
     )
     backend.shutdown()

{sglang-0.5.4.post2 → sglang-0.5.4.post3}/sglang/bench_one_batch.py RENAMED Viewed

@@ -15,7 +15,7 @@ python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruc
 export SGLANG_TORCH_PROFILER_DIR=/root/sglang/profile_log
 python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 --input-len 256 --profile
 ## run with CUDA profiler (nsys):
-nsys profile --force-overwrite=true -o bench_one_batch python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 --input-len 256 --profile --profiler_activities CUDA_PROFILER
+nsys profile --force-overwrite=true -o bench_one_batch python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 --input-len 256 --profile --profile-activities CUDA_PROFILER
 # Usage (correctness test):
 python -m sglang.bench_one_batch --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
@@ -98,12 +98,12 @@ profile_activities = [torch.profiler.ProfilerActivity.CPU] + [
 ]
-def start_profile(profiler_activities, profile_record_shapes=False, rank_print=print):
+def start_profile(profile_activities, profile_record_shapes=False, rank_print=print):
     """
-    Abstracted function to start profiling based on profiler_activities.
+    Abstracted function to start profiling based on profile_activities.
     Returns profiler object (or None).
     """
-    if "CUDA_PROFILER" in profiler_activities:
+    if "CUDA_PROFILER" in profile_activities:
         try:
             torch.cuda.cudart().cudaProfilerStart()
             rank_print("CUDA Profiler started (nsys will begin capturing)")
@@ -112,9 +112,9 @@ def start_profile(profiler_activities, profile_record_shapes=False, rank_print=p
         return None
     else:
         activities = []
-        if "CPU" in profiler_activities:
+        if "CPU" in profile_activities:
             activities.append(torch.profiler.ProfilerActivity.CPU)
-        if "GPU" in profiler_activities:
+        if "GPU" in profile_activities:
             activities.append(torch.profiler.ProfilerActivity.CUDA)
         if activities:
             profiler = torch.profiler.profile(
@@ -129,17 +129,17 @@ def start_profile(profiler_activities, profile_record_shapes=False, rank_print=p
 def stop_profile(
     profiler,
-    profiler_activities,
+    profile_activities,
     rank_print=print,
     save_trace=False,
     trace_filename=None,
     stage=None,
 ):
     """
-    Abstracted function to stop profiling based on profiler_activities.
+    Abstracted function to stop profiling based on profile_activities.
     Optionally saves trace results and prints completion messages.
     """
-    if "CUDA_PROFILER" in profiler_activities:
+    if "CUDA_PROFILER" in profile_activities:
         try:
             torch.cuda.cudart().cudaProfilerStop()
             rank_print("CUDA Profiler stopped (nsys should dump traces)")
@@ -156,7 +156,7 @@ def stop_profile(
                 rank_print(
                     f"torch profiler chrome trace {stage_desc} saved to {trace_filename}"
                 )
-        if "CUDA_PROFILER" in profiler_activities:
+        if "CUDA_PROFILER" in profile_activities:
             rank_print(f"CUDA profiler trace for {stage} completed")
@@ -174,7 +174,7 @@ class BenchArgs:
     log_decode_step: int = 0
     profile: bool = False
     profile_record_shapes: bool = False
-    profiler_activities: Tuple[str] = ("CPU", "GPU")
+    profile_activities: Tuple[str] = ("CPU", "GPU")
     profile_stage: str = "all"
     profile_filename_prefix: str = "profile"
@@ -211,7 +211,7 @@ class BenchArgs:
             help="Record tensor shapes in profiling results.",
         )
         parser.add_argument(
-            "--profiler_activities",
+            "--profile-activities",
             type=str,
             nargs="+",
             default=["CPU", "GPU"],
@@ -507,7 +507,7 @@ def latency_test_run_once(
     log_decode_step,
     profile,
     profile_record_shapes,
-    profiler_activities,
+    profile_activities,
     profile_filename_prefix,
     profile_stage,
     tp_rank,
@@ -535,7 +535,7 @@ def latency_test_run_once(
     enable_profile_prefill = profile and profile_stage in ["all", "prefill"]
     if enable_profile_prefill:
         profiler = start_profile(
-            profiler_activities,
+            profile_activities,
             profile_record_shapes=profile_record_shapes,
             rank_print=rank_print,
         )
@@ -552,7 +552,7 @@ def latency_test_run_once(
         )
         stop_profile(
             profiler,
-            profiler_activities,
+            profile_activities,
             rank_print=rank_print,
             save_trace=True,
             trace_filename=trace_filename,
@@ -575,7 +575,7 @@ def latency_test_run_once(
         profiler = None
         if enable_profile_decode and i == profile_step_of_interest:
             profiler = start_profile(
-                profiler_activities,
+                profile_activities,
                 profile_record_shapes=profile_record_shapes,
                 rank_print=rank_print,
             )
@@ -591,7 +591,7 @@ def latency_test_run_once(
             )
             stop_profile(
                 profiler,
-                profiler_activities,
+                profile_activities,
                 rank_print=rank_print,
                 save_trace=True,
                 trace_filename=trace_filename,
@@ -666,7 +666,7 @@ def latency_test(
         log_decode_step=0,
         profile=False,
         profile_record_shapes=False,
-        profiler_activities=("CPU", "GPU"),
+        profile_activities=("CPU", "GPU"),
         profile_filename_prefix="",
         profile_stage="all",
         tp_rank=tp_rank,
@@ -716,7 +716,7 @@ def latency_test(
             bench_args.log_decode_step,
             bench_args.profile if tp_rank == 0 else None,
             bench_args.profile_record_shapes if tp_rank == 0 else None,
-            bench_args.profiler_activities,
+            bench_args.profile_activities,
             bench_args.profile_filename_prefix,
             bench_args.profile_stage,
             tp_rank,

{sglang-0.5.4.post2 → sglang-0.5.4.post3}/sglang/bench_serving.py RENAMED Viewed

@@ -25,6 +25,7 @@ import warnings
 from argparse import ArgumentParser
 from dataclasses import dataclass, field
 from datetime import datetime
+from functools import lru_cache
 from json import JSONDecodeError
 from pathlib import Path
 from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
@@ -614,7 +615,10 @@ async def async_request_profile(api_url: str) -> RequestFuncOutput:
     async with _create_bench_client_session() as session:
         output = RequestFuncOutput()
         try:
-            async with session.post(url=api_url) as response:
+            body = {
+                "activities": getattr(args, "profile_activities", []),
+            }
+            async with session.post(url=api_url, json=body) as response:
                 if response.status == 200:
                     output.success = True
                 else:
@@ -1484,9 +1488,15 @@ def sample_image_requests(
     return dataset
+@lru_cache(maxsize=1)
+def get_available_tokens(tokenizer):
+    """Get all available token ids from the tokenizer vocabulary."""
+    return list(tokenizer.get_vocab().values())
 def gen_prompt(tokenizer, token_num):
     """Generate a random prompt of specified token length using tokenizer vocabulary."""
-    all_available_tokens = list(tokenizer.get_vocab().values())
+    all_available_tokens = get_available_tokens(tokenizer)
     selected_tokens = random.choices(all_available_tokens, k=token_num)
     return tokenizer.decode(selected_tokens)
@@ -2029,6 +2039,9 @@ async def benchmark(
     print("{:<40} {:<10.2f}".format("Max ITL (ms):", metrics.max_itl_ms))
     print("=" * 50)
+    resp = requests.get(base_url + "/get_server_info", headers=get_auth_headers())
+    server_info = resp.json() if resp.status_code == 200 else None
     if (
         metrics.median_ttft_ms is not None
         and metrics.mean_itl_ms is not None
@@ -2045,6 +2058,8 @@ async def benchmark(
             "random_input_len": args.random_input_len,
             "random_output_len": args.random_output_len,
             "random_range_ratio": args.random_range_ratio,
+            # Information
+            "server_info": server_info,
             # Results
             "duration": benchmark_duration,
             "completed": metrics.completed,
@@ -2520,6 +2535,14 @@ if __name__ == "__main__":
         help="Use Torch Profiler. The endpoint must be launched with "
         "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
     )
+    # TODO unify all these
+    parser.add_argument(
+        "--profile-activities",
+        type=str,
+        nargs="+",
+        default=["CPU", "GPU"],
+        choices=["CPU", "GPU", "CUDA_PROFILER"],
+    )
     parser.add_argument(
         "--lora-name",
         type=str,

sglang 0.5.4.post2__tar.gz → 0.5.4.post3__tar.gz

Potentially problematic release.

sglang 0.5.4.post2__tar.gz → 0.5.4.post3__tar.gz