PyPI - sglang - Versions diffs - 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl - Mend

sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (358) hide show

sglang/bench_offline_throughput.py CHANGED Viewed

@@ -11,17 +11,20 @@ python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1
 """
 import argparse
+import asyncio
 import dataclasses
+import inspect
 import json
 import logging
 import os
 import random
 import time
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional
 import numpy as np
 from sglang.bench_serving import (
+    DatasetRow,
     get_dataset,
     get_tokenizer,
     sample_random_requests,
@@ -194,7 +197,7 @@ class BenchArgs:
 def throughput_test_once(
     backend_name: str,
     backend,
-    reqs: List[Tuple[str, int, int]],
+    reqs: List[DatasetRow],
     ignore_eos: bool,
     extra_request_body: Dict,
     profile: bool,
@@ -203,7 +206,7 @@ def throughput_test_once(
         "backend": backend_name,
         "successful_requests": len(reqs),
         "total_latency": -1,
-        "total_input_tokens": sum(r[1] for r in reqs),
+        "total_input_tokens": sum(r.prompt_len for r in reqs),
         "total_output_tokens": -1,
         "request_throughput": -1,
         "input_throughput": -1,
@@ -211,11 +214,11 @@ def throughput_test_once(
         "total_throughput": -1,
     }
-    prompt = [r[0] for r in reqs]
+    prompt = [r.prompt for r in reqs]
     sampling_params = [
         {
             "temperature": 0,
-            "max_new_tokens": r[2],
+            "max_new_tokens": r.output_len,
             "ignore_eos": ignore_eos,
             **extra_request_body,
         }
@@ -234,8 +237,10 @@ def throughput_test_once(
     latency = time.perf_counter() - st
     if profile:
+        dir = os.getenv("SGLANG_TORCH_PROFILER_DIR")
+        known_files = set(os.listdir(dir))
         backend.stop_profile()
-        monitor_trace_file(os.getenv("SGLANG_TORCH_PROFILER_DIR"))
+        monitor_trace_file(known_files, dir)
     if backend_name == "runtime":
         gen_out = json.loads(gen_out)
@@ -259,6 +264,10 @@ def throughput_test_once(
         measurement_results["total_input_tokens"]
         + measurement_results["total_output_tokens"]
     ) / latency
+    if inspect.isawaitable(server_info):
+        server_info = asyncio.run(server_info)
     measurement_results["last_gen_throughput"] = server_info["internal_states"][0][
         "last_gen_throughput"
     ]
@@ -266,12 +275,9 @@ def throughput_test_once(
     return measurement_results
-def monitor_trace_file(directory, interval=1):
+def monitor_trace_file(known_files, directory, interval=1):
     print(f"Monitoring {directory} for new trace files...")
-    known_files = set(os.listdir(directory))
     while True:
         flag = False
         time.sleep(interval)

sglang/bench_one_batch.py CHANGED Viewed

@@ -269,6 +269,7 @@ def _maybe_prepare_dp_attn_batch(batch: ScheduleBatch, model_runner):
             batch,
             dp_size=model_runner.server_args.dp_size,
             attn_tp_size=1,
+            moe_dense_tp_size=model_runner.server_args.moe_dense_tp_size,
             tp_cpu_group=model_runner.tp_group.cpu_group,
             get_idle_batch=None,
             disable_cuda_graph=model_runner.server_args.disable_cuda_graph,
@@ -372,10 +373,10 @@ def latency_test_run_once(
     # Prefill
     synchronize(device)
-    tic = time.time()
+    tic = time.perf_counter()
     next_token_ids, _, batch = extend(reqs, model_runner)
     synchronize(device)
-    prefill_latency = time.time() - tic
+    prefill_latency = time.perf_counter() - tic
     tot_latency += prefill_latency
     throughput = input_len * batch_size / prefill_latency
     rank_print(
@@ -388,10 +389,10 @@ def latency_test_run_once(
     decode_latencies = []
     for i in range(output_len - 1):
         synchronize(device)
-        tic = time.time()
+        tic = time.perf_counter()
         next_token_ids, _ = decode(next_token_ids, batch, model_runner)
         synchronize(device)
-        latency = time.time() - tic
+        latency = time.perf_counter() - tic
         tot_latency += latency
         throughput = batch_size / latency
         decode_latencies.append(latency)

sglang/bench_one_batch_server.py CHANGED Viewed

@@ -8,6 +8,7 @@ Usage:
 python3 -m sglang.bench_one_batch_server --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
 python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
+python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8 --show-report --profile --profile-by-stage
 """
 import argparse
@@ -19,9 +20,10 @@ import os
 import time
 from typing import Tuple
-import numpy as np
 import requests
+from sglang.bench_serving import get_tokenizer, sample_random_requests
+from sglang.profiler import run_profile
 from sglang.srt.entrypoints.http_server import launch_server
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import kill_process_tree
@@ -41,6 +43,8 @@ class BenchArgs:
     base_url: str = ""
     skip_warmup: bool = False
     show_report: bool = False
+    profile: bool = False
+    profile_by_stage: bool = False
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -67,6 +71,8 @@ class BenchArgs:
         parser.add_argument("--base-url", type=str, default=BenchArgs.base_url)
         parser.add_argument("--skip-warmup", action="store_true")
         parser.add_argument("--show-report", action="store_true")
+        parser.add_argument("--profile", action="store_true")
+        parser.add_argument("--profile-by-stage", action="store_true")
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -117,16 +123,21 @@ def run_one_case(
     input_len_step_percentage: float,
     run_name: str,
     result_filename: str,
+    tokenizer,
+    profile: bool = False,
+    profile_by_stage: bool = False,
 ):
     requests.post(url + "/flush_cache")
-    input_lens = [
-        int(input_len * (1 + (i - (batch_size - 1) / 2) * input_len_step_percentage))
-        for i in range(batch_size)
-    ]
-    input_ids = [
-        [int(x) for x in np.random.randint(0, high=16384, size=(input_lens[i],))]
-        for i in range(batch_size)
-    ]
+    input_requests = sample_random_requests(
+        input_len=input_len,
+        output_len=output_len,
+        num_prompts=batch_size,
+        range_ratio=1.0,
+        tokenizer=tokenizer,
+        dataset_path="",
+        random_sample=True,
+        return_text=False,
+    )
     use_structured_outputs = False
     if use_structured_outputs:
@@ -141,12 +152,17 @@ def run_one_case(
     else:
         json_schema = None
-    tic = time.time()
+    profile_link = None
+    if profile:
+        profile_link: str = run_profile(
+            url, 3, ["CPU", "GPU"], None, None, profile_by_stage
+        )
+    tic = time.perf_counter()
     response = requests.post(
         url + "/generate",
         json={
-            # "text": texts,
-            "input_ids": input_ids,
+            "input_ids": [req.prompt for req in input_requests],
             "sampling_params": {
                 "temperature": temperature,
                 "max_new_tokens": output_len,
@@ -175,9 +191,9 @@ def run_one_case(
                 or data["meta_info"]["finish_reason"]["type"] == "length"
             )
             if data["meta_info"]["completion_tokens"] == 1:
-                ttft = time.time() - tic
+                ttft = time.perf_counter() - tic
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
     input_throughput = batch_size * input_len / ttft
     output_throughput = batch_size * output_len / (latency - ttft)
     overall_throughput = batch_size * (input_len + output_len) / latency
@@ -191,8 +207,8 @@ def run_one_case(
     print(f"output_len: {output_len}")
     print(f"latency: {latency:.2f} s")
     print(f"ttft: {ttft:.2f} s")
-    print(f"Last generation throughput: {last_gen_throughput:.2f} tok/s")
-    print(f"Input throughput: {input_throughput:.2f} tok/s")
+    print(f"last generation throughput: {last_gen_throughput:.2f} tok/s")
+    print(f"input throughput: {input_throughput:.2f} tok/s")
     if output_len != 1:
         print(f"output throughput: {output_throughput:.2f} tok/s")
@@ -219,6 +235,7 @@ def run_one_case(
         overall_throughput,
         last_gen_throughput,
         acc_length,
+        profile_link if profile else None,
     )
@@ -228,6 +245,9 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
     else:
         proc, base_url = launch_server_process(server_args)
+    tokenizer_id = server_args.tokenizer_path or server_args.model_path
+    tokenizer = get_tokenizer(tokenizer_id)
     # warmup
     if not bench_args.skip_warmup:
         print("=" * 8 + " Warmup Begin " + "=" * 8)
@@ -241,11 +261,13 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
             input_len_step_percentage=bench_args.input_len_step_percentage,
             run_name="",
             result_filename="",
+            tokenizer=tokenizer,
         )
         print("=" * 8 + " Warmup End   " + "=" * 8 + "\n")
     # benchmark
     result = []
+    bench_result = []
     try:
         for bs, il, ol in itertools.product(
             bench_args.batch_size, bench_args.input_len, bench_args.output_len
@@ -261,8 +283,36 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
                     input_len_step_percentage=bench_args.input_len_step_percentage,
                     run_name=bench_args.run_name,
                     result_filename=bench_args.result_filename,
+                    tokenizer=tokenizer,
                 )
             )
+        if bench_args.profile:
+            try:
+                for bs, il, ol in itertools.product(
+                    bench_args.batch_size, bench_args.input_len, bench_args.output_len
+                ):
+                    bench_result.append(
+                        (
+                            run_one_case(
+                                base_url,
+                                bs,
+                                il,
+                                ol,
+                                temperature=bench_args.temperature,
+                                return_logprob=bench_args.return_logprob,
+                                input_len_step_percentage=bench_args.input_len_step_percentage,
+                                run_name=bench_args.run_name,
+                                result_filename=bench_args.result_filename,
+                                tokenizer=tokenizer,
+                                profile=bench_args.profile,
+                                profile_by_stage=bench_args.profile_by_stage,
+                            )[-1],
+                        )
+                    )
+                result = [t1[:-1] + t2 for t1, t2 in zip(result, bench_result)]
+            except Exception as e:
+                print(f"Error profiling, there will be no profile trace dump: {e}")
     finally:
         if proc:
             kill_process_tree(proc.pid)
@@ -272,8 +322,20 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
     if not bench_args.show_report:
         return
-    summary = " | batch size | latency (s) | input throughput (tok/s)  | output throughput (tok/s) | acc length | ITL (ms) | input price ($/1M) | output price ($/1M) |\n"
-    summary += "| ---------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ------------------ | ------------------- |\n"
+    summary = (
+        f"\nInput lens: {bench_args.input_len}. Output lens: {bench_args.output_len}.\n"
+    )
+    summary += "| batch size | latency (s) | input throughput (tok/s)  | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) |"
+    if bench_args.profile:
+        summary += " profile |"
+    summary += "\n"
+    summary += "| ---------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ |"
+    if bench_args.profile:
+        summary += "-------------|"
+    summary += "\n"
     for (
         batch_size,
@@ -284,6 +346,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
         overall_throughput,
         last_gen_throughput,
         acc_length,
+        trace_link,
     ) in result:
         hourly_cost = 2 * server_args.tp_size  # $2/hour for one H100
         input_util = 0.7
@@ -296,17 +359,18 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
             f"{accept_length} | "
             f"{1 / (output_throughput/batch_size) * 1000:.2f} | "
             f"{1e6 / (input_throughput * input_util) / 3600 * hourly_cost:.2f} | "
-            f"{1e6 / output_throughput / 3600 * hourly_cost:.2f} |\n"
+            f"{1e6 / output_throughput / 3600 * hourly_cost:.2f} |"
         )
+        if trace_link:
+            line += f" [Profile]({trace_link}) |"
+        line += "\n"
         summary += line
     # print metrics table
     print(summary)
     if is_in_ci():
-        write_github_step_summary(
-            f"### Test Nightly Benchmark (bench_one_batch) \n{summary}"
-        )
+        write_github_step_summary(summary)
 if __name__ == "__main__":

sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl

sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl