sglang 0.4.3.post1__py3-none-any.whl → 0.4.3.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219)
  1. sglang/api.py +1 -1
  2. sglang/bench_offline_throughput.py +19 -0
  3. sglang/bench_one_batch.py +2 -2
  4. sglang/bench_serving.py +123 -79
  5. sglang/global_config.py +8 -3
  6. sglang/lang/backend/runtime_endpoint.py +1 -1
  7. sglang/lang/ir.py +1 -1
  8. sglang/srt/_custom_ops.py +83 -91
  9. sglang/srt/configs/load_config.py +4 -1
  10. sglang/srt/configs/model_config.py +48 -2
  11. sglang/srt/configs/qwen2_5_vl_config.py +5 -2
  12. sglang/srt/constrained/base_grammar_backend.py +117 -15
  13. sglang/srt/constrained/llguidance_backend.py +151 -0
  14. sglang/srt/constrained/outlines_backend.py +24 -33
  15. sglang/srt/constrained/xgrammar_backend.py +69 -38
  16. sglang/srt/distributed/device_communicators/custom_all_reduce.py +225 -80
  17. sglang/srt/distributed/parallel_state.py +48 -3
  18. sglang/srt/entrypoints/engine.py +67 -9
  19. sglang/srt/entrypoints/http_server.py +190 -41
  20. sglang/srt/entrypoints/verl_engine.py +147 -0
  21. sglang/srt/function_call_parser.py +0 -1
  22. sglang/srt/layers/activation.py +11 -0
  23. sglang/srt/layers/attention/{__init__.py → base_attn_backend.py} +14 -6
  24. sglang/srt/layers/attention/double_sparsity_backend.py +1 -1
  25. sglang/srt/layers/attention/flashinfer_backend.py +208 -295
  26. sglang/srt/layers/attention/flashinfer_mla_backend.py +582 -0
  27. sglang/srt/layers/attention/torch_native_backend.py +1 -1
  28. sglang/srt/layers/attention/triton_backend.py +9 -6
  29. sglang/srt/layers/attention/triton_ops/decode_attention.py +3 -0
  30. sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -4
  31. sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +439 -0
  32. sglang/srt/layers/attention/utils.py +39 -0
  33. sglang/srt/layers/attention/vision.py +60 -63
  34. sglang/srt/layers/dp_attention.py +142 -1
  35. sglang/srt/layers/layernorm.py +1 -1
  36. sglang/srt/layers/linear.py +3 -1
  37. sglang/srt/layers/logits_processor.py +281 -45
  38. sglang/srt/layers/moe/ep_moe/kernels.py +126 -8
  39. sglang/srt/layers/moe/ep_moe/layer.py +140 -28
  40. sglang/srt/layers/moe/fused_moe_native.py +2 -0
  41. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  42. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  43. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +50 -50
  44. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +18 -18
  45. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +18 -18
  46. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +18 -18
  47. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +18 -18
  48. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +18 -18
  49. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +18 -18
  50. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +18 -18
  51. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +18 -18
  52. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +18 -18
  53. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +16 -16
  54. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +16 -16
  55. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +16 -16
  56. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +18 -18
  57. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +18 -18
  58. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +18 -18
  59. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +15 -15
  60. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +15 -15
  61. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +15 -15
  62. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +88 -20
  63. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -13
  64. sglang/srt/layers/moe/topk.py +13 -4
  65. sglang/srt/layers/quantization/__init__.py +111 -7
  66. sglang/srt/layers/quantization/blockwise_int8.py +409 -0
  67. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  68. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  69. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  70. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  71. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  72. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  73. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  75. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  76. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  77. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  78. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  80. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  81. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  82. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  83. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  84. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  85. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  86. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  87. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  88. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  89. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  90. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  91. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  92. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  93. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  94. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  95. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  96. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  97. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  98. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  99. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  100. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  101. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  102. sglang/srt/layers/quantization/fp8.py +69 -28
  103. sglang/srt/layers/quantization/fp8_utils.py +17 -1
  104. sglang/srt/layers/quantization/gptq.py +416 -0
  105. sglang/srt/layers/quantization/int8_kernel.py +327 -0
  106. sglang/srt/layers/quantization/int8_utils.py +73 -0
  107. sglang/srt/layers/quantization/modelopt_quant.py +18 -1
  108. sglang/srt/layers/radix_attention.py +1 -0
  109. sglang/srt/layers/rotary_embedding.py +0 -1
  110. sglang/srt/layers/sampler.py +76 -31
  111. sglang/srt/layers/vocab_parallel_embedding.py +14 -13
  112. sglang/srt/lora/lora.py +17 -1
  113. sglang/srt/lora/lora_config.py +5 -0
  114. sglang/srt/lora/lora_manager.py +1 -3
  115. sglang/srt/managers/cache_controller.py +193 -62
  116. sglang/srt/managers/configure_logging.py +2 -1
  117. sglang/srt/managers/data_parallel_controller.py +6 -2
  118. sglang/srt/managers/detokenizer_manager.py +124 -102
  119. sglang/srt/managers/image_processor.py +2 -1
  120. sglang/srt/managers/io_struct.py +143 -6
  121. sglang/srt/managers/schedule_batch.py +238 -197
  122. sglang/srt/managers/schedule_policy.py +29 -29
  123. sglang/srt/managers/scheduler.py +681 -259
  124. sglang/srt/managers/session_controller.py +6 -2
  125. sglang/srt/managers/tokenizer_manager.py +224 -68
  126. sglang/srt/managers/tp_worker.py +15 -4
  127. sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
  128. sglang/srt/mem_cache/chunk_cache.py +18 -11
  129. sglang/srt/mem_cache/hiradix_cache.py +394 -0
  130. sglang/srt/mem_cache/memory_pool.py +44 -18
  131. sglang/srt/mem_cache/radix_cache.py +58 -47
  132. sglang/srt/metrics/collector.py +94 -36
  133. sglang/srt/model_executor/cuda_graph_runner.py +55 -24
  134. sglang/srt/model_executor/forward_batch_info.py +49 -16
  135. sglang/srt/model_executor/model_runner.py +209 -28
  136. sglang/srt/model_loader/loader.py +3 -3
  137. sglang/srt/model_loader/weight_utils.py +36 -14
  138. sglang/srt/models/baichuan.py +31 -6
  139. sglang/srt/models/chatglm.py +39 -7
  140. sglang/srt/models/commandr.py +29 -5
  141. sglang/srt/models/dbrx.py +31 -5
  142. sglang/srt/models/deepseek.py +43 -6
  143. sglang/srt/models/deepseek_nextn.py +32 -19
  144. sglang/srt/models/deepseek_v2.py +265 -29
  145. sglang/srt/models/exaone.py +19 -9
  146. sglang/srt/models/gemma.py +22 -8
  147. sglang/srt/models/gemma2.py +25 -12
  148. sglang/srt/models/gemma2_reward.py +5 -1
  149. sglang/srt/models/gpt2.py +28 -13
  150. sglang/srt/models/gpt_bigcode.py +27 -5
  151. sglang/srt/models/granite.py +21 -9
  152. sglang/srt/models/grok.py +21 -4
  153. sglang/srt/models/internlm2.py +36 -6
  154. sglang/srt/models/internlm2_reward.py +5 -1
  155. sglang/srt/models/llama.py +26 -9
  156. sglang/srt/models/llama_classification.py +5 -1
  157. sglang/srt/models/llama_eagle.py +17 -4
  158. sglang/srt/models/llama_embedding.py +5 -1
  159. sglang/srt/models/llama_reward.py +7 -2
  160. sglang/srt/models/llava.py +19 -3
  161. sglang/srt/models/llavavid.py +10 -1
  162. sglang/srt/models/minicpm.py +26 -2
  163. sglang/srt/models/minicpm3.py +39 -3
  164. sglang/srt/models/minicpmv.py +45 -14
  165. sglang/srt/models/mixtral.py +20 -9
  166. sglang/srt/models/mixtral_quant.py +50 -8
  167. sglang/srt/models/mllama.py +57 -11
  168. sglang/srt/models/olmo.py +34 -6
  169. sglang/srt/models/olmo2.py +34 -13
  170. sglang/srt/models/olmoe.py +26 -4
  171. sglang/srt/models/phi3_small.py +29 -10
  172. sglang/srt/models/qwen.py +26 -3
  173. sglang/srt/models/qwen2.py +26 -4
  174. sglang/srt/models/qwen2_5_vl.py +46 -8
  175. sglang/srt/models/qwen2_eagle.py +17 -5
  176. sglang/srt/models/qwen2_moe.py +44 -6
  177. sglang/srt/models/qwen2_rm.py +78 -0
  178. sglang/srt/models/qwen2_vl.py +39 -8
  179. sglang/srt/models/stablelm.py +32 -5
  180. sglang/srt/models/torch_native_llama.py +5 -2
  181. sglang/srt/models/xverse.py +21 -9
  182. sglang/srt/models/xverse_moe.py +45 -7
  183. sglang/srt/models/yivl.py +2 -1
  184. sglang/srt/openai_api/adapter.py +109 -24
  185. sglang/srt/openai_api/protocol.py +17 -1
  186. sglang/srt/reasoning_parser.py +154 -0
  187. sglang/srt/sampling/penaltylib/__init__.py +4 -6
  188. sglang/srt/sampling/penaltylib/frequency_penalty.py +66 -0
  189. sglang/srt/sampling/penaltylib/{penalizers/min_new_tokens.py → min_new_tokens.py} +15 -23
  190. sglang/srt/sampling/penaltylib/orchestrator.py +39 -188
  191. sglang/srt/sampling/penaltylib/presence_penalty.py +66 -0
  192. sglang/srt/sampling/sampling_batch_info.py +79 -157
  193. sglang/srt/sampling/sampling_params.py +16 -13
  194. sglang/srt/server_args.py +136 -52
  195. sglang/srt/speculative/build_eagle_tree.py +2 -8
  196. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -1
  197. sglang/srt/speculative/eagle_utils.py +92 -58
  198. sglang/srt/speculative/eagle_worker.py +186 -94
  199. sglang/srt/speculative/spec_info.py +1 -13
  200. sglang/srt/utils.py +43 -17
  201. sglang/srt/warmup.py +47 -0
  202. sglang/test/few_shot_gsm8k.py +4 -1
  203. sglang/test/runners.py +389 -126
  204. sglang/test/send_one.py +88 -0
  205. sglang/test/test_block_fp8_ep.py +361 -0
  206. sglang/test/test_programs.py +1 -1
  207. sglang/test/test_utils.py +138 -84
  208. sglang/utils.py +50 -60
  209. sglang/version.py +1 -1
  210. {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/METADATA +21 -15
  211. {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/RECORD +214 -166
  212. {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/WHEEL +1 -1
  213. sglang/bench_latency.py +0 -1
  214. sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -75
  215. sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -74
  216. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -85
  217. sglang/test/srt/sampling/penaltylib/utils.py +0 -344
  218. {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/LICENSE +0 -0
  219. {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/top_level.txt +0 -0
sglang/api.py CHANGED
@@ -94,7 +94,7 @@ def gen(
     regex: Optional[str] = None,
     json_schema: Optional[str] = None,
 ):
-    """Call the model to generate. See the meaning of the arguments in docs/sampling_params.md"""
+    """Call the model to generate. See the meaning of the arguments in docs/backend/sampling_params.md"""
 
     if choices:
         return SglSelect(
sglang/bench_offline_throughput.py CHANGED
@@ -56,6 +56,7 @@ class BenchArgs:
     profile: bool = False
     skip_warmup: bool = False
     do_not_exit: bool = False
+    prompt_suffix: str = ""
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -177,6 +178,12 @@ class BenchArgs:
             action="store_true",
             help="Do not exit the program. This is useful for nsys profile with --duration and --delay.",
         )
+        parser.add_argument(
+            "--prompt-suffix",
+            type=str,
+            default="",
+            help="Suffix applied to the end of all user prompts, followed by assistant prompt suffix.",
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -216,6 +223,10 @@ def throughput_test_once(
     ]
 
     if profile:
+        assert (
+            "SGLANG_TORCH_PROFILER_DIR" in os.environ
+        ), "Please set SGLANG_TORCH_PROFILER_DIR."
+        os.makedirs(os.environ["SGLANG_TORCH_PROFILER_DIR"], exist_ok=True)
         backend.start_profile()
 
     st = time.perf_counter()
@@ -229,6 +240,8 @@
     if backend_name == "runtime":
         gen_out = json.loads(gen_out)
 
+    server_info = backend.get_server_info()
+
     measurement_results["total_latency"] = latency
     measurement_results["total_output_tokens"] = sum(
         o["meta_info"]["completion_tokens"] for o in gen_out
@@ -246,6 +259,7 @@
         measurement_results["total_input_tokens"]
         + measurement_results["total_output_tokens"]
     ) / latency
+    measurement_results["last_gen_throughput"] = server_info["last_gen_throughput"]
 
     return measurement_results
 
@@ -361,6 +375,11 @@
     print(
         "{:<40} {:<10}".format("Total generated tokens:", result["total_output_tokens"])
     )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Last generation throughput (tok/s):", result["last_gen_throughput"]
+        )
+    )
     print(
         "{:<40} {:<10.2f}".format(
             "Request throughput (req/s):", result["request_throughput"]
sglang/bench_one_batch.py CHANGED
@@ -230,7 +230,7 @@ def extend(reqs, model_runner):
     batch = ScheduleBatch.init_new(
         reqs=reqs,
         req_to_token_pool=model_runner.req_to_token_pool,
-        token_to_kv_pool=model_runner.token_to_kv_pool,
+        token_to_kv_pool_allocator=model_runner.token_to_kv_pool_allocator,
         tree_cache=None,
         model_config=model_runner.model_config,
         enable_overlap=False,
@@ -326,7 +326,7 @@ def latency_test_run_once(
 
     # Clear the pools.
     model_runner.req_to_token_pool.clear()
-    model_runner.token_to_kv_pool.clear()
+    model_runner.token_to_kv_pool_allocator.clear()
 
     measurement_results = {
         "run_name": run_name,
sglang/bench_serving.py CHANGED
@@ -8,7 +8,6 @@ Usage:
    python3 -m sglang.bench_serving --backend sglang --num-prompt 10
 
    python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 3000 --random-input 1024 --random-output 1024 --random-range-ratio 0.5
-   python3 -m sglang.bench_serving --backend sglang --dataset-name random --request-rate-range 1,2,4,8,16,32 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 --multi
 """
 
 import argparse
@@ -40,6 +39,7 @@ from transformers import (
 )
 
 AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
+ASSISTANT_SUFFIX = "Assistant:"
 
 global args
 
@@ -71,7 +71,19 @@ def remove_prefix(text: str, prefix: str) -> str:
     return text[len(prefix) :] if text.startswith(prefix) else text
 
 
-# trt llm not support ignore_eos
+def remove_suffix(text: str, suffix: str) -> str:
+    return text[: -len(suffix)] if text.endswith(suffix) else text
+
+
+def get_auth_headers() -> Dict[str, str]:
+    api_key = os.environ.get("OPENAI_API_KEY")
+    if api_key:
+        return {"Authorization": f"Bearer {api_key}"}
+    else:
+        return {}
+
+
+# trt llm does not support ignore_eos
 # https://github.com/triton-inference-server/tensorrtllm_backend/issues/505
 async def async_request_trt_llm(
     request_func_input: RequestFuncInput,
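Note on get_auth_headers() above: the OpenAI-style request functions previously always sent "Authorization: Bearer None" when OPENAI_API_KEY was unset; the helper now omits the header entirely. A small sketch of both branches (assumes the helper is importable from sglang.bench_serving):

    import os
    from sglang.bench_serving import get_auth_headers

    os.environ.pop("OPENAI_API_KEY", None)
    assert get_auth_headers() == {}  # no key -> no Authorization header at all

    os.environ["OPENAI_API_KEY"] = "sk-example"  # made-up key
    assert get_auth_headers() == {"Authorization": "Bearer sk-example"}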
@@ -165,12 +177,13 @@ async def async_request_openai_completions(
         "ignore_eos": not args.disable_ignore_eos,
         **request_func_input.extra_request_body,
     }
-    headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
+    headers = get_auth_headers()
 
     output = RequestFuncOutput()
     output.prompt_len = request_func_input.prompt_len
 
     generated_text = ""
+    output_len = request_func_input.output_len
     ttft = 0.0
     st = time.perf_counter()
     most_recent_timestamp = st
@@ -207,11 +220,14 @@
 
                         most_recent_timestamp = timestamp
                         generated_text += data["choices"][0]["text"]
+                        output_len = data.get("usage", {}).get(
+                            "completion_tokens", output_len
+                        )
 
                 output.generated_text = generated_text
                 output.success = True
                 output.latency = latency
-                output.output_len = request_func_input.output_len
+                output.output_len = output_len
             else:
                 output.error = response.reason or ""
                 output.success = False
@@ -244,7 +260,7 @@ async def async_request_truss(
         "ignore_eos": not args.disable_ignore_eos,
         **request_func_input.extra_request_body,
     }
-    headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
+    headers = get_auth_headers()
 
     output = RequestFuncOutput()
     output.prompt_len = request_func_input.prompt_len
@@ -325,15 +341,17 @@ async def async_request_sglang_generate(
         "logprob_start_len": -1,
         **request_func_input.extra_request_body,
     }
-    headers = {}
+    headers = get_auth_headers()
 
     output = RequestFuncOutput()
     output.prompt_len = request_func_input.prompt_len
 
     generated_text = ""
+    output_len = request_func_input.output_len
     ttft = 0.0
     st = time.perf_counter()
     most_recent_timestamp = st
+    last_output_len = 0
     try:
         async with session.post(
             url=api_url, json=payload, headers=headers
@@ -357,6 +375,9 @@
                     # want to check a token was generated
                     if data["text"]:
                         timestamp = time.perf_counter()
+                        generated_text = data["text"]
+                        output_len = data["meta_info"]["completion_tokens"]
+
                         # First token
                         if ttft == 0.0:
                             ttft = time.perf_counter() - st
@@ -364,15 +385,21 @@
 
                         # Decoding phase
                         else:
-                            output.itl.append(timestamp - most_recent_timestamp)
+                            num_new_tokens = output_len - last_output_len
+                            if num_new_tokens == 0:
+                                continue
+                            adjust_itl = (
+                                timestamp - most_recent_timestamp
+                            ) / num_new_tokens
+                            output.itl.extend([adjust_itl] * num_new_tokens)
 
                         most_recent_timestamp = timestamp
-                        generated_text = data["text"]
+                        last_output_len = output_len
 
                 output.generated_text = generated_text
                 output.success = True
                 output.latency = latency
-                output.output_len = request_func_input.output_len
+                output.output_len = output_len
             else:
                 output.error = response.reason or ""
                 output.success = False
@@ -380,6 +407,7 @@
         output.success = False
         exc_info = sys.exc_info()
         output.error = "".join(traceback.format_exception(*exc_info))
+        print(f"{output.error=}")
 
     if pbar:
         pbar.update(1)
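Note on the ITL change above: when one streamed chunk advances completion_tokens by more than one, the elapsed gap is now split evenly across the new tokens instead of being recorded as a single large inter-token latency. A self-contained sketch with made-up timestamps:

    # One chunk arrives 30 ms after the previous one and carries 3 new tokens.
    timestamp, most_recent_timestamp = 0.230, 0.200  # seconds (made-up values)
    output_len, last_output_len = 7, 4  # completion_tokens advanced by 3

    itl = []
    num_new_tokens = output_len - last_output_len
    adjust_itl = (timestamp - most_recent_timestamp) / num_new_tokens
    itl.extend([adjust_itl] * num_new_tokens)
    print(itl)  # three ~0.010 s samples instead of one 0.030 s sample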
@@ -453,6 +481,7 @@ def get_dataset(args, tokenizer):
             tokenizer=tokenizer,
             fixed_output_len=args.sharegpt_output_len,
             context_len=args.sharegpt_context_len,
+            prompt_suffix=args.prompt_suffix,
             apply_chat_template=args.apply_chat_template,
         )
     elif args.dataset_name == "random":
@@ -513,7 +542,9 @@ class BenchmarkMetrics:
     mean_itl_ms: float
     median_itl_ms: float
     std_itl_ms: float
+    p95_itl_ms: float
     p99_itl_ms: float
+    max_itl_ms: float
     mean_e2e_latency_ms: float
     median_e2e_latency_ms: float
     std_e2e_latency_ms: float
@@ -564,6 +595,7 @@ def sample_sharegpt_requests(
     tokenizer: PreTrainedTokenizerBase,
     fixed_output_len: Optional[int] = None,
     context_len: Optional[int] = None,
+    prompt_suffix: Optional[str] = "",
     apply_chat_template=False,
 ) -> List[Tuple[str, int, int]]:
     if fixed_output_len is not None and fixed_output_len < 4:
@@ -576,11 +608,19 @@
     # Load the dataset.
     with open(dataset_path) as f:
         dataset = json.load(f)
+
     # Filter out the conversations with less than 2 turns.
-    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
+    dataset = [
+        data
+        for data in dataset
+        if len(data.get("conversations", data.get("conversation", []))) >= 2
+    ]
     # Only keep the first two turns of each conversation.
     dataset = [
-        (data["conversations"][0]["value"], data["conversations"][1]["value"])
+        (
+            data.get("conversations", data.get("conversation", []))[0]["value"],
+            data.get("conversations", data.get("conversation", []))[1]["value"],
+        )
         for data in dataset
     ]
 
@@ -595,6 +635,12 @@
 
         # Tokenize the prompts and completions.
         prompt = dataset[i][0]
+        if prompt_suffix:
+            prompt = (
+                remove_suffix(prompt, ASSISTANT_SUFFIX)
+                + prompt_suffix
+                + ASSISTANT_SUFFIX
+            )
 
         if apply_chat_template:
             prompt = tokenizer.apply_chat_template(
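Note on the prompt_suffix handling above: the suffix is spliced in before a trailing "Assistant:" marker rather than appended after it, so the assistant cue stays last. A quick illustration with a made-up ShareGPT-style prompt:

    ASSISTANT_SUFFIX = "Assistant:"

    def remove_suffix(text: str, suffix: str) -> str:
        return text[: -len(suffix)] if text.endswith(suffix) else text

    prompt = "Human: What is 2 + 2?\n\nAssistant:"  # made-up example
    prompt_suffix = " Answer in one word."
    prompt = remove_suffix(prompt, ASSISTANT_SUFFIX) + prompt_suffix + ASSISTANT_SUFFIX
    print(prompt)  # ends with " Answer in one word.Assistant:"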
@@ -658,10 +704,17 @@
         with open(dataset_path) as f:
             dataset = json.load(f)
         # Filter out the conversations with less than 2 turns.
-        dataset = [data for data in dataset if len(data["conversations"]) >= 2]
+        dataset = [
+            data
+            for data in dataset
+            if len(data.get("conversations", data.get("conversation", []))) >= 2
+        ]
         # Only keep the first two turns of each conversation.
         dataset = [
-            (data["conversations"][0]["value"], data["conversations"][1]["value"])
+            (
+                data.get("conversations", data.get("conversation", []))[0]["value"],
+                data.get("conversations", data.get("conversation", []))[1]["value"],
+            )
             for data in dataset
         ]
         # Shuffle the dataset.
@@ -887,7 +940,9 @@ def calculate_metrics(
         mean_itl_ms=np.mean(itls or 0) * 1000,
         median_itl_ms=np.median(itls or 0) * 1000,
         std_itl_ms=np.std(itls or 0) * 1000,
+        p95_itl_ms=np.percentile(itls or 0, 95) * 1000,
         p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
+        max_itl_ms=np.max(itls or 0) * 1000,
         mean_e2e_latency_ms=np.mean(e2e_latencies) * 1000,
         median_e2e_latency_ms=np.median(e2e_latencies) * 1000,
         std_e2e_latency_ms=np.std(e2e_latencies) * 1000,
@@ -911,6 +966,7 @@
     lora_name: str,
     extra_request_body: Dict[str, Any],
     profile: bool,
+    pd_seperated: bool = False,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -996,6 +1052,17 @@
     if pbar is not None:
         pbar.close()
 
+    if "sglang" in backend:
+        server_info = requests.get(base_url + "/get_server_info")
+        if pd_seperated:
+            accept_length = server_info.json()["decode"][0].get(
+                "avg_spec_accept_length", None
+            )
+        else:
+            accept_length = server_info.json().get("avg_spec_accept_length", None)
+    else:
+        accept_length = None
+
     # Compute metrics and print results
     benchmark_duration = time.perf_counter() - benchmark_start_time
     metrics, output_lens = calculate_metrics(
@@ -1045,6 +1112,8 @@
         )
     )
     print("{:<40} {:<10.2f}".format("Concurrency:", metrics.concurrency))
+    if accept_length:
+        print("{:<40} {:<10.2f}".format("Accept length:", accept_length))
     print("{s:{c}^{n}}".format(s="End-to-End Latency", n=50, c="-"))
     print(
         "{:<40} {:<10.2f}".format("Mean E2E Latency (ms):", metrics.mean_e2e_latency_ms)
@@ -1058,16 +1127,12 @@
     print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms))
     print("{:<40} {:<10.2f}".format("Median TTFT (ms):", metrics.median_ttft_ms))
     print("{:<40} {:<10.2f}".format("P99 TTFT (ms):", metrics.p99_ttft_ms))
-    print(
-        "{s:{c}^{n}}".format(s="Time per Output Token (excl. 1st token)", n=50, c="-")
-    )
-    print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", metrics.mean_tpot_ms))
-    print("{:<40} {:<10.2f}".format("Median TPOT (ms):", metrics.median_tpot_ms))
-    print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
-    print("{s:{c}^{n}}".format(s="Inter-token Latency", n=50, c="-"))
+    print("{s:{c}^{n}}".format(s="Inter-Token Latency", n=50, c="-"))
     print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms))
     print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms))
+    print("{:<40} {:<10.2f}".format("P95 ITL (ms):", metrics.p95_itl_ms))
     print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms))
+    print("{:<40} {:<10.2f}".format("Max ITL (ms):", metrics.max_itl_ms))
     print("=" * 50)
 
     if (
@@ -1109,8 +1174,10 @@
             "mean_itl_ms": metrics.mean_itl_ms,
             "median_itl_ms": metrics.median_itl_ms,
             "std_itl_ms": metrics.std_itl_ms,
+            "p95_itl_ms": metrics.p95_itl_ms,
             "p99_itl_ms": metrics.p99_itl_ms,
             "concurrency": metrics.concurrency,
+            "accept_length": accept_length,
         }
     else:
         print(f"Error running benchmark for request rate: {request_rate}")
@@ -1143,14 +1210,6 @@
     return result
 
 
-def parse_request_rate_range(request_rate_range):
-    if len(request_rate_range.split(",")) == 3:
-        start, stop, step = map(int, request_rate_range.split(","))
-        return list(range(start, stop, step))
-    else:
-        return list(map(int, request_rate_range.split(",")))
-
-
 def check_chat_template(model_path):
     try:
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
@@ -1160,6 +1219,12 @@
         return False
 
 
+def set_global_args(args_: argparse.Namespace):
+    """Set the global args."""
+    global args
+    args = args_
+
+
 def run_benchmark(args_: argparse.Namespace):
     global args
     args = args_
@@ -1168,6 +1233,8 @@
     if not hasattr(args, "max_concurrency"):
         args.max_concurrency = None
 
+    print(f"benchmark_args={args}")
+
     # Set global environments
     set_ulimit()
     random.seed(args.seed)
@@ -1238,7 +1305,7 @@
             )
             sys.exit(1)
         try:
-            response = requests.get(model_url)
+            response = requests.get(model_url, headers=get_auth_headers())
             model_list = response.json().get("data", [])
             args.model = model_list[0]["id"] if model_list else None
         except Exception as e:
@@ -1264,49 +1331,26 @@
     backend = args.backend
     model_id = args.model
     tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
-
     tokenizer = get_tokenizer(tokenizer_id)
-
     input_requests = get_dataset(args, tokenizer)
 
-    if not args.multi:
-        return asyncio.run(
-            benchmark(
-                backend=backend,
-                api_url=api_url,
-                base_url=base_url,
-                model_id=model_id,
-                tokenizer=tokenizer,
-                input_requests=input_requests,
-                request_rate=args.request_rate,
-                max_concurrency=args.max_concurrency,
-                disable_tqdm=args.disable_tqdm,
-                lora_name=args.lora_name,
-                extra_request_body=extra_request_body,
-                profile=args.profile,
-            )
+    return asyncio.run(
+        benchmark(
+            backend=backend,
+            api_url=api_url,
+            base_url=base_url,
+            model_id=model_id,
+            tokenizer=tokenizer,
+            input_requests=input_requests,
+            request_rate=args.request_rate,
+            max_concurrency=args.max_concurrency,
+            disable_tqdm=args.disable_tqdm,
+            lora_name=args.lora_name,
+            extra_request_body=extra_request_body,
+            profile=args.profile,
+            pd_seperated=args.pd_seperated,
         )
-    else:
-        # Benchmark multiple rps. TODO: use a fixed duration to compute num_prompts
-        request_rates = parse_request_rate_range(args.request_rate_range)
-
-        for rate in request_rates:
-            asyncio.run(
-                benchmark(
-                    backend=backend,
-                    api_url=api_url,
-                    base_url=base_url,
-                    model_id=model_id,
-                    tokenizer=tokenizer,
-                    input_requests=input_requests,
-                    request_rate=rate,
-                    max_concurrency=args.max_concurrency,
-                    disable_tqdm=args.disable_tqdm,
-                    lora_name=args.lora_name,
-                    extra_request_body=extra_request_body,
-                    profile=args.profile,
-                )
-            )
+    )
 
 
 def set_ulimit(target_soft_limit=65535):
@@ -1420,17 +1464,6 @@ if __name__ == "__main__":
         "actual request rate may be lower than specified with --request-rate, "
         "if the server is not processing requests fast enough to keep up.",
     )
-    parser.add_argument(
-        "--multi",
-        action="store_true",
-        help="Use request rate range rather than single value.",
-    )
-    parser.add_argument(
-        "--request-rate-range",
-        type=str,
-        default="2,34,2",
-        help="Range of request rates in the format start,stop,step. Default is 2,34,2. It also supports a list of request rates, requiring the parameters to not equal three.",
-    )
     parser.add_argument("--output-file", type=str, help="Output JSONL file name.")
     parser.add_argument(
         "--disable-tqdm",
@@ -1477,6 +1510,17 @@
         default=None,
         help="The name of LoRA adapter",
     )
+    parser.add_argument(
+        "--prompt-suffix",
+        type=str,
+        default="",
+        help="Suffix applied to the end of all user prompts, followed by assistant prompt suffix.",
+    )
+    parser.add_argument(
+        "--pd-seperated",
+        action="store_true",
+        help="Benchmark PD disaggregation server",
+    )
 
     group = parser.add_argument_group("generated-shared-prefix dataset arguments")
     group.add_argument(
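Note on the removed --multi / --request-rate-range flags: a caller that still wants a request-rate sweep can loop over run_benchmark itself. A hedged sketch (base_args stands in for an argparse.Namespace built from this parser; it is not a name defined in the module):

    import copy

    from sglang.bench_serving import run_benchmark

    for rate in (1, 2, 4, 8, 16, 32):  # example sweep, like the old --multi list form
        sweep_args = copy.deepcopy(base_args)  # base_args: assumed Namespace
        sweep_args.request_rate = rate
        run_benchmark(sweep_args)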
sglang/global_config.py CHANGED
@@ -4,6 +4,13 @@ import os
 
 
 class GlobalConfig:
+    """
+    Store some global constants.
+
+    See also python/sglang/srt/managers/schedule_batch.py::global_server_args_dict, which stores
+    many global runtime arguments as well.
+    """
+
     def __init__(self):
         # Verbosity level
         # 0: do not output anything
@@ -34,11 +41,9 @@ class GlobalConfig:
         self.skip_special_tokens_in_output = True
         self.spaces_between_special_tokens_in_out = True
 
-        # Interpreter optimization configs
+        # Language frontend interpreter optimization configs
         self.enable_precache_with_tracing = True
         self.enable_parallel_encoding = True
 
-        self.enable_flashinfer_mla = False
-
 
 global_config = GlobalConfig()
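Note on the GlobalConfig docstring above: the module exposes one shared instance, so callers mutate attributes on global_config rather than constructing their own. A minimal sketch using a flag visible in this diff:

    from sglang.global_config import global_config

    # Flip one of the language frontend interpreter optimization flags shown above.
    global_config.enable_precache_with_tracing = False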
sglang/lang/backend/runtime_endpoint.py CHANGED
@@ -336,7 +336,7 @@ class Runtime:
     """
     A wrapper for the HTTP server.
     This is used for launching the server in a python program without
-    using the commond line interface.
+    using the command line interface.
 
     It is mainly used for the frontend language.
     You should use the Engine class if you want to do normal offline processing without the frontend language.
sglang/lang/ir.py CHANGED
@@ -457,7 +457,7 @@ class SglGen(SglExpr):
         regex: Optional[str] = None,
         json_schema: Optional[str] = None,
     ):
-        """Call the model to generate. See the meaning of the arguments in docs/sampling_params.md"""
+        """Call the model to generate. See the meaning of the arguments in docs/backend/sampling_params.md"""
         super().__init__()
         self.name = name
         self.sampling_params = SglSamplingParams(