PyPI - sglang - Versions diffs - 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl - Mend

sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (359) hide show

sglang/__init__.py CHANGED Viewed

@@ -15,6 +15,7 @@ from sglang.api import (
     get_server_info,
     image,
     select,
+    separate_reasoning,
     set_default_backend,
     system,
     system_begin,
@@ -54,6 +55,7 @@ __all__ = [
     "get_server_info",
     "image",
     "select",
+    "separate_reasoning",
     "set_default_backend",
     "system",
     "system_begin",

sglang/api.py CHANGED Viewed

@@ -15,6 +15,7 @@ from sglang.lang.ir import (
     SglRoleBegin,
     SglRoleEnd,
     SglSelect,
+    SglSeparateReasoning,
     SglVideo,
 )
@@ -277,3 +278,9 @@ def assistant_begin():
 def assistant_end():
     return SglRoleEnd("assistant")
+def separate_reasoning(
+    expr: Optional[SglExpr] = None, model_type: Optional[str] = None
+):
+    return SglExprList([expr, SglSeparateReasoning(model_type, expr=expr)])

sglang/bench_offline_throughput.py CHANGED Viewed

@@ -11,7 +11,9 @@ python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1
 """
 import argparse
+import asyncio
 import dataclasses
+import inspect
 import json
 import logging
 import os
@@ -235,8 +237,10 @@ def throughput_test_once(
     latency = time.perf_counter() - st
     if profile:
+        dir = os.getenv("SGLANG_TORCH_PROFILER_DIR")
+        known_files = set(os.listdir(dir))
         backend.stop_profile()
-        monitor_trace_file(os.getenv("SGLANG_TORCH_PROFILER_DIR"))
+        monitor_trace_file(known_files, dir)
     if backend_name == "runtime":
         gen_out = json.loads(gen_out)
@@ -260,6 +264,10 @@ def throughput_test_once(
         measurement_results["total_input_tokens"]
         + measurement_results["total_output_tokens"]
     ) / latency
+    if inspect.isawaitable(server_info):
+        server_info = asyncio.run(server_info)
     measurement_results["last_gen_throughput"] = server_info["internal_states"][0][
         "last_gen_throughput"
     ]
@@ -267,11 +275,9 @@ def throughput_test_once(
     return measurement_results
-def monitor_trace_file(directory, interval=1):
+def monitor_trace_file(known_files, directory, interval=1):
     print(f"Monitoring {directory} for new trace files...")
-    known_files = set(os.listdir(directory))
     while True:
         flag = False
         time.sleep(interval)

sglang/bench_one_batch_server.py CHANGED Viewed

@@ -8,6 +8,7 @@ Usage:
 python3 -m sglang.bench_one_batch_server --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
 python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
+python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8 --show-report --profile --profile-by-stage
 """
 import argparse
@@ -19,10 +20,10 @@ import os
 import time
 from typing import Tuple
-import numpy as np
 import requests
 from sglang.bench_serving import get_tokenizer, sample_random_requests
+from sglang.profiler import run_profile
 from sglang.srt.entrypoints.http_server import launch_server
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import kill_process_tree
@@ -42,6 +43,8 @@ class BenchArgs:
     base_url: str = ""
     skip_warmup: bool = False
     show_report: bool = False
+    profile: bool = False
+    profile_by_stage: bool = False
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -68,6 +71,8 @@ class BenchArgs:
         parser.add_argument("--base-url", type=str, default=BenchArgs.base_url)
         parser.add_argument("--skip-warmup", action="store_true")
         parser.add_argument("--show-report", action="store_true")
+        parser.add_argument("--profile", action="store_true")
+        parser.add_argument("--profile-by-stage", action="store_true")
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -93,8 +98,8 @@ def launch_server_process(server_args: ServerArgs):
     base_url = f"http://{server_args.host}:{server_args.port}"
     timeout = 600
-    start_time = time.perf_counter()
-    while time.perf_counter() - start_time < timeout:
+    start_time = time.time()
+    while time.time() - start_time < timeout:
         try:
             headers = {
                 "Content-Type": "application/json; charset=utf-8",
@@ -119,6 +124,8 @@ def run_one_case(
     run_name: str,
     result_filename: str,
     tokenizer,
+    profile: bool = False,
+    profile_by_stage: bool = False,
 ):
     requests.post(url + "/flush_cache")
     input_requests = sample_random_requests(
@@ -145,6 +152,12 @@ def run_one_case(
     else:
         json_schema = None
+    profile_link = None
+    if profile:
+        profile_link: str = run_profile(
+            url, 3, ["CPU", "GPU"], None, None, profile_by_stage
+        )
     tic = time.perf_counter()
     response = requests.post(
         url + "/generate",
@@ -194,8 +207,8 @@ def run_one_case(
     print(f"output_len: {output_len}")
     print(f"latency: {latency:.2f} s")
     print(f"ttft: {ttft:.2f} s")
-    print(f"Last generation throughput: {last_gen_throughput:.2f} tok/s")
-    print(f"Input throughput: {input_throughput:.2f} tok/s")
+    print(f"last generation throughput: {last_gen_throughput:.2f} tok/s")
+    print(f"input throughput: {input_throughput:.2f} tok/s")
     if output_len != 1:
         print(f"output throughput: {output_throughput:.2f} tok/s")
@@ -222,6 +235,7 @@ def run_one_case(
         overall_throughput,
         last_gen_throughput,
         acc_length,
+        profile_link if profile else None,
     )
@@ -253,6 +267,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
     # benchmark
     result = []
+    bench_result = []
     try:
         for bs, il, ol in itertools.product(
             bench_args.batch_size, bench_args.input_len, bench_args.output_len
@@ -271,6 +286,33 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
                     tokenizer=tokenizer,
                 )
             )
+        if bench_args.profile:
+            try:
+                for bs, il, ol in itertools.product(
+                    bench_args.batch_size, bench_args.input_len, bench_args.output_len
+                ):
+                    bench_result.append(
+                        (
+                            run_one_case(
+                                base_url,
+                                bs,
+                                il,
+                                ol,
+                                temperature=bench_args.temperature,
+                                return_logprob=bench_args.return_logprob,
+                                input_len_step_percentage=bench_args.input_len_step_percentage,
+                                run_name=bench_args.run_name,
+                                result_filename=bench_args.result_filename,
+                                tokenizer=tokenizer,
+                                profile=bench_args.profile,
+                                profile_by_stage=bench_args.profile_by_stage,
+                            )[-1],
+                        )
+                    )
+                result = [t1[:-1] + t2 for t1, t2 in zip(result, bench_result)]
+            except Exception as e:
+                print(f"Error profiling, there will be no profile trace dump: {e}")
     finally:
         if proc:
             kill_process_tree(proc.pid)
@@ -280,8 +322,20 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
     if not bench_args.show_report:
         return
-    summary = " | batch size | latency (s) | input throughput (tok/s)  | output throughput (tok/s) | acc length | ITL (ms) | input price ($/1M) | output price ($/1M) |\n"
-    summary += "| ---------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ------------------ | ------------------- |\n"
+    summary = (
+        f"\nInput lens: {bench_args.input_len}. Output lens: {bench_args.output_len}.\n"
+    )
+    summary += "| batch size | latency (s) | input throughput (tok/s)  | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) |"
+    if bench_args.profile:
+        summary += " profile |"
+    summary += "\n"
+    summary += "| ---------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ |"
+    if bench_args.profile:
+        summary += "-------------|"
+    summary += "\n"
     for (
         batch_size,
@@ -292,6 +346,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
         overall_throughput,
         last_gen_throughput,
         acc_length,
+        trace_link,
     ) in result:
         hourly_cost = 2 * server_args.tp_size  # $2/hour for one H100
         input_util = 0.7
@@ -304,17 +359,18 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
             f"{accept_length} | "
             f"{1 / (output_throughput/batch_size) * 1000:.2f} | "
             f"{1e6 / (input_throughput * input_util) / 3600 * hourly_cost:.2f} | "
-            f"{1e6 / output_throughput / 3600 * hourly_cost:.2f} |\n"
+            f"{1e6 / output_throughput / 3600 * hourly_cost:.2f} |"
         )
+        if trace_link:
+            line += f" [Profile]({trace_link}) |"
+        line += "\n"
         summary += line
     # print metrics table
     print(summary)
     if is_in_ci():
-        write_github_step_summary(
-            f"### Test Nightly Benchmark (bench_one_batch) \n{summary}"
-        )
+        write_github_step_summary(summary)
 if __name__ == "__main__":

sglang/bench_serving.py CHANGED Viewed

@@ -39,7 +39,6 @@ from transformers import (
     PreTrainedTokenizerFast,
 )
-AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
 ASSISTANT_SUFFIX = "Assistant:"
 global args
@@ -51,6 +50,19 @@ def _get_bool_env_var(name: str, default: str = "false") -> bool:
     return value.lower() in ("true", "1")
+def _create_bench_client_session():
+    # When the pressure is big, the read buffer could be full before aio thread read
+    # the content. We increase the read_bufsize from 64K to 10M.
+    # Define constants for timeout and buffer size for clarity and maintainability
+    BENCH_AIOHTTP_TIMEOUT_SECONDS = 6 * 60 * 60  # 6 hours
+    BENCH_AIOHTTP_READ_BUFSIZE_BYTES = 10 * 1024**2  # 10 MB
+    aiohttp_timeout = aiohttp.ClientTimeout(total=BENCH_AIOHTTP_TIMEOUT_SECONDS)
+    return aiohttp.ClientSession(
+        timeout=aiohttp_timeout, read_bufsize=BENCH_AIOHTTP_READ_BUFSIZE_BYTES
+    )
 @dataclass
 class RequestFuncInput:
     prompt: str
@@ -106,7 +118,7 @@ async def async_request_trt_llm(
     api_url = request_func_input.api_url
     assert api_url.endswith("generate_stream")
-    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+    async with _create_bench_client_session() as session:
         payload = {
             "accumulate_tokens": True,
             "text_input": request_func_input.prompt,
@@ -179,7 +191,7 @@ async def async_request_openai_completions(
     prompt = request_func_input.prompt
-    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+    async with _create_bench_client_session() as session:
         payload = {
             "model": request_func_input.model,
             "prompt": prompt,
@@ -261,7 +273,7 @@ async def async_request_truss(
     prompt = request_func_input.prompt
-    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+    async with _create_bench_client_session() as session:
         payload = {
             "model": request_func_input.model,
             "prompt": prompt,
@@ -338,9 +350,9 @@ async def async_request_sglang_generate(
     api_url = request_func_input.api_url
     prompt = request_func_input.prompt
-    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+    async with _create_bench_client_session() as session:
         payload = {
-            "text": prompt,
+            ("text" if isinstance(prompt, str) else "input_ids"): prompt,
             "sampling_params": {
                 "temperature": 0.0,
                 "max_new_tokens": request_func_input.output_len,
@@ -376,7 +388,6 @@ async def async_request_sglang_generate(
                         chunk_bytes = chunk_bytes.strip()
                         if not chunk_bytes:
                             continue
-                        # print(chunk_bytes)
                         chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
                         latency = time.perf_counter() - st
@@ -388,7 +399,7 @@ async def async_request_sglang_generate(
                             # NOTE: Some completion API might have a last
                             # usage summary response without a token so we
                             # want to check a token was generated
-                            if data["text"]:
+                            if "text" in data and data["text"]:
                                 timestamp = time.perf_counter()
                                 generated_text = data["text"]
                                 output_len = data["meta_info"]["completion_tokens"]
@@ -437,7 +448,7 @@ async def async_request_gserver(
 async def async_request_profile(api_url: str) -> RequestFuncOutput:
-    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+    async with _create_bench_client_session() as session:
         output = RequestFuncOutput()
         try:
             async with session.post(url=api_url) as response:
@@ -493,7 +504,9 @@ def get_tokenizer(
 def get_dataset(args, tokenizer):
+    tokenize_prompt = getattr(args, "tokenize_prompt", False)
     if args.dataset_name == "sharegpt":
+        assert not tokenize_prompt
         input_requests = sample_sharegpt_requests(
             dataset_path=args.dataset_path,
             num_requests=args.num_prompts,
@@ -512,8 +525,10 @@ def get_dataset(args, tokenizer):
             tokenizer=tokenizer,
             dataset_path=args.dataset_path,
             random_sample=args.dataset_name == "random",
+            return_text=not tokenize_prompt,
         )
     elif args.dataset_name == "generated-shared-prefix":
+        assert not tokenize_prompt
         input_requests = sample_generated_shared_prefix_requests(
             num_groups=args.gsp_num_groups,
             prompts_per_group=args.gsp_prompts_per_group,
@@ -524,6 +539,7 @@ def get_dataset(args, tokenizer):
             args=args,
         )
     elif args.dataset_name == "mmmu":
+        assert not tokenize_prompt
         input_requests = sample_mmmu_requests(
             num_requests=args.num_prompts,
             tokenizer=tokenizer,
@@ -638,6 +654,7 @@ class DatasetRow:
     prompt: str
     prompt_len: int
     output_len: int
+    image_data: Optional[str] = None
 def sample_mmmu_requests(
@@ -713,42 +730,50 @@ def sample_mmmu_requests(
                     buffered = io.BytesIO()
                     image.save(buffered, format="JPEG")
                     img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
-                    image_path = f"data:image/jpeg;base64,{img_str}"
+                    image_data = f"data:image/jpeg;base64,{img_str}"
                 else:
                     continue
                 # Extract the question
                 question = example.get("question")
-                # Create the prompt with image, question
+                # Construct the prompt
                 prompt = f"Question: {question}\n\nAnswer: "
-                prompt = tokenizer.apply_chat_template(
-                    [
-                        {
-                            "role": "user",
-                            "content": [
-                                {"type": "image_url", "image_url": {"url": image_path}},
-                                {"type": "text", "text": prompt},
-                            ],
-                        }
-                    ],
-                    add_generation_prompt=True,
-                    tokenize=False,
-                )
-                prompt = f"<image>{image_path}</image>{prompt}"
-                # Calculate token lengths
-                # Note: This is approximate since we're not rendering the actual image tokens
+                try:
+                    prompt = tokenizer.apply_chat_template(
+                        [
+                            {
+                                "role": "user",
+                                "content": [
+                                    {
+                                        "type": "image_url",
+                                        "image_url": {"url": image_data},
+                                    },
+                                    {"type": "text", "text": prompt},
+                                ],
+                            }
+                        ],
+                        add_generation_prompt=True,
+                        tokenize=False,
+                    )
+                except Exception as e:
+                    # Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
+                    print(f"Error applying chat template: {e}, fallback to <image> tag")
+                    prompt = f"<image>{prompt}"
+                # Calculate token lengths for text only (without image data)
                 prompt_token_ids = tokenizer.encode(prompt)
-                prompt_len = (
-                    len(prompt_token_ids) + 512
-                )  # Add estimate for image tokens
+                prompt_len = len(prompt_token_ids)
                 output_len = fixed_output_len if fixed_output_len is not None else 256
                 filtered_dataset.append(
                     DatasetRow(
-                        prompt=prompt, prompt_len=prompt_len, output_len=output_len
+                        prompt=prompt,
+                        prompt_len=prompt_len,
+                        output_len=output_len,
+                        image_data=image_data,
                     )
                 )
@@ -1182,34 +1207,21 @@ async def benchmark(
     # Use the first request for all warmup iterations
     test_request = input_requests[0]
-    test_prompt, test_prompt_len, test_output_len = (
-        test_request.prompt,
-        test_request.prompt_len,
-        test_request.output_len,
-    )
     if lora_names is not None and len(lora_names) != 0:
         lora_name = lora_names[0]
     else:
         lora_name = None
-    if "<image>" in test_prompt:
-        import re
-        image_match = re.search(r"<image>(.*?)</image>(.*)", test_prompt)
-        image_data = image_match.group(1) if image_match else None
-        test_prompt = image_match.group(2) if image_match else test_prompt
-    else:
-        image_data = None
     # Create the test input once
     test_input = RequestFuncInput(
         model=model_id,
-        prompt=test_prompt,
+        prompt=test_request.prompt,
         api_url=api_url,
-        prompt_len=test_prompt_len,
-        output_len=min(test_output_len, 32),
+        prompt_len=test_request.prompt_len,
+        output_len=min(test_request.output_len, 32),
         lora_name=lora_name,
-        image_data=image_data,
+        image_data=test_request.image_data,
         extra_request_body=extra_request_body,
     )
@@ -1254,36 +1266,23 @@ async def benchmark(
     benchmark_start_time = time.perf_counter()
     tasks: List[asyncio.Task] = []
     async for request in get_request(input_requests, request_rate):
-        prompt, prompt_len, output_len = (
-            request.prompt,
-            request.prompt_len,
-            request.output_len,
-        )
         if lora_names is not None and len(lora_names) != 0:
             idx = random.randint(0, len(lora_names) - 1)
             lora_name = lora_names[idx]
         else:
             lora_name = None
-        if "<image>" in prompt:
-            import re
-            image_match = re.search(r"<image>(.*?)</image>(.*)", prompt)
-            image_data = image_match.group(1) if image_match else None
-            prompt = image_match.group(2) if image_match else prompt
-        else:
-            image_data = None
         request_func_input = RequestFuncInput(
             model=model_id,
-            prompt=prompt,
+            prompt=request.prompt,
             api_url=api_url,
-            prompt_len=prompt_len,
-            output_len=output_len,
+            prompt_len=request.prompt_len,
+            output_len=request.output_len,
             lora_name=lora_name,
-            image_data=image_data,
+            image_data=request.image_data,
             extra_request_body=extra_request_body,
         )
         tasks.append(
             asyncio.create_task(
                 limited_request_func(request_func_input=request_func_input, pbar=pbar)
@@ -1304,14 +1303,12 @@ async def benchmark(
     if "sglang" in backend:
         server_info = requests.get(base_url + "/get_server_info")
         if server_info.status_code == 200:
-            if pd_separated:
-                accept_length = server_info.json()["decode"][0]["internal_states"][
-                    0
-                ].get("avg_spec_accept_length", None)
-            else:
-                accept_length = server_info.json()["internal_states"][0].get(
-                    "avg_spec_accept_length", None
-                )
+            server_info_json = server_info.json()
+            if "decode" in server_info_json:
+                server_info_json = server_info_json["decode"][0]
+            accept_length = server_info_json["internal_states"][0].get(
+                "avg_spec_accept_length", None
+            )
         else:
             accept_length = None
     else:
@@ -1497,6 +1494,9 @@ def run_benchmark(args_: argparse.Namespace):
     if not hasattr(args, "output_details"):
         args.output_details = False
+    if not hasattr(args, "tokenize_prompt"):
+        args.tokenize_prompt = False
     print(f"benchmark_args={args}")
     # Set global environments
@@ -1508,6 +1508,11 @@ def run_benchmark(args_: argparse.Namespace):
     if args.extra_request_body:
         extra_request_body = json.loads(args.extra_request_body)
+    if args.tokenize_prompt:
+        assert (
+            args.backend == "sglang"
+        ), "`--tokenize-prompt` only compatible with `--backend sglang` currently"
     # Set url
     if args.port is None:
         args.port = {
@@ -1618,6 +1623,7 @@ def run_benchmark(args_: argparse.Namespace):
             profile=args.profile,
             pd_separated=args.pd_separated,
             flush_cache=args.flush_cache,
+            warmup_requests=args.warmup_requests,
         )
     )
@@ -1813,6 +1819,11 @@ if __name__ == "__main__":
         default=1,
         help="Number of warmup requests to run before the benchmark",
     )
+    parser.add_argument(
+        "--tokenize-prompt",
+        action="store_true",
+        help="Use integer ids instead of string for inputs. Useful to control prompt lengths accurately",
+    )
     group = parser.add_argument_group("generated-shared-prefix dataset arguments")
     group.add_argument(

sglang/lang/backend/runtime_endpoint.py CHANGED Viewed

@@ -85,6 +85,22 @@ class RuntimeEndpoint(BaseBackend):
         )
         self._assert_success(res)
+    def start_profile(self):
+        res = http_request(
+            self.base_url + "/start_profile",
+            api_key=self.api_key,
+            verify=self.verify,
+        )
+        self._assert_success(res)
+    def stop_profile(self):
+        res = http_request(
+            self.base_url + "/stop_profile",
+            api_key=self.api_key,
+            verify=self.verify,
+        )
+        self._assert_success(res)
     def commit_lazy_operations(self, s: StreamExecutor):
         data = {"text": s.text_, "sampling_params": {"max_new_tokens": 0}}
         self._add_images(s, data)
@@ -374,7 +390,8 @@ class Runtime:
         self.pid = None
         pipe_reader, pipe_writer = multiprocessing.Pipe(duplex=False)
-        proc = multiprocessing.Process(
+        ctx = multiprocessing.get_context("spawn")
+        proc = ctx.Process(
             target=launch_server,
             args=(self.server_args, pipe_writer),
         )
@@ -406,6 +423,12 @@ class Runtime:
             kill_process_tree(self.pid)
             self.pid = None
+    def start_profile(self):
+        self.endpoint.start_profile()
+    def stop_profile(self):
+        self.endpoint.stop_profile()
     def cache_prefix(self, prefix: str):
         self.endpoint.cache_prefix(prefix)

sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl