sglang 0.3.1.post1__py3-none-any.whl → 0.3.1.post3__py3-none-any.whl
This diff shows the changes between these two published package versions as they appear in their public registry and is provided for informational purposes only.
- sglang/bench_latency.py +11 -2
- sglang/bench_server_latency.py +187 -0
- sglang/bench_serving.py +1 -1
- sglang/srt/layers/activation.py +8 -4
- sglang/srt/layers/attention_backend.py +3 -1
- sglang/srt/layers/layernorm.py +10 -7
- sglang/srt/layers/linear.py +1133 -0
- sglang/srt/layers/quantization/__init__.py +76 -0
- sglang/srt/layers/quantization/base_config.py +122 -0
- sglang/srt/layers/sampler.py +9 -2
- sglang/srt/managers/io_struct.py +3 -0
- sglang/srt/managers/policy_scheduler.py +49 -93
- sglang/srt/managers/schedule_batch.py +1 -1
- sglang/srt/managers/tp_worker.py +11 -6
- sglang/srt/model_executor/cuda_graph_runner.py +15 -14
- sglang/srt/model_executor/model_runner.py +13 -5
- sglang/srt/models/baichuan.py +1 -1
- sglang/srt/models/chatglm.py +6 -6
- sglang/srt/models/commandr.py +7 -7
- sglang/srt/models/dbrx.py +7 -7
- sglang/srt/models/deepseek.py +7 -7
- sglang/srt/models/deepseek_v2.py +9 -9
- sglang/srt/models/exaone.py +6 -6
- sglang/srt/models/gemma.py +6 -6
- sglang/srt/models/gemma2.py +6 -6
- sglang/srt/models/gpt_bigcode.py +6 -6
- sglang/srt/models/grok.py +6 -6
- sglang/srt/models/internlm2.py +6 -6
- sglang/srt/models/llama.py +7 -9
- sglang/srt/models/llama_classification.py +3 -4
- sglang/srt/models/llava.py +1 -1
- sglang/srt/models/llavavid.py +1 -1
- sglang/srt/models/minicpm.py +6 -6
- sglang/srt/models/minicpm3.py +3 -3
- sglang/srt/models/mixtral.py +6 -6
- sglang/srt/models/mixtral_quant.py +6 -6
- sglang/srt/models/olmoe.py +1 -1
- sglang/srt/models/qwen.py +6 -6
- sglang/srt/models/qwen2.py +6 -6
- sglang/srt/models/qwen2_moe.py +7 -7
- sglang/srt/models/stablelm.py +6 -6
- sglang/srt/models/xverse.py +2 -4
- sglang/srt/models/xverse_moe.py +2 -5
- sglang/srt/models/yivl.py +1 -1
- sglang/srt/server_args.py +17 -21
- sglang/srt/utils.py +21 -1
- sglang/test/few_shot_gsm8k.py +8 -2
- sglang/test/test_utils.py +5 -2
- sglang/version.py +1 -1
- {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post3.dist-info}/METADATA +5 -5
- {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post3.dist-info}/RECORD +54 -50
- {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post3.dist-info}/LICENSE +0 -0
- {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post3.dist-info}/WHEEL +0 -0
- {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post3.dist-info}/top_level.txt +0 -0
sglang/bench_latency.py
CHANGED
@@ -1,5 +1,7 @@
 """
-Benchmark the latency of a
+Benchmark the latency of running a single static batch.
+This script does not launch a server and uses the low-level APIs.
+It accepts arguments similar to those of launch_server.py.
 
 # Usage (latency test)
 ## with dummy weights:
@@ -62,8 +64,13 @@ from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
 from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.sampling.sampling_params import SamplingParams
+from sglang.srt.server import _set_envs_and_config
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import
+from sglang.srt.utils import (
+    configure_logger,
+    kill_child_process,
+    suppress_other_loggers,
+)
 
 
 @dataclasses.dataclass
@@ -339,6 +346,8 @@ def latency_test(
     bench_args,
     tp_rank,
 ):
+    configure_logger(server_args, prefix=f" TP{tp_rank}")
+    _set_envs_and_config(server_args)
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
 
     # Load the model
sglang/bench_server_latency.py
ADDED
@@ -0,0 +1,187 @@
+"""
+Benchmark the latency of serving a single batch with a real server.
+This script launches a server and uses the HTTP interface.
+It accepts arguments similar to those of launch_server.py.
+
+Usage:
+
+python3 -m sglang.bench_server_latency --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
+"""
+
+import argparse
+import dataclasses
+import itertools
+import json
+import multiprocessing
+import os
+import time
+from typing import Tuple
+
+import numpy as np
+import requests
+
+from sglang.srt.server import launch_server
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import kill_child_process
+
+
+@dataclasses.dataclass
+class BenchArgs:
+    run_name: str = "default"
+    batch_size: Tuple[int] = (1,)
+    input_len: Tuple[int] = (1024,)
+    output_len: Tuple[int] = (16,)
+    result_filename: str = "result.jsonl"
+
+    @staticmethod
+    def add_cli_args(parser: argparse.ArgumentParser):
+        parser.add_argument("--run-name", type=str, default=BenchArgs.run_name)
+        parser.add_argument(
+            "--batch-size", type=int, nargs="+", default=BenchArgs.batch_size
+        )
+        parser.add_argument(
+            "--input-len", type=int, nargs="+", default=BenchArgs.input_len
+        )
+        parser.add_argument(
+            "--output-len", type=int, nargs="+", default=BenchArgs.output_len
+        )
+        parser.add_argument(
+            "--result-filename", type=str, default=BenchArgs.result_filename
+        )
+
+    @classmethod
+    def from_cli_args(cls, args: argparse.Namespace):
+        # use the default value's type to case the args into correct types.
+        attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
+        return cls(
+            **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs}
+        )
+
+
+def launch_server_internal(server_args):
+    try:
+        launch_server(server_args)
+    except Exception as e:
+        raise e
+    finally:
+        kill_child_process(os.getpid(), including_parent=False)
+
+
+def launch_server_process(server_args: ServerArgs):
+    proc = multiprocessing.Process(target=launch_server_internal, args=(server_args,))
+    proc.start()
+    base_url = f"http://{server_args.host}:{server_args.port}"
+    timeout = 600
+
+    start_time = time.time()
+    while time.time() - start_time < timeout:
+        try:
+            headers = {
+                "Content-Type": "application/json; charset=utf-8",
+            }
+            response = requests.get(f"{base_url}/v1/models", headers=headers)
+            if response.status_code == 200:
+                return proc, base_url
+        except requests.RequestException:
+            pass
+        time.sleep(10)
+    raise TimeoutError("Server failed to start within the timeout period.")
+
+
+def run_one_case(
+    url: str,
+    batch_size: int,
+    input_len: int,
+    output_len: int,
+    run_name: str,
+    result_filename: str,
+):
+    input_ids = [
+        [int(x) for x in np.random.randint(0, high=16384, size=(input_len,))]
+        for _ in range(batch_size)
+    ]
+
+    tic = time.time()
+    response = requests.post(
+        url + "/generate",
+        json={
+            "input_ids": input_ids,
+            "sampling_params": {
+                "temperature": 0,
+                "max_new_tokens": output_len,
+                "ignore_eos": True,
+            },
+        },
+    )
+    latency = time.time() - tic
+
+    _ = response.json()
+    output_throughput = batch_size * output_len / latency
+    overall_throughput = batch_size * (input_len + output_len) / latency
+
+    print(f"batch size: {batch_size}")
+    print(f"latency: {latency:.2f} s")
+    print(f"output throughput: {output_throughput:.2f} token/s")
+    print(f"(input + output) throughput: {overall_throughput:.2f} token/s")
+
+    if result_filename:
+        with open(result_filename, "a") as fout:
+            res = {
+                "run_name": run_name,
+                "batch_size": batch_size,
+                "input_len": input_len,
+                "output_len": output_len,
+                "latency": round(latency, 4),
+                "output_throughput": round(output_throughput, 2),
+                "overall_throughput": round(overall_throughput, 2),
+            }
+            fout.write(json.dumps(res) + "\n")
+
+
+def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
+    proc, base_url = launch_server_process(server_args)
+
+    # warmup
+    run_one_case(
+        base_url,
+        batch_size=16,
+        input_len=1024,
+        output_len=16,
+        run_name="",
+        result_filename="",
+    )
+
+    # benchmark
+    try:
+        for bs, il, ol in itertools.product(
+            bench_args.batch_size, bench_args.input_len, bench_args.output_len
+        ):
+            run_one_case(
+                base_url,
+                bs,
+                il,
+                ol,
+                bench_args.run_name,
+                bench_args.result_filename,
+            )
+    finally:
+        kill_child_process(proc.pid)
+
+    print(f"\nResults are saved to {bench_args.result_filename}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ServerArgs.add_cli_args(parser)
+    BenchArgs.add_cli_args(parser)
+    # For this script, model-path is not required
+    assert (
+        parser._actions[1].option_strings[0] == "--model-path"
+    ), "options changed, this code need to be updated"
+    parser._actions[1].required = False
+    args = parser.parse_args()
+
+    server_args = ServerArgs.from_cli_args(args)
+    bench_args = BenchArgs.from_cli_args(args)
+
+    run_benchmark(server_args, bench_args)
sglang/bench_serving.py
CHANGED
@@ -2,7 +2,7 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/6366efc67b0aedd2c1721c14385370e50b297fb3/benchmarks/benchmark_serving.py
 
 """
-Benchmark online serving.
+Benchmark online serving with dynamic requests.
 
 Usage:
 python3 -m sglang.bench_serving --backend sglang --num-prompt 10
sglang/srt/layers/activation.py
CHANGED
@@ -19,17 +19,21 @@ from typing import Optional
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-
+
+from sglang.srt.utils import is_hip
+
+if not is_hip():
+    from flashinfer.activation import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul
+
 from vllm.distributed import (
     divide,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
 )
 from vllm.model_executor.custom_op import CustomOp
-from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.utils import set_weight_attrs
 
-from sglang.srt.
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.utils import set_weight_attrs
 
 logger = logging.getLogger(__name__)
 
sglang/srt/layers/attention_backend.py
CHANGED
@@ -346,7 +346,9 @@ class TritonAttnBackend(AttentionBackend):
 
         self.decode_attention_fwd = decode_attention_fwd
         self.extend_attention_fwd = extend_attention_fwd
-        self.num_head =
+        self.num_head = (
+            model_runner.model_config.num_attention_heads // model_runner.tp_size
+        )
 
         if global_server_args_dict.get("triton_attention_reduce_in_fp32", False):
             self.reduce_dtype = torch.float32
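The re-wrapped assignment computes how many attention heads each tensor-parallel rank owns: the model's total head count divided by the tensor-parallel world size. A worked example with illustrative values (not taken from this diff):

num_attention_heads = 32  # total heads in the model config
tp_size = 4               # tensor-parallel world size
num_head = num_attention_heads // tp_size  # 8 heads per rank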
sglang/srt/layers/layernorm.py
CHANGED
@@ -20,16 +20,19 @@ from typing import Optional, Tuple, Union
 
 import torch
 import torch.nn as nn
-from flashinfer.norm import (
-    fused_add_rmsnorm,
-    gemma_fused_add_rmsnorm,
-    gemma_rmsnorm,
-    rmsnorm,
-)
-from vllm.model_executor.custom_op import CustomOp
 
 from sglang.srt.utils import is_hip
 
+if not is_hip():
+    from flashinfer.norm import (
+        fused_add_rmsnorm,
+        gemma_fused_add_rmsnorm,
+        gemma_rmsnorm,
+        rmsnorm,
+    )
+
+from vllm.model_executor.custom_op import CustomOp
+
 logger = logging.getLogger(__name__)
 
 
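Like activation.py above, this file now guards the flashinfer imports behind is_hip(), so ROCm builds never import the fused kernels and the layer has to take a non-flashinfer path. A minimal sketch of such a fallback, assuming a plain-PyTorch RMSNorm (the function name and eps default are illustrative, not part of this diff):

import torch

def rmsnorm_native(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # Plain-PyTorch RMSNorm, usable when the fused flashinfer kernel is unavailable (e.g. on HIP).
    variance = x.float().pow(2).mean(dim=-1, keepdim=True)
    x_normed = x.float() * torch.rsqrt(variance + eps)
    return (x_normed * weight.float()).to(x.dtype)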