PyPI - sglang - Versions diffs - 0.3.5.post1__py3-none-any.whl → 0.3.6__py3-none-any.whl - Mend

sglang 0.3.5.post1__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of sglang might be problematic. Click here for more details.

Files changed (62) hide show

sglang/bench_latency.py +1 -553
sglang/bench_offline_throughput.py +337 -0
sglang/bench_one_batch.py +474 -0
sglang/{bench_server_latency.py → bench_one_batch_server.py} +3 -3
sglang/bench_serving.py +115 -31
sglang/check_env.py +3 -6
sglang/srt/constrained/base_grammar_backend.py +4 -3
sglang/srt/constrained/outlines_backend.py +39 -26
sglang/srt/constrained/xgrammar_backend.py +58 -14
sglang/srt/layers/activation.py +3 -0
sglang/srt/layers/attention/flashinfer_backend.py +93 -48
sglang/srt/layers/attention/triton_backend.py +9 -7
sglang/srt/layers/custom_op_util.py +26 -0
sglang/srt/layers/fused_moe/fused_moe.py +11 -4
sglang/srt/layers/fused_moe/patch.py +4 -2
sglang/srt/layers/layernorm.py +4 -0
sglang/srt/layers/logits_processor.py +10 -10
sglang/srt/layers/sampler.py +4 -8
sglang/srt/layers/torchao_utils.py +2 -0
sglang/srt/managers/data_parallel_controller.py +74 -9
sglang/srt/managers/detokenizer_manager.py +1 -14
sglang/srt/managers/io_struct.py +27 -0
sglang/srt/managers/schedule_batch.py +104 -38
sglang/srt/managers/schedule_policy.py +5 -1
sglang/srt/managers/scheduler.py +210 -56
sglang/srt/managers/session_controller.py +62 -0
sglang/srt/managers/tokenizer_manager.py +38 -0
sglang/srt/managers/tp_worker.py +12 -1
sglang/srt/managers/tp_worker_overlap_thread.py +49 -52
sglang/srt/model_executor/cuda_graph_runner.py +43 -6
sglang/srt/model_executor/forward_batch_info.py +109 -15
sglang/srt/model_executor/model_runner.py +102 -43
sglang/srt/model_parallel.py +98 -0
sglang/srt/models/deepseek_v2.py +147 -44
sglang/srt/models/gemma2.py +9 -8
sglang/srt/models/llava.py +1 -1
sglang/srt/models/llavavid.py +1 -1
sglang/srt/models/olmo.py +3 -3
sglang/srt/models/phi3_small.py +447 -0
sglang/srt/models/qwen2_vl.py +13 -6
sglang/srt/models/torch_native_llama.py +94 -78
sglang/srt/openai_api/adapter.py +11 -4
sglang/srt/openai_api/protocol.py +30 -27
sglang/srt/sampling/penaltylib/orchestrator.py +49 -79
sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +3 -8
sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +3 -9
sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +3 -8
sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +3 -8
sglang/srt/sampling/sampling_batch_info.py +58 -57
sglang/srt/sampling/sampling_params.py +3 -3
sglang/srt/server.py +29 -2
sglang/srt/server_args.py +97 -60
sglang/srt/utils.py +103 -51
sglang/test/runners.py +25 -6
sglang/test/srt/sampling/penaltylib/utils.py +23 -21
sglang/test/test_utils.py +33 -22
sglang/version.py +1 -1
{sglang-0.3.5.post1.dist-info → sglang-0.3.6.dist-info}/METADATA +43 -43
{sglang-0.3.5.post1.dist-info → sglang-0.3.6.dist-info}/RECORD +62 -56
{sglang-0.3.5.post1.dist-info → sglang-0.3.6.dist-info}/WHEEL +1 -1
{sglang-0.3.5.post1.dist-info → sglang-0.3.6.dist-info}/LICENSE +0 -0
{sglang-0.3.5.post1.dist-info → sglang-0.3.6.dist-info}/top_level.txt +0 -0

sglang/bench_latency.py CHANGED Viewed

@@ -1,553 +1 @@
-"""
-Benchmark the latency of running a single static batch.
-This script does not launch a server and uses the low-level APIs.
-It accepts arguments similar to those of launch_server.py.
-# Usage (latency test)
-## with dummy weights:
-python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --load-format dummy
-## sweep through multiple data points and store (append) the results in a jsonl file:
-python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --result-filename out.jsonl
-## do some changes, and store the results under a different run_name:
-python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --result-filename out.jsonl --run-name after
-## plot the results in series of lines:
-python -m sglang.bench_latency --result-filename out.jsonl --graph-sql="select run_name, batch_size, prefill_throughput from results"
-# Usage (correctness test):
-python -m sglang.bench_latency --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
-## Reference output (of the correctness test above, can be gpu dependent):
-input_ids=[[1, 450, 7483, 310, 3444, 338], [1, 450, 7483, 310, 278, 3303, 13187, 290, 338], [1, 20628, 338, 263, 6575, 1460, 2462, 322, 306, 763]]
-prefill logits (first half): tensor([[-10.0312,  -9.5000,   0.8931,  ...,  -4.9414,  -3.2422,  -3.3633],
-        [-10.0312,  -9.5000,   0.8931,  ...,  -4.9414,  -3.2422,  -3.3633],
-        [ -9.1875, -10.2500,   2.7129,  ...,  -4.3359,  -4.0664,  -4.1328]],
-       device='cuda:0')
-prefill logits (final): tensor([[-8.3125, -7.1172,  3.3457,  ..., -4.9570, -4.1328, -3.4141],
-        [-8.9141, -9.0156,  4.1445,  ..., -4.9922, -4.4961, -4.0781],
-        [-9.6328, -9.0547,  4.0195,  ..., -5.3047, -4.7148, -4.4570]],
-       device='cuda:0')
-========== Prompt 0 ==========
-<s> The capital of France is Paris.
-The capital of the United States is Washington, D.C.
-========== Prompt 1 ==========
-<s> The capital of the United Kindom is London.
-The capital of the United Kingdom is London.
-The capital of the
-========== Prompt 2 ==========
-<s> Today is a sunny day and I like to go for a walk in the park.
-I'm going to the park
-"""
-import argparse
-import dataclasses
-import itertools
-import json
-import logging
-import multiprocessing
-import os
-import sqlite3
-import time
-from typing import Tuple
-import numpy as np
-import pandas as pd
-import torch
-import torch.distributed as dist
-from sglang.srt.configs.model_config import ModelConfig
-from sglang.srt.hf_transformers_utils import get_tokenizer
-from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
-from sglang.srt.model_executor.forward_batch_info import ForwardBatch
-from sglang.srt.model_executor.model_runner import ModelRunner
-from sglang.srt.sampling.sampling_params import SamplingParams
-from sglang.srt.server import _set_envs_and_config
-from sglang.srt.server_args import PortArgs, ServerArgs
-from sglang.srt.utils import (
-    configure_logger,
-    kill_child_process,
-    suppress_other_loggers,
-)
-@dataclasses.dataclass
-class BenchArgs:
-    run_name: str = "before"
-    batch_size: Tuple[int] = (1,)
-    input_len: Tuple[int] = (1024,)
-    output_len: Tuple[int] = (16,)
-    result_filename: str = ""
-    correctness_test: bool = False
-    # This is only used for correctness test
-    cut_len: int = 4
-    # Plotting args
-    graph_sql: str = (
-        "select run_name, batch_size, prefill_throughput from results where run_name='before'"
-    )
-    graph_filename: str = "out.png"
-    @staticmethod
-    def add_cli_args(parser: argparse.ArgumentParser):
-        parser.add_argument("--run-name", type=str, default=BenchArgs.run_name)
-        parser.add_argument(
-            "--batch-size", type=int, nargs="+", default=BenchArgs.batch_size
-        )
-        parser.add_argument(
-            "--input-len", type=int, nargs="+", default=BenchArgs.input_len
-        )
-        parser.add_argument(
-            "--output-len", type=int, nargs="+", default=BenchArgs.output_len
-        )
-        parser.add_argument(
-            "--result-filename", type=str, default=BenchArgs.result_filename
-        )
-        parser.add_argument("--correctness-test", action="store_true")
-        parser.add_argument("--cut-len", type=int, default=BenchArgs.cut_len)
-        # graphing
-        parser.add_argument("--graph-sql", type=str, default=BenchArgs.graph_sql)
-        parser.add_argument(
-            "--graph-filename", type=str, default=BenchArgs.graph_filename
-        )
-    @classmethod
-    def from_cli_args(cls, args: argparse.Namespace):
-        # use the default value's type to case the args into correct types.
-        attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
-        return cls(
-            **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs}
-        )
-def load_model(server_args, port_args, tp_rank):
-    suppress_other_loggers()
-    rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
-    model_config = ModelConfig(
-        server_args.model_path,
-        trust_remote_code=server_args.trust_remote_code,
-        context_length=server_args.context_length,
-        model_override_args=server_args.json_model_override_args,
-    )
-    model_runner = ModelRunner(
-        model_config=model_config,
-        mem_fraction_static=server_args.mem_fraction_static,
-        gpu_id=tp_rank,
-        tp_rank=tp_rank,
-        tp_size=server_args.tp_size,
-        nccl_port=port_args.nccl_port,
-        server_args=server_args,
-    )
-    rank_print(f"max_total_num_tokens={model_runner.max_total_num_tokens}")
-    tokenizer = get_tokenizer(
-        server_args.tokenizer_path,
-        tokenizer_mode=server_args.tokenizer_mode,
-        trust_remote_code=server_args.trust_remote_code,
-    )
-    if server_args.tp_size > 1:
-        dist.barrier()
-    return model_runner, tokenizer
-def prepare_inputs_for_correctness_test(bench_args, tokenizer):
-    prompts = [
-        "The capital of France is",
-        "The capital of the United Kindom is",
-        "Today is a sunny day and I like",
-    ]
-    input_ids = [tokenizer.encode(p) for p in prompts]
-    sampling_params = SamplingParams(
-        temperature=0,
-        max_new_tokens=BenchArgs.output_len,
-    )
-    reqs = []
-    for i in range(len(prompts)):
-        assert len(input_ids[i]) > bench_args.cut_len
-        tmp_input_ids = input_ids[i][: bench_args.cut_len]
-        req = Req(
-            rid=i,
-            origin_input_text=prompts[i],
-            origin_input_ids=tmp_input_ids,
-            sampling_params=sampling_params,
-        )
-        req.prefix_indices = []
-        req.fill_ids = req.origin_input_ids
-        req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
-        reqs.append(req)
-    return input_ids, reqs
-def prepare_extend_inputs_for_correctness_test(
-    bench_args, input_ids, reqs, model_runner
-):
-    for i in range(len(reqs)):
-        req = reqs[i]
-        req.fill_ids += input_ids[i][bench_args.cut_len :]
-        req.prefix_indices = model_runner.req_to_token_pool.req_to_token[
-            i, : bench_args.cut_len
-        ]
-        req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
-    return reqs
-def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
-    input_ids = np.ones((batch_size, input_len), dtype=np.int32)
-    sampling_params = SamplingParams(
-        temperature=0,
-        max_new_tokens=BenchArgs.output_len,
-    )
-    reqs = []
-    for i in range(len(input_ids)):
-        req = Req(
-            rid=i,
-            origin_input_text="",
-            origin_input_ids=list(input_ids[i]),
-            sampling_params=sampling_params,
-        )
-        req.prefix_indices = []
-        req.fill_ids = req.origin_input_ids
-        req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
-        reqs.append(req)
-    return reqs
-@torch.inference_mode()
-def extend(reqs, model_runner):
-    batch = ScheduleBatch.init_new(
-        reqs=reqs,
-        req_to_token_pool=model_runner.req_to_token_pool,
-        token_to_kv_pool=model_runner.token_to_kv_pool,
-        tree_cache=None,
-        model_config=model_runner.model_config,
-    )
-    batch.prepare_for_extend()
-    model_worker_batch = batch.get_model_worker_batch()
-    forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
-    logits_output = model_runner.forward(forward_batch)
-    next_token_ids = model_runner.sample(logits_output, forward_batch)
-    return next_token_ids, logits_output.next_token_logits, batch
-@torch.inference_mode()
-def decode(input_token_ids, batch, model_runner):
-    batch.output_ids = input_token_ids
-    batch.prepare_for_decode()
-    model_worker_batch = batch.get_model_worker_batch()
-    forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
-    logits_output = model_runner.forward(forward_batch)
-    next_token_ids = model_runner.sample(logits_output, forward_batch)
-    return next_token_ids, logits_output.next_token_logits
-def correctness_test(
-    server_args,
-    port_args,
-    bench_args,
-    tp_rank,
-):
-    configure_logger(server_args, prefix=f" TP{tp_rank}")
-    rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
-    # Load the model
-    model_runner, tokenizer = load_model(server_args, port_args, tp_rank)
-    # Prepare inputs
-    input_ids, reqs = prepare_inputs_for_correctness_test(bench_args, tokenizer)
-    rank_print(f"\n{input_ids=}\n")
-    if bench_args.cut_len > 0:
-        # Prefill
-        next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
-        rank_print(f"prefill logits (first half): {next_token_logits} \n")
-    # Prepare extend inputs
-    reqs = prepare_extend_inputs_for_correctness_test(
-        bench_args, input_ids, reqs, model_runner
-    )
-    # Extend
-    next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
-    rank_print(f"prefill logits (final): {next_token_logits} \n")
-    # Decode
-    output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
-    for _ in range(bench_args.output_len[0] - 1):
-        next_token_ids, _ = decode(next_token_ids, batch, model_runner)
-        next_token_ids_list = next_token_ids.tolist()
-        for i in range(len(reqs)):
-            output_ids[i].append(next_token_ids_list[i])
-    # Print
-    for i in range(len(reqs)):
-        rank_print(f"========== Prompt {i} ==========")
-        rank_print(tokenizer.decode(output_ids[i]), "\n")
-def synchronize(device):
-    if device == "cuda":
-        torch.cuda.synchronize()
-    elif device == "xpu":
-        torch.xpu.synchronize()
-def latency_test_run_once(
-    run_name, model_runner, rank_print, reqs, batch_size, input_len, output_len, device
-):
-    max_batch_size = model_runner.max_total_num_tokens // (input_len + output_len)
-    if batch_size > max_batch_size:
-        rank_print(
-            f"skipping ({batch_size}, {input_len}, {output_len}) due to max batch size limit"
-        )
-        return
-    # Clear the pools.
-    model_runner.req_to_token_pool.clear()
-    model_runner.token_to_kv_pool.clear()
-    measurement_results = {
-        "run_name": run_name,
-        "batch_size": batch_size,
-        "input_len": input_len,
-        "output_len": output_len,
-    }
-    tot_latency = 0
-    # Prefill
-    synchronize(device)
-    tic = time.time()
-    next_token_ids, _, batch = extend(reqs, model_runner)
-    synchronize(device)
-    prefill_latency = time.time() - tic
-    tot_latency += prefill_latency
-    throughput = input_len * batch_size / prefill_latency
-    rank_print(
-        f"Prefill. latency: {prefill_latency:6.5f} s, throughput: {throughput:9.2f} token/s"
-    )
-    measurement_results["prefill_latency"] = prefill_latency
-    measurement_results["prefill_throughput"] = throughput
-    # Decode
-    decode_latencies = []
-    for i in range(output_len - 1):
-        synchronize(device)
-        tic = time.time()
-        next_token_ids, _ = decode(next_token_ids, batch, model_runner)
-        synchronize(device)
-        latency = time.time() - tic
-        tot_latency += latency
-        throughput = batch_size / latency
-        decode_latencies.append(latency)
-        if i < 5:
-            rank_print(
-                f"Decode.  latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
-            )
-    # record decode timing from 2nd output
-    if output_len > 1:
-        med_decode_latency = np.median(decode_latencies)
-        med_decode_throughput = batch_size / med_decode_latency
-        rank_print(
-            f"Decode.  median latency: {med_decode_latency:6.5f} s, median throughput: {med_decode_throughput:9.2f} token/s"
-        )
-        measurement_results["median_decode_latency"] = med_decode_latency
-        measurement_results["median_decode_throughput"] = med_decode_throughput
-    throughput = (input_len + output_len) * batch_size / tot_latency
-    rank_print(
-        f"Total. latency: {tot_latency:6.3f} s, throughput: {throughput:9.2f} token/s"
-    )
-    measurement_results["total_latency"] = tot_latency
-    measurement_results["total_throughput"] = throughput
-    return measurement_results
-def latency_test(
-    server_args,
-    port_args,
-    bench_args,
-    tp_rank,
-):
-    configure_logger(server_args, prefix=f" TP{tp_rank}")
-    rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
-    # Load the model
-    model_runner, tokenizer = load_model(server_args, port_args, tp_rank)
-    # Prepare inputs for warm up
-    reqs = prepare_synthetic_inputs_for_latency_test(
-        bench_args.batch_size[0], bench_args.input_len[0]
-    )
-    # Warm up
-    rank_print("Warmup ...")
-    latency_test_run_once(
-        bench_args.run_name,
-        model_runner,
-        rank_print,
-        reqs,
-        bench_args.batch_size[0],
-        bench_args.input_len[0],
-        8,  # shorter decoding to speed up the warmup
-        server_args.device,
-    )
-    rank_print("Benchmark ...")
-    # Run the sweep
-    result_list = []
-    for bs, il, ol in itertools.product(
-        bench_args.batch_size, bench_args.input_len, bench_args.output_len
-    ):
-        reqs = prepare_synthetic_inputs_for_latency_test(bs, il)
-        ret = latency_test_run_once(
-            bench_args.run_name,
-            model_runner,
-            rank_print,
-            reqs,
-            bs,
-            il,
-            ol,
-            server_args.device,
-        )
-        if ret is not None:
-            result_list.append(ret)
-    # Write results in jsonlines format on rank 0.
-    if tp_rank == 0 and bench_args.result_filename:
-        import jsonlines
-        with jsonlines.open(bench_args.result_filename, "a") as f:
-            f.write_all(result_list)
-def plot_latency_test(
-    server_args,
-    bench_args,
-    tp_rank,
-):
-    assert tp_rank == 0
-    # read the jsonl file and put in sqlite
-    df = pd.read_json(bench_args.result_filename, lines=True)
-    conn = sqlite3.connect(":memory:")
-    cur = conn.cursor()
-    # get the columns and their types
-    column_names = list(df.iloc[0].keys())
-    type_dict = {
-        str: "TEXT",
-        np.int64: "INTEGER",
-        np.float64: "FLOAT",
-    }
-    column_types = [type_dict[type(i)] for i in list(df.iloc[0])]
-    # create the table
-    cur.execute(
-        f"""
-        CREATE TABLE IF NOT EXISTS results (
-            {", ".join([f"{name} {type}" for name, type in zip(column_names, column_types)])}
-        )
-    """
-    )
-    conn.commit()
-    # write the results to DB
-    df.to_sql("results", conn, if_exists="replace", index=False)
-    conn.commit()
-    # read it back using sql
-    df = pd.read_sql_query(bench_args.graph_sql, conn)
-    conn.close()
-    # plot it and save to a file
-    import matplotlib.pyplot as plt
-    assert (
-        len(df.columns) == 3
-    ), f"The sql should have fetched <series, x, y> columns, not {df.columns}"
-    for label in df[df.columns[0]].unique():
-        q = f"{df.columns[0]}=='{label}'"
-        series = df.query(q)
-        plt.plot(series[df.columns[1]], series[df.columns[2]], label=q, marker="o")
-    plt.xlabel(df.columns[1])
-    plt.ylabel(df.columns[2])
-    plt.legend()
-    plt.savefig(bench_args.graph_filename, dpi=300)
-    # if in kitty, just dump it to the terminal
-    if os.environ["TERM"] == "xterm-kitty":
-        os.system(
-            f"kitty icat --use-window-size 1,1,600,600 {bench_args.graph_filename}"
-        )
-def main(server_args, bench_args):
-    _set_envs_and_config(server_args)
-    if server_args.model_path:
-        if bench_args.correctness_test:
-            work_func = correctness_test
-        else:
-            work_func = latency_test
-    elif os.path.isfile(bench_args.result_filename):
-        assert bench_args.graph_filename, "please provide a filename for the graph"
-        work_func = plot_latency_test
-    else:
-        raise ValueError(
-            "Provide --model-path for running the tests or "
-            "provide --result-filename for plotting the results"
-        )
-    port_args = PortArgs.init_new(server_args)
-    if server_args.tp_size == 1:
-        work_func(server_args, port_args, bench_args, 0)
-    else:
-        workers = []
-        for tp_rank in range(server_args.tp_size):
-            proc = multiprocessing.Process(
-                target=work_func,
-                args=(
-                    server_args,
-                    port_args,
-                    bench_args,
-                    tp_rank,
-                ),
-            )
-            proc.start()
-            workers.append(proc)
-        for proc in workers:
-            proc.join()
-        proc.terminate()
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    ServerArgs.add_cli_args(parser)
-    BenchArgs.add_cli_args(parser)
-    args = parser.parse_args()
-    server_args = ServerArgs.from_cli_args(args)
-    bench_args = BenchArgs.from_cli_args(args)
-    logging.basicConfig(
-        level=getattr(logging, server_args.log_level.upper()),
-        format="%(message)s",
-    )
-    try:
-        main(server_args, bench_args)
-    except Exception as e:
-        raise e
-    finally:
-        kill_child_process()
+raise ValueError("bench_latency.py has been renamed to bench_one_batch.py")

sglang 0.3.5.post1__py3-none-any.whl → 0.3.6__py3-none-any.whl

Potentially problematic release.

sglang 0.3.5.post1__py3-none-any.whl → 0.3.6__py3-none-any.whl