sglang 0.3.0__py3-none-any.whl → 0.3.1.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_latency.py +17 -8
- sglang/bench_serving.py +33 -38
- sglang/global_config.py +5 -17
- sglang/lang/backend/runtime_endpoint.py +5 -2
- sglang/lang/interpreter.py +1 -4
- sglang/launch_server.py +3 -6
- sglang/launch_server_llavavid.py +7 -8
- sglang/srt/{model_config.py → configs/model_config.py} +5 -0
- sglang/srt/constrained/__init__.py +2 -0
- sglang/srt/constrained/fsm_cache.py +33 -38
- sglang/srt/constrained/jump_forward.py +0 -1
- sglang/srt/conversation.py +4 -1
- sglang/srt/hf_transformers_utils.py +1 -3
- sglang/srt/layers/activation.py +12 -0
- sglang/srt/layers/attention_backend.py +480 -0
- sglang/srt/layers/flashinfer_utils.py +235 -0
- sglang/srt/layers/fused_moe/layer.py +27 -7
- sglang/srt/layers/layernorm.py +12 -0
- sglang/srt/layers/logits_processor.py +64 -77
- sglang/srt/layers/radix_attention.py +11 -161
- sglang/srt/layers/sampler.py +38 -122
- sglang/srt/layers/torchao_utils.py +75 -0
- sglang/srt/layers/{decode_attention.py → triton_attention/decode_attention.py} +67 -63
- sglang/srt/layers/{extend_attention.py → triton_attention/extend_attention.py} +40 -132
- sglang/srt/layers/{prefill_attention.py → triton_attention/prefill_attention.py} +13 -7
- sglang/srt/lora/lora.py +403 -0
- sglang/srt/lora/lora_config.py +43 -0
- sglang/srt/lora/lora_manager.py +259 -0
- sglang/srt/managers/controller_multi.py +1 -5
- sglang/srt/managers/controller_single.py +0 -5
- sglang/srt/managers/io_struct.py +16 -1
- sglang/srt/managers/policy_scheduler.py +122 -5
- sglang/srt/managers/schedule_batch.py +105 -71
- sglang/srt/managers/tokenizer_manager.py +17 -8
- sglang/srt/managers/tp_worker.py +188 -121
- sglang/srt/model_executor/cuda_graph_runner.py +69 -133
- sglang/srt/model_executor/forward_batch_info.py +35 -312
- sglang/srt/model_executor/model_runner.py +123 -154
- sglang/srt/models/baichuan.py +416 -0
- sglang/srt/models/chatglm.py +1 -5
- sglang/srt/models/commandr.py +1 -5
- sglang/srt/models/dbrx.py +1 -5
- sglang/srt/models/deepseek.py +1 -5
- sglang/srt/models/deepseek_v2.py +7 -6
- sglang/srt/models/exaone.py +1 -5
- sglang/srt/models/gemma.py +1 -5
- sglang/srt/models/gemma2.py +1 -5
- sglang/srt/models/gpt_bigcode.py +1 -5
- sglang/srt/models/grok.py +1 -5
- sglang/srt/models/internlm2.py +1 -5
- sglang/srt/models/llama.py +51 -5
- sglang/srt/models/llama_classification.py +1 -20
- sglang/srt/models/llava.py +30 -5
- sglang/srt/models/llavavid.py +2 -2
- sglang/srt/models/minicpm.py +1 -5
- sglang/srt/models/minicpm3.py +669 -0
- sglang/srt/models/mixtral.py +6 -5
- sglang/srt/models/mixtral_quant.py +1 -5
- sglang/srt/models/olmoe.py +415 -0
- sglang/srt/models/qwen.py +1 -5
- sglang/srt/models/qwen2.py +1 -5
- sglang/srt/models/qwen2_moe.py +6 -5
- sglang/srt/models/stablelm.py +1 -5
- sglang/srt/models/xverse.py +375 -0
- sglang/srt/models/xverse_moe.py +445 -0
- sglang/srt/openai_api/adapter.py +65 -46
- sglang/srt/openai_api/protocol.py +11 -3
- sglang/srt/sampling/sampling_batch_info.py +46 -80
- sglang/srt/server.py +30 -15
- sglang/srt/server_args.py +163 -28
- sglang/srt/utils.py +19 -51
- sglang/test/few_shot_gsm8k.py +132 -0
- sglang/test/runners.py +114 -22
- sglang/test/test_programs.py +7 -5
- sglang/test/test_utils.py +85 -2
- sglang/utils.py +32 -37
- sglang/version.py +1 -1
- {sglang-0.3.0.dist-info → sglang-0.3.1.post1.dist-info}/METADATA +30 -18
- sglang-0.3.1.post1.dist-info/RECORD +130 -0
- {sglang-0.3.0.dist-info → sglang-0.3.1.post1.dist-info}/WHEEL +1 -1
- sglang-0.3.0.dist-info/RECORD +0 -118
- {sglang-0.3.0.dist-info → sglang-0.3.1.post1.dist-info}/LICENSE +0 -0
- {sglang-0.3.0.dist-info → sglang-0.3.1.post1.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -21,9 +21,22 @@ import logging
import random
from typing import List, Optional, Union

+from sglang.srt.utils import is_hip
+
logger = logging.getLogger(__name__)


+class LoRAPathAction(argparse.Action):
+    def __call__(self, parser, namespace, values, option_string=None):
+        setattr(namespace, self.dest, {})
+        for lora_path in values:
+            if "=" in lora_path:
+                name, path = lora_path.split("=", 1)
+                getattr(namespace, self.dest)[name] = path
+            else:
+                getattr(namespace, self.dest)[lora_path] = lora_path
+
+
@dataclasses.dataclass
class ServerArgs:
    # Model and tokenizer
@@ -49,7 +62,6 @@ class ServerArgs:
    # Memory and scheduling
    mem_fraction_static: Optional[float] = None
    max_running_requests: Optional[int] = None
-    max_num_reqs: Optional[int] = None
    max_total_tokens: Optional[int] = None
    chunked_prefill_size: int = 8192
    max_prefill_tokens: int = 16384
@@ -60,6 +72,7 @@ class ServerArgs:
    tp_size: int = 1
    stream_interval: int = 1
    random_seed: Optional[int] = None
+    constrained_json_whitespace_pattern: Optional[str] = None

    # Logging
    log_level: str = "info"
@@ -75,7 +88,18 @@ class ServerArgs:
    dp_size: int = 1
    load_balance_method: str = "round_robin"

+    # Distributed args
+    nccl_init_addr: Optional[str] = None
+    nnodes: int = 1
+    node_rank: Optional[int] = None
+
+    # Model override args in JSON
+    json_model_override_args: str = "{}"
+
    # Optimization/debug options
+    attention_backend: Optional[str] = None
+    sampling_backend: Optional[str] = None
+
    disable_flashinfer: bool = False
    disable_flashinfer_sampling: bool = False
    disable_radix_cache: bool = False
@@ -86,16 +110,18 @@ class ServerArgs:
    disable_custom_all_reduce: bool = False
    enable_mixed_chunk: bool = False
    enable_torch_compile: bool = False
+    max_torch_compile_bs: int = 32
+    torchao_config: str = ""
    enable_p2p_check: bool = False
    enable_mla: bool = False
    triton_attention_reduce_in_fp32: bool = False

-    #
-
-
-    node_rank: Optional[int] = None
+    # LoRA
+    lora_paths: Optional[List[str]] = None
+    max_loras_per_batch: int = 8

    def __post_init__(self):
+        # Set missing default values
        if self.tokenizer_path is None:
            self.tokenizer_path = self.model_path

@@ -106,6 +132,7 @@ class ServerArgs:
            # Disable chunked prefill
            self.chunked_prefill_size = None

+        # Mem fraction depends on the tensor parallelism size
        if self.mem_fraction_static is None:
            if self.tp_size >= 16:
                self.mem_fraction_static = 0.79
@@ -126,6 +153,47 @@ class ServerArgs:
        if self.random_seed is None:
            self.random_seed = random.randint(0, 1 << 30)

+        # Deprecation warnings
+        if self.disable_flashinfer:
+            logger.warning(
+                "The option '--disable-flashinfer' will be deprecated in the next release. "
+                "Please use '--attention-backend triton' instead."
+            )
+            self.attention_backend = "triton"
+        if self.disable_flashinfer_sampling:
+            logger.warning(
+                "The option '--disable-flashinfer-sampling' will be deprecated in the next release. "
+                "Please use '--sampling-backend pytorch' instead. "
+            )
+            self.sampling_backend = "pytorch"
+
+        # ROCm: flashinfer available later
+        if is_hip():
+            self.attention_backend = "triton"
+            self.sampling_backend = "pytorch"
+
+        # Default kernel backends
+        if self.enable_mla:
+            logger.info("MLA optimization is tunred on. Use triton backend.")
+            self.attention_backend = "triton"
+
+        if self.attention_backend is None:
+            self.attention_backend = "flashinfer"
+
+        if self.sampling_backend is None:
+            self.sampling_backend = "flashinfer"
+
+        # Model-specific patches
+        if "Alibaba-NLP/gte-Qwen2-1.5B-instruct" == self.model_path:
+            logger.info(
+                "Not sure why, the tokenizer will add an additional token at the end of the prompt when trust_remote_mode=True"
+            )
+            self.trust_remote_code = False
+
+        if "gemma-2" in self.model_path.lower():
+            logger.info("When using sliding window in gemma-2, turn on flashinfer.")
+            self.attention_backend = "flashinfer"
+
    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser):
        parser.add_argument(
@@ -209,11 +277,6 @@ class ServerArgs:
            action="store_true",
            help="Whether or not to allow for custom models defined on the Hub in their own modeling files.",
        )
-        parser.add_argument(
-            "--is-embedding",
-            action="store_true",
-            help="Whether to use a CausalLM as an embedding model.",
-        )
        parser.add_argument(
            "--context-length",
            type=int,
@@ -248,6 +311,11 @@ class ServerArgs:
            default=ServerArgs.chat_template,
            help="The buliltin chat template name or the path of the chat template file. This is only used for OpenAI-compatible API server.",
        )
+        parser.add_argument(
+            "--is-embedding",
+            action="store_true",
+            help="Whether to use a CausalLM as an embedding model.",
+        )
        parser.add_argument(
            "--mem-fraction-static",
            type=float,
@@ -260,17 +328,12 @@ class ServerArgs:
            default=ServerArgs.max_running_requests,
            help="The maximum number of running requests.",
        )
-        parser.add_argument(
-            "--max-num-reqs",
-            type=int,
-            default=ServerArgs.max_num_reqs,
-            help="The maximum number of requests to serve in the memory pool. If the model have a large context length, you may need to decrease this value to avoid out-of-memory errors.",
-        )
        parser.add_argument(
            "--max-total-tokens",
            type=int,
            default=ServerArgs.max_total_tokens,
-            help="The maximum number of tokens in the memory pool. If not specified, it will be automatically calculated based on the memory usage fraction.
+            help="The maximum number of tokens in the memory pool. If not specified, it will be automatically calculated based on the memory usage fraction. "
+            "This option is typically used for development and debugging purposes.",
        )
        parser.add_argument(
            "--chunked-prefill-size",
@@ -316,6 +379,12 @@ class ServerArgs:
            default=ServerArgs.random_seed,
            help="The random seed.",
        )
+        parser.add_argument(
+            "--constrained-json-whitespace-pattern",
+            type=str,
+            default=ServerArgs.constrained_json_whitespace_pattern,
+            help=r"Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model generate consecutive whitespaces, set the pattern to [\n\t ]*",
+        )
        parser.add_argument(
            "--log-level",
            type=str,
@@ -381,16 +450,38 @@ class ServerArgs:
        )
        parser.add_argument("--node-rank", type=int, help="The node rank.")

+        # Model override args
+        parser.add_argument(
+            "--json-model-override-args",
+            type=str,
+            help="A dictionary in JSON string format used to override default model configurations.",
+            default=ServerArgs.json_model_override_args,
+        )
+
        # Optimization/debug options
+        parser.add_argument(
+            "--attention-backend",
+            type=str,
+            choices=["flashinfer", "triton"],
+            default=ServerArgs.attention_backend,
+            help="Choose the kernels for attention layers.",
+        )
+        parser.add_argument(
+            "--sampling-backend",
+            type=str,
+            choices=["flashinfer", "pytorch"],
+            default=ServerArgs.sampling_backend,
+            help="Choose the kernels for sampling layers.",
+        )
        parser.add_argument(
            "--disable-flashinfer",
            action="store_true",
-            help="Disable flashinfer attention kernels.",
+            help="Disable flashinfer attention kernels. This option will be deprecated in the next release. Please use '--attention-backend triton' instead.",
        )
        parser.add_argument(
            "--disable-flashinfer-sampling",
            action="store_true",
-            help="Disable flashinfer sampling kernels.",
+            help="Disable flashinfer sampling kernels. This option will be deprecated in the next release. Please use '--sampling-backend pytorch' instead.",
        )
        parser.add_argument(
            "--disable-radix-cache",
@@ -431,7 +522,19 @@ class ServerArgs:
        parser.add_argument(
            "--enable-torch-compile",
            action="store_true",
-            help="Optimize the model with torch.compile
+            help="Optimize the model with torch.compile. Experimental feature.",
+        )
+        parser.add_argument(
+            "--max-torch-compile-bs",
+            type=int,
+            default=ServerArgs.max_torch_compile_bs,
+            help="Set the maximum batch size when using torch compile.",
+        )
+        parser.add_argument(
+            "--torchao-config",
+            type=str,
+            default=ServerArgs.torchao_config,
+            help="Optimize the model with torchao. Experimental feature. Current choices are: int8dq, int8wo, int4wo-<group_size>, fp8wo",
        )
        parser.add_argument(
            "--enable-p2p-check",
@@ -455,6 +558,22 @@ class ServerArgs:
            help="Turn on memory efficient weight loading with quantization (quantize per layer during loading).",
        )

+        # LoRA options
+        parser.add_argument(
+            "--lora-paths",
+            type=str,
+            nargs="*",
+            default=None,
+            action=LoRAPathAction,
+            help="The list of LoRA adapters. You can provide a list of either path in str or renamed path in the format {name}={path}",
+        )
+        parser.add_argument(
+            "--max-loras-per-batch",
+            type=int,
+            default=8,
+            help="Maximum number of adapters for a running batch, include base-only request",
+        )
+
    @classmethod
    def from_cli_args(cls, args: argparse.Namespace):
        args.tp_size = args.tensor_parallel_size
@@ -472,14 +591,30 @@ class ServerArgs:
        assert not (
            self.dp_size > 1 and self.node_rank is not None
        ), "multi-node data parallel is not supported"
-
-
-
-            )
-            self.
-
-
-
+        assert (
+            self.max_loras_per_batch > 0
+            # FIXME
+            and (self.lora_paths is None or self.disable_cuda_graph)
+            and (self.lora_paths is None or self.disable_radix_cache)
+        ), "compatibility of lora and cuda graph and radix attention is in progress"
+
+
+def prepare_server_args(argv: List[str]) -> ServerArgs:
+    """
+    Prepare the server arguments from the command line arguments.
+
+    Args:
+        args: The command line arguments. Typically, it should be `sys.argv[1:]`
+            to ensure compatibility with `parse_args` when no arguments are passed.
+
+    Returns:
+        The server arguments.
+    """
+    parser = argparse.ArgumentParser()
+    ServerArgs.add_cli_args(parser)
+    raw_args = parser.parse_args(argv)
+    server_args = ServerArgs.from_cli_args(raw_args)
+    return server_args


@dataclasses.dataclass
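The snippet below is not part of the diff; it is a minimal sketch of how the new prepare_server_args helper and the LoRAPathAction parser added above might be exercised. The model path, adapter paths, and extra disable flags are hypothetical values, chosen to be consistent with the new LoRA compatibility assert.

from sglang.srt.server_args import prepare_server_args

# Parse CLI-style arguments into a ServerArgs instance (all values hypothetical).
server_args = prepare_server_args(
    [
        "--model-path", "meta-llama/Llama-2-7b-hf",
        "--attention-backend", "triton",
        "--lora-paths", "sql=/adapters/sql", "/adapters/chat",
        "--disable-cuda-graph",
        "--disable-radix-cache",
    ]
)

print(server_args.attention_backend)  # "triton"
# LoRAPathAction builds a dict: {"sql": "/adapters/sql", "/adapters/chat": "/adapters/chat"}
print(server_args.lora_paths)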
sglang/srt/utils.py
CHANGED
@@ -35,6 +35,7 @@ import torch
import torch.distributed as dist
from fastapi.responses import JSONResponse
from packaging import version as pkg_version
+from torch import nn
from torch.nn.parameter import Parameter
from triton.runtime.cache import (
    FileCacheManager,
@@ -50,6 +51,11 @@ show_time_cost = False
time_infos = {}


+# torch flag AMD GPU
+def is_hip() -> bool:
+    return torch.version.hip is not None
+
+
def enable_show_time_cost():
    global show_time_cost
    show_time_cost = True
@@ -186,7 +192,7 @@ def allocate_init_ports(
        cur_port += 1

    if port is not None and ret_ports[0] != port:
-        logger.
+        logger.warning(
            f"WARNING: Port {port} is not available. Use port {ret_ports[0]} instead."
        )

@@ -622,56 +628,7 @@ def set_ulimit(target_soft_limit=65535):
    try:
        resource.setrlimit(resource_type, (target_soft_limit, current_hard))
    except ValueError as e:
-        logger.
-
-
-def is_llama3_405b_fp8_head_16(model_config):
-    """Return whether the model is meta-llama/Meta-Llama-3.1-405B-FP8 with 16 kv heads."""
-    if (
-        model_config.hf_config.architectures[0] == "LlamaForCausalLM"
-        and model_config.hf_config.hidden_size == 16384
-        and model_config.hf_config.intermediate_size == 53248
-        and model_config.hf_config.num_hidden_layers == 126
-        and model_config.hf_config.num_key_value_heads == 16
-        and hasattr(model_config.hf_config, "quantization_config")
-        and model_config.hf_config.quantization_config["quant_method"] == "fbgemm_fp8"
-    ):
-        return True
-    return False
-
-
-def monkey_patch_vllm_qvk_linear_loader():
-    """A temporary hack to fix the num_heads for meta-llama/Meta-Llama-3.1-405B-FP8 checkpoints."""
-    from vllm.model_executor.layers.linear import QKVParallelLinear
-
-    origin_weight_loader = QKVParallelLinear.weight_loader
-
-    def get_original_weight(loaded_weight, head_dim):
-        n_kv_head = loaded_weight.shape[0] // (2 * head_dim)
-        dim = loaded_weight.shape[1]
-        for i in range(n_kv_head):
-            loaded_weight[i * head_dim : (i + 1) * head_dim, :] = loaded_weight[
-                2 * i * head_dim : (2 * i + 1) * head_dim, :
-            ]
-        original_kv_weight = loaded_weight[: n_kv_head * head_dim, :]
-        assert original_kv_weight.shape == (n_kv_head * head_dim, dim)
-        return original_kv_weight
-
-    def weight_loader_srt(
-        self,
-        param: Parameter,
-        loaded_weight: torch.Tensor,
-        loaded_shard_id: Optional[str] = None,
-    ):
-        if (
-            loaded_shard_id in ["k", "v"]
-            and loaded_weight.shape[0] == self.head_size * self.total_num_kv_heads * 2
-        ):
-            loaded_weight = get_original_weight(loaded_weight, self.head_size)
-
-        origin_weight_loader(self, param, loaded_weight, loaded_shard_id)
-
-    setattr(QKVParallelLinear, "weight_loader", weight_loader_srt)
+        logger.warning(f"Fail to set RLIMIT_NOFILE: {e}")


def add_api_key_middleware(app, api_key: str):
@@ -714,3 +671,14 @@ def configure_logger(server_args, prefix: str = ""):
        datefmt="%H:%M:%S",
        force=True,
    )
+
+
+# source: https://github.com/vllm-project/vllm/blob/93b38bea5dd03e1b140ca997dfaadef86f8f1855/vllm/lora/utils.py#L9
+def replace_submodule(
+    model: nn.Module, module_name: str, new_module: nn.Module
+) -> nn.Module:
+    """Replace a submodule in a model with a new module."""
+    parent = model.get_submodule(".".join(module_name.split(".")[:-1]))
+    target_name = module_name.split(".")[-1]
+    setattr(parent, target_name, new_module)
+    return new_module
sglang/test/few_shot_gsm8k.py
ADDED
@@ -0,0 +1,132 @@
+"""
+Run few-shot GSM-8K evaluation.
+
+Usage:
+python3 -m sglang.test.few_shot_gsm8k --num-questions 200
+"""
+
+import argparse
+import ast
+import re
+import time
+
+import numpy as np
+
+from sglang.api import set_default_backend
+from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
+from sglang.utils import download_and_cache_file, dump_state_text, read_jsonl
+
+INVALID = -9999999
+
+
+def get_one_example(lines, i, include_answer):
+    ret = "Question: " + lines[i]["question"] + "\nAnswer:"
+    if include_answer:
+        ret += " " + lines[i]["answer"]
+    return ret
+
+
+def get_few_shot_examples(lines, k):
+    ret = ""
+    for i in range(k):
+        ret += get_one_example(lines, i, True) + "\n\n"
+    return ret
+
+
+def get_answer_value(answer_str):
+    answer_str = answer_str.replace(",", "")
+    numbers = re.findall(r"\d+", answer_str)
+    if len(numbers) < 1:
+        return INVALID
+    try:
+        return ast.literal_eval(numbers[-1])
+    except SyntaxError:
+        return INVALID
+
+
+def main(args):
+    # Select backend
+    set_default_backend(RuntimeEndpoint(f"{args.host}:{args.port}"))
+
+    # Read data
+    url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl"
+    filename = download_and_cache_file(url)
+    lines = list(read_jsonl(filename))
+
+    # Construct prompts
+    num_questions = args.num_questions
+    num_shots = args.num_shots
+    few_shot_examples = get_few_shot_examples(lines, num_shots)
+
+    questions = []
+    labels = []
+    for i in range(len(lines[:num_questions])):
+        questions.append(get_one_example(lines, i, False))
+        labels.append(get_answer_value(lines[i]["answer"]))
+    assert all(l != INVALID for l in labels)
+    arguments = [{"question": q} for q in questions]
+
+    #####################################
+    ######### SGL Program Begin #########
+    #####################################
+
+    import sglang as sgl
+
+    @sgl.function
+    def few_shot_gsm8k(s, question):
+        s += few_shot_examples + question
+        s += sgl.gen(
+            "answer", max_tokens=512, stop=["Question", "Assistant:", "<|separator|>"]
+        )
+
+    #####################################
+    ########## SGL Program End ##########
+    #####################################
+
+    # Run requests
+    tic = time.time()
+    states = few_shot_gsm8k.run_batch(
+        arguments,
+        temperature=0,
+        num_threads=args.parallel,
+        progress_bar=True,
+    )
+    latency = time.time() - tic
+
+    preds = []
+    for i in range(len(states)):
+        preds.append(get_answer_value(states[i]["answer"]))
+
+    # print(f"{preds=}")
+    # print(f"{labels=}")
+
+    # Compute accuracy
+    acc = np.mean(np.array(preds) == np.array(labels))
+    invalid = np.mean(np.array(preds) == INVALID)
+
+    # Compute speed
+    num_output_tokens = sum(
+        s.get_meta_info("answer")["completion_tokens"] for s in states
+    )
+    output_throughput = num_output_tokens / latency
+
+    # Print results
+    print(f"Accuracy: {acc:.3f}")
+    print(f"Invalid: {invalid:.3f}")
+    print(f"Latency: {latency:.3f} s")
+    print(f"Output throughput: {output_throughput:.3f} token/s")
+
+    # Dump results
+    dump_state_text("tmp_output_gsm8k.txt", states)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--num-shots", type=int, default=5)
+    parser.add_argument("--data-path", type=str, default="test.jsonl")
+    parser.add_argument("--num-questions", type=int, default=200)
+    parser.add_argument("--parallel", type=int, default=128)
+    parser.add_argument("--host", type=str, default="http://127.0.0.1")
+    parser.add_argument("--port", type=int, default=30000)
+    args = parser.parse_args()
+    main(args)