sglang 0.2.13__py3-none-any.whl → 0.2.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/api.py +6 -0
- sglang/bench_latency.py +7 -3
- sglang/bench_serving.py +50 -26
- sglang/check_env.py +15 -0
- sglang/lang/chat_template.py +10 -5
- sglang/lang/compiler.py +4 -0
- sglang/lang/interpreter.py +1 -0
- sglang/lang/ir.py +9 -0
- sglang/launch_server.py +8 -1
- sglang/srt/conversation.py +50 -1
- sglang/srt/hf_transformers_utils.py +22 -23
- sglang/srt/layers/activation.py +24 -1
- sglang/srt/layers/decode_attention.py +338 -50
- sglang/srt/layers/fused_moe/layer.py +2 -2
- sglang/srt/layers/layernorm.py +3 -0
- sglang/srt/layers/logits_processor.py +60 -23
- sglang/srt/layers/radix_attention.py +3 -4
- sglang/srt/layers/sampler.py +154 -0
- sglang/srt/managers/controller_multi.py +2 -8
- sglang/srt/managers/controller_single.py +7 -10
- sglang/srt/managers/detokenizer_manager.py +20 -9
- sglang/srt/managers/io_struct.py +44 -11
- sglang/srt/managers/policy_scheduler.py +5 -2
- sglang/srt/managers/schedule_batch.py +52 -167
- sglang/srt/managers/tokenizer_manager.py +192 -83
- sglang/srt/managers/tp_worker.py +130 -43
- sglang/srt/mem_cache/memory_pool.py +82 -8
- sglang/srt/mm_utils.py +79 -7
- sglang/srt/model_executor/cuda_graph_runner.py +49 -11
- sglang/srt/model_executor/forward_batch_info.py +59 -27
- sglang/srt/model_executor/model_runner.py +210 -61
- sglang/srt/models/chatglm.py +4 -12
- sglang/srt/models/commandr.py +5 -1
- sglang/srt/models/dbrx.py +5 -1
- sglang/srt/models/deepseek.py +5 -1
- sglang/srt/models/deepseek_v2.py +5 -1
- sglang/srt/models/gemma.py +5 -1
- sglang/srt/models/gemma2.py +15 -7
- sglang/srt/models/gpt_bigcode.py +5 -1
- sglang/srt/models/grok.py +16 -2
- sglang/srt/models/internlm2.py +5 -1
- sglang/srt/models/llama2.py +7 -3
- sglang/srt/models/llama_classification.py +2 -2
- sglang/srt/models/llama_embedding.py +4 -0
- sglang/srt/models/llava.py +176 -59
- sglang/srt/models/minicpm.py +5 -1
- sglang/srt/models/mixtral.py +5 -1
- sglang/srt/models/mixtral_quant.py +5 -1
- sglang/srt/models/qwen.py +5 -2
- sglang/srt/models/qwen2.py +13 -3
- sglang/srt/models/qwen2_moe.py +5 -14
- sglang/srt/models/stablelm.py +5 -1
- sglang/srt/openai_api/adapter.py +117 -37
- sglang/srt/sampling/sampling_batch_info.py +209 -0
- sglang/srt/{sampling_params.py → sampling/sampling_params.py} +18 -0
- sglang/srt/server.py +84 -56
- sglang/srt/server_args.py +43 -15
- sglang/srt/utils.py +26 -16
- sglang/test/runners.py +23 -31
- sglang/test/simple_eval_common.py +9 -10
- sglang/test/simple_eval_gpqa.py +2 -1
- sglang/test/simple_eval_humaneval.py +2 -2
- sglang/test/simple_eval_math.py +2 -1
- sglang/test/simple_eval_mmlu.py +2 -1
- sglang/test/test_activation.py +55 -0
- sglang/test/test_utils.py +36 -53
- sglang/version.py +1 -1
- {sglang-0.2.13.dist-info → sglang-0.2.14.dist-info}/METADATA +92 -25
- sglang-0.2.14.dist-info/RECORD +114 -0
- {sglang-0.2.13.dist-info → sglang-0.2.14.dist-info}/WHEEL +1 -1
- sglang/launch_server_llavavid.py +0 -29
- sglang-0.2.13.dist-info/RECORD +0 -112
- {sglang-0.2.13.dist-info → sglang-0.2.14.dist-info}/LICENSE +0 -0
- {sglang-0.2.13.dist-info → sglang-0.2.14.dist-info}/top_level.txt +0 -0
sglang/srt/utils.py
CHANGED
@@ -224,13 +224,18 @@ def is_multimodal_model(model):
     raise ValueError("unrecognized type")


-def is_generation_model(model_architectures):
+def is_generation_model(model_architectures, is_embedding: bool = False):
+    # We have two ways to determine whether a model is a generative model.
+    # 1. Check the model architectue
+    # 2. check the `is_embedding` server args
+
     if (
         "LlamaEmbeddingModel" in model_architectures
         or "MistralModel" in model_architectures
     ):
         return False
-    return True
+    else:
+        return not is_embedding


 def decode_video_base64(video_base64):
@@ -347,7 +352,7 @@ def suppress_other_loggers():
         logging.WARN
     )
     logging.getLogger("vllm.selector").setLevel(logging.WARN)
-    logging.getLogger("vllm.utils").setLevel(logging.WARN)
+    logging.getLogger("vllm.utils").setLevel(logging.ERROR)


 def assert_pkg_version(pkg: str, min_version: str, message: str):
@@ -369,14 +374,11 @@ def kill_parent_process():
     """Kill the parent process and all children of the parent process."""
     current_process = psutil.Process()
     parent_process = current_process.parent()
-    children = parent_process.children(recursive=True)
-    for child in children:
-        if child.pid != current_process.pid:
-            os.kill(child.pid, 9)
-    os.kill(parent_process.pid, 9)
+    kill_child_process(parent_process.pid, skip_pid=current_process.pid)


-def kill_child_process(pid, including_parent=True):
+def kill_child_process(pid, including_parent=True, skip_pid=None):
+    """Kill the process and all its children process."""
     try:
         parent = psutil.Process(pid)
     except psutil.NoSuchProcess:
@@ -384,6 +386,8 @@ def kill_child_process(pid, including_parent=True):

     children = parent.children(recursive=True)
     for child in children:
+        if child.pid == skip_pid:
+            continue
         try:
             child.kill()
         except psutil.NoSuchProcess:
@@ -452,10 +456,6 @@ def monkey_patch_vllm_dummy_weight_loader():
         quant_method = getattr(module, "quant_method", None)
         if quant_method is not None:
             quant_method.process_weights_after_loading(module)
-        # FIXME: Remove this after Mixtral is updated
-        # to use quant_method.
-        if hasattr(module, "process_weights_after_loading"):
-            module.process_weights_after_loading()

     # NOTE(woosuk): For accurate performance evaluation, we assign
     # random values to the weights.
@@ -692,7 +692,7 @@ def monkey_patch_vllm_qvk_linear_loader():
     setattr(QKVParallelLinear, "weight_loader", weight_loader_srt)


-def add_api_key_middleware(app, api_key):
+def add_api_key_middleware(app, api_key: str):
     @app.middleware("http")
     async def authentication(request, call_next):
         if request.method == "OPTIONS":
@@ -704,7 +704,7 @@ def add_api_key_middleware(app, api_key):
         return await call_next(request)


-def prepare_model(model_path):
+def prepare_model(model_path: str):
     if "SGLANG_USE_MODELSCOPE" in os.environ:
         if not os.path.exists(model_path):
             from modelscope import snapshot_download
@@ -713,7 +713,7 @@ def prepare_model(model_path):
     return model_path


-def prepare_tokenizer(tokenizer_path):
+def prepare_tokenizer(tokenizer_path: str):
     if "SGLANG_USE_MODELSCOPE" in os.environ:
         if not os.path.exists(tokenizer_path):
             from modelscope import snapshot_download
@@ -722,3 +722,13 @@ def prepare_tokenizer(tokenizer_path):
             tokenizer_path, ignore_patterns=["*.bin", "*.safetensors"]
         )
     return tokenizer_path
+
+
+def configure_logger(server_args, prefix: str = ""):
+    format = f"[%(asctime)s{prefix}] %(message)s"
+    logging.basicConfig(
+        level=getattr(logging, server_args.log_level.upper()),
+        format=format,
+        datefmt="%H:%M:%S",
+        force=True,
+    )
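The net effect of the process-cleanup changes above is that kill_parent_process no longer walks and kills children inline; it delegates to kill_child_process, which gained a skip_pid argument so the caller can spare its own process. A minimal sketch of how the two helpers compose after this patch, assuming only psutil; error handling and the rest of the module are abridged, so treat it as illustrative rather than the exact implementation:

# Illustrative sketch (not the verbatim sglang code): how the patched helpers compose.
import psutil

def kill_child_process(pid, including_parent=True, skip_pid=None):
    """Kill process `pid` and all of its children, optionally sparing one pid."""
    try:
        parent = psutil.Process(pid)
    except psutil.NoSuchProcess:
        return

    for child in parent.children(recursive=True):
        if child.pid == skip_pid:  # e.g. the process performing the cleanup
            continue
        try:
            child.kill()
        except psutil.NoSuchProcess:
            pass

    if including_parent:
        try:
            parent.kill()
        except psutil.NoSuchProcess:
            pass

def kill_parent_process():
    """Kill the parent process and its other children, but not the current process."""
    current = psutil.Process()
    kill_child_process(current.parent().pid, skip_pid=current.pid)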
sglang/test/runners.py
CHANGED
@@ -14,7 +14,7 @@ limitations under the License.
 """

 import json
-import multiprocessing
+import multiprocessing as mp
 import os
 from dataclasses import dataclass
 from typing import List, Union
@@ -24,15 +24,15 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer

 from sglang.srt.server import Runtime
-from sglang.
+from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER

 DEFAULT_PROMPTS = [
     # the output of gemma-2-2b from SRT is unstable on the commented prompt
     # "The capital of France is",
+    "Apple is red. Banana is Yellow. " * 800 + "Apple is",
     "The capital of the United Kindom is",
     "Today is a sunny day and I like",
     "AI is a field of computer science focused on",
-    "Apple is red. Banana is Yellow. " * 800 + "Apple is",
 ]

 dirpath = os.path.dirname(__file__)
@@ -63,44 +63,37 @@ class HFRunner:
     def __init__(
         self,
         model_path,
-        torch_dtype
-
+        torch_dtype,
+        is_generation,
     ):
-        self.
-
+        self.is_generation = is_generation
+
+        self.in_queue = mp.Queue()
+        self.out_queue = mp.Queue()

-        self.model_proc = multiprocessing.Process(
+        self.model_proc = mp.Process(
             target=self.start_model_process,
             args=(
                 self.in_queue,
                 self.out_queue,
                 model_path,
                 torch_dtype,
-                is_generation_model,
             ),
         )
         self.model_proc.start()

-    def start_model_process(
-        self, in_queue, out_queue, model_path, torch_dtype, is_generation_model
-    ):
+    def start_model_process(self, in_queue, out_queue, model_path, torch_dtype):
         self.tokenizer = AutoTokenizer.from_pretrained(
             model_path,
             torch_dtype=torch_dtype,
-            trust_remote_code=True,
         )

-        self.is_generation_model = (
-            is_generation_model(model_path)
-            if is_generation_model is None
-            else is_generation_model
-        )
-        if self.is_generation_model:
+        if self.is_generation:
             self.model = AutoModelForCausalLM.from_pretrained(
                 model_path,
                 torch_dtype=torch_dtype,
+                trust_remote_code=False,
                 low_cpu_mem_usage=True,
-                trust_remote_code=True,
             ).cuda()
         else:
             from sentence_transformers import SentenceTransformer
@@ -113,7 +106,7 @@ class HFRunner:
         while True:
             prompts, max_new_tokens = in_queue.get()
             if prompts is not None:
-                if self.is_generation_model:
+                if self.is_generation:
                     output_strs = []
                     prefill_logprobs = []
                     for p in prompts:
@@ -176,22 +169,20 @@ class SRTRunner:
     def __init__(
         self,
         model_path,
+        torch_dtype,
+        is_generation,
         tp_size=1,
-
-        is_generation_model=None,
-        port=5157,
+        port=DEFAULT_PORT_FOR_SRT_TEST_RUNNER,
     ):
-        self.is_generation_model = (
-            is_generation_model(model_path)
-            if is_generation_model is None
-            else is_generation_model
-        )
+        self.is_generation = is_generation
         self.runtime = Runtime(
             model_path=model_path,
             tp_size=tp_size,
             dtype=get_dtype_str(torch_dtype),
             port=port,
-            mem_fraction_static=0.
+            mem_fraction_static=0.69,
+            trust_remote_code=False,
+            is_embedding=not self.is_generation,
         )

     def forward(
@@ -199,7 +190,7 @@ class SRTRunner:
         prompts: Union[List[str], List[torch.Tensor]] = DEFAULT_PROMPTS,
         max_new_tokens=8,
     ):
-        if self.is_generation_model:
+        if self.is_generation:
             # the return value contains logprobs from prefill
             output_strs = []
             top_input_logprobs = []
@@ -209,6 +200,7 @@ class SRTRunner:
                 prompt,
                 sampling_params=sampling_params,
                 return_logprob=True,
+                logprob_start_len=0,
                 top_logprobs_num=NUM_TOP_LOGPROBS,
             )
             response = json.loads(response)
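With the signature change above, both runners now take torch_dtype and is_generation explicitly instead of inferring them from the model path, and SRTRunner forwards is_embedding=not is_generation to the Runtime. A hypothetical invocation based only on the signatures shown in this diff; the model name reuses DEFAULT_MODEL_NAME_FOR_TEST from test_utils, and the prompt and token budget are arbitrary:

# Hypothetical usage; arguments mirror the new SRTRunner.__init__ shown above.
import torch

from sglang.test.runners import SRTRunner

runner = SRTRunner(
    "meta-llama/Meta-Llama-3.1-8B-Instruct",
    torch_dtype=torch.float16,
    is_generation=True,  # embedding models would pass False, which sets is_embedding on the Runtime
)
outputs = runner.forward(
    ["Today is a sunny day and I like"],  # one of the DEFAULT_PROMPTS
    max_new_tokens=8,
)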
sglang/test/simple_eval_common.py
CHANGED
@@ -1,13 +1,12 @@
 # Adapted from https://github.com/openai/simple-evals/

-import base64
 import os
 import resource
 import time
 from collections import defaultdict
 from dataclasses import dataclass, field
 from multiprocessing.pool import ThreadPool
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Optional, Tuple

 import httpx
 import jinja2
@@ -44,8 +43,8 @@ class EvalResult:
     Result of running an evaluation (usually consisting of many samples)
     """

-    score: float
-    metrics: Dict[str, float]
+    score: Optional[float]  # top-line metric
+    metrics: Optional[Dict[str, float]]  # other metrics
     htmls: List[str]  # strings of valid HTML
     convos: List[MessageList]  # sampled conversations

@@ -56,10 +55,10 @@ class SingleEvalResult:
     Result of evaluating a single sample
     """

-    score: float
+    score: Optional[float]
     metrics: Dict[str, float] = field(default_factory=dict)
-    html: str
-    convo: MessageList
+    html: Optional[str] = None
+    convo: Optional[MessageList] = None  # sampled conversation


 class Eval:
@@ -89,8 +88,8 @@ class ChatCompletionSampler(SamplerBase):
     def __init__(
         self,
         base_url: str = None,
-        model: str
-        system_message: str
+        model: Optional[str] = None,
+        system_message: Optional[str] = None,
         temperature: float = 0.0,
         max_tokens: int = 2048,
     ):
@@ -272,7 +271,7 @@ def _compute_stat(values: list, stat: str):
 def aggregate_results(
     single_eval_results: List[SingleEvalResult],
     default_stats: Tuple[str] = ("mean", "std"),
-    name2stats: Dict[str, Tuple[str]]
+    name2stats: Optional[Dict[str, Tuple[str]]] = None,
 ) -> EvalResult:
     """
     Aggregate results from multiple evaluations into a single EvalResult.
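Since score, metrics, html and convo are now Optional, a per-sample result can be constructed from a score alone and the remaining fields filled in later by the eval harness. Illustrative only, using the dataclasses as declared above:

# Illustrative: html and convo default to None after this change.
from sglang.test.simple_eval_common import SingleEvalResult

res = SingleEvalResult(score=1.0, metrics={"answer_chars": 120.0})
assert res.html is None and res.convo is None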
sglang/test/simple_eval_gpqa.py
CHANGED
@@ -8,6 +8,7 @@ https://arxiv.org/abs/2311.12022

 import random
 import re
+from typing import Optional

 import pandas

@@ -28,7 +29,7 @@ class GPQAEval(Eval):
     def __init__(
         self,
         filename: str,
-        num_examples: int,
+        num_examples: Optional[int],
         num_threads: int,
         n_repeats: int = 1,
     ):
sglang/test/simple_eval_humaneval.py
CHANGED
@@ -9,7 +9,7 @@ https://arxiv.org/abs/2107.03374 https://github.com/openai/human-eval/
 import random
 import re
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Dict, List
+from typing import Dict, List, Optional

 import tqdm

@@ -61,7 +61,7 @@ def evaluate_functional_correctness(
 class HumanEval(Eval):
     def __init__(
         self,
-        num_examples: int,
+        num_examples: Optional[int],
         num_threads: int,
         num_samples_per_task: int = 5,
         ks_passes: List[int] = [1, 2, 5],
sglang/test/simple_eval_math.py
CHANGED
@@ -8,6 +8,7 @@ https://arxiv.org/abs/2103.03874

 import random
 import re
+from typing import Optional

 import pandas

@@ -36,7 +37,7 @@ class MathEval(Eval):
         self,
         filename: str,
         equality_checker: SamplerBase,
-        num_examples: int,
+        num_examples: Optional[int],
         num_threads: int,
     ):
         df = pandas.read_csv(filename)
sglang/test/simple_eval_mmlu.py
CHANGED
@@ -8,6 +8,7 @@ https://arxiv.org/abs/2009.03300

 import random
 import re
+from typing import Optional

 import pandas

@@ -84,7 +85,7 @@ subject2category = {


 class MMLUEval(Eval):
-    def __init__(self, filename: str, num_examples: int, num_threads: int):
+    def __init__(self, filename: str, num_examples: Optional[int], num_threads: int):
         df = pandas.read_csv(filename)
         examples = [row.to_dict() for _, row in df.iterrows()]
         if num_examples:
sglang/test/test_activation.py
ADDED
@@ -0,0 +1,55 @@
+import itertools
+import unittest
+
+import torch
+
+from sglang.srt.layers.activation import GeluAndMul
+
+
+class TestGeluAndMul(unittest.TestCase):
+    DTYPES = [torch.half, torch.bfloat16]
+    NUM_TOKENS = [7, 83, 2048]
+    D = [512, 4096, 5120, 13824]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _run_gelu_and_mul_test(self, num_tokens, d, dtype, seed):
+        torch.manual_seed(seed)
+
+        layer = GeluAndMul().to(dtype=dtype)
+        x = torch.randn(num_tokens, 2 * d, dtype=dtype)
+
+        with torch.inference_mode():
+            ref_out = layer.forward_native(x)
+            out = layer.forward_cuda(x)
+
+        if dtype == torch.bfloat16:
+            atol = rtol = 1e-2
+        else:
+            atol = rtol = 1e-3
+
+        self.assertTrue(torch.allclose(out, ref_out, atol=atol, rtol=rtol))
+
+    def test_gelu_and_mul(self):
+        for params in itertools.product(
+            self.NUM_TOKENS,
+            self.D,
+            self.DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                num_tokens=params[0],
+                d=params[1],
+                dtype=params[2],
+                seed=params[3],
+            ):
+                self._run_gelu_and_mul_test(*params)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
sglang/test/test_utils.py
CHANGED
@@ -2,11 +2,10 @@

 import argparse
 import asyncio
-import
+import os
 import subprocess
 import threading
 import time
-import unittest
 from functools import partial
 from typing import Callable, List, Optional

@@ -18,14 +17,19 @@ import torch.nn.functional as F
 from sglang.global_config import global_config
 from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
+from sglang.srt.utils import kill_child_process
 from sglang.utils import get_exception_traceback

 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
-
-
-
-
+DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
+
+if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+    DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 5157
+    DEFAULT_URL_FOR_TEST = "http://127.0.0.1:6157"
+else:
+    DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 1157
+    DEFAULT_URL_FOR_TEST = "http://127.0.0.1:2157"


 def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
@@ -104,31 +108,8 @@ def call_generate_srt_raw(prompt, temperature, max_tokens, stop=None, url=None):
     return pred


-def call_generate_ginfer(prompt, temperature, max_tokens, stop=None, url=None):
-
-    from ginfer import sampler_pb2, sampler_pb2_grpc
-
-    sampler_channel = grpc.insecure_channel(url.replace("http://", ""))
-    sampler = sampler_pb2_grpc.SamplerStub(sampler_channel)
-
-    if stop is None:
-        stop_strings = None
-    else:
-        stop_strings = [stop]
-
-    sample_request = sampler_pb2.SampleTextRequest(
-        prompt=prompt,
-        settings=sampler_pb2.SampleSettings(
-            max_len=max_tokens,
-            rng_seed=0,
-            temperature=max(temperature, 1e-7),
-            nucleus_p=1,
-            stop_strings=stop_strings,
-        ),
-    )
-    stream = sampler.SampleText(sample_request)
-    response = "".join([x.text for x in stream])
-    return response
+def call_generate_gserver(prompt, temperature, max_tokens, stop=None, url=None):
+    raise NotImplementedError()


 def call_generate_guidance(
@@ -271,7 +252,7 @@ def add_common_other_args_and_parse(parser: argparse.ArgumentParser):
             "vllm",
             "outlines",
             "lightllm",
-            "ginfer",
+            "gserver",
             "guidance",
             "lmql",
             "srt-raw",
@@ -292,7 +273,7 @@ def add_common_other_args_and_parse(parser: argparse.ArgumentParser):
         "lightllm": 22000,
         "lmql": 23000,
         "srt-raw": 30000,
-        "ginfer": 9988,
+        "gserver": 9988,
     }
     args.port = default_port.get(args.backend, None)
     return args
@@ -328,8 +309,8 @@ def _get_call_generate(args: argparse.Namespace):
         return partial(call_generate_vllm, url=f"{args.host}:{args.port}/generate")
     elif args.backend == "srt-raw":
         return partial(call_generate_srt_raw, url=f"{args.host}:{args.port}/generate")
-    elif args.backend == "ginfer":
-        return partial(call_generate_ginfer, url=f"{args.host}:{args.port}")
+    elif args.backend == "gserver":
+        return partial(call_generate_gserver, url=f"{args.host}:{args.port}")
     elif args.backend == "outlines":
         return partial(call_generate_outlines, url=f"{args.host}:{args.port}/generate")
     elif args.backend == "guidance":
@@ -480,34 +461,36 @@ def run_unittest_files(files: List[str], timeout_per_file: float):
     success = True

     for filename in files:
+        global process

-        def
-
-
-
-
-
-
-
-            p.join()
+        def run_one_file(filename):
+            filename = os.path.join(os.getcwd(), filename)
+            print(f"\n\nRun:\npython3 {filename}\n\n", flush=True)
+            process = subprocess.Popen(
+                ["python3", filename], stdout=None, stderr=None, env=os.environ
+            )
+            process.wait()
+            return process.returncode

         try:
-            run_with_timeout(
-
-
-
+            ret_code = run_with_timeout(
+                run_one_file, args=(filename,), timeout=timeout_per_file
+            )
+            assert ret_code == 0
         except TimeoutError:
-
+            kill_child_process(process.pid)
             time.sleep(5)
             print(
-                f"\nTimeout after {timeout_per_file} seconds when running {filename}\n"
+                f"\nTimeout after {timeout_per_file} seconds when running {filename}\n",
+                flush=True,
             )
-
+            success = False
+            break

     if success:
-        print(f"Success. Time elapsed: {time.time() - tic:.2f}s")
+        print(f"Success. Time elapsed: {time.time() - tic:.2f}s", flush=True)
     else:
-        print(f"Fail. Time elapsed: {time.time() - tic:.2f}s")
+        print(f"Fail. Time elapsed: {time.time() - tic:.2f}s", flush=True)

     return 0 if success else -1
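Two behavioral points stand out in the hunks above: the default ports now depend on the SGLANG_IS_IN_CI environment variable (resolved at import time), and each unittest file runs in its own python3 subprocess that is killed via kill_child_process on timeout. A hedged sketch of how a caller might drive run_unittest_files with these defaults; the file list and timeout value here are hypothetical:

# Hypothetical driver; only run_unittest_files and the CI flag come from the diff above.
import os

# The CI flag must be set before importing test_utils, since the default ports
# are chosen at import time (5157/6157 in CI, 1157/2157 otherwise).
os.environ.setdefault("SGLANG_IS_IN_CI", "false")

from sglang.test.test_utils import run_unittest_files

# Each file is launched as `python3 <file>` in its own subprocess and killed
# via kill_child_process if it exceeds the per-file timeout.
exit_code = run_unittest_files(["test_activation.py"], timeout_per_file=600)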
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.2.13"
+__version__ = "0.2.14"