sglang 0.2.9.post1__py3-none-any.whl → 0.2.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_latency.py +114 -63
- sglang/check_env.py +1 -0
- sglang/lang/backend/runtime_endpoint.py +0 -11
- sglang/srt/hf_transformers_utils.py +2 -2
- sglang/srt/layers/extend_attention.py +59 -7
- sglang/srt/layers/radix_attention.py +22 -9
- sglang/srt/layers/token_attention.py +28 -2
- sglang/srt/managers/io_struct.py +9 -4
- sglang/srt/managers/schedule_batch.py +15 -11
- sglang/srt/managers/tokenizer_manager.py +28 -13
- sglang/srt/mem_cache/memory_pool.py +65 -24
- sglang/srt/model_config.py +11 -0
- sglang/srt/model_executor/model_runner.py +46 -17
- sglang/srt/models/deepseek_v2.py +198 -16
- sglang/srt/openai_api/adapter.py +120 -20
- sglang/srt/openai_api/protocol.py +1 -1
- sglang/srt/server.py +87 -78
- sglang/srt/server_args.py +8 -2
- sglang/srt/utils.py +25 -20
- sglang/test/run_eval.py +21 -10
- sglang/test/runners.py +237 -0
- sglang/test/simple_eval_common.py +12 -12
- sglang/test/simple_eval_gpqa.py +92 -0
- sglang/test/simple_eval_humaneval.py +5 -5
- sglang/test/simple_eval_math.py +72 -0
- sglang/test/test_utils.py +94 -13
- sglang/utils.py +15 -37
- sglang/version.py +1 -1
- {sglang-0.2.9.post1.dist-info → sglang-0.2.10.dist-info}/METADATA +29 -28
- {sglang-0.2.9.post1.dist-info → sglang-0.2.10.dist-info}/RECORD +33 -30
- {sglang-0.2.9.post1.dist-info → sglang-0.2.10.dist-info}/LICENSE +0 -0
- {sglang-0.2.9.post1.dist-info → sglang-0.2.10.dist-info}/WHEEL +0 -0
- {sglang-0.2.9.post1.dist-info → sglang-0.2.10.dist-info}/top_level.txt +0 -0
sglang/test/run_eval.py
CHANGED
@@ -10,7 +10,6 @@ import time
 
 from sglang.test.simple_eval_common import (
     ChatCompletionSampler,
-    download_dataset,
     make_report,
     set_ulimit,
 )
@@ -27,14 +26,26 @@ def run_eval(args):
     if args.eval_name == "mmlu":
         from sglang.test.simple_eval_mmlu import MMLUEval
 
-
-
-
-
-
-
-
-
+        filename = "https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv"
+        eval_obj = MMLUEval(filename, args.num_examples, args.num_threads)
+    elif args.eval_name == "math":
+        from sglang.test.simple_eval_math import MathEval
+
+        equality_checker = ChatCompletionSampler(model="gpt-4-turbo")
+
+        filename = (
+            "https://openaipublic.blob.core.windows.net/simple-evals/math_test.csv"
+        )
+        eval_obj = MathEval(
+            filename, equality_checker, args.num_examples, args.num_threads
+        )
+    elif args.eval_name == "gpqa":
+        from sglang.test.simple_eval_gpqa import GPQAEval
+
+        filename = (
+            "https://openaipublic.blob.core.windows.net/simple-evals/gpqa_diamond.csv"
+        )
+        eval_obj = GPQAEval(filename, args.num_examples, args.num_threads)
     elif args.eval_name == "humaneval":
         from sglang.test.simple_eval_humaneval import HumanEval
 
@@ -97,7 +108,7 @@ if __name__ == "__main__":
     )
     parser.add_argument("--eval-name", type=str, default="mmlu")
    parser.add_argument("--num-examples", type=int)
-    parser.add_argument("--num-threads", type=int, default=
+    parser.add_argument("--num-threads", type=int, default=512)
     set_ulimit()
     args = parser.parse_args()
 
sglang/test/runners.py
ADDED
@@ -0,0 +1,237 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import json
+import multiprocessing
+from dataclasses import dataclass
+from typing import List, Union
+
+import torch
+import torch.nn.functional as F
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from sglang.srt.server import Runtime
+
+DEFAULT_PROMPTS = [
+    "The capital of France is",
+    "The capital of the United Kindom is",
+    "Today is a sunny day and I like",
+]
+
+NUM_TOP_LOGPROBS = 5
+
+
+def is_embedding_model(model_path):
+    # FIXME incomplete list
+    if "e5-mistral-7b-instruct" in model_path.lower():
+        return True
+    return False
+
+
+def get_dtype_str(torch_dtype):
+    if torch_dtype is torch.float16:
+        return "float16"
+    else:
+        raise NotImplementedError()
+
+
+@dataclass
+class ModelOutput:
+    output_strs: str = None
+    top_input_logprobs: torch.Tensor = None
+    top_output_logprobs: torch.Tensor = None
+    embed_logits: torch.Tensor = None
+
+
+class HFRunner:
+    def __init__(
+        self,
+        model_path,
+        torch_dtype=torch.float16,
+        is_embedding_model=None,
+    ):
+        self.in_queue = multiprocessing.Queue()
+        self.out_queue = multiprocessing.Queue()
+
+        self.model_proc = multiprocessing.Process(
+            target=self.start_model_process,
+            args=(
+                self.in_queue,
+                self.out_queue,
+                model_path,
+                torch_dtype,
+                is_embedding_model,
+            ),
+        )
+        self.model_proc.start()
+
+    def start_model_process(
+        self, in_queue, out_queue, model_path, torch_dtype, is_embedding_model
+    ):
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            model_path,
+            torch_dtype=torch_dtype,
+            trust_remote_code=True,
+        )
+
+        self.is_embedding_model = (
+            is_embedding_model(model_path)
+            if is_embedding_model is None
+            else is_embedding_model
+        )
+        if not self.is_embedding_model:
+            self.model = AutoModelForCausalLM.from_pretrained(
+                model_path,
+                torch_dtype=torch_dtype,
+                low_cpu_mem_usage=True,
+                trust_remote_code=True,
+            ).cuda()
+        else:
+            from sentence_transformers import SentenceTransformer
+
+            self.model = SentenceTransformer(
+                model_path,
+                device="cpu",
+            ).to(dtype=torch_dtype)
+
+        while True:
+            prompts, max_new_tokens = in_queue.get()
+            if prompts is not None:
+                if not self.is_embedding_model:
+                    output_strs = []
+                    prefill_logprobs = []
+                    for p in prompts:
+                        if isinstance(p, str):
+                            input_ids = self.tokenizer.encode(
+                                p, return_tensors="pt"
+                            ).cuda()
+                        else:
+                            input_ids = torch.tensor([p], device="cuda")
+
+                        output_ids = self.model.generate(
+                            input_ids, do_sample=False, max_new_tokens=max_new_tokens
+                        )
+                        output_strs.append(self.tokenizer.decode(output_ids[0]))
+
+                        logits = self.model.forward(input_ids).logits[0]
+                        logprobs = F.log_softmax(
+                            logits, dim=-1, dtype=torch.float32
+                        ).tolist()
+                        # index_of_max = (lambda nums: nums.index(max(nums)))(logprobs[-1])
+                        # print("index", index_of_max)
+                        logprobs = [
+                            sorted(token_logprobs, reverse=True)[:NUM_TOP_LOGPROBS]
+                            for token_logprobs in logprobs
+                        ]
+                        prefill_logprobs.append(logprobs)
+
+                    out_queue.put(
+                        ModelOutput(
+                            output_strs=output_strs, top_input_logprobs=prefill_logprobs
+                        )
+                    )
+
+                else:
+                    assert isinstance(prompts, List[str])
+                    logits = self.model.encode(prompts).tolist()
+
+                    out_queue.put(ModelOutput(embed_logits=logits))
+
+    def forward(
+        self,
+        prompts: Union[List[str], List[torch.Tensor]] = DEFAULT_PROMPTS,
+        max_new_tokens=64,
+    ):
+        self.in_queue.put((prompts, max_new_tokens))
+        return self.out_queue.get()
+
+    def terminate(self):
+        self.model_proc.terminate()
+        self.in_queue = self.out_queue = None
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.model_proc.terminate()
+        self.in_queue = self.out_queue = None
+
+
+class SRTRunner:
+    def __init__(
+        self,
+        model_path,
+        tp_size=1,
+        torch_dtype=torch.float16,
+        is_embedding_model=None,
+    ):
+        self.is_embedding_model = (
+            is_embedding_model(model_path)
+            if is_embedding_model is None
+            else is_embedding_model
+        )
+        if self.is_embedding_model:
+            raise NotImplementedError()
+
+        self.runtime = Runtime(
+            model_path=model_path,
+            tp_size=tp_size,
+            dtype=get_dtype_str(torch_dtype),
+        )
+
+    def forward(
+        self,
+        prompts: Union[List[str], List[torch.Tensor]] = DEFAULT_PROMPTS,
+        max_new_tokens=64,
+    ):
+        # the return value contains logprobs from prefill
+        output_strs = []
+        top_input_logprobs = []
+        sampling_params = {"max_new_tokens": max_new_tokens, "temperature": 0}
+        for prompt in prompts:
+            response = self.runtime.generate(
+                prompt,
+                sampling_params=sampling_params,
+                return_logprob=True,
+                top_logprobs_num=NUM_TOP_LOGPROBS,
+            )
+            response = json.loads(response)
+            output_strs.append(response["text"])
+            top_input_logprobs.append(
+                [
+                    [tup[0] for tup in x[:NUM_TOP_LOGPROBS]]
+                    for x in response["meta_info"]["input_top_logprobs"][1:]
+                ]
+                + [
+                    [
+                        tup[0]
+                        for tup in response["meta_info"]["output_top_logprobs"][0][
+                            :NUM_TOP_LOGPROBS
+                        ]
+                    ]
+                ]
+            )
+            # print(response["meta_info"]["output_top_logprobs"][0])
+
+        return ModelOutput(
+            output_strs=output_strs, top_input_logprobs=top_input_logprobs
+        )
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.runtime.shutdown()
+        del self.runtime
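The two runners above expose the same process-isolated `forward()` interface, which is what makes HuggingFace-vs-SRT parity checks easy to write. A minimal sketch of such a check; the model path, token budget, and the idea of printing completions side by side are illustrative choices, not part of this release:

```python
import torch

from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner

MODEL_PATH = "meta-llama/Llama-2-7b-chat-hf"  # placeholder model, not from this diff

# Each runner loads the model in its own process; run them one after the other
# to avoid keeping two copies of the weights in GPU memory at once.
with HFRunner(MODEL_PATH, torch_dtype=torch.float16) as hf_runner:
    hf_out = hf_runner.forward(DEFAULT_PROMPTS, max_new_tokens=8)

with SRTRunner(MODEL_PATH, tp_size=1, torch_dtype=torch.float16) as srt_runner:
    srt_out = srt_runner.forward(DEFAULT_PROMPTS, max_new_tokens=8)

# Rough parity check on the greedy completions; top_input_logprobs can be
# compared position by position in the same way (both hold top-5 logprobs).
for prompt, hf_text, srt_text in zip(
    DEFAULT_PROMPTS, hf_out.output_strs, srt_out.output_strs
):
    print(prompt, "\n  hf :", hf_text, "\n  srt:", srt_text)
```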
sglang/test/simple_eval_common.py
CHANGED
@@ -7,7 +7,7 @@ import time
 from collections import defaultdict
 from dataclasses import dataclass, field
 from multiprocessing.pool import ThreadPool
-from typing import Any
+from typing import Any, Dict, List, Tuple
 
 import httpx
 import jinja2
@@ -24,8 +24,8 @@ OPENAI_SYSTEM_MESSAGE_CHATGPT = (
 )
 
 
-Message =
-MessageList =
+Message = Dict[str, Any]  # keys role, content
+MessageList = List[Message]
 
 
 class SamplerBase:
@@ -45,9 +45,9 @@ class EvalResult:
     """
 
     score: float | None  # top-line metric
-    metrics:
-    htmls:
-    convos:
+    metrics: Dict[str, float] | None  # other metrics
+    htmls: List[str]  # strings of valid HTML
+    convos: List[MessageList]  # sampled conversations
 
 
 @dataclass
@@ -57,7 +57,7 @@ class SingleEvalResult:
     """
 
     score: float | None
-    metrics:
+    metrics: Dict[str, float] = field(default_factory=dict)
     html: str | None = None
     convo: MessageList | None = None  # sampled conversation
 
@@ -270,9 +270,9 @@ def _compute_stat(values: list, stat: str):
 
 
 def aggregate_results(
-    single_eval_results:
-    default_stats:
-    name2stats:
+    single_eval_results: List[SingleEvalResult],
+    default_stats: Tuple[str] = ("mean", "std"),
+    name2stats: Dict[str, Tuple[str]] | None = None,
 ) -> EvalResult:
     """
     Aggregate results from multiple evaluations into a single EvalResult.
@@ -302,7 +302,7 @@ def aggregate_results(
     )
 
 
-def map_with_progress(f: callable, xs:
+def map_with_progress(f: callable, xs: List[Any], num_threads: int):
     """
     Apply f to each element of xs, using a ThreadPool, and show progress.
     """
@@ -422,7 +422,7 @@ def make_report(eval_result: EvalResult) -> str:
     )
 
 
-def make_report_from_example_htmls(htmls:
+def make_report_from_example_htmls(htmls: List[str]):
     """
     Create a standalone HTML report from a list of example htmls
     """
sglang/test/simple_eval_gpqa.py
ADDED
@@ -0,0 +1,92 @@
+# Adapted from https://github.com/openai/simple-evals/
+
+"""
+GPQA: A Graduate-Level Google-Proof Q&A Benchmark
+David Rein, Betty Li Hou, Asa Cooper Stickland, Jackson Petty, Richard Yuanzhe Pang, Julien Dirani, Julian Michael, Samuel R. Bowman
+https://arxiv.org/abs/2311.12022
+"""
+
+import random
+import re
+
+import pandas
+
+from sglang.test import simple_eval_common as common
+from sglang.test.simple_eval_common import (
+    ANSWER_PATTERN_MULTICHOICE,
+    HTML_JINJA,
+    Eval,
+    EvalResult,
+    MessageList,
+    SamplerBase,
+    SingleEvalResult,
+    format_multichoice_question,
+)
+
+
+class GPQAEval(Eval):
+    def __init__(
+        self,
+        filename: str,
+        num_examples: int | None,
+        num_threads: int,
+        n_repeats: int = 1,
+    ):
+        df = pandas.read_csv(filename)
+        examples = [row.to_dict() for _, row in df.iterrows()]
+        rng = random.Random(0)
+        if num_examples:
+            assert n_repeats == 1, "n_repeats only supported for num_examples"
+            examples = rng.sample(examples, num_examples)
+        examples = examples * n_repeats
+        examples = [
+            example | {"permutation": rng.sample(range(4), 4)} for example in examples
+        ]
+        self.examples = examples
+        self.n_repeats = n_repeats
+        self.num_threads = num_threads
+
+    def __call__(self, sampler: SamplerBase) -> EvalResult:
+        def fn(row: dict):
+            choices = [
+                row["Correct Answer"],
+                row["Incorrect Answer 1"],
+                row["Incorrect Answer 2"],
+                row["Incorrect Answer 3"],
+            ]
+            choices = [choices[i] for i in row["permutation"]]
+            correct_index = choices.index(row["Correct Answer"])
+            correct_answer = "ABCD"[correct_index]
+            choices_dict = dict(
+                A=choices[0],
+                B=choices[1],
+                C=choices[2],
+                D=choices[3],
+                Question=row["Question"],
+            )
+            prompt_messages = [
+                sampler._pack_message(
+                    content=format_multichoice_question(choices_dict), role="user"
+                )
+            ]
+            response_text = sampler(prompt_messages)
+            match = re.search(ANSWER_PATTERN_MULTICHOICE, response_text)
+            extracted_answer = match.group(1) if match else None
+            score = 1.0 if extracted_answer == correct_answer else 0.0
+            html = common.jinja_env.from_string(HTML_JINJA).render(
+                prompt_messages=prompt_messages,
+                next_message=dict(content=response_text, role="assistant"),
+                score=score,
+                correct_answer=correct_answer,
+                extracted_answer=extracted_answer,
+            )
+            convo = prompt_messages + [dict(content=response_text, role="assistant")]
+            return SingleEvalResult(
+                html=html,
+                score=score,
+                convo=convo,
+                metrics={"chars": len(response_text)},
+            )
+
+        results = common.map_with_progress(fn, self.examples, self.num_threads)
+        return common.aggregate_results(results)
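GPQAEval follows the same pattern as the other simple-evals ports in this release: construct it with a CSV path and call it with any SamplerBase. A rough usage sketch; the example counts, output filename, and the ChatCompletionSampler model name are placeholders, and only the `model` constructor argument is confirmed by this diff:

```python
from sglang.test.simple_eval_common import ChatCompletionSampler, make_report
from sglang.test.simple_eval_gpqa import GPQAEval

filename = "https://openaipublic.blob.core.windows.net/simple-evals/gpqa_diamond.csv"
eval_obj = GPQAEval(filename, num_examples=32, num_threads=16)

# The sampler is whatever chat-completions endpoint is being graded.
sampler = ChatCompletionSampler(model="gpt-4-turbo")

result = eval_obj(sampler)  # EvalResult with score, metrics, htmls, convos
print(result.score, result.metrics)

with open("gpqa_report.html", "w") as f:
    f.write(make_report(result))  # standalone HTML report
```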
sglang/test/simple_eval_humaneval.py
CHANGED
@@ -14,7 +14,7 @@ import re
 from collections import Counter, defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from io import BytesIO
-from typing import Any, Tuple
+from typing import Any, Dict, List, Tuple
 
 import blobfile as bf
 import tqdm
@@ -38,8 +38,8 @@ from sglang.test.simple_eval_common import (
 
 
 def evaluate_functional_correctness(
-    sample:
-    completions:
+    sample: Dict[str, str],
+    completions: List[str],
     n_workers: int = 4,
     timeout: float = 3.0,
 ):
@@ -70,7 +70,7 @@ class HumanEval(Eval):
         num_examples: int | None,
         num_threads: int,
         num_samples_per_task: int = 5,
-        ks_passes:
+        ks_passes: List[int] = [1, 2, 5],
         timeout: int = 120,
     ):
         self.seed = 0
@@ -97,7 +97,7 @@ class HumanEval(Eval):
            ]  # remove signature
            return extracted_answer
 
-        def fn(sample:
+        def fn(sample: Dict[str, str]):
            prompt_messages = [
                sampler._pack_message(
                    role="user", content=instruction + sample["prompt"]
sglang/test/simple_eval_math.py
ADDED
@@ -0,0 +1,72 @@
+# Adapted from https://github.com/openai/simple-evals/
+
+"""
+Measuring Mathematical Problem Solving With the MATH Dataset
+Dan Hendrycks, Collin Burns, Saurav Kadavath, Akul Arora, Steven Basart, Eric Tang, Dawn Song, Jacob Steinhardt
+https://arxiv.org/abs/2103.03874
+"""
+
+import random
+import re
+
+import pandas
+
+from sglang.test import simple_eval_common as common
+from sglang.test.simple_eval_common import (
+    ANSWER_PATTERN,
+    HTML_JINJA,
+    Eval,
+    EvalResult,
+    SamplerBase,
+    SingleEvalResult,
+    check_equality,
+)
+
+QUERY_TEMPLATE = """
+Solve the following math problem step by step. The last line of your response should be of the form Answer: $ANSWER (without quotes) where $ANSWER is the answer to the problem.
+
+{Question}
+
+Remember to put your answer on its own line after "Answer:", and you do not need to use a \\boxed command.
+""".strip()
+
+
+class MathEval(Eval):
+    def __init__(
+        self,
+        filename: str,
+        equality_checker: SamplerBase,
+        num_examples: int | None,
+        num_threads: int,
+    ):
+        df = pandas.read_csv(filename)
+        examples = [row.to_dict() for _, row in df.iterrows()]
+        if num_examples:
+            examples = random.Random(0).sample(examples, num_examples)
+        self.examples = examples
+        self.equality_checker = equality_checker
+        self.num_threads = num_threads
+
+    def __call__(self, sampler: SamplerBase) -> EvalResult:
+        def fn(row: dict):
+            prompt_messages = [
+                sampler._pack_message(content=QUERY_TEMPLATE.format(**row), role="user")
+            ]
+            response_text = sampler(prompt_messages)
+            match = re.search(ANSWER_PATTERN, response_text)
+            extracted_answer = match.group(1) if match else None
+            score = float(
+                check_equality(self.equality_checker, row["Answer"], extracted_answer)
+            )
+            html = common.jinja_env.from_string(HTML_JINJA).render(
+                prompt_messages=prompt_messages,
+                next_message=dict(content=response_text, role="assistant"),
+                score=score,
+                correct_answer=row["Answer"],
+                extracted_answer=extracted_answer,
+            )
+            convo = prompt_messages + [dict(content=response_text, role="assistant")]
+            return SingleEvalResult(html=html, score=score, convo=convo)
+
+        results = common.map_with_progress(fn, self.examples, self.num_threads)
+        return common.aggregate_results(results)
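Unlike the other eval ports, MathEval delegates scoring to a second sampler through check_equality, mirroring how run_eval.py wires it up with ChatCompletionSampler(model="gpt-4-turbo"). A hedged sketch of standalone use; the example counts and the "my-served-model" name are placeholders, not part of this diff:

```python
from sglang.test.simple_eval_common import ChatCompletionSampler
from sglang.test.simple_eval_math import MathEval

filename = "https://openaipublic.blob.core.windows.net/simple-evals/math_test.csv"

# A second, stronger sampler acts as the judge for answer equivalence,
# exactly as run_eval.py wires it up for the "math" eval name.
equality_checker = ChatCompletionSampler(model="gpt-4-turbo")

eval_obj = MathEval(filename, equality_checker, num_examples=50, num_threads=16)

# The sampler under test; the model name here is a placeholder.
result = eval_obj(ChatCompletionSampler(model="my-served-model"))
print(f"MATH score: {result.score:.3f}")
```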