sglang 0.3.0__py3-none-any.whl → 0.3.1.post1__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to its public registry, and is provided for informational purposes only.
- sglang/bench_latency.py +17 -8
- sglang/bench_serving.py +33 -38
- sglang/global_config.py +5 -17
- sglang/lang/backend/runtime_endpoint.py +5 -2
- sglang/lang/interpreter.py +1 -4
- sglang/launch_server.py +3 -6
- sglang/launch_server_llavavid.py +7 -8
- sglang/srt/{model_config.py → configs/model_config.py} +5 -0
- sglang/srt/constrained/__init__.py +2 -0
- sglang/srt/constrained/fsm_cache.py +33 -38
- sglang/srt/constrained/jump_forward.py +0 -1
- sglang/srt/conversation.py +4 -1
- sglang/srt/hf_transformers_utils.py +1 -3
- sglang/srt/layers/activation.py +12 -0
- sglang/srt/layers/attention_backend.py +480 -0
- sglang/srt/layers/flashinfer_utils.py +235 -0
- sglang/srt/layers/fused_moe/layer.py +27 -7
- sglang/srt/layers/layernorm.py +12 -0
- sglang/srt/layers/logits_processor.py +64 -77
- sglang/srt/layers/radix_attention.py +11 -161
- sglang/srt/layers/sampler.py +38 -122
- sglang/srt/layers/torchao_utils.py +75 -0
- sglang/srt/layers/{decode_attention.py → triton_attention/decode_attention.py} +67 -63
- sglang/srt/layers/{extend_attention.py → triton_attention/extend_attention.py} +40 -132
- sglang/srt/layers/{prefill_attention.py → triton_attention/prefill_attention.py} +13 -7
- sglang/srt/lora/lora.py +403 -0
- sglang/srt/lora/lora_config.py +43 -0
- sglang/srt/lora/lora_manager.py +259 -0
- sglang/srt/managers/controller_multi.py +1 -5
- sglang/srt/managers/controller_single.py +0 -5
- sglang/srt/managers/io_struct.py +16 -1
- sglang/srt/managers/policy_scheduler.py +122 -5
- sglang/srt/managers/schedule_batch.py +105 -71
- sglang/srt/managers/tokenizer_manager.py +17 -8
- sglang/srt/managers/tp_worker.py +188 -121
- sglang/srt/model_executor/cuda_graph_runner.py +69 -133
- sglang/srt/model_executor/forward_batch_info.py +35 -312
- sglang/srt/model_executor/model_runner.py +123 -154
- sglang/srt/models/baichuan.py +416 -0
- sglang/srt/models/chatglm.py +1 -5
- sglang/srt/models/commandr.py +1 -5
- sglang/srt/models/dbrx.py +1 -5
- sglang/srt/models/deepseek.py +1 -5
- sglang/srt/models/deepseek_v2.py +7 -6
- sglang/srt/models/exaone.py +1 -5
- sglang/srt/models/gemma.py +1 -5
- sglang/srt/models/gemma2.py +1 -5
- sglang/srt/models/gpt_bigcode.py +1 -5
- sglang/srt/models/grok.py +1 -5
- sglang/srt/models/internlm2.py +1 -5
- sglang/srt/models/llama.py +51 -5
- sglang/srt/models/llama_classification.py +1 -20
- sglang/srt/models/llava.py +30 -5
- sglang/srt/models/llavavid.py +2 -2
- sglang/srt/models/minicpm.py +1 -5
- sglang/srt/models/minicpm3.py +669 -0
- sglang/srt/models/mixtral.py +6 -5
- sglang/srt/models/mixtral_quant.py +1 -5
- sglang/srt/models/olmoe.py +415 -0
- sglang/srt/models/qwen.py +1 -5
- sglang/srt/models/qwen2.py +1 -5
- sglang/srt/models/qwen2_moe.py +6 -5
- sglang/srt/models/stablelm.py +1 -5
- sglang/srt/models/xverse.py +375 -0
- sglang/srt/models/xverse_moe.py +445 -0
- sglang/srt/openai_api/adapter.py +65 -46
- sglang/srt/openai_api/protocol.py +11 -3
- sglang/srt/sampling/sampling_batch_info.py +46 -80
- sglang/srt/server.py +30 -15
- sglang/srt/server_args.py +163 -28
- sglang/srt/utils.py +19 -51
- sglang/test/few_shot_gsm8k.py +132 -0
- sglang/test/runners.py +114 -22
- sglang/test/test_programs.py +7 -5
- sglang/test/test_utils.py +85 -2
- sglang/utils.py +32 -37
- sglang/version.py +1 -1
- {sglang-0.3.0.dist-info → sglang-0.3.1.post1.dist-info}/METADATA +30 -18
- sglang-0.3.1.post1.dist-info/RECORD +130 -0
- {sglang-0.3.0.dist-info → sglang-0.3.1.post1.dist-info}/WHEEL +1 -1
- sglang-0.3.0.dist-info/RECORD +0 -118
- {sglang-0.3.0.dist-info → sglang-0.3.1.post1.dist-info}/LICENSE +0 -0
- {sglang-0.3.0.dist-info → sglang-0.3.1.post1.dist-info}/top_level.txt +0 -0
sglang/test/runners.py
CHANGED
@@ -21,6 +21,7 @@ from typing import List, Union
 
 import torch
 import torch.nn.functional as F
+from peft import PeftModel
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from sglang.srt.server import Runtime
@@ -50,6 +51,13 @@ def get_dtype_str(torch_dtype):
     raise NotImplementedError()
 
 
+def get_top_logprobs(logits, k):
+    logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
+    del logits
+    logprobs, top_indices = torch.topk(logprobs, k=k, dim=-1)
+    return logprobs
+
+
 @dataclass
 class ModelOutput:
     output_strs: List[str] = None
@@ -65,8 +73,10 @@ class HFRunner:
         model_path,
         torch_dtype,
         is_generation,
+        output_str_only=False,
     ):
         self.is_generation = is_generation
+        self.output_str_only = output_str_only
 
         self.in_queue = mp.Queue()
         self.out_queue = mp.Queue()
@@ -89,7 +99,7 @@ class HFRunner:
         )
 
         if self.is_generation:
-            self.
+            self.base_model = AutoModelForCausalLM.from_pretrained(
                 model_path,
                 torch_dtype=torch_dtype,
                 trust_remote_code=False,
@@ -104,12 +114,16 @@ class HFRunner:
         )
 
         while True:
-            prompts, max_new_tokens = in_queue.get()
+            prompts, max_new_tokens, lora_paths = in_queue.get()
+            if lora_paths is not None:
+                assert len(prompts) == len(lora_paths)
+
             if prompts is not None:
                 if self.is_generation:
                     output_strs = []
-
-
+                    top_input_logprobs = []
+                    top_output_logprobs = []
+                    for i, p in enumerate(prompts):
                         if isinstance(p, str):
                             input_ids = self.tokenizer.encode(
                                 p, return_tensors="pt"
@@ -117,40 +131,68 @@ class HFRunner:
                         else:
                             input_ids = torch.tensor([p], device="cuda")
 
-
-
+                        if lora_paths is not None and lora_paths[i] is not None:
+                            self.model = PeftModel.from_pretrained(
+                                self.base_model,
+                                lora_paths[i],
+                                torch_dtype=torch_dtype,
+                                is_trainable=False,
+                            )
+                        else:
+                            self.model = self.base_model
+
+                        outputs = self.model.generate(
+                            input_ids,
+                            do_sample=False,
+                            temperature=None,
+                            top_p=None,
+                            max_new_tokens=max_new_tokens,
+                            return_dict_in_generate=True,
+                            output_scores=(not self.output_str_only),
                         )
                         output_strs.append(
-                            self.tokenizer.decode(
+                            self.tokenizer.decode(outputs[0][0][len(input_ids[0]) :])
                         )
-                        ... (10 removed lines truncated in the original diff view)
+                        if not self.output_str_only:
+                            # outputs.scores: (num_token, 1, vocab_size)
+                            top_output_logprobs.append(
+                                [
+                                    get_top_logprobs(
+                                        logits[0], NUM_TOP_LOGPROBS
+                                    ).tolist()
+                                    for logits in outputs.scores
+                                ]
+                            )
+                            del outputs
+
+                            input_logits = self.model.forward(input_ids).logits[0]
+                            top_input_logprobs.append(
+                                get_top_logprobs(
+                                    input_logits, NUM_TOP_LOGPROBS
+                                ).tolist()
+                            )
+                            del input_logits
 
                     out_queue.put(
                         ModelOutput(
-                            output_strs=output_strs,
+                            output_strs=output_strs,
+                            top_input_logprobs=top_input_logprobs,
+                            top_output_logprobs=top_output_logprobs,
                         )
                     )
 
                 else:
+                    assert not self.output_str_only
                     logits = self.model.encode(prompts).tolist()
-
                     out_queue.put(ModelOutput(embed_logits=logits))
 
     def forward(
         self,
         prompts: Union[List[str], List[torch.Tensor]] = DEFAULT_PROMPTS,
         max_new_tokens=8,
+        lora_paths=None,
     ):
-        self.in_queue.put((prompts, max_new_tokens))
+        self.in_queue.put((prompts, max_new_tokens, lora_paths))
         return self.out_queue.get()
 
     def terminate(self):
@@ -173,6 +215,10 @@ class SRTRunner:
         is_generation,
         tp_size=1,
         port=DEFAULT_PORT_FOR_SRT_TEST_RUNNER,
+        lora_paths=None,
+        max_loras_per_batch=4,
+        disable_cuda_graph=False,
+        disable_radix_cache=False,
     ):
         self.is_generation = is_generation
         self.runtime = Runtime(
@@ -183,21 +229,28 @@ class SRTRunner:
             mem_fraction_static=0.69,
             trust_remote_code=False,
             is_embedding=not self.is_generation,
+            lora_paths=lora_paths,
+            max_loras_per_batch=max_loras_per_batch,
+            disable_cuda_graph=disable_cuda_graph,
+            disable_radix_cache=disable_radix_cache,
         )
 
     def forward(
         self,
         prompts: Union[List[str], List[torch.Tensor]] = DEFAULT_PROMPTS,
         max_new_tokens=8,
+        lora_paths=None,
    ):
         if self.is_generation:
             # the return value contains logprobs from prefill
             output_strs = []
             top_input_logprobs = []
+            top_output_logprobs = []
             sampling_params = {"max_new_tokens": max_new_tokens, "temperature": 0}
-            for prompt in prompts:
+            for i, prompt in enumerate(prompts):
                 response = self.runtime.generate(
                     prompt,
+                    lora_path=lora_paths[i] if lora_paths else None,
                     sampling_params=sampling_params,
                     return_logprob=True,
                     logprob_start_len=0,
@@ -219,9 +272,48 @@ class SRTRunner:
                         ]
                     ]
                 )
+                top_output_logprobs.append(
+                    [
+                        [tup[0] for tup in x[:NUM_TOP_LOGPROBS]]
+                        for x in response["meta_info"]["output_top_logprobs"]
+                    ]
+                )
+
+            return ModelOutput(
+                output_strs=output_strs,
+                top_input_logprobs=top_input_logprobs,
+                top_output_logprobs=top_output_logprobs,
+            )
+        else:
+            response = self.runtime.encode(prompts)
+            response = json.loads(response)
+            logits = [x["embedding"] for x in response]
+            return ModelOutput(embed_logits=logits)
+
+    def batch_forward(
+        self,
+        prompts: Union[List[str], List[torch.Tensor]] = DEFAULT_PROMPTS,
+        max_new_tokens=8,
+        lora_paths=None,
+    ):
+        """
+        testing serving by sending all prompts once
+        only return output strings and no logprobs
+        """
+        if self.is_generation:
+            # the return value contains logprobs from prefill
+            output_strs = []
+            sampling_params = {"max_new_tokens": max_new_tokens, "temperature": 0}
+            response = self.runtime.generate(
+                prompts,
+                lora_path=lora_paths if lora_paths else None,
+                sampling_params=sampling_params,
+            )
+            response = json.loads(response)
+            output_strs = [r["text"] for r in response]
 
             return ModelOutput(
-                output_strs=output_strs,
+                output_strs=output_strs,
             )
         else:
             response = self.runtime.encode(prompts)
sglang/test/test_programs.py
CHANGED
@@ -7,7 +7,7 @@ import time
 import numpy as np
 
 import sglang as sgl
-from sglang.utils import
+from sglang.utils import download_and_cache_file, read_jsonl
 
 
 def test_few_shot_qa():
@@ -456,10 +456,6 @@ def test_chat_completion_speculative():
 def test_hellaswag_select():
     """Benchmark the accuracy of sgl.select on the HellaSwag dataset."""
 
-    url = "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl"
-    lines = fetch_and_cache_jsonl(url)
-
-    # Construct prompts
     def get_one_example(lines, i, include_answer):
         ret = lines[i]["activity_label"] + ": " + lines[i]["ctx"] + " "
         if include_answer:
@@ -472,6 +468,12 @@ def test_hellaswag_select():
         ret += get_one_example(lines, i, True) + "\n\n"
         return ret
 
+    # Read data
+    url = "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl"
+    filename = download_and_cache_file(url)
+    lines = list(read_jsonl(filename))
+
+    # Construct prompts
     num_questions = 200
     num_shots = 20
     few_shot_examples = get_few_shot_examples(lines, num_shots)
sglang/test/test_utils.py
CHANGED
@@ -7,6 +7,7 @@ import subprocess
 import threading
 import time
 from functools import partial
+from types import SimpleNamespace
 from typing import Callable, List, Optional
 
 import numpy as np
@@ -14,6 +15,7 @@ import requests
 import torch
 import torch.nn.functional as F
 
+from sglang.bench_serving import run_benchmark
 from sglang.global_config import global_config
 from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
@@ -28,7 +30,13 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Meta-Llama-3.1-70B-Instruc
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8"
 
-
+
+def is_in_ci():
+    """Return whether it is in CI runner."""
+    return os.getenv("SGLANG_IS_IN_CI", "false") == "true"
+
+
+if is_in_ci():
     DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 5157
     DEFAULT_URL_FOR_TEST = "http://127.0.0.1:6157"
 else:
@@ -296,7 +304,6 @@ def add_common_sglang_args_and_parse(parser: argparse.ArgumentParser):
 def select_sglang_backend(args: argparse.Namespace):
     if args.backend.startswith("srt"):
         if args.backend == "srt-no-parallel":
-            global_config.enable_parallel_decoding = False
             global_config.enable_parallel_encoding = False
         backend = RuntimeEndpoint(f"{args.host}:{args.port}")
     elif args.backend.startswith("gpt-"):
@@ -501,3 +508,79 @@ def run_unittest_files(files: List[str], timeout_per_file: float):
 
 def get_similarities(vec1, vec2):
     return F.cosine_similarity(torch.tensor(vec1), torch.tensor(vec2), dim=0)
+
+
+def run_bench_serving(model, num_prompts, request_rate, other_server_args):
+    # Launch the server
+    base_url = DEFAULT_URL_FOR_TEST
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_server_args,
+    )
+
+    # Run benchmark
+    args = SimpleNamespace(
+        backend="sglang",
+        base_url=base_url,
+        host=None,
+        port=None,
+        dataset_name="random",
+        dataset_path="",
+        model=None,
+        tokenizer=None,
+        num_prompts=num_prompts,
+        sharegpt_output_len=None,
+        random_input_len=4096,
+        random_output_len=2048,
+        random_range_ratio=0.0,
+        request_rate=request_rate,
+        multi=None,
+        seed=0,
+        output_file=None,
+        disable_tqdm=False,
+        disable_stream=False,
+        disable_ignore_eos=False,
+        extra_request_body=None,
+    )
+
+    try:
+        res = run_benchmark(args)
+    finally:
+        kill_child_process(process.pid)
+
+    assert res["completed"] == num_prompts
+    return res
+
+
+def run_bench_latency(model, other_args):
+    command = [
+        "python3",
+        "-m",
+        "sglang.bench_latency",
+        "--model-path",
+        model,
+        "--batch-size",
+        "1",
+        "--input",
+        "128",
+        "--output",
+        "8",
+        *other_args,
+    ]
+    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+    try:
+        stdout, stderr = process.communicate()
+        output = stdout.decode()
+        error = stderr.decode()
+        print(f"Output: {output}", flush=True)
+        print(f"Error: {error}", flush=True)
+
+        lastline = output.split("\n")[-3]
+        output_throughput = float(lastline.split(" ")[-2])
+    finally:
+        kill_child_process(process.pid)
+
+    return output_throughput
sglang/utils.py
CHANGED
@@ -12,7 +12,7 @@ import urllib.request
 from concurrent.futures import ThreadPoolExecutor
 from io import BytesIO
 from json import dumps
-from typing import Union
+from typing import Optional, Union
 
 import numpy as np
 import requests
@@ -38,13 +38,11 @@ def is_same_type(values: list):
 
 def read_jsonl(filename: str):
     """Read a JSONL file."""
-    rets = []
     with open(filename) as fin:
         for line in fin:
             if line.startswith("#"):
                 continue
-
-    return rets
+            yield json.loads(line)
 
 
 def dump_state_text(filename: str, states: list, mode: str = "w"):
@@ -264,38 +262,35 @@ class LazyImport:
         return module(*args, **kwargs)
 
 
-def
-    """Read and cache a
+def download_and_cache_file(url: str, filename: Optional[str] = None):
+    """Read and cache a file from a url."""
+    if filename is None:
+        filename = os.path.join("/tmp", url.split("/")[-1])
 
     # Check if the cache file already exists
-    if os.path.exists(
-    ... (25 removed lines truncated in the original diff view)
-    # Convert the data to a list of dictionaries
-    with open(cache_file, "r") as f:
-        data = [json.loads(line) for line in f]
-
-    return data
+    if os.path.exists(filename):
+        return filename
+
+    print(f"Downloading from {url} to {filename}")
+
+    # Stream the response to show the progress bar
+    response = requests.get(url, stream=True)
+    response.raise_for_status()  # Check for request errors
+
+    # Total size of the file in bytes
+    total_size = int(response.headers.get("content-length", 0))
+    chunk_size = 1024  # Download in chunks of 1KB
+
+    # Use tqdm to display the progress bar
+    with open(filename, "wb") as f, tqdm(
+        desc=filename,
+        total=total_size,
+        unit="B",
+        unit_scale=True,
+        unit_divisor=1024,
+    ) as bar:
+        for chunk in response.iter_content(chunk_size=chunk_size):
+            f.write(chunk)
+            bar.update(len(chunk))
+
+    return filename
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.3.0"
+__version__ = "0.3.1.post1"
{sglang-0.3.0.dist-info → sglang-0.3.1.post1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.0
+Version: 0.3.1.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                            Version 2.0, January 2004
@@ -242,6 +242,7 @@ Requires-Dist: psutil; extra == "srt"
 Requires-Dist: pydantic; extra == "srt"
 Requires-Dist: python-multipart; extra == "srt"
 Requires-Dist: torch; extra == "srt"
+Requires-Dist: torchao; extra == "srt"
 Requires-Dist: uvicorn; extra == "srt"
 Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: zmq; extra == "srt"
@@ -253,6 +254,7 @@ Requires-Dist: matplotlib; extra == "test"
 Requires-Dist: pandas; extra == "test"
 Requires-Dist: sentence-transformers; extra == "test"
 Requires-Dist: accelerate; extra == "test"
+Requires-Dist: peft; extra == "test"
 
 <div align="center">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
@@ -271,14 +273,16 @@ Requires-Dist: accelerate; extra == "test"
 
 SGLang is a fast serving framework for large language models and vision language models.
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
-
 The core features include:
-
-- **
+
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
+- **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
+- **Active Community**: SGLang is open-source and backed by an active community with industry adoption, welcoming contributions to improve LLM and VLM serving.
 
 ## News
+- [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
 - [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
-- [2024/08] 🔥 LLaVA-OneVision with single-image, multi-image and video are supported ([blog](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 
 <details>
@@ -300,6 +304,8 @@ The core features include:
 
 ## Install
 
+You can install SGLang using any of the methods below.
+
 ### Method 1: With pip
 ```
 pip install --upgrade pip
@@ -312,7 +318,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.0 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.1.post1 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -323,7 +329,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```
 
 ### Method 3: Using docker
-The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](docker).
+The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](https://github.com/sgl-project/sglang/tree/main/docker).
 Replace `<secret>` below with your huggingface hub [token](https://huggingface.co/docs/hub/en/security-tokens).
 
 ```bash
@@ -391,7 +397,7 @@ sky status --endpoint 30000 sglang
 
 
 ### Common Notes
-- [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is
+- [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is the default attention kernel backend. It only supports sm75 and above. If you encounter any FlashInfer-related issues on sm75+ devices (e.g., T4, A10, A100, L4, L40S, H100), please switch to other kernels by adding `--attention-backend triton --sampling-backend pytorch` and open an issue on GitHub.
 - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
 
 ## Backend: SGLang Runtime (SRT)
@@ -457,24 +463,29 @@ print(response)
 It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
 
 ### Additional Server Arguments
--
+- To enable multi-GPU tensor parallelism, add `--tp 2`. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
 ```
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 2
 ```
--
+- To enable multi-GPU data parallelism, add `--dp 2`. Data parallelism is better for throughput if there is enough memory. It can also be used together with tensor parallelism. The following command uses 4 GPUs in total.
 ```
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --dp 2 --tp 2
 ```
 - If you see out-of-memory errors during serving, try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
 ```
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --mem-fraction-static 0.7
 ```
 - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
 - If you see out-of-memory errors during prefill for long prompts, try to set a smaller chunked prefill size.
 ```
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
 ```
--
+- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
+- To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
+- To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
+- To enable DeepSeek MLA acceleration, add `--enable-mla`.
+- If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
+- To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```
 # Node 0
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 0
@@ -482,9 +493,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 # Node 1
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
 ```
-- If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
-- To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
-- To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 
 ### Supported Models
 
@@ -510,6 +518,10 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - ChatGLM
 - InternLM 2
 - Exaone 3
+- BaiChuan2
+- MiniCPM / MiniCPM 3
+- XVERSE / XVERSE MoE
+
 
 **Embedding Models**
 