sglang 0.4.1.post7__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- sglang/bench_offline_throughput.py +17 -11
- sglang/bench_one_batch.py +14 -6
- sglang/bench_serving.py +47 -44
- sglang/lang/chat_template.py +31 -0
- sglang/srt/configs/load_config.py +1 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +5 -2
- sglang/srt/entrypoints/engine.py +5 -2
- sglang/srt/entrypoints/http_server.py +24 -0
- sglang/srt/function_call_parser.py +494 -0
- sglang/srt/layers/activation.py +5 -5
- sglang/srt/layers/dp_attention.py +3 -1
- sglang/srt/layers/layernorm.py +5 -5
- sglang/srt/layers/linear.py +24 -9
- sglang/srt/layers/logits_processor.py +1 -1
- sglang/srt/layers/moe/ep_moe/layer.py +20 -12
- sglang/srt/layers/moe/fused_moe_native.py +17 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +18 -1
- sglang/srt/layers/moe/fused_moe_triton/layer.py +9 -0
- sglang/srt/layers/parameter.py +16 -7
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/fp8.py +4 -1
- sglang/srt/layers/rotary_embedding.py +6 -1
- sglang/srt/layers/sampler.py +28 -8
- sglang/srt/layers/torchao_utils.py +12 -6
- sglang/srt/managers/detokenizer_manager.py +1 -0
- sglang/srt/managers/io_struct.py +36 -5
- sglang/srt/managers/schedule_batch.py +31 -25
- sglang/srt/managers/scheduler.py +61 -35
- sglang/srt/managers/tokenizer_manager.py +4 -0
- sglang/srt/model_executor/cuda_graph_runner.py +23 -25
- sglang/srt/model_executor/forward_batch_info.py +5 -7
- sglang/srt/model_executor/model_runner.py +7 -4
- sglang/srt/model_loader/loader.py +75 -0
- sglang/srt/model_loader/weight_utils.py +91 -5
- sglang/srt/models/commandr.py +14 -2
- sglang/srt/models/dbrx.py +9 -1
- sglang/srt/models/deepseek_v2.py +3 -3
- sglang/srt/models/gemma2.py +9 -1
- sglang/srt/models/grok.py +1 -0
- sglang/srt/models/minicpm3.py +3 -3
- sglang/srt/models/torch_native_llama.py +17 -4
- sglang/srt/openai_api/adapter.py +139 -37
- sglang/srt/openai_api/protocol.py +5 -4
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +11 -14
- sglang/srt/sampling/sampling_batch_info.py +4 -14
- sglang/srt/server.py +2 -2
- sglang/srt/server_args.py +20 -1
- sglang/srt/speculative/eagle_utils.py +37 -15
- sglang/srt/speculative/eagle_worker.py +11 -13
- sglang/srt/utils.py +62 -65
- sglang/test/test_programs.py +1 -0
- sglang/test/test_utils.py +81 -22
- sglang/version.py +1 -1
- {sglang-0.4.1.post7.dist-info → sglang-0.4.2.dist-info}/METADATA +7 -7
- {sglang-0.4.1.post7.dist-info → sglang-0.4.2.dist-info}/RECORD +67 -56
- {sglang-0.4.1.post7.dist-info → sglang-0.4.2.dist-info}/LICENSE +0 -0
- {sglang-0.4.1.post7.dist-info → sglang-0.4.2.dist-info}/WHEEL +0 -0
- {sglang-0.4.1.post7.dist-info → sglang-0.4.2.dist-info}/top_level.txt +0 -0
sglang/srt/utils.py
CHANGED
@@ -14,6 +14,7 @@
 """Common utilities."""
 
 import base64
+import ctypes
 import dataclasses
 import io
 import ipaddress
@@ -29,6 +30,7 @@ import shutil
 import signal
 import socket
 import subprocess
+import sys
 import tempfile
 import time
 import warnings
@@ -59,7 +61,6 @@ from triton.runtime.cache import (
     default_dump_dir,
     default_override_dir,
 )
-from uvicorn.config import LOGGING_CONFIG
 
 logger = logging.getLogger(__name__)
 
@@ -73,7 +74,7 @@ def is_hip() -> bool:
 
 
 def is_cuda():
-    return hasattr(torch, "cuda") and torch.cuda
+    return hasattr(torch, "cuda") and torch.version.cuda is not None
 
 
 def is_cuda_alike():
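The revised is_cuda() check keys off torch.version.cuda, which is a version string on CUDA builds of PyTorch but None on ROCm builds, even though the torch.cuda module also exists there. A quick illustrative check (not part of the diff):

```python
import torch

# On a CUDA wheel: torch.version.cuda is a string such as "12.1" and torch.version.hip is None.
# On a ROCm wheel: torch.version.cuda is None and torch.version.hip is set,
# while the torch.cuda module remains importable in both cases.
print(torch.version.cuda, torch.version.hip)
```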
@@ -773,7 +774,7 @@ def get_zmq_socket(
 
 
 def dump_to_file(dirpath, name, value):
-    from
+    from sglang.srt.distributed import get_tensor_model_parallel_rank
 
     if get_tensor_model_parallel_rank() != 0:
         return
@@ -1242,68 +1243,6 @@ def dataclass_to_string_truncated(data, max_length=2048):
     return str(data)
 
 
-TOOLS_TAG_LIST = ["<|plugin|>", "<function=", "<tool_call>", "<|python_tag|>"]
-
-
-def parse_tool_response(text, tools, **kwargs):
-    """Parse model response containing tool information.
-
-    Args:
-        text(str): model response in string format
-        tools(List): tools from user request
-    """
-    if "<|plugin|>" in text:  # internlm2
-        text, action = text.split("<|action_start|><|plugin|>")
-        action = action.split("<|action_end|>".strip())[0]
-        action = action[action.find("{") :]
-        action = json.loads(action)
-        name, parameters = action["name"], json.dumps(
-            action.get("parameters", action.get("arguments", {})), ensure_ascii=False
-        )
-        call_info_list = [(name, parameters)]
-    elif "<function=" in text:  # llama3.1
-        action, _ = text.split("</function>")
-        parameters = action[action.find("{") :]
-        name = action.split("<function=")[1].split(">{")[0]
-        call_info_list = [(name, parameters)]
-    elif "<tool_call>" in text and "</tool_call>" in text:  # qwen2.5
-        # get tool_call in text
-        pattern = r"<tool_call>(.*?)</tool_call>"
-        match_result_list = re.findall(pattern, text, re.DOTALL)
-        call_info_list = []
-        for match_result in match_result_list:
-            action = json.loads(match_result)
-            call_info_list.append(
-                (action["name"], json.dumps(action["arguments"], ensure_ascii=False))
-            )
-        # get text outside of tags
-        if not text.startswith("<tool_call>"):
-            text = text[: text.find("<tool_call>")]
-        elif not text.endswith("</tool_call>"):
-            text = text[text.rfind("</tool_call>") + len("</tool_call>") :]
-        else:
-            text = ""
-    elif "<|python_tag|>" in text:  # llama3.2
-        _, action = text.split("<|python_tag|>")
-        action = json.loads(action)
-        name, parameters = action["name"], json.dumps(
-            action.get("parameters", action.get("arguments", {})), ensure_ascii=False
-        )
-        call_info_list = [(name, parameters)]
-    else:
-        raise RuntimeError(f"Unexpected model response: {text}")
-
-    call_info_list = [
-        (
-            [tool.function.name for tool in tools].index(call_info[0]),
-            call_info[0],
-            call_info[1],
-        )
-        for call_info in call_info_list
-    ]
-    return text, call_info_list
-
-
 def permute_weight(x: torch.Tensor) -> torch.Tensor:
     b_ = x.shape[0]
     n_ = x.shape[1]
@@ -1366,7 +1305,33 @@ def nullable_str(val: str):
     return val
 
 
+def pyspy_dump_schedulers():
+    """py-spy dump on all scheduler in a local node."""
+    try:
+        pid = psutil.Process().pid
+        # Command to run py-spy with the PID
+        cmd = f"py-spy dump --pid {pid}"
+        result = subprocess.run(
+            cmd, shell=True, capture_output=True, text=True, check=True
+        )
+        logger.info(f"Profile for PID {pid}:\n{result.stdout}")
+    except subprocess.CalledProcessError as e:
+        logger.info(f"Failed to profile PID {pid}. Error: {e.stderr}")
+
+
+def kill_itself_when_parent_died():
+    if sys.platform == "linux":
+        # sigkill this process when parent worker manager dies
+        PR_SET_PDEATHSIG = 1
+        libc = ctypes.CDLL("libc.so.6")
+        libc.prctl(PR_SET_PDEATHSIG, signal.SIGKILL)
+    else:
+        logger.warninig("kill_itself_when_parent_died is only supported in linux.")
+
+
 def set_uvicorn_logging_configs():
+    from uvicorn.config import LOGGING_CONFIG
+
     LOGGING_CONFIG["formatters"]["default"][
         "fmt"
     ] = "[%(asctime)s] %(levelprefix)s %(message)s"
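Two of the helpers added above are process-management utilities: pyspy_dump_schedulers() shells out to `py-spy dump` for the current process, and kill_itself_when_parent_died() uses the Linux PR_SET_PDEATHSIG prctl so a child process is SIGKILLed when its parent exits. A minimal sketch of how a worker subprocess might wire them up (the entry-point name and the SIGUSR1 binding are assumptions for illustration, not taken from this diff):

```python
import signal
import time

from sglang.srt.utils import kill_itself_when_parent_died, pyspy_dump_schedulers


def scheduler_entry_point():
    # Hypothetical worker entry point. Ensure this process dies with its parent
    # (Linux only; elsewhere the helper just logs a warning).
    kill_itself_when_parent_died()

    # On SIGUSR1, dump this process's Python stacks via `py-spy dump --pid <pid>`,
    # which helps debug a hung scheduler without killing it.
    signal.signal(signal.SIGUSR1, lambda signum, frame: pyspy_dump_schedulers())

    while True:  # stand-in for the real scheduling loop
        time.sleep(1)
```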
@@ -1442,3 +1407,35 @@ def is_valid_ipv6_address(address: str) -> bool:
         return True
     except ValueError:
         return False
+
+
+def rank0_print(msg: str):
+    from sglang.srt.distributed import get_tensor_model_parallel_rank
+
+    if get_tensor_model_parallel_rank() == 0:
+        print(msg, flush=True)
+
+
+def launch_dummy_health_check_server(host, port):
+    import uvicorn
+    from fastapi import FastAPI, Response
+
+    app = FastAPI()
+
+    @app.get("/health")
+    async def health():
+        """Check the health of the http server."""
+        return Response(status_code=200)
+
+    @app.get("/health_generate")
+    async def health_generate():
+        """Check the health of the http server."""
+        return Response(status_code=200)
+
+    uvicorn.run(
+        app,
+        host=host,
+        port=port,
+        timeout_keep_alive=5,
+        loop="uvloop",
+    )
sglang/test/test_programs.py
CHANGED
@@ -535,6 +535,7 @@ def test_hellaswag_select():
 
     # Compute accuracy
     accuracy_gen = np.mean(np.array(preds_gen) == np.array(labels))
+    print(f"{accuracy=}, {accuracy_gen=}")
     assert np.abs(accuracy_gen - accuracy) < 0.05
     assert np.abs(latency_gen - latency) < 1
 
sglang/test/test_utils.py
CHANGED
@@ -34,7 +34,7 @@ DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST = "Qwen/Qwen1.5-MoE-A2.7B"
 DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
 DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
 DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
-DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH =
+DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 1000
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
@@ -42,6 +42,9 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-In
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
 
+DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
+DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmzheng/sglang-EAGLE-llama2-chat-7B"
+
 
 def is_in_ci():
     """Return whether it is in CI runner."""
@@ -132,10 +135,6 @@ def call_generate_srt_raw(prompt, temperature, max_tokens, stop=None, url=None):
     return pred
 
 
-def call_generate_gserver(prompt, temperature, max_tokens, stop=None, url=None):
-    raise NotImplementedError()
-
-
 def call_generate_guidance(
     prompt, temperature, max_tokens, stop=None, n=1, regex=None, model=None
 ):
@@ -527,6 +526,48 @@ def get_similarities(vec1, vec2):
     return F.cosine_similarity(torch.tensor(vec1), torch.tensor(vec2), dim=0)
 
 
+def get_benchmark_args(
+    base_url="",
+    dataset_name="",
+    dataset_path="",
+    tokenizer="",
+    num_prompts=500,
+    random_input_len=4096,
+    random_output_len=2048,
+    request_rate=float("inf"),
+    disable_stream=False,
+    disable_ignore_eos=False,
+):
+    return SimpleNamespace(
+        backend="sglang",
+        base_url=base_url,
+        host=None,
+        port=None,
+        dataset_name=dataset_name,
+        dataset_path=dataset_path,
+        model=None,
+        tokenizer=tokenizer,
+        num_prompts=num_prompts,
+        sharegpt_output_len=None,
+        sharegpt_context_len=None,
+        random_input_len=random_input_len,
+        random_output_len=random_output_len,
+        random_range_ratio=0.0,
+        request_rate=request_rate,
+        multi=None,
+        output_file=None,
+        disable_tqdm=False,
+        disable_stream=disable_stream,
+        return_logprob=False,
+        seed=0,
+        disable_ignore_eos=disable_ignore_eos,
+        extra_request_body=None,
+        apply_chat_template=False,
+        profile=None,
+        lora_name=None,
+    )
+
+
 def run_bench_serving(
     model,
     num_prompts,
@@ -538,6 +579,7 @@ def run_bench_serving(
     random_input_len=4096,
     random_output_len=2048,
     disable_stream=False,
+    disable_ignore_eos=False,
     need_warmup=False,
 ):
     # Launch the server
@@ -550,32 +592,17 @@
     )
 
     # Run benchmark
-    args =
-        backend="sglang",
+    args = get_benchmark_args(
         base_url=base_url,
-        host=None,
-        port=None,
         dataset_name=dataset_name,
         dataset_path=dataset_path,
-        model=None,
         tokenizer=tokenizer,
         num_prompts=num_prompts,
-        sharegpt_output_len=None,
-        sharegpt_context_len=None,
         random_input_len=random_input_len,
         random_output_len=random_output_len,
-        random_range_ratio=0.0,
         request_rate=request_rate,
-        multi=None,
-        seed=0,
-        output_file=None,
-        disable_tqdm=False,
         disable_stream=disable_stream,
-        disable_ignore_eos=
-        return_logprob=False,
-        lora_name=None,
-        extra_request_body=None,
-        profile=None,
+        disable_ignore_eos=disable_ignore_eos,
     )
 
     try:
@@ -591,6 +618,38 @@
     return res
 
 
+def run_bench_serving_multi(
+    model,
+    base_url,
+    other_server_args,
+    benchmark_args,
+    need_warmup=False,
+):
+    # Launch the server
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_server_args,
+    )
+
+    # run benchmark for all
+    res_l = []
+    try:
+        for args in benchmark_args:
+            if need_warmup:
+                warmup_args = copy.deepcopy(args)
+                warmup_args.num_prompts = 16
+                run_benchmark(warmup_args)
+
+            res = run_benchmark(args)
+            res_l.append((args, res))
+    finally:
+        kill_process_tree(process.pid)
+
+    return res_l
+
+
 def run_bench_one_batch(model, other_args):
     command = [
         "python3",
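get_benchmark_args() and run_bench_serving_multi() together let a test launch one server and sweep several benchmark configurations against it, returning (args, result) pairs. A sketch of that usage, with an illustrative URL, dataset, and request rates (assumptions, not taken from this diff):

```python
from sglang.test.test_utils import (
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN,
    get_benchmark_args,
    run_bench_serving_multi,
)

base_url = "http://127.0.0.1:30000"

# Several benchmark configurations that differ only in request rate.
benchmark_args = [
    get_benchmark_args(
        base_url=base_url,
        dataset_name="random",
        num_prompts=200,
        request_rate=rate,
    )
    for rate in (4, 8, float("inf"))
]

# Launch one server, run every configuration against it, and collect (args, result) pairs.
results = run_bench_serving_multi(
    model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN,
    base_url=base_url,
    other_server_args=[],
    benchmark_args=benchmark_args,
    need_warmup=True,
)
```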
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.1.post7"
+__version__ = "0.4.2"
{sglang-0.4.1.post7.dist-info → sglang-0.4.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: sglang
-Version: 0.4.1.post7
+Version: 0.4.2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -240,7 +240,7 @@ Requires-Dist: xgrammar>=0.1.10; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist: sgl-kernel>=0.0.
+Requires-Dist: sgl-kernel>=0.0.3; extra == "srt"
 Requires-Dist: torch; extra == "srt"
 Requires-Dist: vllm==0.6.4.post1; extra == "srt"
 Requires-Dist: flashinfer==0.1.6; extra == "srt"
@@ -333,16 +333,16 @@ Requires-Dist: sglang[test]; extra == "dev-cpu"
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
-- [
-- [2024/
-- [2024/09]
-- [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
+- [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeekSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html))
+- [2024/12] 🔥 v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
+- [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
+- [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
 
 <details>
 <summary>More</summary>
 
+- [2024/10] The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
-- [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
 