sglang 0.3.3.post1__py3-none-any.whl → 0.3.4.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. sglang/bench_latency.py +30 -11
  2. sglang/bench_server_latency.py +21 -10
  3. sglang/bench_serving.py +101 -7
  4. sglang/global_config.py +0 -1
  5. sglang/lang/chat_template.py +17 -0
  6. sglang/launch_server_llavavid.py +1 -1
  7. sglang/srt/configs/__init__.py +3 -0
  8. sglang/srt/configs/model_config.py +2 -0
  9. sglang/srt/configs/qwen2vl.py +133 -0
  10. sglang/srt/conversation.py +27 -0
  11. sglang/srt/hf_transformers_utils.py +2 -1
  12. sglang/srt/layers/attention/__init__.py +38 -5
  13. sglang/srt/layers/attention/double_sparsity_backend.py +297 -0
  14. sglang/srt/layers/attention/flashinfer_backend.py +486 -97
  15. sglang/srt/layers/attention/triton_backend.py +26 -8
  16. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +772 -0
  17. sglang/srt/layers/attention/triton_ops/extend_attention.py +5 -3
  18. sglang/srt/layers/attention/triton_ops/prefill_attention.py +30 -6
  19. sglang/srt/layers/linear.py +89 -63
  20. sglang/srt/layers/rotary_embedding.py +145 -0
  21. sglang/srt/layers/sampler.py +6 -2
  22. sglang/srt/lora/lora.py +3 -1
  23. sglang/srt/managers/detokenizer_manager.py +31 -10
  24. sglang/srt/managers/image_processor.py +186 -13
  25. sglang/srt/managers/io_struct.py +4 -0
  26. sglang/srt/managers/schedule_batch.py +319 -82
  27. sglang/srt/managers/schedule_policy.py +2 -1
  28. sglang/srt/managers/scheduler.py +233 -158
  29. sglang/srt/managers/tokenizer_manager.py +15 -5
  30. sglang/srt/managers/tp_worker.py +30 -5
  31. sglang/srt/managers/tp_worker_overlap_thread.py +172 -0
  32. sglang/srt/mem_cache/chunk_cache.py +8 -4
  33. sglang/srt/mem_cache/memory_pool.py +123 -11
  34. sglang/srt/mem_cache/radix_cache.py +19 -10
  35. sglang/srt/model_executor/cuda_graph_runner.py +63 -12
  36. sglang/srt/model_executor/forward_batch_info.py +101 -23
  37. sglang/srt/model_executor/model_runner.py +92 -12
  38. sglang/srt/models/baichuan.py +2 -3
  39. sglang/srt/models/chatglm.py +8 -9
  40. sglang/srt/models/commandr.py +1 -2
  41. sglang/srt/models/dbrx.py +1 -2
  42. sglang/srt/models/deepseek.py +4 -5
  43. sglang/srt/models/deepseek_v2.py +7 -8
  44. sglang/srt/models/exaone.py +1 -2
  45. sglang/srt/models/gemma.py +2 -2
  46. sglang/srt/models/gemma2.py +5 -5
  47. sglang/srt/models/gpt_bigcode.py +5 -5
  48. sglang/srt/models/grok.py +1 -2
  49. sglang/srt/models/internlm2.py +1 -2
  50. sglang/srt/models/llama.py +1 -2
  51. sglang/srt/models/llama_classification.py +1 -2
  52. sglang/srt/models/llama_reward.py +2 -3
  53. sglang/srt/models/llava.py +4 -8
  54. sglang/srt/models/llavavid.py +1 -2
  55. sglang/srt/models/minicpm.py +1 -2
  56. sglang/srt/models/minicpm3.py +5 -6
  57. sglang/srt/models/mixtral.py +1 -2
  58. sglang/srt/models/mixtral_quant.py +1 -2
  59. sglang/srt/models/mllama.py +1004 -0
  60. sglang/srt/models/olmo.py +352 -0
  61. sglang/srt/models/olmoe.py +1 -2
  62. sglang/srt/models/qwen.py +1 -2
  63. sglang/srt/models/qwen2.py +1 -2
  64. sglang/srt/models/qwen2_moe.py +4 -5
  65. sglang/srt/models/qwen2_vl.py +724 -0
  66. sglang/srt/models/stablelm.py +1 -2
  67. sglang/srt/models/torch_native_llama.py +1 -2
  68. sglang/srt/models/xverse.py +1 -2
  69. sglang/srt/models/xverse_moe.py +4 -5
  70. sglang/srt/models/yivl.py +1 -2
  71. sglang/srt/openai_api/adapter.py +92 -49
  72. sglang/srt/openai_api/protocol.py +10 -2
  73. sglang/srt/sampling/penaltylib/orchestrator.py +28 -9
  74. sglang/srt/sampling/sampling_batch_info.py +103 -59
  75. sglang/srt/sampling/sampling_params.py +2 -0
  76. sglang/srt/server.py +116 -17
  77. sglang/srt/server_args.py +131 -45
  78. sglang/srt/utils.py +33 -3
  79. sglang/test/few_shot_gsm8k.py +4 -1
  80. sglang/test/few_shot_gsm8k_engine.py +144 -0
  81. sglang/test/runners.py +20 -1
  82. sglang/test/srt/sampling/penaltylib/utils.py +16 -12
  83. sglang/version.py +1 -1
  84. {sglang-0.3.3.post1.dist-info → sglang-0.3.4.post1.dist-info}/METADATA +75 -32
  85. sglang-0.3.4.post1.dist-info/RECORD +148 -0
  86. {sglang-0.3.3.post1.dist-info → sglang-0.3.4.post1.dist-info}/WHEEL +1 -1
  87. sglang/srt/layers/attention/flashinfer_utils.py +0 -237
  88. sglang-0.3.3.post1.dist-info/RECORD +0 -140
  89. {sglang-0.3.3.post1.dist-info → sglang-0.3.4.post1.dist-info}/LICENSE +0 -0
  90. {sglang-0.3.3.post1.dist-info → sglang-0.3.4.post1.dist-info}/top_level.txt +0 -0
sglang/bench_latency.py CHANGED
@@ -227,22 +227,24 @@ def extend(reqs, model_runner):
         req_to_token_pool=model_runner.req_to_token_pool,
         token_to_kv_pool=model_runner.token_to_kv_pool,
         tree_cache=None,
+        model_config=model_runner.model_config,
     )
-    batch.prepare_for_extend(model_runner.model_config.vocab_size)
+    batch.prepare_for_extend()
     model_worker_batch = batch.get_model_worker_batch()
     forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
     logits_output = model_runner.forward(forward_batch)
-    next_token_ids = model_runner.sample(logits_output, forward_batch).tolist()
+    next_token_ids = model_runner.sample(logits_output, forward_batch)
     return next_token_ids, logits_output.next_token_logits, batch


 @torch.inference_mode()
 def decode(input_token_ids, batch, model_runner):
-    batch.prepare_for_decode(input_token_ids)
+    batch.output_ids = input_token_ids
+    batch.prepare_for_decode()
     model_worker_batch = batch.get_model_worker_batch()
     forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
     logits_output = model_runner.forward(forward_batch)
-    next_token_ids = model_runner.sample(logits_output, forward_batch).tolist()
+    next_token_ids = model_runner.sample(logits_output, forward_batch)
     return next_token_ids, logits_output.next_token_logits


@@ -252,6 +254,7 @@ def correctness_test(
     bench_args,
     tp_rank,
 ):
+    configure_logger(server_args, prefix=f" TP{tp_rank}")
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None

     # Load the model
@@ -279,8 +282,9 @@ def correctness_test(
     output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
     for _ in range(bench_args.output_len[0] - 1):
         next_token_ids, _ = decode(next_token_ids, batch, model_runner)
+        next_token_ids_list = next_token_ids.tolist()
         for i in range(len(reqs)):
-            output_ids[i].append(next_token_ids[i])
+            output_ids[i].append(next_token_ids_list[i])

     # Print
     for i in range(len(reqs)):
@@ -288,8 +292,15 @@ def correctness_test(
         rank_print(tokenizer.decode(output_ids[i]), "\n")


+def synchronize(device):
+    if device == "cuda":
+        torch.cuda.synchronize()
+    elif device == "xpu":
+        torch.xpu.synchronize()
+
+
 def latency_test_run_once(
-    run_name, model_runner, rank_print, reqs, batch_size, input_len, output_len
+    run_name, model_runner, rank_print, reqs, batch_size, input_len, output_len, device
 ):
     max_batch_size = model_runner.max_total_num_tokens // (input_len + output_len)
     if batch_size > max_batch_size:
@@ -312,10 +323,10 @@ def latency_test_run_once(
     tot_latency = 0

     # Prefill
-    torch.cuda.synchronize()
+    synchronize(device)
     tic = time.time()
     next_token_ids, _, batch = extend(reqs, model_runner)
-    torch.cuda.synchronize()
+    synchronize(device)
     prefill_latency = time.time() - tic
     tot_latency += prefill_latency
     throughput = input_len * batch_size / prefill_latency
@@ -328,10 +339,10 @@ def latency_test_run_once(
     # Decode
     decode_latencies = []
     for i in range(output_len - 1):
-        torch.cuda.synchronize()
+        synchronize(device)
         tic = time.time()
         next_token_ids, _ = decode(next_token_ids, batch, model_runner)
-        torch.cuda.synchronize()
+        synchronize(device)
         latency = time.time() - tic
         tot_latency += latency
         throughput = batch_size / latency
@@ -387,6 +398,7 @@ def latency_test(
         bench_args.batch_size[0],
         bench_args.input_len[0],
         8,  # shorter decoding to speed up the warmup
+        server_args.device,
     )
     rank_print("Benchmark ...")

@@ -397,7 +409,14 @@ def latency_test(
     ):
         reqs = prepare_synthetic_inputs_for_latency_test(bs, il)
         ret = latency_test_run_once(
-            bench_args.run_name, model_runner, rank_print, reqs, bs, il, ol
+            bench_args.run_name,
+            model_runner,
+            rank_print,
+            reqs,
+            bs,
+            il,
+            ol,
+            server_args.device,
         )
         if ret is not None:
             result_list.append(ret)
sglang/bench_server_latency.py CHANGED
@@ -6,6 +6,8 @@ It accepts arguments similar to those of launch_server.py.
 Usage:

 python3 -m sglang.bench_server_latency --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
+
+python3 -m sglang.bench_server_latency --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
 """

 import argparse
@@ -32,6 +34,8 @@ class BenchArgs:
     input_len: Tuple[int] = (1024,)
     output_len: Tuple[int] = (16,)
     result_filename: str = "result.jsonl"
+    base_url: str = ""
+    skip_warmup: bool = False

     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -48,6 +52,8 @@ class BenchArgs:
         parser.add_argument(
             "--result-filename", type=str, default=BenchArgs.result_filename
         )
+        parser.add_argument("--base-url", type=str, default=BenchArgs.base_url)
+        parser.add_argument("--skip-warmup", action="store_true")

     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -139,17 +145,21 @@ def run_one_case(


 def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
-    proc, base_url = launch_server_process(server_args)
+    if bench_args.base_url:
+        proc, base_url = None, bench_args.base_url
+    else:
+        proc, base_url = launch_server_process(server_args)

     # warmup
-    run_one_case(
-        base_url,
-        batch_size=16,
-        input_len=1024,
-        output_len=16,
-        run_name="",
-        result_filename="",
-    )
+    if not bench_args.skip_warmup:
+        run_one_case(
+            base_url,
+            batch_size=16,
+            input_len=1024,
+            output_len=16,
+            run_name="",
+            result_filename="",
+        )

     # benchmark
     try:
@@ -165,7 +175,8 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
             bench_args.result_filename,
         )
     finally:
-        kill_child_process(proc.pid)
+        if proc:
+            kill_child_process(proc.pid)

     print(f"\nResults are saved to {bench_args.result_filename}")

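Note: with the two new BenchArgs fields above, bench_server_latency can target an already-running server instead of launching one. The sketch below is a hypothetical driver, not part of the package: the BenchArgs field names mirror the CLI flags, and ServerArgs(model_path=...) is an assumed signature; when base_url is set, run_benchmark does not launch a local server.

# Hypothetical driver script (assumptions noted above); run_benchmark and BenchArgs
# come from the module shown in this diff.
from sglang.bench_server_latency import BenchArgs, run_benchmark
from sglang.srt.server_args import ServerArgs  # assumed import path for ServerArgs

server_args = ServerArgs(model_path="meta-llama/Meta-Llama-3.1-8B")  # assumed constructor
bench_args = BenchArgs(
    batch_size=(16,),                   # field name assumed to mirror --batch-size
    input_len=(1024,),
    output_len=(8,),
    base_url="http://localhost:30000",  # reuse an existing server; no local launch
    skip_warmup=True,                   # new flag: skip the fixed 16x1024 warmup case
)
run_benchmark(server_args, bench_args)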
sglang/bench_serving.py CHANGED
@@ -222,6 +222,85 @@ async def async_request_openai_completions(
     return output


+async def async_request_sglang_generate(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    prompt = request_func_input.prompt
+
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        payload = {
+            "text": prompt,
+            "sampling_params": {
+                "temperature": 0.0,
+                "max_new_tokens": request_func_input.output_len,
+                "ignore_eos": not args.disable_ignore_eos,
+            },
+            "stream": not args.disable_stream,
+            **request_func_input.extra_request_body,
+        }
+        headers = {}
+
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        generated_text = ""
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(
+                url=api_url, json=payload, headers=headers
+            ) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+                        # print(chunk_bytes)
+
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+                        latency = time.perf_counter() - st
+                        if chunk == "[DONE]":
+                            pass
+                        else:
+                            data = json.loads(chunk)
+
+                            # NOTE: Some completion API might have a last
+                            # usage summary response without a token so we
+                            # want to check a token was generated
+                            if data["text"]:
+                                timestamp = time.perf_counter()
+                                # First token
+                                if ttft == 0.0:
+                                    ttft = time.perf_counter() - st
+                                    output.ttft = ttft
+
+                                # Decoding phase
+                                else:
+                                    output.itl.append(timestamp - most_recent_timestamp)
+
+                                most_recent_timestamp = timestamp
+                                generated_text = data["text"]
+
+                    output.generated_text = generated_text
+                    output.success = True
+                    output.latency = latency
+                    output.output_len = request_func_input.output_len
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
 async def async_request_gserver(
     request_func_input: RequestFuncInput,
     pbar: Optional[tqdm] = None,
@@ -264,7 +343,9 @@ def get_tokenizer(


 ASYNC_REQUEST_FUNCS = {
-    "sglang": async_request_openai_completions,
+    "sglang": async_request_sglang_generate,
+    "sglang-native": async_request_sglang_generate,
+    "sglang-oai": async_request_openai_completions,
     "vllm": async_request_openai_completions,
     "lmdeploy": async_request_openai_completions,
     "trt": async_request_trt_llm,
@@ -387,6 +468,8 @@ def sample_sharegpt_requests(
             continue
         filtered_dataset.append((prompt, prompt_len, output_len))

+    print(f"#Input tokens: {np.sum([x[1] for x in filtered_dataset])}")
+    print(f"#Output tokens: {np.sum([x[2] for x in filtered_dataset])}")
    return filtered_dataset


@@ -587,6 +670,8 @@ async def benchmark(
     else:
         print("Initial test run completed. Starting main benchmark run...")

+    time.sleep(1.5)
+
     pbar = None if disable_tqdm else tqdm(total=len(input_requests))

     benchmark_start_time = time.perf_counter()
@@ -782,24 +867,33 @@ def run_benchmark(args_: argparse.Namespace):
     if args.port is None:
         args.port = {
             "sglang": 30000,
+            "sglang-native": 30000,
+            "sglang-oai": 30000,
             "lmdeploy": 23333,
             "vllm": 8000,
             "trt": 8000,
             "gserver": 9988,
         }.get(args.backend, 30000)

-    api_url = (
-        f"{args.base_url}/v1/completions"
-        if args.base_url
-        else f"http://{args.host}:{args.port}/v1/completions"
-    )
     model_url = (
         f"{args.base_url}/v1/models"
         if args.base_url
         else f"http://{args.host}:{args.port}/v1/models"
     )

-    if args.backend == "trt":
+    if args.backend in ["sglang", "sglang-native"]:
+        api_url = (
+            f"{args.base_url}/generate"
+            if args.base_url
+            else f"http://{args.host}:{args.port}/generate"
+        )
+    elif args.backend in ["sglang-oai", "vllm", "lmdeploy"]:
+        api_url = (
+            f"{args.base_url}/v1/completions"
+            if args.base_url
+            else f"http://{args.host}:{args.port}/v1/completions"
+        )
+    elif args.backend == "trt":
         api_url = (
             f"{args.base_url}/v2/models/ensemble/generate_stream"
             if args.base_url
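Note: the new sglang / sglang-native backends post plain JSON to the server's /generate endpoint instead of the OpenAI-compatible /v1/completions route. Below is a minimal, non-streaming sketch of that request shape, assuming a server is already listening at http://localhost:30000; it mirrors the payload built in async_request_sglang_generate above but is not part of the package.

# Minimal sketch: one non-streaming /generate request with the same payload shape
# as async_request_sglang_generate. Assumes a running sglang server at localhost:30000.
import requests

payload = {
    "text": "The capital of France is",
    "sampling_params": {
        "temperature": 0.0,
        "max_new_tokens": 32,
        "ignore_eos": True,
    },
    "stream": False,  # the benchmark streams by default; disabled here for brevity
}
resp = requests.post("http://localhost:30000/generate", json=payload, timeout=60)
resp.raise_for_status()
print(resp.json()["text"])  # generated text is returned under "text", as the streaming handler expects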
sglang/global_config.py CHANGED
@@ -19,7 +19,6 @@ class GlobalConfig:
         self.new_token_ratio_decay = 0.001

         # Runtime constants: others
-        self.num_continue_decode_steps = 10
         self.retract_decode_steps = 20
         self.flashinfer_workspace_size = os.environ.get(
             "FLASHINFER_WORKSPACE_SIZE", 384 * 1024 * 1024
sglang/lang/chat_template.py CHANGED
@@ -133,6 +133,22 @@ register_chat_template(
     )
 )

+# Reference: https://huggingface.co/docs/transformers/main/model_doc/qwen2_vl#usage-example
+register_chat_template(
+    ChatTemplate(
+        name="qwen2-vl",
+        default_system_prompt="You are a helpful assistant.",
+        role_prefix_and_suffix={
+            "system": ("<|im_start|>system\n", "<|im_end|>\n"),
+            "user": ("<|im_start|>user\n", "<|im_end|>\n"),
+            "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
+        },
+        style=ChatTemplateStyle.PLAIN,
+        stop_str=("<|im_end|>"),
+        image_token="<|vision_start|><|image_pad|><|vision_end|>",
+    )
+)
+

 register_chat_template(
     ChatTemplate(
@@ -213,6 +229,7 @@ register_chat_template(
             ),
         },
         stop_str=("<|eot_id|>",),
+        image_token="<|image|>",
     )
 )

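Note: the snippet below illustrates how the role prefixes and suffixes of the new qwen2-vl template compose into a prompt string. It is a standalone illustration that restates the template literals above rather than calling sglang's template machinery, and placing the image token at the start of the user turn is an assumption made for the example.

# Illustration only: compose a qwen2-vl style prompt from the registered
# role prefixes/suffixes shown in the diff above.
ROLE_PREFIX_AND_SUFFIX = {
    "system": ("<|im_start|>system\n", "<|im_end|>\n"),
    "user": ("<|im_start|>user\n", "<|im_end|>\n"),
    "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
}
IMAGE_TOKEN = "<|vision_start|><|image_pad|><|vision_end|>"


def render_prompt(system: str, user_text: str, with_image: bool = False) -> str:
    sys_p, sys_s = ROLE_PREFIX_AND_SUFFIX["system"]
    usr_p, usr_s = ROLE_PREFIX_AND_SUFFIX["user"]
    asst_p, _ = ROLE_PREFIX_AND_SUFFIX["assistant"]
    user_body = (IMAGE_TOKEN + user_text) if with_image else user_text
    # End with the assistant prefix so the model continues as the assistant.
    return sys_p + system + sys_s + usr_p + user_body + usr_s + asst_p


print(render_prompt("You are a helpful assistant.", "Describe this image.", with_image=True))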
sglang/launch_server_llavavid.py CHANGED
@@ -14,7 +14,7 @@ if __name__ == "__main__":
     model_override_args["num_frames"] = 16
     model_override_args["model_type"] = "llavavid"
     if model_override_args["num_frames"] == 32:
-        model_override_args["rope_scaling"] = {"factor": 2.0, "type": "linear"}
+        model_override_args["rope_scaling"] = {"factor": 2.0, "rope_type": "linear"}
         model_override_args["max_sequence_length"] = 4096 * 2
         model_override_args["tokenizer_model_max_length"] = 4096 * 2
         model_override_args["model_max_length"] = 4096 * 2
sglang/srt/configs/__init__.py CHANGED
@@ -1,5 +1,8 @@
 from sglang.srt.configs.exaone import ExaoneConfig
+from sglang.srt.configs.qwen2vl import Qwen2VLConfig, Qwen2VLVisionConfig

 __all__ = [
     "ExaoneConfig",
+    "Qwen2VLConfig",
+    "Qwen2VLVisionConfig",
 ]
sglang/srt/configs/model_config.py CHANGED
@@ -89,6 +89,8 @@ class ModelConfig:
         self.num_hidden_layers = self.hf_text_config.num_hidden_layers
         self.vocab_size = self.hf_text_config.vocab_size

+        self.is_encoder_decoder = self.hf_config.model_type in ["mllama"]
+
     # adapted from https://github.com/vllm-project/vllm/blob/main/vllm/config.py#L289
     def get_total_num_kv_heads(self) -> int:
         """Returns the total number of KV heads."""
sglang/srt/configs/qwen2vl.py ADDED
@@ -0,0 +1,133 @@
+# coding=utf-8
+# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Qwen2VL model configuration"""
+
+import os
+from typing import Union
+
+from transformers import PretrainedConfig
+
+
+class Qwen2VLVisionConfig(PretrainedConfig):
+    model_type = "qwen2_vl"
+
+    def __init__(
+        self,
+        depth=32,
+        embed_dim=1280,
+        hidden_size=3584,
+        hidden_act="quick_gelu",
+        mlp_ratio=4,
+        num_heads=16,
+        in_channels=3,
+        patch_size=14,
+        spatial_merge_size=2,
+        temporal_patch_size=2,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.depth = depth
+        self.embed_dim = embed_dim
+        self.hidden_size = hidden_size
+        self.hidden_act = hidden_act
+        self.mlp_ratio = mlp_ratio
+        self.num_heads = num_heads
+        self.in_channels = in_channels
+        self.patch_size = patch_size
+        self.spatial_merge_size = spatial_merge_size
+        self.temporal_patch_size = temporal_patch_size
+
+    @classmethod
+    def from_pretrained(
+        cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
+    ) -> "PretrainedConfig":
+        cls._set_token_in_kwargs(kwargs)
+
+        config_dict, kwargs = cls.get_config_dict(
+            pretrained_model_name_or_path, **kwargs
+        )
+
+        if config_dict.get("model_type") == "qwen2_vl":
+            config_dict = config_dict["vision_config"]
+
+        return cls.from_dict(config_dict, **kwargs)
+
+
+class Qwen2VLConfig(PretrainedConfig):
+    model_type = "qwen2_vl"
+
+    def __init__(
+        self,
+        vocab_size=152064,
+        hidden_size=8192,
+        intermediate_size=29568,
+        num_hidden_layers=80,
+        num_attention_heads=64,
+        num_key_value_heads=8,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        initializer_range=0.02,
+        rms_norm_eps=1e-05,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=1000000.0,
+        use_sliding_window=False,
+        sliding_window=4096,
+        max_window_layers=80,
+        attention_dropout=0.0,
+        vision_config=None,
+        rope_scaling=None,
+        **kwargs,
+    ):
+        if isinstance(vision_config, dict):
+            self.vision_config = Qwen2VLVisionConfig(**vision_config)
+        elif vision_config is None:
+            self.vision_config = Qwen2VLVisionConfig()
+
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.use_sliding_window = use_sliding_window
+        self.sliding_window = sliding_window
+        self.max_window_layers = max_window_layers
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_dropout = attention_dropout
+        self.rope_scaling = rope_scaling
+
+        # NOTE: the following section from original transformers config
+        # for Qwen2-VL is commented out to address rope config loading issue
+        #
+        # if self.rope_scaling is not None and "type" in self.rope_scaling:
+        #     if self.rope_scaling["type"] == "mrope":
+        #         self.rope_scaling["type"] = "default"
+        #     self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        #     rope_config_validation(self)
+
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
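Note: since the whole qwen2vl.py module is new, the constructors above are the complete surface it adds. The sketch below only assumes that sglang 0.3.4.post1 and transformers are installed; the rope_scaling value is illustrative.

# Sketch: instantiate the new config classes with a nested vision_config dict,
# exactly as the constructors above accept them.
from sglang.srt.configs import Qwen2VLConfig, Qwen2VLVisionConfig

cfg = Qwen2VLConfig(
    vision_config={"depth": 32, "embed_dim": 1280, "num_heads": 16},
    rope_scaling={"type": "mrope"},  # stored as-is; see the commented-out block above
)
print(type(cfg.vision_config).__name__)   # Qwen2VLVisionConfig
print(cfg.rope_scaling)                   # {'type': 'mrope'}
print(Qwen2VLVisionConfig().patch_size)   # 14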
sglang/srt/conversation.py CHANGED
@@ -509,6 +509,19 @@ register_conv_template(
     )
 )

+register_conv_template(
+    Conversation(
+        name="llama_3_vision",
+        system_message="You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.",
+        system_template="<|start_header_id|>system<|end_header_id|>\n\n{system_message}<|eot_id|>",
+        roles=("user", "assistant"),
+        sep_style=SeparatorStyle.LLAMA3,
+        sep="",
+        stop_str=["<|end_of_text|>", "<|eot_id|>"],
+        image_token="<|image|>",
+    )
+)
+
 register_conv_template(
     Conversation(
         name="llava_llama_3",
@@ -530,3 +543,17 @@ register_conv_template(
         stop_str=["<|im_end|>", "<|action_end|>"],
     )
 )
+
+# Reference: https://huggingface.co/docs/transformers/main/model_doc/qwen2_vl#usage-example
+register_conv_template(
+    Conversation(
+        name="qwen2-vl",
+        system_message="You are a helpful assistant.",
+        system_template="<|im_start|>system\n{system_message}",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="<|im_end|>\n",
+        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
+        stop_str=["<|im_end|>"],
+        image_token="<|vision_start|><|image_pad|><|vision_end|>",
+    )
+)
sglang/srt/hf_transformers_utils.py CHANGED
@@ -33,12 +33,13 @@ from transformers import (
 try:
     from vllm.transformers_utils.configs import ChatGLMConfig, DbrxConfig

-    from sglang.srt.configs import ExaoneConfig
+    from sglang.srt.configs import ExaoneConfig, Qwen2VLConfig

     _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
         ChatGLMConfig.model_type: ChatGLMConfig,
         DbrxConfig.model_type: DbrxConfig,
         ExaoneConfig.model_type: ExaoneConfig,
+        Qwen2VLConfig.model_type: Qwen2VLConfig,
     }
 except ImportError:
     # We want this file to run without vllm dependency
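Note: the registry above lets sglang substitute its own config classes for matching HuggingFace model_type values before falling back to AutoConfig. The helper below is hypothetical (not an sglang API) and only illustrates that lookup pattern, using the same registry entries shown in the diff.

# Illustration of a model_type -> config-class registry lookup; resolve_config is
# a hypothetical helper, not part of sglang.
from transformers import AutoConfig, PretrainedConfig

from sglang.srt.configs import ExaoneConfig, Qwen2VLConfig

_CONFIG_REGISTRY = {
    ExaoneConfig.model_type: ExaoneConfig,
    Qwen2VLConfig.model_type: Qwen2VLConfig,
}


def resolve_config(model_path: str) -> PretrainedConfig:
    # Peek at the raw config dict to read model_type, then prefer a registered class.
    config_dict, _ = PretrainedConfig.get_config_dict(model_path)
    cls = _CONFIG_REGISTRY.get(config_dict.get("model_type"))
    if cls is not None:
        return cls.from_pretrained(model_path)
    return AutoConfig.from_pretrained(model_path)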
sglang/srt/layers/attention/__init__.py CHANGED
@@ -1,7 +1,10 @@
 from abc import ABC, abstractmethod
+from typing import Optional

+import torch
 from torch import nn

+from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch


@@ -18,13 +21,22 @@ class AttentionBackend(ABC):
         raise NotImplementedError()

     def init_forward_metadata_capture_cuda_graph(
-        self, bs: int, req_pool_indices, seq_lens
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor] = None,
     ):
         """Init the metadata for a forward pass for capturing a cuda graph."""
         raise NotImplementedError()

     def init_forward_metadata_replay_cuda_graph(
-        self, bs: int, req_pool_indices, seq_lens
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor] = None,
     ):
         """Init the metadata for a forward pass for replying a cuda graph."""
         raise NotImplementedError()
@@ -33,17 +45,38 @@ class AttentionBackend(ABC):
         """Get the fill value for padded seq lens. Typically, it is 0 or 1."""
         raise NotImplementedError()

-    def forward(self, q, k, v, layer: nn.Module, forward_batch: ForwardBatch):
+    def forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+    ):
         """Run forward on an attention layer."""
         if forward_batch.forward_mode.is_decode():
             return self.forward_decode(q, k, v, layer, forward_batch)
         else:
             return self.forward_extend(q, k, v, layer, forward_batch)

-    def forward_decode(self, q, k, v, layer: nn.Module, forward_batch: ForwardBatch):
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+    ):
         """Run a forward for decode."""
         raise NotImplementedError()

-    def forward_extend(self, q, k, v, layer: nn.Module, forward_batch: ForwardBatch):
+    def forward_extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+    ):
         """Run a forward for extend."""
         raise NotImplementedError()
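Note: the backend interface now types layer as RadixAttention and threads encoder_lens / seq_lens_sum through the CUDA-graph hooks. The skeleton below is a minimal sketch of a custom backend against this 0.3.4 interface; the bodies are placeholders rather than a working attention kernel, and only methods whose signatures appear in this hunk are overridden.

# Minimal custom-backend skeleton against the interface shown above (placeholder bodies).
import torch

from sglang.srt.layers.attention import AttentionBackend
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.model_executor.forward_batch_info import ForwardBatch


class NoopAttnBackend(AttentionBackend):
    def get_cuda_graph_seq_len_fill_value(self):
        # Padded seq lens are filled with this value during CUDA graph replay.
        return 0

    def forward_extend(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        layer: RadixAttention,
        forward_batch: ForwardBatch,
    ):
        # A real backend would attend over the prefix plus the extended tokens here.
        return q

    def forward_decode(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        layer: RadixAttention,
        forward_batch: ForwardBatch,
    ):
        # A real backend would attend over the cached KV entries here.
        return q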