PyPI - sglang - Versions diffs - 0.4.5.post3__tar.gz → 0.4.6.post1__tar.gz - Mend

sglang 0.4.5.post3__tar.gz → 0.4.6.post1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (631) hide show

{sglang-0.4.5.post3/sglang.egg-info → sglang-0.4.6.post1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.5.post3
+Version: 0.4.6.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License:                                  Apache License
                                    Version 2.0, January 2004
@@ -225,7 +225,7 @@ Requires-Dist: fastapi; extra == "runtime-common"
 Requires-Dist: hf_transfer; extra == "runtime-common"
 Requires-Dist: huggingface_hub; extra == "runtime-common"
 Requires-Dist: interegular; extra == "runtime-common"
-Requires-Dist: llguidance>=0.6.15; extra == "runtime-common"
+Requires-Dist: llguidance<0.8.0,>=0.7.11; extra == "runtime-common"
 Requires-Dist: modelscope; extra == "runtime-common"
 Requires-Dist: ninja; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
@@ -242,11 +242,10 @@ Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
 Requires-Dist: transformers==4.51.1; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: compressed-tensors; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.0.9.post2; extra == "srt"
+Requires-Dist: sgl-kernel==0.1.0; extra == "srt"
 Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
 Requires-Dist: torch==2.6.0; extra == "srt"
 Requires-Dist: torchvision==0.21.0; extra == "srt"
@@ -409,5 +408,5 @@ It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor
 For enterprises interested in adopting or deploying SGLang at scale, including technical consulting, sponsorship opportunities, or partnership inquiries, please contact us at contact@sglang.ai.
-## Acknowledgment and Citation
-We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
+## Acknowledgment
+We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).

{sglang-0.4.5.post3 → sglang-0.4.6.post1}/README.md RENAMED Viewed

@@ -71,5 +71,5 @@ It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor
 For enterprises interested in adopting or deploying SGLang at scale, including technical consulting, sponsorship opportunities, or partnership inquiries, please contact us at contact@sglang.ai.
-## Acknowledgment and Citation
-We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
+## Acknowledgment
+We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).

{sglang-0.4.5.post3 → sglang-0.4.6.post1}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "sglang"
-version = "0.4.5.post3"
+version = "0.4.6.post1"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -24,7 +24,7 @@ runtime_common = [
     "hf_transfer",
     "huggingface_hub",
     "interegular",
-    "llguidance>=0.6.15",
+    "llguidance>=0.7.11,<0.8.0",
     "modelscope",
     "ninja",
     "orjson",
@@ -41,13 +41,12 @@ runtime_common = [
     "transformers==4.51.1",
     "uvicorn",
     "uvloop",
-    "compressed-tensors",
     "xgrammar==0.1.17",
 ]
 srt = [
     "sglang[runtime_common]",
-    "sgl-kernel==0.0.9.post2",
+    "sgl-kernel==0.1.0",
     "flashinfer_python==0.2.3",
     "torch==2.6.0",
     "torchvision==0.21.0",

{sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/bench_one_batch.py RENAMED Viewed

@@ -57,6 +57,7 @@ import torch
 import torch.distributed as dist
 from sglang.srt.configs.model_config import ModelConfig
+from sglang.srt.distributed.parallel_state import destroy_distributed_environment
 from sglang.srt.entrypoints.engine import _set_envs_and_config
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
@@ -85,6 +86,7 @@ class BenchArgs:
     correctness_test: bool = False
     # This is only used for correctness test
     cut_len: int = 4
+    log_decode_step: int = 0
     profile: bool = False
     profile_filename_prefix: str = "profile"
@@ -105,6 +107,12 @@ class BenchArgs:
         )
         parser.add_argument("--correctness-test", action="store_true")
         parser.add_argument("--cut-len", type=int, default=BenchArgs.cut_len)
+        parser.add_argument(
+            "--log-decode-step",
+            type=int,
+            default=BenchArgs.log_decode_step,
+            help="Log decode latency by step, default is set to zero to disable.",
+        )
         parser.add_argument(
             "--profile", action="store_true", help="Use Torch Profiler."
         )
@@ -335,6 +343,7 @@ def latency_test_run_once(
     input_len,
     output_len,
     device,
+    log_decode_step,
     profile,
     profile_filename_prefix,
 ):
@@ -394,9 +403,9 @@ def latency_test_run_once(
         tot_latency += latency
         throughput = batch_size / latency
         decode_latencies.append(latency)
-        if i < 5:
+        if i < 5 or (log_decode_step > 0 and i % log_decode_step == 0):
             rank_print(
-                f"Decode. Batch size: {batch_size}, latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
+                f"Decode {i}. Batch size: {batch_size}, latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
             )
     if profile:
@@ -457,8 +466,9 @@ def latency_test(
         reqs,
         bench_args.batch_size[0],
         bench_args.input_len[0],
-        8,  # shorter decoding to speed up the warmup
+        min(32, bench_args.output_len[0]),  # shorter decoding to speed up the warmup
         server_args.device,
+        log_decode_step=0,
         profile=False,
         profile_filename_prefix="",  # not used
     )
@@ -480,6 +490,7 @@ def latency_test(
             il,
             ol,
             server_args.device,
+            bench_args.log_decode_step,
             bench_args.profile if tp_rank == 0 else None,
             bench_args.profile_filename_prefix,
         )
@@ -492,8 +503,13 @@ def latency_test(
             for result in result_list:
                 fout.write(json.dumps(result) + "\n")
+    if server_args.tp_size > 1:
+        destroy_distributed_environment()
 def main(server_args, bench_args):
+    server_args.cuda_graph_max_bs = max(bench_args.batch_size)
     _set_envs_and_config(server_args)
     if server_args.model_path:

{sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/bench_serving.py RENAMED Viewed

@@ -295,7 +295,7 @@ async def async_request_truss(
                             # NOTE: Some completion API might have a last
                             # usage summary response without a token so we
                             # want to check a token was generated
-                            if data["choices"][0]["delta"]["content"]:
+                            if data["choices"][0]["text"]:
                                 timestamp = time.perf_counter()
                                 # First token
                                 if ttft == 0.0:
@@ -307,7 +307,7 @@ async def async_request_truss(
                                     output.itl.append(timestamp - most_recent_timestamp)
                                 most_recent_timestamp = timestamp
-                                generated_text += data["choices"][0]["delta"]["content"]
+                                generated_text += data["choices"][0]["text"]
                     output.generated_text = generated_text
                     output.success = True
@@ -977,6 +977,7 @@ async def benchmark(
     profile: bool,
     pd_seperated: bool = False,
     flush_cache: bool = False,
+    warmup_requests: int = 1,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -994,11 +995,11 @@ async def benchmark(
             return await request_func(request_func_input=request_func_input, pbar=pbar)
     # Warmup
-    print(f"Starting warmup with {args.warmup_requests} sequences...")
+    print(f"Starting warmup with {warmup_requests} sequences...")
     # Use the first request for all warmup iterations
     test_prompt, test_prompt_len, test_output_len = input_requests[0]
-    if lora_names != None and len(lora_names) != 0:
+    if lora_names is not None and len(lora_names) != 0:
         lora_name = lora_names[0]
     else:
         lora_name = None
@@ -1016,7 +1017,7 @@ async def benchmark(
     # Run warmup requests
     warmup_tasks = []
-    for _ in range(args.warmup_requests):
+    for _ in range(warmup_requests):
         warmup_tasks.append(
             asyncio.create_task(request_func(request_func_input=test_input))
         )
@@ -1024,9 +1025,7 @@ async def benchmark(
     warmup_outputs = await asyncio.gather(*warmup_tasks)
     # Check if at least one warmup request succeeded
-    if args.warmup_requests > 0 and not any(
-        output.success for output in warmup_outputs
-    ):
+    if warmup_requests > 0 and not any(output.success for output in warmup_outputs):
         raise ValueError(
             "Warmup failed - Please make sure benchmark arguments "
             f"are correctly specified. Error: {warmup_outputs[0].error}"
@@ -1058,7 +1057,7 @@ async def benchmark(
     tasks: List[asyncio.Task] = []
     async for request in get_request(input_requests, request_rate):
         prompt, prompt_len, output_len = request
-        if lora_names != None and len(lora_names) != 0:
+        if lora_names is not None and len(lora_names) != 0:
             idx = random.randint(0, len(lora_names) - 1)
             lora_name = lora_names[idx]
         else:

{sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/compile_deep_gemm.py RENAMED Viewed

@@ -27,7 +27,11 @@ from sglang.srt.warmup import warmup
 multiprocessing.set_start_method("spawn", force=True)
 # Reduce warning
-os.environ["SGL_IN_DEEP_GEMM_PRE_COMPILE_STAGE"] = "1"
+os.environ["SGL_IN_DEEPGEMM_PRECOMPILE_STAGE"] = "1"
+# Force enable deep gemm
+os.environ["SGL_ENABLE_JIT_DEEPGEMM"] = "1"
+# Force enable mha chunked kv for DeepSeek V3 to avoid missing kv_b_proj DeepGEMM case
+os.environ["SGL_CHUNKED_PREFIX_CACHE_THRESHOLD"] = "0"
 @dataclasses.dataclass
@@ -84,8 +88,36 @@ def launch_server_process_and_send_one_request(
             headers = {
                 "Content-Type": "application/json; charset=utf-8",
             }
-            response = requests.get(f"{base_url}/v1/models", headers=headers)
+            if server_args.node_rank == 0:
+                response = requests.get(f"{base_url}/v1/models", headers=headers)
+            else:
+                # This http api is created by launch_dummy_health_check_server for none-rank0 node.
+                response = requests.get(f"{base_url}/health", headers=headers)
             if response.status_code == 200:
+                # Rank-0 node send a request to sync with other node and then return.
+                if server_args.node_rank == 0:
+                    response = requests.post(
+                        f"{base_url}/generate",
+                        json={
+                            "input_ids": [0, 1, 2, 3],
+                            "sampling_params": {
+                                "max_new_tokens": 8,
+                                "temperature": 0,
+                            },
+                        },
+                        timeout=600,
+                    )
+                    if response.status_code != 200:
+                        error = response.json()
+                        raise RuntimeError(f"Sync request failed: {error}")
+                # Other nodes should wait for the exit signal from Rank-0 node.
+                else:
+                    start_time_waiting = time.time()
+                    while proc.is_alive():
+                        if time.time() - start_time_waiting < timeout:
+                            time.sleep(10)
+                        else:
+                            raise TimeoutError("Waiting for main node timeout!")
                 return proc
         except requests.RequestException:
             pass
@@ -118,10 +150,19 @@ def run_compile(server_args: ServerArgs, compile_args: CompileArgs):
     proc = launch_server_process_and_send_one_request(server_args, compile_args)
-    kill_process_tree(proc.pid)
     print("\nDeepGEMM Kernels compilation finished successfully.")
+    # Sleep for safety
+    time.sleep(10)
+    if proc.is_alive():
+        # This is the rank0 node.
+        kill_process_tree(proc.pid)
+    else:
+        try:
+            kill_process_tree(proc.pid)
+        except Exception:
+            pass
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()

{sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/code_completion_parser.py RENAMED Viewed

@@ -113,7 +113,7 @@ def completion_template_exists(template_name: str) -> bool:
 def is_completion_template_defined() -> bool:
     global completion_template_name
-    return completion_template_name != None
+    return completion_template_name is not None
 def generate_completion_prompt_from_request(request: ChatCompletionRequest) -> str:

{sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/configs/deepseekvl2.py RENAMED Viewed

@@ -182,7 +182,7 @@ class DeepseekVLV2Processor(ProcessorMixin):
         tokenized_str, images, seq_mask, spatial_crop = self.tokenize_with_images(
             messages,
             pil_images[image_index : image_index + image_token_cnt],
-            bos=False,
+            bos=True,
             eos=True,
             cropping=len(pil_images) <= 2,
             max_req_input_len=max_req_input_len,

{sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/configs/model_config.py RENAMED Viewed

@@ -73,10 +73,14 @@ class ModelConfig:
         )
         if enable_multimodal is None:
-            if self.hf_config.architectures[0] == "Llama4ForConditionalGeneration":
+            mm_disabled_models = [
+                "Gemma3ForConditionalGeneration",
+                "Llama4ForConditionalGeneration",
+            ]
+            if self.hf_config.architectures[0] in mm_disabled_models:
                 enable_multimodal = False
                 logger.info(
-                    "Multimodal is disabled for Llama4. To enable it, set --enable-llama4-multimodal."
+                    f"Multimodal is disabled for {self.hf_config.model_type}. To enable it, set --enable-multimodal."
                 )
             else:
                 enable_multimodal = True
@@ -158,7 +162,9 @@ class ModelConfig:
             self.attention_arch = AttentionArch.MLA
             self.kv_lora_rank = self.hf_config.kv_lora_rank
             self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
-        elif "DeepseekVL2ForCausalLM" in self.hf_config.architectures:
+        elif "DeepseekVL2ForCausalLM" in self.hf_config.architectures and getattr(
+            self.hf_text_config, "use_mla", True
+        ):
             self.head_dim = 256
             self.attention_arch = AttentionArch.MLA
             self.kv_lora_rank = self.hf_text_config.kv_lora_rank

sglang-0.4.6.post1/sglang/srt/constrained/llguidance_backend.py ADDED Viewed

@@ -0,0 +1,169 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Constrained decoding with llguidance backend."""
+import json
+import logging
+import os
+from typing import List, Optional, Tuple
+import torch
+from llguidance import LLMatcher, LLTokenizer, StructTag, grammar_from
+from llguidance.hf import from_tokenizer
+from llguidance.torch import (
+    allocate_token_bitmask,
+    apply_token_bitmask_inplace,
+    fill_next_token_bitmask,
+)
+from sglang.srt.constrained.base_grammar_backend import (
+    BaseGrammarBackend,
+    BaseGrammarObject,
+)
+logger = logging.getLogger(__name__)
+class GuidanceGrammar(BaseGrammarObject):
+    def __init__(self, llguidance_tokenizer: LLTokenizer, serialized_grammar: str):
+        super().__init__()
+        self.llguidance_tokenizer = llguidance_tokenizer
+        self.serialized_grammar = serialized_grammar
+        self.ll_matcher = LLMatcher(
+            self.llguidance_tokenizer,
+            self.serialized_grammar,
+            log_level=int(os.environ.get("LLGUIDANCE_LOG_LEVEL", "1")),
+        )
+        self.finished = False
+        self.bitmask = None
+    def try_jump_forward(self, tokenizer) -> Optional[Tuple[List[int], str]]:
+        ff_tokens = self.ll_matcher.compute_ff_tokens()
+        if ff_tokens:
+            return ff_tokens, ""
+        else:
+            return None
+    def jump_forward_str_state(self, helper: Tuple[List[int], str]) -> Tuple[str, int]:
+        return "", -1
+    def jump_and_retokenize(
+        self, old_output_ids: List[int], new_output_ids: List[int], next_state: int
+    ):
+        pass
+    def accept_token(self, token: int):
+        if not self.ll_matcher.consume_token(token):
+            logger.warning(f"matcher error: {self.ll_matcher.get_error()}")
+            self.finished = True
+    def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
+        if self.ll_matcher.is_stopped():
+            self.finished = True
+        fill_next_token_bitmask(self.ll_matcher, vocab_mask, idx)
+    def allocate_vocab_mask(
+        self, vocab_size: int, batch_size: int, device
+    ) -> torch.Tensor:
+        if self.bitmask is None or self.bitmask.shape[0] < batch_size:
+            # only create bitmask when batch gets larger
+            self.bitmask = allocate_token_bitmask(
+                batch_size, self.llguidance_tokenizer.vocab_size
+            )
+            bitmask = self.bitmask
+        else:
+            bitmask = self.bitmask[:batch_size]
+        return bitmask
+    @staticmethod
+    def move_vocab_mask(vocab_mask: torch.Tensor, device) -> torch.Tensor:
+        return vocab_mask.to(device, non_blocking=True)
+    @staticmethod
+    def apply_vocab_mask(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
+        apply_token_bitmask_inplace(logits, vocab_mask)
+    def copy(self):
+        return GuidanceGrammar(
+            llguidance_tokenizer=self.llguidance_tokenizer,
+            serialized_grammar=self.serialized_grammar,
+        )
+class GuidanceBackend(BaseGrammarBackend):
+    def __init__(
+        self,
+        tokenizer,
+        whitespace_pattern: Optional[str] = None,
+        n_vocab: Optional[int] = None,
+    ):
+        super().__init__()
+        self.tokenizer = tokenizer
+        self.whitespace_pattern = whitespace_pattern
+        self.llguidance_tokenizer = from_tokenizer(self.tokenizer, n_vocab)
+    def _from_serialized(self, serialized_grammar) -> Optional[GuidanceGrammar]:
+        try:
+            return GuidanceGrammar(
+                llguidance_tokenizer=self.llguidance_tokenizer,
+                serialized_grammar=serialized_grammar,
+            )
+        except Exception as e:
+            logger.warning(f"Skip invalid grammar: {serialized_grammar}, {e=}")
+            return None
+    def dispatch_json(self, key_string: str) -> Optional[GuidanceGrammar]:
+        serialized_grammar = LLMatcher.grammar_from_json_schema(
+            key_string,
+            defaults={
+                "whitespace_pattern": self.whitespace_pattern,
+            },
+        )
+        return self._from_serialized(serialized_grammar)
+    def dispatch_regex(self, key_string: str) -> Optional[GuidanceGrammar]:
+        serialized_grammar = grammar_from("regex", key_string)
+        return self._from_serialized(serialized_grammar)
+    def dispatch_ebnf(self, key_string: str) -> Optional[GuidanceGrammar]:
+        try:
+            serialized_grammar = grammar_from("ebnf", key_string)
+            return self._from_serialized(serialized_grammar)
+        except ValueError as e:
+            logger.warning(f"Skip invalid ebnf: regex={key_string}, {e=}")
+            return None
+    def dispatch_structural_tag(self, key_string: str) -> Optional[GuidanceGrammar]:
+        try:
+            structural_tag = json.loads(key_string)
+            tags = [
+                StructTag(
+                    begin=structure["begin"],
+                    grammar=structure["schema"],
+                    end=structure["end"],
+                    trigger=structural_tag["triggers"][0],  # TODO?
+                )
+                for structure in structural_tag["structures"]
+            ]
+            g = StructTag.to_grammar(tags)
+            return self._from_serialized(g)
+        except Exception as e:
+            logging.warning(f"Skip invalid structural_tag: {key_string}, {e=}")
+            return None

{sglang-0.4.5.post3 → sglang-0.4.6.post1}/sglang/srt/conversation.py RENAMED Viewed

@@ -463,6 +463,30 @@ def generate_embedding_convs(
     return convs
+# Models in which system adds modality tokens at prompt start automatically
+# when media inputs exceed modality tokens in prompt (e.g. 3 images but 2 <image> tokens)
+_MODELS_REQUIRING_MODALITY_SUPPLEMENT = {"deepseek-vl2"}
+# adapted from https://github.com/vllm-project/vllm/blob/5124f5bf51b83e6f344c1bc6652e8c4d81313b34/vllm/entrypoints/chat_utils.py#L856
+def _get_full_multimodal_text_prompt(
+    modality_token: str, modality_count: int, text_prompt: str
+) -> str:
+    """Combine multimodal prompts for a multimodal language model."""
+    # For any existing placeholder in the text prompt, we leave it as is
+    left: int = modality_count - text_prompt.count(modality_token)
+    if left < 0:
+        raise ValueError(
+            f"Found more '{modality_token}' placeholders in input prompt than "
+            "actual multimodal data items."
+        )
+    # NOTE: For now we always add missing modality_token at the front of
+    # the prompt. This may change to be customizable in the future.
+    return "\n".join([modality_token] * left + [text_prompt])
 def generate_chat_conv(
     request: ChatCompletionRequest, template_name: str
 ) -> Conversation:
@@ -520,6 +544,12 @@ def generate_chat_conv(
                         if conv.name != "qwen2-vl"
                         else conv.image_token
                     )
+                add_token_as_needed: bool = (
+                    conv.name in _MODELS_REQUIRING_MODALITY_SUPPLEMENT
+                )
+                if add_token_as_needed:
+                    image_token = ""
                 audio_token = conv.audio_token
                 for content in message.content:
                     if content.type == "text":
@@ -533,7 +563,10 @@ def generate_chat_conv(
                     elif content.type == "audio_url":
                         real_content += audio_token
                         conv.append_audio(content.audio_url.url)
+                if add_token_as_needed:
+                    real_content = _get_full_multimodal_text_prompt(
+                        conv.image_token, num_image_url, real_content
+                    )
                 conv.append_message(conv.roles[0], real_content)
         elif msg_role == "assistant":
             parsed_content = ""

sglang 0.4.5.post3__tar.gz → 0.4.6.post1__tar.gz

sglang 0.4.5.post3__tar.gz → 0.4.6.post1__tar.gz