sglang 0.4.5.post2__py3-none-any.whl → 0.4.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. sglang/bench_one_batch.py +19 -3
  2. sglang/bench_serving.py +8 -8
  3. sglang/compile_deep_gemm.py +177 -0
  4. sglang/lang/backend/openai.py +5 -1
  5. sglang/lang/backend/runtime_endpoint.py +5 -1
  6. sglang/srt/code_completion_parser.py +1 -1
  7. sglang/srt/configs/deepseekvl2.py +1 -1
  8. sglang/srt/configs/model_config.py +11 -2
  9. sglang/srt/constrained/llguidance_backend.py +78 -61
  10. sglang/srt/constrained/xgrammar_backend.py +1 -0
  11. sglang/srt/conversation.py +34 -1
  12. sglang/srt/disaggregation/decode.py +96 -5
  13. sglang/srt/disaggregation/mini_lb.py +113 -15
  14. sglang/srt/disaggregation/mooncake/conn.py +199 -32
  15. sglang/srt/disaggregation/nixl/__init__.py +1 -0
  16. sglang/srt/disaggregation/nixl/conn.py +622 -0
  17. sglang/srt/disaggregation/prefill.py +119 -20
  18. sglang/srt/disaggregation/utils.py +17 -0
  19. sglang/srt/entrypoints/engine.py +4 -0
  20. sglang/srt/entrypoints/http_server.py +11 -9
  21. sglang/srt/function_call_parser.py +132 -0
  22. sglang/srt/layers/activation.py +2 -2
  23. sglang/srt/layers/attention/base_attn_backend.py +3 -0
  24. sglang/srt/layers/attention/flashattention_backend.py +809 -160
  25. sglang/srt/layers/attention/flashmla_backend.py +8 -11
  26. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +5 -5
  27. sglang/srt/layers/attention/triton_ops/extend_attention.py +5 -5
  28. sglang/srt/layers/attention/triton_ops/prefill_attention.py +7 -3
  29. sglang/srt/layers/attention/vision.py +2 -0
  30. sglang/srt/layers/dp_attention.py +1 -1
  31. sglang/srt/layers/layernorm.py +42 -5
  32. sglang/srt/layers/logits_processor.py +2 -2
  33. sglang/srt/layers/moe/ep_moe/layer.py +2 -0
  34. sglang/srt/layers/moe/fused_moe_native.py +2 -4
  35. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +41 -41
  36. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  37. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +18 -15
  38. sglang/srt/layers/pooler.py +6 -0
  39. sglang/srt/layers/quantization/awq.py +5 -1
  40. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  41. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  42. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +153 -0
  43. sglang/srt/layers/quantization/deep_gemm.py +385 -0
  44. sglang/srt/layers/quantization/fp8_kernel.py +7 -38
  45. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  46. sglang/srt/layers/quantization/gptq.py +13 -7
  47. sglang/srt/layers/quantization/int8_kernel.py +32 -1
  48. sglang/srt/layers/quantization/modelopt_quant.py +2 -2
  49. sglang/srt/layers/quantization/w8a8_int8.py +3 -3
  50. sglang/srt/layers/radix_attention.py +13 -3
  51. sglang/srt/layers/rotary_embedding.py +176 -132
  52. sglang/srt/layers/sampler.py +2 -2
  53. sglang/srt/managers/data_parallel_controller.py +17 -4
  54. sglang/srt/managers/io_struct.py +21 -3
  55. sglang/srt/managers/mm_utils.py +85 -28
  56. sglang/srt/managers/multimodal_processors/base_processor.py +14 -1
  57. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +9 -2
  58. sglang/srt/managers/multimodal_processors/gemma3.py +2 -5
  59. sglang/srt/managers/multimodal_processors/janus_pro.py +2 -2
  60. sglang/srt/managers/multimodal_processors/minicpm.py +4 -3
  61. sglang/srt/managers/multimodal_processors/qwen_vl.py +38 -13
  62. sglang/srt/managers/schedule_batch.py +42 -12
  63. sglang/srt/managers/scheduler.py +47 -26
  64. sglang/srt/managers/tokenizer_manager.py +120 -30
  65. sglang/srt/managers/tp_worker.py +1 -0
  66. sglang/srt/mem_cache/hiradix_cache.py +40 -32
  67. sglang/srt/mem_cache/memory_pool.py +118 -13
  68. sglang/srt/model_executor/cuda_graph_runner.py +16 -10
  69. sglang/srt/model_executor/forward_batch_info.py +51 -95
  70. sglang/srt/model_executor/model_runner.py +29 -27
  71. sglang/srt/models/deepseek.py +12 -2
  72. sglang/srt/models/deepseek_nextn.py +101 -6
  73. sglang/srt/models/deepseek_v2.py +153 -76
  74. sglang/srt/models/deepseek_vl2.py +9 -4
  75. sglang/srt/models/gemma3_causal.py +1 -1
  76. sglang/srt/models/llama4.py +0 -1
  77. sglang/srt/models/minicpm3.py +2 -2
  78. sglang/srt/models/minicpmo.py +22 -7
  79. sglang/srt/models/mllama4.py +2 -2
  80. sglang/srt/models/qwen2_5_vl.py +3 -6
  81. sglang/srt/models/qwen2_vl.py +3 -7
  82. sglang/srt/models/roberta.py +178 -0
  83. sglang/srt/openai_api/adapter.py +87 -10
  84. sglang/srt/openai_api/protocol.py +6 -1
  85. sglang/srt/server_args.py +65 -60
  86. sglang/srt/speculative/build_eagle_tree.py +2 -2
  87. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
  88. sglang/srt/speculative/eagle_utils.py +2 -2
  89. sglang/srt/speculative/eagle_worker.py +2 -7
  90. sglang/srt/torch_memory_saver_adapter.py +10 -1
  91. sglang/srt/utils.py +48 -6
  92. sglang/test/runners.py +6 -13
  93. sglang/test/test_utils.py +39 -19
  94. sglang/version.py +1 -1
  95. {sglang-0.4.5.post2.dist-info → sglang-0.4.6.dist-info}/METADATA +6 -7
  96. {sglang-0.4.5.post2.dist-info → sglang-0.4.6.dist-info}/RECORD +99 -92
  97. {sglang-0.4.5.post2.dist-info → sglang-0.4.6.dist-info}/WHEEL +1 -1
  98. {sglang-0.4.5.post2.dist-info → sglang-0.4.6.dist-info}/licenses/LICENSE +0 -0
  99. {sglang-0.4.5.post2.dist-info → sglang-0.4.6.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch.py CHANGED
@@ -57,6 +57,7 @@ import torch
 import torch.distributed as dist
 
 from sglang.srt.configs.model_config import ModelConfig
+from sglang.srt.distributed.parallel_state import destroy_distributed_environment
 from sglang.srt.entrypoints.engine import _set_envs_and_config
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
@@ -85,6 +86,7 @@ class BenchArgs:
     correctness_test: bool = False
     # This is only used for correctness test
     cut_len: int = 4
+    log_decode_step: int = 0
     profile: bool = False
     profile_filename_prefix: str = "profile"
 
@@ -105,6 +107,12 @@ class BenchArgs:
         )
         parser.add_argument("--correctness-test", action="store_true")
         parser.add_argument("--cut-len", type=int, default=BenchArgs.cut_len)
+        parser.add_argument(
+            "--log-decode-step",
+            type=int,
+            default=BenchArgs.log_decode_step,
+            help="Log decode latency by step, default is set to zero to disable.",
+        )
         parser.add_argument(
             "--profile", action="store_true", help="Use Torch Profiler."
         )
@@ -335,6 +343,7 @@ def latency_test_run_once(
     input_len,
     output_len,
     device,
+    log_decode_step,
     profile,
     profile_filename_prefix,
 ):
@@ -394,9 +403,9 @@ def latency_test_run_once(
         tot_latency += latency
         throughput = batch_size / latency
         decode_latencies.append(latency)
-        if i < 5:
+        if i < 5 or (log_decode_step > 0 and i % log_decode_step == 0):
             rank_print(
-                f"Decode. Batch size: {batch_size}, latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
+                f"Decode {i}. Batch size: {batch_size}, latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
             )
 
     if profile:
@@ -457,8 +466,9 @@ def latency_test(
         reqs,
         bench_args.batch_size[0],
         bench_args.input_len[0],
-        8,  # shorter decoding to speed up the warmup
+        min(32, bench_args.output_len[0]),  # shorter decoding to speed up the warmup
         server_args.device,
+        log_decode_step=0,
         profile=False,
         profile_filename_prefix="",  # not used
     )
@@ -480,6 +490,7 @@ def latency_test(
             il,
             ol,
             server_args.device,
+            bench_args.log_decode_step,
             bench_args.profile if tp_rank == 0 else None,
             bench_args.profile_filename_prefix,
         )
@@ -492,8 +503,13 @@ def latency_test(
             for result in result_list:
                 fout.write(json.dumps(result) + "\n")
 
+    if server_args.tp_size > 1:
+        destroy_distributed_environment()
+
 
 def main(server_args, bench_args):
+    server_args.cuda_graph_max_bs = max(bench_args.batch_size)
+
     _set_envs_and_config(server_args)
 
     if server_args.model_path:
sglang/bench_serving.py CHANGED
@@ -295,7 +295,7 @@ async def async_request_truss(
                             # NOTE: Some completion API might have a last
                             # usage summary response without a token so we
                             # want to check a token was generated
-                            if data["choices"][0]["delta"]["content"]:
+                            if data["choices"][0]["text"]:
                                 timestamp = time.perf_counter()
                                 # First token
                                 if ttft == 0.0:
@@ -307,7 +307,7 @@ async def async_request_truss(
                                     output.itl.append(timestamp - most_recent_timestamp)
 
                                 most_recent_timestamp = timestamp
-                                generated_text += data["choices"][0]["delta"]["content"]
+                                generated_text += data["choices"][0]["text"]
 
                     output.generated_text = generated_text
                     output.success = True
@@ -690,7 +690,6 @@ def sample_random_requests(
     dataset_path: str,
     random_sample: bool = True,
 ) -> List[Tuple[str, int, int]]:
-
     input_lens = np.random.randint(
         max(int(input_len * range_ratio), 1),
         input_len + 1,
@@ -978,6 +977,7 @@ async def benchmark(
     profile: bool,
     pd_seperated: bool = False,
     flush_cache: bool = False,
+    warmup_requests: int = 1,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -995,11 +995,11 @@ async def benchmark(
             return await request_func(request_func_input=request_func_input, pbar=pbar)
 
     # Warmup
-    print(f"Starting warmup with {args.warmup_requests} sequences...")
+    print(f"Starting warmup with {warmup_requests} sequences...")
 
     # Use the first request for all warmup iterations
     test_prompt, test_prompt_len, test_output_len = input_requests[0]
-    if lora_names != None and len(lora_names) != 0:
+    if lora_names is not None and len(lora_names) != 0:
         lora_name = lora_names[0]
     else:
         lora_name = None
@@ -1017,7 +1017,7 @@ async def benchmark(
 
     # Run warmup requests
     warmup_tasks = []
-    for _ in range(args.warmup_requests):
+    for _ in range(warmup_requests):
         warmup_tasks.append(
             asyncio.create_task(request_func(request_func_input=test_input))
         )
@@ -1025,7 +1025,7 @@ async def benchmark(
     warmup_outputs = await asyncio.gather(*warmup_tasks)
 
     # Check if at least one warmup request succeeded
-    if not any(output.success for output in warmup_outputs):
+    if warmup_requests > 0 and not any(output.success for output in warmup_outputs):
         raise ValueError(
             "Warmup failed - Please make sure benchmark arguments "
             f"are correctly specified. Error: {warmup_outputs[0].error}"
@@ -1057,7 +1057,7 @@ async def benchmark(
     tasks: List[asyncio.Task] = []
     async for request in get_request(input_requests, request_rate):
         prompt, prompt_len, output_len = request
-        if lora_names != None and len(lora_names) != 0:
+        if lora_names is not None and len(lora_names) != 0:
             idx = random.randint(0, len(lora_names) - 1)
             lora_name = lora_names[idx]
         else:
sglang/compile_deep_gemm.py ADDED
@@ -0,0 +1,177 @@
+"""
+Compile DeepGEMM Kernels for a model with specify server arguments
+
+This script launches a server for capturing DeepGEMM calls and then compiles the kernels.
+It accepts server arguments (the same as launch_server.py).
+
+Usage:
+python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code
+
+"""
+
+import argparse
+import dataclasses
+import multiprocessing
+import os
+import time
+
+import requests
+
+from sglang.srt.entrypoints.http_server import launch_server
+from sglang.srt.managers.io_struct import GenerateReqInput
+from sglang.srt.managers.tokenizer_manager import TokenizerManager
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import kill_process_tree
+from sglang.srt.warmup import warmup
+
+multiprocessing.set_start_method("spawn", force=True)
+
+# Reduce warning
+os.environ["SGL_IN_DEEPGEMM_PRECOMPILE_STAGE"] = "1"
+# Force enable deep gemm
+os.environ["SGL_ENABLE_JIT_DEEPGEMM"] = "1"
+# Force enable mha chunked kv for DeepSeek V3 to avoid missing kv_b_proj DeepGEMM case
+os.environ["SGL_CHUNKED_PREFIX_CACHE_THRESHOLD"] = "0"
+
+
+@dataclasses.dataclass
+class CompileArgs:
+    timeout: int = 3600
+
+    @staticmethod
+    def add_cli_args(parser: argparse.ArgumentParser):
+        parser.add_argument("--timeout", type=int, default=CompileArgs.timeout)
+
+    @classmethod
+    def from_cli_args(cls, args: argparse.Namespace):
+        # use the default value's type to cast the args into correct types.
+        attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
+        return cls(
+            **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs}
+        )
+
+
+@warmup("compile-deep-gemm")
+async def warm_up_compile(tokenizer_manager: TokenizerManager):
+    print("\nGenerate warm up request for compiling DeepGEMM...\n")
+    generate_req_input = GenerateReqInput(
+        input_ids=[0, 1, 2, 3],
+        sampling_params={
+            "temperature": 0.0,
+            "max_new_tokens": 8,
+            "ignore_eos": True,
+        },
+    )
+    await tokenizer_manager.generate_request(generate_req_input, None).__anext__()
+
+
+def launch_server_internal(server_args):
+    try:
+        launch_server(server_args)
+    except Exception as e:
+        raise e
+    finally:
+        kill_process_tree(os.getpid(), include_parent=False)
+
+
+def launch_server_process_and_send_one_request(
+    server_args: ServerArgs, compile_args: CompileArgs
+):
+    proc = multiprocessing.Process(target=launch_server_internal, args=(server_args,))
+    proc.start()
+    base_url = f"http://{server_args.host}:{server_args.port}"
+    timeout = compile_args.timeout
+
+    start_time = time.time()
+    while time.time() - start_time < timeout:
+        try:
+            headers = {
+                "Content-Type": "application/json; charset=utf-8",
+            }
+            if server_args.node_rank == 0:
+                response = requests.get(f"{base_url}/v1/models", headers=headers)
+            else:
+                # This http api is created by launch_dummy_health_check_server for none-rank0 node.
+                response = requests.get(f"{base_url}/health", headers=headers)
+            if response.status_code == 200:
+                # Rank-0 node send a request to sync with other node and then return.
+                if server_args.node_rank == 0:
+                    response = requests.post(
+                        f"{base_url}/generate",
+                        json={
+                            "input_ids": [0, 1, 2, 3],
+                            "sampling_params": {
+                                "max_new_tokens": 8,
+                                "temperature": 0,
+                            },
+                        },
+                        timeout=600,
+                    )
+                    if response.status_code != 200:
+                        error = response.json()
+                        raise RuntimeError(f"Sync request failed: {error}")
+                # Other nodes should wait for the exit signal from Rank-0 node.
+                else:
+                    start_time_waiting = time.time()
+                    while proc.is_alive():
+                        if time.time() - start_time_waiting < timeout:
+                            time.sleep(10)
+                        else:
+                            raise TimeoutError("Waiting for main node timeout!")
+                return proc
+        except requests.RequestException:
+            pass
+        time.sleep(10)
+    raise TimeoutError(
+        "DeepGEMM Kernels compilation timeout."
+        "\n\nFeel free and please restart the command."
+    )
+
+
+def refine_server_args(server_args: ServerArgs, compile_args: CompileArgs):
+    # Disbale cuda graph and torch compile to save time
+    server_args.disable_cuda_graph = True
+    server_args.enable_torch_compile = False
+    print(f"Disable CUDA Graph and Torch Compile to save time...")
+
+    # Set watchdog timeout to compile_args.timeout because compilation will take a long time
+    server_args.watchdog_timeout = compile_args.timeout
+    server_args.warmups = "compile-deep-gemm"
+
+
+def run_compile(server_args: ServerArgs, compile_args: CompileArgs):
+    print(
+        "Begin DeepGEMM Kernels compilation...\n"
+        "It may take a long time and timeout maybe raised "
+        "while the compilation is still in progress.\n"
+        "Just feel free to restart the command "
+        "until the compilation is fully finished.\n"
+    )
+
+    proc = launch_server_process_and_send_one_request(server_args, compile_args)
+
+    print("\nDeepGEMM Kernels compilation finished successfully.")
+
+    # Sleep for safety
+    time.sleep(10)
+    if proc.is_alive():
+        # This is the rank0 node.
+        kill_process_tree(proc.pid)
+    else:
+        try:
+            kill_process_tree(proc.pid)
+        except Exception:
+            pass
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ServerArgs.add_cli_args(parser)
+    CompileArgs.add_cli_args(parser)
+    args = parser.parse_args()
+    server_args = ServerArgs.from_cli_args(args)
+    compile_args = CompileArgs.from_cli_args(args)
+
+    refine_server_args(server_args, compile_args)
+
+    run_compile(server_args, compile_args)
sglang/lang/backend/openai.py CHANGED
@@ -161,7 +161,11 @@ class OpenAI(BaseBackend):
                 prompt = s.text_
 
             kwargs = sampling_params.to_openai_kwargs()
-            if self.model_name.startswith("o1") or self.model_name.startswith("o3"):
+            if (
+                self.model_name.startswith("o1")
+                or self.model_name.startswith("o3")
+                or "o1" in self.model_name
+            ):
                 kwargs.pop("max_tokens", None)
             else:
                 kwargs.pop("max_completion_tokens", None)
sglang/lang/backend/runtime_endpoint.py CHANGED
@@ -324,7 +324,11 @@ class RuntimeEndpoint(BaseBackend):
 
     def _assert_success(self, res):
         if res.status_code != 200:
-            raise RuntimeError(res.json())
+            try:
+                content = res.json()
+            except json.JSONDecodeError:
+                content = res.text
+            raise RuntimeError(content)
 
 
 def compute_normalized_prompt_logprobs(input_logprobs):
sglang/srt/code_completion_parser.py CHANGED
@@ -113,7 +113,7 @@ def completion_template_exists(template_name: str) -> bool:
 
 def is_completion_template_defined() -> bool:
     global completion_template_name
-    return completion_template_name != None
+    return completion_template_name is not None
 
 
 def generate_completion_prompt_from_request(request: ChatCompletionRequest) -> str:
sglang/srt/configs/deepseekvl2.py CHANGED
@@ -182,7 +182,7 @@ class DeepseekVLV2Processor(ProcessorMixin):
             tokenized_str, images, seq_mask, spatial_crop = self.tokenize_with_images(
                 messages,
                 pil_images[image_index : image_index + image_token_cnt],
-                bos=False,
+                bos=True,
                 eos=True,
                 cropping=len(pil_images) <= 2,
                 max_req_input_len=max_req_input_len,
sglang/srt/configs/model_config.py CHANGED
@@ -73,8 +73,15 @@ class ModelConfig:
         )
 
         if enable_multimodal is None:
-            if self.hf_config.architectures == "Llama4ForConditionalGeneration":
+            mm_disabled_models = [
+                "Gemma3ForConditionalGeneration",
+                "Llama4ForConditionalGeneration",
+            ]
+            if self.hf_config.architectures[0] in mm_disabled_models:
                 enable_multimodal = False
+                logger.info(
+                    f"Multimodal is disabled for {self.hf_config.model_type}. To enable it, set --enable-multimodal."
+                )
             else:
                 enable_multimodal = True
 
@@ -155,7 +162,9 @@ class ModelConfig:
             self.attention_arch = AttentionArch.MLA
             self.kv_lora_rank = self.hf_config.kv_lora_rank
             self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
-        elif "DeepseekVL2ForCausalLM" in self.hf_config.architectures:
+        elif "DeepseekVL2ForCausalLM" in self.hf_config.architectures and getattr(
+            self.hf_text_config, "use_mla", True
+        ):
             self.head_dim = 256
             self.attention_arch = AttentionArch.MLA
             self.kv_lora_rank = self.hf_text_config.kv_lora_rank
sglang/srt/constrained/llguidance_backend.py CHANGED
@@ -14,49 +14,48 @@
 """Constrained decoding with llguidance backend."""
 
 import json
+import logging
 import os
 from typing import List, Optional, Tuple
 
-import llguidance
-import llguidance.hf
-import llguidance.torch
 import torch
-from llguidance.gbnf_to_lark import any_to_lark
+from llguidance import LLMatcher, LLTokenizer, StructTag, grammar_from
+from llguidance.hf import from_tokenizer
+from llguidance.torch import (
+    allocate_token_bitmask,
+    apply_token_bitmask_inplace,
+    fill_next_token_bitmask,
+)
 
 from sglang.srt.constrained.base_grammar_backend import (
     BaseGrammarBackend,
     BaseGrammarObject,
 )
 
+logger = logging.getLogger(__name__)
+
 
 class GuidanceGrammar(BaseGrammarObject):
-    def __init__(
-        self, llguidance_tokenizer: llguidance.LLTokenizer, serialized_grammar: str
-    ):
+
+    def __init__(self, llguidance_tokenizer: LLTokenizer, serialized_grammar: str):
         super().__init__()
         self.llguidance_tokenizer = llguidance_tokenizer
         self.serialized_grammar = serialized_grammar
 
-        # TODO: add support for fast-forward tokens in the future
-        self.ll_interpreter = llguidance.LLInterpreter(
+        self.ll_matcher = LLMatcher(
             self.llguidance_tokenizer,
             self.serialized_grammar,
-            enable_backtrack=False,
-            enable_ff_tokens=False,
             log_level=int(os.environ.get("LLGUIDANCE_LOG_LEVEL", "1")),
         )
-        self.pending_ff_tokens: list[int] = []
         self.finished = False
         self.bitmask = None
 
     def try_jump_forward(self, tokenizer) -> Optional[Tuple[List[int], str]]:
-        if len(self.pending_ff_tokens) > 0:
-            s = self.llguidance_tokenizer.decode_str(self.pending_ff_tokens)
-            ff_tokens = self.pending_ff_tokens
-            self.pending_ff_tokens = []
-            return (ff_tokens, s)
-
-        return None
+        ff_tokens = self.ll_matcher.compute_ff_tokens()
+        if ff_tokens:
+            return ff_tokens, ""
+        else:
+            return None
 
     def jump_forward_str_state(self, helper: Tuple[List[int], str]) -> Tuple[str, int]:
         return "", -1
@@ -67,32 +66,22 @@ class GuidanceGrammar(BaseGrammarObject):
         pass
 
     def accept_token(self, token: int):
-        backtrack, ff_tokens = self.ll_interpreter.commit_token(token)
-        if len(ff_tokens) > 0 and backtrack == 0:
-            # first token is last generated token
-            ff_tokens = ff_tokens[1:]
-        self.pending_ff_tokens.extend(ff_tokens)
+        if not self.ll_matcher.consume_token(token):
+            logger.warning(f"matcher error: {self.ll_matcher.get_error()}")
+            self.finished = True
 
     def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
-        if len(self.pending_ff_tokens) > 0:
-            # if we have pending fast-forward tokens,
-            # just return them immediately
-            ff_token = self.pending_ff_tokens.pop(0)
-            vocab_mask[idx, :] = 0
-            vocab_mask[idx, ff_token // 32] = 1 << (ff_token % 32)
-            return
-
-        if self.ll_interpreter.has_pending_stop():
+        if self.ll_matcher.is_stopped():
             self.finished = True
 
-        llguidance.torch.fill_next_token_bitmask(self.ll_interpreter, vocab_mask, idx)
+        fill_next_token_bitmask(self.ll_matcher, vocab_mask, idx)
 
     def allocate_vocab_mask(
         self, vocab_size: int, batch_size: int, device
     ) -> torch.Tensor:
         if self.bitmask is None or self.bitmask.shape[0] < batch_size:
             # only create bitmask when batch gets larger
-            self.bitmask = llguidance.torch.allocate_token_bitmask(
+            self.bitmask = allocate_token_bitmask(
                 batch_size, self.llguidance_tokenizer.vocab_size
             )
         bitmask = self.bitmask
@@ -107,7 +96,7 @@ class GuidanceGrammar(BaseGrammarObject):
 
     @staticmethod
     def apply_vocab_mask(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
-        llguidance.torch.apply_token_bitmask_inplace(logits, vocab_mask)
+        apply_token_bitmask_inplace(logits, vocab_mask)
 
     def copy(self):
         return GuidanceGrammar(
@@ -117,36 +106,64 @@ class GuidanceGrammar(BaseGrammarObject):
 
 
 class GuidanceBackend(BaseGrammarBackend):
-    def __init__(self, tokenizer, whitespace_pattern: Optional[str] = None):
+
+    def __init__(
+        self,
+        tokenizer,
+        whitespace_pattern: Optional[str] = None,
+        n_vocab: Optional[int] = None,
+    ):
         super().__init__()
 
         self.tokenizer = tokenizer
-        self.whitespace_flexible = (
-            True if whitespace_pattern == "whitespace_flexible" else False
-        )
-        self.llguidance_tokenizer = llguidance.hf.from_tokenizer(self.tokenizer, None)
-
-    def _from_serialized(self, serialized_grammar) -> GuidanceGrammar:
-        return GuidanceGrammar(
-            llguidance_tokenizer=self.llguidance_tokenizer,
-            serialized_grammar=serialized_grammar,
+        self.whitespace_pattern = whitespace_pattern
+        self.llguidance_tokenizer = from_tokenizer(self.tokenizer, n_vocab)
+
+    def _from_serialized(self, serialized_grammar) -> Optional[GuidanceGrammar]:
+        try:
+            return GuidanceGrammar(
+                llguidance_tokenizer=self.llguidance_tokenizer,
+                serialized_grammar=serialized_grammar,
+            )
+        except Exception as e:
+            logger.warning(f"Skip invalid grammar: {serialized_grammar}, {e=}")
+            return None
+
+    def dispatch_json(self, key_string: str) -> Optional[GuidanceGrammar]:
+        serialized_grammar = LLMatcher.grammar_from_json_schema(
+            key_string,
+            defaults={
+                "whitespace_pattern": self.whitespace_pattern,
+            },
         )
-
-    def dispatch_json(self, key_string: str) -> GuidanceGrammar:
-        json_schema = key_string
-        compiler = llguidance.JsonCompiler(whitespace_flexible=self.whitespace_flexible)
-        serialized_grammar = compiler.compile(json_schema)
-        return self._from_serialized(serialized_grammar)
-
-    def dispatch_regex(self, key_string: str) -> GuidanceGrammar:
-        compiler = llguidance.RegexCompiler()
-        serialized_grammar = compiler.compile(regex=key_string)
         return self._from_serialized(serialized_grammar)
 
-    def dispatch_ebnf(self, key_string: str) -> GuidanceGrammar:
-        compiler = llguidance.LarkCompiler()
-        serialized_grammar = compiler.compile(any_to_lark(key_string))
+    def dispatch_regex(self, key_string: str) -> Optional[GuidanceGrammar]:
+        serialized_grammar = grammar_from("regex", key_string)
         return self._from_serialized(serialized_grammar)
 
-    def dispatch_structural_tag(self, key_string: str):
-        return super().dispatch_structural_tag(key_string)
+    def dispatch_ebnf(self, key_string: str) -> Optional[GuidanceGrammar]:
+        try:
+            serialized_grammar = grammar_from("ebnf", key_string)
+            return self._from_serialized(serialized_grammar)
+        except ValueError as e:
+            logger.warning(f"Skip invalid ebnf: regex={key_string}, {e=}")
+            return None
+
+    def dispatch_structural_tag(self, key_string: str) -> Optional[GuidanceGrammar]:
+        try:
+            structural_tag = json.loads(key_string)
+            tags = [
+                StructTag(
+                    begin=structure["begin"],
+                    grammar=structure["schema"],
+                    end=structure["end"],
+                    trigger=structural_tag["triggers"][0],  # TODO?
+                )
+                for structure in structural_tag["structures"]
+            ]
+            g = StructTag.to_grammar(tags)
+            return self._from_serialized(g)
+        except Exception as e:
+            logging.warning(f"Skip invalid structural_tag: {key_string}, {e=}")
+            return None
sglang/srt/constrained/xgrammar_backend.py CHANGED
@@ -158,6 +158,7 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
     def dispatch_json(self, key_string: str) -> Optional[XGrammarGrammar]:
         try:
             if key_string == "$$ANY$$":
+                # Note: This builtin JSON grammar includes *all* valid JSON (including, for example, arrays at the root)
                 ctx = self.grammar_compiler.compile_builtin_json_grammar()
             else:
                 ctx = self.grammar_compiler.compile_json_schema(schema=key_string)
sglang/srt/conversation.py CHANGED
@@ -463,6 +463,30 @@ def generate_embedding_convs(
     return convs
 
 
+# Models in which system adds modality tokens at prompt start automatically
+# when media inputs exceed modality tokens in prompt (e.g. 3 images but 2 <image> tokens)
+_MODELS_REQUIRING_MODALITY_SUPPLEMENT = {"deepseek-vl2"}
+
+
+# adapted from https://github.com/vllm-project/vllm/blob/5124f5bf51b83e6f344c1bc6652e8c4d81313b34/vllm/entrypoints/chat_utils.py#L856
+def _get_full_multimodal_text_prompt(
+    modality_token: str, modality_count: int, text_prompt: str
+) -> str:
+    """Combine multimodal prompts for a multimodal language model."""
+
+    # For any existing placeholder in the text prompt, we leave it as is
+    left: int = modality_count - text_prompt.count(modality_token)
+    if left < 0:
+        raise ValueError(
+            f"Found more '{modality_token}' placeholders in input prompt than "
+            "actual multimodal data items."
+        )
+
+    # NOTE: For now we always add missing modality_token at the front of
+    # the prompt. This may change to be customizable in the future.
+    return "\n".join([modality_token] * left + [text_prompt])
+
+
 def generate_chat_conv(
     request: ChatCompletionRequest, template_name: str
 ) -> Conversation:
@@ -520,6 +544,12 @@ def generate_chat_conv(
                         if conv.name != "qwen2-vl"
                         else conv.image_token
                     )
+                add_token_as_needed: bool = (
+                    conv.name in _MODELS_REQUIRING_MODALITY_SUPPLEMENT
+                )
+                if add_token_as_needed:
+                    image_token = ""
+
                 audio_token = conv.audio_token
                 for content in message.content:
                     if content.type == "text":
@@ -533,7 +563,10 @@ def generate_chat_conv(
                     elif content.type == "audio_url":
                         real_content += audio_token
                         conv.append_audio(content.audio_url.url)
-
+                if add_token_as_needed:
+                    real_content = _get_full_multimodal_text_prompt(
+                        conv.image_token, num_image_url, real_content
+                    )
                 conv.append_message(conv.roles[0], real_content)
         elif msg_role == "assistant":
             parsed_content = ""