sglang 0.4.6.post3__py3-none-any.whl → 0.4.6.post5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +10 -8
- sglang/bench_one_batch.py +7 -6
- sglang/bench_one_batch_server.py +157 -21
- sglang/bench_serving.py +137 -59
- sglang/compile_deep_gemm.py +5 -5
- sglang/eval/loogle_eval.py +157 -0
- sglang/lang/chat_template.py +78 -78
- sglang/lang/tracer.py +1 -1
- sglang/srt/code_completion_parser.py +1 -1
- sglang/srt/configs/deepseekvl2.py +2 -2
- sglang/srt/configs/model_config.py +40 -28
- sglang/srt/constrained/base_grammar_backend.py +55 -72
- sglang/srt/constrained/llguidance_backend.py +25 -21
- sglang/srt/constrained/outlines_backend.py +27 -26
- sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
- sglang/srt/constrained/xgrammar_backend.py +69 -43
- sglang/srt/conversation.py +49 -44
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +129 -135
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +3 -13
- sglang/srt/disaggregation/kv_events.py +357 -0
- sglang/srt/disaggregation/mini_lb.py +57 -24
- sglang/srt/disaggregation/mooncake/conn.py +238 -122
- sglang/srt/disaggregation/mooncake/transfer_engine.py +2 -1
- sglang/srt/disaggregation/nixl/conn.py +10 -19
- sglang/srt/disaggregation/prefill.py +132 -47
- sglang/srt/disaggregation/utils.py +123 -6
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +5 -0
- sglang/srt/entrypoints/engine.py +44 -9
- sglang/srt/entrypoints/http_server.py +23 -6
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +250 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +157 -0
- sglang/srt/function_call/ebnf_composer.py +234 -0
- sglang/srt/function_call/function_call_parser.py +175 -0
- sglang/srt/function_call/llama32_detector.py +74 -0
- sglang/srt/function_call/mistral_detector.py +84 -0
- sglang/srt/function_call/pythonic_detector.py +163 -0
- sglang/srt/function_call/qwen25_detector.py +67 -0
- sglang/srt/function_call/utils.py +35 -0
- sglang/srt/hf_transformers_utils.py +46 -7
- sglang/srt/layers/attention/aiter_backend.py +513 -0
- sglang/srt/layers/attention/flashattention_backend.py +64 -18
- sglang/srt/layers/attention/flashinfer_mla_backend.py +8 -4
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/triton_backend.py +3 -0
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
- sglang/srt/layers/attention/utils.py +6 -4
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +451 -0
- sglang/srt/layers/dp_attention.py +61 -21
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/logits_processor.py +46 -11
- sglang/srt/layers/moe/cutlass_moe.py +207 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +34 -12
- sglang/srt/layers/moe/ep_moe/layer.py +105 -51
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +82 -7
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -1
- sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -0
- sglang/srt/layers/moe/topk.py +67 -10
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +8 -3
- sglang/srt/layers/quantization/blockwise_int8.py +2 -2
- sglang/srt/layers/quantization/deep_gemm.py +77 -74
- sglang/srt/layers/quantization/fp8.py +92 -2
- sglang/srt/layers/quantization/fp8_kernel.py +3 -3
- sglang/srt/layers/quantization/fp8_utils.py +6 -0
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +20 -7
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/layers/sampler.py +0 -4
- sglang/srt/layers/vocab_parallel_embedding.py +18 -7
- sglang/srt/lora/lora_manager.py +2 -4
- sglang/srt/lora/mem_pool.py +4 -4
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/data_parallel_controller.py +3 -3
- sglang/srt/managers/deepseek_eplb.py +278 -0
- sglang/srt/managers/detokenizer_manager.py +21 -8
- sglang/srt/managers/eplb_manager.py +55 -0
- sglang/srt/managers/expert_distribution.py +704 -56
- sglang/srt/managers/expert_location.py +394 -0
- sglang/srt/managers/expert_location_dispatch.py +91 -0
- sglang/srt/managers/io_struct.py +19 -4
- sglang/srt/managers/mm_utils.py +294 -140
- sglang/srt/managers/multimodal_processors/base_processor.py +127 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +31 -6
- sglang/srt/managers/multimodal_processors/internvl.py +14 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +7 -6
- sglang/srt/managers/multimodal_processors/llava.py +46 -0
- sglang/srt/managers/multimodal_processors/minicpm.py +25 -31
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +58 -16
- sglang/srt/managers/schedule_batch.py +122 -42
- sglang/srt/managers/schedule_policy.py +1 -5
- sglang/srt/managers/scheduler.py +205 -138
- sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +232 -58
- sglang/srt/managers/tp_worker.py +12 -9
- sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +76 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +314 -39
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +29 -19
- sglang/srt/model_executor/expert_location_updater.py +422 -0
- sglang/srt/model_executor/forward_batch_info.py +5 -1
- sglang/srt/model_executor/model_runner.py +163 -68
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_janus_pro.py +2 -2
- sglang/srt/models/deepseek_v2.py +308 -351
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_mm.py +70 -33
- sglang/srt/models/llama.py +2 -0
- sglang/srt/models/llama4.py +15 -8
- sglang/srt/models/llava.py +258 -7
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +5 -12
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mixtral.py +98 -34
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/pixtral.py +467 -0
- sglang/srt/models/qwen2.py +95 -26
- sglang/srt/models/qwen2_5_vl.py +8 -0
- sglang/srt/models/qwen2_moe.py +330 -60
- sglang/srt/models/qwen2_vl.py +6 -0
- sglang/srt/models/qwen3.py +52 -10
- sglang/srt/models/qwen3_moe.py +411 -48
- sglang/srt/models/roberta.py +1 -1
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/openai_api/adapter.py +58 -20
- sglang/srt/openai_api/protocol.py +6 -8
- sglang/srt/operations.py +154 -0
- sglang/srt/operations_strategy.py +31 -0
- sglang/srt/reasoning_parser.py +3 -3
- sglang/srt/sampling/custom_logit_processor.py +18 -3
- sglang/srt/sampling/sampling_batch_info.py +4 -56
- sglang/srt/sampling/sampling_params.py +2 -2
- sglang/srt/server_args.py +162 -22
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
- sglang/srt/speculative/eagle_utils.py +138 -7
- sglang/srt/speculative/eagle_worker.py +69 -21
- sglang/srt/utils.py +74 -17
- sglang/test/few_shot_gsm8k.py +2 -2
- sglang/test/few_shot_gsm8k_engine.py +2 -2
- sglang/test/run_eval.py +2 -2
- sglang/test/runners.py +8 -1
- sglang/test/send_one.py +13 -3
- sglang/test/simple_eval_common.py +1 -1
- sglang/test/simple_eval_humaneval.py +1 -1
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_programs.py +5 -5
- sglang/test/test_utils.py +55 -14
- sglang/utils.py +3 -3
- sglang/version.py +1 -1
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/METADATA +23 -13
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/RECORD +178 -149
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/top_level.txt +0 -0
sglang/lang/chat_template.py
CHANGED
@@ -1,3 +1,4 @@
+import re
 from dataclasses import dataclass
 from enum import Enum, auto
 from typing import Callable, Dict, List, Tuple
@@ -71,9 +72,9 @@ def get_chat_template(name):
 
 def get_chat_template_by_model_path(model_path):
     for matching_func in matching_function_registry:
-
-        if
-        return
+        template_name = matching_func(model_path)
+        if template_name is not None:
+            return get_chat_template(template_name)
     return get_chat_template("default")
 
 
@@ -193,6 +194,21 @@ register_chat_template(
     )
 )
 
+# Reference: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/blob/main/chat_template.json
+register_chat_template(
+    ChatTemplate(
+        name="mistral",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": ("[SYSTEM_PROMPT] ", " [/SYSTEM_PROMPT]"),
+            "user": ("[INST] ", " [/INST]"),
+            "assistant": ("", " </s><s>"),
+        },
+        stop_str=("</s>",),
+        image_token="[IMG]",
+    )
+)
+
 register_chat_template(
     ChatTemplate(
         name="llama-3-instruct",
@@ -479,134 +495,118 @@ register_chat_template(
 
 @register_chat_template_matching_function
 def match_deepseek(model_path: str):
-    if (
-        "
-    )
-        return
+    if re.search(r"deepseek-(v3|r1)", model_path, re.IGNORECASE) and not re.search(
+        r"base", model_path, re.IGNORECASE
+    ):
+        return "deepseek-v3"
 
 
 @register_chat_template_matching_function
 def match_deepseek_janus_pro(model_path: str):
-    if "janus"
-        return
+    if re.search(r"janus", model_path, re.IGNORECASE):
+        return "janus-pro"
 
 
 @register_chat_template_matching_function
 def match_dbrx(model_path: str):
-    if "dbrx"
-
+    if re.search(r"dbrx", model_path, re.IGNORECASE) and re.search(
+        r"instruct", model_path, re.IGNORECASE
+    ):
+        return "dbrx-instruct"
 
 
 @register_chat_template_matching_function
 def match_vicuna(model_path: str):
-    if "vicuna"
-        return
-    if "llava-v1.5" in model_path.lower():
-        return get_chat_template("vicuna_v1.1")
-    if "llava-next-video-7b" in model_path.lower():
-        return get_chat_template("vicuna_v1.1")
+    if re.search(r"vicuna|llava-v1\.5|llava-next-video-7b", model_path, re.IGNORECASE):
+        return "vicuna_v1.1"
 
 
 @register_chat_template_matching_function
 def match_llama2_chat(model_path: str):
-
-
-
-
-
-
-
-
-
+    if re.search(
+        r"llama-2.*chat|codellama.*instruct",
+        model_path,
+        re.IGNORECASE,
+    ):
+        return "llama-2-chat"
+
+
+@register_chat_template_matching_function
+def match_mistral(model_path: str):
+    if re.search(r"pixtral|(mistral|mixtral).*instruct", model_path, re.IGNORECASE):
+        return "mistral"
 
 
 @register_chat_template_matching_function
 def match_llama3_instruct(model_path: str):
-
-
-    return get_chat_template("llama-3-instruct")
+    if re.search(r"llama-3.*instruct", model_path, re.IGNORECASE):
+        return "llama-3-instruct"
 
 
 @register_chat_template_matching_function
 def match_chat_ml(model_path: str):
-
-
-    if "
-        return
-
-
-    return get_chat_template("qwen2-vl")
-    if "qwen" in model_path:
-        if "vl" in model_path:
-            return get_chat_template("qwen2-vl")
-        if ("chat" in model_path or "instruct" in model_path) and (
-            "llava" not in model_path
-        ):
-            return get_chat_template("qwen")
-    if (
-        "llava-v1.6-34b" in model_path
-        or "llava-v1.6-yi-34b" in model_path
-        or "llava-next-video-34b" in model_path
-        or "llava-onevision-qwen2" in model_path
+    if re.search(r"tinyllama", model_path, re.IGNORECASE):
+        return "chatml"
+    if re.search(r"qwen.*vl", model_path, re.IGNORECASE):
+        return "qwen2-vl"
+    if re.search(r"qwen.*(chat|instruct)", model_path, re.IGNORECASE) and not re.search(
+        r"llava", model_path, re.IGNORECASE
     ):
-        return
+        return "qwen"
+    if re.search(
+        r"llava-v1\.6-34b|llava-v1\.6-yi-34b|llava-next-video-34b|llava-onevision-qwen2",
+        model_path,
+        re.IGNORECASE,
+    ):
+        return "chatml-llava"
 
 
 @register_chat_template_matching_function
 def match_chat_yi(model_path: str):
-
-
-
-
-
+    if re.search(r"yi-vl", model_path, re.IGNORECASE) and not re.search(
+        r"llava", model_path, re.IGNORECASE
+    ):
+        return "yi-vl"
+    elif re.search(r"yi-1\.5.*chat", model_path, re.IGNORECASE):
+        return "yi-1.5"
 
 
 @register_chat_template_matching_function
 def match_gemma_it(model_path: str):
-
-
-    return get_chat_template("gemma-it")
+    if re.search(r"gemma.*it", model_path, re.IGNORECASE):
+        return "gemma-it"
 
 
 @register_chat_template_matching_function
 def match_openbmb_minicpm(model_path: str):
-
-
-
-
-    return get_chat_template("minicpmo")
+    if re.search(r"minicpm-v", model_path, re.IGNORECASE):
+        return "minicpmv"
+    elif re.search(r"minicpm-o", model_path, re.IGNORECASE):
+        return "minicpmo"
 
 
 @register_chat_template_matching_function
 def match_c4ai_command_r(model_path: str):
-
-
-    return get_chat_template("c4ai-command-r")
+    if re.search(r"c4ai-command-r", model_path, re.IGNORECASE):
+        return "c4ai-command-r"
 
 
 @register_chat_template_matching_function
 def match_granite_instruct(model_path: str):
-
-
-    # need to be updated. For now, assume that the Granite 3.0
-    # template works across the board.
-    if "granite" in model_path and "instruct" in model_path:
-        return get_chat_template("granite-3-instruct")
+    if re.search(r"granite.*instruct", model_path, re.IGNORECASE):
+        return "granite-3-instruct"
 
 
 @register_chat_template_matching_function
 def match_gemma3_instruct(model_path: str):
-
-
-    # gemma-3-1b-it is completion model
-    return get_chat_template("gemma-it")
+    if re.search(r"gemma-3", model_path, re.IGNORECASE):
+        return "gemma-it"
 
 
 @register_chat_template_matching_function
 def match_internvl_chat(model_path: str):
-
-
-    return get_chat_template("internvl-2-5")
+    if re.search(r"internvl2_5", model_path, re.IGNORECASE):
+        return "internvl-2-5"
 
 
 if __name__ == "__main__":
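Net effect of the refactor above: matching functions now return the name of a registered template (or None) and get_chat_template_by_model_path resolves it once via get_chat_template, with all path matching done through case-insensitive regexes instead of chains of lowercased substring checks. A minimal sketch of a custom matcher under the new convention (the matcher name and model path are hypothetical; "chatml" is assumed to remain a registered template name):

    import re

    from sglang.lang.chat_template import (
        get_chat_template_by_model_path,
        register_chat_template_matching_function,
    )

    @register_chat_template_matching_function
    def match_my_model(model_path: str):
        # Return a registered template name (or None); resolution to a
        # ChatTemplate object now happens inside get_chat_template_by_model_path.
        if re.search(r"my-model.*instruct", model_path, re.IGNORECASE):
            return "chatml"

    print(get_chat_template_by_model_path("my-org/my-model-7b-instruct").name)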
sglang/lang/tracer.py
CHANGED
@@ -38,7 +38,7 @@ def extract_prefix_by_tracing(program, backend):
         with TracingScope(tracer):
             tracer.ret_value = program.func(tracer, **arguments)
     except (StopTracing, TypeError, AttributeError):
-        # Some exceptions may not be
+        # Some exceptions may not be caught
         pass
 
     # Run and cache prefix
sglang/srt/configs/deepseekvl2.py
CHANGED
@@ -416,9 +416,9 @@ class DeepseekVLV2Processor(ProcessorMixin):
         h = w = math.ceil(
             (self.image_size // self.patch_size) / self.downsample_ratio
         )
-        # global views tokens h * (w + 1), 1 is for line
+        # global views tokens h * (w + 1), 1 is for line separator
         tokenized_image = [self.image_token_id] * h * (w + 1)
-        # add a
+        # add a separator between global and local views
         tokenized_image += [self.image_token_id]
         # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
         tokenized_image += (
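The two comment fixes above make the token layout explicit: each global-view row of w image tokens is followed by one line-separator token, a single separator token sits between the global and local views, and the local views repeat the per-row pattern across tiles. A worked count with illustrative values (not DeepSeek-VL2's actual configuration):

    import math

    # Illustrative values only -- not the model's real configuration.
    image_size, patch_size, downsample_ratio = 384, 16, 2
    num_height_tiles, num_width_tiles = 2, 2

    h = w = math.ceil((image_size // patch_size) / downsample_ratio)   # 12

    global_tokens = h * (w + 1)                                        # 12 * 13 = 156
    view_separator = 1                                                 # between global and local views
    local_tokens = (num_height_tiles * h) * (num_width_tiles * w + 1)  # 24 * 25 = 600
    print(global_tokens + view_separator + local_tokens)               # 757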
sglang/srt/configs/model_config.py
CHANGED
@@ -22,7 +22,11 @@ from typing import List, Optional, Set, Union
 import torch
 from transformers import PretrainedConfig
 
-from sglang.srt.hf_transformers_utils import
+from sglang.srt.hf_transformers_utils import (
+    get_config,
+    get_context_length,
+    get_hf_text_config,
+)
 from sglang.srt.layers.quantization import QUANTIZATION_METHODS
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import get_bool_env_var, is_hip
@@ -69,6 +73,7 @@ class ModelConfig:
             model_override_args=self.model_override_args,
             **kwargs,
         )
+
         self.hf_text_config = get_hf_text_config(self.hf_config)
         self.attention_chunk_size = getattr(
             self.hf_text_config, "attention_chunk_size", None
@@ -93,6 +98,8 @@ class ModelConfig:
         ):
             self.hf_config.architectures[0] = "DeepseekV3ForCausalLMNextN"
 
+        if is_draft_model and self.hf_config.architectures[0] == "MiMoForCausalLM":
+            self.hf_config.architectures[0] = "MiMoMTP"
         # Check model type
         self.is_generation = is_generation_model(
             self.hf_config.architectures, is_embedding
@@ -109,6 +116,10 @@ class ModelConfig:
         self.is_audio_model = enable_multimodal and is_audio_model(
             self.hf_config.architectures
         )
+        self.is_multimodal_chunked_prefill_supported = (
+            enable_multimodal
+            and is_multimodal_chunked_prefill_supported(self.hf_config.architectures)
+        )
         self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
         self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
 
@@ -209,7 +220,13 @@ class ModelConfig:
 
         # Cache attributes
         self.hf_eos_token_id = self.get_hf_eos_token_id()
-
+
+        config = self.hf_config
+
+        # multimodal
+        self.image_token_id = getattr(config, "image_token_id", None) or getattr(
+            config, "image_token_index", None
+        )
 
     @staticmethod
     def from_server_args(server_args: ServerArgs, model_path: str = None, **kwargs):
@@ -332,6 +349,7 @@ class ModelConfig:
             "w8a8_int8",
             "w8a8_fp8",
             "moe_wna16",
+            "qoq",
         ]
         compatible_quantization_methods = {
             "modelopt_fp4": ["modelopt"],
@@ -423,31 +441,6 @@ class ModelConfig:
             self.model_path = client.get_local_dir()
 
 
-def get_hf_text_config(config: PretrainedConfig):
-    """Get the "sub" config relevant to llm for multi modal models.
-    No op for pure text models.
-    """
-    class_name = config.architectures[0]
-    if class_name.startswith("Llava") and class_name.endswith("ForCausalLM"):
-        # We support non-hf version of llava models, so we do not want to
-        # read the wrong values from the unused default text_config.
-        # NOTE(HandH1998): We set `torch_dtype` of config to `torch.float16` for the weights, as
-        # `torch.float16` is default used for image features in `python/sglang/srt/models/llava.py`.
-        setattr(config, "torch_dtype", torch.float16)
-        return config
-
-    if hasattr(config, "text_config"):
-        # The code operates under the assumption that text_config should have
-        # `num_attention_heads` (among others). Assert here to fail early
-        # if transformers config doesn't align with this assumption.
-        assert hasattr(config.text_config, "num_attention_heads")
-        return config.text_config
-    if hasattr(config, "language_config"):
-        return config.language_config
-    else:
-        return config
-
-
 # adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
 _STR_DTYPE_TO_TORCH_DTYPE = {
     "half": torch.float16,
@@ -466,6 +459,8 @@ def _get_and_verify_dtype(
     # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
     # because config.torch_dtype can be None.
     config_dtype = getattr(config, "torch_dtype", None)
+    if isinstance(config_dtype, str):
+        config_dtype = _STR_DTYPE_TO_TORCH_DTYPE.get(config_dtype, None)
     if config_dtype is None:
         config_dtype = torch.float32
 
@@ -537,6 +532,7 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal
 
 
 multimodal_model_archs = [
+    "CLIPModel",
     "DeepseekVL2ForCausalLM",
     "Gemma3ForConditionalGeneration",
     "Grok1VForCausalLM",
@@ -545,14 +541,15 @@ multimodal_model_archs = [
     "Llama4ForConditionalGeneration",
     "LlavaMistralForCausalLM",
     "LlavaQwenForCausalLM",
+    "LlavaForConditionalGeneration",
     "LlavaVidForCausalLM",
     "MiniCPMO",
     "MiniCPMV",
+    "Mistral3ForConditionalGeneration",
     "MultiModalityCausalLM",
     "MllamaForConditionalGeneration",
     "Qwen2VLForConditionalGeneration",
     "Qwen2_5_VLForConditionalGeneration",
-    "CLIPModel",
     "KimiVLForConditionalGeneration",
     "InternVLChatModel",
 ]
@@ -584,6 +581,21 @@ def is_encoder_decoder_model(model_architectures: List[str]):
     return "MllamaForConditionalGeneration" in model_architectures
 
 
+def is_multimodal_chunked_prefill_supported(model_architectures: List[str]):
+    """Check if chunked prefill is supported for a MultiModal model."""
+    unsupported = [
+        "Grok1VForCausalLM",
+        "Grok1AForCausalLM",
+        "LlavaLlamaForCausalLM",
+        "MllamaForConditionalGeneration",
+        "CLIPModel",
+    ]
+    if any(multi_model_arch in unsupported for multi_model_arch in model_architectures):
+        return False
+    else:
+        return True
+
+
 def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
     if scale <= 1:
         return 1.0
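Among the changes above, the _get_and_verify_dtype fix covers configs whose torch_dtype arrives as a string rather than a torch.dtype. A minimal sketch of the normalization path; only the "half" entry of _STR_DTYPE_TO_TORCH_DTYPE is visible in the diff context, so the other entries here are assumptions:

    import torch

    # Stand-in for _STR_DTYPE_TO_TORCH_DTYPE; entries other than "half"
    # are assumed for illustration.
    _STR_DTYPE_TO_TORCH_DTYPE = {
        "half": torch.float16,
        "float16": torch.float16,
        "bfloat16": torch.bfloat16,
        "float32": torch.float32,
    }

    config_dtype = "bfloat16"  # e.g. as read from a checkpoint's config.json
    if isinstance(config_dtype, str):
        config_dtype = _STR_DTYPE_TO_TORCH_DTYPE.get(config_dtype, None)
    if config_dtype is None:
        config_dtype = torch.float32  # same fallback the existing code applies
    print(config_dtype)  # torch.bfloat16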
sglang/srt/constrained/base_grammar_backend.py
CHANGED
@@ -14,10 +14,9 @@
 """The baseclass of a backend for grammar-guided constrained decoding."""
 
 import logging
-from
-from concurrent.futures import Future, ThreadPoolExecutor
+from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
-from threading import Event
+from threading import Event
 from typing import Dict, List, Optional, Tuple
 
 import torch
@@ -27,11 +26,42 @@ from sglang.srt.server_args import ServerArgs
 logger = logging.getLogger(__name__)
 
 
-class BaseGrammarObject
+class BaseGrammarObject:
 
     def __init__(self):
         self._finished = False
 
+    def accept_token(self, token: int) -> None:
+        """
+        Accept a token in the grammar.
+        """
+        raise NotImplementedError()
+
+    def rollback(self, k: int):
+        raise NotImplementedError()
+
+    def is_terminated(self):
+        return False
+
+    def allocate_vocab_mask(
+        self, vocab_size: int, batch_size: int, device
+    ) -> torch.Tensor:
+        raise NotImplementedError()
+
+    def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
+        raise NotImplementedError()
+
+    @staticmethod
+    def move_vocab_mask(vocab_mask: torch.Tensor, device) -> torch.Tensor:
+        raise NotImplementedError()
+
+    @staticmethod
+    def apply_vocab_mask(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
+        raise NotImplementedError()
+
+    def copy(self) -> "BaseGrammarObject":
+        raise NotImplementedError()
+
     @property
     def finished(self):
         return self._finished
@@ -40,7 +70,6 @@ class BaseGrammarObject(ABC):
     def finished(self, finished):
         self._finished = finished
 
-    @abstractmethod
     def try_jump_forward(self, tokenizer) -> Optional[Tuple[List[int], str]]:
         """
         Try to jump forward in the grammar.
@@ -49,9 +78,8 @@ class BaseGrammarObject(ABC):
         A jump forward helper which may be used in `jump_forward_str_state`.
         None if the jump forward is not possible.
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
-    @abstractmethod
     def jump_forward_str_state(self, helper: Tuple[List[int], str]) -> Tuple[str, int]:
         """
         Jump forward for the grammar.
@@ -60,47 +88,15 @@ class BaseGrammarObject(ABC):
         A tuple of the jump forward string and the next state of the grammar
         (which can be used in `jump_and_retokenize` if needed).
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
-    @abstractmethod
     def jump_and_retokenize(
         self, old_output_ids: List[int], new_output_ids: List[int], next_state: int
    ) -> None:
         """
         Jump forward occurs, and update the grammar state if needed.
         """
-        raise NotImplementedError
-
-    @abstractmethod
-    def accept_token(self, token: int) -> None:
-        """
-        Accept a token in the grammar.
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    def allocate_vocab_mask(
-        self, vocab_size: int, batch_size: int, device
-    ) -> torch.Tensor:
-        raise NotImplementedError
-
-    @abstractmethod
-    def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
-        raise NotImplementedError
-
-    @staticmethod
-    @abstractmethod
-    def move_vocab_mask(vocab_mask: torch.Tensor, device) -> torch.Tensor:
-        raise NotImplementedError
-
-    @staticmethod
-    @abstractmethod
-    def apply_vocab_mask(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
-        raise NotImplementedError
-
-    @abstractmethod
-    def copy(self) -> "BaseGrammarObject":
-        raise NotImplementedError
+        raise NotImplementedError()
 
 
 @dataclass
@@ -113,10 +109,9 @@ class BaseGrammarBackend:
     def __init__(self):
         self.executor = ThreadPoolExecutor()
         self.cache: Dict[Tuple[str, str], CacheEntry] = {}
-        self.cache_lock = Lock()
 
     def _not_supported(self, key_type: str, key_string: str) -> None:
-        logger.warning(f"Skip unsupported {key_type}
+        logger.warning(f"Skip unsupported {key_type=}, {key_string=}")
 
     def dispatch_fallback(
         self, key_type: str, key_string: str
@@ -148,40 +143,25 @@ class BaseGrammarBackend:
             return self.dispatch_ebnf(key_string)
         elif key_type == "structural_tag":
             return self.dispatch_structural_tag(key_string)
+        elif key_type == "structural_pattern":
+            return self.dispatch_structural_pattern(key_string)
         else:
             return self.dispatch_fallback(key_type, key_string)
 
-    def
-
-
-
-
-
-
-
-        self.cache[key] = entry
-
-        if cache_hit:
-            entry.event.wait()
-        else:
-            entry.value = self._init_value_dispatch(key)
-            entry.event.set()
-        return entry.value.copy() if entry.value else None
-
-    def get_cached_value(self, key: Tuple[str, str]) -> Optional[BaseGrammarObject]:
-        with self.cache_lock:
-            entry = self.cache.get(key)
-            if not entry or not entry.event.is_set():
-                return None
-            val = self.cache[key].value
-        return val.copy() if val else None
+    def get_cached_or_future_value(
+        self, key: Tuple[str, str]
+    ) -> Optional[BaseGrammarObject]:
+        value = self.cache.get(key)
+        if value:
+            return value.copy(), True
+        value = self.executor.submit(self._init_value_dispatch, key)
+        return value, False
 
-    def
-
+    def set_cache(self, key: Tuple[str, str], value: BaseGrammarObject):
+        self.cache[key] = value
 
     def reset(self):
-
-        self.cache.clear()
+        self.cache.clear()
 
 
 def create_grammar_backend(
@@ -211,9 +191,12 @@ def create_grammar_backend(
        raise ValueError(f"Invalid grammar backend: {server_args.grammar_backend}")
 
     if server_args.reasoning_parser and hasattr(tokenizer, "think_end_id"):
-        from .reasoner_grammar_backend import
+        from sglang.srt.constrained.reasoner_grammar_backend import (
+            ReasonerGrammarBackend,
+        )
 
         grammar_backend = ReasonerGrammarBackend(
             grammar_backend, tokenizer.think_end_id
         )
+
    return grammar_backend
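The cache rework above drops the Lock/Event bookkeeping: get_cached_or_future_value now returns either a copy of a cached grammar with a True hit flag, or a concurrent.futures.Future scheduled on the backend's executor with a False flag, and set_cache publishes a finished grammar. A sketch of a caller-side flow; the scheduler code that consumes this API is not part of this diff, so the caller shown here is an assumption of intended use:

    from typing import Tuple

    from sglang.srt.constrained.base_grammar_backend import BaseGrammarBackend

    def obtain_grammar(backend: BaseGrammarBackend, key: Tuple[str, str]):
        value, cache_hit = backend.get_cached_or_future_value(key)
        if cache_hit:
            return value                     # already a private copy of the cached object
        grammar = value.result()             # block on the Future from the executor
        if grammar is not None:
            backend.set_cache(key, grammar)  # publish for later requests
            grammar = grammar.copy()         # hand out a copy, keep the cached original
        return grammar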