sglang 0.4.6.post1__py3-none-any.whl → 0.4.6.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. sglang/bench_one_batch.py +2 -0
  2. sglang/check_env.py +3 -3
  3. sglang/srt/configs/__init__.py +4 -0
  4. sglang/srt/configs/kimi_vl.py +38 -0
  5. sglang/srt/configs/kimi_vl_moonvit.py +32 -0
  6. sglang/srt/configs/model_config.py +15 -0
  7. sglang/srt/conversation.py +122 -1
  8. sglang/srt/entrypoints/engine.py +44 -22
  9. sglang/srt/function_call_parser.py +97 -0
  10. sglang/srt/hf_transformers_utils.py +2 -0
  11. sglang/srt/layers/attention/cutlass_mla_backend.py +1 -1
  12. sglang/srt/layers/attention/flashinfer_backend.py +107 -82
  13. sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -16
  14. sglang/srt/layers/attention/flashmla_backend.py +3 -0
  15. sglang/srt/layers/dp_attention.py +5 -2
  16. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +1 -3
  17. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  18. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  19. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  20. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  21. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +8 -6
  22. sglang/srt/layers/quantization/__init__.py +2 -2
  23. sglang/srt/layers/quantization/deep_gemm.py +1 -1
  24. sglang/srt/layers/utils.py +35 -0
  25. sglang/srt/lora/layers.py +35 -9
  26. sglang/srt/lora/lora_manager.py +84 -35
  27. sglang/srt/managers/data_parallel_controller.py +52 -34
  28. sglang/srt/managers/multimodal_processors/kimi_vl.py +73 -0
  29. sglang/srt/managers/schedule_batch.py +25 -15
  30. sglang/srt/managers/scheduler.py +263 -59
  31. sglang/srt/managers/scheduler_output_processor_mixin.py +1 -1
  32. sglang/srt/managers/tp_worker.py +51 -16
  33. sglang/srt/managers/tp_worker_overlap_thread.py +9 -3
  34. sglang/srt/mem_cache/memory_pool.py +70 -36
  35. sglang/srt/model_executor/cuda_graph_runner.py +82 -19
  36. sglang/srt/model_executor/forward_batch_info.py +31 -1
  37. sglang/srt/model_executor/model_runner.py +115 -57
  38. sglang/srt/models/deepseek_nextn.py +1 -257
  39. sglang/srt/models/deepseek_v2.py +78 -18
  40. sglang/srt/models/kimi_vl.py +308 -0
  41. sglang/srt/models/kimi_vl_moonvit.py +639 -0
  42. sglang/srt/models/llama.py +92 -30
  43. sglang/srt/models/llama4.py +2 -1
  44. sglang/srt/models/llama_eagle.py +4 -1
  45. sglang/srt/models/llama_eagle3.py +4 -1
  46. sglang/srt/models/qwen2_moe.py +8 -3
  47. sglang/srt/models/qwen2_vl.py +0 -12
  48. sglang/srt/models/qwen3_moe.py +8 -3
  49. sglang/srt/openai_api/adapter.py +34 -22
  50. sglang/srt/openai_api/protocol.py +11 -1
  51. sglang/srt/server_args.py +67 -22
  52. sglang/srt/speculative/eagle_worker.py +3 -2
  53. sglang/srt/utils.py +88 -9
  54. sglang/test/runners.py +4 -0
  55. sglang/test/test_utils.py +29 -0
  56. sglang/version.py +1 -1
  57. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/METADATA +5 -4
  58. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/RECORD +61 -51
  59. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/WHEEL +1 -1
  60. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/licenses/LICENSE +0 -0
  61. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch.py CHANGED
@@ -154,6 +154,8 @@ def load_model(server_args, port_args, tp_rank):
         gpu_id=tp_rank,
         tp_rank=tp_rank,
         tp_size=server_args.tp_size,
+        pp_rank=0,
+        pp_size=1,
         nccl_port=port_args.nccl_port,
         server_args=server_args,
     )
sglang/check_env.py CHANGED
@@ -20,7 +20,7 @@ def is_cuda_v2():
 PACKAGE_LIST = [
     "sglang",
     "sgl_kernel",
-    "flashinfer",
+    "flashinfer_python",
     "triton",
     "transformers",
     "torchao",
@@ -36,8 +36,8 @@ PACKAGE_LIST = [
     "packaging",
     "psutil",
     "pydantic",
-    "multipart",
-    "zmq",
+    "python-multipart",
+    "pyzmq",
     "torchao",
     "uvicorn",
     "uvloop",
sglang/srt/configs/__init__.py CHANGED
@@ -3,6 +3,8 @@ from sglang.srt.configs.dbrx import DbrxConfig
 from sglang.srt.configs.deepseekvl2 import DeepseekVL2Config
 from sglang.srt.configs.exaone import ExaoneConfig
 from sglang.srt.configs.janus_pro import MultiModalityConfig
+from sglang.srt.configs.kimi_vl import KimiVLConfig
+from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
 
 __all__ = [
     "ExaoneConfig",
@@ -10,4 +12,6 @@ __all__ = [
     "DbrxConfig",
     "DeepseekVL2Config",
     "MultiModalityConfig",
+    "KimiVLConfig",
+    "MoonViTConfig",
 ]
sglang/srt/configs/kimi_vl.py CHANGED
@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: Apache-2.0
+# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
+from typing import Optional, Union
+
+from transformers.configuration_utils import PretrainedConfig
+
+from sglang.srt.configs.deepseekvl2 import DeepseekV2Config
+from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
+
+
+class KimiVLConfig(PretrainedConfig):
+    model_type = "kimi_vl"
+
+    def __init__(
+        self,
+        vision_config: Optional[Union[dict, MoonViTConfig]] = None,
+        text_config: Optional[Union[dict, DeepseekV2Config]] = None,
+        ignore_index: int = -100,
+        media_placeholder_token_id: int = 163605,
+        pad_token_id: int = 0,
+        **kwargs
+    ):
+        if vision_config is None:
+            vision_config = MoonViTConfig()
+        elif isinstance(vision_config, dict):
+            vision_config = MoonViTConfig(**vision_config)
+        self.vision_config = vision_config
+
+        if text_config is None:
+            text_config = DeepseekV2Config()
+        elif isinstance(text_config, dict):
+            text_config = DeepseekV2Config(**text_config)
+        self.text_config = text_config
+
+        self.ignore_index = ignore_index
+        self.media_placeholder_token_id = media_placeholder_token_id
+
+        super().__init__(pad_token_id=pad_token_id, **kwargs)
sglang/srt/configs/kimi_vl_moonvit.py CHANGED
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: Apache-2.0
+# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
+from transformers.configuration_utils import PretrainedConfig
+
+
+class MoonViTConfig(PretrainedConfig):
+    model_type = "moonvit"
+
+    def __init__(
+        self,
+        patch_size: int = 14,
+        init_pos_emb_height: int = 64,
+        init_pos_emb_width: int = 64,
+        num_attention_heads: int = 16,
+        num_hidden_layers: int = 27,
+        hidden_size: int = 1152,
+        intermediate_size: int = 4304,
+        merge_kernel_size: tuple[int, int] = (2, 2),
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.patch_size = patch_size
+        # Positional embedding config
+        self.init_pos_emb_height = init_pos_emb_height
+        self.init_pos_emb_width = init_pos_emb_width
+        # Transformer config
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        # Patch merger config
+        self.merge_kernel_size = merge_kernel_size
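For illustration, a minimal sketch (not part of the diff) of constructing the two new config classes; the field values passed below are made up, and the imports rely on the exports added in sglang/srt/configs/__init__.py above.

    from sglang.srt.configs import KimiVLConfig, MoonViTConfig

    # Build the vision tower config explicitly; let the text config fall back to defaults.
    vision = MoonViTConfig(patch_size=14, hidden_size=1152)
    cfg = KimiVLConfig(vision_config=vision, text_config={"hidden_size": 2048})
    print(cfg.model_type, cfg.vision_config.model_type)  # kimi_vl moonvit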
sglang/srt/configs/model_config.py CHANGED
@@ -47,6 +47,7 @@ class ModelConfig:
         dtype: str = "auto",
         quantization: Optional[str] = None,
         override_config_file: Optional[str] = None,
+        is_draft_model: bool = False,
     ) -> None:
 
         self.model_path = model_path
@@ -85,6 +86,12 @@ class ModelConfig:
         else:
             enable_multimodal = True
 
+        if (
+            is_draft_model
+            and self.hf_config.architectures[0] == "DeepseekV3ForCausalLM"
+        ):
+            self.hf_config.architectures[0] = "DeepseekV3ForCausalLMNextN"
+
         # Check model type
         self.is_generation = is_generation_model(
             self.hf_config.architectures, is_embedding
@@ -169,6 +176,13 @@ class ModelConfig:
             self.attention_arch = AttentionArch.MLA
             self.kv_lora_rank = self.hf_text_config.kv_lora_rank
             self.qk_rope_head_dim = self.hf_text_config.qk_rope_head_dim
+        elif "KimiVLForConditionalGeneration" in self.hf_config.architectures:
+            self.head_dim = 256
+            self.attention_arch = AttentionArch.MLA
+            self.kv_lora_rank = self.hf_text_config.kv_lora_rank
+            self.qk_rope_head_dim = self.hf_text_config.qk_rope_head_dim
+            self.v_head_dim = self.hf_text_config.v_head_dim
+            self.qk_nope_head_dim = self.hf_text_config.qk_nope_head_dim
         else:
             self.attention_arch = AttentionArch.MHA
 
@@ -523,6 +537,7 @@ multimodal_model_archs = [
     "Qwen2VLForConditionalGeneration",
     "Qwen2_5_VLForConditionalGeneration",
     "CLIPModel",
+    "KimiVLForConditionalGeneration",
 ]
 
 
sglang/srt/conversation.py CHANGED
@@ -17,7 +17,7 @@
 # https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
 import dataclasses
 from enum import IntEnum, auto
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Callable, Dict, List, Optional, Tuple, Union
 
 from sglang.srt.openai_api.protocol import ChatCompletionRequest
 
@@ -407,6 +407,7 @@ class Conversation:
 
 # A global registry for all conversation templates
 chat_templates: Dict[str, Conversation] = {}
+matching_function_registry: List[Callable] = []
 
 
 def register_conv_template(template: Conversation, override: bool = False):
@@ -419,6 +420,18 @@ def register_conv_template(template: Conversation, override: bool = False):
     chat_templates[template.name] = template
 
 
+def register_conv_template_matching_function(func):
+    matching_function_registry.append(func)
+
+
+def get_conv_template_by_model_path(model_path):
+    for matching_func in matching_function_registry:
+        conv_name = matching_func(model_path)
+        if conv_name is not None:
+            return conv_name
+    return None
+
+
 def chat_template_exists(template_name: str) -> bool:
     return template_name in chat_templates
 
@@ -792,3 +805,111 @@ register_conv_template(
         audio_token="(<audio>./</audio>)",
     )
 )
+
+# Reference: https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/chat_template.jinja
+register_conv_template(
+    Conversation(
+        name="kimi-vl",
+        system_message="You are a helpful assistant",
+        system_template="<|im_system|>system<|im_middle|>{system_message}",
+        roles=(
+            "<|im_user|>user<|im_middle|>",
+            "<|im_assistant|>assistant<|im_middle|>",
+        ),
+        messages=[],
+        sep="<|im_end|>",
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        stop_str="<|im_end|>",
+        image_token="<|media_start|>image<|media_content|><|media_pad|><|media_end|>",
+    )
+)
+
+
+@register_conv_template_matching_function
+def match_deepseek_janus_pro(model_path: str):
+    if (
+        "llama" in model_path.lower()
+        and "3.2" in model_path.lower()
+        and "vision" in model_path.lower()
+    ):
+        return "llama_3_vision"
+
+
+@register_conv_template_matching_function
+def match_deepseek_janus_pro(model_path: str):
+    if "janus" in model_path.lower():
+        return "janus-pro"
+
+
+@register_conv_template_matching_function
+def match_vicuna(model_path: str):
+    if "vicuna" in model_path.lower():
+        return "vicuna_v1.1"
+    if "llava-v1.5" in model_path.lower():
+        return "vicuna_v1.1"
+    if "llava-next-video-7b" in model_path.lower():
+        return "vicuna_v1.1"
+
+
+@register_conv_template_matching_function
+def match_llama2_chat(model_path: str):
+    model_path = model_path.lower()
+    if "llama-2" in model_path and "chat" in model_path:
+        return "llama-2"
+    if (
+        "mistral" in model_path or "mixtral" in model_path
+    ) and "instruct" in model_path:
+        return "llama-2"
+    if "codellama" in model_path and "instruct" in model_path:
+        return "llama-2"
+
+
+@register_conv_template_matching_function
+def match_deepseek_vl(model_path: str):
+    model_path = model_path.lower()
+    if "deepseek" in model_path and "vl2" in model_path:
+        return "deepseek-vl2"
+
+
+@register_conv_template_matching_function
+def match_chat_ml(model_path: str):
+    # import pdb;pdb.set_trace()
+    model_path = model_path.lower()
+    # Now the suffix for qwen2 chat model is "instruct"
+    if "gme" in model_path and "qwen" in model_path and "vl" in model_path:
+        return "gme-qwen2-vl"
+    if "qwen" in model_path and "vl" in model_path:
+        return "qwen2-vl"
+    if (
+        "llava-v1.6-34b" in model_path
+        or "llava-v1.6-yi-34b" in model_path
+        or "llava-next-video-34b" in model_path
+        or "llava-onevision-qwen2" in model_path
+    ):
+        return "chatml-llava"
+
+
+@register_conv_template_matching_function
+def match_gemma_it(model_path: str):
+    model_path = model_path.lower()
+    if "gemma" in model_path and "it" in model_path:
+        return "gemma-it"
+    if "gemma-3" in model_path and "1b" not in model_path:
+        # gemma-3-1b-it is completion model
+        return "gemma-it"
+
+
+@register_conv_template_matching_function
+def match_openbmb_minicpm(model_path: str):
+    model_path = model_path.lower()
+    if "minicpm-v" in model_path:
+        return "minicpmv"
+    elif "minicpm-o" in model_path:
+        return "minicpmo"
+
+
+@register_conv_template_matching_function
+def match_moonshot_kimivl(model_path: str):
+    model_path = model_path.lower()
+    if "kimi" in model_path and "vl" in model_path:
+        return "kimi-vl"
sglang/srt/entrypoints/engine.py CHANGED
@@ -58,7 +58,10 @@ from sglang.srt.managers.io_struct import (
 )
 from sglang.srt.managers.scheduler import run_scheduler_process
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
-from sglang.srt.openai_api.adapter import load_chat_template_for_openai_api
+from sglang.srt.openai_api.adapter import (
+    guess_chat_template_name_from_model_path,
+    load_chat_template_for_openai_api,
+)
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
 from sglang.srt.utils import (
@@ -123,7 +126,6 @@ class Engine(EngineBase):
             server_args=server_args,
             port_args=port_args,
         )
-
         self.server_args = server_args
         self.tokenizer_manager = tokenizer_manager
         self.scheduler_info = scheduler_info
@@ -298,7 +300,6 @@ class Engine(EngineBase):
         internal_states = loop.run_until_complete(
             self.tokenizer_manager.get_internal_state()
         )
-
         return {
             **dataclasses.asdict(self.tokenizer_manager.server_args),
             **self.scheduler_info,
@@ -450,7 +451,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.2.3",
+            "0.2.5",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
@@ -458,7 +459,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda:
         assert_pkg_version(
             "sgl-kernel",
-            "0.1.0",
+            "0.1.1",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )
 
@@ -517,25 +518,44 @@ def _launch_subprocesses(
         )
 
         scheduler_pipe_readers = []
-        tp_size_per_node = server_args.tp_size // server_args.nnodes
+
+        nnodes_per_tp_group = max(server_args.nnodes // server_args.pp_size, 1)
+        tp_size_per_node = server_args.tp_size // nnodes_per_tp_group
         tp_rank_range = range(
-            tp_size_per_node * server_args.node_rank,
-            tp_size_per_node * (server_args.node_rank + 1),
+            tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group),
+            tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group + 1),
         )
-        for tp_rank in tp_rank_range:
-            reader, writer = mp.Pipe(duplex=False)
-            gpu_id = (
-                server_args.base_gpu_id
-                + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
-            )
-            proc = mp.Process(
-                target=run_scheduler_process,
-                args=(server_args, port_args, gpu_id, tp_rank, None, writer),
-            )
-            with memory_saver_adapter.configure_subprocess():
-                proc.start()
-            scheduler_procs.append(proc)
-            scheduler_pipe_readers.append(reader)
+
+        pp_size_per_node = max(server_args.pp_size // server_args.nnodes, 1)
+        pp_rank_range = range(
+            pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group),
+            pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group + 1),
+        )
+
+        for pp_rank in pp_rank_range:
+            for tp_rank in tp_rank_range:
+                reader, writer = mp.Pipe(duplex=False)
+                gpu_id = (
+                    server_args.base_gpu_id
+                    + ((pp_rank % pp_size_per_node) * tp_size_per_node)
+                    + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
+                )
+                proc = mp.Process(
+                    target=run_scheduler_process,
+                    args=(
+                        server_args,
+                        port_args,
+                        gpu_id,
+                        tp_rank,
+                        pp_rank,
+                        None,
+                        writer,
+                    ),
+                )
+                with memory_saver_adapter.configure_subprocess():
+                    proc.start()
                scheduler_procs.append(proc)
                scheduler_pipe_readers.append(reader)
     else:
         # Launch the data parallel controller
         reader, writer = mp.Pipe(duplex=False)
@@ -584,6 +604,8 @@ def _launch_subprocesses(
         load_chat_template_for_openai_api(
             tokenizer_manager, server_args.chat_template, server_args.model_path
         )
+    else:
+        guess_chat_template_name_from_model_path(server_args.model_path)
 
     if server_args.completion_template:
         load_completion_template_for_openai_api(server_args.completion_template)
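To make the new rank layout in _launch_subprocesses concrete, a minimal standalone sketch (not part of the diff); the nnodes/tp_size/pp_size values are invented for illustration.

    # Hypothetical launch: 2 nodes, tp_size=8, pp_size=2, one full TP group per node.
    nnodes, node_rank = 2, 1
    tp_size, pp_size = 8, 2
    base_gpu_id, gpu_id_step = 0, 1

    nnodes_per_tp_group = max(nnodes // pp_size, 1)      # 1 node per TP group
    tp_size_per_node = tp_size // nnodes_per_tp_group    # 8
    tp_rank_range = range(
        tp_size_per_node * (node_rank % nnodes_per_tp_group),
        tp_size_per_node * (node_rank % nnodes_per_tp_group + 1),
    )                                                     # tp_ranks 0..7 on every node
    pp_size_per_node = max(pp_size // nnodes, 1)          # 1
    pp_rank_range = range(
        pp_size_per_node * (node_rank // nnodes_per_tp_group),
        pp_size_per_node * (node_rank // nnodes_per_tp_group + 1),
    )                                                     # node 1 owns pp_rank 1

    for pp_rank in pp_rank_range:
        for tp_rank in tp_rank_range:
            gpu_id = (
                base_gpu_id
                + (pp_rank % pp_size_per_node) * tp_size_per_node
                + (tp_rank % tp_size_per_node) * gpu_id_step
            )
            print(pp_rank, tp_rank, gpu_id)  # (1, 0, 0) ... (1, 7, 7): local GPUs 0-7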
sglang/srt/function_call_parser.py CHANGED
@@ -1,3 +1,4 @@
+import ast
 import json
 import logging
 import re
@@ -664,6 +665,101 @@ class MultiFormatParser:
         return final_normal_text, final_calls
 
 
+class PythonicDetector(BaseFormatDetector):
+    """
+    Detector for Llama-3.2 and Llama-4 models with pythonic tool call format.
+    Assumes function call format:
+        [tool1(arg1=val1, arg2=val2), tool2(arg1=val3)]
+    Arguments are Python literals (not JSON).
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.tool_call_regex = re.compile(
+            r"\[([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s)?\),\s*)*([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s*)?\)\s*)+\]",
+            re.DOTALL,
+        )
+
+    def has_tool_call(self, text: str) -> bool:
+        return bool(self.tool_call_regex.match(text.strip()))
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        # Try parsing the text as a Python list of function calls
+        text = text.strip()
+        if not (text.startswith("[") and text.endswith("]")):
+            # Not a pythonic tool call format
+            return StreamingParseResult(normal_text=text, calls=[])
+        try:
+            module = ast.parse(text)
+            parsed = getattr(module.body[0], "value", None)
+            if not (
+                isinstance(parsed, ast.List)
+                and all(isinstance(e, ast.Call) for e in parsed.elts)
+            ):
+                return StreamingParseResult(normal_text=text, calls=[])
+            calls = []
+            tool_indices = {
+                tool.function.name: i
+                for i, tool in enumerate(tools)
+                if tool.function.name
+            }
+            for call in parsed.elts:
+                if not isinstance(call.func, ast.Name):
+                    continue
+                function_name = call.func.id
+                arguments = {}
+                for keyword in call.keywords:
+                    arguments[keyword.arg] = self._get_parameter_value(keyword.value)
+                calls.append(
+                    ToolCallItem(
+                        tool_index=tool_indices.get(function_name, -1),
+                        name=function_name,
+                        parameters=json.dumps(arguments, ensure_ascii=False),
+                    )
+                )
+            return StreamingParseResult(normal_text="", calls=calls)
+        except Exception:
+            logger.exception("Error in pythonic tool call parsing.")
+            return StreamingParseResult(normal_text=text, calls=[])
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing for pythonic tool calls.
+        Buffers input until a complete pythonic tool call (from [ to ]) is found,
+        then parses and emits any detected calls.
+        """
+        self._buffer += new_text
+        start = self._buffer.find("[")
+        end = self._buffer.find("]", start)
+        if start != -1 and end != -1:
+            call_text = self._buffer[start : end + 1]
+            result = self.detect_and_parse(call_text, tools)
+            self._buffer = self._buffer[end + 1 :]
+            return result
+        return StreamingParseResult(normal_text="")
+
+    def _get_parameter_value(self, val):
+        if isinstance(val, ast.Constant):
+            return val.value
+        elif isinstance(val, ast.Dict):
+            return {
+                k.value: self._get_parameter_value(v)
+                for k, v in zip(val.keys, val.values)
+            }
+        elif isinstance(val, ast.List):
+            return [self._get_parameter_value(v) for v in val.elts]
+        else:
+            raise ValueError("Tool call arguments must be literals")
+
+    def structure_info(self) -> _GetInfoFunc:
+        def info(name: str):
+            return StructureInfo(begin="[", end="]", trigger="")
+
+        return info
+
+
 class FunctionCallParser:
     """
     In streaming scenarios, each time new_text is received, it calls multi_format_parser.parse_streaming_increment
@@ -675,6 +771,7 @@ class FunctionCallParser:
         "qwen25": Qwen25Detector,
         "mistral": MistralDetector,
        "deepseekv3": DeepSeekV3Detector,
+        "pythonic": PythonicDetector,
     }
 
     def __init__(self, tools: List[Tool], tool_call_parser: str):
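A minimal sketch (not part of the diff) of what the new "pythonic" detector accepts; the tool-call string is made up, and an empty tool list is passed, so the reported tool_index is -1.

    from sglang.srt.function_call_parser import PythonicDetector

    det = PythonicDetector()
    text = '[get_weather(city="Paris", unit="celsius")]'
    print(det.has_tool_call(text))                     # True
    res = det.detect_and_parse(text, tools=[])
    print(res.calls[0].name, res.calls[0].parameters)  # get_weather {"city": "Paris", "unit": "celsius"}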
sglang/srt/hf_transformers_utils.py CHANGED
@@ -35,6 +35,7 @@ from sglang.srt.configs import (
     DbrxConfig,
     DeepseekVL2Config,
     ExaoneConfig,
+    KimiVLConfig,
     MultiModalityConfig,
 )
 from sglang.srt.connector import create_remote_connector
@@ -46,6 +47,7 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
     ExaoneConfig.model_type: ExaoneConfig,
     DeepseekVL2Config.model_type: DeepseekVL2Config,
     MultiModalityConfig.model_type: MultiModalityConfig,
+    KimiVLConfig.model_type: KimiVLConfig,
 }
 
 for name, cls in _CONFIG_REGISTRY.items():
sglang/srt/layers/attention/cutlass_mla_backend.py CHANGED
@@ -268,7 +268,7 @@ class CutlassMLABackend(FlashInferMLAAttnBackend):
         reshape_q = q.view(-1, layer.tp_q_head_num, layer.head_dim)
 
         o = cutlass_mla_decode(
-            q_nope_and_q_pe=reshape_q,
+            q_nope_and_q_pe=reshape_q.to(self.q_data_type),
             kv_c_and_k_pe_cache=k_cache.view(-1, PAGE_SIZE, self.kv_cache_dim),
             seq_lens=forward_batch.seq_lens.to(torch.int32),
             page_table=self.forward_metadata.block_kv_indices,