PyPI - sglang - Versions diffs - 0.4.6__tar.gz → 0.4.6.post2__tar.gz - Mend

sglang 0.4.6__tar.gz → 0.4.6.post2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (641) hide show

{sglang-0.4.6/sglang.egg-info → sglang-0.4.6.post2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.6
+Version: 0.4.6.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License:                                  Apache License
                                    Version 2.0, January 2004
@@ -238,15 +238,16 @@ Requires-Dist: pynvml; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
-Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
+Requires-Dist: torchao>=0.9.0; extra == "runtime-common"
 Requires-Dist: transformers==4.51.1; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
+Requires-Dist: blobfile==3.0.0; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.0.9.post2; extra == "srt"
-Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
+Requires-Dist: sgl-kernel==0.1.1; extra == "srt"
+Requires-Dist: flashinfer_python==0.2.5; extra == "srt"
 Requires-Dist: torch==2.6.0; extra == "srt"
 Requires-Dist: torchvision==0.21.0; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"

{sglang-0.4.6 → sglang-0.4.6.post2}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "sglang"
-version = "0.4.6"
+version = "0.4.6.post2"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -37,17 +37,18 @@ runtime_common = [
     "python-multipart",
     "pyzmq>=25.1.2",
     "soundfile==0.13.1",
-    "torchao>=0.7.0",
+    "torchao>=0.9.0",
     "transformers==4.51.1",
     "uvicorn",
     "uvloop",
     "xgrammar==0.1.17",
+    "blobfile==3.0.0"
 ]
 srt = [
     "sglang[runtime_common]",
-    "sgl-kernel==0.0.9.post2",
-    "flashinfer_python==0.2.3",
+    "sgl-kernel==0.1.1",
+    "flashinfer_python==0.2.5",
     "torch==2.6.0",
     "torchvision==0.21.0",
     "cuda-python",

{sglang-0.4.6 → sglang-0.4.6.post2}/sglang/bench_one_batch.py RENAMED Viewed

@@ -154,6 +154,8 @@ def load_model(server_args, port_args, tp_rank):
         gpu_id=tp_rank,
         tp_rank=tp_rank,
         tp_size=server_args.tp_size,
+        pp_rank=0,
+        pp_size=1,
         nccl_port=port_args.nccl_port,
         server_args=server_args,
     )

{sglang-0.4.6 → sglang-0.4.6.post2}/sglang/check_env.py RENAMED Viewed

@@ -20,7 +20,7 @@ def is_cuda_v2():
 PACKAGE_LIST = [
     "sglang",
     "sgl_kernel",
-    "flashinfer",
+    "flashinfer_python",
     "triton",
     "transformers",
     "torchao",
@@ -36,8 +36,8 @@ PACKAGE_LIST = [
     "packaging",
     "psutil",
     "pydantic",
-    "multipart",
-    "zmq",
+    "python-multipart",
+    "pyzmq",
     "torchao",
     "uvicorn",
     "uvloop",

{sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/configs/__init__.py RENAMED Viewed

@@ -3,6 +3,8 @@ from sglang.srt.configs.dbrx import DbrxConfig
 from sglang.srt.configs.deepseekvl2 import DeepseekVL2Config
 from sglang.srt.configs.exaone import ExaoneConfig
 from sglang.srt.configs.janus_pro import MultiModalityConfig
+from sglang.srt.configs.kimi_vl import KimiVLConfig
+from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
 __all__ = [
     "ExaoneConfig",
@@ -10,4 +12,6 @@ __all__ = [
     "DbrxConfig",
     "DeepseekVL2Config",
     "MultiModalityConfig",
+    "KimiVLConfig",
+    "MoonViTConfig",
 ]

sglang-0.4.6.post2/sglang/srt/configs/kimi_vl.py ADDED Viewed

@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: Apache-2.0
+# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
+from typing import Optional, Union
+from transformers.configuration_utils import PretrainedConfig
+from sglang.srt.configs.deepseekvl2 import DeepseekV2Config
+from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
+class KimiVLConfig(PretrainedConfig):
+    """Composite configuration holding a vision and a text sub-config.
+
+    Args:
+        vision_config: A ``MoonViTConfig``, a dict of keyword arguments used
+            to build one, or ``None`` to use ``MoonViTConfig()`` defaults.
+        text_config: A ``DeepseekV2Config``, a dict of keyword arguments used
+            to build one, or ``None`` to use ``DeepseekV2Config()`` defaults.
+        ignore_index: Stored as ``self.ignore_index``.
+        media_placeholder_token_id: Stored as
+            ``self.media_placeholder_token_id``.
+        pad_token_id: Forwarded to ``PretrainedConfig.__init__``.
+        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
+    """
+    model_type = "kimi_vl"
+    def __init__(
+        self,
+        vision_config: Optional[Union[dict, MoonViTConfig]] = None,
+        text_config: Optional[Union[dict, DeepseekV2Config]] = None,
+        ignore_index: int = -100,
+        media_placeholder_token_id: int = 163605,
+        pad_token_id: int = 0,
+        **kwargs
+    ):
+        # Each sub-config may arrive as a ready object, a kwargs dict, or None.
+        if isinstance(vision_config, dict):
+            vision_config = MoonViTConfig(**vision_config)
+        elif vision_config is None:
+            vision_config = MoonViTConfig()
+        self.vision_config = vision_config
+        if isinstance(text_config, dict):
+            text_config = DeepseekV2Config(**text_config)
+        elif text_config is None:
+            text_config = DeepseekV2Config()
+        self.text_config = text_config
+        self.media_placeholder_token_id = media_placeholder_token_id
+        self.ignore_index = ignore_index
+        # The base initializer runs last and receives the remaining kwargs.
+        super().__init__(pad_token_id=pad_token_id, **kwargs)

sglang-0.4.6.post2/sglang/srt/configs/kimi_vl_moonvit.py ADDED Viewed

@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: Apache-2.0
+# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
+from typing import Tuple
+from transformers.configuration_utils import PretrainedConfig
+class MoonViTConfig(PretrainedConfig):
+    """Configuration for the MoonViT model.
+
+    Args:
+        patch_size: Stored as ``self.patch_size``.
+        init_pos_emb_height: Positional-embedding setting (height).
+        init_pos_emb_width: Positional-embedding setting (width).
+        num_attention_heads: Transformer setting.
+        num_hidden_layers: Transformer setting.
+        hidden_size: Transformer setting.
+        intermediate_size: Transformer setting.
+        merge_kernel_size: Patch-merger setting, a pair of ints.
+        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
+    """
+    model_type = "moonvit"
+    def __init__(
+        self,
+        patch_size: int = 14,
+        init_pos_emb_height: int = 64,
+        init_pos_emb_width: int = 64,
+        num_attention_heads: int = 16,
+        num_hidden_layers: int = 27,
+        hidden_size: int = 1152,
+        intermediate_size: int = 4304,
+        # typing.Tuple instead of tuple[...]: the annotation is evaluated when
+        # the function is defined, and subscripting the builtin fails before
+        # Python 3.9 while the package declares requires-python >=3.8.
+        merge_kernel_size: Tuple[int, int] = (2, 2),
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.patch_size = patch_size
+        # Positional embedding config
+        self.init_pos_emb_height = init_pos_emb_height
+        self.init_pos_emb_width = init_pos_emb_width
+        # Transformer config
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        # Patch merger config
+        self.merge_kernel_size = merge_kernel_size

{sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/configs/model_config.py RENAMED Viewed

@@ -47,6 +47,7 @@ class ModelConfig:
         dtype: str = "auto",
         quantization: Optional[str] = None,
         override_config_file: Optional[str] = None,
+        is_draft_model: bool = False,
     ) -> None:
         self.model_path = model_path
@@ -85,6 +86,12 @@ class ModelConfig:
             else:
                 enable_multimodal = True
+        if (
+            is_draft_model
+            and self.hf_config.architectures[0] == "DeepseekV3ForCausalLM"
+        ):
+            self.hf_config.architectures[0] = "DeepseekV3ForCausalLMNextN"
         # Check model type
         self.is_generation = is_generation_model(
             self.hf_config.architectures, is_embedding
@@ -169,6 +176,13 @@ class ModelConfig:
             self.attention_arch = AttentionArch.MLA
             self.kv_lora_rank = self.hf_text_config.kv_lora_rank
             self.qk_rope_head_dim = self.hf_text_config.qk_rope_head_dim
+        elif "KimiVLForConditionalGeneration" in self.hf_config.architectures:
+            self.head_dim = 256
+            self.attention_arch = AttentionArch.MLA
+            self.kv_lora_rank = self.hf_text_config.kv_lora_rank
+            self.qk_rope_head_dim = self.hf_text_config.qk_rope_head_dim
+            self.v_head_dim = self.hf_text_config.v_head_dim
+            self.qk_nope_head_dim = self.hf_text_config.qk_nope_head_dim
         else:
             self.attention_arch = AttentionArch.MHA
@@ -523,6 +537,7 @@ multimodal_model_archs = [
     "Qwen2VLForConditionalGeneration",
     "Qwen2_5_VLForConditionalGeneration",
     "CLIPModel",
+    "KimiVLForConditionalGeneration",
 ]

{sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/conversation.py RENAMED Viewed

@@ -17,7 +17,7 @@
 # https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
 import dataclasses
 from enum import IntEnum, auto
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Callable, Dict, List, Optional, Tuple, Union
 from sglang.srt.openai_api.protocol import ChatCompletionRequest
@@ -407,6 +407,7 @@ class Conversation:
 # A global registry for all conversation templates
 chat_templates: Dict[str, Conversation] = {}
+matching_function_registry: List[Callable] = []
 def register_conv_template(template: Conversation, override: bool = False):
@@ -419,6 +420,18 @@ def register_conv_template(template: Conversation, override: bool = False):
     chat_templates[template.name] = template
+def register_conv_template_matching_function(func):
+    """Append ``func`` to ``matching_function_registry`` and return it.
+
+    Registered functions are called with a model path by
+    ``get_conv_template_by_model_path`` and should return a conversation
+    template name or ``None``.
+
+    Args:
+        func: The matching function to register.
+
+    Returns:
+        ``func`` unchanged, so this can be used as a decorator without
+        rebinding the decorated name to ``None``.
+    """
+    matching_function_registry.append(func)
+    return func
+def get_conv_template_by_model_path(model_path):
+    """Return the first template name a registered matcher yields, or None.
+
+    Matchers are tried in registration order; evaluation stops at the first
+    one whose result is not ``None``.
+    """
+    # Lazy generator: later matchers are not called once one has matched.
+    matched_names = (
+        matcher(model_path) for matcher in matching_function_registry
+    )
+    return next((name for name in matched_names if name is not None), None)
 def chat_template_exists(template_name: str) -> bool:
     return template_name in chat_templates
@@ -792,3 +805,111 @@ register_conv_template(
         audio_token="(<audio>./</audio>)",
     )
 )
+# Conversation template registered under the name "kimi-vl".
+# Reference: https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/chat_template.jinja
+register_conv_template(
+    Conversation(
+        name="kimi-vl",
+        system_message="You are a helpful assistant",
+        system_template="<|im_system|>system<|im_middle|>{system_message}",
+        # Role prefixes in (user, assistant) order.
+        roles=(
+            "<|im_user|>user<|im_middle|>",
+            "<|im_assistant|>assistant<|im_middle|>",
+        ),
+        messages=[],
+        # The same token is used as separator and as stop string.
+        sep="<|im_end|>",
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        stop_str="<|im_end|>",
+        image_token="<|media_start|>image<|media_content|><|media_pad|><|media_end|>",
+    )
+)
+@register_conv_template_matching_function
+def match_llama_3_vision(model_path: str):
+    """Return "llama_3_vision" for Llama 3.2 vision model paths, else None.
+
+    Renamed from ``match_deepseek_janus_pro``: that name was a copy/paste
+    slip and is redefined by the Janus-Pro matcher that follows.
+    """
+    model_path = model_path.lower()
+    if all(token in model_path for token in ("llama", "3.2", "vision")):
+        return "llama_3_vision"
+    return None
+@register_conv_template_matching_function
+def match_deepseek_janus_pro(model_path: str):
+    """Return "janus-pro" when the lower-cased path contains "janus"."""
+    lowered = model_path.lower()
+    return "janus-pro" if "janus" in lowered else None
+@register_conv_template_matching_function
+def match_vicuna(model_path: str):
+    """Return "vicuna_v1.1" for paths containing any known Vicuna-style marker."""
+    lowered = model_path.lower()
+    vicuna_markers = ("vicuna", "llava-v1.5", "llava-next-video-7b")
+    if any(marker in lowered for marker in vicuna_markers):
+        return "vicuna_v1.1"
+    return None
+@register_conv_template_matching_function
+def match_llama2_chat(model_path: str):
+    """Return "llama-2" for Llama-2 chat and selected instruct model paths.
+
+    Matches "llama-2" together with "chat", or "instruct" together with any
+    of "mistral", "mixtral" or "codellama" (case-insensitive).
+    """
+    lowered = model_path.lower()
+    if "llama-2" in lowered and "chat" in lowered:
+        return "llama-2"
+    instruct_families = ("mistral", "mixtral", "codellama")
+    if "instruct" in lowered and any(
+        family in lowered for family in instruct_families
+    ):
+        return "llama-2"
+    return None
+@register_conv_template_matching_function
+def match_deepseek_vl(model_path: str):
+    """Return "deepseek-vl2" when the path contains "deepseek" and "vl2"."""
+    lowered = model_path.lower()
+    is_deepseek_vl2 = "deepseek" in lowered and "vl2" in lowered
+    return "deepseek-vl2" if is_deepseek_vl2 else None
+@register_conv_template_matching_function
+def match_chat_ml(model_path: str):
+    """Return the ChatML-family template name for Qwen-VL and LLaVA paths.
+
+    Paths containing "qwen" and "vl" map to "gme-qwen2-vl" when they also
+    contain "gme", otherwise to "qwen2-vl". Selected LLaVA paths map to
+    "chatml-llava". Anything else yields None.
+    """
+    lowered = model_path.lower()
+    if "qwen" in lowered and "vl" in lowered:
+        # The GME variant is checked first because it is the more specific one.
+        return "gme-qwen2-vl" if "gme" in lowered else "qwen2-vl"
+    chatml_llava_markers = (
+        "llava-v1.6-34b",
+        "llava-v1.6-yi-34b",
+        "llava-next-video-34b",
+        "llava-onevision-qwen2",
+    )
+    if any(marker in lowered for marker in chatml_llava_markers):
+        return "chatml-llava"
+    return None
+@register_conv_template_matching_function
+def match_gemma_it(model_path: str):
+    """Return "gemma-it" for Gemma paths that look instruction-tuned."""
+    lowered = model_path.lower()
+    if "gemma" not in lowered:
+        return None
+    # NOTE(review): this is a plain substring test, so any Gemma path that
+    # contains "it" anywhere matches -- including "gemma-3-1b-it", which the
+    # branch below appears intended to exclude. Confirm the intent.
+    if "it" in lowered:
+        return "gemma-it"
+    # gemma-3-1b-it is completion model
+    if "gemma-3" in lowered and "1b" not in lowered:
+        return "gemma-it"
+    return None
+@register_conv_template_matching_function
+def match_openbmb_minicpm(model_path: str):
+    """Return "minicpmv" or "minicpmo" for MiniCPM-V / MiniCPM-o paths."""
+    lowered = model_path.lower()
+    # Order matters: "minicpm-v" takes precedence when both markers occur.
+    marker_to_template = (("minicpm-v", "minicpmv"), ("minicpm-o", "minicpmo"))
+    for marker, template_name in marker_to_template:
+        if marker in lowered:
+            return template_name
+    return None
+@register_conv_template_matching_function
+def match_moonshot_kimivl(model_path: str):
+    """Return "kimi-vl" when the path contains both "kimi" and "vl"."""
+    lowered = model_path.lower()
+    if all(token in lowered for token in ("kimi", "vl")):
+        return "kimi-vl"
+    return None

{sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/disaggregation/decode.py RENAMED Viewed

@@ -32,6 +32,7 @@ from torch.distributed import ProcessGroup
 from sglang.srt.disaggregation.base import BaseKVManager, BaseKVReceiver, KVArgs, KVPoll
 from sglang.srt.disaggregation.utils import (
     DisaggregationMode,
+    FakeBootstrapHost,
     KVClassType,
     ReqToMetadataIdxAllocator,
     TransferBackend,
@@ -133,8 +134,13 @@ class DecodePreallocQueue:
     def add(self, req: Req) -> None:
         """Add a request to the pending queue."""
-        kv_receiver_class = get_kv_class(self.transfer_backend, KVClassType.RECEIVER)
+        if req.bootstrap_host == FakeBootstrapHost:
+            # Fake transfer for warmup reqs
+            kv_receiver_class = get_kv_class(TransferBackend.FAKE, KVClassType.RECEIVER)
+        else:
+            kv_receiver_class = get_kv_class(
+                self.transfer_backend, KVClassType.RECEIVER
+            )
         kv_receiver = kv_receiver_class(
             mgr=self.kv_manager,
             bootstrap_addr=f"{req.bootstrap_host}:{req.bootstrap_port}",

sglang-0.4.6.post2/sglang/srt/disaggregation/fake/__init__.py ADDED Viewed

@@ -0,0 +1 @@
+from .conn import FakeKVReceiver, FakeKVSender

sglang-0.4.6.post2/sglang/srt/disaggregation/fake/conn.py ADDED Viewed

@@ -0,0 +1,88 @@
+import logging
+from typing import Dict, List, Optional, Tuple, Union
+import numpy as np
+import numpy.typing as npt
+from sglang.srt.disaggregation.base.conn import (
+    BaseKVManager,
+    BaseKVReceiver,
+    BaseKVSender,
+    KVArgs,
+    KVPoll,
+)
+logger = logging.getLogger(__name__)
+# Warmup requests do not transfer any KV cache, so they use the fake sender
+# and receiver below.
+class FakeKVSender(BaseKVSender):
+    """Sender that performs no KV transfer and only tracks a "sent" flag.
+
+    ``poll`` reports ``KVPoll.WaitingForInput`` until ``send`` has been called
+    with ``is_last=True``, and ``KVPoll.Success`` afterwards.
+    """
+    def __init__(self, mgr: BaseKVManager, bootstrap_addr: str, bootstrap_room: int):
+        # The constructor arguments are accepted but not used.
+        self.has_sent = False
+    def poll(self) -> KVPoll:
+        """Return the simulated transfer state."""
+        if not self.has_sent:
+            # Assume handshake completed instantly
+            return KVPoll.WaitingForInput
+        # Assume transfer completed instantly
+        logger.info("FakeKVSender poll success")
+        return KVPoll.Success
+    def init(
+        self,
+        # typing.List rather than list[...]: the annotation is evaluated at
+        # definition time and builtin generics fail before Python 3.9.
+        kv_indices: List[int],
+        aux_index: Optional[int] = None,
+        dest_ranks: Optional[List[int]] = None,
+    ):
+        """Log the arguments; no state is changed."""
+        logger.info(
+            f"FakeKVSender init with kv_indices: {kv_indices}, aux_index: {aux_index}, dest_ranks: {dest_ranks}"
+        )
+    def send(
+        self,
+        kv_indices: npt.NDArray[np.int64],
+        index_slice: slice,
+        is_last: bool,
+    ):
+        """Log the call and mark the transfer done when ``is_last`` is true."""
+        logger.info(
+            f"FakeKVSender send with kv_indices: {kv_indices}, index_slice: {index_slice}, is_last: {is_last}"
+        )
+        if is_last:
+            self.has_sent = True
+            logger.info("FakeKVSender send success")
+        else:
+            self.has_sent = False
+            logger.info("FakeKVSender send fake transfering")
+    def failure_exception(self):
+        """Always raise a generic ``Exception``."""
+        raise Exception("Fake KVSender Exception")
+class FakeKVReceiver(BaseKVReceiver):
+    """Receiver that performs no KV transfer and only tracks an "init" flag.
+
+    ``poll`` reports ``KVPoll.WaitingForInput`` until ``init`` has been called,
+    and ``KVPoll.Success`` afterwards.
+    """
+    def __init__(
+        self,
+        mgr: BaseKVManager,
+        bootstrap_addr: str,
+        bootstrap_room: Optional[int] = None,
+    ):
+        # The constructor arguments are accepted but not used.
+        self.has_init = False
+    def poll(self) -> KVPoll:
+        """Return the simulated transfer state."""
+        if not self.has_init:
+            # Assume handshake completed instantly
+            return KVPoll.WaitingForInput
+        # Assume transfer completed instantly
+        logger.info("FakeKVReceiver poll success")
+        return KVPoll.Success
+    # typing.List rather than list[...]: the annotation is evaluated at
+    # definition time and builtin generics fail before Python 3.9.
+    def init(self, kv_indices: List[int], aux_index: Optional[int] = None):
+        """Mark the receiver as initialized and log the arguments."""
+        self.has_init = True
+        logger.info(
+            f"FakeKVReceiver init with kv_indices: {kv_indices}, aux_index: {aux_index}"
+        )
+    def failure_exception(self):
+        """Always raise a generic ``Exception``."""
+        raise Exception("Fake KVReceiver Exception")

{sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/disaggregation/prefill.py RENAMED Viewed

@@ -20,6 +20,7 @@ Life cycle of a request in the prefill server
 from __future__ import annotations
 import logging
+import threading
 from collections import deque
 from typing import TYPE_CHECKING, List, Optional
@@ -28,6 +29,7 @@ import torch
 from sglang.srt.disaggregation.base import BaseKVManager, KVArgs, KVPoll
 from sglang.srt.disaggregation.utils import (
     DisaggregationMode,
+    FakeBootstrapHost,
     KVClassType,
     ReqToMetadataIdxAllocator,
     TransferBackend,
@@ -115,7 +117,11 @@ class PrefillBootstrapQueue:
         return kv_manager
     def add(self, req: Req) -> None:
-        kv_sender_class = get_kv_class(self.transfer_backend, KVClassType.SENDER)
+        if req.bootstrap_host == FakeBootstrapHost:
+            # Fake transfer for warmup reqs
+            kv_sender_class = get_kv_class(TransferBackend.FAKE, KVClassType.SENDER)
+        else:
+            kv_sender_class = get_kv_class(self.transfer_backend, KVClassType.SENDER)
         req.disagg_kv_sender = kv_sender_class(
             mgr=self.kv_manager,
             bootstrap_addr=f"{req.bootstrap_host}:{self.bootstrap_port}",
@@ -256,7 +262,10 @@ class SchedulerDisaggregationPrefillMixin:
             self.running_batch.batch_is_full = False
     def process_batch_result_disagg_prefill(
-        self: Scheduler, batch: ScheduleBatch, result: GenerationBatchResult
+        self: Scheduler,
+        batch: ScheduleBatch,
+        result: GenerationBatchResult,
+        launch_done: Optional[threading.Event] = None,
     ) -> None:
         """
         Transfer kv for prefill completed requests and add it into disagg_prefill_inflight_queue
@@ -280,7 +289,7 @@ class SchedulerDisaggregationPrefillMixin:
         # Transfer kv for prefill completed requests and add it into disagg_prefill_infight_queue
         if self.enable_overlap:
             # wait
-            _, next_token_ids = self.tp_worker.resolve_batch_result(bid)
+            _, next_token_ids = self.tp_worker.resolve_last_batch_result(launch_done)
         else:
             next_token_ids = result.next_token_ids.tolist()

{sglang-0.4.6 → sglang-0.4.6.post2}/sglang/srt/disaggregation/utils.py RENAMED Viewed

@@ -15,6 +15,9 @@ class DisaggregationMode(Enum):
     DECODE = "decode"
+FakeBootstrapHost = "2.2.2.2"
 def poll_and_all_reduce(pollers, gloo_group):
     polls = [int(poller.poll()) for poller in pollers]
     tensor_to_reduce = torch.tensor(polls, dtype=torch.uint8, device="cpu")
@@ -59,6 +62,8 @@ class KVClassType(Enum):
 def get_kv_class(transfer_backend: TransferBackend, class_type: KVClassType):
+    from sglang.srt.disaggregation.fake import FakeKVReceiver, FakeKVSender
     if transfer_backend == TransferBackend.MOONCAKE:
         from sglang.srt.disaggregation.mooncake import (
             MooncakeKVBootstrapServer,
@@ -70,7 +75,7 @@ def get_kv_class(transfer_backend: TransferBackend, class_type: KVClassType):
         class_mapping = {
             KVClassType.MANAGER: MooncakeKVManager,
             KVClassType.SENDER: MooncakeKVSender,
-            KVClassType.RECEIVER: MooncakeKVReceiver,
+            KVClassType.RECEIVER: (MooncakeKVReceiver),
             KVClassType.BOOTSTRAP_SERVER: MooncakeKVBootstrapServer,
         }
         return class_mapping.get(class_type)
@@ -85,10 +90,19 @@ def get_kv_class(transfer_backend: TransferBackend, class_type: KVClassType):
         class_mapping = {
             KVClassType.MANAGER: NixlKVManager,
             KVClassType.SENDER: NixlKVSender,
-            KVClassType.RECEIVER: NixlKVReceiver,
+            KVClassType.RECEIVER: (NixlKVReceiver),
             KVClassType.BOOTSTRAP_SERVER: NixlKVBootstrapServer,
         }
         return class_mapping.get(class_type)
+    if transfer_backend == TransferBackend.FAKE:
+        from sglang.srt.disaggregation.fake import FakeKVReceiver, FakeKVSender
+        class_mapping = {
+            KVClassType.SENDER: FakeKVSender,
+            KVClassType.RECEIVER: (FakeKVReceiver),
+        }
+        return class_mapping.get(class_type)
     raise ValueError(f"Unsupported transfer backend: {transfer_backend}")

sglang 0.4.6__tar.gz → 0.4.6.post2__tar.gz

sglang 0.4.6__tar.gz → 0.4.6.post2__tar.gz