sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +85 -74
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +27 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +46 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +67 -3
- sglang/srt/disaggregation/fake/conn.py +1 -0
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/conn.py +432 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/conn.py +124 -432
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/disaggregation/utils.py +38 -1
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +102 -5
- sglang/srt/entrypoints/http_server.py +15 -2
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/attention/aiter_backend.py +488 -123
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +103 -18
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +37 -1
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +244 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/communicator.py +260 -194
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +27 -6
- sglang/srt/layers/moe/ep_moe/layer.py +94 -40
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +13 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +44 -18
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +55 -56
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -49
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +19 -5
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +15 -4
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +188 -16
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/schedule_batch.py +140 -38
- sglang/srt/managers/scheduler.py +305 -112
- sglang/srt/managers/tokenizer_manager.py +134 -17
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +72 -61
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +38 -17
- sglang/srt/model_executor/model_runner.py +96 -56
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +609 -234
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +86 -24
- sglang/srt/openai_api/protocol.py +31 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +114 -27
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +51 -91
- sglang/srt/speculative/eagle_worker.py +101 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +129 -7
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +79 -6
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/METADATA +14 -11
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/RECORD +318 -291
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/profiler.py
ADDED
@@ -0,0 +1,167 @@
+"""
+Run live profiling.
+
+Usage:
+python3 -m sglang.profiler
+"""
+
+import argparse
+import json
+import os
+import time
+import urllib.parse
+from argparse import ArgumentParser
+from pathlib import Path
+from typing import List, Optional
+
+import requests
+
+PARENT_FOLDER = "/tmp/sglang-profile"
+
+
+def _run_profile(
+    url: Optional[str],
+    num_steps: int,
+    activities: List[str],
+    output_dir: Optional[str] = None,
+    profile_name: Optional[str] = None,
+    profile_by_stage: bool = False,
+) -> str:
+    if output_dir is None:
+        output_dir = PARENT_FOLDER
+
+    output_dir = os.path.normpath(output_dir)
+    output_dir = os.path.abspath(output_dir)
+    output_dir = Path(output_dir)
+
+    # Add "profile_name/timestamp" to the path.
+    if profile_name:
+        output_dir = output_dir / profile_name
+    output_dir = output_dir / str(time.time())
+    output_dir.mkdir(exist_ok=True, parents=True)
+
+    print(f"Dump profiling traces to {output_dir}")
+    print(
+        f"Waiting for {num_steps} steps and the trace to be flushed.... ({profile_by_stage=})"
+    )
+
+    # Dump server args.
+    file_path = Path(output_dir) / "server_args.json"
+    if not file_path.exists():
+        response = requests.get(url + "/get_server_info")
+        response.raise_for_status()
+        server_args_data = response.json()
+        with open(file_path, "w") as file:
+            file.write(json.dumps(server_args_data))
+
+    # Start profiler. The API replies when all steps are processed
+    # and files are generated.
+    json_data = {
+        "output_dir": str(output_dir),
+        "num_steps": str(num_steps),
+        "activities": activities,
+        "profile_by_stage": profile_by_stage,
+    }
+
+    response = requests.post(url=url + "/start_profile", json=json_data)
+    response.raise_for_status()
+
+    trace_link = str(output_dir)
+    return trace_link
+
+
+def run_profile(
+    url: Optional[str],
+    num_steps: int,
+    activities: List[str],
+    output_dir: Optional[str] = None,
+    profile_name: Optional[str] = None,
+    profile_by_stage: bool = False,
+):
+    # step based profile will self terminate on num_steps constraints
+    link = _run_profile(
+        url, num_steps, activities, output_dir, profile_name, profile_by_stage
+    )
+    return link
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser(description="Run live profiling.")
+    parser.add_argument(
+        "--url",
+        type=str,
+        default="http://localhost:30000",
+        help="Server or API base url if not using http host and port.",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default=None,
+        help="Profile directory to dump profile traces.",
+    )
+    parser.add_argument(
+        "--profile-name",
+        type=str,
+        default=None,
+        help="The name of this profile run.",
+    )
+    parser.add_argument(
+        "--num-steps",
+        type=int,
+        default=5,
+        help="The number of forward steps to profile.",
+    )
+    parser.add_argument(
+        "--profile-by-stage",
+        action=argparse.BooleanOptionalAction,
+        type=bool,
+        default=False,
+        help="Whether to profile each stage separately.",
+    )
+    parser.add_argument(
+        "--cpu",
+        action=argparse.BooleanOptionalAction,
+        type=bool,
+        default=True,
+        help="Whether to profile CPU activity",
+    )
+    parser.add_argument(
+        "--gpu",
+        action=argparse.BooleanOptionalAction,
+        type=bool,
+        default=True,
+        help="Whether to profile GPU activity",
+    )
+    parser.add_argument(
+        "--mem",
+        action=argparse.BooleanOptionalAction,
+        type=bool,
+        default=False,
+        help="Whether to profile memory usage (https://pytorch.org/memory_viz)",
+    )
+    parser.add_argument(
+        "--rpd",
+        action=argparse.BooleanOptionalAction,
+        type=bool,
+        default=False,
+        help="Whether to use rpd profiler (https://github.com/ROCm/rocmProfileData)",
+    )
+
+    args = parser.parse_args()
+    activities = []
+    if args.cpu:
+        activities.append("CPU")
+    if args.gpu:
+        activities.append("GPU")
+    if args.mem:
+        activities.append("MEM")
+    if args.rpd:
+        activities.append("RPD")
+    run_profile(
+        args.url,
+        args.num_steps,
+        activities,
+        args.output_dir,
+        args.profile_name,
+        args.profile_by_stage,
+    )
sglang/srt/_custom_ops.py
CHANGED
@@ -113,3 +113,37 @@ else:

     def get_meta_buffer_ipc_handle(inp: torch.Tensor) -> torch.Tensor:
         return sgl_kernel.allreduce.get_meta_buffer_ipc_handle(inp)
+
+
+    def mscclpp_generate_unique_id() -> bytes:
+        return sgl_kernel.allreduce.mscclpp_generate_unique_id()
+
+
+    def mscclpp_init_context(
+        unique_id: bytes,
+        rank: int,
+        world_size: int,
+        scratch: torch.Tensor,
+        put_buffer: torch.Tensor,
+        nranks_per_node: int,
+        rank_to_node: List[int],
+        rank_to_ib: List[int],
+        context_selection: int,
+    ) -> int:
+        return sgl_kernel.allreduce.mscclpp_init_context(
+            unique_id,
+            rank,
+            world_size,
+            scratch,
+            put_buffer,
+            nranks_per_node,
+            rank_to_node,
+            rank_to_ib,
+            context_selection,
+        )
+
+
+    def mscclpp_allreduce(
+        context: int, inp: torch.Tensor, out: torch.Tensor, nthreads: int, nblocks: int
+    ) -> None:
+        return sgl_kernel.allreduce.mscclpp_allreduce(context, inp, out, nthreads, nblocks)
sglang/srt/configs/internvl.py
CHANGED
@@ -7,11 +7,8 @@ import sentencepiece as spm
 from transformers import (
     TOKENIZER_MAPPING,
     LlamaConfig,
-    Phi3Config,
     PretrainedConfig,
     PreTrainedTokenizer,
-    PreTrainedTokenizerFast,
-    Qwen2Config,
 )

 from sglang.utils import logger
@@ -302,24 +299,23 @@ class InternVLChatConfig(PretrainedConfig):
         )

         if llm_config is None:
-
-            llm_config = {"architectures": [""]}
+            llm_config = {"architectures": ["InternLM2ForCausalLM"]}
             logger.info(
                 "llm_config is None. Initializing the LlamaConfig config with default values (`LlamaConfig`)."
             )
+
         self.vision_config = InternVisionConfig(**vision_config)
-        if llm_config["architectures"][0] == "LlamaForCausalLM":
+        if llm_config.get("architectures")[0] == "LlamaForCausalLM":
             self.llm_config = LlamaConfig(**llm_config)
-        elif llm_config["architectures"][0] == "InternLM2ForCausalLM":
+        elif llm_config.get("architectures")[0] == "InternLM2ForCausalLM":
             self.llm_config = InternLM2Config(**llm_config)
-        elif llm_config["architectures"][0] == "Phi3ForCausalLM":
-            self.llm_config = Phi3Config(**llm_config)
-        elif llm_config["architectures"][0] == "Qwen2ForCausalLM":
-            self.llm_config = Qwen2Config(**llm_config)
         else:
             raise ValueError(
-                "Unsupported architecture: {}".format(llm_config["architectures"][0])
+                "Unsupported architecture: {}".format(
+                    llm_config.get("architectures")[0]
+                )
             )
+
         self.use_backbone_lora = use_backbone_lora
         self.use_llm_lora = use_llm_lora
         self.pad2square = pad2square
sglang/srt/configs/model_config.py
CHANGED
@@ -16,7 +16,7 @@ import json
 import logging
 import math
 import os
-from enum import IntEnum, auto
+from enum import Enum, IntEnum, auto
 from typing import List, Optional, Set, Union

 import torch
@@ -39,6 +39,12 @@ class AttentionArch(IntEnum):
     MHA = auto()


+class ModelImpl(str, Enum):
+    AUTO = "auto"
+    SGLANG = "sglang"
+    TRANSFORMERS = "transformers"
+
+
 class ModelConfig:
     def __init__(
         self,
@@ -53,11 +59,13 @@ class ModelConfig:
         quantization: Optional[str] = None,
         override_config_file: Optional[str] = None,
         is_draft_model: bool = False,
+        impl: Union[str, ModelImpl] = ModelImpl.AUTO,
     ) -> None:

         self.model_path = model_path
         self.revision = revision
         self.quantization = quantization
+        self.impl = impl

         # Parse args
         self.maybe_pull_model_tokenizer_from_remote()
@@ -196,6 +204,22 @@ class ModelConfig:
             self.v_head_dim = self.hf_text_config.v_head_dim
             self.qk_nope_head_dim = self.hf_text_config.qk_nope_head_dim
         else:
+            if (
+                "MistralModel" in self.hf_config.architectures
+                or "MixtralForCausalLM" in self.hf_config.architectures
+                or "MistralForCausalLM" in self.hf_config.architectures
+            ):
+                if getattr(self, "head_dim", None) is None:
+                    self.head_dim = (
+                        self.hf_config.hidden_size // self.hf_config.num_attention_heads
+                    )
+                    # In transformers==4.52.3, the head_dim is null in MistralConfig
+                    if (
+                        not hasattr(self.hf_text_config, "head_dim")
+                        or self.hf_text_config.head_dim is None
+                    ):
+                        setattr(self.hf_text_config, "head_dim", self.head_dim)
+
             self.attention_arch = AttentionArch.MHA

             self.num_attention_heads = self.hf_text_config.num_attention_heads
@@ -240,6 +264,7 @@ class ModelConfig:
             enable_multimodal=server_args.enable_multimodal,
             dtype=server_args.dtype,
             quantization=server_args.quantization,
+            impl=server_args.impl,
             **kwargs,
         )

@@ -552,6 +577,7 @@ multimodal_model_archs = [
     "Qwen2_5_VLForConditionalGeneration",
     "KimiVLForConditionalGeneration",
     "InternVLChatModel",
+    "Phi4MMForCausalLM",
 ]
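The new ModelImpl knob is threaded from server_args into ModelConfig and, together with the new sglang/srt/models/transformers.py entry listed above, appears to select between native sglang model code and a transformers-based fallback. A sketch of the intent (the model path is an arbitrary example):

from sglang.srt.configs.model_config import ModelConfig, ModelImpl

config = ModelConfig(
    model_path="meta-llama/Llama-3.1-8B-Instruct",
    impl=ModelImpl.TRANSFORMERS,  # force the fallback; AUTO is the default
)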
sglang/srt/constrained/base_grammar_backend.py
CHANGED
@@ -60,7 +60,7 @@ class BaseGrammarObject:
         raise NotImplementedError()

     def copy(self) -> "BaseGrammarObject":
-        raise NotImplementedError()
+        return self

     @property
     def finished(self):
@@ -99,9 +99,12 @@ class BaseGrammarObject:
         raise NotImplementedError()


+INVALID_GRAMMAR_OBJ = BaseGrammarObject()
+
+
 @dataclass
 class CacheEntry:
-    value: Optional[BaseGrammarObject]
+    value: BaseGrammarObject
     event: Event
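The four grammar backends below all stop returning None on a failed compile and hand back this shared INVALID_GRAMMAR_OBJ sentinel instead, so callers can tell an invalid constraint apart from a missing one. A sketch of the consuming side; the request handling shown here is an assumption, not code from this diff:

from sglang.srt.constrained.base_grammar_backend import INVALID_GRAMMAR_OBJ


def apply_grammar(req, grammar):
    if grammar is INVALID_GRAMMAR_OBJ:
        # Fail only this request instead of crashing the whole batch.
        req.set_error("invalid grammar constraint")  # hypothetical helper
        return
    req.grammar = grammar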
sglang/srt/constrained/llguidance_backend.py
CHANGED
@@ -28,6 +28,7 @@ from llguidance.torch import (
 )

 from sglang.srt.constrained.base_grammar_backend import (
+    INVALID_GRAMMAR_OBJ,
     BaseGrammarBackend,
     BaseGrammarObject,
 )
@@ -126,8 +127,8 @@ class GuidanceBackend(BaseGrammarBackend):
                 serialized_grammar=serialized_grammar,
             )
         except Exception as e:
-            logger.warning(f"Skip invalid grammar: {serialized_grammar=}, {e=}")
-            return None
+            logger.error(f"Hit invalid grammar: {serialized_grammar=}, {e=}")
+            return INVALID_GRAMMAR_OBJ

     def dispatch_json(self, key_string: str) -> Optional[GuidanceGrammar]:
         try:
@@ -138,8 +139,8 @@ class GuidanceBackend(BaseGrammarBackend):
                 },
             )
         except Exception as e:
-            logger.warning(f"Skip invalid json_schema: {key_string=}, {e=}")
-            return None
+            logger.error(f"Hit invalid json_schema: {key_string=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
         return self._from_serialized(serialized_grammar)

     def dispatch_regex(self, key_string: str) -> Optional[GuidanceGrammar]:
@@ -151,8 +152,8 @@ class GuidanceBackend(BaseGrammarBackend):
             serialized_grammar = grammar_from("ebnf", key_string)
             return self._from_serialized(serialized_grammar)
         except ValueError as e:
-            logger.warning(f"Skip invalid ebnf: {key_string=}, {e=}")
-            return None
+            logger.error(f"Hit invalid ebnf: {key_string=}, {e=}")
+            return INVALID_GRAMMAR_OBJ

     def dispatch_structural_tag(self, key_string: str) -> Optional[GuidanceGrammar]:
         try:
@@ -169,5 +170,5 @@ class GuidanceBackend(BaseGrammarBackend):
             g = StructTag.to_grammar(tags)
             return self._from_serialized(g)
         except Exception as e:
-            logging.warning(f"Skip invalid structural_tag: {key_string=}, {e=}")
-            return None
+            logging.error(f"Hit invalid structural_tag: {key_string=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
sglang/srt/constrained/outlines_backend.py
CHANGED
@@ -24,6 +24,7 @@ from outlines.models.transformers import TransformerTokenizer
 from pydantic import BaseModel

 from sglang.srt.constrained.base_grammar_backend import (
+    INVALID_GRAMMAR_OBJ,
     BaseGrammarBackend,
     BaseGrammarObject,
 )
@@ -151,8 +152,8 @@ class OutlinesGrammarBackend(BaseGrammarBackend):
                 # outlines <= 0.0.46
                 guide = RegexGuide(regex, self.outlines_tokenizer)
         except interegular.patterns.InvalidSyntax as e:
-            logger.warning(f"Skip invalid regex schema: {regex=}, {e=}")
-            return None
+            logger.error(f"Hit invalid regex schema: {regex=}, {e=}")
+            return INVALID_GRAMMAR_OBJ

         jump_forward_map = None
         return OutlinesGrammar(guide, jump_forward_map)
@@ -170,8 +171,8 @@ class OutlinesGrammarBackend(BaseGrammarBackend):
                 whitespace_pattern=self.whitespace_pattern,
             )
         except (NotImplementedError, json.decoder.JSONDecodeError, ValueError) as e:
-            logger.warning(f"Skip invalid json_schema: {key_string=}, {e=}")
-            return None
+            logger.error(f"Hit invalid json_schema: {key_string=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
         return self._compile_regex(regex)

     def dispatch_regex(self, key_string: str):
sglang/srt/constrained/xgrammar_backend.py
CHANGED
@@ -28,6 +28,7 @@ from xgrammar import (
 )

 from sglang.srt.constrained.base_grammar_backend import (
+    INVALID_GRAMMAR_OBJ,
     BaseGrammarBackend,
     BaseGrammarObject,
 )
@@ -152,10 +153,11 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
     ):
         super().__init__()

-
-
-
-
+        if True:
+            tokenizer_info = TokenizerInfo.from_huggingface(
+                tokenizer, vocab_size=vocab_size
+            )
+            override_stop_tokens = None

         self.grammar_compiler = GrammarCompiler(tokenizer_info=tokenizer_info)
         self.vocab_size = vocab_size
@@ -178,25 +180,26 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
                 ctx = self.grammar_compiler.compile_builtin_json_grammar()
             else:
                 ctx = self.grammar_compiler.compile_json_schema(schema=key_string)
-        except RuntimeError as e:
-            logging.warning(f"Skip invalid json_schema: {key_string=}, {e=}")
-            return None
+
+        except (RuntimeError, json.decoder.JSONDecodeError) as e:
+            logging.error(f"Hit invalid json_schema: {key_string=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
         return self._from_context(ctx, key_string)

     def dispatch_ebnf(self, key_string: str) -> Optional[XGrammarGrammar]:
         try:
             ctx = self.grammar_compiler.compile_grammar(key_string)
         except RuntimeError as e:
-            logging.warning(f"Skip invalid ebnf: {key_string=}, {e=}")
-            return None
+            logging.error(f"Hit invalid ebnf: {key_string=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
         return self._from_context(ctx, key_string)

     def dispatch_regex(self, key_string: str) -> Optional[XGrammarGrammar]:
         try:
             ctx = self.grammar_compiler.compile_regex(key_string)
         except RuntimeError as e:
-            logging.warning(f"Skip invalid regex: {key_string=}, {e=}")
-            return None
+            logging.error(f"Hit invalid regex: {key_string=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
         return self._from_context(ctx, key_string)

     def dispatch_structural_tag(self, key_string: str) -> Optional[XGrammarGrammar]:
@@ -213,13 +216,10 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
             ctx = self.grammar_compiler.compile_structural_tag(
                 tags, structural_tag["triggers"]
             )
-        except RuntimeError as e:
-            logging.warning(
-                f"Skip invalid structural_tag: {key_string=}, {e=}"
-            )
-            return None
+        except (RuntimeError, json.decoder.JSONDecodeError) as e:
+            logging.error(f"Hit invalid structural_tag: {key_string=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
         return self._from_context(ctx, key_string)

     def reset(self):
-        if self.grammar_compiler:
-            self.grammar_compiler.clear_cache()
+        self.grammar_compiler.clear_cache()
sglang/srt/conversation.py
CHANGED
@@ -21,6 +21,7 @@ from enum import IntEnum, auto
 from typing import Callable, Dict, List, Optional, Tuple, Union

 from sglang.srt.openai_api.protocol import ChatCompletionRequest
+from sglang.srt.utils import read_system_prompt_from_file


 class SeparatorStyle(IntEnum):
@@ -561,14 +562,11 @@ def generate_chat_conv(
                     if content.type == "image_url":
                         num_image_url += 1
                         conv.modalities.append(content.modalities)
-
-                image_token
-
-                image_token
-
-                    if conv.name != "qwen2-vl"
-                    else conv.image_token
-                )
+                image_token = (
+                    conv.image_token + "\n"
+                    if conv.name != "qwen2-vl"
+                    else conv.image_token
+                )
                 add_token_as_needed: bool = (
                     conv.name in _MODELS_REQUIRING_MODALITY_SUPPLEMENT
                 )
@@ -648,6 +646,20 @@ register_conv_template(
     )
 )

+register_conv_template(
+    Conversation(
+        name="devstral",
+        system_template="[SYSTEM_PROMPT]\n{system_message}\n[/SYSTEM_PROMPT]\n\n",
+        system_message=read_system_prompt_from_file("mistralai/Devstral-Small-2505"),
+        roles=("[INST]", "[/INST]"),
+        sep_style=SeparatorStyle.LLAMA2,
+        sep=" ",
+        sep2=" </s><s>",
+        stop_str=["[INST]", "[/INST]", "[SYSTEM_PROMPT]", "[/SYSTEM_PROMPT]"],
+        image_token="[IMG]",
+    )
+)
+
 # reference: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct/blob/main/chat_template.json
 register_conv_template(
     Conversation(
@@ -661,6 +673,20 @@ register_conv_template(
     )
 )

+# TODO (lifuhuang): Refactor BaseMultimodalProcessor to support the default image token "<|image_{index}|>" in the future.
+register_conv_template(
+    Conversation(
+        name="phi-4-mm",
+        system_message="",
+        system_template="{system_message}",
+        roles=("<|user|>", "<|assistant|>"),
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        sep="<|end|>",
+        stop_str="<|end|>",
+        image_token="<|endoftext10|>",
+    )
+)
+
 register_conv_template(
     Conversation(
         name="chatml",
@@ -945,3 +971,15 @@ def match_openbmb_minicpm(model_path: str):
 def match_moonshot_kimivl(model_path: str):
     if re.search(r"kimi.*vl", model_path, re.IGNORECASE):
         return "kimi-vl"
+
+
+@register_conv_template_matching_function
+def match_devstral(model_path: str):
+    if re.search(r"devstral", model_path, re.IGNORECASE):
+        return "devstral"
+
+
+@register_conv_template_matching_function
+def match_phi_4_mm(model_path: str):
+    if "phi-4-multimodal" in model_path.lower():
+        return "phi-4-mm"
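The two new matcher functions make template selection directly testable; the calls below follow straight from the regex/substring checks above (the Phi-4 model id is an assumed example):

from sglang.srt.conversation import match_devstral, match_phi_4_mm

assert match_devstral("mistralai/Devstral-Small-2505") == "devstral"
assert match_phi_4_mm("microsoft/Phi-4-multimodal-instruct") == "phi-4-mm"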
sglang/srt/custom_op.py
CHANGED
@@ -1,6 +1,3 @@
-from typing import Optional
-
-import torch
 from torch import nn

 from sglang.srt.utils import is_cuda, is_hip
@@ -14,6 +11,44 @@ class CustomOp(nn.Module):
         super().__init__()
         self._forward_method = self.dispatch_forward()

+        # States for torch.compile
+        self._original_forward_method = None
+        self.is_torch_compile = False
+
+    def enter_torch_compile(self, num_tokens: int):
+        # Skip if Op is already entered compile mode.
+        # NOTE(alcanderian): Some Ops(for example RotaryEmbedding) will be reused
+        # among layers and `enter_torch_compile` will be called many times.
+        # We should prevent `self._original_forward_method` from being overridden when
+        # it is not the first time `enter_torch_compile` called.
+        if self.is_torch_compile:
+            return
+
+        self._original_forward_method = self._forward_method
+        # NOTE: Temporarily workaround MoE
+        if "FusedMoE" in self.__class__.__name__:
+            if num_tokens == 1:
+                from sglang.srt.layers.moe.fused_moe_native import (
+                    fused_moe_forward_native,
+                )
+
+                # The performance of torch.compile on this layer is not always good when bs > 1,
+                # so we decide to only use torch.compile when bs =1
+                self._forward_method = fused_moe_forward_native
+        else:
+            self._forward_method = self.forward_native
+        self.is_torch_compile = True
+
+    def leave_torch_compile(self):
+        # Skip if Op is already exited compile mode.
+        if not self.is_torch_compile:
+            return
+
+        self._forward_method = self._original_forward_method
+        self._original_forward_method = None
+        self.is_torch_compile = False
+
+    # Please do not override this method, because `self._forward_method` can change when in torch compile mode
     def forward(self, *args, **kwargs):
         return self._forward_method(*args, **kwargs)
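The enter/leave pair makes the forward-method swap both idempotent and reversible. A minimal sketch of how a runner might pair them around a compile pass — the traversal below is an assumption for illustration; the real call sites are in the cuda_graph_runner.py changes listed above:

from contextlib import contextmanager

from sglang.srt.custom_op import CustomOp


@contextmanager
def torch_compile_mode(model, num_tokens: int):
    ops = [m for m in model.modules() if isinstance(m, CustomOp)]
    for op in ops:
        op.enter_torch_compile(num_tokens)  # swap to the compile-friendly native forward
    try:
        yield
    finally:
        for op in ops:
            op.leave_torch_compile()  # restore the original dispatched forward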