sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (359)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +7 -0
  3. sglang/bench_offline_throughput.py +10 -4
  4. sglang/bench_one_batch_server.py +67 -11
  5. sglang/bench_serving.py +86 -75
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/lang/interpreter.py +40 -1
  8. sglang/lang/ir.py +27 -0
  9. sglang/math_utils.py +8 -0
  10. sglang/profiler.py +167 -0
  11. sglang/srt/_custom_ops.py +34 -0
  12. sglang/srt/configs/internvl.py +8 -12
  13. sglang/srt/configs/model_config.py +33 -1
  14. sglang/srt/constrained/base_grammar_backend.py +5 -2
  15. sglang/srt/constrained/llguidance_backend.py +9 -8
  16. sglang/srt/constrained/outlines_backend.py +5 -4
  17. sglang/srt/constrained/xgrammar_backend.py +18 -18
  18. sglang/srt/conversation.py +52 -8
  19. sglang/srt/custom_op.py +38 -3
  20. sglang/srt/debug_utils.py +74 -0
  21. sglang/srt/disaggregation/base/__init__.py +1 -1
  22. sglang/srt/disaggregation/base/conn.py +25 -11
  23. sglang/srt/disaggregation/common/__init__.py +5 -0
  24. sglang/srt/disaggregation/common/conn.py +407 -0
  25. sglang/srt/disaggregation/common/utils.py +42 -0
  26. sglang/srt/disaggregation/decode.py +261 -52
  27. sglang/srt/disaggregation/fake/__init__.py +1 -1
  28. sglang/srt/disaggregation/fake/conn.py +16 -9
  29. sglang/srt/disaggregation/kv_events.py +60 -5
  30. sglang/srt/disaggregation/launch_lb.py +140 -0
  31. sglang/srt/disaggregation/mini_lb.py +29 -48
  32. sglang/srt/disaggregation/mooncake/__init__.py +1 -1
  33. sglang/srt/disaggregation/mooncake/conn.py +446 -149
  34. sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
  35. sglang/srt/disaggregation/nixl/__init__.py +6 -1
  36. sglang/srt/disaggregation/nixl/conn.py +134 -437
  37. sglang/srt/disaggregation/prefill.py +130 -43
  38. sglang/srt/disaggregation/utils.py +127 -86
  39. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  40. sglang/srt/distributed/parallel_state.py +52 -5
  41. sglang/srt/entrypoints/EngineBase.py +6 -0
  42. sglang/srt/entrypoints/engine.py +116 -5
  43. sglang/srt/entrypoints/http_server.py +28 -4
  44. sglang/srt/eplb_simulator/__init__.py +1 -0
  45. sglang/srt/eplb_simulator/reader.py +51 -0
  46. sglang/srt/function_call/base_format_detector.py +138 -86
  47. sglang/srt/function_call/deepseekv3_detector.py +54 -6
  48. sglang/srt/function_call/ebnf_composer.py +33 -19
  49. sglang/srt/function_call/function_call_parser.py +27 -0
  50. sglang/srt/function_call/llama32_detector.py +33 -14
  51. sglang/srt/function_call/mistral_detector.py +73 -26
  52. sglang/srt/function_call/pythonic_detector.py +86 -20
  53. sglang/srt/function_call/qwen25_detector.py +64 -10
  54. sglang/srt/function_call/utils.py +17 -0
  55. sglang/srt/hf_transformers_utils.py +4 -0
  56. sglang/srt/layers/activation.py +19 -0
  57. sglang/srt/layers/attention/aiter_backend.py +503 -125
  58. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  59. sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
  60. sglang/srt/layers/attention/flashattention_backend.py +137 -63
  61. sglang/srt/layers/attention/flashinfer_backend.py +46 -3
  62. sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
  63. sglang/srt/layers/attention/flashmla_backend.py +2 -10
  64. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  65. sglang/srt/layers/attention/tbo_backend.py +232 -0
  66. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  67. sglang/srt/layers/attention/triton_backend.py +304 -65
  68. sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
  69. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  70. sglang/srt/layers/attention/vision.py +51 -24
  71. sglang/srt/layers/communicator.py +281 -197
  72. sglang/srt/layers/dp_attention.py +6 -5
  73. sglang/srt/layers/layernorm.py +30 -19
  74. sglang/srt/layers/linear.py +0 -4
  75. sglang/srt/layers/logits_processor.py +0 -12
  76. sglang/srt/layers/moe/cutlass_moe.py +170 -7
  77. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  78. sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
  79. sglang/srt/layers/moe/ep_moe/layer.py +136 -72
  80. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
  81. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  82. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  83. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  84. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  85. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  86. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  87. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  88. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  89. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  90. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
  91. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
  92. sglang/srt/layers/moe/topk.py +60 -26
  93. sglang/srt/layers/multimodal.py +3 -3
  94. sglang/srt/layers/pooler.py +56 -0
  95. sglang/srt/layers/quantization/__init__.py +3 -2
  96. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  97. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  98. sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
  99. sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
  100. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
  101. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
  102. sglang/srt/layers/quantization/fp8.py +28 -23
  103. sglang/srt/layers/quantization/fp8_kernel.py +156 -75
  104. sglang/srt/layers/quantization/fp8_utils.py +250 -69
  105. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  106. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  107. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  108. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  109. sglang/srt/layers/radix_attention.py +2 -3
  110. sglang/srt/layers/rotary_embedding.py +6 -12
  111. sglang/srt/layers/sampler.py +80 -79
  112. sglang/srt/layers/utils.py +6 -0
  113. sglang/srt/lora/layers.py +12 -15
  114. sglang/srt/lora/lora.py +49 -5
  115. sglang/srt/lora/lora_manager.py +98 -39
  116. sglang/srt/lora/mem_pool.py +28 -21
  117. sglang/srt/lora/utils.py +17 -13
  118. sglang/srt/managers/cache_controller.py +2 -1
  119. sglang/srt/managers/data_parallel_controller.py +13 -5
  120. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  121. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  122. sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
  123. sglang/srt/managers/eplb_manager.py +55 -14
  124. sglang/srt/managers/expert_distribution.py +220 -46
  125. sglang/srt/managers/expert_location.py +110 -56
  126. sglang/srt/managers/expert_location_dispatch.py +23 -6
  127. sglang/srt/managers/io_struct.py +43 -8
  128. sglang/srt/managers/mm_utils.py +88 -38
  129. sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
  130. sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
  131. sglang/srt/managers/multimodal_processors/internvl.py +4 -0
  132. sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
  133. sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
  134. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  135. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
  136. sglang/srt/managers/multimodal_processors/vila.py +85 -0
  137. sglang/srt/managers/schedule_batch.py +173 -38
  138. sglang/srt/managers/scheduler.py +376 -127
  139. sglang/srt/managers/tokenizer_manager.py +163 -19
  140. sglang/srt/managers/utils.py +0 -4
  141. sglang/srt/mem_cache/chunk_cache.py +1 -0
  142. sglang/srt/mem_cache/hiradix_cache.py +4 -2
  143. sglang/srt/mem_cache/memory_pool.py +111 -407
  144. sglang/srt/mem_cache/memory_pool_host.py +380 -0
  145. sglang/srt/mem_cache/radix_cache.py +36 -12
  146. sglang/srt/metrics/collector.py +9 -0
  147. sglang/srt/model_executor/cuda_graph_runner.py +191 -113
  148. sglang/srt/model_executor/expert_location_updater.py +157 -22
  149. sglang/srt/model_executor/forward_batch_info.py +52 -22
  150. sglang/srt/model_executor/model_runner.py +102 -62
  151. sglang/srt/model_loader/loader.py +8 -1
  152. sglang/srt/model_loader/utils.py +67 -1
  153. sglang/srt/models/bert.py +113 -13
  154. sglang/srt/models/deepseek_nextn.py +1 -1
  155. sglang/srt/models/deepseek_v2.py +623 -290
  156. sglang/srt/models/gemma3_causal.py +7 -0
  157. sglang/srt/models/gemma3_mm.py +19 -14
  158. sglang/srt/models/idefics2.py +342 -0
  159. sglang/srt/models/internvl.py +46 -102
  160. sglang/srt/models/kimi_vl.py +4 -4
  161. sglang/srt/models/llama.py +1 -1
  162. sglang/srt/models/minicpmo.py +2 -5
  163. sglang/srt/models/minicpmv.py +3 -295
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +38 -9
  166. sglang/srt/models/qwen2_5_vl.py +3 -9
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +58 -191
  169. sglang/srt/models/qwen2_vl.py +3 -9
  170. sglang/srt/models/qwen3.py +41 -10
  171. sglang/srt/models/qwen3_moe.py +230 -191
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/roberta.py +117 -9
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/models/vila.py +305 -0
  176. sglang/srt/openai_api/adapter.py +248 -28
  177. sglang/srt/openai_api/protocol.py +68 -3
  178. sglang/srt/openai_api/utils.py +172 -0
  179. sglang/srt/operations.py +37 -2
  180. sglang/srt/operations_strategy.py +200 -24
  181. sglang/srt/sampling/sampling_batch_info.py +37 -1
  182. sglang/srt/sampling/sampling_params.py +4 -1
  183. sglang/srt/server_args.py +381 -209
  184. sglang/srt/speculative/build_eagle_tree.py +9 -9
  185. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
  186. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
  187. sglang/srt/speculative/eagle_utils.py +440 -200
  188. sglang/srt/speculative/eagle_worker.py +234 -63
  189. sglang/srt/two_batch_overlap.py +637 -0
  190. sglang/srt/utils.py +187 -7
  191. sglang/test/attention/test_prefix_chunk_info.py +2 -0
  192. sglang/test/runners.py +54 -10
  193. sglang/test/send_one.py +4 -0
  194. sglang/test/test_block_fp8.py +1 -0
  195. sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
  196. sglang/test/test_block_fp8_ep.py +1 -0
  197. sglang/test/test_cutlass_moe.py +3 -3
  198. sglang/test/test_fp4_moe.py +248 -0
  199. sglang/test/test_utils.py +82 -7
  200. sglang/utils.py +9 -0
  201. sglang/version.py +1 -1
  202. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
  203. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
  204. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  356. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  357. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  358. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
  359. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/lang/interpreter.py CHANGED
@@ -26,6 +26,7 @@ from sglang.lang.ir import (
     SglRoleBegin,
     SglRoleEnd,
     SglSelect,
+    SglSeparateReasoning,
     SglVariable,
     SglVarScopeBegin,
     SglVarScopeEnd,
@@ -472,6 +473,8 @@ class StreamExecutor:
                 self._execute_concatenate_and_append_kv_cache(other)
             else:
                 self._execute_concatenate_and_append_text(other)
+        elif isinstance(other, SglSeparateReasoning):
+            self._execute_separate_reasoning(other)
         else:
             raise ValueError(f"Unknown type: {type(other)}")
 
@@ -724,8 +727,44 @@
         src_rids = [state.stream_executor.sid for state in expr.states]
         self.backend.concatenate_and_append(src_rids, self.sid)
 
+    def _execute_separate_reasoning(self, expr: SglSeparateReasoning):
+        if self.stream:
+            # separate reasoning for stream is not supported
+            return
+
+        if (
+            self.cur_role == "assistant"
+            and self.num_api_spec_tokens is not None
+            and self.backend.is_chat_model
+        ):
+            # Execute the stored lazy generation calls
+            self.backend.role_end_generate(self)
+
+        from sglang.srt.reasoning_parser import ReasoningParser
+
+        reasoning_parser = ReasoningParser(expr.model_type)
+        other = expr.expr
+        if not other:
+            return
+        elif isinstance(other, SglGen) or isinstance(other, SglSelect):
+            cur_text = self.get_var(other.name)
+            reasoning, normal_text = reasoning_parser.parse_non_stream(cur_text)
+            reasoning_name = expr.process_name_for_reasoning(other.name)
+            self.set_var(other.name, normal_text)
+            self.set_var(reasoning_name, reasoning)
+            # the variable is ready to be used
+            self.variable_event[reasoning_name].set()
+            self.text_ = self.text_[: self.cur_role_begin_pos] + normal_text
+        elif isinstance(other, SglExprList):
+            for x in other.expr_list:
+                self._execute_separate_reasoning(
+                    SglSeparateReasoning(expr.model_type, x)
+                )
+
     def _init_var_event(self, expr):
-        if isinstance(expr, (SglGen, SglSelect, SglVarScopeBegin)):
+        if isinstance(
+            expr, (SglGen, SglSelect, SglVarScopeBegin, SglSeparateReasoning)
+        ):
             self.variable_event[expr.name] = threading.Event()
             if self.stream:
                 self.stream_var_event[expr.name] = threading.Event()
sglang/lang/ir.py CHANGED
@@ -606,3 +606,30 @@ class SglCommitLazy(SglExpr):
 
     def __repr__(self):
         return "CommitLazy()"
+
+
+class SglSeparateReasoning(SglExpr):
+    def __init__(self, model_type: str, expr: SglExpr):
+        super().__init__()
+        self.model_type = model_type
+
+        self.expr = expr
+        self.name = None
+        self._process_expr(expr)
+
+    def process_name_for_reasoning(self, name):
+        if not name:
+            raise ValueError("name must be provided")
+        return f"{name}_reasoning_content"
+
+    def _process_expr(self, expr):
+        if isinstance(expr, SglGen):
+            self.name = self.process_name_for_reasoning(expr.name)
+        elif isinstance(expr, SglSelect):
+            self.name = self.process_name_for_reasoning(expr.name)
+        elif isinstance(expr, SglExprList):
+            for x in expr.expr_list:
+                self._process_expr(x)
+
+    def __repr__(self):
+        return f"SeparateReasoning(model_type={self.model_type}, name={self.name})"
sglang/math_utils.py ADDED
@@ -0,0 +1,8 @@
+# COPIED FROM DeepGEMM
+def align(x: int, y: int) -> int:
+    return ceil_div(x, y) * y
+
+
+# COPIED FROM DeepGEMM
+def ceil_div(x: int, y: int) -> int:
+    return (x + y - 1) // y
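
Note: both helpers implement round-up arithmetic; a quick sanity check of the math:

from sglang.math_utils import align, ceil_div

assert ceil_div(10, 4) == 3   # (10 + 4 - 1) // 4 = 13 // 4 = 3
assert align(10, 4) == 12     # ceil_div(10, 4) * 4: the next multiple of 4
assert align(12, 4) == 12     # already-aligned inputs are unchanged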
sglang/profiler.py ADDED
@@ -0,0 +1,167 @@
+"""
+Run live profiling.
+
+Usage:
+python3 -m sglang.profiler
+"""
+
+import argparse
+import json
+import os
+import time
+import urllib.parse
+from argparse import ArgumentParser
+from pathlib import Path
+from typing import List, Optional
+
+import requests
+
+PARENT_FOLDER = "/tmp/sglang-profile"
+
+
+def _run_profile(
+    url: Optional[str],
+    num_steps: int,
+    activities: List[str],
+    output_dir: Optional[str] = None,
+    profile_name: Optional[str] = None,
+    profile_by_stage: bool = False,
+) -> str:
+    if output_dir is None:
+        output_dir = PARENT_FOLDER
+
+    output_dir = os.path.normpath(output_dir)
+    output_dir = os.path.abspath(output_dir)
+    output_dir = Path(output_dir)
+
+    # Add "profile_name/timestamp" to the path.
+    if profile_name:
+        output_dir = output_dir / profile_name
+    output_dir = output_dir / str(time.time())
+    output_dir.mkdir(exist_ok=True, parents=True)
+
+    print(f"Dump profiling traces to {output_dir}")
+    print(
+        f"Waiting for {num_steps} steps and the trace to be flushed.... ({profile_by_stage=})"
+    )
+
+    # Dump server args.
+    file_path = Path(output_dir) / "server_args.json"
+    if not file_path.exists():
+        response = requests.get(url + "/get_server_info")
+        response.raise_for_status()
+        server_args_data = response.json()
+        with open(file_path, "w") as file:
+            file.write(json.dumps(server_args_data))
+
+    # Start profiler. The API replies when all steps are processed
+    # and files are generated.
+    json_data = {
+        "output_dir": str(output_dir),
+        "num_steps": str(num_steps),
+        "activities": activities,
+        "profile_by_stage": profile_by_stage,
+    }
+
+    response = requests.post(url=url + "/start_profile", json=json_data)
+    response.raise_for_status()
+
+    trace_link = str(output_dir)
+    return trace_link
+
+
+def run_profile(
+    url: Optional[str],
+    num_steps: int,
+    activities: List[str],
+    output_dir: Optional[str] = None,
+    profile_name: Optional[str] = None,
+    profile_by_stage: bool = False,
+):
+    # step based profile will self terminate on num_steps constraints
+    link = _run_profile(
+        url, num_steps, activities, output_dir, profile_name, profile_by_stage
+    )
+    return link
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser(description="Benchmark the online serving throughput.")
+    parser.add_argument(
+        "--url",
+        type=str,
+        default="http://localhost:30000",
+        help="Server or API base url if not using http host and port.",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default=None,
+        help="Profile directory to dump profile traces.",
+    )
+    parser.add_argument(
+        "--profile-name",
+        type=str,
+        default=None,
+        help="The name of this profile run.",
+    )
+    parser.add_argument(
+        "--num-steps",
+        type=int,
+        default=5,
+        help="The number of forward steps to profile.",
+    )
+    parser.add_argument(
+        "--profile-by-stage",
+        action=argparse.BooleanOptionalAction,
+        type=bool,
+        default=False,
+        help="The number of forward steps to profile.",
+    )
+    parser.add_argument(
+        "--cpu",
+        action=argparse.BooleanOptionalAction,
+        type=bool,
+        default=True,
+        help="Whether to profile CPU activity",
+    )
+    parser.add_argument(
+        "--gpu",
+        action=argparse.BooleanOptionalAction,
+        type=bool,
+        default=True,
+        help="Whether to profile GPU activity",
+    )
+    parser.add_argument(
+        "--mem",
+        action=argparse.BooleanOptionalAction,
+        type=bool,
+        default=False,
+        help="Whether to memory usage (https://pytorch.org/memory_viz)",
+    )
+    parser.add_argument(
+        "--rpd",
+        action=argparse.BooleanOptionalAction,
+        type=bool,
+        default=False,
+        help="Whether to use rpd profiler (https://github.com/ROCm/rocmProfileData)",
+    )
+
+    args = parser.parse_args()
+    activities = []
+    if args.cpu:
+        activities.append("CPU")
+    if args.gpu:
+        activities.append("GPU")
+    if args.mem:
+        activities.append("MEM")
+    if args.rpd:
+        activities.append("RPD")
+    run_profile(
+        args.url,
+        args.num_steps,
+        activities,
+        args.output_dir,
+        args.profile_name,
+        args.profile_by_stage,
+    )
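
Note: with a server already running, the new module is driven from the command line using exactly the flags registered above; traces land under /tmp/sglang-profile/<profile-name>/<timestamp> unless --output-dir is set. For example:

# Profile 10 forward steps, capturing CPU and GPU activity plus a
# PyTorch memory trace (viewable at https://pytorch.org/memory_viz).
python3 -m sglang.profiler --url http://localhost:30000 \
    --num-steps 10 --mem --profile-name my-run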
sglang/srt/_custom_ops.py CHANGED
@@ -113,3 +113,37 @@ else:
 
     def get_meta_buffer_ipc_handle(inp: torch.Tensor) -> torch.Tensor:
         return sgl_kernel.allreduce.get_meta_buffer_ipc_handle(inp)
+
+
+    def mscclpp_generate_unique_id() -> bytes:
+        return sgl_kernel.allreduce.mscclpp_generate_unique_id()
+
+
+    def mscclpp_init_context(
+        unique_id: bytes,
+        rank: int,
+        world_size: int,
+        scratch: torch.Tensor,
+        put_buffer: torch.Tensor,
+        nranks_per_node: int,
+        rank_to_node: List[int],
+        rank_to_ib: List[int],
+        context_selection: int,
+    ) -> int:
+        return sgl_kernel.allreduce.mscclpp_init_context(
+            unique_id,
+            rank,
+            world_size,
+            scratch,
+            put_buffer,
+            nranks_per_node,
+            rank_to_node,
+            rank_to_ib,
+            context_selection,
+        )
+
+
+    def mscclpp_allreduce(
+        context: int, inp: torch.Tensor, out: torch.Tensor, nthreads: int, nblocks: int
+    ) -> None:
+        return sgl_kernel.allreduce.mscclpp_allreduce(context, inp, out, nthreads, nblocks)
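
Note: these wrappers map one-to-one onto sgl_kernel.allreduce; the orchestration lives in the new pymscclpp communicator (file 39 above). A hedged sketch of the expected call order (the buffer sizes and the way the unique id reaches non-zero ranks are assumptions, not shown in this diff):

import torch
from sglang.srt import _custom_ops as ops

# Rank 0 generates the id; distributing it to the other ranks (e.g. via a
# CPU broadcast) is assumed here and not part of this diff.
unique_id = ops.mscclpp_generate_unique_id()

# Hypothetical scratch/put buffer sizes, for illustration only.
scratch = torch.empty(1 << 20, dtype=torch.uint8, device="cuda")
put_buffer = torch.empty(1 << 20, dtype=torch.uint8, device="cuda")

context = ops.mscclpp_init_context(
    unique_id, rank=0, world_size=8,
    scratch=scratch, put_buffer=put_buffer, nranks_per_node=8,
    rank_to_node=[0] * 8, rank_to_ib=[0] * 8, context_selection=0,
)

inp = torch.randn(4096, dtype=torch.float16, device="cuda")
out = torch.empty_like(inp)
ops.mscclpp_allreduce(context, inp, out, nthreads=512, nblocks=32)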
sglang/srt/configs/internvl.py CHANGED
@@ -7,11 +7,8 @@ import sentencepiece as spm
 from transformers import (
     TOKENIZER_MAPPING,
     LlamaConfig,
-    Phi3Config,
     PretrainedConfig,
     PreTrainedTokenizer,
-    PreTrainedTokenizerFast,
-    Qwen2Config,
 )
 
 from sglang.utils import logger
@@ -302,24 +299,23 @@ class InternVLChatConfig(PretrainedConfig):
         )
 
         if llm_config is None:
-            # TODO: There might still be a bug in transformers version 4.44 and above.
-            llm_config = {"architectures": [""]}
+            llm_config = {"architectures": ["InternLM2ForCausalLM"]}
             logger.info(
                 "llm_config is None. Initializing the LlamaConfig config with default values (`LlamaConfig`)."
             )
+
         self.vision_config = InternVisionConfig(**vision_config)
-        if llm_config["architectures"][0] == "LlamaForCausalLM":
+        if llm_config.get("architectures")[0] == "LlamaForCausalLM":
             self.llm_config = LlamaConfig(**llm_config)
-        elif llm_config["architectures"][0] == "InternLM2ForCausalLM":
+        elif llm_config.get("architectures")[0] == "InternLM2ForCausalLM":
             self.llm_config = InternLM2Config(**llm_config)
-        elif llm_config["architectures"][0] == "Phi3ForCausalLM":
-            self.llm_config = Phi3Config(**llm_config)
-        elif llm_config["architectures"][0] == "Qwen2ForCausalLM":
-            self.llm_config = Qwen2Config(**llm_config)
         else:
             raise ValueError(
-                "Unsupported architecture: {}".format(llm_config["architectures"][0])
+                "Unsupported architecture: {}".format(
+                    llm_config.get("architectures")[0]
+                )
             )
+
         self.use_backbone_lora = use_backbone_lora
         self.use_llm_lora = use_llm_lora
         self.pad2square = pad2square
sglang/srt/configs/model_config.py CHANGED
@@ -16,7 +16,7 @@ import json
 import logging
 import math
 import os
-from enum import IntEnum, auto
+from enum import Enum, IntEnum, auto
 from typing import List, Optional, Set, Union
 
 import torch
@@ -39,6 +39,12 @@ class AttentionArch(IntEnum):
     MHA = auto()
 
 
+class ModelImpl(str, Enum):
+    AUTO = "auto"
+    SGLANG = "sglang"
+    TRANSFORMERS = "transformers"
+
+
 class ModelConfig:
     def __init__(
         self,
@@ -53,11 +59,13 @@ class ModelConfig:
         quantization: Optional[str] = None,
         override_config_file: Optional[str] = None,
         is_draft_model: bool = False,
+        impl: Union[str, ModelImpl] = ModelImpl.AUTO,
     ) -> None:
 
         self.model_path = model_path
         self.revision = revision
         self.quantization = quantization
+        self.impl = impl
 
         # Parse args
         self.maybe_pull_model_tokenizer_from_remote()
@@ -196,6 +204,22 @@ class ModelConfig:
             self.v_head_dim = self.hf_text_config.v_head_dim
             self.qk_nope_head_dim = self.hf_text_config.qk_nope_head_dim
         else:
+            if (
+                "MistralModel" in self.hf_config.architectures
+                or "MixtralForCausalLM" in self.hf_config.architectures
+                or "MistralForCausalLM" in self.hf_config.architectures
+            ):
+                if getattr(self, "head_dim", None) is None:
+                    self.head_dim = (
+                        self.hf_config.hidden_size // self.hf_config.num_attention_heads
+                    )
+                    # In transformers==4.52.3, the head_dim is null in MistralConfig
+                    if (
+                        not hasattr(self.hf_text_config, "head_dim")
+                        or self.hf_text_config.head_dim is None
+                    ):
+                        setattr(self.hf_text_config, "head_dim", self.head_dim)
+
             self.attention_arch = AttentionArch.MHA
 
             self.num_attention_heads = self.hf_text_config.num_attention_heads
@@ -240,6 +264,7 @@ class ModelConfig:
             enable_multimodal=server_args.enable_multimodal,
             dtype=server_args.dtype,
             quantization=server_args.quantization,
+            impl=server_args.impl,
             **kwargs,
         )
 
@@ -525,6 +550,11 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal
         or "Qwen2ForRewardModel" in model_architectures
         or "Qwen2ForSequenceClassification" in model_architectures
         or "CLIPModel" in model_architectures
+        or "BertModel" in model_architectures
+        or "Contriever" in model_architectures
+        or "BertForSequenceClassification" in model_architectures
+        or "XLMRobertaModel" in model_architectures
+        or "XLMRobertaForSequenceClassification" in model_architectures
     ):
         return False
     else:
@@ -552,6 +582,8 @@ multimodal_model_archs = [
     "Qwen2_5_VLForConditionalGeneration",
    "KimiVLForConditionalGeneration",
     "InternVLChatModel",
+    "Phi4MMForCausalLM",
+    "VILAForConditionalGeneration",
 ]
 
 
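Note: the new ModelImpl enum is threaded from ServerArgs (file 183 above) into ModelConfig, enabling the Transformers-backed fallback added in sglang/srt/models/transformers.py (file 174). A sketch, assuming the CLI flag mirrors the new impl field:

# Force the generic Transformers implementation instead of a native model class.
python3 -m sglang.launch_server --model-path <model> --impl transformers

# Default: pick the native sglang implementation when one exists.
python3 -m sglang.launch_server --model-path <model> --impl auto
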
sglang/srt/constrained/base_grammar_backend.py CHANGED
@@ -60,7 +60,7 @@ class BaseGrammarObject:
         raise NotImplementedError()
 
     def copy(self) -> "BaseGrammarObject":
-        raise NotImplementedError()
+        return self
 
     @property
     def finished(self):
@@ -99,9 +99,12 @@ class BaseGrammarObject:
         raise NotImplementedError()
 
 
+INVALID_GRAMMAR_OBJ = BaseGrammarObject()
+
+
 @dataclass
 class CacheEntry:
-    value: Optional[BaseGrammarObject]
+    value: BaseGrammarObject
     event: Event
 
 
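Note: INVALID_GRAMMAR_OBJ is a module-level sentinel that the backends below now return instead of None when compilation fails, so a CacheEntry always holds a real BaseGrammarObject (and copy() returning self lets the one sentinel instance be shared). A minimal sketch of the identity check this enables on the consumer side (variable names are illustrative; the real handling is in the scheduler, file 138 above):

from sglang.srt.constrained.base_grammar_backend import INVALID_GRAMMAR_OBJ

grammar = cache_entry.value  # set by one of the dispatch_* methods below
if grammar is INVALID_GRAMMAR_OBJ:
    # Compilation failed: abort the request instead of waiting on an
    # entry that will never become usable.
    raise ValueError("Invalid grammar request")
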
sglang/srt/constrained/llguidance_backend.py CHANGED
@@ -28,6 +28,7 @@ from llguidance.torch import (
 )
 
 from sglang.srt.constrained.base_grammar_backend import (
+    INVALID_GRAMMAR_OBJ,
     BaseGrammarBackend,
     BaseGrammarObject,
 )
@@ -126,8 +127,8 @@ class GuidanceBackend(BaseGrammarBackend):
                 serialized_grammar=serialized_grammar,
             )
         except Exception as e:
-            logger.warning(f"Skip invalid grammar: {serialized_grammar}, {e=}")
-            return None
+            logger.error(f"Hit invalid grammar: {serialized_grammar=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
 
     def dispatch_json(self, key_string: str) -> Optional[GuidanceGrammar]:
         try:
@@ -138,8 +139,8 @@ class GuidanceBackend(BaseGrammarBackend):
                 },
             )
         except Exception as e:
-            logger.warning(f"Skip invalid grammar: {key_string=}, {e=}")
-            return None
+            logger.error(f"Hit invalid json_schema: {key_string=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
         return self._from_serialized(serialized_grammar)
 
     def dispatch_regex(self, key_string: str) -> Optional[GuidanceGrammar]:
@@ -151,8 +152,8 @@ class GuidanceBackend(BaseGrammarBackend):
             serialized_grammar = grammar_from("ebnf", key_string)
             return self._from_serialized(serialized_grammar)
         except ValueError as e:
-            logger.warning(f"Skip invalid ebnf: regex={key_string}, {e=}")
-            return None
+            logger.error(f"Hit invalid ebnf: {key_string=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
 
     def dispatch_structural_tag(self, key_string: str) -> Optional[GuidanceGrammar]:
         try:
@@ -169,5 +170,5 @@ class GuidanceBackend(BaseGrammarBackend):
             g = StructTag.to_grammar(tags)
             return self._from_serialized(g)
         except Exception as e:
-            logging.warning(f"Skip invalid structural_tag: {key_string}, {e=}")
-            return None
+            logging.error(f"Hit invalid structural_tag: {key_string=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
sglang/srt/constrained/outlines_backend.py CHANGED
@@ -24,6 +24,7 @@ from outlines.models.transformers import TransformerTokenizer
 from pydantic import BaseModel
 
 from sglang.srt.constrained.base_grammar_backend import (
+    INVALID_GRAMMAR_OBJ,
     BaseGrammarBackend,
     BaseGrammarObject,
 )
@@ -151,8 +152,8 @@ class OutlinesGrammarBackend(BaseGrammarBackend):
                 # outlines <= 0.0.46
                 guide = RegexGuide(regex, self.outlines_tokenizer)
         except interegular.patterns.InvalidSyntax as e:
-            logger.warning(f"skip invalid regex schema: {regex=}, {e=}")
-            return None
+            logger.error(f"Hit invalid regex schema: {regex=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
 
         jump_forward_map = None
         return OutlinesGrammar(guide, jump_forward_map)
@@ -170,8 +171,8 @@ class OutlinesGrammarBackend(BaseGrammarBackend):
                 whitespace_pattern=self.whitespace_pattern,
             )
         except (NotImplementedError, json.decoder.JSONDecodeError, ValueError) as e:
-            logger.warning(f"Skip invalid json_schema: {key_string=}, {e=}")
-            return None
+            logger.error(f"Hit invalid json_schema: {key_string=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
         return self._compile_regex(regex)
 
     def dispatch_regex(self, key_string: str):
sglang/srt/constrained/xgrammar_backend.py CHANGED
@@ -28,6 +28,7 @@ from xgrammar import (
 )
 
 from sglang.srt.constrained.base_grammar_backend import (
+    INVALID_GRAMMAR_OBJ,
     BaseGrammarBackend,
     BaseGrammarObject,
 )
@@ -152,10 +153,11 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
     ):
         super().__init__()
 
-        tokenizer_info = TokenizerInfo.from_huggingface(
-            tokenizer, vocab_size=vocab_size
-        )
-        override_stop_tokens = None
+        if True:
+            tokenizer_info = TokenizerInfo.from_huggingface(
+                tokenizer, vocab_size=vocab_size
+            )
+            override_stop_tokens = None
 
         self.grammar_compiler = GrammarCompiler(tokenizer_info=tokenizer_info)
         self.vocab_size = vocab_size
@@ -178,25 +180,26 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
                 ctx = self.grammar_compiler.compile_builtin_json_grammar()
             else:
                 ctx = self.grammar_compiler.compile_json_schema(schema=key_string)
-        except RuntimeError as e:
-            logging.warning(f"Skip invalid json_schema: json_schema={key_string}, {e=}")
-            return None
+
+        except (RuntimeError, json.decoder.JSONDecodeError) as e:
+            logging.error(f"Hit invalid json_schema: {key_string=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
         return self._from_context(ctx, key_string)
 
     def dispatch_ebnf(self, key_string: str) -> Optional[XGrammarGrammar]:
         try:
             ctx = self.grammar_compiler.compile_grammar(key_string)
         except RuntimeError as e:
-            logging.warning(f"Skip invalid ebnf: ebnf={key_string}, {e=}")
-            return None
+            logging.error(f"Hit invalid ebnf: {key_string=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
         return self._from_context(ctx, key_string)
 
     def dispatch_regex(self, key_string: str) -> Optional[XGrammarGrammar]:
         try:
             ctx = self.grammar_compiler.compile_regex(key_string)
         except RuntimeError as e:
-            logging.warning(f"Skip invalid regex: regex={key_string}, {e=}")
-            return None
+            logging.error(f"Hit invalid regex: {key_string=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
         return self._from_context(ctx, key_string)
 
     def dispatch_structural_tag(self, key_string: str) -> Optional[XGrammarGrammar]:
@@ -213,13 +216,10 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
             ctx = self.grammar_compiler.compile_structural_tag(
                 tags, structural_tag["triggers"]
             )
-        except RuntimeError as e:
-            logging.warning(
-                f"Skip invalid structural_tag: structural_tag={key_string}, {e=}"
-            )
-            return None
+        except (RuntimeError, json.decoder.JSONDecodeError) as e:
+            logging.error(f"Hit invalid structural_tag: {key_string=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
         return self._from_context(ctx, key_string)
 
     def reset(self):
-        if self.grammar_compiler:
-            self.grammar_compiler.clear_cache()
+        self.grammar_compiler.clear_cache()