sglang 0.4.7__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registries, and is provided for informational purposes only.
- sglang/__init__.py +2 -0
- sglang/api.py +7 -0
- sglang/bench_serving.py +1 -1
- sglang/lang/interpreter.py +40 -1
- sglang/lang/ir.py +27 -0
- sglang/math_utils.py +8 -0
- sglang/srt/configs/model_config.py +6 -0
- sglang/srt/conversation.py +6 -0
- sglang/srt/disaggregation/base/__init__.py +1 -1
- sglang/srt/disaggregation/base/conn.py +25 -11
- sglang/srt/disaggregation/common/__init__.py +5 -1
- sglang/srt/disaggregation/common/utils.py +42 -0
- sglang/srt/disaggregation/decode.py +196 -51
- sglang/srt/disaggregation/fake/__init__.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +15 -9
- sglang/srt/disaggregation/mooncake/__init__.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +18 -13
- sglang/srt/disaggregation/nixl/__init__.py +6 -1
- sglang/srt/disaggregation/nixl/conn.py +17 -12
- sglang/srt/disaggregation/prefill.py +128 -43
- sglang/srt/disaggregation/utils.py +127 -123
- sglang/srt/entrypoints/engine.py +15 -1
- sglang/srt/entrypoints/http_server.py +13 -2
- sglang/srt/eplb_simulator/__init__.py +1 -0
- sglang/srt/eplb_simulator/reader.py +51 -0
- sglang/srt/layers/activation.py +19 -0
- sglang/srt/layers/attention/aiter_backend.py +15 -2
- sglang/srt/layers/attention/cutlass_mla_backend.py +38 -15
- sglang/srt/layers/attention/flashattention_backend.py +53 -64
- sglang/srt/layers/attention/flashinfer_backend.py +1 -2
- sglang/srt/layers/attention/flashinfer_mla_backend.py +22 -24
- sglang/srt/layers/attention/flashmla_backend.py +2 -10
- sglang/srt/layers/attention/triton_backend.py +119 -119
- sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
- sglang/srt/layers/attention/vision.py +51 -24
- sglang/srt/layers/communicator.py +23 -5
- sglang/srt/layers/linear.py +0 -4
- sglang/srt/layers/logits_processor.py +0 -12
- sglang/srt/layers/moe/ep_moe/kernels.py +6 -5
- sglang/srt/layers/moe/ep_moe/layer.py +42 -32
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +11 -37
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -4
- sglang/srt/layers/moe/topk.py +16 -8
- sglang/srt/layers/pooler.py +56 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
- sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +23 -80
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
- sglang/srt/layers/quantization/fp8_kernel.py +44 -15
- sglang/srt/layers/quantization/fp8_utils.py +87 -22
- sglang/srt/layers/radix_attention.py +2 -3
- sglang/srt/lora/lora_manager.py +79 -34
- sglang/srt/lora/mem_pool.py +4 -5
- sglang/srt/managers/cache_controller.py +2 -1
- sglang/srt/managers/io_struct.py +28 -4
- sglang/srt/managers/multimodal_processors/base_processor.py +2 -2
- sglang/srt/managers/multimodal_processors/vila.py +85 -0
- sglang/srt/managers/schedule_batch.py +39 -6
- sglang/srt/managers/scheduler.py +73 -17
- sglang/srt/managers/tokenizer_manager.py +29 -2
- sglang/srt/mem_cache/chunk_cache.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +4 -2
- sglang/srt/mem_cache/memory_pool.py +111 -407
- sglang/srt/mem_cache/memory_pool_host.py +380 -0
- sglang/srt/mem_cache/radix_cache.py +36 -12
- sglang/srt/model_executor/cuda_graph_runner.py +122 -55
- sglang/srt/model_executor/forward_batch_info.py +14 -5
- sglang/srt/model_executor/model_runner.py +6 -6
- sglang/srt/model_loader/loader.py +8 -1
- sglang/srt/models/bert.py +113 -13
- sglang/srt/models/deepseek_v2.py +113 -155
- sglang/srt/models/internvl.py +46 -102
- sglang/srt/models/roberta.py +117 -9
- sglang/srt/models/vila.py +305 -0
- sglang/srt/openai_api/adapter.py +162 -4
- sglang/srt/openai_api/protocol.py +37 -1
- sglang/srt/sampling/sampling_batch_info.py +24 -0
- sglang/srt/sampling/sampling_params.py +2 -0
- sglang/srt/server_args.py +318 -233
- sglang/srt/speculative/build_eagle_tree.py +1 -1
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +4 -3
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +5 -2
- sglang/srt/speculative/eagle_utils.py +389 -109
- sglang/srt/speculative/eagle_worker.py +134 -43
- sglang/srt/two_batch_overlap.py +4 -2
- sglang/srt/utils.py +58 -0
- sglang/test/attention/test_prefix_chunk_info.py +2 -0
- sglang/test/runners.py +38 -3
- sglang/test/test_block_fp8.py +1 -0
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
- sglang/test/test_block_fp8_ep.py +1 -0
- sglang/test/test_utils.py +3 -1
- sglang/utils.py +9 -0
- sglang/version.py +1 -1
- {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +5 -5
- {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +99 -88
- {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/__init__.py
CHANGED
@@ -15,6 +15,7 @@ from sglang.api import (
     get_server_info,
     image,
     select,
+    separate_reasoning,
     set_default_backend,
     system,
     system_begin,
@@ -54,6 +55,7 @@ __all__ = [
     "get_server_info",
     "image",
     "select",
+    "separate_reasoning",
     "set_default_backend",
     "system",
     "system_begin",
sglang/api.py
CHANGED
@@ -15,6 +15,7 @@ from sglang.lang.ir import (
     SglRoleBegin,
     SglRoleEnd,
     SglSelect,
+    SglSeparateReasoning,
     SglVideo,
 )
 
@@ -277,3 +278,9 @@ def assistant_begin():
 
 def assistant_end():
     return SglRoleEnd("assistant")
+
+
+def separate_reasoning(
+    expr: Optional[SglExpr] = None, model_type: Optional[str] = None
+):
+    return SglExprList([expr, SglSeparateReasoning(model_type, expr=expr)])
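A minimal usage sketch of the new separate_reasoning primitive (illustrative only: the program, the question, and the "deepseek-r1" model_type are assumptions, and a backend serving a matching reasoning model must already be attached via sgl.set_default_backend; streaming runs skip the split, per the interpreter change below):

import sglang as sgl

@sgl.function
def solve(s, question):
    s += sgl.user(question)
    # Wrapping gen() lets the interpreter split the raw output into
    # state["answer"] and state["answer_reasoning_content"].
    s += sgl.assistant(
        sgl.separate_reasoning(
            sgl.gen("answer", max_tokens=512), model_type="deepseek-r1"
        )
    )

state = solve.run(question="What is 3 * 17?")
print(state["answer"])                    # final answer only
print(state["answer_reasoning_content"])  # extracted reasoning trace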
sglang/bench_serving.py
CHANGED
@@ -399,7 +399,7 @@ async def async_request_sglang_generate(
                         # NOTE: Some completion API might have a last
                         # usage summary response without a token so we
                         # want to check a token was generated
-                        if data["text"]:
+                        if "text" in data and data["text"]:
                             timestamp = time.perf_counter()
                             generated_text = data["text"]
                             output_len = data["meta_info"]["completion_tokens"]
sglang/lang/interpreter.py
CHANGED
@@ -26,6 +26,7 @@ from sglang.lang.ir import (
     SglRoleBegin,
     SglRoleEnd,
     SglSelect,
+    SglSeparateReasoning,
     SglVariable,
     SglVarScopeBegin,
     SglVarScopeEnd,
@@ -472,6 +473,8 @@ class StreamExecutor:
                 self._execute_concatenate_and_append_kv_cache(other)
             else:
                 self._execute_concatenate_and_append_text(other)
+        elif isinstance(other, SglSeparateReasoning):
+            self._execute_separate_reasoning(other)
         else:
             raise ValueError(f"Unknown type: {type(other)}")
 
@@ -724,8 +727,44 @@
         src_rids = [state.stream_executor.sid for state in expr.states]
         self.backend.concatenate_and_append(src_rids, self.sid)
 
+    def _execute_separate_reasoning(self, expr: SglSeparateReasoning):
+        if self.stream:
+            # separate reasoning for stream is not supported
+            return
+
+        if (
+            self.cur_role == "assistant"
+            and self.num_api_spec_tokens is not None
+            and self.backend.is_chat_model
+        ):
+            # Execute the stored lazy generation calls
+            self.backend.role_end_generate(self)
+
+        from sglang.srt.reasoning_parser import ReasoningParser
+
+        reasoning_parser = ReasoningParser(expr.model_type)
+        other = expr.expr
+        if not other:
+            return
+        elif isinstance(other, SglGen) or isinstance(other, SglSelect):
+            cur_text = self.get_var(other.name)
+            reasoning, normal_text = reasoning_parser.parse_non_stream(cur_text)
+            reasoning_name = expr.process_name_for_reasoning(other.name)
+            self.set_var(other.name, normal_text)
+            self.set_var(reasoning_name, reasoning)
+            # the variable is ready to be used
+            self.variable_event[reasoning_name].set()
+            self.text_ = self.text_[: self.cur_role_begin_pos] + normal_text
+        elif isinstance(other, SglExprList):
+            for x in other.expr_list:
+                self._execute_separate_reasoning(
+                    SglSeparateReasoning(expr.model_type, x)
+                )
+
     def _init_var_event(self, expr):
-        if isinstance(expr, (SglGen, SglSelect, SglVarScopeBegin)):
+        if isinstance(
+            expr, (SglGen, SglSelect, SglVarScopeBegin, SglSeparateReasoning)
+        ):
             self.variable_event[expr.name] = threading.Event()
             if self.stream:
                 self.stream_var_event[expr.name] = threading.Event()
sglang/lang/ir.py
CHANGED
@@ -606,3 +606,30 @@ class SglCommitLazy(SglExpr):
 
     def __repr__(self):
         return "CommitLazy()"
+
+
+class SglSeparateReasoning(SglExpr):
+    def __init__(self, model_type: str, expr: SglExpr):
+        super().__init__()
+        self.model_type = model_type
+
+        self.expr = expr
+        self.name = None
+        self._process_expr(expr)
+
+    def process_name_for_reasoning(self, name):
+        if not name:
+            raise ValueError("name must be provided")
+        return f"{name}_reasoning_content"
+
+    def _process_expr(self, expr):
+        if isinstance(expr, SglGen):
+            self.name = self.process_name_for_reasoning(expr.name)
+        elif isinstance(expr, SglSelect):
+            self.name = self.process_name_for_reasoning(expr.name)
+        elif isinstance(expr, SglExprList):
+            for x in expr.expr_list:
+                self._process_expr(x)
+
+    def __repr__(self):
+        return f"SeparateReasoning(model_type={self.model_type}, name={self.name})"
sglang/math_utils.py
ADDED

sglang/srt/configs/model_config.py
CHANGED
@@ -550,6 +550,11 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal
         or "Qwen2ForRewardModel" in model_architectures
         or "Qwen2ForSequenceClassification" in model_architectures
         or "CLIPModel" in model_architectures
+        or "BertModel" in model_architectures
+        or "Contriever" in model_architectures
+        or "BertForSequenceClassification" in model_architectures
+        or "XLMRobertaModel" in model_architectures
+        or "XLMRobertaForSequenceClassification" in model_architectures
     ):
         return False
     else:
@@ -578,6 +583,7 @@ multimodal_model_archs = [
     "KimiVLForConditionalGeneration",
     "InternVLChatModel",
     "Phi4MMForCausalLM",
+    "VILAForConditionalGeneration",
 ]
 
 
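A quick sketch of what the added architecture names change (assumes the unchanged else branch of is_generation_model still reports decoder-only architectures as generative):

from sglang.srt.configs.model_config import is_generation_model

# BERT/XLM-R style encoders are now classified as non-generative (embedding path).
assert not is_generation_model(["BertModel"])
assert not is_generation_model(["XLMRobertaForSequenceClassification"])
# A decoder-only architecture is still treated as a generation model.
assert is_generation_model(["LlamaForCausalLM"])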
sglang/srt/conversation.py
CHANGED
@@ -983,3 +983,9 @@ def match_devstral(model_path: str):
 def match_phi_4_mm(model_path: str):
     if "phi-4-multimodal" in model_path.lower():
         return "phi-4-mm"
+
+
+@register_conv_template_matching_function
+def match_vila(model_path: str):
+    if re.search(r"vila", model_path, re.IGNORECASE):
+        return "chatml"
sglang/srt/disaggregation/base/conn.py
CHANGED
@@ -1,23 +1,32 @@
+from __future__ import annotations
+
 from abc import ABC, abstractmethod
-from typing import Optional
+from typing import TYPE_CHECKING, List, Optional
 
 import numpy as np
 import numpy.typing as npt
 
-from sglang.srt.disaggregation.utils import DisaggregationMode
 from sglang.srt.server_args import ServerArgs
 
+if TYPE_CHECKING:
+    from sglang.srt.disaggregation.utils import DisaggregationMode
+
 
 class KVArgs:
     engine_rank: int
-    kv_data_ptrs:
-    kv_data_lens:
-    kv_item_lens:
-    aux_data_ptrs:
-    aux_data_lens:
-    aux_item_lens:
+    kv_data_ptrs: List[int]
+    kv_data_lens: List[int]
+    kv_item_lens: List[int]
+    aux_data_ptrs: List[int]
+    aux_data_lens: List[int]
+    aux_item_lens: List[int]
     ib_device: str
+    ib_traffic_class: str
     gpu_id: int
+    # for different tp
+    decode_tp_size: int
+    # for pp prefill
+    prefill_pp_size: int
 
 
 class KVPoll:
@@ -45,7 +54,12 @@ class BaseKVSender(ABC):
 
     @abstractmethod
     def __init__(
-        self,
+        self,
+        mgr: BaseKVManager,
+        bootstrap_addr: str,
+        bootstrap_room: int,
+        dest_tp_ranks: List[int],
+        pp_rank: int,
     ): ...
 
     @abstractmethod
@@ -56,7 +70,7 @@ class BaseKVSender(ABC):
         ...
 
     @abstractmethod
-    def send(self, kv_indices: npt.NDArray[np.
+    def send(self, kv_indices: npt.NDArray[np.int32]):
         """
         Send the kv cache at the given kv indices to the decoder server
         """
@@ -88,7 +102,7 @@ class BaseKVReceiver(ABC):
     ): ...
 
     @abstractmethod
-    def init(self, kv_indices: npt.NDArray[np.
+    def init(self, kv_indices: npt.NDArray[np.int32], aux_index: Optional[int] = None):
         """
         Notify the prefill server about the kv indices and aux index
         """
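The KVArgs fields are now explicitly typed; a minimal sketch of populating them (every value below is an illustrative placeholder, not taken from the package):

from sglang.srt.disaggregation.base.conn import KVArgs

args = KVArgs()
args.engine_rank = 0
args.kv_data_ptrs = [0x7F0000000000]   # device pointers of the KV pool buffers
args.kv_data_lens = [1 << 20]          # byte length of each buffer
args.kv_item_lens = [4096]             # byte length of a single cache item
args.aux_data_ptrs, args.aux_data_lens, args.aux_item_lens = [], [], []
args.ib_device = "mlx5_0"
args.ib_traffic_class = ""
args.gpu_id = 0
args.decode_tp_size = 1    # for different tp
args.prefill_pp_size = 1   # for pp prefill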
sglang/srt/disaggregation/common/utils.py
ADDED
@@ -0,0 +1,42 @@
+import threading
+from collections import deque
+from typing import List, Tuple
+
+import numpy as np
+import numpy.typing as npt
+
+
+class FastQueue:
+    def __init__(self):
+        self._buf = deque()
+        self._cond = threading.Condition()
+
+    def put(self, item):
+        with self._cond:
+            self._buf.append(item)
+            # wake up a thread of wait()
+            self._cond.notify()
+
+    def get(self):
+        with self._cond:
+            # if queue is empty ,block until is notified()
+            while not self._buf:
+                self._cond.wait()
+            return self._buf.popleft()
+
+
+def group_concurrent_contiguous(
+    src_indices: npt.NDArray[np.int32], dst_indices: npt.NDArray[np.int32]
+) -> Tuple[List[npt.NDArray[np.int32]], List[npt.NDArray[np.int32]]]:
+    """Vectorised NumPy implementation."""
+    if src_indices.size == 0:
+        return [], []
+
+    brk = np.where((np.diff(src_indices) != 1) | (np.diff(dst_indices) != 1))[0] + 1
+    src_groups = np.split(src_indices, brk)
+    dst_groups = np.split(dst_indices, brk)
+
+    src_groups = [g.tolist() for g in src_groups]
+    dst_groups = [g.tolist() for g in dst_groups]
+
+    return src_groups, dst_groups
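A short usage sketch of the two helpers (inputs are made up; the import path assumes the file lands at sglang/srt/disaggregation/common/utils.py as listed above):

import threading

import numpy as np

from sglang.srt.disaggregation.common.utils import FastQueue, group_concurrent_contiguous

# group_concurrent_contiguous splits the paired index arrays wherever either
# side stops being contiguous, so each run can be transferred in one shot.
src = np.array([0, 1, 2, 10, 11], dtype=np.int32)
dst = np.array([5, 6, 7, 8, 9], dtype=np.int32)
src_groups, dst_groups = group_concurrent_contiguous(src, dst)
print(src_groups)  # [[0, 1, 2], [10, 11]]
print(dst_groups)  # [[5, 6, 7], [8, 9]]

# FastQueue.get() blocks until another thread puts an item.
q = FastQueue()
threading.Thread(target=lambda: q.put("kv-chunk")).start()
print(q.get())  # "kv-chunk"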