sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +7 -0
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +86 -75
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/lang/interpreter.py +40 -1
- sglang/lang/ir.py +27 -0
- sglang/math_utils.py +8 -0
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +33 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +52 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/base/__init__.py +1 -1
- sglang/srt/disaggregation/base/conn.py +25 -11
- sglang/srt/disaggregation/common/__init__.py +5 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/common/utils.py +42 -0
- sglang/srt/disaggregation/decode.py +261 -52
- sglang/srt/disaggregation/fake/__init__.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +16 -9
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/__init__.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +446 -149
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/__init__.py +6 -1
- sglang/srt/disaggregation/nixl/conn.py +134 -437
- sglang/srt/disaggregation/prefill.py +130 -43
- sglang/srt/disaggregation/utils.py +127 -86
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +116 -5
- sglang/srt/entrypoints/http_server.py +28 -4
- sglang/srt/eplb_simulator/__init__.py +1 -0
- sglang/srt/eplb_simulator/reader.py +51 -0
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/activation.py +19 -0
- sglang/srt/layers/attention/aiter_backend.py +503 -125
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
- sglang/srt/layers/attention/flashattention_backend.py +137 -63
- sglang/srt/layers/attention/flashinfer_backend.py +46 -3
- sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
- sglang/srt/layers/attention/flashmla_backend.py +2 -10
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +304 -65
- sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/vision.py +51 -24
- sglang/srt/layers/communicator.py +281 -197
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/linear.py +0 -4
- sglang/srt/layers/logits_processor.py +0 -12
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
- sglang/srt/layers/moe/ep_moe/layer.py +136 -72
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +60 -26
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/pooler.py +56 -0
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
- sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +156 -75
- sglang/srt/layers/quantization/fp8_utils.py +250 -69
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/radix_attention.py +2 -3
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +98 -39
- sglang/srt/lora/mem_pool.py +28 -21
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/cache_controller.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +43 -8
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/multimodal_processors/vila.py +85 -0
- sglang/srt/managers/schedule_batch.py +173 -38
- sglang/srt/managers/scheduler.py +376 -127
- sglang/srt/managers/tokenizer_manager.py +163 -19
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/chunk_cache.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +4 -2
- sglang/srt/mem_cache/memory_pool.py +111 -407
- sglang/srt/mem_cache/memory_pool_host.py +380 -0
- sglang/srt/mem_cache/radix_cache.py +36 -12
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +191 -113
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +52 -22
- sglang/srt/model_executor/model_runner.py +102 -62
- sglang/srt/model_loader/loader.py +8 -1
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/bert.py +113 -13
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +623 -290
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/internvl.py +46 -102
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/roberta.py +117 -9
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/models/vila.py +305 -0
- sglang/srt/openai_api/adapter.py +248 -28
- sglang/srt/openai_api/protocol.py +68 -3
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +37 -1
- sglang/srt/sampling/sampling_params.py +4 -1
- sglang/srt/server_args.py +381 -209
- sglang/srt/speculative/build_eagle_tree.py +9 -9
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
- sglang/srt/speculative/eagle_utils.py +440 -200
- sglang/srt/speculative/eagle_worker.py +234 -63
- sglang/srt/two_batch_overlap.py +637 -0
- sglang/srt/utils.py +187 -7
- sglang/test/attention/test_prefix_chunk_info.py +2 -0
- sglang/test/runners.py +54 -10
- sglang/test/send_one.py +4 -0
- sglang/test/test_block_fp8.py +1 -0
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
- sglang/test/test_block_fp8_ep.py +1 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +82 -7
- sglang/utils.py +9 -0
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/srt/openai_api/utils.py
ADDED
```diff
@@ -0,0 +1,172 @@
+"""
+Utility functions for OpenAI API adapter.
+"""
+
+import logging
+from typing import Dict, List
+
+import jinja2.nodes
+import transformers.utils.chat_template_utils as hf_chat_utils
+
+logger = logging.getLogger(__name__)
+
+# ============================================================================
+# JINJA TEMPLATE CONTENT FORMAT DETECTION
+# ============================================================================
+#
+# This adapts vLLM's approach for detecting chat template content format:
+# https://github.com/vllm-project/vllm/blob/02f0c7b220422792f5e53de2a7d51d2d3ff2df28/vllm/entrypoints/chat_utils.py#L296-L313
+# - Analyzes Jinja template AST to detect content iteration patterns
+# - 'openai' format: templates with {%- for content in message['content'] -%} loops
+# - 'string' format: templates that expect simple string content
+# - Processes content accordingly to match template expectations
+
+
+def _is_var_access(node: jinja2.nodes.Node, varname: str) -> bool:
+    """Check if node is a variable access like {{ varname }}"""
+    if isinstance(node, jinja2.nodes.Name):
+        return node.ctx == "load" and node.name == varname
+    return False
+
+
+def _is_attr_access(node: jinja2.nodes.Node, varname: str, key: str) -> bool:
+    """Check if node is an attribute access like {{ varname['key'] }} or {{ varname.key }}"""
+    if isinstance(node, jinja2.nodes.Getitem):
+        return (
+            _is_var_access(node.node, varname)
+            and isinstance(node.arg, jinja2.nodes.Const)
+            and node.arg.value == key
+        )
+
+    if isinstance(node, jinja2.nodes.Getattr):
+        return _is_var_access(node.node, varname) and node.attr == key
+
+    return False
+
+
+def _is_var_or_elems_access(
+    node: jinja2.nodes.Node,
+    varname: str,
+    key: str = None,
+) -> bool:
+    """Check if node accesses varname or varname[key] with filters/tests"""
+    if isinstance(node, jinja2.nodes.Filter):
+        return node.node is not None and _is_var_or_elems_access(
+            node.node, varname, key
+        )
+    if isinstance(node, jinja2.nodes.Test):
+        return _is_var_or_elems_access(node.node, varname, key)
+
+    if isinstance(node, jinja2.nodes.Getitem) and isinstance(
+        node.arg, jinja2.nodes.Slice
+    ):
+        return _is_var_or_elems_access(node.node, varname, key)
+
+    return _is_attr_access(node, varname, key) if key else _is_var_access(node, varname)
+
+
+def _try_extract_ast(chat_template: str):
+    """Try to parse the Jinja template into an AST"""
+    try:
+        jinja_compiled = hf_chat_utils._compile_jinja_template(chat_template)
+        return jinja_compiled.environment.parse(chat_template)
+    except Exception as e:
+        logger.debug(f"Error when compiling Jinja template: {e}")
+        return None
+
+
+def detect_template_content_format(chat_template: str) -> str:
+    """
+    Detect whether a chat template expects 'string' or 'openai' content format.
+
+    - 'string': content is a simple string (like DeepSeek templates)
+    - 'openai': content is a list of structured dicts (like Llama4 templates)
+
+    Detection logic:
+    - If template has loops like {%- for content in message['content'] -%} → 'openai'
+    - Otherwise → 'string'
+    """
+    jinja_ast = _try_extract_ast(chat_template)
+    if jinja_ast is None:
+        return "string"
+
+    try:
+        # Look for patterns like: {%- for content in message['content'] -%}
+        for loop_ast in jinja_ast.find_all(jinja2.nodes.For):
+            loop_iter = loop_ast.iter
+
+            # Check if iterating over message['content'] or similar
+            if _is_var_or_elems_access(loop_iter, "message", "content"):
+                return "openai"  # Found content iteration → openai format
+
+        return "string"  # No content loops found → string format
+    except Exception as e:
+        logger.debug(f"Error when parsing AST of Jinja template: {e}")
+        return "string"
+
+
+def process_content_for_template_format(
+    msg_dict: dict,
+    content_format: str,
+    image_data: list,
+    audio_data: list,
+    modalities: list,
+) -> dict:
+    """
+    Process message content based on detected template format.
+
+    Args:
+        msg_dict: Message dictionary with content
+        content_format: 'string' or 'openai' (detected via AST analysis)
+        image_data: List to append extracted image URLs
+        audio_data: List to append extracted audio URLs
+        modalities: List to append modalities
+
+    Returns:
+        Processed message dictionary
+    """
+    if not isinstance(msg_dict.get("content"), list):
+        # Already a string or None, no processing needed
+        return {k: v for k, v in msg_dict.items() if v is not None}
+
+    if content_format == "openai":
+        # OpenAI format: preserve structured content list, normalize types
+        processed_content_parts = []
+        for chunk in msg_dict["content"]:
+            if isinstance(chunk, dict):
+                chunk_type = chunk.get("type")
+
+                if chunk_type == "image_url":
+                    image_data.append(chunk["image_url"]["url"])
+                    if chunk.get("modalities"):
+                        modalities.append(chunk.get("modalities"))
+                    # Normalize to simple 'image' type for template compatibility
+                    processed_content_parts.append({"type": "image"})
+                elif chunk_type == "audio_url":
+                    audio_data.append(chunk["audio_url"]["url"])
+                    # Normalize to simple 'audio' type
+                    processed_content_parts.append({"type": "audio"})
+                else:
+                    # Keep other content as-is (text, etc.)
+                    processed_content_parts.append(chunk)
+
+        new_msg = {
+            k: v for k, v in msg_dict.items() if v is not None and k != "content"
+        }
+        new_msg["content"] = processed_content_parts
+        return new_msg
+
+    else:  # content_format == "string"
+        # String format: flatten to text only (for templates like DeepSeek)
+        text_parts = []
+        for chunk in msg_dict["content"]:
+            if isinstance(chunk, dict) and chunk.get("type") == "text":
+                text_parts.append(chunk["text"])
+            # Note: For string format, we ignore images/audio since the template
+            # doesn't expect structured content - multimodal placeholders would
+            # need to be inserted differently
+
+        new_msg = msg_dict.copy()
+        new_msg["content"] = " ".join(text_parts) if text_parts else ""
+        new_msg = {k: v for k, v in new_msg.items() if v is not None}
+        return new_msg
```
sglang/srt/operations.py
CHANGED
```diff
@@ -12,7 +12,7 @@ if _ENABLE_PROFILE:
 
 
 def execute_operations(inputs, operations):
-    stages = _convert_operations_to_stages(
+    stages = _convert_operations_to_stages(operations)
     executor = _StageExecutor("primary", stages, inputs=inputs)
     for _ in range(executor.num_stages):
         executor.next()
@@ -20,6 +20,37 @@ def execute_operations(inputs, operations):
     return executor.output
 
 
+def execute_overlapped_operations(
+    inputs_arr: Sequence,
+    operations_arr: Sequence,
+    delta_stages: Sequence[int],
+) -> Sequence:
+    # Make it explicit for clarity; if we need multi-batch overlap, this can be generalized
+    inputs_a, inputs_b = inputs_arr
+    operations_a, operations_b = operations_arr
+    delta_stage_a, delta_stage_b = delta_stages
+    assert delta_stage_a == 0
+    delta_stage = delta_stage_b
+
+    stages_a = _convert_operations_to_stages(operations_a)
+    stages_b = _convert_operations_to_stages(operations_b)
+    executor_a = _StageExecutor("a", stages_a, inputs=inputs_a)
+    executor_b = _StageExecutor("b", stages_b, inputs=inputs_b)
+
+    for _ in range(delta_stage):
+        executor_a.next()
+
+    for _ in range(executor_a.num_stages - delta_stage):
+        executor_a.next()
+        executor_b.next()
+
+    for _ in range(delta_stage):
+        executor_b.next()
+
+    assert executor_a.done and executor_b.done
+    return [executor_a.output, executor_b.output]
+
+
 class YieldOperation:
     pass
 
@@ -109,6 +140,9 @@ class _StateDict:
         for k, v in values.items():
             setattr(self, k, v)
 
+    def get(self, item):
+        return self._data.get(item)
+
     def clear(self, expect_keys: Sequence[str]):
         if set(self._data.keys()) != set(expect_keys):
             raise Exception(
@@ -119,6 +153,7 @@ class _StateDict:
 
 
 def _convert_operations_to_stages(operations: List[Operation]) -> List[Stage]:
+    operations = _decorate_operations(operations)
     operation_chunks = list(
         _chunk_by_separator(operations, lambda op: isinstance(op, YieldOperation))
    )
@@ -140,7 +175,7 @@ def _chunk_by_separator(
         yield pending_items
 
 
-def
+def _decorate_operations(operations: List[Operation], debug_name_prefix: str = ""):
     return [_decorate_operation(op, debug_name_prefix) for op in operations]
 
 
```
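`execute_overlapped_operations` is the scheduling core of the new two-batch-overlap (TBO) path: two micro-batches walk their stage lists, with batch "b" lagging batch "a" by `delta_stages`, so compute of one batch can hide communication of the other. The toy below mirrors only that warm-up / steady-state / drain loop; the string-producing stages are stand-ins for sglang's real `Operation` and `_StageExecutor` machinery.

```python
# Hedged illustration of the overlapped schedule above: while "a" runs stage i,
# "b" runs stage i - delta_stage. Stages here are plain callables, not sglang ops.
def run_overlapped(stages_a, stages_b, delta_stage):
    assert len(stages_a) == len(stages_b)
    trace = []

    it_a, it_b = iter(stages_a), iter(stages_b)
    for _ in range(delta_stage):                  # warm-up: only "a" advances
        trace.append(("a", next(it_a)()))
    for _ in range(len(stages_a) - delta_stage):  # steady state: both advance
        trace.append(("a", next(it_a)()))
        trace.append(("b", next(it_b)()))
    for _ in range(delta_stage):                  # drain: only "b" advances
        trace.append(("b", next(it_b)()))
    return trace


stages = [lambda i=i: f"stage{i}" for i in range(4)]
print(run_overlapped(stages, stages, delta_stage=2))
```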
sglang/srt/operations_strategy.py
CHANGED
```diff
@@ -1,31 +1,207 @@
+from dataclasses import dataclass
+from typing import List, Optional
+
 import torch
 
+from sglang.srt import operations
+from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardMode
+from sglang.srt.operations import Operation
+
+
+@dataclass
+class OperationsStrategy:
+    operations: List[Operation]
+    deep_gemm_num_sms: Optional[int] = None
+    tbo_delta_stages: Optional[int] = None
+
+    @classmethod
+    def concat(cls, items: List["OperationsStrategy"]) -> "OperationsStrategy":
+        return OperationsStrategy(
+            operations=[x for item in items for x in item.operations],
+            deep_gemm_num_sms=_assert_all_same(
+                [item.deep_gemm_num_sms for item in items]
+            ),
+            tbo_delta_stages=_assert_all_same(
+                [item.tbo_delta_stages for item in items]
+            ),
+        )
+
+    @staticmethod
+    def init_new_tbo(
+        layers: torch.nn.ModuleList,
+        forward_mode: ForwardMode,
+    ) -> "OperationsStrategy":
+        layer_name = layers[0].__class__.__name__
+        if layer_name == "DeepseekV2DecoderLayer":
+            return OperationsStrategy.concat(
+                [
+                    _compute_moe_deepseek_layer_operations_strategy_tbo(
+                        layer, forward_mode
+                    )
+                    for layer in layers
+                ]
+            )
+        elif layer_name == "Qwen3MoeDecoderLayer":
+            return OperationsStrategy.concat(
+                [
+                    _compute_moe_qwen3_layer_operations_strategy_tbo(
+                        layer, forward_mode
+                    )
+                    for layer in layers
+                ]
+            )
+        else:
+            raise NotImplementedError
+
+
+def _assert_all_same(items: List):
+    assert all(item == items[0] for item in items)
+    return items[0]
+
+
+# -------------------------------- Strategy for DeepSeek ---------------------------------------
+
+
+# TODO can refactor to make it more fancy if we have more complex strategies
+def _compute_moe_deepseek_layer_operations_strategy_tbo(
+    layer: torch.nn.Module,
+    forward_mode: ForwardMode,
+) -> OperationsStrategy:
+    assert layer.is_layer_sparse, "dense layer TBO not yet implemented"
+    if forward_mode == ForwardMode.EXTEND:
+        return _compute_moe_deepseek_blog_prefill(layer)
+    elif forward_mode == ForwardMode.DECODE:
+        return _compute_moe_deepseek_blog_decode(layer)
+    else:
+        raise NotImplementedError(f"Unsupported {forward_mode=}")
+
+
+def _compute_moe_deepseek_blog_prefill(layer):
+    device_properties = torch.cuda.get_device_properties(device="cuda")
+    total_num_sms = device_properties.multi_processor_count
+    deep_gemm_num_sms = total_num_sms - DeepEPConfig.get_instance().num_sms
+
+    return OperationsStrategy(
+        deep_gemm_num_sms=deep_gemm_num_sms,
+        tbo_delta_stages=0,
+        operations=[
+            layer.op_comm_prepare_attn,
+            layer.self_attn.op_prepare,
+            layer.self_attn.op_core,
+            layer.op_comm_prepare_mlp,
+            layer.mlp.op_gate,
+            layer.mlp.op_select_experts,
+            layer.mlp.op_dispatch_a,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_b,
+            layer.mlp.op_experts,
+            layer.mlp.op_combine_a,
+            operations.YieldOperation(),
+            layer.mlp.op_shared_experts,
+            layer.mlp.op_combine_b,
+            layer.mlp.op_output,
+            layer.op_comm_postprocess_layer,
+        ],
+    )
+
+
+def _compute_moe_deepseek_blog_decode(layer):
+    return OperationsStrategy(
+        deep_gemm_num_sms=None,
+        tbo_delta_stages=2,
+        operations=[
+            layer.op_comm_prepare_attn,
+            layer.self_attn.op_prepare,
+            operations.YieldOperation(),
+            layer.self_attn.op_core,
+            layer.op_comm_prepare_mlp,
+            layer.mlp.op_gate,
+            layer.mlp.op_select_experts,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_a,
+            layer.mlp.op_shared_experts,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_b,
+            layer.mlp.op_experts,
+            layer.mlp.op_combine_a,
+            operations.YieldOperation(),
+            layer.mlp.op_combine_b,
+            operations.YieldOperation(),
+            layer.mlp.op_output,
+            layer.op_comm_postprocess_layer,
+        ],
+    )
+
+
+# -------------------------------- Strategy for Qwen3 ---------------------------------------
+
 
-
+# TODO: unstable, current strategy is almost the same as DeepSeek, keep redundant code here for
+# convenience to adjust strategy
+def _compute_moe_qwen3_layer_operations_strategy_tbo(
     layer: torch.nn.Module,
-
-
-
+    forward_mode: ForwardMode,
+) -> OperationsStrategy:
+    assert layer.is_layer_sparse, "qwen3 moe only support sparse layers"
+    if forward_mode == ForwardMode.EXTEND:
+        return _compute_moe_qwen3_prefill(layer)
+    elif forward_mode == ForwardMode.DECODE:
+        return _compute_moe_qwen3_decode(layer)
+    else:
+        raise NotImplementedError(f"Unsupported {forward_mode=}")
+
+
+def _compute_moe_qwen3_prefill(layer):
+    device_properties = torch.cuda.get_device_properties(device="cuda")
+    total_num_sms = device_properties.multi_processor_count
+    deep_gemm_num_sms = total_num_sms - DeepEPConfig.get_instance().num_sms
+
+    return OperationsStrategy(
+        deep_gemm_num_sms=deep_gemm_num_sms,
+        tbo_delta_stages=0,
+        operations=[
+            layer.op_comm_prepare_attn,
+            layer.self_attn.op_prepare,
+            layer.self_attn.op_core,
+            layer.op_comm_prepare_mlp,
+            layer.mlp.op_gate,
+            layer.mlp.op_select_experts,
+            layer.mlp.op_dispatch_a,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_b,
+            layer.mlp.op_experts,
+            layer.mlp.op_combine_a,
+            operations.YieldOperation(),
+            layer.mlp.op_combine_b,
+            layer.mlp.op_output,
+            layer.op_comm_postprocess_layer,
+        ],
+    )
+
+
+def _compute_moe_qwen3_decode(layer):
+    return OperationsStrategy(
+        deep_gemm_num_sms=None,
+        tbo_delta_stages=2,
+        operations=[
             layer.op_comm_prepare_attn,
-            layer.
+            layer.self_attn.op_prepare,
+            operations.YieldOperation(),
+            layer.self_attn.op_core,
             layer.op_comm_prepare_mlp,
-            layer.
+            layer.mlp.op_gate,
+            layer.mlp.op_select_experts,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_a,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_b,
+            layer.mlp.op_experts,
+            layer.mlp.op_combine_a,
+            operations.YieldOperation(),
+            layer.mlp.op_combine_b,
+            layer.mlp.op_output,
             layer.op_comm_postprocess_layer,
-
-
-
-    return [
-        layer.op_comm_prepare_attn,
-        layer.op_attn,
-        layer.op_comm_prepare_mlp,
-        layer.mlp.op_gate,
-        layer.mlp.op_shared_experts,
-        layer.mlp.op_select_experts,
-        layer.mlp.op_dispatch_a,
-        layer.mlp.op_dispatch_b,
-        layer.mlp.op_experts,
-        layer.mlp.op_combine_a,
-        layer.mlp.op_combine_b,
-        layer.mlp.op_output,
-        layer.op_comm_postprocess_layer,
-    ]
+            operations.YieldOperation(),
+        ],
+    )
```
sglang/srt/sampling/sampling_batch_info.py
CHANGED
```diff
@@ -9,10 +9,13 @@ import torch
 
 import sglang.srt.sampling.penaltylib as penaltylib
 from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor
+from sglang.srt.sampling.sampling_params import TOP_K_ALL
+from sglang.srt.utils import merge_bias_tensor
 
 if TYPE_CHECKING:
     from sglang.srt.managers.schedule_batch import ScheduleBatch
 
+
 logger = logging.getLogger(__name__)
 
 
@@ -27,6 +30,12 @@ class SamplingBatchInfo:
     # Whether all requests use greedy sampling
     is_all_greedy: bool
 
+    # Whether any requests use top_p sampling
+    need_top_p_sampling: bool
+
+    # Whether any requests use top_k sampling
+    need_top_k_sampling: bool
+
     # Whether any request needs min_p sampling
     need_min_p_sampling: bool
 
@@ -55,6 +64,9 @@ class SamplingBatchInfo:
     # Device
     device: str = "cuda"
 
+    # Handle logit bias
+    logit_bias: Optional[torch.Tensor] = None
+
     @classmethod
     def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
         reqs = batch.reqs
@@ -77,6 +89,14 @@ class SamplingBatchInfo:
             [r.sampling_params.min_p for r in reqs], dtype=torch.float
         ).to(device, non_blocking=True)
 
+        logit_bias = None
+        if any(r.sampling_params.logit_bias is not None for r in reqs):
+            logit_bias = torch.zeros(len(reqs), vocab_size, device=device)
+            for i, r in enumerate(reqs):
+                if r.sampling_params.logit_bias is not None:
+                    for key, value in r.sampling_params.logit_bias.items():
+                        logit_bias[i, int(key)] = value
+
         # Check if any request has custom logit processor
         has_custom_logit_processor = (
             batch.enable_custom_logit_processor  # check the flag first.
@@ -133,6 +153,8 @@ class SamplingBatchInfo:
             top_ks=top_ks,
             min_ps=min_ps,
             is_all_greedy=all(r.sampling_params.top_k <= 1 for r in reqs),
+            need_top_p_sampling=any(r.sampling_params.top_p != 1.0 for r in reqs),
+            need_top_k_sampling=any(r.sampling_params.top_k != TOP_K_ALL for r in reqs),
             need_min_p_sampling=any(r.sampling_params.min_p > 0 for r in reqs),
             vocab_size=vocab_size,
             penalizer_orchestrator=penalizer_orchestrator,
@@ -140,6 +162,7 @@ class SamplingBatchInfo:
             custom_params=custom_params,
             custom_logit_processor=merged_custom_logit_processor,
             device=device,
+            logit_bias=logit_bias,
         )
         return ret
 
@@ -167,7 +190,7 @@ class SamplingBatchInfo:
 
         # Apply the mask
         for i, grammar in enumerate(self.grammars):
-            if grammar and not grammar.finished:
+            if grammar and not grammar.finished and not grammar.is_terminated():
                 grammar.fill_vocab_mask(self.vocab_mask, i)
 
         # Move the mask to the device if needed
@@ -196,6 +219,9 @@ class SamplingBatchInfo:
         if self.vocab_mask is not None:
             self.apply_mask_func(logits=logits, vocab_mask=self.vocab_mask)
 
+        if self.logit_bias is not None:
+            logits.add_(self.logit_bias)
+
     def filter_batch(self, keep_indices: List[int], keep_indices_device: torch.Tensor):
         self.penalizer_orchestrator.filter(keep_indices_device)
 
@@ -211,6 +237,9 @@ class SamplingBatchInfo:
             value = getattr(self, item, None)
             setattr(self, item, value[keep_indices_device])
 
+        if self.logit_bias is not None:
+            self.logit_bias = self.logit_bias[keep_indices_device]
+
     def _filter_batch_custom_logit_processor(
         self, keep_indices: List[int], keep_indices_device: torch.Tensor
     ):
@@ -308,4 +337,11 @@ class SamplingBatchInfo:
             setattr(self, item, torch.cat([self_val, other_val]))
 
         self.is_all_greedy &= other.is_all_greedy
+        self.need_top_p_sampling |= other.need_top_p_sampling
+        self.need_top_k_sampling |= other.need_top_k_sampling
         self.need_min_p_sampling |= other.need_min_p_sampling
+
+        # Merge logit bias
+        self.logit_bias = merge_bias_tensor(
+            self.logit_bias, other.logit_bias, len(self), len(other), self.device, 0.0
+        )
```
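The `logit_bias` handling added above boils down to building one dense `[batch, vocab]` tensor from the per-request `{token_id: bias}` dicts and adding it to the logits in place before sampling. A small self-contained sketch of that path (vocab size and bias values are made up):

```python
# Hedged sketch of the logit_bias path: dict keys are token-id strings, values are biases.
import torch

vocab_size = 8
requests_logit_bias = [{"2": 5.0, "5": -100.0}, None]  # second request has no bias

logit_bias = torch.zeros(len(requests_logit_bias), vocab_size)
for i, bias in enumerate(requests_logit_bias):
    if bias is not None:
        for key, value in bias.items():
            logit_bias[i, int(key)] = value

logits = torch.randn(len(requests_logit_bias), vocab_size)
logits.add_(logit_bias)  # token 2 boosted, token 5 effectively banned for request 0
```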
sglang/srt/sampling/sampling_params.py
CHANGED
```diff
@@ -16,6 +16,7 @@
 from typing import Any, Dict, List, Optional, Union
 
 _SAMPLING_EPS = 1e-6
+TOP_K_ALL = 1 << 30
 
 
 class SamplingParams:
@@ -51,6 +52,7 @@ class SamplingParams:
         no_stop_trim: bool = False,
         custom_params: Optional[Dict[str, Any]] = None,
         stream_interval: Optional[int] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
     ) -> None:
         self.max_new_tokens = max_new_tokens
         self.stop_strs = stop
@@ -77,6 +79,7 @@ class SamplingParams:
         self.no_stop_trim = no_stop_trim
         self.custom_params = custom_params
         self.stream_interval = stream_interval
+        self.logit_bias = logit_bias
 
         # Process some special cases
         if 0 <= self.temperature < _SAMPLING_EPS:
@@ -84,7 +87,7 @@ class SamplingParams:
             self.temperature = 1.0
             self.top_k = 1
         if self.top_k == -1:
-            self.top_k =
+            self.top_k = TOP_K_ALL  # whole vocabulary
 
     def verify(self):
         if self.temperature < 0.0:
```