sglang 0.4.5__py3-none-any.whl → 0.4.5.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -4
- sglang/bench_one_batch.py +23 -2
- sglang/bench_serving.py +6 -4
- sglang/lang/backend/anthropic.py +0 -4
- sglang/lang/backend/base_backend.py +1 -1
- sglang/lang/backend/openai.py +1 -1
- sglang/lang/backend/vertexai.py +0 -1
- sglang/lang/compiler.py +1 -7
- sglang/lang/tracer.py +3 -7
- sglang/srt/_custom_ops.py +0 -2
- sglang/srt/configs/model_config.py +37 -5
- sglang/srt/constrained/base_grammar_backend.py +26 -5
- sglang/srt/constrained/llguidance_backend.py +1 -0
- sglang/srt/constrained/outlines_backend.py +1 -0
- sglang/srt/constrained/outlines_jump_forward.py +14 -1
- sglang/srt/constrained/reasoner_grammar_backend.py +101 -0
- sglang/srt/constrained/triton_ops/bitmask_ops.py +141 -0
- sglang/srt/constrained/xgrammar_backend.py +27 -4
- sglang/srt/custom_op.py +0 -62
- sglang/srt/disaggregation/base/__init__.py +8 -0
- sglang/srt/disaggregation/base/conn.py +113 -0
- sglang/srt/disaggregation/decode.py +80 -11
- sglang/srt/disaggregation/mini_lb.py +58 -123
- sglang/srt/disaggregation/mooncake/__init__.py +6 -0
- sglang/srt/disaggregation/mooncake/conn.py +585 -0
- sglang/srt/disaggregation/mooncake/transfer_engine.py +77 -0
- sglang/srt/disaggregation/prefill.py +82 -22
- sglang/srt/disaggregation/utils.py +46 -0
- sglang/srt/entrypoints/EngineBase.py +53 -0
- sglang/srt/entrypoints/engine.py +36 -8
- sglang/srt/entrypoints/http_server.py +37 -8
- sglang/srt/entrypoints/http_server_engine.py +142 -0
- sglang/srt/entrypoints/verl_engine.py +42 -13
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/activation.py +6 -8
- sglang/srt/layers/attention/flashattention_backend.py +430 -257
- sglang/srt/layers/attention/flashinfer_backend.py +18 -9
- sglang/srt/layers/attention/torch_native_backend.py +6 -1
- sglang/srt/layers/attention/triton_backend.py +6 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +13 -2
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/dp_attention.py +2 -4
- sglang/srt/layers/elementwise.py +15 -2
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/linear.py +18 -3
- sglang/srt/layers/moe/ep_moe/layer.py +15 -29
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +145 -118
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/{E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +34 -34
- sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +46 -34
- sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
- sglang/srt/layers/moe/router.py +7 -1
- sglang/srt/layers/moe/topk.py +63 -45
- sglang/srt/layers/parameter.py +0 -2
- sglang/srt/layers/quantization/__init__.py +13 -5
- sglang/srt/layers/quantization/blockwise_int8.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +12 -2
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +72 -77
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +4 -7
- sglang/srt/layers/quantization/fp8.py +131 -136
- sglang/srt/layers/quantization/fp8_kernel.py +328 -46
- sglang/srt/layers/quantization/fp8_utils.py +206 -253
- sglang/srt/layers/quantization/kv_cache.py +43 -52
- sglang/srt/layers/quantization/modelopt_quant.py +271 -4
- sglang/srt/layers/quantization/moe_wna16.py +2 -0
- sglang/srt/layers/quantization/utils.py +5 -11
- sglang/srt/layers/quantization/w8a8_fp8.py +156 -4
- sglang/srt/layers/quantization/w8a8_int8.py +8 -7
- sglang/srt/layers/radix_attention.py +28 -1
- sglang/srt/layers/rotary_embedding.py +15 -3
- sglang/srt/layers/sampler.py +5 -10
- sglang/srt/lora/backend/base_backend.py +18 -2
- sglang/srt/lora/backend/flashinfer_backend.py +1 -1
- sglang/srt/lora/backend/triton_backend.py +1 -1
- sglang/srt/lora/layers.py +1 -1
- sglang/srt/lora/lora.py +1 -1
- sglang/srt/lora/lora_manager.py +1 -1
- sglang/srt/managers/detokenizer_manager.py +0 -1
- sglang/srt/managers/io_struct.py +255 -97
- sglang/srt/managers/mm_utils.py +7 -5
- sglang/srt/managers/multimodal_processor.py +0 -2
- sglang/srt/managers/multimodal_processors/base_processor.py +117 -79
- sglang/srt/managers/multimodal_processors/janus_pro.py +3 -1
- sglang/srt/managers/multimodal_processors/mllama4.py +21 -36
- sglang/srt/managers/schedule_batch.py +64 -25
- sglang/srt/managers/scheduler.py +80 -82
- sglang/srt/managers/tokenizer_manager.py +18 -3
- sglang/srt/managers/tp_worker.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +5 -1
- sglang/srt/mem_cache/memory_pool.py +21 -3
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +9 -6
- sglang/srt/model_executor/forward_batch_info.py +234 -15
- sglang/srt/model_executor/model_runner.py +67 -35
- sglang/srt/model_loader/loader.py +31 -4
- sglang/srt/model_loader/weight_utils.py +4 -2
- sglang/srt/models/baichuan.py +2 -0
- sglang/srt/models/bert.py +398 -0
- sglang/srt/models/chatglm.py +1 -0
- sglang/srt/models/commandr.py +1 -0
- sglang/srt/models/dbrx.py +1 -0
- sglang/srt/models/deepseek.py +2 -1
- sglang/srt/models/deepseek_nextn.py +74 -70
- sglang/srt/models/deepseek_v2.py +494 -366
- sglang/srt/models/exaone.py +1 -0
- sglang/srt/models/gemma.py +1 -0
- sglang/srt/models/gemma2.py +1 -0
- sglang/srt/models/gemma3_causal.py +1 -0
- sglang/srt/models/gpt2.py +1 -0
- sglang/srt/models/gpt_bigcode.py +1 -0
- sglang/srt/models/granite.py +1 -0
- sglang/srt/models/grok.py +1 -0
- sglang/srt/models/internlm2.py +1 -0
- sglang/srt/models/llama.py +6 -5
- sglang/srt/models/llama4.py +101 -34
- sglang/srt/models/minicpm.py +1 -0
- sglang/srt/models/minicpm3.py +30 -200
- sglang/srt/models/mixtral.py +1 -0
- sglang/srt/models/mixtral_quant.py +1 -0
- sglang/srt/models/mllama.py +51 -8
- sglang/srt/models/mllama4.py +102 -29
- sglang/srt/models/olmo.py +1 -0
- sglang/srt/models/olmo2.py +1 -0
- sglang/srt/models/olmoe.py +1 -0
- sglang/srt/models/phi3_small.py +1 -0
- sglang/srt/models/qwen.py +1 -0
- sglang/srt/models/qwen2.py +5 -1
- sglang/srt/models/qwen2_5_vl.py +35 -70
- sglang/srt/models/qwen2_moe.py +15 -13
- sglang/srt/models/qwen2_vl.py +27 -25
- sglang/srt/models/qwen3.py +335 -0
- sglang/srt/models/qwen3_moe.py +423 -0
- sglang/srt/models/stablelm.py +1 -0
- sglang/srt/models/xverse.py +1 -0
- sglang/srt/models/xverse_moe.py +1 -0
- sglang/srt/openai_api/adapter.py +4 -1
- sglang/srt/patch_torch.py +11 -0
- sglang/srt/reasoning_parser.py +0 -1
- sglang/srt/sampling/sampling_batch_info.py +2 -3
- sglang/srt/server_args.py +55 -19
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +4 -4
- sglang/srt/speculative/eagle_utils.py +1 -11
- sglang/srt/speculative/eagle_worker.py +10 -9
- sglang/srt/utils.py +136 -10
- sglang/test/attention/test_flashattn_backend.py +259 -221
- sglang/test/attention/test_flashattn_mla_backend.py +285 -0
- sglang/test/attention/test_prefix_chunk_info.py +224 -0
- sglang/test/runners.py +5 -1
- sglang/test/test_block_fp8.py +224 -0
- sglang/test/test_custom_ops.py +1 -1
- sglang/test/test_utils.py +19 -8
- sglang/version.py +1 -1
- {sglang-0.4.5.dist-info → sglang-0.4.5.post2.dist-info}/METADATA +15 -5
- {sglang-0.4.5.dist-info → sglang-0.4.5.post2.dist-info}/RECORD +162 -147
- {sglang-0.4.5.dist-info → sglang-0.4.5.post2.dist-info}/WHEEL +1 -1
- sglang/lang/__init__.py +0 -0
- sglang/srt/disaggregation/conn.py +0 -81
- sglang/srt/lora/backend/__init__.py +0 -25
- sglang/srt/server.py +0 -18
- {sglang-0.4.5.dist-info → sglang-0.4.5.post2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.5.dist-info → sglang-0.4.5.post2.dist-info}/top_level.txt +0 -0
sglang/srt/layers/rotary_embedding.py
CHANGED
@@ -11,10 +11,11 @@ from sglang.srt.custom_op import CustomOp
 from sglang.srt.utils import is_cuda_available
 
 _is_cuda_available = is_cuda_available()
+
 if _is_cuda_available:
     from sgl_kernel import apply_rope_with_cos_sin_cache_inplace
 else:
-    from vllm import _custom_ops as ops
+    from vllm._custom_ops import rotary_embedding as vllm_rotary_embedding
 
 
 def _rotate_neox(x: torch.Tensor) -> torch.Tensor:
@@ -159,7 +160,7 @@ class RotaryEmbedding(CustomOp):
             )
         else:
             self.cos_sin_cache = self.cos_sin_cache.to(query.device, dtype=query.dtype)
-            ops.rotary_embedding(
+            vllm_rotary_embedding(
                 positions,
                 query,
                 key,
@@ -645,7 +646,18 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
         cache = torch.cat((cos, sin), dim=-1)
         return cache
 
-    def forward(
+    def forward_hip(self, *args, **kwargs):
+        return self.forward_native(*args, **kwargs)
+
+    def forward(self, *args, **kwargs):
+        if torch.compiler.is_compiling():
+            return self.forward_native(*args, **kwargs)
+        if _is_cuda_available:
+            return self.forward_cuda(*args, **kwargs)
+        else:
+            return self.forward_native(*args, **kwargs)
+
+    def forward_native(
         self,
         positions: torch.Tensor,
         query: torch.Tensor,
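The new forward override above routes DeepseekScalingRotaryEmbedding to the native implementation while tracing under torch.compile and to the CUDA path otherwise. A minimal standalone sketch of that dispatch pattern (placeholder math, not sglang's real kernels; assumes a PyTorch version that provides torch.compiler.is_compiling):

import torch
from torch import nn


class DispatchingOp(nn.Module):
    """Sketch of the dispatch pattern above: prefer the CUDA path when it is
    available, but fall back to the native implementation under torch.compile,
    where a custom kernel may not be traceable."""

    def __init__(self, cuda_available: bool):
        super().__init__()
        self._cuda_available = cuda_available

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        return x * 2.0  # placeholder for the pure-PyTorch implementation

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        return x * 2.0  # placeholder for the fused-kernel implementation

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if torch.compiler.is_compiling():
            return self.forward_native(x)
        if self._cuda_available:
            return self.forward_cuda(x)
        return self.forward_native(x)


op = DispatchingOp(cuda_available=torch.cuda.is_available())
print(op(torch.ones(2)))  # tensor([2., 2.])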
sglang/srt/layers/sampler.py
CHANGED
@@ -93,28 +93,23 @@ class Sampler(nn.Module):
             ).clamp(min=torch.finfo(probs.dtype).min)
 
             max_top_k_round, batch_size = 32, probs.shape[0]
-            uniform_samples = torch.rand(
-                (max_top_k_round, batch_size), device=probs.device
-            )
             if sampling_info.need_min_p_sampling:
                 probs = top_k_renorm_prob(probs, sampling_info.top_ks)
                 probs = top_p_renorm_prob(probs, sampling_info.top_ps)
                 batch_next_token_ids = min_p_sampling_from_probs(
-                    probs, uniform_samples, sampling_info.min_ps
+                    probs, sampling_info.min_ps
                 )
             else:
-                batch_next_token_ids, success = top_k_top_p_sampling_from_probs(
+                # Check Nan will throw exception, only check when crash_on_warnings is True
+                check_nan = self.use_nan_detection and crash_on_warnings()
+                batch_next_token_ids = top_k_top_p_sampling_from_probs(
                     probs,
-                    uniform_samples,
                     sampling_info.top_ks,
                     sampling_info.top_ps,
                     filter_apply_order="joint",
+                    check_nan=check_nan,
                 )
 
-                if self.use_nan_detection and not torch.all(success):
-                    logger.warning("Detected errors during sampling!")
-                    batch_next_token_ids = torch.zeros_like(batch_next_token_ids)
-
         elif global_server_args_dict["sampling_backend"] == "pytorch":
             # A slower fallback implementation with torch native operations.
             batch_next_token_ids = top_k_top_p_min_p_sampling_from_probs_torch(
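The sampler change above drops the host-allocated uniform_samples tensor and the success flag; NaN handling moves into the fused kernel call via the new check_nan argument. A torch-native sketch of joint top-k/top-p sampling with an optional NaN check (standalone and illustrative; the real path calls the fused flashinfer/sgl-kernel op, and top_k_top_p_sample here is a made-up name):

import torch


def top_k_top_p_sample(
    probs: torch.Tensor,
    top_ks: torch.Tensor,
    top_ps: torch.Tensor,
    check_nan: bool = False,
) -> torch.Tensor:
    if check_nan and torch.isnan(probs).any():
        raise ValueError("Detected NaN in sampling probabilities")
    sorted_probs, sorted_idx = probs.sort(dim=-1, descending=True)
    cumsum = sorted_probs.cumsum(dim=-1)
    ranks = torch.arange(probs.shape[-1], device=probs.device).expand_as(probs)
    # Jointly drop tokens past top-k and tokens whose preceding cumulative
    # mass already exceeds top-p (the highest-probability token is always kept).
    mask = (ranks >= top_ks.unsqueeze(-1)) | (cumsum - sorted_probs > top_ps.unsqueeze(-1))
    sorted_probs = sorted_probs.masked_fill(mask, 0.0)
    sorted_probs = sorted_probs / sorted_probs.sum(dim=-1, keepdim=True)
    choice = torch.multinomial(sorted_probs, num_samples=1)
    return sorted_idx.gather(-1, choice).squeeze(-1)


probs = torch.softmax(torch.randn(2, 8), dim=-1)
print(top_k_top_p_sample(probs, torch.tensor([3, 5]), torch.tensor([0.9, 0.8])))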
sglang/srt/lora/backend/base_backend.py
CHANGED
@@ -75,7 +75,7 @@ class BaseLoRABackend:
         qkv_lora_a: torch.Tensor,
         qkv_lora_b: Union[torch.Tensor, Tuple[torch.Tensor]],
         *args,
-        **kwargs
+        **kwargs,
     ) -> torch.Tensor:
         """Run the lora pass for QKV Layer.
 
@@ -98,7 +98,7 @@ class BaseLoRABackend:
         gate_up_lora_a: torch.Tensor,
         gate_up_lora_b: Union[torch.Tensor, Tuple[torch.Tensor]],
         *args,
-        **kwargs
+        **kwargs,
     ) -> torch.Tensor:
         """Run the lora pass for gate_up_proj, usually attached to MergedColumnParallelLayer.
 
@@ -115,3 +115,19 @@ class BaseLoRABackend:
 
     def set_batch_info(self, batch_info: LoRABatchInfo):
         self.batch_info = batch_info
+
+
+def get_backend_from_name(name: str) -> BaseLoRABackend:
+    """
+    Get corresponding backend class from backend's name
+    """
+    if name == "triton":
+        from sglang.srt.lora.backend.triton_backend import TritonLoRABackend
+
+        return TritonLoRABackend
+    elif name == "flashinfer":
+        from sglang.srt.lora.backend.flashinfer_backend import FlashInferLoRABackend
+
+        return FlashInferLoRABackend
+    else:
+        raise ValueError(f"Invalid backend: {name}")
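Illustrative use of the helper added above (assumes sglang is importable; "triton" and "flashinfer" are the only names it accepts):

from sglang.srt.lora.backend.base_backend import get_backend_from_name

backend_cls = get_backend_from_name("triton")  # TritonLoRABackend
# get_backend_from_name("other") raises ValueError: Invalid backend: other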
sglang/srt/lora/backend/flashinfer_backend.py
CHANGED
@@ -2,7 +2,7 @@ from typing import Tuple
 
 import torch
 
-from sglang.srt.lora.backend import BaseLoRABackend
+from sglang.srt.lora.backend.base_backend import BaseLoRABackend
 from sglang.srt.lora.utils import LoRABatchInfo
 from sglang.srt.utils import is_flashinfer_available
 
sglang/srt/lora/layers.py
CHANGED
@@ -16,7 +16,7 @@ from sglang.srt.layers.linear import (
     RowParallelLinear,
 )
 from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
-from sglang.srt.lora.backend import BaseLoRABackend
+from sglang.srt.lora.backend.base_backend import BaseLoRABackend
 
 
 class BaseLayerWithLoRA(nn.Module):
sglang/srt/lora/lora.py
CHANGED
@@ -27,7 +27,7 @@ from torch import nn
 
 from sglang.srt.configs.load_config import LoadConfig
 from sglang.srt.hf_transformers_utils import AutoConfig
-from sglang.srt.lora.backend import BaseLoRABackend
+from sglang.srt.lora.backend.base_backend import BaseLoRABackend
 from sglang.srt.lora.lora_config import LoRAConfig
 from sglang.srt.model_loader.loader import DefaultModelLoader
 
sglang/srt/lora/lora_manager.py
CHANGED
@@ -22,7 +22,7 @@ import torch
 
 from sglang.srt.configs.load_config import LoadConfig
 from sglang.srt.hf_transformers_utils import AutoConfig
-from sglang.srt.lora.backend import BaseLoRABackend, get_backend_from_name
+from sglang.srt.lora.backend.base_backend import BaseLoRABackend, get_backend_from_name
 from sglang.srt.lora.layers import BaseLayerWithLoRA, get_lora_layer
 from sglang.srt.lora.lora import LoRAAdapter
 from sglang.srt.lora.lora_config import LoRAConfig
sglang/srt/managers/io_struct.py
CHANGED
@@ -20,7 +20,13 @@ import copy
 import uuid
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import Any, Dict, List, Literal, Optional, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union
+
+# handle serialization of Image for pydantic
+if TYPE_CHECKING:
+    from PIL.Image import Image
+else:
+    Image = Any
 
 from sglang.srt.managers.schedule_batch import BaseFinishReason
 from sglang.srt.sampling.sampling_params import SamplingParams
@@ -42,10 +48,16 @@ class GenerateReqInput:
     input_ids: Optional[Union[List[List[int]], List[int]]] = None
     # The embeddings for input_ids; one can specify either text or input_ids or input_embeds.
    input_embeds: Optional[Union[List[List[List[float]]], List[List[float]]]] = None
-    # The image input. It can be a file name, a url, or base64 encoded string.
-    # See also python/sglang/srt/utils.py:load_image.
-    image_data: Optional[Union[List[str], str]] = None
-    # The audio input. Like image data, it can be a file name, a url, or base64 encoded string.
+    # The image input. It can be an image instance, file name, URL, or base64 encoded string.
+    # Can be formatted as:
+    # - Single image for a single request
+    # - List of images (one per request in a batch)
+    # - List of lists of images (multiple images per request)
+    # See also python/sglang/srt/utils.py:load_image for more details.
+    image_data: Optional[
+        Union[List[List[Union[Image, str]]], List[Union[Image, str]], Union[Image, str]]
+    ] = None
+    # The audio input. Like image data, it can be a file name, a url, or base64 encoded string.
     audio_data: Optional[Union[List[str], str]] = None
     # The sampling_params. See descriptions below.
     sampling_params: Optional[Union[List[Dict], Dict]] = None
@@ -83,7 +95,36 @@ class GenerateReqInput:
     # Whether to return hidden states
     return_hidden_states: bool = False
 
+    # For disaggregated inference
+    bootstrap_host: Optional[str] = None
+    bootstrap_room: Optional[int] = None
+
     def normalize_batch_and_arguments(self):
+        """
+        Normalize the batch size and arguments for the request.
+
+        This method resolves various input formats and ensures all parameters
+        are properly formatted as either single values or batches depending on the input.
+        It also handles parallel sampling expansion and sets default values for
+        unspecified parameters.
+
+        Raises:
+            ValueError: If inputs are not properly specified (e.g., none or all of
+                text, input_ids, input_embeds are provided)
+        """
+        self._validate_inputs()
+        self._determine_batch_size()
+        self._handle_parallel_sampling()
+
+        if self.is_single:
+            self._normalize_single_inputs()
+        else:
+            self._normalize_batch_inputs()
+
+        self._validate_session_params()
+
+    def _validate_inputs(self):
+        """Validate that the input configuration is valid."""
         if (
             self.text is None and self.input_ids is None and self.input_embeds is None
         ) or (
@@ -95,7 +136,8 @@ class GenerateReqInput:
                 "Either text, input_ids or input_embeds should be provided."
             )
 
-        # Derive the batch size
+    def _determine_batch_size(self):
+        """Determine if this is a single example or a batch and the batch size."""
         if self.text is not None:
             if isinstance(self.text, str):
                 self.is_single = True
@@ -119,21 +161,25 @@ class GenerateReqInput:
                 self.is_single = True
                 self.batch_size = 1
             else:
+                self.is_single = False
                 self.batch_size = len(self.input_embeds)
 
-        # Handle parallel sampling
-        # When parallel sampling is used, we always treat the input as a batch.
+    def _handle_parallel_sampling(self):
+        """Handle parallel sampling parameters and adjust batch size if needed."""
+        # Determine parallel sample count
         if self.sampling_params is None:
             self.parallel_sample_num = 1
         elif isinstance(self.sampling_params, dict):
             self.parallel_sample_num = self.sampling_params.get("n", 1)
         else:  # isinstance(self.sampling_params, list):
             self.parallel_sample_num = self.sampling_params[0].get("n", 1)
-            assert all(
-                self.parallel_sample_num == sampling_params.get("n", 1)
-                for sampling_params in self.sampling_params
-            ), "The parallel_sample_num should be the same for all samples in sample params."
+            for sampling_params in self.sampling_params:
+                if self.parallel_sample_num != sampling_params.get("n", 1):
+                    raise ValueError(
+                        "The parallel_sample_num should be the same for all samples in sample params."
+                    )
 
+        # If using parallel sampling with a single example, convert to batch
         if self.parallel_sample_num > 1 and self.is_single:
             self.is_single = False
             if self.text is not None:
@@ -141,97 +187,190 @@ class GenerateReqInput:
             if self.input_ids is not None:
                 self.input_ids = [self.input_ids]
 
-        # Fill in default arguments
-        if self.is_single:
-            if self.sampling_params is None:
-                self.sampling_params = {}
-            if self.rid is None:
-                self.rid = uuid.uuid4().hex
-            if self.return_logprob is None:
-                self.return_logprob = False
-            if self.logprob_start_len is None:
-                self.logprob_start_len = -1
-            if self.top_logprobs_num is None:
-                self.top_logprobs_num = 0
-            if not self.token_ids_logprob:  # covers both None and []
-                self.token_ids_logprob = None
-        else:
-            if self.parallel_sample_num == 1:
-                num = self.batch_size
-            else:
-                # Expand parallel_sample_num
-                num = self.batch_size * self.parallel_sample_num
-
-            if not self.image_data:
-                self.image_data = [None] * num
-            elif not isinstance(self.image_data, list):
-                self.image_data = [self.image_data] * num
-            elif isinstance(self.image_data, list):
-                pass
-
-            if self.audio_data is None:
-                self.audio_data = [None] * num
-            elif not isinstance(self.audio_data, list):
-                self.audio_data = [self.audio_data] * num
-            elif isinstance(self.audio_data, list):
-                pass
-
-            if self.sampling_params is None:
-                self.sampling_params = [{}] * num
-            elif not isinstance(self.sampling_params, list):
-                self.sampling_params = [self.sampling_params] * num
-
-            if self.rid is None:
-                self.rid = [uuid.uuid4().hex for _ in range(num)]
-            else:
-                if not isinstance(self.rid, list):
-                    raise ValueError("The rid should be a list.")
-
-            if self.return_logprob is None:
-                self.return_logprob = [False] * num
-            elif not isinstance(self.return_logprob, list):
-                self.return_logprob = [self.return_logprob] * num
-            else:
-                assert self.parallel_sample_num == 1
-
-            if self.logprob_start_len is None:
-                self.logprob_start_len = [-1] * num
-            elif not isinstance(self.logprob_start_len, list):
-                self.logprob_start_len = [self.logprob_start_len] * num
-            else:
-                assert self.parallel_sample_num == 1
-
-            if self.top_logprobs_num is None:
-                self.top_logprobs_num = [0] * num
-            elif not isinstance(self.top_logprobs_num, list):
-                self.top_logprobs_num = [self.top_logprobs_num] * num
-            else:
-                assert self.parallel_sample_num == 1
-
-            if not self.token_ids_logprob:  # covers both None and []
-                self.token_ids_logprob = [None] * num
-            elif not isinstance(self.token_ids_logprob, list):
-                self.token_ids_logprob = [[self.token_ids_logprob] for _ in range(num)]
-            elif not isinstance(self.token_ids_logprob[0], list):
-                self.token_ids_logprob = [
-                    copy.deepcopy(self.token_ids_logprob) for _ in range(num)
-                ]
-            else:
-                assert self.parallel_sample_num == 1
-
-            if self.custom_logit_processor is None:
-                self.custom_logit_processor = [None] * num
-            elif not isinstance(self.custom_logit_processor, list):
-                self.custom_logit_processor = [self.custom_logit_processor] * num
-            else:
-                assert self.parallel_sample_num == 1
-
-        # Session id related
-        if self.session_params is not None:
-            assert isinstance(self.session_params, dict) or isinstance(
-                self.session_params[0], dict
-            )
+    def _normalize_single_inputs(self):
+        """Normalize inputs for a single example."""
+        if self.sampling_params is None:
+            self.sampling_params = {}
+        if self.rid is None:
+            self.rid = uuid.uuid4().hex
+        if self.return_logprob is None:
+            self.return_logprob = False
+        if self.logprob_start_len is None:
+            self.logprob_start_len = -1
+        if self.top_logprobs_num is None:
+            self.top_logprobs_num = 0
+        if not self.token_ids_logprob:  # covers both None and []
+            self.token_ids_logprob = None
+
+    def _normalize_batch_inputs(self):
+        """Normalize inputs for a batch of examples, including parallel sampling expansion."""
+        # Calculate expanded batch size
+        if self.parallel_sample_num == 1:
+            num = self.batch_size
+        else:
+            # Expand parallel_sample_num
+            num = self.batch_size * self.parallel_sample_num
+
+        # Expand input based on type
+        self._expand_inputs(num)
+        self._normalize_lora_paths(num)
+        self._normalize_image_data(num)
+        self._normalize_audio_data(num)
+        self._normalize_sampling_params(num)
+        self._normalize_rid(num)
+        self._normalize_logprob_params(num)
+        self._normalize_custom_logit_processor(num)
+
+    def _expand_inputs(self, num):
+        """Expand the main inputs (text, input_ids, input_embeds) for parallel sampling."""
+        if self.text is not None:
+            if not isinstance(self.text, list):
+                raise ValueError("Text should be a list for batch processing.")
+            self.text = self.text * self.parallel_sample_num
+        elif self.input_ids is not None:
+            if not isinstance(self.input_ids, list) or not isinstance(
+                self.input_ids[0], list
+            ):
+                raise ValueError(
+                    "input_ids should be a list of lists for batch processing."
+                )
+            self.input_ids = self.input_ids * self.parallel_sample_num
+        elif self.input_embeds is not None:
+            if not isinstance(self.input_embeds, list):
+                raise ValueError("input_embeds should be a list for batch processing.")
+            self.input_embeds = self.input_embeds * self.parallel_sample_num
+
+    def _normalize_lora_paths(self, num):
+        """Normalize LoRA paths for batch processing."""
+        if self.lora_path is not None:
+            if isinstance(self.lora_path, str):
+                self.lora_path = [self.lora_path] * num
+            elif isinstance(self.lora_path, list):
+                self.lora_path = self.lora_path * self.parallel_sample_num
+            else:
+                raise ValueError("lora_path should be a list or a string.")
+
+    def _normalize_image_data(self, num):
+        """Normalize image data for batch processing."""
+        if self.image_data is None:
+            self.image_data = [None] * num
+        elif not isinstance(self.image_data, list):
+            # Single image, convert to list of single-image lists
+            self.image_data = [[self.image_data]] * num
+            self.modalities = ["image"] * num
+        elif isinstance(self.image_data, list):
+            if len(self.image_data) != self.batch_size:
+                raise ValueError(
+                    "The length of image_data should be equal to the batch size."
+                )
+
+            self.modalities = []
+            if len(self.image_data) > 0 and isinstance(self.image_data[0], list):
+                # Already a list of lists, keep as is
+                for i in range(len(self.image_data)):
+                    if self.image_data[i] is None or self.image_data[i] == [None]:
+                        self.modalities.append(None)
+                    elif len(self.image_data[i]) == 1:
+                        self.modalities.append("image")
+                    elif len(self.image_data[i]) > 1:
+                        self.modalities.append("multi-images")
+                # Expand parallel_sample_num
+                self.image_data = self.image_data * self.parallel_sample_num
+                self.modalities = self.modalities * self.parallel_sample_num
+            else:
+                # List of images for a batch, wrap each in a list
+                wrapped_images = [[img] for img in self.image_data]
+                # Expand for parallel sampling
+                self.image_data = wrapped_images * self.parallel_sample_num
+                self.modalities = ["image"] * num
+
+    def _normalize_audio_data(self, num):
+        """Normalize audio data for batch processing."""
+        if self.audio_data is None:
+            self.audio_data = [None] * num
+        elif not isinstance(self.audio_data, list):
+            self.audio_data = [self.audio_data] * num
+        elif isinstance(self.audio_data, list):
+            self.audio_data = self.audio_data * self.parallel_sample_num
+
+    def _normalize_sampling_params(self, num):
+        """Normalize sampling parameters for batch processing."""
+        if self.sampling_params is None:
+            self.sampling_params = [{}] * num
+        elif isinstance(self.sampling_params, dict):
+            self.sampling_params = [self.sampling_params] * num
+        else:  # Already a list
+            self.sampling_params = self.sampling_params * self.parallel_sample_num
+
+    def _normalize_rid(self, num):
+        """Normalize request IDs for batch processing."""
+        if self.rid is None:
+            self.rid = [uuid.uuid4().hex for _ in range(num)]
+        elif not isinstance(self.rid, list):
+            raise ValueError("The rid should be a list for batch processing.")
+
+    def _normalize_logprob_params(self, num):
+        """Normalize logprob-related parameters for batch processing."""
+
+        # Helper function to normalize a parameter
+        def normalize_param(param, default_value, param_name):
+            if param is None:
+                return [default_value] * num
+            elif not isinstance(param, list):
+                return [param] * num
+            else:
+                if self.parallel_sample_num > 1:
+                    raise ValueError(
+                        f"Cannot use list {param_name} with parallel_sample_num > 1"
+                    )
+                return param
+
+        # Normalize each logprob parameter
+        self.return_logprob = normalize_param(
+            self.return_logprob, False, "return_logprob"
+        )
+        self.logprob_start_len = normalize_param(
+            self.logprob_start_len, -1, "logprob_start_len"
+        )
+        self.top_logprobs_num = normalize_param(
+            self.top_logprobs_num, 0, "top_logprobs_num"
+        )
+
+        # Handle token_ids_logprob specially due to its nested structure
+        if not self.token_ids_logprob:  # covers both None and []
+            self.token_ids_logprob = [None] * num
+        elif not isinstance(self.token_ids_logprob, list):
+            self.token_ids_logprob = [[self.token_ids_logprob] for _ in range(num)]
+        elif not isinstance(self.token_ids_logprob[0], list):
+            self.token_ids_logprob = [
+                copy.deepcopy(self.token_ids_logprob) for _ in range(num)
+            ]
+        elif self.parallel_sample_num > 1:
+            raise ValueError(
+                "Cannot use list token_ids_logprob with parallel_sample_num > 1"
+            )
+
+    def _normalize_custom_logit_processor(self, num):
+        """Normalize custom logit processor for batch processing."""
+        if self.custom_logit_processor is None:
+            self.custom_logit_processor = [None] * num
+        elif not isinstance(self.custom_logit_processor, list):
+            self.custom_logit_processor = [self.custom_logit_processor] * num
+        elif self.parallel_sample_num > 1:
+            raise ValueError(
+                "Cannot use list custom_logit_processor with parallel_sample_num > 1"
+            )
+
+    def _validate_session_params(self):
+        """Validate that session parameters are properly formatted."""
+        if self.session_params is not None:
+            if not isinstance(self.session_params, dict) and not isinstance(
+                self.session_params[0], dict
+            ):
+                raise ValueError("Session params must be a dict or a list of dicts.")
 
     def regenerate_rid(self):
+        """Generate a new request ID and return it."""
         self.rid = uuid.uuid4().hex
         return self.rid
 
@@ -300,13 +439,24 @@ class TokenizedGenerateReqInput:
     # Whether to return hidden states
     return_hidden_states: bool = False
 
+    # For disaggregated inference
+    bootstrap_host: Optional[str] = None
+    bootstrap_room: Optional[int] = None
+
 
 @dataclass
 class EmbeddingReqInput:
     # The input prompt. It can be a single prompt or a batch of prompts.
     text: Optional[Union[List[str], str]] = None
-    # The image input. It can be a file name, a url, or base64 encoded string.
-    image_data: Optional[Union[List[str], str]] = None
+    # The image input. It can be an image instance, file name, URL, or base64 encoded string.
+    # Can be formatted as:
+    # - Single image for a single request
+    # - List of images (one per request in a batch)
+    # - List of lists of images (multiple images per request)
+    # See also python/sglang/srt/utils.py:load_image for more details.
+    image_data: Optional[
+        Union[List[List[Union[Image, str]]], List[Union[Image, str]], Union[Image, str]]
+    ] = None
     # The token ids for text; one can either specify text or input_ids.
     input_ids: Optional[Union[List[List[int]], List[int]]] = None
     # The request id.
@@ -550,10 +700,17 @@ class UpdateWeightsFromDistributedReqOutput:
 
 @dataclass
 class UpdateWeightsFromTensorReqInput:
-    # The serialized named tensors
-    serialized_named_tensors: bytes
-    load_format: Optional[str]
-    flush_cache: bool
+    """Update model weights from tensor input.
+
+    - Tensors are serialized for transmission
+    - Data is structured in JSON for easy transmission over HTTP
+    """
+
+    serialized_named_tensors: List[Union[str, bytes]]
+    # Optional format specification for loading
+    load_format: Optional[str] = None
+    # Whether to flush the cache after updating weights
+    flush_cache: bool = True
 
 
 @dataclass
@@ -677,6 +834,7 @@ class ProfileReq:
     activities: Optional[List[str]] = None
     with_stack: Optional[bool] = None
    record_shapes: Optional[bool] = None
+    profile_id: Optional[str] = None
 
 
 @dataclass
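The refactored normalization above expands a batch by list multiplication whenever sampling_params requests n > 1 parallel samples. A standalone sketch of that expansion rule (plain Python, no sglang imports; the variable names are illustrative):

# Request i of the expanded batch maps to original request i % batch_size.
batch_size, n = 2, 3
text = ["hello", "world"]        # one prompt per request
image_data = ["a.png", "b.png"]  # one image per request

num = batch_size * n
wrapped_images = [[img] for img in image_data]  # wrap singles into lists

text = text * n
image_data = wrapped_images * n

assert len(text) == len(image_data) == num
print(text)        # ['hello', 'world', 'hello', 'world', 'hello', 'world']
print(image_data)  # [['a.png'], ['b.png'], ['a.png'], ['b.png'], ['a.png'], ['b.png']]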
sglang/srt/managers/mm_utils.py
CHANGED
@@ -1,7 +1,8 @@
 """
-Multimodal utils
+Multi-modality utils
 """
 
+import logging
 from abc import abstractmethod
 from typing import Callable, List, Optional, Tuple
 
@@ -12,11 +13,11 @@ from sglang.srt.managers.schedule_batch import (
     MultimodalDataItem,
     MultimodalInputs,
     global_server_args_dict,
-    logger,
 )
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.utils import print_warning_once
-
+
+logger = logging.getLogger(__name__)
 
 
 class MultiModalityDataPaddingPattern:
@@ -148,7 +149,8 @@ def get_embedding_and_mask(
         placeholder_tensor,
     ).unsqueeze(-1)
 
-    num_mm_tokens_in_input_ids = special_multimodal_mask.sum()
+    num_mm_tokens_in_input_ids = special_multimodal_mask.sum().item()
+
     if num_mm_tokens_in_input_ids != num_mm_tokens_in_embedding:
         logger.warning(
             f"Number of tokens in multimodal embedding does not match those in the input text."
@@ -172,7 +174,7 @@ def get_embedding_and_mask(
             embedding = embedding[-num_multimodal:, :]
         else:
             raise RuntimeError(
-                "Insufficient multimodal embedding length. This is an internal error"
+                f"Insufficient multimodal embedding length: {num_mm_tokens_in_input_ids=} vs {num_mm_tokens_in_embedding=}. This is an internal error"
             )
 
     return embedding, special_multimodal_mask
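A small sketch of the count check touched above, with a made-up placeholder token id: the boolean mask marks multimodal placeholder positions in input_ids, and the new .item() call moves the count to the host so the comparison and the expanded error message work with plain Python ints:

import torch

input_ids = torch.tensor([5, 5, 1000, 1000, 7])  # 1000 = hypothetical placeholder id
placeholder_tensor = torch.tensor([1000])

special_multimodal_mask = torch.isin(input_ids, placeholder_tensor).unsqueeze(-1)
num_mm_tokens_in_input_ids = special_multimodal_mask.sum().item()
print(num_mm_tokens_in_input_ids)  # 2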