sglang 0.4.8__py3-none-any.whl → 0.4.8.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/srt/configs/model_config.py +1 -0
- sglang/srt/conversation.py +1 -0
- sglang/srt/custom_op.py +7 -1
- sglang/srt/disaggregation/base/conn.py +2 -0
- sglang/srt/disaggregation/decode.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +289 -48
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -1
- sglang/srt/disaggregation/nixl/conn.py +94 -46
- sglang/srt/disaggregation/prefill.py +3 -2
- sglang/srt/disaggregation/utils.py +12 -11
- sglang/srt/entrypoints/engine.py +5 -3
- sglang/srt/entrypoints/openai/protocol.py +47 -4
- sglang/srt/entrypoints/openai/serving_chat.py +52 -76
- sglang/srt/entrypoints/openai/serving_completions.py +1 -0
- sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
- sglang/srt/layers/activation.py +7 -0
- sglang/srt/layers/attention/flashattention_backend.py +24 -14
- sglang/srt/layers/layernorm.py +15 -0
- sglang/srt/layers/linear.py +18 -1
- sglang/srt/layers/logits_processor.py +12 -3
- sglang/srt/layers/moe/ep_moe/layer.py +79 -12
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +19 -2
- sglang/srt/layers/moe/fused_moe_native.py +7 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +7 -2
- sglang/srt/layers/moe/fused_moe_triton/layer.py +73 -14
- sglang/srt/layers/moe/topk.py +26 -0
- sglang/srt/layers/quantization/fp8_utils.py +5 -4
- sglang/srt/layers/rotary_embedding.py +103 -11
- sglang/srt/layers/vocab_parallel_embedding.py +14 -1
- sglang/srt/managers/expert_distribution.py +21 -0
- sglang/srt/managers/io_struct.py +10 -2
- sglang/srt/managers/multimodal_processors/base_processor.py +44 -9
- sglang/srt/managers/multimodal_processors/gemma3n.py +97 -0
- sglang/srt/managers/schedule_batch.py +9 -1
- sglang/srt/managers/scheduler.py +42 -6
- sglang/srt/model_executor/cuda_graph_runner.py +1 -1
- sglang/srt/model_executor/model_runner.py +5 -2
- sglang/srt/model_loader/loader.py +45 -10
- sglang/srt/model_loader/weight_utils.py +89 -0
- sglang/srt/models/deepseek_nextn.py +7 -4
- sglang/srt/models/deepseek_v2.py +147 -4
- sglang/srt/models/gemma3n_audio.py +949 -0
- sglang/srt/models/gemma3n_causal.py +1009 -0
- sglang/srt/models/gemma3n_mm.py +511 -0
- sglang/srt/models/hunyuan.py +771 -0
- sglang/srt/server_args.py +16 -2
- sglang/srt/two_batch_overlap.py +4 -1
- sglang/srt/utils.py +71 -0
- sglang/version.py +1 -1
- {sglang-0.4.8.dist-info → sglang-0.4.8.post1.dist-info}/METADATA +1 -1
- {sglang-0.4.8.dist-info → sglang-0.4.8.post1.dist-info}/RECORD +54 -49
- {sglang-0.4.8.dist-info → sglang-0.4.8.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.8.dist-info → sglang-0.4.8.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.8.dist-info → sglang-0.4.8.post1.dist-info}/top_level.txt +0 -0
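The list above only shows per-file line deltas; to confirm which build is actually installed after upgrading, the distribution metadata can be queried with the standard library (the expected output assumes the post1 wheel shown here is the one installed):

```python
from importlib.metadata import version

# Prints "0.4.8.post1" once this wheel is installed.
print(version("sglang"))
```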
sglang/srt/layers/moe/fused_moe_triton/layer.py CHANGED

@@ -18,7 +18,14 @@ from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
 )
-from sglang.srt.utils import
+from sglang.srt.utils import (
+    _process_weight_after_loading,
+    cpu_has_amx_support,
+    get_bool_env_var,
+    is_cpu,
+    is_hip,
+    set_weight_attrs,
+)

 if torch.cuda.is_available():
     from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
@@ -28,6 +35,8 @@ else:
 import logging

 _is_hip = is_hip()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
 _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip

 if _use_aiter:
@@ -117,6 +126,11 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             requires_grad=False,
         )
         torch.cuda.empty_cache()
+
+        # Pack weight for get better performance on CPU
+        if _is_cpu and _is_cpu_amx_available:
+            _process_weight_after_loading(layer, ["w13_weight", "w2_weight"])
+
         return

     def apply(
@@ -248,19 +262,64 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         no_combine: bool = False,
         routed_scaling_factor: Optional[float] = None,
     ) -> torch.Tensor:
+        assert activation == "silu", f"activation = {activation} is not supported."
+
+        if (
+            getattr(layer, "use_intel_amx_backend", False)
+            and not apply_router_weight_on_input
+        ):
+            topk_weights, topk_ids = select_experts(
+                hidden_states=x,
+                router_logits=router_logits,
+                use_grouped_topk=use_grouped_topk,
+                top_k=top_k,
+                renormalize=renormalize,
+                topk_group=topk_group,
+                num_expert_group=num_expert_group,
+                num_fused_shared_experts=num_fused_shared_experts,
+                custom_routing_function=custom_routing_function,
+                correction_bias=correction_bias,
+                routed_scaling_factor=routed_scaling_factor,
+            )
+
+            # TODO: support apply_router_weight_on_input in the fused_experts_cpu kernel
+            return torch.ops.sgl_kernel.fused_experts_cpu(
+                x,
+                layer.w13_weight,
+                layer.w2_weight,
+                topk_weights.to(
+                    torch.float
+                ),  # TODO: the topk_weights of llama4 is computed via Llama4MoE:custom_routing_function and is bfloat16 while the kernel requires it to be float32
+                topk_ids,
+                True,  # inplace
+                False,  # use_int8_w8a8
+                False,  # use_fp8_w8a16
+                None,  # w1_scale
+                None,  # w2_scale
+                None,  # block_size
+                None,  # a1_scale
+                None,  # a2_scale
+                True,  # is_vnni
+            )
+        else:
+            return moe_forward_native(
+                layer,
+                x,
+                use_grouped_topk,
+                top_k,
+                router_logits,
+                renormalize,
+                topk_group,
+                num_expert_group,
+                num_fused_shared_experts,
+                custom_routing_function,
+                correction_bias,
+                activation,
+                apply_router_weight_on_input,
+                inplace,
+                no_combine,
+                routed_scaling_factor,
+            )

     def forward_tpu(self, *args, **kwargs) -> torch.Tensor:
         raise NotImplementedError("The TPU backend currently does not support MoE.")
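To make the intent of the new CPU branch easier to see, here is a minimal sketch (not SGLang's actual control flow) of when the fused CPU expert kernel is taken: the build must be a CPU build with AMX support (so the weights were packed after loading), the layer must carry the `use_intel_amx_backend` flag, and router weights must not be applied on the input. The predicate below is hypothetical; only `is_cpu` and `cpu_has_amx_support` come from the diff.

```python
from sglang.srt.utils import cpu_has_amx_support, is_cpu


def would_use_cpu_fused_experts(layer, apply_router_weight_on_input: bool) -> bool:
    """Hypothetical predicate mirroring the gating added above (illustrative only)."""
    if not (is_cpu() and cpu_has_amx_support()):
        # Weights were never packed by _process_weight_after_loading,
        # so the fused_experts_cpu kernel path is unavailable.
        return False
    return (
        getattr(layer, "use_intel_amx_backend", False)
        and not apply_router_weight_on_input
    )
```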
sglang/srt/layers/moe/topk.py CHANGED

@@ -30,6 +30,7 @@ from sglang.srt.managers.expert_location_dispatch import (
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.utils import (
     cpu_has_amx_support,
+    get_bool_env_var,
     get_compiler_backend,
     is_cpu,
     is_cuda,
@@ -38,6 +39,7 @@ from sglang.srt.utils import (

 _is_cuda = is_cuda()
 _is_hip = is_hip()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 _is_cpu_amx_available = cpu_has_amx_support()
 _is_cpu = is_cpu()

@@ -46,6 +48,11 @@ if _is_cuda:

 if _is_cuda or _is_hip:
     from sgl_kernel import topk_softmax
+if _use_aiter:
+    try:
+        from aiter import biased_grouped_topk as aiter_biased_grouped_topk
+    except ImportError:
+        raise ImportError("aiter is required when SGLANG_USE_AITER is set to True")


 def fused_topk_torch_native(
@@ -347,6 +354,25 @@ def biased_grouped_topk_gpu(
             topk_ids, expert_location_dispatch_info, num_token_non_padded
         )
         return topk_weights, topk_ids
+    elif _use_aiter:
+        token = gating_output.shape[0]
+        device = gating_output.device
+        assert (
+            hidden_states.shape[0] == gating_output.shape[0]
+        ), f"Number of tokens mismatch: hidden_states.shape[0] = {hidden_states.shape[0]}, gating_output.shape[0] = {gating_output.shape[0]}"
+        topk_weights = torch.empty((token, topk), dtype=torch.float32, device=device)
+        topk_ids = torch.empty((token, topk), dtype=torch.int32, device=device)
+        aiter_biased_grouped_topk(
+            gating_output,
+            correction_bias,
+            topk_weights,
+            topk_ids,
+            num_expert_group,
+            topk_group,
+            renormalize,
+            routed_scaling_factor,
+        )
+        return topk_weights, topk_ids
     else:
         biased_grouped_topk_fn = (
             torch.compile(
sglang/srt/layers/quantization/fp8_utils.py CHANGED

@@ -42,7 +42,10 @@ _is_fp8_fnuz = is_fp8_fnuz()
 _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip

 if _use_aiter:
+    import aiter
+    from aiter import gemm_a8w8_blockscale_CK, get_hip_quant
+
+    aiter_per1x128_quant = get_hip_quant(aiter.QuantType.per_1x128)

 if _is_cuda:
     from sgl_kernel import fp8_blockwise_scaled_mm, fp8_scaled_mm
@@ -271,9 +274,7 @@ def aiter_w8a8_block_fp8_linear(
     input_2d = input.view(-1, input.shape[-1])
     output_shape = [*input.shape[:-1], weight.shape[0]]

-    q_input, x_scale =
-        input_2d, block_size[1], column_major_scales=False
-    )
+    q_input, x_scale = aiter_per1x128_quant(input_2d, quant_dtype=aiter.dtypes.fp8)
     output = gemm_a8w8_blockscale_CK(
         q_input, weight, x_scale, weight_scale, dtype=input.dtype
     )
sglang/srt/layers/rotary_embedding.py CHANGED

@@ -8,16 +8,29 @@ import torch
 import torch.nn as nn

 from sglang.srt.custom_op import CustomOp
-from sglang.srt.utils import
+from sglang.srt.utils import (
+    cpu_has_amx_support,
+    get_bool_env_var,
+    is_cpu,
+    is_cuda,
+    is_hip,
+    is_npu,
+)

 _is_cuda = is_cuda()
 _is_hip = is_hip()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 _is_npu = is_npu()
 _is_cpu_amx_available = cpu_has_amx_support()
 _is_cpu = is_cpu()

 if _is_cuda:
     from sgl_kernel import apply_rope_with_cos_sin_cache_inplace
+if _use_aiter:
+    from aiter.rotary_embedding import get_rope as aiter_get_rope
+
+if is_npu():
+    import torch_npu


 def _rotate_neox(x: torch.Tensor) -> torch.Tensor:
@@ -152,6 +165,36 @@ class RotaryEmbedding(CustomOp):
         key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
         return query, key

+    def forward_npu(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """A PyTorch-npu implementation of forward()."""
+        import os
+
+        if get_bool_env_var("SGLANG_ENABLE_TORCH_COMPILE"):
+            return self.forward_native(positions, query, key, offsets)
+        else:
+            rotary_mode = "half"
+            if self.is_neox_style:
+                rotary_mode = "half"
+            else:
+                rotary_mode = "interleave"
+            mrope_section = [0, 0, 0]
+            query_out, key_out = torch_npu.npu_mrope(
+                positions,
+                query,
+                key,
+                self.cos_sin_cache,
+                self.head_size,
+                mrope_section=mrope_section,
+                rotary_mode=rotary_mode,
+            )
+            return query_out, key_out
+
     def forward_cpu(
         self,
         positions: torch.Tensor,
@@ -847,6 +890,43 @@ class Llama4VisionRotaryEmbedding(RotaryEmbedding):
         return query_out.type_as(query), key_out.type_as(key)


+class DynamicNTKAlphaRotaryEmbedding(RotaryEmbedding):
+    """RotaryEmbedding extended with Dynamic NTK scaling.
+
+    Credits to the Reddit users /u/bloc97 and /u/emozilla
+    """
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        scaling_alpha: float,
+        dtype: torch.dtype,
+    ) -> None:
+        self.scaling_alpha = scaling_alpha
+        super().__init__(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        )
+
+    def _compute_cos_sin_cache(self) -> torch.Tensor:
+        max_len = self.max_position_embeddings
+        base = self.base * self.scaling_alpha ** (
+            self.rotary_dim / (self.rotary_dim - 2)
+        )
+
+        inv_freq = self._compute_inv_freq(base)
+        t = torch.arange(max_len, dtype=torch.float)
+
+        freqs = torch.einsum("i,j -> ij", t, inv_freq)
+        cos = freqs.cos()
+        sin = freqs.sin()
+        cache = torch.cat((cos, sin), dim=-1)
+        return cache
+
+
 class MRotaryEmbedding(RotaryEmbedding):
     """Rotary Embedding with Multimodal Sections."""

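A short worked example of the base adjustment performed by `DynamicNTKAlphaRotaryEmbedding._compute_cos_sin_cache` above: the RoPE base is scaled by `alpha ** (rotary_dim / (rotary_dim - 2))` before the inverse frequencies are computed. The numbers below are illustrative, not taken from any real model config.

```python
# Illustrative values only.
base = 10000.0
rotary_dim = 128
scaling_alpha = 8.0

# Same formula as in _compute_cos_sin_cache above.
effective_base = base * scaling_alpha ** (rotary_dim / (rotary_dim - 2))
print(effective_base)  # roughly 8.27e4: a larger base stretches the usable context window
```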
@@ -1191,15 +1271,26 @@ def get_rope(
             )
         elif scaling_type == "dynamic":
             scaling_factor = rope_scaling["factor"]
+            if "alpha" in rope_scaling:
+                rotary_emb = DynamicNTKAlphaRotaryEmbedding(
+                    head_size,
+                    rotary_dim,
+                    max_position,
+                    base,
+                    is_neox_style,
+                    rope_scaling["alpha"],
+                    dtype,
+                )
+            else:
+                rotary_emb = DynamicNTKScalingRotaryEmbedding(
+                    head_size,
+                    rotary_dim,
+                    max_position,
+                    base,
+                    is_neox_style,
+                    scaling_factor,
+                    dtype,
+                )
         elif scaling_type == "yarn":
             scaling_factor = rope_scaling["factor"]
             original_max_position = rope_scaling["original_max_position_embeddings"]
@@ -1388,7 +1479,8 @@ def get_rope_wrapper(
     device: Optional[str] = None,
 ):
     if device != "cpu":
+        wrapper = aiter_get_rope if _use_aiter else get_rope
+        return wrapper(
             head_size,
             rotary_dim,
             max_position,
sglang/srt/layers/vocab_parallel_embedding.py CHANGED

@@ -20,10 +20,18 @@ from sglang.srt.layers.quantization.base_config import (
     QuantizeMethodBase,
     method_has_implemented_embedding,
 )
-from sglang.srt.utils import
+from sglang.srt.utils import (
+    PackWeightMethod,
+    cpu_has_amx_support,
+    is_cpu,
+    set_weight_attrs,
+)

 DEFAULT_VOCAB_PADDING_SIZE = 64

+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
+

 class UnquantizedEmbeddingMethod(QuantizeMethodBase):
     """Unquantized method for embeddings."""
@@ -549,6 +557,11 @@ class ParallelLMHead(VocabParallelEmbedding):
             use_presharded_weights=use_presharded_weights,
         )
         self.quant_config = quant_config
+
+        # We only support pack LMHead if it's not quantized. For LMHead with quant_config, the weight_name will be "qweight"
+        if self.quant_config is None and _is_cpu and _is_cpu_amx_available:
+            self.quant_method = PackWeightMethod(weight_names=["weight"])
+
         if bias:
             self.bias = Parameter(
                 torch.empty(self.num_embeddings_per_partition, dtype=params_dtype)
sglang/srt/managers/expert_distribution.py CHANGED

@@ -61,6 +61,10 @@ class ExpertDistributionRecorder(ABC):
     def with_debug_name(self, debug_name):
         yield

+    @contextmanager
+    def disable_this_region(self):
+        yield
+
     @contextmanager
     def with_forward_pass(self, forward_pass_id: int, forward_batch: ForwardBatch):
         yield
@@ -116,6 +120,7 @@ class _ExpertDistributionRecorderReal(ExpertDistributionRecorder):
         self._expert_location_metadata = expert_location_metadata

         self._recording = False
+        self._disable_all = False
         self._current_forward_pass_id = Withable()
         self._current_layer_idx = Withable()
         self._current_debug_name = Withable()
@@ -148,6 +153,16 @@ class _ExpertDistributionRecorderReal(ExpertDistributionRecorder):
         finally:
             self._on_forward_pass_end(forward_pass_id)

+    @contextmanager
+    def disable_this_region(self):
+        """Context manager to temporarily disable recording."""
+        previous_disable_all = self._disable_all
+        self._disable_all = True
+        try:
+            yield
+        finally:
+            self._disable_all = previous_disable_all
+
     def _on_forward_pass_start(self, forward_batch: ForwardBatch):
         if not self._recording:
             return
@@ -189,6 +204,8 @@ class _ExpertDistributionRecorderReal(ExpertDistributionRecorder):
         )

     def _on_hook(self, hook_name: str, **kwargs):
+        if self._disable_all:
+            return
         if not (self._recording or torch.cuda.is_current_stream_capturing()):
             return
         gatherer = self._single_pass_gatherers[
@@ -462,6 +479,10 @@ class _SelectExpertsSinglePassGatherer(_LayerBasedGpuSinglePassGatherer):
     def on_select_experts(self, layer_idx: int, topk_ids: torch.Tensor):
         topk_ids = topk_ids.flatten()
         mask = topk_ids != -1
+        assert self._data[layer_idx, :].shape == topk_ids.shape, (
+            "Shape mismatch between data and topk_ids."
+            "Selecting expert is not supported for multiple token prediction at the moment."
+        )
         self._data[layer_idx, :].scatter_add_(
             dim=0, index=topk_ids.masked_fill(~mask, 0).long(), src=mask.int()
         )
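The `disable_this_region()` context manager added above is a save/set/restore around a `_disable_all` flag that `_on_hook` checks first. A self-contained stand-in (not the SGLang class itself) showing the same pattern and why it nests and restores safely:

```python
from contextlib import contextmanager


class _RecorderSketch:
    """Stand-in for _ExpertDistributionRecorderReal; only the new flag logic is shown."""

    def __init__(self):
        self._disable_all = False
        self.events = []

    @contextmanager
    def disable_this_region(self):
        # Save/set/restore so nested regions re-enable recording correctly on exit.
        previous = self._disable_all
        self._disable_all = True
        try:
            yield
        finally:
            self._disable_all = previous

    def on_hook(self, name):
        if self._disable_all:
            return
        self.events.append(name)


rec = _RecorderSketch()
rec.on_hook("layer_0")
with rec.disable_this_region():
    rec.on_hook("draft_pass")  # dropped while the region is disabled
rec.on_hook("layer_1")
print(rec.events)  # ['layer_0', 'layer_1']
```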
sglang/srt/managers/io_struct.py CHANGED

@@ -319,8 +319,16 @@ class GenerateReqInput:
         """Normalize request IDs for batch processing."""
         if self.rid is None:
             self.rid = [uuid.uuid4().hex for _ in range(num)]
-        elif
+        elif isinstance(self.rid, str):
+            new_rids = [f"{self.rid}_{i}" for i in range(num)]
+            self.rid = new_rids
+        elif isinstance(self.rid, list):
+            if len(self.rid) != num:
+                raise ValueError(
+                    "The specified rids length mismatch with the batch_size for batch processing."
+                )
+        else:
+            raise ValueError("The rid should be a string or a list of strings.")

     def _normalize_logprob_params(self, num):
         """Normalize logprob-related parameters for batch processing."""
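The request-id normalization above fans a single string `rid` out to one id per batch element and validates an explicit list against the batch size. A self-contained sketch of the same rule (not the `GenerateReqInput` method itself):

```python
import uuid


def normalize_rids(rid, num):
    """Mirror of the rid handling added above, for illustration only."""
    if rid is None:
        return [uuid.uuid4().hex for _ in range(num)]
    if isinstance(rid, str):
        return [f"{rid}_{i}" for i in range(num)]
    if isinstance(rid, list):
        if len(rid) != num:
            raise ValueError("The specified rids length mismatch with the batch_size.")
        return rid
    raise ValueError("The rid should be a string or a list of strings.")


print(normalize_rids("req-42", 3))  # ['req-42_0', 'req-42_1', 'req-42_2']
```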
sglang/srt/managers/multimodal_processors/base_processor.py CHANGED

@@ -23,6 +23,7 @@ class MultimodalInputFormat(Enum):
     RAW_IMAGES = "raw_images"
     PRECOMPUTED_FEATURES = "precomputed_features"
     PIXEL_VALUES = "pixel_values"
+    AUDIO = "audio"


 @dataclasses.dataclass
@@ -441,10 +442,13 @@ class BaseMultimodalProcessor(ABC):
                 has_image = False
                 has_pixel_values = False
                 has_precomputed_features = False
+                has_audio = False

                 for mm_input in mm_inputs:
                     if isinstance(mm_input, Image.Image):
                         has_image = True
+                    elif isinstance(mm_input, np.ndarray):
+                        has_audio = True
                     elif isinstance(mm_input, dict):
                         if mm_input.get("precomputed_features", None) is not None:
                             has_precomputed_features = True
@@ -461,13 +465,13 @@ class BaseMultimodalProcessor(ABC):

                 # Validate format consistency
                 format_count = sum(
-                    [has_image, has_pixel_values, has_precomputed_features]
+                    [has_image, has_pixel_values, has_precomputed_features, has_audio]
                 )
                 if format_count > 1:
                     raise ValueError(
                         "Unsupported: mixture of multimodal input formats. "
                         f"Found formats: image={has_image}, pixel_values={has_pixel_values}, "
-                        f"precomputed_features={has_precomputed_features}"
+                        f"precomputed_features={has_precomputed_features}, audio={has_audio}"
                     )

                 if has_image:
@@ -476,6 +480,8 @@ class BaseMultimodalProcessor(ABC):
                     return MultimodalInputFormat.PRECOMPUTED_FEATURES
                 elif has_pixel_values:
                     return MultimodalInputFormat.PIXEL_VALUES
+                elif has_audio:
+                    return MultimodalInputFormat.AUDIO
                 else:
                     raise ValueError("No valid multimodal input format found")
             except Exception as e:
@@ -521,20 +527,47 @@ class BaseMultimodalProcessor(ABC):
             input_ids = tokenize_text(base_output.input_text)
             return combined_mm_item, input_ids

+        def process_audio(
+            base_output: BaseMultiModalProcessorOutput,
+        ) -> Tuple[MultimodalDataItem, torch.Tensor]:
+            """Process inputs with audio."""
+            ret = self.process_mm_data(
+                input_text=base_output.input_text,
+                audio=base_output.audios,  # Note: "audio" is for gemma3n only
+            )
+            combined_mm_item = MultimodalDataItem(modality=Modality.AUDIO)
+            for key, value in ret.items():
+                if key != "input_ids" and hasattr(combined_mm_item, key):
+                    setattr(combined_mm_item, key, value)
+            input_ids = ret["input_ids"].flatten()
+            return combined_mm_item, input_ids
+
         def finalize_mm_item(
             combined_mm_item: MultimodalDataItem, input_ids: torch.Tensor
         ) -> MultimodalDataItem:
             """Apply common post-processing to the multimodal item."""
-            combined_mm_item.
+            if combined_mm_item.modality in [Modality.IMAGE, Modality.MULTI_IMAGES]:
+                combined_mm_item.image_offsets = self.get_mm_items_offset(
+                    input_ids=input_ids,
+                    mm_token_id=self.IM_TOKEN_ID,
+                )
+            elif combined_mm_item.modality == Modality.AUDIO:
+                combined_mm_item.audio_offsets = self.get_mm_items_offset(
+                    input_ids=input_ids,
+                    mm_token_id=self.AUDIO_TOKEN_ID,
+                )
+            elif combined_mm_item.modality == Modality.VIDEO:
+                combined_mm_item.video_offsets = self.get_mm_items_offset(
+                    input_ids=input_ids,
+                    mm_token_id=self.VIDEO_TOKEN_ID,
+                )
+            else:
+                raise ValueError(f"Unknown modality: {combined_mm_item.modality}")
             return combined_mm_item

-        # Main logic
-        mm_inputs = base_output.images
+        # Main logic - determine input type and handle text-only case
+        mm_inputs = base_output.images or base_output.audios
         if not mm_inputs:
-            # Return text-only case
             input_ids = tokenize_text(base_output.input_text)
             return None, input_ids

@@ -548,6 +581,8 @@ class BaseMultimodalProcessor(ABC):
             combined_mm_item, input_ids = process_precomputed_features(base_output)
         elif input_format == MultimodalInputFormat.PIXEL_VALUES:
             combined_mm_item, input_ids = process_pixel_values(base_output)
+        elif input_format == MultimodalInputFormat.AUDIO:
+            combined_mm_item, input_ids = process_audio(base_output)
         else:
             raise ValueError(f"Unknown input format: {input_format}")

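With the changes above, a decoded audio waveform (a NumPy array) is now recognized as its own input format alongside PIL images, pixel values, and precomputed features, and mixed formats are still rejected. A small self-contained sketch of just that classification rule (not the SGLang helper itself):

```python
import numpy as np
from PIL import Image


def classify(mm_inputs):
    """Illustrative version of the format detection extended above."""
    has_image = any(isinstance(x, Image.Image) for x in mm_inputs)
    has_audio = any(isinstance(x, np.ndarray) for x in mm_inputs)
    if has_image and has_audio:
        raise ValueError("Unsupported: mixture of multimodal input formats.")
    if has_audio:
        return "audio"
    return "raw_images" if has_image else "text_only"


print(classify([np.zeros(16000, dtype=np.float32)]))  # 'audio'
```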
sglang/srt/managers/multimodal_processors/gemma3n.py ADDED

@@ -0,0 +1,97 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import re
+from typing import Dict, List, Optional, Union
+
+from sglang.srt.managers.multimodal_processor import (
+    BaseMultimodalProcessor as SGLangBaseProcessor,
+)
+from sglang.srt.managers.multimodal_processors.base_processor import (
+    MultimodalSpecialTokens,
+)
+from sglang.srt.models.gemma3n_mm import Gemma3nForConditionalGeneration
+
+
+class Gemma3nSGLangProcessor(SGLangBaseProcessor):
+    """Multimodal processor for Gemma3n supporting image and audio inputs."""
+
+    models = [Gemma3nForConditionalGeneration]
+
+    def __init__(self, hf_config, server_args, _processor):
+        super().__init__(hf_config, server_args, _processor)
+
+        self.IMAGE_TOKEN = "<image_soft_token>"
+        self.IMAGE_TOKEN_REGEX = re.compile(
+            r"<start_of_image>(?:(?:<image_soft_token>)*<end_of_image>)?"
+        )
+
+        self.AUDIO_TOKEN = "<audio_soft_token>"
+        self.AUDIO_TOKEN_REGEX = re.compile(
+            r"<start_of_audio>(?:(?:<audio_soft_token>)*<end_of_audio>)?"
+        )
+
+        self.IM_TOKEN_ID = hf_config.image_token_id
+        self.IM_START_TOKEN_ID = hf_config.boi_token_id
+        self.IM_END_TOKEN_ID = hf_config.eoi_token_id
+
+        self.AUDIO_TOKEN_ID = hf_config.audio_token_id
+        self.AUDIO_START_TOKEN_ID = hf_config.boa_token_id
+        self.AUDIO_END_TOKEN_ID = hf_config.eoa_token_id
+
+    async def process_mm_data_async(
+        self,
+        image_data: Optional[List[Union[str, bytes, Dict]]] = None,
+        audio_data: Optional[List[Union[str, bytes, Dict]]] = None,
+        input_text: str = "",
+        request_obj=None,
+        max_req_input_len: int = 0,
+        *args,
+        **kwargs,
+    ):
+        """Process multimodal data including images and audio."""
+
+        audio_data = request_obj.audio_data
+        if not image_data and not audio_data:
+            return None
+
+        if isinstance(image_data, str):
+            image_data = [image_data]
+
+        if isinstance(audio_data, str):
+            audio_data = [audio_data]
+
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            audio_data=audio_data,
+            max_req_input_len=max_req_input_len,
+            multimodal_tokens=MultimodalSpecialTokens(
+                image_token=self.IMAGE_TOKEN,
+                image_token_regex=self.IMAGE_TOKEN_REGEX,
+                audio_token=self.AUDIO_TOKEN,
+                audio_token_regex=self.AUDIO_TOKEN_REGEX,
+            ),
+        )
+
+        combined_mm_item, input_ids = self.process_and_combine_mm_data(base_output)
+
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": [combined_mm_item] if combined_mm_item is not None else [],
+            "im_start_id": self.IM_START_TOKEN_ID,
+            "im_end_id": self.IM_END_TOKEN_ID,
+            "audio_start_id": self.AUDIO_START_TOKEN_ID,
+            "audio_end_id": self.AUDIO_END_TOKEN_ID,
+        }