PyPI - sglang - Versions diffs - 0.4.8.post1__py3-none-any.whl → 0.4.9.post1__py3-none-any.whl - Mend

sglang 0.4.8.post1__py3-none-any.whl → 0.4.9.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (158)

sglang/bench_one_batch_server.py +17 -2
sglang/bench_serving.py +170 -24
sglang/srt/configs/internvl.py +4 -2
sglang/srt/configs/janus_pro.py +1 -1
sglang/srt/configs/model_config.py +60 -1
sglang/srt/configs/update_config.py +119 -0
sglang/srt/conversation.py +69 -1
sglang/srt/disaggregation/decode.py +21 -5
sglang/srt/disaggregation/mooncake/conn.py +35 -4
sglang/srt/disaggregation/nixl/conn.py +6 -6
sglang/srt/disaggregation/prefill.py +2 -2
sglang/srt/disaggregation/utils.py +1 -1
sglang/srt/distributed/parallel_state.py +44 -17
sglang/srt/entrypoints/EngineBase.py +8 -0
sglang/srt/entrypoints/engine.py +40 -6
sglang/srt/entrypoints/http_server.py +111 -24
sglang/srt/entrypoints/http_server_engine.py +1 -1
sglang/srt/entrypoints/openai/protocol.py +4 -2
sglang/srt/eplb/__init__.py +0 -0
sglang/srt/{managers → eplb}/eplb_algorithms/__init__.py +1 -1
sglang/srt/{managers → eplb}/eplb_manager.py +2 -4
sglang/srt/{eplb_simulator → eplb/eplb_simulator}/reader.py +1 -1
sglang/srt/{managers → eplb}/expert_distribution.py +1 -5
sglang/srt/{managers → eplb}/expert_location.py +1 -1
sglang/srt/{managers → eplb}/expert_location_dispatch.py +1 -1
sglang/srt/{model_executor → eplb}/expert_location_updater.py +17 -1
sglang/srt/hf_transformers_utils.py +2 -1
sglang/srt/layers/activation.py +2 -2
sglang/srt/layers/amx_utils.py +86 -0
sglang/srt/layers/attention/ascend_backend.py +219 -0
sglang/srt/layers/attention/flashattention_backend.py +32 -9
sglang/srt/layers/attention/tbo_backend.py +37 -9
sglang/srt/layers/communicator.py +20 -2
sglang/srt/layers/dp_attention.py +9 -3
sglang/srt/layers/elementwise.py +76 -12
sglang/srt/layers/flashinfer_comm_fusion.py +202 -0
sglang/srt/layers/layernorm.py +26 -0
sglang/srt/layers/linear.py +84 -14
sglang/srt/layers/logits_processor.py +4 -4
sglang/srt/layers/moe/cutlass_w4a8_moe.py +215 -0
sglang/srt/layers/moe/ep_moe/kernels.py +81 -8
sglang/srt/layers/moe/ep_moe/layer.py +176 -15
sglang/srt/layers/moe/ep_moe/token_dispatcher.py +23 -17
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +3 -2
sglang/srt/layers/moe/fused_moe_triton/layer.py +211 -74
sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +176 -0
sglang/srt/layers/moe/router.py +60 -22
sglang/srt/layers/moe/topk.py +10 -28
sglang/srt/layers/parameter.py +67 -7
sglang/srt/layers/quantization/__init__.py +2 -0
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +1 -1
sglang/srt/layers/quantization/fp8.py +72 -7
sglang/srt/layers/quantization/fp8_kernel.py +1 -1
sglang/srt/layers/quantization/fp8_utils.py +1 -2
sglang/srt/layers/quantization/gptq.py +5 -1
sglang/srt/layers/quantization/modelopt_quant.py +244 -1
sglang/srt/layers/quantization/moe_wna16.py +1 -1
sglang/srt/layers/quantization/quant_utils.py +166 -0
sglang/srt/layers/quantization/w4afp8.py +264 -0
sglang/srt/layers/quantization/w8a8_int8.py +52 -1
sglang/srt/layers/rotary_embedding.py +2 -2
sglang/srt/layers/vocab_parallel_embedding.py +20 -10
sglang/srt/lora/lora.py +4 -5
sglang/srt/lora/lora_manager.py +73 -20
sglang/srt/lora/triton_ops/gate_up_lora_b.py +30 -19
sglang/srt/lora/triton_ops/qkv_lora_b.py +30 -19
sglang/srt/lora/triton_ops/sgemm_lora_a.py +27 -11
sglang/srt/lora/triton_ops/sgemm_lora_b.py +27 -15
sglang/srt/managers/cache_controller.py +41 -195
sglang/srt/managers/configure_logging.py +1 -1
sglang/srt/managers/io_struct.py +58 -14
sglang/srt/managers/mm_utils.py +77 -61
sglang/srt/managers/multimodal_processor.py +2 -6
sglang/srt/managers/multimodal_processors/qwen_audio.py +94 -0
sglang/srt/managers/schedule_batch.py +78 -85
sglang/srt/managers/scheduler.py +130 -64
sglang/srt/managers/scheduler_output_processor_mixin.py +8 -2
sglang/srt/managers/session_controller.py +12 -3
sglang/srt/managers/tokenizer_manager.py +314 -103
sglang/srt/managers/tp_worker.py +13 -1
sglang/srt/managers/tp_worker_overlap_thread.py +8 -0
sglang/srt/mem_cache/allocator.py +290 -0
sglang/srt/mem_cache/chunk_cache.py +34 -2
sglang/srt/mem_cache/hiradix_cache.py +2 -0
sglang/srt/mem_cache/memory_pool.py +402 -66
sglang/srt/mem_cache/memory_pool_host.py +6 -109
sglang/srt/mem_cache/multimodal_cache.py +3 -0
sglang/srt/mem_cache/radix_cache.py +8 -4
sglang/srt/model_executor/cuda_graph_runner.py +2 -1
sglang/srt/model_executor/forward_batch_info.py +17 -4
sglang/srt/model_executor/model_runner.py +297 -56
sglang/srt/model_loader/loader.py +41 -0
sglang/srt/model_loader/weight_utils.py +72 -4
sglang/srt/models/deepseek_nextn.py +1 -3
sglang/srt/models/deepseek_v2.py +195 -45
sglang/srt/models/deepseek_vl2.py +3 -5
sglang/srt/models/gemma3_causal.py +1 -2
sglang/srt/models/gemma3n_causal.py +4 -3
sglang/srt/models/gemma3n_mm.py +4 -20
sglang/srt/models/hunyuan.py +1 -1
sglang/srt/models/kimi_vl.py +1 -2
sglang/srt/models/llama.py +10 -4
sglang/srt/models/llama4.py +32 -45
sglang/srt/models/llama_eagle3.py +61 -11
sglang/srt/models/llava.py +5 -5
sglang/srt/models/minicpmo.py +2 -2
sglang/srt/models/mistral.py +1 -1
sglang/srt/models/mllama4.py +402 -89
sglang/srt/models/phi4mm.py +1 -3
sglang/srt/models/pixtral.py +3 -7
sglang/srt/models/qwen2.py +31 -3
sglang/srt/models/qwen2_5_vl.py +1 -3
sglang/srt/models/qwen2_audio.py +200 -0
sglang/srt/models/qwen2_moe.py +32 -6
sglang/srt/models/qwen2_vl.py +1 -4
sglang/srt/models/qwen3.py +94 -25
sglang/srt/models/qwen3_moe.py +68 -21
sglang/srt/models/vila.py +3 -8
sglang/srt/{mm_utils.py → multimodal/mm_utils.py} +2 -2
sglang/srt/{managers/multimodal_processors → multimodal/processors}/base_processor.py +140 -158
sglang/srt/{managers/multimodal_processors → multimodal/processors}/clip.py +2 -13
sglang/srt/{managers/multimodal_processors → multimodal/processors}/deepseek_vl_v2.py +4 -11
sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3.py +3 -10
sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3n.py +5 -20
sglang/srt/{managers/multimodal_processors → multimodal/processors}/internvl.py +3 -10
sglang/srt/{managers/multimodal_processors → multimodal/processors}/janus_pro.py +3 -9
sglang/srt/{managers/multimodal_processors → multimodal/processors}/kimi_vl.py +6 -13
sglang/srt/{managers/multimodal_processors → multimodal/processors}/llava.py +2 -10
sglang/srt/{managers/multimodal_processors → multimodal/processors}/minicpm.py +5 -12
sglang/srt/{managers/multimodal_processors → multimodal/processors}/mlama.py +2 -14
sglang/srt/{managers/multimodal_processors → multimodal/processors}/mllama4.py +65 -66
sglang/srt/{managers/multimodal_processors → multimodal/processors}/phi4mm.py +4 -14
sglang/srt/{managers/multimodal_processors → multimodal/processors}/pixtral.py +3 -9
sglang/srt/{managers/multimodal_processors → multimodal/processors}/qwen_vl.py +8 -14
sglang/srt/{managers/multimodal_processors → multimodal/processors}/vila.py +13 -31
sglang/srt/operations_strategy.py +6 -2
sglang/srt/reasoning_parser.py +26 -0
sglang/srt/sampling/sampling_batch_info.py +39 -1
sglang/srt/server_args.py +84 -22
sglang/srt/speculative/build_eagle_tree.py +57 -18
sglang/srt/speculative/eagle_worker.py +6 -4
sglang/srt/two_batch_overlap.py +203 -27
sglang/srt/utils.py +343 -163
sglang/srt/warmup.py +12 -3
sglang/test/runners.py +10 -1
sglang/test/test_cutlass_w4a8_moe.py +281 -0
sglang/test/test_utils.py +15 -3
sglang/utils.py +5 -5
sglang/version.py +1 -1
{sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/METADATA +12 -8
{sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/RECORD +157 -146
sglang/math_utils.py +0 -8
/sglang/srt/{managers → eplb}/eplb_algorithms/deepseek.py +0 -0
/sglang/srt/{managers → eplb}/eplb_algorithms/deepseek_vec.py +0 -0
/sglang/srt/{eplb_simulator → eplb/eplb_simulator}/__init__.py +0 -0
{sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/WHEEL +0 -0
{sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/licenses/LICENSE +0 -0
{sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/top_level.txt +0 -0

sglang/srt/managers/io_struct.py CHANGED Viewed

@@ -20,19 +20,18 @@ import copy
 import uuid
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Union
-from sglang.srt.mm_utils import has_valid_data
+from sglang.srt.managers.schedule_batch import BaseFinishReason
+from sglang.srt.multimodal.mm_utils import has_valid_data
+from sglang.srt.sampling.sampling_params import SamplingParams
-# handle serialization of Image for pydantic
+# Handle serialization of Image for pydantic
 if TYPE_CHECKING:
     from PIL.Image import Image
 else:
     Image = Any
-from sglang.srt.managers.schedule_batch import BaseFinishReason
-from sglang.srt.sampling.sampling_params import SamplingParams
 @dataclass
 class SessionParams:
@@ -40,6 +39,7 @@ class SessionParams:
     rid: Optional[str] = None
     offset: Optional[int] = None
     replace: Optional[bool] = None
+    drop_previous_output: Optional[bool] = None
 AudioDataItem = Union[str, Dict]
@@ -182,6 +182,7 @@ class GenerateReqInput:
         # Determine parallel sample count
         if self.sampling_params is None:
             self.parallel_sample_num = 1
+            return
         elif isinstance(self.sampling_params, dict):
             self.parallel_sample_num = self.sampling_params.get("n", 1)
         else:  # isinstance(self.sampling_params, list):
@@ -199,6 +200,8 @@ class GenerateReqInput:
                 self.text = [self.text]
             if self.input_ids is not None:
                 self.input_ids = [self.input_ids]
+            if self.input_embeds is not None:
+                self.input_embeds = [self.input_embeds]
     def _normalize_single_inputs(self):
         """Normalize inputs for a single example."""
@@ -323,7 +326,9 @@ class GenerateReqInput:
             new_rids = [f"{self.rid}_{i}" for i in range(num)]
             self.rid = new_rids
         elif isinstance(self.rid, list):
-            if len(self.rid) != num:
+            # Note: the length of rid shall be the same as the batch_size,
+            # as the rid would be expanded for parallel sampling in tokenizer_manager
+            if len(self.rid) != self.batch_size:
                 raise ValueError(
                     "The specified rids length mismatch with the batch_size for batch processing."
                 )
@@ -399,6 +404,9 @@ class GenerateReqInput:
         return GenerateReqInput(
             text=self.text[i] if self.text is not None else None,
             input_ids=self.input_ids[i] if self.input_ids is not None else None,
+            input_embeds=(
+                self.input_embeds[i] if self.input_embeds is not None else None
+            ),
             image_data=self.image_data[i],
             audio_data=self.audio_data[i],
             sampling_params=self.sampling_params[i],
@@ -516,9 +524,6 @@ class EmbeddingReqInput:
     # For cross-encoder requests
     is_cross_encoder_request: bool = False
-    def contains_mm_input(self) -> bool:
-        return has_valid_data(self.image_data) or has_valid_data(self.audio_data)
     def normalize_batch_and_arguments(self):
         # at least one of text, input_ids, or image should be provided
         if self.text is None and self.input_ids is None and self.image_data is None:
@@ -572,6 +577,9 @@ class EmbeddingReqInput:
         self.rid = uuid.uuid4().hex
         return self.rid
+    def contains_mm_input(self) -> bool:
+        return has_valid_data(self.image_data) or has_valid_data(self.audio_data)
     def __getitem__(self, i):
         if self.is_cross_encoder_request:
             return EmbeddingReqInput(
@@ -740,6 +748,8 @@ class UpdateWeightFromDiskReqInput:
     model_path: str
     # The format to load the weights
     load_format: Optional[str] = None
+    # Whether to abort all requests before updating weights
+    abort_all_requests: bool = False
 @dataclass
@@ -752,9 +762,15 @@ class UpdateWeightFromDiskReqOutput:
 @dataclass
 class UpdateWeightsFromDistributedReqInput:
-    name: str
-    dtype: str
-    shape: List[int]
+    names: List[str]
+    dtypes: List[str]
+    shapes: List[List[int]]
+    # The group name
+    group_name: str = "weight_update_group"
+    # Whether to flush the cache after updating weights
+    flush_cache: bool = True
+    # Whether to abort all requests before updating weights
+    abort_all_requests: bool = False
 @dataclass
@@ -776,6 +792,8 @@ class UpdateWeightsFromTensorReqInput:
     load_format: Optional[str] = None
     # Whether to flush the cache after updating weights
     flush_cache: bool = True
+    # Whether to abort all requests before updating weights
+    abort_all_requests: bool = False
 @dataclass
@@ -854,7 +872,9 @@ class SlowDownReqOutput:
 @dataclass
 class AbortReq:
     # The request id
-    rid: str
+    rid: str = ""
+    # Whether to abort all requests
+    abort_all: bool = False
 @dataclass
@@ -1002,3 +1022,27 @@ class RpcReqInput:
 class RpcReqOutput:
     success: bool
     message: str
+@dataclass
+class LoadLoRAAdapterReqInput:
+    # The name of the lora module to newly loaded.
+    lora_name: str
+    # The path of loading.
+    lora_path: str
+@dataclass
+class UnloadLoRAAdapterReqInput:
+    # The name of lora module to unload.
+    lora_name: str
+@dataclass
+class LoRAUpdateResult:
+    success: bool
+    error_message: Optional[str] = None
+    loaded_adapters: Dict[str, str] = field(default_factory=dict)
+LoadLoRAAdapterReqOutput = UnloadLoRAAdapterReqOutput = LoRAUpdateResult

sglang/srt/managers/mm_utils.py CHANGED Viewed

@@ -2,14 +2,15 @@
 Multi-modality utils
 """
-import dataclasses
-import logging
+import hashlib
 from abc import abstractmethod
 from typing import Callable, List, Optional, Tuple
+import numpy as np
 import torch
 from torch import nn
+from sglang.srt.layers.multimodal import gpu_tensor_hash
 from sglang.srt.managers.schedule_batch import (
     Modality,
     MultimodalDataItem,
@@ -124,74 +125,38 @@ class MultiModalityDataPaddingPatternMultimodalTokens(MultiModalityDataPaddingPa
     e.g. <image><image>....<image>, or <audio><audio>...<audio>
     """
-    def __init__(self, token_ids: List[int]) -> None:
-        self.token_ids = token_ids
     def pad_input_tokens(
         self, input_ids: List[int], mm_inputs: MultimodalInputs
     ) -> List[int]:
         """
-        Finds contiguous regions of tokens matching `self.token_ids` in `input_ids`
-        and replaces each region with the corresponding `pad_value` from `mm_inputs.mm_items`.
+        Replaces multimodal tokens in input_ids with corresponding pad_values from mm_items.
+        Each modality (image, audio, video) is handled separately based on its token_id.
         """
-        pad_values = [item.pad_value for item in mm_inputs.mm_items]
-        if not pad_values:
-            # No multimodal items, return original input_ids
+        if not input_ids or not mm_inputs.mm_items:
             return input_ids
-        if not input_ids:
-            return []
         input_ids_tensor = torch.tensor(input_ids)
-        device = input_ids_tensor.device
-        token_ids_tensor = torch.tensor(self.token_ids, device=device)
-        mask = torch.isin(input_ids_tensor, token_ids_tensor)
-        if not mask.any():
-            # No tokens match token_ids, return original input_ids
-            return input_ids
+        # Create mapping of token_ids to pad_values for each modality
+        token_to_pad_mapping = {}
-        # Find contiguous regions
-        padded_mask = torch.cat(
-            (
-                torch.tensor([False], device=device),
-                mask,
-                torch.tensor([False], device=device),
-            )
-        )
-        # Find indices where the mask value changes
-        diff_indices = torch.where(padded_mask[1:] != padded_mask[:-1])[0]
-        # Start indices are where False changes to True
-        starts = diff_indices[::2]
-        # End indices are where True changes to False (exclusive index)
-        ends = diff_indices[1::2]
-        # Check if the number of regions matches the number of pad values
-        if len(starts) != len(pad_values):
-            # Maybe log a warning here?
-            num_regions = len(starts)
-            num_pad_values = len(pad_values)
-            if num_regions > 0 and num_pad_values > 0:
-                pad_values = (pad_values * (num_regions // num_pad_values + 1))[
-                    :num_regions
-                ]
-            else:  # If no regions or no pad_values, this loop won't run anyway.
-                pad_values = []  # Ensure pad_values is empty if starts is empty
-        # Create a copy to modify
-        output_ids_tensor = input_ids_tensor.clone()
-        # Replace tokens in each region with the corresponding pad value
-        # Ensure we don't iterate if pad_values became empty due to mismatch and num_regions=0
-        for i in range(min(len(starts), len(pad_values))):
-            start_idx = starts[i]
-            end_idx = ends[i]
-            pad_value = pad_values[i]
-            if pad_value is not None:  # Ensure pad_value is not None before assignment
-                output_ids_tensor[start_idx:end_idx] = pad_value
+        for item in mm_inputs.mm_items:
+            if item.is_image() and mm_inputs.im_token_id is not None:
+                token_to_pad_mapping[mm_inputs.im_token_id] = item.pad_value
+            elif item.is_audio() and mm_inputs.audio_token_id is not None:
+                token_to_pad_mapping[mm_inputs.audio_token_id] = item.pad_value
+            elif item.is_video() and mm_inputs.video_token_id is not None:
+                token_to_pad_mapping[mm_inputs.video_token_id] = item.pad_value
             else:
-                logger.warning(f"Skipping region {i} due to None pad_value.")
-        return output_ids_tensor.tolist()
+                raise ValueError(f"No multimodal token id provided for {item.modality}")
+        # Apply replacements for all tokens at once
+        for token_id, pad_value in token_to_pad_mapping.items():
+            input_ids_tensor[input_ids_tensor == token_id] = pad_value
+        ret_input_ids = input_ids_tensor.tolist()
+        return ret_input_ids
 embedding_cache = None
@@ -283,7 +248,9 @@ def _get_chunked_prefill_embedding(
 ) -> Optional[torch.Tensor]:
     # Calculate embedding for each request, try to get it from cache to avoid repeated calculation
     embedding_list = []
-    for i in range(len(items_size) - 1):
+    # FIXME(Xinyuan): temporary workaround for eagle3, which may have len(items_size) > len(prefix_length)
+    max_iterations = min(len(items_size) - 1, len(prefix_length))
+    for i in range(max_iterations):
         if items_size[i] == items_size[i + 1]:
             continue
         embedding_items_per_req = embedding_items[items_size[i] : items_size[i + 1]]
@@ -304,7 +271,7 @@ def _get_chunked_prefill_embedding(
         embedding_per_req_chunk, _, end_index = get_embedding_chunk(
             embedding=embedding_per_req,
             extend_prefix_len=prefix_length[i],
-            extend_seq_len=extend_length[i],
+            extend_seq_len=extend_length[i] if i < len(extend_length) else 0,
             items_offset=items_offset,
         )
         # remove this item from cache if chunk reaches to the end
@@ -680,3 +647,52 @@ def get_multimodal_data_bounds(
     # Convert valid pairs to tensor
     valid_pairs_tensor = torch.tensor(valid_pairs, device=input_ids.device)
     return valid_pairs_tensor
+def data_hash(data) -> int:
+    hash_bytes = hashlib.sha256(data).digest()[:8]
+    return int.from_bytes(hash_bytes, byteorder="big", signed=False)
+def tensor_hash(tensor_list) -> int:
+    """
+    hash a tensor or a tensor list
+    """
+    tensor = tensor_list
+    if isinstance(tensor_list, list):
+        tensor_list = flatten_nested_list(tensor_list)
+        tensor_list = [
+            x.flatten() if isinstance(x, torch.Tensor) else x for x in tensor_list
+        ]
+        tensor = torch.concat(tensor_list)
+    if tensor.is_cuda:
+        return gpu_tensor_hash(tensor)
+    tensor = tensor.detach().contiguous()
+    if tensor.dtype == torch.bfloat16:
+        # memoryview() doesn't support PyTorch's BFloat16 dtype
+        tensor = tensor.float()
+    assert isinstance(tensor, torch.Tensor)
+    if tensor.is_cuda:
+        # TODO: improve this
+        tensor_cpu = tensor.cpu()
+    else:
+        tensor_cpu = tensor
+    mv = memoryview(tensor_cpu.numpy())
+    return data_hash(mv.tobytes())
+def hash_feature(f):
+    if isinstance(f, list):
+        if isinstance(f[0], torch.Tensor):
+            return tensor_hash(f)
+        return data_hash(tuple(flatten_nested_list(f)))
+    elif isinstance(f, np.ndarray):
+        arr = np.ascontiguousarray(f)
+        arr_bytes = arr.tobytes()
+        return data_hash(arr_bytes)
+    elif isinstance(f, torch.Tensor):
+        return tensor_hash([f])
+    return data_hash(f)

sglang/srt/managers/multimodal_processor.py CHANGED Viewed

@@ -3,11 +3,8 @@ import importlib
 import inspect
 import logging
 import pkgutil
-from functools import lru_cache
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    BaseMultimodalProcessor,
-)
+from sglang.srt.multimodal.processors.base_processor import BaseMultimodalProcessor
 from sglang.srt.server_args import ServerArgs
 logger = logging.getLogger(__name__)
@@ -27,9 +24,8 @@ def get_dummy_processor():
     return DummyMultimodalProcessor()
-@lru_cache()
 def import_processors():
-    package_name = "sglang.srt.managers.multimodal_processors"
+    package_name = "sglang.srt.multimodal.processors"
     package = importlib.import_module(package_name)
     for _, name, ispkg in pkgutil.iter_modules(package.__path__, package_name + "."):
         if not ispkg:

sglang/srt/managers/multimodal_processors/qwen_audio.py ADDED Viewed

@@ -0,0 +1,94 @@
+import re
+from typing import List, Union
+import torch
+from sglang.srt.managers.multimodal_processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.qwen2_audio import Qwen2AudioForConditionalGeneration
+class Qwen2AudioMultimodalProcessor(BaseMultimodalProcessor):
+    models = [Qwen2AudioForConditionalGeneration]
+    def __init__(self, hf_config, server_args, _processor):
+        super().__init__(hf_config, server_args, _processor)
+        self.AUDIO_TOKEN = "<|audio_bos|><|AUDIO|><|audio_eos|>"
+        self.AUDIO_TOKEN_REGEX = re.compile(
+            r"<\|audio_bos\|>(?:<\|AUDIO\|>)+<\|audio_eos\|>"
+        )
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes]],
+        input_text,
+        request_obj,
+        max_req_input_len,
+        **kwargs,
+    ):
+        audio_data = request_obj.audio_data
+        if not isinstance(audio_data, list):
+            audio_data = [audio_data]
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            max_req_input_len=max_req_input_len,
+            audio_data=audio_data,
+            multimodal_tokens=MultimodalSpecialTokens(
+                audio_token=self.AUDIO_TOKEN,
+                audio_token_regex=self.AUDIO_TOKEN_REGEX,
+            ),
+        )
+        if base_output is None:
+            return None
+        res = self.process_mm_data(
+            input_text=base_output.input_text,
+            audio=base_output.audios,
+        )
+        # Collect special token ids
+        tokenizer = self._processor.tokenizer
+        audio_start_id = tokenizer.convert_tokens_to_ids("<|audio_bos|>")
+        audio_token_id = tokenizer.convert_tokens_to_ids("<|AUDIO|>")
+        audio_end_id = tokenizer.convert_tokens_to_ids("<|audio_eos|>")
+        items = []
+        input_ids = res["input_ids"].flatten()
+        if (
+            "input_features" in res
+            and res["input_features"] is not None
+            and len(res["input_features"]) != 0
+        ):
+            if audio_start_id is not None and audio_end_id is not None:
+                audio_offsets = self.get_mm_items_offset_by_pair(
+                    input_ids=input_ids,
+                    mm_start_id=audio_start_id,
+                    mm_end_id=audio_end_id,
+                )
+            else:
+                audio_offsets = None
+            input_lengths = res["feature_attention_mask"].sum(dim=-1)
+            input_lengths = (input_lengths - 1) // 2 + 1
+            output_lengths = (input_lengths - 2) // 2 + 1
+            item = MultimodalDataItem(
+                audio_features=res["input_features"],
+                audio_feature_lens=output_lengths,
+                audio_offsets=audio_offsets,
+                modality=Modality.AUDIO,
+            )
+            items += [item]
+        return {
+            "mm_items": items,
+            "input_ids": input_ids.tolist(),
+            "audio_start_id": audio_start_id,
+            "audio_token_id": audio_token_id,
+            "audio_end_id": audio_end_id,
+        }

sglang 0.4.8.post1__py3-none-any.whl → 0.4.9.post1__py3-none-any.whl

sglang 0.4.8.post1__py3-none-any.whl → 0.4.9.post1__py3-none-any.whl