sglang 0.4.4.post1__py3-none-any.whl → 0.4.4.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +6 -0
- sglang/bench_one_batch.py +1 -1
- sglang/bench_one_batch_server.py +1 -1
- sglang/bench_serving.py +3 -1
- sglang/check_env.py +3 -4
- sglang/lang/backend/openai.py +18 -5
- sglang/lang/chat_template.py +28 -7
- sglang/lang/interpreter.py +7 -3
- sglang/lang/ir.py +10 -0
- sglang/srt/_custom_ops.py +1 -1
- sglang/srt/code_completion_parser.py +174 -0
- sglang/srt/configs/__init__.py +2 -6
- sglang/srt/configs/deepseekvl2.py +667 -0
- sglang/srt/configs/janus_pro.py +3 -4
- sglang/srt/configs/load_config.py +1 -0
- sglang/srt/configs/model_config.py +63 -11
- sglang/srt/configs/utils.py +25 -0
- sglang/srt/connector/__init__.py +51 -0
- sglang/srt/connector/base_connector.py +112 -0
- sglang/srt/connector/redis.py +85 -0
- sglang/srt/connector/s3.py +122 -0
- sglang/srt/connector/serde/__init__.py +31 -0
- sglang/srt/connector/serde/safe_serde.py +29 -0
- sglang/srt/connector/serde/serde.py +43 -0
- sglang/srt/connector/utils.py +35 -0
- sglang/srt/conversation.py +88 -0
- sglang/srt/disaggregation/conn.py +81 -0
- sglang/srt/disaggregation/decode.py +495 -0
- sglang/srt/disaggregation/mini_lb.py +285 -0
- sglang/srt/disaggregation/prefill.py +249 -0
- sglang/srt/disaggregation/utils.py +44 -0
- sglang/srt/distributed/parallel_state.py +10 -3
- sglang/srt/entrypoints/engine.py +55 -5
- sglang/srt/entrypoints/http_server.py +71 -12
- sglang/srt/function_call_parser.py +133 -54
- sglang/srt/hf_transformers_utils.py +28 -3
- sglang/srt/layers/activation.py +4 -2
- sglang/srt/layers/attention/base_attn_backend.py +1 -1
- sglang/srt/layers/attention/flashattention_backend.py +295 -0
- sglang/srt/layers/attention/flashinfer_backend.py +1 -1
- sglang/srt/layers/attention/flashmla_backend.py +284 -0
- sglang/srt/layers/attention/triton_backend.py +171 -38
- sglang/srt/layers/attention/triton_ops/decode_attention.py +94 -31
- sglang/srt/layers/attention/triton_ops/extend_attention.py +14 -5
- sglang/srt/layers/attention/utils.py +53 -0
- sglang/srt/layers/attention/vision.py +9 -28
- sglang/srt/layers/dp_attention.py +32 -21
- sglang/srt/layers/layernorm.py +24 -2
- sglang/srt/layers/linear.py +17 -5
- sglang/srt/layers/logits_processor.py +25 -7
- sglang/srt/layers/moe/ep_moe/kernels.py +110 -11
- sglang/srt/layers/moe/ep_moe/layer.py +273 -1
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +416 -0
- sglang/srt/layers/moe/fused_moe_native.py +2 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +23 -32
- sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -2
- sglang/srt/layers/moe/topk.py +31 -18
- sglang/srt/layers/parameter.py +1 -1
- sglang/srt/layers/quantization/__init__.py +184 -126
- sglang/srt/layers/quantization/base_config.py +5 -0
- sglang/srt/layers/quantization/blockwise_int8.py +1 -1
- sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +652 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +658 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +9 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +56 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +162 -0
- sglang/srt/layers/quantization/compressed_tensors/utils.py +218 -0
- sglang/srt/layers/quantization/fp8.py +76 -34
- sglang/srt/layers/quantization/fp8_kernel.py +24 -8
- sglang/srt/layers/quantization/fp8_utils.py +284 -28
- sglang/srt/layers/quantization/gptq.py +36 -9
- sglang/srt/layers/quantization/kv_cache.py +98 -0
- sglang/srt/layers/quantization/modelopt_quant.py +9 -7
- sglang/srt/layers/quantization/utils.py +153 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +70 -19
- sglang/srt/layers/rotary_embedding.py +66 -87
- sglang/srt/layers/sampler.py +1 -1
- sglang/srt/lora/layers.py +68 -0
- sglang/srt/lora/lora.py +2 -22
- sglang/srt/lora/lora_manager.py +47 -23
- sglang/srt/lora/mem_pool.py +110 -51
- sglang/srt/lora/utils.py +12 -1
- sglang/srt/managers/cache_controller.py +2 -5
- sglang/srt/managers/data_parallel_controller.py +30 -8
- sglang/srt/managers/expert_distribution.py +81 -0
- sglang/srt/managers/io_struct.py +39 -3
- sglang/srt/managers/mm_utils.py +373 -0
- sglang/srt/managers/multimodal_processor.py +68 -0
- sglang/srt/managers/multimodal_processors/base_processor.py +275 -0
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +119 -0
- sglang/srt/managers/multimodal_processors/gemma3.py +83 -0
- sglang/srt/managers/{image_processors → multimodal_processors}/janus_pro.py +20 -15
- sglang/srt/managers/{image_processors → multimodal_processors}/llava.py +10 -15
- sglang/srt/managers/multimodal_processors/minicpm.py +167 -0
- sglang/srt/managers/{image_processors → multimodal_processors}/mlama.py +7 -8
- sglang/srt/managers/{image_processors → multimodal_processors}/qwen_vl.py +28 -22
- sglang/srt/managers/schedule_batch.py +133 -30
- sglang/srt/managers/scheduler.py +273 -20
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +59 -23
- sglang/srt/managers/tp_worker.py +1 -1
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -3
- sglang/srt/managers/utils.py +6 -1
- sglang/srt/mem_cache/hiradix_cache.py +18 -7
- sglang/srt/mem_cache/memory_pool.py +255 -98
- sglang/srt/mem_cache/paged_allocator.py +2 -2
- sglang/srt/mem_cache/radix_cache.py +4 -4
- sglang/srt/model_executor/cuda_graph_runner.py +27 -13
- sglang/srt/model_executor/forward_batch_info.py +68 -11
- sglang/srt/model_executor/model_runner.py +70 -6
- sglang/srt/model_loader/loader.py +160 -2
- sglang/srt/model_loader/weight_utils.py +45 -0
- sglang/srt/models/deepseek_janus_pro.py +29 -86
- sglang/srt/models/deepseek_nextn.py +22 -10
- sglang/srt/models/deepseek_v2.py +208 -77
- sglang/srt/models/deepseek_vl2.py +358 -0
- sglang/srt/models/gemma3_causal.py +684 -0
- sglang/srt/models/gemma3_mm.py +462 -0
- sglang/srt/models/llama.py +47 -7
- sglang/srt/models/llama_eagle.py +1 -0
- sglang/srt/models/llama_eagle3.py +196 -0
- sglang/srt/models/llava.py +3 -3
- sglang/srt/models/llavavid.py +3 -3
- sglang/srt/models/minicpmo.py +1995 -0
- sglang/srt/models/minicpmv.py +62 -137
- sglang/srt/models/mllama.py +4 -4
- sglang/srt/models/phi3_small.py +1 -1
- sglang/srt/models/qwen2.py +3 -0
- sglang/srt/models/qwen2_5_vl.py +68 -146
- sglang/srt/models/qwen2_classification.py +75 -0
- sglang/srt/models/qwen2_moe.py +9 -1
- sglang/srt/models/qwen2_vl.py +25 -63
- sglang/srt/openai_api/adapter.py +124 -28
- sglang/srt/openai_api/protocol.py +23 -2
- sglang/srt/sampling/sampling_batch_info.py +1 -1
- sglang/srt/sampling/sampling_params.py +6 -6
- sglang/srt/server_args.py +99 -9
- sglang/srt/speculative/build_eagle_tree.py +7 -347
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +41 -5
- sglang/srt/speculative/eagle_utils.py +208 -252
- sglang/srt/speculative/eagle_worker.py +139 -53
- sglang/srt/speculative/spec_info.py +6 -1
- sglang/srt/torch_memory_saver_adapter.py +22 -0
- sglang/srt/utils.py +182 -21
- sglang/test/__init__.py +0 -0
- sglang/test/attention/__init__.py +0 -0
- sglang/test/attention/test_flashattn_backend.py +312 -0
- sglang/test/runners.py +2 -0
- sglang/test/test_activation.py +2 -1
- sglang/test/test_block_fp8.py +5 -4
- sglang/test/test_block_fp8_ep.py +2 -1
- sglang/test/test_dynamic_grad_mode.py +58 -0
- sglang/test/test_layernorm.py +3 -2
- sglang/test/test_utils.py +55 -4
- sglang/utils.py +31 -0
- sglang/version.py +1 -1
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post2.dist-info}/METADATA +12 -8
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post2.dist-info}/RECORD +167 -123
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post2.dist-info}/WHEEL +1 -1
- sglang/srt/configs/qwen2_5_vl_config.py +0 -1006
- sglang/srt/managers/image_processor.py +0 -55
- sglang/srt/managers/image_processors/base_image_processor.py +0 -219
- sglang/srt/managers/image_processors/minicpmv.py +0 -86
- sglang/srt/managers/multi_modality_padding.py +0 -134
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post2.dist-info/licenses}/LICENSE +0 -0
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post2.dist-info}/top_level.txt +0 -0
sglang/srt/managers/multimodal_processors/base_processor.py
@@ -0,0 +1,275 @@
+import concurrent
+import concurrent.futures
+import dataclasses
+import multiprocessing as mp
+import os
+from abc import ABC, abstractmethod
+from typing import Optional
+
+import numpy as np
+import PIL
+import transformers
+from decord import VideoReader, cpu
+from openai import BadRequestError
+from PIL import Image
+
+from sglang.srt.utils import load_audio, load_image, logger
+
+global global_processor
+
+
+def get_global_processor():
+    global global_processor
+    return global_processor
+
+
+@dataclasses.dataclass
+class BaseMultiModalProcessorOutput:
+    # input_text, with each frame of video/image represented with a image_token
+    input_text: str
+
+    mm_data_hashes: Optional[list[int]]
+    # images
+    image_sizes: Optional[list[int]]
+    # frames loaded from image and video, in given order
+    images: Optional[list[PIL.Image]] = None
+
+    # audios
+    audios: Optional[list[np.ndarray]] = None
+
+    def normalize(self):
+        for field_name in ["data_hashes", "image_sizes", "images", "audios"]:
+            field = getattr(self, field_name, None)
+            if field is not None and isinstance(field, list) and len(field) == 0:
+                setattr(self, field_name, None)
+
+
+@dataclasses.dataclass
+class MultimodalSpecialTokens:
+    image_token: Optional[str] = None
+    video_token: Optional[str] = None
+    audio_token: Optional[str] = None
+
+    def collect(self) -> list[str]:
+        return [
+            token
+            for token in [self.image_token, self.video_token, self.audio_token]
+            if token
+        ]
+
+
+class BaseMultimodalProcessor(ABC):
+    models = []
+
+    def __init__(self, hf_config, server_args, _processor):
+        self.hf_config = hf_config
+        self._processor = _processor
+        self.server_args = server_args
+        # FIXME: not accurate, model and image specific
+        self.NUM_TOKEN_PER_FRAME = 330
+
+        # Initialize global processor first
+        init_global_processor(self, server_args)
+
+        self.executor = concurrent.futures.ProcessPoolExecutor(
+            initializer=init_global_processor,
+            mp_context=mp.get_context("fork"),
+            initargs=(
+                self,
+                server_args,
+            ),
+            max_workers=int(os.environ.get("SGLANG_CPU_COUNT", os.cpu_count())),
+        )
+
+    def _build_processor(self, server_args):
+        """Init the global processor for multi modal models."""
+        from sglang.srt.hf_transformers_utils import get_processor
+
+        return get_processor(
+            server_args.tokenizer_path,
+            tokenizer_mode=server_args.tokenizer_mode,
+            trust_remote_code=server_args.trust_remote_code,
+        )
+
+    @abstractmethod
+    async def process_mm_data_async(
+        self, image_data, input_text, max_req_input_len, **kwargs
+    ):
+        pass
+
+    def get_estimated_frames_list(self, image_data):
+        """
+        estimate the total frame count from all visual input
+        """
+        # Before processing inputs
+        estimated_frames_list = []
+        for image in image_data:
+            if isinstance(image, str) and image.startswith("video:"):
+                path = image[len("video:") :]
+                # Estimate frames for the video
+                vr = VideoReader(path, ctx=cpu(0))
+                num_frames = len(vr)
+            else:
+                # For images, each contributes one frame
+                num_frames = 1
+            estimated_frames_list.append(num_frames)
+
+        return estimated_frames_list
+
+    @staticmethod
+    def encode_video(video_path, frame_count_limit=None):
+        if not os.path.exists(video_path):
+            logger.error(f"Video {video_path} does not exist")
+            return []
+
+        if frame_count_limit == 0:
+            return []
+
+        def uniform_sample(l, n):
+            gap = len(l) / n
+            idxs = [int(i * gap + gap / 2) for i in range(n)]
+            return [l[i] for i in idxs]
+
+        vr = VideoReader(video_path, ctx=cpu(0))
+        sample_fps = round(vr.get_avg_fps() / 1)  # FPS
+        frame_indices = [i for i in range(0, len(vr), sample_fps)]
+        if frame_count_limit is not None and len(frame_indices) > frame_count_limit:
+            frame_indices = uniform_sample(frame_indices, frame_count_limit)
+
+        frames = vr.get_batch(frame_indices).asnumpy()
+        frames = [Image.fromarray(v.astype("uint8")) for v in frames]
+        return frames
+
+    def load_mm_data(
+        self,
+        input_ids: list[int],
+        multimodal_tokens: MultimodalSpecialTokens,
+        max_req_input_len: int,
+        image_data: Optional[list] = None,
+        audio_data: Optional[list] = None,
+        return_text: Optional[bool] = True,
+        discard_alpha_channel: bool = True,
+    ) -> BaseMultiModalProcessorOutput:
+        """
+        Each frame of video/image will be replaced by a single image token
+
+        Args:
+            multimodal_tokens (list[str]): list of special token which denoting a single multimodal data
+                e.g. image token or audio token
+            discard_alpha_channel: if True, discards the alpha channel in the returned images
+
+        """
+        if isinstance(multimodal_tokens.image_token, int):
+            multimodal_tokens.image_token = (
+                self._processor.tokenizer.convert_ids_to_tokens(
+                    multimodal_tokens.image_token
+                )
+            )
+        else:
+            multimodal_tokens.image_token = multimodal_tokens.image_token
+
+        if isinstance(input_ids, list) and return_text:
+            assert len(input_ids) and isinstance(input_ids[0], int)
+            input_text = self._processor.tokenizer.decode(input_ids)
+        else:
+            input_text = input_ids
+        if return_text:
+            import re
+
+            pattern = (
+                "("
+                + "|".join(re.escape(sep) for sep in multimodal_tokens.collect())
+                + ")"
+            )
+            # split text into list of normal text and special tokens
+            text_parts = re.split(pattern, input_text)
+
+        # TODO(mick): load from server_args, env, or sampling_params
+        MAX_NUM_FRAMES = 30
+        estimated_frames_list = self.get_estimated_frames_list(image_data=image_data)
+        total_frame_count = sum(estimated_frames_list)
+        # a heuristic value, suggesting the maximum fraction of frames to embed from all visual inputs.
+        # e.g., 0.1 suggests that 1 frame out of 10 input frames should be used
+        scaling_factor = min(1.0, MAX_NUM_FRAMES / max(1, total_frame_count))
+
+        assert len(image_data) == len(estimated_frames_list)
+
+        image_index, audio_index = 0, 0
+        hashes, image_sizes, images, audios = [], [], [], []
+        new_text = ""
+        for index, text_part in enumerate(text_parts):
+            try:
+                if text_part == multimodal_tokens.image_token:
+                    # load as image
+                    if len(images) >= MAX_NUM_FRAMES:
+                        frames_to_process = 0
+                    else:
+                        estimated_frames = estimated_frames_list[image_index]
+                        frames_to_process = max(
+                            1, int(estimated_frames * scaling_factor)
+                        )
+
+                    if frames_to_process == 0:
+                        frames = []
+                    else:
+                        image_file = image_data[image_index]
+                        if isinstance(image_file, str) and image_file.startswith(
+                            "video:"
+                        ):
+                            # video
+                            path = image_file[len("video:") :]
+                            frames = BaseMultimodalProcessor.encode_video(
+                                path, frame_count_limit=frames_to_process
+                            )
+                        else:
+                            # image
+                            raw_image, _size = load_image(image_file)
+                            if discard_alpha_channel:
+                                raw_image = raw_image.convert("RGB")
+                            frames = [raw_image]
+                        if len(frames) == 0:
+                            continue
+
+                    image_sizes += frames[0].size * len(frames)
+                    hashes += [hash(image_file)] * len(frames)
+                    images += frames
+                    image_index += 1
+                    if frames_to_process != 0:
+                        new_text += multimodal_tokens.image_token * len(frames)
+                    assert frames_to_process == len(frames)
+                elif text_part == multimodal_tokens.audio_token:
+                    # load as audio
+                    audio_file = audio_data[audio_index]
+                    audio = load_audio(audio_file)
+                    hashes += [hash(audio_file)]
+                    audios += [audio]
+                    audio_index += 1
+                    new_text += multimodal_tokens.audio_token
+                else:
+                    # TODO(mick): handle video
+                    # normal text
+                    new_text += text_part
+
+            except Exception as e:
+                logger.error(f"An exception occurred while loading images: {e}")
+                raise BadRequestError(
+                    f"An exception occurred while loading images: {e}"
+                )
+
+        out = BaseMultiModalProcessorOutput(
+            mm_data_hashes=hashes,
+            image_sizes=image_sizes,
+            images=images,
+            audios=audios,
+            input_text=new_text,
+        )
+        out.normalize()
+        return out
+
+
+def init_global_processor(sglang_processor: BaseMultimodalProcessor, server_args):
+    """
+    Init the global processor for multimodal models."""
+    global global_processor
+    transformers.logging.set_verbosity_error()
+    global_processor = sglang_processor._build_processor(server_args=server_args)
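The heart of the new `BaseMultimodalProcessor.load_mm_data` above is a regex split of the prompt on the model's multimodal special tokens: plain text parts are passed through unchanged, and each matched placeholder triggers an image, video, or audio load. Below is a minimal standalone sketch of just that splitting step; the dataclass is re-declared here and the prompt and token string are illustrative, not an additional sglang API.

```python
import dataclasses
import re
from typing import Optional


@dataclasses.dataclass
class MultimodalSpecialTokens:
    # Mirrors the dataclass added in base_processor.py above.
    image_token: Optional[str] = None
    video_token: Optional[str] = None
    audio_token: Optional[str] = None

    def collect(self) -> list[str]:
        return [t for t in (self.image_token, self.video_token, self.audio_token) if t]


def split_on_special_tokens(prompt: str, tokens: MultimodalSpecialTokens) -> list[str]:
    # Same regex construction as load_mm_data: the capturing group keeps each
    # special token as its own element of the split result.
    pattern = "(" + "|".join(re.escape(sep) for sep in tokens.collect()) + ")"
    return re.split(pattern, prompt)


if __name__ == "__main__":
    # Hypothetical prompt using the Gemma 3 image token seen later in this diff.
    tokens = MultimodalSpecialTokens(image_token="<start_of_image>")
    parts = split_on_special_tokens("describe <start_of_image> briefly", tokens)
    # -> ['describe ', '<start_of_image>', ' briefly']
    print(parts)
```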
sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py
@@ -0,0 +1,119 @@
+# Copyright (c) 2023-2024 DeepSeek.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
+# this software and associated documentation files (the "Software"), to deal in
+# the Software without restriction, including without limitation the rights to
+# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+# the Software, and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+import asyncio
+
+import torch
+
+from sglang.srt.managers.multimodal_processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+    get_global_processor,
+)
+from sglang.srt.models.deepseek_vl2 import DeepseekVL2ForCausalLM
+
+
+class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
+    models = [DeepseekVL2ForCausalLM]
+
+    def __init__(self, hf_config, server_args, _processor):
+        super().__init__(hf_config, server_args, _processor)
+        self.IMAGE_TOKEN = "<image>"
+
+    @staticmethod
+    def _process_images_task(image, input_text, max_req_input_len):
+        processor = get_global_processor()
+        res = processor.__call__(
+            conversations=input_text, images=image, max_req_input_len=max_req_input_len
+        )
+
+        image_token_id = processor.image_token_id
+
+        res["im_token_id"] = image_token_id
+        return res
+
+    async def _process_images(self, image_data, input_text, max_req_input_len):
+        if self.executor is not None:
+            loop = asyncio.get_event_loop()
+            image_inputs = await loop.run_in_executor(
+                self.executor,
+                DeepseekVL2ImageProcessor._process_images_task,
+                image_data,
+                input_text,
+                max_req_input_len,
+            )
+        else:
+            image_inputs = self._process_images_task(
+                image_data, input_text, max_req_input_len
+            )
+
+        return image_inputs
+
+    async def _process_images(self, image_data, input_text, max_req_input_len):
+        if self.executor is not None:
+            loop = asyncio.get_event_loop()
+            image_inputs = await loop.run_in_executor(
+                self.executor,
+                DeepseekVL2ImageProcessor._process_images_task,
+                image_data,
+                input_text,
+                max_req_input_len,
+            )
+        else:
+            image_inputs = self._process_images_task(
+                image_data, input_text, max_req_input_len
+            )
+        return image_inputs
+
+    async def process_mm_data_async(
+        self, image_data, input_ids, request_obj, max_req_input_len, *args, **kwargs
+    ):
+        if not image_data:
+            return None
+
+        if not isinstance(image_data, list):
+            image_data = [image_data]
+
+        images, image_sizes = [], []
+
+        image_token = self.IMAGE_TOKEN
+        base_output = self.load_mm_data(
+            input_ids,
+            image_data=image_data,
+            multimodal_tokens=MultimodalSpecialTokens(image_token=image_token),
+            max_req_input_len=max_req_input_len,
+        )
+        res = await self._process_images(
+            base_output.images, base_output.input_text, max_req_input_len
+        )
+        images_seq_mask = res["images_seq_mask"]
+        images_spatial_crop = res["images_spatial_crop"]
+        batched_images_spatial_crop = []
+        batched_images_spatial_crop.append(images_spatial_crop)
+        batched_images_spatial_crop = torch.stack(batched_images_spatial_crop, dim=0)
+
+        return {
+            "input_ids": res["input_ids"].tolist(),
+            "pixel_values": res["images"],
+            "im_token_id": res["im_token_id"],
+            "data_hashes": base_output.mm_data_hashes,
+            "image_sizes": image_sizes,
+            "images_emb_mask": images_seq_mask,
+            "image_spatial_crop": batched_images_spatial_crop,
+            "modalities": request_obj.modalities or ["image"],
+        }
sglang/srt/managers/multimodal_processors/gemma3.py
@@ -0,0 +1,83 @@
+from typing import List, Union
+
+from transformers.utils import logging
+
+from sglang.srt.managers.multimodal_processor import (
+    BaseMultimodalProcessor as SGLangBaseProcessor,
+)
+from sglang.srt.managers.multimodal_processors.base_processor import (
+    MultimodalSpecialTokens,
+    get_global_processor,
+)
+from sglang.srt.models.gemma3_mm import Gemma3ForConditionalGeneration
+
+# Copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gemma3/image_processing_gemma3_fast.py
+# will be removed in the future
+logger = logging.get_logger(__name__)
+
+
+class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
+    models = [Gemma3ForConditionalGeneration]
+
+    def __init__(self, hf_config, server_args, _processor):
+        super().__init__(hf_config, server_args, _processor)
+        self.IMAGE_TOKEN = "<start_of_image>"
+        self.IM_START_TOKEN_ID = hf_config.boi_token_index
+        self.IM_END_TOKEN_ID = hf_config.eoi_token_index
+
+    async def _process_single_image(self, images, input_text) -> dict:
+        if isinstance(images, list) and len(images) == 0:
+            images = None
+        processor = get_global_processor()
+        result = processor.__call__(
+            text=[input_text],
+            images=images,
+            padding=True,
+            return_tensors="pt",
+            # if RGBA, this needs to be set
+            # images_kwargs={
+            #     "input_data_format": ChannelDimension.FIRST
+            # }
+        )
+
+        pixel_values = getattr(result, "pixel_values", None)
+
+        return {
+            "input_ids": result.input_ids,
+            "pixel_values": pixel_values,
+        }
+
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes]],
+        input_ids,
+        request_obj,
+        max_req_input_len,
+        *args,
+        **kwargs,
+    ):
+        if not image_data:
+            return None
+        if isinstance(image_data, str):
+            image_data = [image_data]
+
+        image_token = self.IMAGE_TOKEN
+        base_output = self.load_mm_data(
+            input_ids=input_ids,
+            image_data=image_data,
+            multimodal_tokens=MultimodalSpecialTokens(image_token=image_token),
+            max_req_input_len=max_req_input_len,
+            discard_alpha_channel=True,
+        )
+
+        ret = await self._process_single_image(
+            input_text=base_output.input_text, images=base_output.images
+        )
+
+        return {
+            "input_ids": ret["input_ids"].flatten().tolist(),
+            "pixel_values": ret["pixel_values"],
+            "data_hashes": base_output.mm_data_hashes,
+            "im_start_id": self.IM_START_TOKEN_ID,
+            "im_end_id": self.IM_END_TOKEN_ID,
+        }
sglang/srt/managers/{image_processors → multimodal_processors}/janus_pro.py
@@ -1,16 +1,17 @@
 import asyncio
 from typing import List, Union
 
-from sglang.srt.managers.
-
-
-from sglang.srt.managers.image_processors.base_image_processor import (
+from sglang.srt.managers.multimodal_processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
     get_global_processor,
 )
 from sglang.srt.models.deepseek_janus_pro import MultiModalityCausalLM
 
 
-class 
+class JanusProImageProcessor(BaseMultimodalProcessor):
+    models = [MultiModalityCausalLM]
+
     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
 
@@ -34,7 +35,7 @@ class JanusProProcessor(SGLangBaseImageProcessor):
             loop = asyncio.get_event_loop()
             image_inputs = await loop.run_in_executor(
                 self.executor,
-
+                JanusProImageProcessor._process_images_task,
                 images,
                 input_text,
             )
@@ -45,7 +46,7 @@ class JanusProProcessor(SGLangBaseImageProcessor):
 
         return image_inputs
 
-    async def 
+    async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
         input_ids,
@@ -59,21 +60,25 @@ class JanusProProcessor(SGLangBaseImageProcessor):
         if not isinstance(image_data, list):
             image_data = [image_data]
 
-        base_out = self.
-            input_ids,
+        base_out = self.load_mm_data(
+            input_ids=input_ids,
+            image_data=image_data,
+            multimodal_tokens=MultimodalSpecialTokens(
+                image_token="<image_placeholder>"
+            ),
+            max_req_input_len=max_req_input_len,
         )
-        images = base_out.
+        images = base_out.images
         res = await self._process_images(images=images, input_text=base_out.input_text)
-
+        # print(res)
+        # print(base_out)
+        # print("", res["images_emb_mask"].shape)
         return {
             "input_ids": res["input_ids"].flatten().tolist(),
             "pixel_values": res["pixel_values"],
             "images_emb_mask": res["images_emb_mask"],
-            "
+            "data_hashes": base_out.mm_data_hashes,
             "im_start_id": res["im_start_id"],
             "im_end_id": res["im_end_id"],
             "im_token_id": res["im_token_id"],
         }
-
-
-ImageProcessorMapping = {MultiModalityCausalLM: JanusProProcessor}
sglang/srt/managers/{image_processors → multimodal_processors}/llava.py
@@ -3,8 +3,8 @@ from typing import List, Optional, Union
 
 import numpy as np
 
-from sglang.srt.managers.
-
+from sglang.srt.managers.multimodal_processors.base_processor import (
+    BaseMultimodalProcessor,
     get_global_processor,
 )
 from sglang.srt.mm_utils import expand2square, process_anyres_image
@@ -14,7 +14,9 @@ from sglang.srt.utils import load_image, logger
 from sglang.utils import get_exception_traceback
 
 
-class LlavaImageProcessor(
+class LlavaImageProcessor(BaseMultimodalProcessor):
+    models = [LlavaVidForCausalLM, LlavaQwenForCausalLM, LlavaMistralForCausalLM]
+
     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
 
@@ -84,7 +86,7 @@ class LlavaImageProcessor(BaseImageProcessor):
                 image_data, aspect_ratio, grid_pinpoints
             )
 
-    async def 
+    async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
         input_text,
@@ -111,7 +113,7 @@ class LlavaImageProcessor(BaseImageProcessor):
         if "multi-images" in modalities or "video" in modalities:
             # Multiple images
             aspect_ratio = "pad"  # LLaVA OneVision Handling: more than one image --> interleaved image mode or video mode. We do not use anyres
-            pixel_values,
+            pixel_values, data_hashes, image_sizes = [], [], []
             res = []
             for img_data in image_data:
                 res.append(
@@ -122,7 +124,7 @@ class LlavaImageProcessor(BaseImageProcessor):
             res = await asyncio.gather(*res)
             for pixel_v, image_h, image_s in res:
                 pixel_values.append(pixel_v)
-
+                data_hashes.append(image_h)
                 image_sizes.append(image_s)
 
             if isinstance(pixel_values[0], np.ndarray):
@@ -132,21 +134,14 @@ class LlavaImageProcessor(BaseImageProcessor):
             pixel_values, image_hash, image_size = await self._process_single_image(
                 image_data[0], aspect_ratio, grid_pinpoints
             )
-
+            data_hashes = [image_hash]
             image_sizes = [image_size]
         else:
             raise ValueError(f"Invalid image data: {image_data}")
 
         return {
             "pixel_values": pixel_values,
-            "
+            "data_hashes": data_hashes,
             "image_sizes": image_sizes,
             "modalities": request_obj.modalities or ["image"],
         }
-
-
-ImageProcessorMapping = {
-    LlavaVidForCausalLM: LlavaImageProcessor,
-    LlavaQwenForCausalLM: LlavaImageProcessor,
-    LlavaMistralForCausalLM: LlavaImageProcessor,
-}
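Across the renamed processors above, the per-module `ImageProcessorMapping` dicts are removed in favor of a `models = [...]` class attribute on each processor (see `JanusProImageProcessor` and `LlavaImageProcessor`); the new `sglang/srt/managers/multimodal_processor.py` (+68 lines, not shown in this excerpt) handles the actual dispatch. A rough sketch of how such an attribute-based registry can be assembled, using placeholder classes rather than sglang's real ones:

```python
# Minimal sketch (not sglang's actual dispatch code) of building a
# {model_class: processor_class} mapping from the `models` class attribute.


class DummyModel:  # stands in for e.g. MultiModalityCausalLM
    pass


class DummyProcessor:  # stands in for e.g. JanusProImageProcessor
    models = [DummyModel]


def build_processor_mapping(processor_classes):
    # Invert each processor's `models` list into a lookup keyed by model class.
    mapping = {}
    for proc_cls in processor_classes:
        for model_cls in getattr(proc_cls, "models", []):
            mapping[model_cls] = proc_cls
    return mapping


if __name__ == "__main__":
    mapping = build_processor_mapping([DummyProcessor])
    assert mapping[DummyModel] is DummyProcessor
    print(mapping)
```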