sglang 0.4.4.post2__py3-none-any.whl → 0.4.4.post4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_serving.py +72 -10
- sglang/srt/_custom_ops.py +59 -92
- sglang/srt/configs/deepseekvl2.py +10 -1
- sglang/srt/configs/model_config.py +6 -16
- sglang/srt/constrained/base_grammar_backend.py +5 -1
- sglang/srt/custom_op.py +5 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +28 -80
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
- sglang/srt/distributed/parallel_state.py +32 -5
- sglang/srt/entrypoints/engine.py +0 -5
- sglang/srt/entrypoints/http_server.py +7 -1
- sglang/srt/entrypoints/verl_engine.py +2 -0
- sglang/srt/function_call_parser.py +0 -1
- sglang/srt/layers/attention/flashattention_backend.py +582 -125
- sglang/srt/layers/attention/flashinfer_backend.py +5 -7
- sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -3
- sglang/srt/layers/attention/flashmla_backend.py +1 -1
- sglang/srt/layers/dp_attention.py +12 -1
- sglang/srt/layers/moe/ep_moe/kernels.py +142 -0
- sglang/srt/layers/moe/ep_moe/layer.py +79 -80
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +382 -199
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +403 -47
- sglang/srt/layers/moe/topk.py +79 -6
- sglang/srt/layers/quantization/__init__.py +137 -165
- sglang/srt/layers/quantization/awq.py +200 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +2 -1
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +34 -10
- sglang/srt/layers/quantization/fp8_kernel.py +2 -1
- sglang/srt/layers/quantization/fp8_utils.py +1 -4
- sglang/srt/layers/quantization/gptq.py +30 -40
- sglang/srt/layers/quantization/moe_wna16.py +501 -0
- sglang/srt/layers/quantization/utils.py +1 -1
- sglang/srt/layers/quantization/w8a8_fp8.py +1 -1
- sglang/srt/lora/backend/base_backend.py +4 -4
- sglang/srt/lora/backend/flashinfer_backend.py +12 -9
- sglang/srt/lora/backend/triton_backend.py +5 -8
- sglang/srt/lora/layers.py +19 -33
- sglang/srt/lora/lora_manager.py +20 -7
- sglang/srt/lora/mem_pool.py +12 -6
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +10 -4
- sglang/srt/lora/triton_ops/qkv_lora_b.py +8 -3
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +16 -5
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +11 -6
- sglang/srt/lora/utils.py +6 -0
- sglang/srt/managers/cache_controller.py +34 -11
- sglang/srt/managers/io_struct.py +4 -2
- sglang/srt/managers/mm_utils.py +202 -156
- sglang/srt/managers/multimodal_processor.py +0 -2
- sglang/srt/managers/multimodal_processors/base_processor.py +45 -77
- sglang/srt/managers/multimodal_processors/clip.py +44 -0
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +17 -58
- sglang/srt/managers/multimodal_processors/gemma3.py +12 -27
- sglang/srt/managers/multimodal_processors/janus_pro.py +21 -47
- sglang/srt/managers/multimodal_processors/llava.py +34 -14
- sglang/srt/managers/multimodal_processors/minicpm.py +35 -38
- sglang/srt/managers/multimodal_processors/mlama.py +10 -23
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -45
- sglang/srt/managers/schedule_batch.py +185 -127
- sglang/srt/managers/scheduler.py +29 -23
- sglang/srt/managers/tokenizer_manager.py +1 -2
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/managers/utils.py +1 -6
- sglang/srt/mem_cache/hiradix_cache.py +62 -52
- sglang/srt/mem_cache/memory_pool.py +72 -6
- sglang/srt/mem_cache/paged_allocator.py +39 -0
- sglang/srt/metrics/collector.py +23 -53
- sglang/srt/model_executor/cuda_graph_runner.py +16 -13
- sglang/srt/model_executor/forward_batch_info.py +10 -10
- sglang/srt/model_executor/model_runner.py +64 -59
- sglang/srt/model_loader/loader.py +19 -1
- sglang/srt/model_loader/weight_utils.py +6 -3
- sglang/srt/models/clip.py +568 -0
- sglang/srt/models/deepseek_janus_pro.py +12 -17
- sglang/srt/models/deepseek_v2.py +339 -123
- sglang/srt/models/deepseek_vl2.py +105 -104
- sglang/srt/models/gemma3_causal.py +12 -2
- sglang/srt/models/gemma3_mm.py +20 -80
- sglang/srt/models/llama.py +4 -1
- sglang/srt/models/llava.py +31 -19
- sglang/srt/models/llavavid.py +16 -7
- sglang/srt/models/minicpmo.py +63 -147
- sglang/srt/models/minicpmv.py +17 -27
- sglang/srt/models/mllama.py +29 -14
- sglang/srt/models/qwen2.py +9 -6
- sglang/srt/models/qwen2_5_vl.py +21 -31
- sglang/srt/models/qwen2_vl.py +20 -21
- sglang/srt/openai_api/adapter.py +106 -93
- sglang/srt/openai_api/protocol.py +10 -5
- sglang/srt/patch_torch.py +71 -0
- sglang/srt/platforms/interface.py +371 -0
- sglang/srt/server_args.py +120 -25
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -5
- sglang/srt/speculative/eagle_utils.py +140 -28
- sglang/srt/speculative/eagle_worker.py +94 -25
- sglang/srt/utils.py +137 -51
- sglang/test/runners.py +27 -2
- sglang/test/test_custom_ops.py +55 -0
- sglang/test/test_utils.py +14 -27
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/METADATA +10 -5
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/RECORD +108 -99
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/WHEEL +0 -0
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/top_level.txt +0 -0
```diff
--- a/sglang/srt/managers/multimodal_processors/base_processor.py
+++ b/sglang/srt/managers/multimodal_processors/base_processor.py
@@ -8,19 +8,10 @@ from typing import Optional
 
 import numpy as np
 import PIL
-import transformers
 from decord import VideoReader, cpu
-from openai import BadRequestError
 from PIL import Image
 
-from sglang.srt.utils import load_audio, load_image, logger
-
-global global_processor
-
-
-def get_global_processor():
-    global global_processor
-    return global_processor
+from sglang.srt.utils import encode_video, load_audio, load_image, logger
 
 
 @dataclasses.dataclass
@@ -28,9 +19,6 @@ class BaseMultiModalProcessorOutput:
     # input_text, with each frame of video/image represented with a image_token
     input_text: str
 
-    mm_data_hashes: Optional[list[int]]
-    # images
-    image_sizes: Optional[list[int]]
    # frames loaded from image and video, in given order
     images: Optional[list[PIL.Image]] = None
 
@@ -38,7 +26,7 @@ class BaseMultiModalProcessorOutput:
     audios: Optional[list[np.ndarray]] = None
 
     def normalize(self):
-        for field_name in ["mm_data_hashes", "image_sizes", "images", "audios"]:
+        for field_name in ["image_sizes", "images", "audios"]:
             field = getattr(self, field_name, None)
             if field is not None and isinstance(field, list) and len(field) == 0:
                 setattr(self, field_name, None)
@@ -68,28 +56,35 @@ class BaseMultimodalProcessor(ABC):
         # FIXME: not accurate, model and image specific
         self.NUM_TOKEN_PER_FRAME = 330
 
-
-
-
-        self.executor = concurrent.futures.ProcessPoolExecutor(
-            initializer=init_global_processor,
+        self.io_executor = concurrent.futures.ThreadPoolExecutor(
+            max_workers=int(os.environ.get("SGLANG_IO_WORKERS", 4))
+        )
+        self.cpu_executor = concurrent.futures.ProcessPoolExecutor(
             mp_context=mp.get_context("fork"),
-            initargs=(
-                self,
-                server_args,
-            ),
-            max_workers=int(os.environ.get("SGLANG_CPU_COUNT", os.cpu_count())),
+            max_workers=int(os.environ.get("SGLANG_CPU_WORKERS", os.cpu_count())),
         )
 
-    def
-
-
-
-
-
-
-
+    def process_mm_data(
+        self, input_text, images=None, videos=None, audios=None, **kwargs
+    ):
+        """
+        process multimodal data with transformers AutoProcessor
+        """
+        if images is not None:
+            kwargs["images"] = images
+        if videos is not None:
+            kwargs["videos"] = videos
+        if audios is not None:
+            kwargs["audios"] = audios
+
+        processor = self._processor
+        result = processor.__call__(
+            text=[input_text],
+            padding=True,
+            return_tensors="pt",
+            **kwargs,
         )
+        return result
 
     @abstractmethod
     async def process_mm_data_async(
```
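The single `executor` pool (a forked `ProcessPoolExecutor` bootstrapped through `init_global_processor`) is replaced by two pools: an I/O thread pool sized by `SGLANG_IO_WORKERS` (default 4) and a CPU process pool sized by `SGLANG_CPU_WORKERS` (default `os.cpu_count()`), and the new `process_mm_data` helper calls the Hugging Face processor held on the instance instead of a per-process global. A minimal sketch of fanning async work out to two such pools; `fetch` and `decode` are hypothetical stand-ins, not sglang functions:

```python
import asyncio
import concurrent.futures
import multiprocessing as mp
import os

# Same pool setup as the diff: threads for blocking I/O,
# forked worker processes for CPU-bound preprocessing.
io_executor = concurrent.futures.ThreadPoolExecutor(
    max_workers=int(os.environ.get("SGLANG_IO_WORKERS", 4))
)
cpu_executor = concurrent.futures.ProcessPoolExecutor(
    mp_context=mp.get_context("fork"),
    max_workers=int(os.environ.get("SGLANG_CPU_WORKERS", os.cpu_count())),
)


def fetch(url: str) -> bytes:
    return b"fake image bytes"  # hypothetical blocking download


def decode(raw: bytes) -> int:
    return len(raw)  # hypothetical CPU-bound transform


async def handle(url: str) -> int:
    loop = asyncio.get_event_loop()
    raw = await loop.run_in_executor(io_executor, fetch, url)
    return await loop.run_in_executor(cpu_executor, decode, raw)
```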
```diff
@@ -116,33 +111,9 @@ class BaseMultimodalProcessor(ABC):
 
         return estimated_frames_list
 
-    @staticmethod
-    def encode_video(video_path, frame_count_limit=None):
-        if not os.path.exists(video_path):
-            logger.error(f"Video {video_path} does not exist")
-            return []
-
-        if frame_count_limit == 0:
-            return []
-
-        def uniform_sample(l, n):
-            gap = len(l) / n
-            idxs = [int(i * gap + gap / 2) for i in range(n)]
-            return [l[i] for i in idxs]
-
-        vr = VideoReader(video_path, ctx=cpu(0))
-        sample_fps = round(vr.get_avg_fps() / 1)  # FPS
-        frame_indices = [i for i in range(0, len(vr), sample_fps)]
-        if frame_count_limit is not None and len(frame_indices) > frame_count_limit:
-            frame_indices = uniform_sample(frame_indices, frame_count_limit)
-
-        frames = vr.get_batch(frame_indices).asnumpy()
-        frames = [Image.fromarray(v.astype("uint8")) for v in frames]
-        return frames
-
     def load_mm_data(
         self,
-
+        prompt: str,
         multimodal_tokens: MultimodalSpecialTokens,
         max_req_input_len: int,
         image_data: Optional[list] = None,
```
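`encode_video` is deleted from the class body; per the import change in the first hunk, the same helper is now imported from `sglang.srt.utils`. Its frame capping is plain midpoint sampling, restated here as a dependency-free sketch:

```python
def uniform_sample(items, n):
    # Split items into n equal buckets and take each bucket's midpoint,
    # the same arithmetic as the deleted nested helper.
    gap = len(items) / n
    return [items[int(i * gap + gap / 2)] for i in range(n)]


# 120 one-per-second frame indices capped at a 30-frame limit:
frames = uniform_sample(list(range(120)), 30)
assert len(frames) == 30 and frames[:3] == [2, 6, 10]
```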
```diff
@@ -168,11 +139,11 @@ class BaseMultimodalProcessor(ABC):
         else:
             multimodal_tokens.image_token = multimodal_tokens.image_token
 
-        if isinstance(
-            assert len(
-
+        if isinstance(prompt, list) and return_text:
+            assert len(prompt) and isinstance(prompt[0], int)
+            prompt = self._processor.tokenizer.decode(prompt)
         else:
-
+            prompt = prompt
         if return_text:
             import re
 
@@ -182,7 +153,7 @@ class BaseMultimodalProcessor(ABC):
                 + ")"
             )
             # split text into list of normal text and special tokens
-            text_parts = re.split(pattern,
+            text_parts = re.split(pattern, prompt)
 
         # TODO(mick): load from server_args, env, or sampling_params
         MAX_NUM_FRAMES = 30
@@ -218,7 +189,7 @@ class BaseMultimodalProcessor(ABC):
                     ):
                         # video
                         path = image_file[len("video:") :]
-                        frames =
+                        frames = encode_video(
                             path, frame_count_limit=frames_to_process
                         )
                     else:
@@ -231,7 +202,16 @@ class BaseMultimodalProcessor(ABC):
                         continue
 
                     image_sizes += frames[0].size * len(frames)
-
+
+                    # Generate a hashable value for the image file
+                    if isinstance(image_file, Image.Image):
+                        # For PIL.Image objects, use the ID as a hashable value
+                        hash_value = hash(id(image_file))
+                    else:
+                        # For other types (strings, etc.), use the regular hash
+                        hash_value = hash(image_file)
+
+                    hashes += [hash_value] * len(frames)
                     images += frames
                     image_index += 1
                     if frames_to_process != 0:
```
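The new hashing branch separates in-memory `PIL.Image` objects from paths, URLs, and raw bytes. A likely reason: `PIL.Image.Image` defines `__eq__` without `__hash__`, so instances are not directly hashable, and `hash(id(...))` falls back to object identity. Restated standalone:

```python
from PIL import Image


def mm_hash_key(image_file):
    if isinstance(image_file, Image.Image):
        # Identity-based key for in-memory images; only stable
        # within the current process.
        return hash(id(image_file))
    # Paths, URLs, and bytes hash by content.
    return hash(image_file)
```

Note that identity keys deduplicate a frame repeated within one request, but the same image resubmitted later gets a fresh key.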
```diff
@@ -252,24 +232,12 @@ class BaseMultimodalProcessor(ABC):
 
         except Exception as e:
             logger.error(f"An exception occurred while loading images: {e}")
-            raise RuntimeError(
-                f"An exception occurred while loading images: {e}"
-            )
+            raise RuntimeError(f"An exception occurred while loading images: {e}")
 
         out = BaseMultiModalProcessorOutput(
-            mm_data_hashes=hashes,
-            image_sizes=image_sizes,
             images=images,
             audios=audios,
             input_text=new_text,
         )
         out.normalize()
         return out
-
-
-def init_global_processor(sglang_processor: BaseMultimodalProcessor, server_args):
-    """
-    Init the global processor for multimodal models."""
-    global global_processor
-    transformers.logging.set_verbosity_error()
-    global_processor = sglang_processor._build_processor(server_args=server_args)
```
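With `get_global_processor`/`init_global_processor` gone, subclasses reach the processor through `self._processor` and the shared `process_mm_data` wrapper, as each per-model processor below does. A sketch of the resulting subclass contract (the empty `models` list is a placeholder; real subclasses list the model classes they serve):

```python
from sglang.srt.managers.multimodal_processors.base_processor import (
    BaseMultimodalProcessor,
    MultimodalSpecialTokens,
)
from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem


class SketchProcessor(BaseMultimodalProcessor):
    models = []  # placeholder; real subclasses list their model classes here

    async def process_mm_data_async(
        self, image_data, input_ids, request_obj, max_req_input_len, *args, **kwargs
    ):
        # Load and normalize the raw multimodal inputs.
        base = self.load_mm_data(
            prompt=input_ids,
            image_data=image_data,
            multimodal_tokens=MultimodalSpecialTokens(image_token="<image>"),
            max_req_input_len=max_req_input_len,
        )
        # Run the HF processor held on the instance.
        res = self.process_mm_data(input_text=base.input_text, images=base.images)
        return {
            "mm_items": [
                MultimodalDataItem(
                    pixel_values=res["pixel_values"], modality=Modality.IMAGE
                )
            ],
            "input_ids": res["input_ids"].flatten().tolist(),
        }
```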
```diff
--- /dev/null
+++ b/sglang/srt/managers/multimodal_processors/clip.py
@@ -0,0 +1,44 @@
+from typing import List, Union
+
+from sglang.srt.managers.multimodal_processors.base_processor import (
+    BaseMultimodalProcessor,
+)
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.clip import CLIPModel
+from sglang.srt.utils import load_image
+
+
+class ClipImageProcessor(BaseMultimodalProcessor):
+    models = [CLIPModel]
+
+    def __init__(self, hf_config, server_args, _processor):
+        super().__init__(hf_config, server_args, _processor)
+
+    async def process_mm_data_async(
+        self, image_data: List[Union[str, bytes]], input_text, *args, **kwargs
+    ):
+        if not image_data:
+            return None
+
+        if isinstance(input_text, list):
+            assert len(input_text) and isinstance(input_text[0], int)
+            input_text = self._processor.tokenizer.decode(input_text)
+
+        if not isinstance(image_data, list):
+            image_data = [image_data]
+
+        if len(image_data) > 0:
+            images = [load_image(image)[0] for image in image_data]
+        else:
+            images = load_image(image_data[0])[0]
+
+        image_inputs = self.process_mm_data(input_text=input_text, images=images)
+        image_inputs["data_hashes"] = [hash(str(image_data))]
+        image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]
+        image_inputs["mm_items"] = [
+            MultimodalDataItem(
+                pixel_values=image_inputs["pixel_values"], modality=Modality.IMAGE
+            )
+        ]
+
+        return image_inputs
```
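The new CLIP processor advertises the model classes it serves through its `models` attribute, the registry convention shared by the other processors in this module. A sketch of resolving such a registry, assuming a simple scan (the real dispatch lives in `sglang/srt/managers/multimodal_processor.py` and may differ):

```python
from sglang.srt.managers.multimodal_processors.clip import ClipImageProcessor
from sglang.srt.managers.multimodal_processors.llava import LlavaImageProcessor

PROCESSORS = [ClipImageProcessor, LlavaImageProcessor]  # and the rest


def find_processor_cls(model_cls):
    # Pick the processor whose `models` list names the loaded model class.
    for proc_cls in PROCESSORS:
        if model_cls in proc_cls.models:
            return proc_cls
    return None
```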
```diff
--- a/sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py
+++ b/sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py
@@ -16,15 +16,14 @@
 # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
 # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-import asyncio
 
 import torch
 
 from sglang.srt.managers.multimodal_processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
-    get_global_processor,
 )
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.deepseek_vl2 import DeepseekVL2ForCausalLM
 
 
@@ -35,51 +34,6 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
         super().__init__(hf_config, server_args, _processor)
         self.IMAGE_TOKEN = "<image>"
 
-    @staticmethod
-    def _process_images_task(image, input_text, max_req_input_len):
-        processor = get_global_processor()
-        res = processor.__call__(
-            conversations=input_text, images=image, max_req_input_len=max_req_input_len
-        )
-
-        image_token_id = processor.image_token_id
-
-        res["im_token_id"] = image_token_id
-        return res
-
-    async def _process_images(self, image_data, input_text, max_req_input_len):
-        if self.executor is not None:
-            loop = asyncio.get_event_loop()
-            image_inputs = await loop.run_in_executor(
-                self.executor,
-                DeepseekVL2ImageProcessor._process_images_task,
-                image_data,
-                input_text,
-                max_req_input_len,
-            )
-        else:
-            image_inputs = self._process_images_task(
-                image_data, input_text, max_req_input_len
-            )
-
-        return image_inputs
-
-    async def _process_images(self, image_data, input_text, max_req_input_len):
-        if self.executor is not None:
-            loop = asyncio.get_event_loop()
-            image_inputs = await loop.run_in_executor(
-                self.executor,
-                DeepseekVL2ImageProcessor._process_images_task,
-                image_data,
-                input_text,
-                max_req_input_len,
-            )
-        else:
-            image_inputs = self._process_images_task(
-                image_data, input_text, max_req_input_len
-            )
-        return image_inputs
-
     async def process_mm_data_async(
         self, image_data, input_ids, request_obj, max_req_input_len, *args, **kwargs
     ):
@@ -89,8 +43,6 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
         if not isinstance(image_data, list):
             image_data = [image_data]
 
-        images, image_sizes = [], []
-
         image_token = self.IMAGE_TOKEN
         base_output = self.load_mm_data(
             input_ids,
@@ -98,8 +50,11 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
             multimodal_tokens=MultimodalSpecialTokens(image_token=image_token),
             max_req_input_len=max_req_input_len,
         )
-        res =
-            base_output.
+        res = self.process_mm_data(
+            input_text=base_output.input_text,
+            images=base_output.images,
+            max_req_input_len=max_req_input_len,
+            conversations=base_output.input_text,
         )
         images_seq_mask = res["images_seq_mask"]
         images_spatial_crop = res["images_spatial_crop"]
@@ -107,13 +62,17 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
             batched_images_spatial_crop.append(images_spatial_crop)
         batched_images_spatial_crop = torch.stack(batched_images_spatial_crop, dim=0)
 
+        items = []
+        item = MultimodalDataItem(
+            pixel_values=res["images"],
+            modality=Modality.IMAGE,
+            image_emb_mask=images_seq_mask,
+            image_spatial_crop=batched_images_spatial_crop,
+        )
+        items += [item]
+
         return {
+            "mm_items": items,
             "input_ids": res["input_ids"].tolist(),
-            "
-            "im_token_id": res["im_token_id"],
-            "data_hashes": base_output.mm_data_hashes,
-            "image_sizes": image_sizes,
-            "images_emb_mask": images_seq_mask,
-            "image_spatial_crop": batched_images_spatial_crop,
-            "modalities": request_obj.modalities or ["image"],
+            "im_token_id": self._processor.image_token_id,
         }
```
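The loose result keys this processor used to return (`images_emb_mask`, `image_spatial_crop`, `image_sizes`, `data_hashes`, `modalities`) are folded into a single `MultimodalDataItem` under `mm_items`. The construction in isolation, with placeholder tensors of arbitrary shape:

```python
import torch

from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem

# Placeholder tensors; shapes are illustrative only.
pixel_values = torch.zeros(2, 3, 384, 384)
images_seq_mask = torch.zeros(1, 32, dtype=torch.bool)
images_spatial_crop = torch.zeros(1, 2, 2, dtype=torch.long)

item = MultimodalDataItem(
    pixel_values=pixel_values,
    modality=Modality.IMAGE,
    image_emb_mask=images_seq_mask,
    image_spatial_crop=images_spatial_crop,
)
```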
```diff
--- a/sglang/srt/managers/multimodal_processors/gemma3.py
+++ b/sglang/srt/managers/multimodal_processors/gemma3.py
@@ -7,8 +7,8 @@ from sglang.srt.managers.multimodal_processor import (
 )
 from sglang.srt.managers.multimodal_processors.base_processor import (
     MultimodalSpecialTokens,
-    get_global_processor,
 )
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.gemma3_mm import Gemma3ForConditionalGeneration
 
 # Copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gemma3/image_processing_gemma3_fast.py
@@ -25,28 +25,6 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
         self.IM_START_TOKEN_ID = hf_config.boi_token_index
         self.IM_END_TOKEN_ID = hf_config.eoi_token_index
 
-    async def _process_single_image(self, images, input_text) -> dict:
-        if isinstance(images, list) and len(images) == 0:
-            images = None
-        processor = get_global_processor()
-        result = processor.__call__(
-            text=[input_text],
-            images=images,
-            padding=True,
-            return_tensors="pt",
-            # if RGBA, this needs to be set
-            # images_kwargs={
-            #     "input_data_format": ChannelDimension.FIRST
-            # }
-        )
-
-        pixel_values = getattr(result, "pixel_values", None)
-
-        return {
-            "input_ids": result.input_ids,
-            "pixel_values": pixel_values,
-        }
-
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
@@ -63,21 +41,28 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
 
         image_token = self.IMAGE_TOKEN
         base_output = self.load_mm_data(
-
+            prompt=input_ids,
             image_data=image_data,
             multimodal_tokens=MultimodalSpecialTokens(image_token=image_token),
             max_req_input_len=max_req_input_len,
             discard_alpha_channel=True,
         )
 
-        ret =
+        ret = self.process_mm_data(
             input_text=base_output.input_text, images=base_output.images
         )
 
+        items = []
+        for i, image in enumerate(base_output.images):
+            item = MultimodalDataItem(
+                pixel_values=ret["pixel_values"][i],
+                modality=Modality.IMAGE,
+            )
+            items += [item]
+
         return {
+            "mm_items": items,
             "input_ids": ret["input_ids"].flatten().tolist(),
-            "pixel_values": ret["pixel_values"],
-            "data_hashes": base_output.mm_data_hashes,
             "im_start_id": self.IM_START_TOKEN_ID,
             "im_end_id": self.IM_END_TOKEN_ID,
         }
```
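Where the DeepSeek-VL2 hunk packs all images into one item, this processor emits one `MultimodalDataItem` per input image by indexing the stacked `pixel_values`, assuming the processor stacks them in the same order as `base_output.images`. The same construction as a comprehension over placeholder data:

```python
import torch

from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem

pixel_values = torch.zeros(2, 3, 896, 896)  # placeholder: two stacked images

items = [
    MultimodalDataItem(pixel_values=pixel_values[i], modality=Modality.IMAGE)
    for i in range(pixel_values.shape[0])
]
assert len(items) == 2
```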
```diff
--- a/sglang/srt/managers/multimodal_processors/janus_pro.py
+++ b/sglang/srt/managers/multimodal_processors/janus_pro.py
@@ -1,11 +1,10 @@
-import asyncio
 from typing import List, Union
 
 from sglang.srt.managers.multimodal_processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
-    get_global_processor,
 )
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.deepseek_janus_pro import MultiModalityCausalLM
 
 
@@ -15,37 +14,6 @@ class JanusProImageProcessor(BaseMultimodalProcessor):
     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
 
-    @staticmethod
-    def _process_images_task(images, input_text):
-        processor = get_global_processor()
-        result = processor.__call__(
-            prompt=input_text, images=images, return_tensors="pt"
-        )
-        return {
-            "input_ids": result["input_ids"],
-            "pixel_values": result["pixel_values"],
-            "images_emb_mask": result["images_emb_mask"],
-            "im_start_id": processor.image_start_id,
-            "im_end_id": processor.image_end_id,
-            "im_token_id": processor.image_id,
-        }
-
-    async def _process_images(self, images, input_text):
-        if self.executor is not None:
-            loop = asyncio.get_event_loop()
-            image_inputs = await loop.run_in_executor(
-                self.executor,
-                JanusProImageProcessor._process_images_task,
-                images,
-                input_text,
-            )
-        else:
-            image_inputs = self._processor(
-                images=images, text=input_text, return_tensors="pt"
-            )
-
-        return image_inputs
-
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
@@ -60,25 +28,31 @@ class JanusProImageProcessor(BaseMultimodalProcessor):
         if not isinstance(image_data, list):
             image_data = [image_data]
 
+        processor = self._processor
+
         base_out = self.load_mm_data(
-
+            prompt=input_ids,
             image_data=image_data,
-            multimodal_tokens=MultimodalSpecialTokens(
-                image_token="<image_placeholder>"
-            ),
+            multimodal_tokens=MultimodalSpecialTokens(image_token=processor.image_tag),
             max_req_input_len=max_req_input_len,
         )
+
         images = base_out.images
-        res =
-
-
-
+        res = self.process_mm_data(
+            input_text=base_out.input_text,
+            prompt=base_out.input_text,
+            images=images,
+        )
         return {
+            "mm_items": [
+                MultimodalDataItem(
+                    pixel_values=res["pixel_values"],
+                    image_emb_mask=res["images_emb_mask"],
+                    modality=Modality.IMAGE,
+                )
+            ],
             "input_ids": res["input_ids"].flatten().tolist(),
-            "
-            "
-            "
-            "im_start_id": res["im_start_id"],
-            "im_end_id": res["im_end_id"],
-            "im_token_id": res["im_token_id"],
+            "im_start_id": processor.image_start_id,
+            "im_end_id": processor.image_end_id,
+            "im_token_id": processor.image_id,
         }
```
```diff
--- a/sglang/srt/managers/multimodal_processors/llava.py
+++ b/sglang/srt/managers/multimodal_processors/llava.py
@@ -5,17 +5,26 @@ import numpy as np
 
 from sglang.srt.managers.multimodal_processors.base_processor import (
     BaseMultimodalProcessor,
-    get_global_processor,
 )
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.mm_utils import expand2square, process_anyres_image
-from sglang.srt.models.llava import
+from sglang.srt.models.llava import (
+    LlavaLlamaForCausalLM,
+    LlavaMistralForCausalLM,
+    LlavaQwenForCausalLM,
+)
 from sglang.srt.models.llavavid import LlavaVidForCausalLM
 from sglang.srt.utils import load_image, logger
 from sglang.utils import get_exception_traceback
 
 
 class LlavaImageProcessor(BaseMultimodalProcessor):
-    models = [
+    models = [
+        LlavaLlamaForCausalLM,
+        LlavaVidForCausalLM,
+        LlavaQwenForCausalLM,
+        LlavaMistralForCausalLM,
+    ]
 
     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
@@ -25,11 +34,10 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
         image_data: Union[str, bytes],
         image_aspect_ratio: Optional[str] = None,
         image_grid_pinpoints: Optional[str] = None,
-
+        processor=None,
     ):
-        processor = get_global_processor()
 
-        image_processor =
+        image_processor = processor.image_processor
 
         try:
             image, image_size = load_image(image_data)
@@ -72,18 +80,22 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
     async def _process_single_image(
         self, image_data: Union[bytes, str], aspect_ratio: str, grid_pinpoints: str
     ):
-        if self.executor is not None:
+        if self.cpu_executor is not None:
             loop = asyncio.get_event_loop()
             return await loop.run_in_executor(
-                self.executor,
+                self.cpu_executor,
                 LlavaImageProcessor._process_single_image_task,
                 image_data,
                 aspect_ratio,
                 grid_pinpoints,
+                self._processor,
             )
         else:
             return self._process_single_image_task(
-                image_data,
+                image_data,
+                aspect_ratio,
+                grid_pinpoints,
+                self._processor.image_processor,
             )
 
     async def process_mm_data_async(
@@ -134,14 +146,22 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
             pixel_values, image_hash, image_size = await self._process_single_image(
                 image_data[0], aspect_ratio, grid_pinpoints
             )
-            data_hashes = [image_hash]
             image_sizes = [image_size]
         else:
             raise ValueError(f"Invalid image data: {image_data}")
+        modality = Modality.IMAGE
+        if isinstance(request_obj.modalities, list):
+            if request_obj.modalities[0] == "multi-images":
+                modality = Modality.MULTI_IMAGES
+            elif request_obj.modalities[0] == "video":
+                modality = Modality.VIDEO
 
         return {
-            "
-
-
-
+            "mm_items": [
+                MultimodalDataItem(
+                    pixel_values=pixel_values,
+                    image_sizes=image_sizes,
+                    modality=modality,
+                )
+            ],
         }
```
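The final hunk replaces the old `modalities` pass-through with an explicit mapping from the request's modalities list to a `Modality` stored on the item. The same selection as a standalone function:

```python
from sglang.srt.managers.schedule_batch import Modality


def pick_modality(modalities):
    # First entry of the request's modalities list upgrades the default.
    modality = Modality.IMAGE
    if isinstance(modalities, list):
        if modalities[0] == "multi-images":
            modality = Modality.MULTI_IMAGES
        elif modalities[0] == "video":
            modality = Modality.VIDEO
    return modality
```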