sglang 0.4.9__py3-none-any.whl → 0.4.9.post1__py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their public registries.
- sglang/bench_serving.py +2 -2
- sglang/srt/configs/model_config.py +12 -1
- sglang/srt/conversation.py +35 -1
- sglang/srt/disaggregation/mooncake/conn.py +35 -4
- sglang/srt/entrypoints/http_server_engine.py +1 -1
- sglang/srt/layers/communicator.py +3 -1
- sglang/srt/layers/flashinfer_comm_fusion.py +3 -3
- sglang/srt/layers/layernorm.py +2 -2
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +215 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +58 -0
- sglang/srt/layers/moe/ep_moe/layer.py +140 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +2 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +135 -58
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +176 -0
- sglang/srt/layers/quantization/__init__.py +2 -0
- sglang/srt/layers/quantization/fp8.py +28 -7
- sglang/srt/layers/quantization/modelopt_quant.py +244 -1
- sglang/srt/layers/quantization/w4afp8.py +264 -0
- sglang/srt/layers/vocab_parallel_embedding.py +9 -3
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +30 -19
- sglang/srt/lora/triton_ops/qkv_lora_b.py +30 -19
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +27 -11
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +27 -15
- sglang/srt/managers/cache_controller.py +41 -195
- sglang/srt/managers/io_struct.py +8 -1
- sglang/srt/managers/mm_utils.py +4 -2
- sglang/srt/managers/schedule_batch.py +1 -1
- sglang/srt/managers/scheduler.py +17 -5
- sglang/srt/mem_cache/hiradix_cache.py +2 -0
- sglang/srt/mem_cache/memory_pool.py +113 -63
- sglang/srt/mem_cache/memory_pool_host.py +6 -109
- sglang/srt/mem_cache/radix_cache.py +8 -4
- sglang/srt/models/deepseek_v2.py +16 -2
- sglang/srt/models/mllama4.py +360 -79
- sglang/srt/multimodal/mm_utils.py +2 -2
- sglang/srt/multimodal/processors/mllama4.py +62 -60
- sglang/srt/server_args.py +15 -0
- sglang/srt/two_batch_overlap.py +3 -0
- sglang/srt/utils.py +37 -17
- sglang/test/test_cutlass_w4a8_moe.py +281 -0
- sglang/utils.py +5 -5
- sglang/version.py +1 -1
- {sglang-0.4.9.dist-info → sglang-0.4.9.post1.dist-info}/METADATA +4 -3
- {sglang-0.4.9.dist-info → sglang-0.4.9.post1.dist-info}/RECORD +47 -43
- {sglang-0.4.9.dist-info → sglang-0.4.9.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.dist-info → sglang-0.4.9.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.dist-info → sglang-0.4.9.post1.dist-info}/top_level.txt +0 -0
sglang/srt/models/mllama4.py
CHANGED
@@ -1,3 +1,6 @@
+import json as json_lib
+import logging
+import os
 from collections.abc import Iterable
 from typing import List, Optional, Set, Tuple

@@ -19,6 +22,13 @@ from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.utils import add_prefix, is_cpu

 _is_cpu = is_cpu()
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.utils import add_prefix
+
+logger = logging.getLogger(__name__)


 class Llama4ForConditionalGeneration(nn.Module):
@@ -37,19 +47,85 @@ class Llama4ForConditionalGeneration(nn.Module):
         self.config = config
         self.quant_config = quant_config

-
-        self.
+        # Check if this is a text-only model (modelopt fp8 llama4 has no vision components)
+        self.has_vision = self._has_vision_weights(config)
+        if not self.has_vision:
+            logger.warning(
+                "No vision weights found in checkpoint. Model will run in text-only mode. "
+                "Multimodal capabilities (image processing) will be unavailable."
+            )
+
+        if self.has_vision:
+            self.vision_model = Llama4VisionModel(config.vision_config)
+            self.multi_modal_projector = Llama4MultiModalProjector(config)
+        else:
+            self.vision_model = None
+            self.multi_modal_projector = None

         # Initialize the language model
         from sglang.srt.models.llama4 import Llama4ForCausalLM

         self.language_model = Llama4ForCausalLM(
-            config.text_config,
+            config.text_config if hasattr(config, "text_config") else config,
             quant_config=quant_config,
             prefix=add_prefix("language_model", prefix),
         )

-        self.logits_processor = LogitsProcessor(
+        self.logits_processor = LogitsProcessor(
+            config.text_config if hasattr(config, "text_config") else config
+        )
+
+    def _has_vision_weights(self, config) -> bool:
+        """Check if the model has vision components by examining the checkpoint."""
+        model_path = getattr(config, "_name_or_path", None)
+        if not model_path:
+            return False
+
+        # Check if this is a local path first
+        if os.path.isdir(model_path):
+            index_file = os.path.join(model_path, "model.safetensors.index.json")
+            if os.path.exists(index_file):
+                return self._check_vision_weights_in_index(index_file)
+
+        # For HuggingFace models, we need to check the actual checkpoint
+        # The config might say it's multimodal, but the checkpoint might be text-only
+        try:
+            # Try to access the HuggingFace cache directory
+            from huggingface_hub import try_to_load_from_cache
+
+            # Check if index file exists in cache
+            index_file_path = try_to_load_from_cache(
+                repo_id=model_path,
+                filename="model.safetensors.index.json",
+                cache_dir=None,
+            )
+
+            if index_file_path and os.path.exists(index_file_path):
+                return self._check_vision_weights_in_index(index_file_path)
+
+        except Exception:
+            # If we can't access the cache, fall back to config-based detection
+            pass
+
+        # Fallback, assume text-only
+        return False
+
+    def _check_vision_weights_in_index(self, index_file: str) -> bool:
+        """Check if the model.safetensors.index.json contains vision weights."""
+        try:
+            with open(index_file, "r") as f:
+                index_data = json_lib.load(f)
+
+            vision_patterns = ["vision_model", "vision_tower", "multi_modal_projector"]
+            weight_names = index_data.get("weight_map", {}).keys()
+
+            return any(
+                pattern in weight_name
+                for weight_name in weight_names
+                for pattern in vision_patterns
+            )
+        except (OSError, json_lib.JSONDecodeError, KeyError):
+            return False

     def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
         pattern = MultiModalityDataPaddingPatternMultimodalTokens()
@@ -59,6 +135,10 @@ class Llama4ForConditionalGeneration(nn.Module):
         self,
         items: List[MultimodalDataItem],
     ) -> torch.Tensor:
+        # For text-only models, return None or raise an error
+        if not self.has_vision or self.vision_model is None:
+            raise ValueError("Vision model not available for text-only checkpoint")
+
         pixel_values = (
             torch.concat([item.pixel_values for item in items])
             .to(next(self.vision_model.parameters()).device)
@@ -79,11 +159,14 @@ class Llama4ForConditionalGeneration(nn.Module):
         **kwargs: object,
     ) -> torch.Tensor:

+        # For text-only models, pass None for image_data_embedding_func
+        image_embedding_func = self.get_image_feature if self.has_vision else None
+
         hs = general_mm_embed_routine(
             input_ids=input_ids,
             forward_batch=forward_batch,
             language_model=self.language_model,
-            image_data_embedding_func=
+            image_data_embedding_func=image_embedding_func,
             positions=positions,
         )

@@ -124,7 +207,6 @@ class Llama4ForConditionalGeneration(nn.Module):
         return name, loaded_weight

     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]:
-
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             (".self_attn.qkv_proj", ".self_attn.q_proj", "q"),
@@ -137,11 +219,12 @@ class Llama4ForConditionalGeneration(nn.Module):
         ]

         params_dict = dict(self.named_parameters())
+        num_experts = (
+            self.config.text_config.num_local_experts
+            if hasattr(self.config, "text_config")
+            else self.config.num_local_experts
+        )

-        num_experts = self.config.text_config.num_local_experts
-
-        # Params for weights, fp8 weight scales, fp8 activation scales
-        # (param_name, weight_name, expert_id, shard_id)
         expert_params_mapping = FusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
@@ -150,81 +233,279 @@ class Llama4ForConditionalGeneration(nn.Module):
         )

         for name, loaded_weight in weights:
-            if
+            if self._should_skip_weight(name):
+                continue
+
+            name = self._transform_weight_name(name)
+
+            if "vision" not in name:
                 name, loaded_weight = self.permute_qk_weight_for_rotary(
                     name, loaded_weight
                 )

-
-
-
-
-
-
-
-
-
-
-
+            if self._handle_scale_remapping(name, params_dict):
+                continue
+
+            if self._handle_stacked_params(
+                name, loaded_weight, stacked_params_mapping, params_dict
+            ):
+                continue
+
+            if self._handle_expert_weights(
+                name, loaded_weight, expert_params_mapping, params_dict, num_experts
+            ):
+                continue
+
+            self._handle_default_weight(name, loaded_weight, params_dict)
+
+    def _should_skip_weight(self, name: str) -> bool:
+        """Check if we should skip loading this weight."""
+        return "vision" in name and not self.has_vision
+
+    def _transform_weight_name(self, name: str) -> str:
+        """Transform weight name by adding language_model prefix if needed."""
+        if (
+            not name.startswith("language_model.")
+            and "vision" not in name
+            and "multi_modal_projector" not in name
+        ):
+            return f"language_model.{name}"
+        return name
+
+    def _handle_scale_remapping(self, name: str, params_dict: dict) -> bool:
+        """Handle scale parameter remapping. Returns True if handled."""
+        if "scale" in name and "expert" not in name:
+            remapped_name = maybe_remap_kv_scale_name(name, params_dict)
+            return remapped_name is None
+        return False
+
+    def _handle_stacked_params(
+        self,
+        name: str,
+        loaded_weight: torch.Tensor,
+        stacked_params_mapping: list,
+        params_dict: dict,
+    ) -> bool:
+        """Handle stacked parameter loading. Returns True if handled."""
+        for param_name, weight_name, shard_id in stacked_params_mapping:
+            if weight_name in name and "vision" not in name:
+                transformed_name = name.replace(weight_name, param_name)
+                param = params_dict[transformed_name]
+                param.weight_loader(param, loaded_weight, shard_id)
+                return True
+        return False
+
+    def _handle_expert_weights(
+        self,
+        name: str,
+        loaded_weight: torch.Tensor,
+        expert_params_mapping: list,
+        params_dict: dict,
+        num_experts: int,
+    ) -> bool:
+        """Handle expert weight loading for MoE (Mixture of Experts) layers.
+
+        Args:
+            name: Parameter name from the checkpoint
+            loaded_weight: The weight tensor to be loaded
+            expert_params_mapping: Mapping of parameter names to expert configurations
+            params_dict: Dictionary of model parameters
+            num_experts: Total number of experts in the MoE layer
+
+        Returns:
+            bool: True if the parameter was handled (is an expert parameter), False otherwise
+        """
+        if ".experts" not in name:
+            return False
+
+        if "experts.gate_up_proj" not in name and "experts.down_proj" not in name:
+            return self._handle_other_expert_params(
+                name, loaded_weight, expert_params_mapping, params_dict
+            )
+
+        if "scale" in name:
+            return self._handle_expert_scale_params(
+                name, loaded_weight, params_dict, num_experts
+            )
+        else:
+            return self._handle_expert_weight_params(
+                name, loaded_weight, params_dict, num_experts
+            )
+
+    def _handle_other_expert_params(
+        self,
+        name: str,
+        loaded_weight: torch.Tensor,
+        expert_params_mapping: list,
+        params_dict: dict,
+    ) -> bool:
+        """Handle expert parameters that are not gate_up_proj or down_proj weights.
+
+        Args:
+            name: Parameter name from the checkpoint
+            loaded_weight: The weight tensor to be loaded
+            expert_params_mapping: List of tuples mapping checkpoint names to model parameters
+            params_dict: Dictionary of model parameters
+
+        Returns:
+            bool: True if parameter was found and handled, False otherwise
+        """
+        for param_name, weight_name, expert_id, shard_id in expert_params_mapping:
+            if weight_name in name:
+                transformed_name = name.replace(weight_name, param_name)
+                param = params_dict[transformed_name]
+                param.weight_loader(
+                    param, loaded_weight, name, shard_id=shard_id, expert_id=expert_id
+                )
+                return True
+        return False
+
+    def _transform_expert_name(
+        self, name: str, is_weight: bool = False
+    ) -> Tuple[str, str, List[str]]:
+        """Transform expert parameter name and get shard information.
+
+        Args:
+            name: The original parameter name
+            is_weight: Whether this is a weight parameter (adds _weight suffix)
+
+        Returns:
+            Tuple of (transformed_name, shard_id, shard_id_list)
+        """
+        suffix = "_weight" if is_weight else ""
+
+        if ".gate_up_proj" in name:
+            transformed_name = name.replace(
+                ".experts.gate_up_proj", f".experts.w13{suffix}"
+            )
+            shard_id = "w13"
+            shard_id_list = ["w1", "w3"]
+        else:  # down_proj
+            transformed_name = name.replace(
+                ".experts.down_proj", f".experts.w2{suffix}"
+            )
+            shard_id = "w2"
+            shard_id_list = ["w2"]
+
+        return transformed_name, shard_id, shard_id_list
+
+    def _handle_expert_scale_params(
+        self,
+        name: str,
+        loaded_weight: torch.Tensor,
+        params_dict: dict,
+        num_experts: int,
+    ) -> bool:
+        """Handle quantization scale parameters for expert weights.
+
+        Args:
+            name: Parameter name containing scale information
+            loaded_weight: Scale tensor to be loaded
+            params_dict: Dictionary of model parameters
+            num_experts: Total number of experts for broadcast operations
+
+        Returns:
+            bool: True (always handles scale parameters)
+        """
+        import re
+
+        # Check if this matches the expert parameter pattern: experts.{expert_id}.{param_name}
+        expert_match = re.search(r"experts\.(\d+)\.", name)
+
+        # Transform name
+        transformed_name, _, _ = self._transform_expert_name(name)
+
+        if transformed_name not in params_dict:
+            return True
+
+        param = params_dict[transformed_name]
+
+        # Handle scale parameters
+        if expert_match:
+            # If we have a specific expert ID, only load for that expert
+            expert_id = int(expert_match.group(1))
+            # For scale parameters, we can directly set the value
+            param.data[expert_id] = loaded_weight
+        else:
+            # No expert ID found - this is a single scale for all experts
+            # Load the same scale for all experts
+            for expert_id in range(num_experts):
+                param.data[expert_id] = loaded_weight
+
+        return True
+
+    def _handle_expert_weight_params(
+        self,
+        name: str,
+        loaded_weight: torch.Tensor,
+        params_dict: dict,
+        num_experts: int,
+    ) -> bool:
+        """Handle actual weight tensors for expert layers (gate_up_proj and down_proj).
+
+        Args:
+            name: Parameter name (should contain gate_up_proj or down_proj)
+            loaded_weight: Weight tensor(s) to be loaded
+            params_dict: Dictionary of model parameters
+            num_experts: Total number of experts for tensor distribution
+
+        Returns:
+            bool: True (always handles weight parameters)
+        """
+        # Transform name and get shard info
+        transformed_name, _, shard_id_list = self._transform_expert_name(
+            name, is_weight=True
+        )
+
+        if ".gate_up_proj" in name:
+            loaded_weight_list = loaded_weight.chunk(2, dim=-1)
+        else:  # down_proj
+            loaded_weight_list = [loaded_weight]
+
+        for param_name, weight_chunk, shard_id in zip(
+            [transformed_name] * len(shard_id_list), loaded_weight_list, shard_id_list
+        ):
+            if param_name not in params_dict:
+                continue
+
+            param = params_dict[param_name]
+            weight_loader = param.weight_loader
+
+            # Handle the case where loaded_weight might be a single tensor for all experts
+            if weight_chunk.dim() == 2:
+                # Single tensor case - load for all experts
+                for expert_id in range(num_experts):
+                    weight_loader(
+                        param,
+                        weight_chunk.T,
+                        param_name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
             else:
-
-
-
-
-
-
-
-
-                        if weight_name not in name:
-                            continue
-                        name = name.replace(weight_name, param_name)
-                        param = params_dict[name]
-                        weight_loader = param.weight_loader
-                        weight_loader(
-                            param,
-                            loaded_weight,
-                            name,
-                            shard_id=shard_id,
-                            expert_id=expert_id,
-                        )
-                        break
-                    else:
-                        if ".gate_up_proj" in name:
-                            name_list = [
-                                name.replace(
-                                    ".experts.gate_up_proj", ".experts.w13_weight"
-                                )
-                            ] * 2
-                            loaded_weight_list = loaded_weight.chunk(2, dim=-1)
-                            shard_id_list = ["w1", "w3"]
-                        else:
-                            name_list = [
-                                name.replace(".experts.down_proj", ".experts.w2_weight")
-                            ]
-                            shard_id_list = ["w2"]
-                            loaded_weight_list = [loaded_weight]
-                        for name, loaded_weight, shard_id in zip(
-                            name_list, loaded_weight_list, shard_id_list
-                        ):
-                            param = params_dict[name]
-                            weight_loader = param.weight_loader
-                            for expert_id in range(num_experts):
-                                weight_loader(
-                                    param,
-                                    loaded_weight[expert_id].T,
-                                    name,
-                                    shard_id=shard_id,
-                                    expert_id=expert_id,
-                                )
-                else:
-                    # Skip loading extra bias for GPTQ models.
-                    if name.endswith(".bias") and name not in params_dict:
-                        continue
-                    param = params_dict[name]
-                    weight_loader = getattr(
-                        param, "weight_loader", default_weight_loader
+                # Multiple experts case - load each expert's weights
+                for expert_id in range(num_experts):
+                    weight_loader(
+                        param,
+                        weight_chunk[expert_id].T,
+                        param_name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
                     )
-
+
+        return True
+
+    def _handle_default_weight(
+        self, name: str, loaded_weight: torch.Tensor, params_dict: dict
+    ):
+        """Handle default weight loading."""
+        # Skip loading extra bias for GPTQ models
+        if name.endswith(".bias") and name not in params_dict:
+            return
+
+        param = params_dict[name]
+        weight_loader = getattr(param, "weight_loader", default_weight_loader)
+        weight_loader(param, loaded_weight)

     def set_eagle3_layers_to_capture(self, layer_ids: Optional[List[int]] = None):
         if hasattr(self.language_model, "set_eagle3_layers_to_capture"):
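The text-only detection added above boils down to scanning the checkpoint's safetensors weight index for vision-related parameter names. A standalone sketch of that check for a locally downloaded checkpoint directory; the helper name and the assumption that the index file sits next to the weights mirror the diff and are not a public sglang API:

import json
import os


def checkpoint_has_vision_weights(model_dir: str) -> bool:
    """Return True if the safetensors index lists any vision weights."""
    index_file = os.path.join(model_dir, "model.safetensors.index.json")
    if not os.path.exists(index_file):
        # Same fallback as the diff: no index file, assume text-only.
        return False
    with open(index_file) as f:
        weight_map = json.load(f).get("weight_map", {})
    patterns = ("vision_model", "vision_tower", "multi_modal_projector")
    return any(p in name for name in weight_map for p in patterns)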

sglang/srt/multimodal/mm_utils.py
CHANGED
@@ -28,12 +28,12 @@ LLaVA-Onevision : https://arxiv.org/pdf/2408.03326

 """
 import ast
-import base64
 import math
 import re
 from io import BytesIO

 import numpy as np
+import pybase64
 from PIL import Image

 from sglang.srt.utils import flatten_nested_list
@@ -252,7 +252,7 @@ def process_anyres_image(image, processor, grid_pinpoints):


 def load_image_from_base64(image):
-    return Image.open(BytesIO(
+    return Image.open(BytesIO(pybase64.b64decode(image, validate=True)))


 def expand2square(pil_img, background_color):
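The only change in this file swaps the standard-library base64 decoder for pybase64 with strict validation. A minimal self-contained sketch of the resulting helper, assuming pybase64 and Pillow are installed; it mirrors the diff rather than defining any new sglang API:

from io import BytesIO

import pybase64
from PIL import Image


def load_image_from_base64(image):
    # validate=True rejects malformed base64 instead of silently ignoring stray bytes.
    return Image.open(BytesIO(pybase64.b64decode(image, validate=True)))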

sglang/srt/multimodal/processors/mllama4.py
CHANGED
@@ -60,70 +60,72 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
         )

         # Handle image resolutions and aspect ratios
-        if "pixel_values" in processor_output:
-
-            tokenizer = self._processor.tokenizer
+        if "pixel_values" not in processor_output:  # no image processed
+            return None

-
-
-            max_num_tiles = getattr(self.vision_config, "max_patches", 1)
+        image_processor = processor.image_processor
+        tokenizer = self._processor.tokenizer

-
-
-
+        # Calculate tile size and find supported resolutions
+        tile_size = self.vision_config.image_size
+        max_num_tiles = getattr(self.vision_config, "max_patches", 1)
+
+        possible_resolutions = find_supported_resolutions(
+            max_num_chunks=max_num_tiles,
+            patch_size=SizeDict(height=tile_size, width=tile_size),
+        )
+
+        # Find best fit for each image
+        best_fit_sizes = [
+            get_best_fit(
+                (image.size[1], image.size[0]),  # (height, width)
+                torch.tensor(possible_resolutions),
+                resize_to_max_canvas=image_processor.resize_to_max_canvas,
            )
+            for image in processed_data.images
+        ]
+
+        # Calculate aspect ratios and patches per image
+        aspect_ratios = [
+            (image_size[0] // tile_size, image_size[1] // tile_size)
+            for image_size in best_fit_sizes
+        ]
+
+        patches_per_image = [
+            1 if r_h * r_w == 1 else 1 + r_h * r_w for (r_h, r_w) in aspect_ratios
+        ]
+
+        # Add to image_inputs
+        processor_output["aspect_ratios"] = aspect_ratios
+        processor_output["patches_per_image"] = torch.tensor(patches_per_image)
+
+        # Process embed_is_patch
+        vocab = tokenizer.get_vocab()
+        patch_id = vocab.get(processor.img_patch_token, -1)
+        image_end_id = vocab.get(processor.end_of_img_token, -1)
+
+        if patch_id != -1 and image_end_id != -1:
+            input_ids = processor_output["input_ids"].view(-1)
+
+            # Remove BOS token if present
+            if input_ids.size(0) > 0 and input_ids[0] == tokenizer.bos_token_id:
+                input_ids = input_ids[1:]
+
+            # Find image end indices and split input_ids
+            image_end_indices = (input_ids == image_end_id).nonzero().view(-1)
+
+            if image_end_indices.size(0) > 0:
+                # Split at image boundaries
+                split_indices = (image_end_indices + 1)[:-1]
+                split_input_ids = torch.tensor_split(input_ids, split_indices)
+                split_input_ids = [x for x in split_input_ids if x.numel() > 0]
+
+                # Create embed_is_patch for each image
+                embed_is_patch = []
+                for per_image_input_ids in split_input_ids:
+                    embed_is_patch.append(per_image_input_ids == patch_id)

-
-            best_fit_sizes = [
-                get_best_fit(
-                    (image.size[1], image.size[0]),  # (height, width)
-                    torch.tensor(possible_resolutions),
-                    resize_to_max_canvas=image_processor.resize_to_max_canvas,
-                )
-                for image in processed_data.images
-            ]
-
-            # Calculate aspect ratios and patches per image
-            aspect_ratios = [
-                (image_size[0] // tile_size, image_size[1] // tile_size)
-                for image_size in best_fit_sizes
-            ]
-
-            patches_per_image = [
-                1 if r_h * r_w == 1 else 1 + r_h * r_w for (r_h, r_w) in aspect_ratios
-            ]
-
-            # Add to image_inputs
-            processor_output["aspect_ratios"] = aspect_ratios
-            processor_output["patches_per_image"] = torch.tensor(patches_per_image)
-
-            # Process embed_is_patch
-            vocab = tokenizer.get_vocab()
-            patch_id = vocab.get(processor.img_patch_token, -1)
-            image_end_id = vocab.get(processor.end_of_img_token, -1)
-
-            if patch_id != -1 and image_end_id != -1:
-                input_ids = processor_output["input_ids"].view(-1)
-
-                # Remove BOS token if present
-                if input_ids.size(0) > 0 and input_ids[0] == tokenizer.bos_token_id:
-                    input_ids = input_ids[1:]
-
-                # Find image end indices and split input_ids
-                image_end_indices = (input_ids == image_end_id).nonzero().view(-1)
-
-                if image_end_indices.size(0) > 0:
-                    # Split at image boundaries
-                    split_indices = (image_end_indices + 1)[:-1]
-                    split_input_ids = torch.tensor_split(input_ids, split_indices)
-                    split_input_ids = [x for x in split_input_ids if x.numel() > 0]
-
-                    # Create embed_is_patch for each image
-                    embed_is_patch = []
-                    for per_image_input_ids in split_input_ids:
-                        embed_is_patch.append(per_image_input_ids == patch_id)
-
-                    processor_output["embed_is_patch"] = embed_is_patch
+                processor_output["embed_is_patch"] = embed_is_patch

         # Convert to the format expected by SGLang
         processor_output["input_ids"] = processor_output["input_ids"].tolist()[0]