sglang 0.4.8.post1__py3-none-any.whl → 0.4.9.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158)
  1. sglang/bench_one_batch_server.py +17 -2
  2. sglang/bench_serving.py +170 -24
  3. sglang/srt/configs/internvl.py +4 -2
  4. sglang/srt/configs/janus_pro.py +1 -1
  5. sglang/srt/configs/model_config.py +60 -1
  6. sglang/srt/configs/update_config.py +119 -0
  7. sglang/srt/conversation.py +69 -1
  8. sglang/srt/disaggregation/decode.py +21 -5
  9. sglang/srt/disaggregation/mooncake/conn.py +35 -4
  10. sglang/srt/disaggregation/nixl/conn.py +6 -6
  11. sglang/srt/disaggregation/prefill.py +2 -2
  12. sglang/srt/disaggregation/utils.py +1 -1
  13. sglang/srt/distributed/parallel_state.py +44 -17
  14. sglang/srt/entrypoints/EngineBase.py +8 -0
  15. sglang/srt/entrypoints/engine.py +40 -6
  16. sglang/srt/entrypoints/http_server.py +111 -24
  17. sglang/srt/entrypoints/http_server_engine.py +1 -1
  18. sglang/srt/entrypoints/openai/protocol.py +4 -2
  19. sglang/srt/eplb/__init__.py +0 -0
  20. sglang/srt/{managers → eplb}/eplb_algorithms/__init__.py +1 -1
  21. sglang/srt/{managers → eplb}/eplb_manager.py +2 -4
  22. sglang/srt/{eplb_simulator → eplb/eplb_simulator}/reader.py +1 -1
  23. sglang/srt/{managers → eplb}/expert_distribution.py +1 -5
  24. sglang/srt/{managers → eplb}/expert_location.py +1 -1
  25. sglang/srt/{managers → eplb}/expert_location_dispatch.py +1 -1
  26. sglang/srt/{model_executor → eplb}/expert_location_updater.py +17 -1
  27. sglang/srt/hf_transformers_utils.py +2 -1
  28. sglang/srt/layers/activation.py +2 -2
  29. sglang/srt/layers/amx_utils.py +86 -0
  30. sglang/srt/layers/attention/ascend_backend.py +219 -0
  31. sglang/srt/layers/attention/flashattention_backend.py +32 -9
  32. sglang/srt/layers/attention/tbo_backend.py +37 -9
  33. sglang/srt/layers/communicator.py +20 -2
  34. sglang/srt/layers/dp_attention.py +9 -3
  35. sglang/srt/layers/elementwise.py +76 -12
  36. sglang/srt/layers/flashinfer_comm_fusion.py +202 -0
  37. sglang/srt/layers/layernorm.py +26 -0
  38. sglang/srt/layers/linear.py +84 -14
  39. sglang/srt/layers/logits_processor.py +4 -4
  40. sglang/srt/layers/moe/cutlass_w4a8_moe.py +215 -0
  41. sglang/srt/layers/moe/ep_moe/kernels.py +81 -8
  42. sglang/srt/layers/moe/ep_moe/layer.py +176 -15
  43. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +23 -17
  44. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +3 -2
  45. sglang/srt/layers/moe/fused_moe_triton/layer.py +211 -74
  46. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +176 -0
  47. sglang/srt/layers/moe/router.py +60 -22
  48. sglang/srt/layers/moe/topk.py +10 -28
  49. sglang/srt/layers/parameter.py +67 -7
  50. sglang/srt/layers/quantization/__init__.py +2 -0
  51. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +1 -1
  52. sglang/srt/layers/quantization/fp8.py +72 -7
  53. sglang/srt/layers/quantization/fp8_kernel.py +1 -1
  54. sglang/srt/layers/quantization/fp8_utils.py +1 -2
  55. sglang/srt/layers/quantization/gptq.py +5 -1
  56. sglang/srt/layers/quantization/modelopt_quant.py +244 -1
  57. sglang/srt/layers/quantization/moe_wna16.py +1 -1
  58. sglang/srt/layers/quantization/quant_utils.py +166 -0
  59. sglang/srt/layers/quantization/w4afp8.py +264 -0
  60. sglang/srt/layers/quantization/w8a8_int8.py +52 -1
  61. sglang/srt/layers/rotary_embedding.py +2 -2
  62. sglang/srt/layers/vocab_parallel_embedding.py +20 -10
  63. sglang/srt/lora/lora.py +4 -5
  64. sglang/srt/lora/lora_manager.py +73 -20
  65. sglang/srt/lora/triton_ops/gate_up_lora_b.py +30 -19
  66. sglang/srt/lora/triton_ops/qkv_lora_b.py +30 -19
  67. sglang/srt/lora/triton_ops/sgemm_lora_a.py +27 -11
  68. sglang/srt/lora/triton_ops/sgemm_lora_b.py +27 -15
  69. sglang/srt/managers/cache_controller.py +41 -195
  70. sglang/srt/managers/configure_logging.py +1 -1
  71. sglang/srt/managers/io_struct.py +58 -14
  72. sglang/srt/managers/mm_utils.py +77 -61
  73. sglang/srt/managers/multimodal_processor.py +2 -6
  74. sglang/srt/managers/multimodal_processors/qwen_audio.py +94 -0
  75. sglang/srt/managers/schedule_batch.py +78 -85
  76. sglang/srt/managers/scheduler.py +130 -64
  77. sglang/srt/managers/scheduler_output_processor_mixin.py +8 -2
  78. sglang/srt/managers/session_controller.py +12 -3
  79. sglang/srt/managers/tokenizer_manager.py +314 -103
  80. sglang/srt/managers/tp_worker.py +13 -1
  81. sglang/srt/managers/tp_worker_overlap_thread.py +8 -0
  82. sglang/srt/mem_cache/allocator.py +290 -0
  83. sglang/srt/mem_cache/chunk_cache.py +34 -2
  84. sglang/srt/mem_cache/hiradix_cache.py +2 -0
  85. sglang/srt/mem_cache/memory_pool.py +402 -66
  86. sglang/srt/mem_cache/memory_pool_host.py +6 -109
  87. sglang/srt/mem_cache/multimodal_cache.py +3 -0
  88. sglang/srt/mem_cache/radix_cache.py +8 -4
  89. sglang/srt/model_executor/cuda_graph_runner.py +2 -1
  90. sglang/srt/model_executor/forward_batch_info.py +17 -4
  91. sglang/srt/model_executor/model_runner.py +297 -56
  92. sglang/srt/model_loader/loader.py +41 -0
  93. sglang/srt/model_loader/weight_utils.py +72 -4
  94. sglang/srt/models/deepseek_nextn.py +1 -3
  95. sglang/srt/models/deepseek_v2.py +195 -45
  96. sglang/srt/models/deepseek_vl2.py +3 -5
  97. sglang/srt/models/gemma3_causal.py +1 -2
  98. sglang/srt/models/gemma3n_causal.py +4 -3
  99. sglang/srt/models/gemma3n_mm.py +4 -20
  100. sglang/srt/models/hunyuan.py +1 -1
  101. sglang/srt/models/kimi_vl.py +1 -2
  102. sglang/srt/models/llama.py +10 -4
  103. sglang/srt/models/llama4.py +32 -45
  104. sglang/srt/models/llama_eagle3.py +61 -11
  105. sglang/srt/models/llava.py +5 -5
  106. sglang/srt/models/minicpmo.py +2 -2
  107. sglang/srt/models/mistral.py +1 -1
  108. sglang/srt/models/mllama4.py +402 -89
  109. sglang/srt/models/phi4mm.py +1 -3
  110. sglang/srt/models/pixtral.py +3 -7
  111. sglang/srt/models/qwen2.py +31 -3
  112. sglang/srt/models/qwen2_5_vl.py +1 -3
  113. sglang/srt/models/qwen2_audio.py +200 -0
  114. sglang/srt/models/qwen2_moe.py +32 -6
  115. sglang/srt/models/qwen2_vl.py +1 -4
  116. sglang/srt/models/qwen3.py +94 -25
  117. sglang/srt/models/qwen3_moe.py +68 -21
  118. sglang/srt/models/vila.py +3 -8
  119. sglang/srt/{mm_utils.py → multimodal/mm_utils.py} +2 -2
  120. sglang/srt/{managers/multimodal_processors → multimodal/processors}/base_processor.py +140 -158
  121. sglang/srt/{managers/multimodal_processors → multimodal/processors}/clip.py +2 -13
  122. sglang/srt/{managers/multimodal_processors → multimodal/processors}/deepseek_vl_v2.py +4 -11
  123. sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3.py +3 -10
  124. sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3n.py +5 -20
  125. sglang/srt/{managers/multimodal_processors → multimodal/processors}/internvl.py +3 -10
  126. sglang/srt/{managers/multimodal_processors → multimodal/processors}/janus_pro.py +3 -9
  127. sglang/srt/{managers/multimodal_processors → multimodal/processors}/kimi_vl.py +6 -13
  128. sglang/srt/{managers/multimodal_processors → multimodal/processors}/llava.py +2 -10
  129. sglang/srt/{managers/multimodal_processors → multimodal/processors}/minicpm.py +5 -12
  130. sglang/srt/{managers/multimodal_processors → multimodal/processors}/mlama.py +2 -14
  131. sglang/srt/{managers/multimodal_processors → multimodal/processors}/mllama4.py +65 -66
  132. sglang/srt/{managers/multimodal_processors → multimodal/processors}/phi4mm.py +4 -14
  133. sglang/srt/{managers/multimodal_processors → multimodal/processors}/pixtral.py +3 -9
  134. sglang/srt/{managers/multimodal_processors → multimodal/processors}/qwen_vl.py +8 -14
  135. sglang/srt/{managers/multimodal_processors → multimodal/processors}/vila.py +13 -31
  136. sglang/srt/operations_strategy.py +6 -2
  137. sglang/srt/reasoning_parser.py +26 -0
  138. sglang/srt/sampling/sampling_batch_info.py +39 -1
  139. sglang/srt/server_args.py +84 -22
  140. sglang/srt/speculative/build_eagle_tree.py +57 -18
  141. sglang/srt/speculative/eagle_worker.py +6 -4
  142. sglang/srt/two_batch_overlap.py +203 -27
  143. sglang/srt/utils.py +343 -163
  144. sglang/srt/warmup.py +12 -3
  145. sglang/test/runners.py +10 -1
  146. sglang/test/test_cutlass_w4a8_moe.py +281 -0
  147. sglang/test/test_utils.py +15 -3
  148. sglang/utils.py +5 -5
  149. sglang/version.py +1 -1
  150. {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/METADATA +12 -8
  151. {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/RECORD +157 -146
  152. sglang/math_utils.py +0 -8
  153. /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek.py +0 -0
  154. /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek_vec.py +0 -0
  155. /sglang/srt/{eplb_simulator → eplb/eplb_simulator}/__init__.py +0 -0
  156. {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/WHEEL +0 -0
  157. {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/licenses/LICENSE +0 -0
  158. {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/top_level.txt +0 -0
sglang/srt/{managers/multimodal_processors → multimodal/processors}/minicpm.py

@@ -2,13 +2,13 @@ from typing import List, Union
 
 import torch
 
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    BaseMultimodalProcessor,
-    MultimodalSpecialTokens,
-)
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.minicpmo import MiniCPMO
 from sglang.srt.models.minicpmv import MiniCPMV
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
 
 
 # Compatible with both 'O' and 'V'
@@ -23,19 +23,12 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
+        audio_data: List[Union[str, bytes]],
         input_text,
         request_obj,
         max_req_input_len,
         **kwargs,
     ):
-        audio_data = request_obj.audio_data
-        if not image_data and not audio_data:
-            return None
-        if not isinstance(image_data, list):
-            image_data = [image_data]
-        if not isinstance(audio_data, list):
-            audio_data = [audio_data]
-
         base_output = self.load_mm_data(
             prompt=input_text,
             max_req_input_len=max_req_input_len,
sglang/srt/{managers/multimodal_processors → multimodal/processors}/mlama.py

@@ -1,10 +1,8 @@
 from typing import List, Union
 
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    BaseMultimodalProcessor,
-)
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.mllama import MllamaForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import BaseMultimodalProcessor
 from sglang.srt.utils import load_image
 
 
@@ -17,21 +15,11 @@ class MllamaImageProcessor(BaseMultimodalProcessor):
     async def process_mm_data_async(
         self, image_data: List[Union[str, bytes]], input_text, *args, **kwargs
     ):
-        if not image_data:
-            return None
-
         if isinstance(input_text, list):
             assert len(input_text) and isinstance(input_text[0], int)
             input_text = self._processor.tokenizer.decode(input_text)
 
-        if not isinstance(image_data, list):
-            image_data = [image_data]
-
-        if len(image_data) > 0:
-            images = [load_image(image)[0] for image in image_data]
-        else:
-            images = load_image(image_data[0])[0]
-
+        images = [load_image(image)[0] for image in image_data]
         image_inputs = self.process_mm_data(input_text=input_text, images=images)
         image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]
         image_inputs["mm_items"] = [
sglang/srt/{managers/multimodal_processors → multimodal/processors}/mllama4.py

@@ -7,12 +7,12 @@ from transformers.models.llama4.image_processing_llama4_fast import (
     get_best_fit,
 )
 
-from sglang.srt.managers.multimodal_processors.base_processor import (
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.mllama4 import Llama4ForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.models.mllama4 import Llama4ForConditionalGeneration
 
 
 class Mllama4ImageProcessor(BaseMultimodalProcessor):
@@ -37,9 +37,6 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
         *args,
         **kwargs,
     ):
-        if not image_data:
-            return None
-
         if isinstance(input_text, list):
             assert len(input_text) and isinstance(input_text[0], int)
             input_text = self._processor.tokenizer.decode(input_text)
@@ -63,70 +60,72 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
         )
 
         # Handle image resolutions and aspect ratios
-        if "pixel_values" in processor_output:
-            image_processor = processor.image_processor
-            tokenizer = self._processor.tokenizer
+        if "pixel_values" not in processor_output:  # no image processed
+            return None
+
+        image_processor = processor.image_processor
+        tokenizer = self._processor.tokenizer
 
-            # Calculate tile size and find supported resolutions
-            tile_size = self.vision_config.image_size
-            max_num_tiles = getattr(self.vision_config, "max_patches", 1)
+        # Calculate tile size and find supported resolutions
+        tile_size = self.vision_config.image_size
+        max_num_tiles = getattr(self.vision_config, "max_patches", 1)
 
-            possible_resolutions = find_supported_resolutions(
-                max_num_chunks=max_num_tiles,
-                patch_size=SizeDict(height=tile_size, width=tile_size),
+        possible_resolutions = find_supported_resolutions(
+            max_num_chunks=max_num_tiles,
+            patch_size=SizeDict(height=tile_size, width=tile_size),
+        )
+
+        # Find best fit for each image
+        best_fit_sizes = [
+            get_best_fit(
+                (image.size[1], image.size[0]),  # (height, width)
+                torch.tensor(possible_resolutions),
+                resize_to_max_canvas=image_processor.resize_to_max_canvas,
             )
+            for image in processed_data.images
+        ]
+
+        # Calculate aspect ratios and patches per image
+        aspect_ratios = [
+            (image_size[0] // tile_size, image_size[1] // tile_size)
+            for image_size in best_fit_sizes
+        ]
+
+        patches_per_image = [
+            1 if r_h * r_w == 1 else 1 + r_h * r_w for (r_h, r_w) in aspect_ratios
+        ]
+
+        # Add to image_inputs
+        processor_output["aspect_ratios"] = aspect_ratios
+        processor_output["patches_per_image"] = torch.tensor(patches_per_image)
+
+        # Process embed_is_patch
+        vocab = tokenizer.get_vocab()
+        patch_id = vocab.get(processor.img_patch_token, -1)
+        image_end_id = vocab.get(processor.end_of_img_token, -1)
+
+        if patch_id != -1 and image_end_id != -1:
+            input_ids = processor_output["input_ids"].view(-1)
+
+            # Remove BOS token if present
+            if input_ids.size(0) > 0 and input_ids[0] == tokenizer.bos_token_id:
+                input_ids = input_ids[1:]
+
+            # Find image end indices and split input_ids
+            image_end_indices = (input_ids == image_end_id).nonzero().view(-1)
+
+            if image_end_indices.size(0) > 0:
+                # Split at image boundaries
+                split_indices = (image_end_indices + 1)[:-1]
+                split_input_ids = torch.tensor_split(input_ids, split_indices)
+                split_input_ids = [x for x in split_input_ids if x.numel() > 0]
+
+                # Create embed_is_patch for each image
+                embed_is_patch = []
+                for per_image_input_ids in split_input_ids:
+                    embed_is_patch.append(per_image_input_ids == patch_id)
 
-            # Find best fit for each image
-            best_fit_sizes = [
-                get_best_fit(
-                    (image.size[1], image.size[0]),  # (height, width)
-                    torch.tensor(possible_resolutions),
-                    resize_to_max_canvas=image_processor.resize_to_max_canvas,
-                )
-                for image in processed_data.images
-            ]
-
-            # Calculate aspect ratios and patches per image
-            aspect_ratios = [
-                (image_size[0] // tile_size, image_size[1] // tile_size)
-                for image_size in best_fit_sizes
-            ]
-
-            patches_per_image = [
-                1 if r_h * r_w == 1 else 1 + r_h * r_w for (r_h, r_w) in aspect_ratios
-            ]
-
-            # Add to image_inputs
-            processor_output["aspect_ratios"] = aspect_ratios
-            processor_output["patches_per_image"] = torch.tensor(patches_per_image)
-
-            # Process embed_is_patch
-            vocab = tokenizer.get_vocab()
-            patch_id = vocab.get(processor.img_patch_token, -1)
-            image_end_id = vocab.get(processor.end_of_img_token, -1)
-
-            if patch_id != -1 and image_end_id != -1:
-                input_ids = processor_output["input_ids"].view(-1)
-
-                # Remove BOS token if present
-                if input_ids.size(0) > 0 and input_ids[0] == tokenizer.bos_token_id:
-                    input_ids = input_ids[1:]
-
-                # Find image end indices and split input_ids
-                image_end_indices = (input_ids == image_end_id).nonzero().view(-1)
-
-                if image_end_indices.size(0) > 0:
-                    # Split at image boundaries
-                    split_indices = (image_end_indices + 1)[:-1]
-                    split_input_ids = torch.tensor_split(input_ids, split_indices)
-                    split_input_ids = [x for x in split_input_ids if x.numel() > 0]
-
-                    # Create embed_is_patch for each image
-                    embed_is_patch = []
-                    for per_image_input_ids in split_input_ids:
-                        embed_is_patch.append(per_image_input_ids == patch_id)
-
-            processor_output["embed_is_patch"] = embed_is_patch
+                processor_output["embed_is_patch"] = embed_is_patch
 
         # Convert to the format expected by SGLang
         processor_output["input_ids"] = processor_output["input_ids"].tolist()[0]
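The dedented block above builds one boolean patch mask per image by splitting the flat input_ids at each end-of-image token and comparing against the patch token id. A minimal standalone sketch of that splitting step, using made-up token ids (7 for an image patch, 9 for end-of-image) rather than the real Llama 4 vocabulary:

import torch

PATCH_ID, IMAGE_END_ID = 7, 9
input_ids = torch.tensor([1, 7, 7, 9, 2, 7, 9, 3])  # two images with text around them

image_end_indices = (input_ids == IMAGE_END_ID).nonzero().view(-1)
split_indices = (image_end_indices + 1)[:-1]  # cut right after each image except the last
split_input_ids = torch.tensor_split(input_ids, split_indices)
split_input_ids = [x for x in split_input_ids if x.numel() > 0]

# One mask per image: True where the embedding slot holds an image patch token.
embed_is_patch = [chunk == PATCH_ID for chunk in split_input_ids]
print(embed_is_patch)
# [tensor([False,  True,  True, False]), tensor([False,  True, False, False])]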
sglang/srt/{managers/multimodal_processors → multimodal/processors}/phi4mm.py

@@ -1,12 +1,12 @@
 import logging
 from typing import List, Union
 
-from sglang.srt.managers.multimodal_processors.base_processor import (
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.phi4mm import Phi4MMForCausalLM
+from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.models.phi4mm import Phi4MMForCausalLM
 
 logger = logging.getLogger(__name__)
 
@@ -26,22 +26,12 @@ class Phi4MMImageProcessor(BaseMultimodalProcessor):
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
+        audio_data,
         input_text,
         request_obj,
         max_req_input_len,
         **kwargs,
     ):
-        audio_data = request_obj.audio_data
-
-        if not image_data and not audio_data:
-            return None
-
-        if not isinstance(image_data, list):
-            image_data = [image_data]
-
-        if not isinstance(audio_data, list):
-            audio_data = [audio_data]
-
         if audio_data:
             logger.warning(
                 "Currently SGLang does not support audio data for Phi4MM. We are working on it. You can file an issue to help us prioritize."
sglang/srt/{managers/multimodal_processors → multimodal/processors}/pixtral.py

@@ -6,12 +6,12 @@ from transformers.models.pixtral.image_processing_pixtral import (
     _num_image_tokens as _get_pixtral_hf_num_image_tokens,
 )
 
-from sglang.srt.managers.multimodal_processors.base_processor import (
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.pixtral import PixtralVisionModel
+from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.models.pixtral import PixtralVisionModel
 
 
 class PixtralProcessor(BaseMultimodalProcessor):
@@ -78,12 +78,6 @@ class PixtralProcessor(BaseMultimodalProcessor):
         *args,
         **kwargs,
     ):
-        if not image_data:
-            return None
-
-        if isinstance(image_data, str):
-            image_data = [image_data]
-
         mm_data = self.load_mm_data(
             prompt=input_text,
             multimodal_tokens=self.multimodal_tokens,
sglang/srt/{managers/multimodal_processors → multimodal/processors}/qwen_vl.py

@@ -3,19 +3,15 @@ import math
 import re
 from typing import Dict, List, Union
 
-import torch
 from PIL import Image
 
 from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    BaseMultimodalProcessor as SGLangBaseProcessor,
-)
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    MultimodalSpecialTokens,
-)
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
 from sglang.srt.models.qwen2_vl import Qwen2VLForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor as SGLangBaseProcessor,
+)
+from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
 
 
 # Compatible with Qwen2VL and Qwen2_5VL
@@ -51,9 +47,6 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         *args,
         **kwargs,
     ):
-        if isinstance(image_data, str):
-            image_data = [image_data]
-
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
@@ -132,12 +125,13 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
 
         video_grid_thw = None  # TODO
 
-        combined_mm_item, input_ids = self.process_and_combine_mm_data(base_output)
+        mm_items, input_ids = self.process_and_combine_mm_data(base_output)
 
-        if combined_mm_item is None:
+        if not mm_items:
             # Note(Xinyuan): This is the case where image loading fails.
             return None
 
+        combined_mm_item = mm_items[0]  # only image is supported for now
         video_grid_thw = None  # TODO
         second_per_grid_ts = getattr(combined_mm_item, "second_per_grid_ts", None)
 
@@ -159,7 +153,7 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
 
         return {
             "input_ids": input_ids.tolist(),
-            "mm_items": [combined_mm_item],
+            "mm_items": mm_items,
             "im_start_id": self.IM_START_TOKEN_ID,
             "im_end_id": self.IM_END_TOKEN_ID,
             "im_token_id": self.IM_TOKEN_ID,
sglang/srt/{managers/multimodal_processors → multimodal/processors}/vila.py

@@ -10,12 +10,12 @@ from sglang.srt.managers.io_struct import (
     GenerateReqInput,
     ImageDataItem,
 )
-from sglang.srt.managers.multimodal_processors.base_processor import (
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.vila import VILAForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.models.vila import VILAForConditionalGeneration
 from sglang.srt.server_args import ServerArgs
 
 
@@ -37,6 +37,8 @@ class VILAMultimodalProcessor(BaseMultimodalProcessor):
         _processor: VILAProcessor,
     ) -> None:
        super().__init__(hf_config, server_args, _processor)
+        self.IM_TOKEN_ID = hf_config.image_token_id
+        self.VIDEO_TOKEN_ID = hf_config.video_token_id
 
     async def process_mm_data_async(
         self,
@@ -46,13 +48,7 @@ class VILAMultimodalProcessor(BaseMultimodalProcessor):
         max_req_input_len: int,
         **kwargs,
     ) -> Optional[Dict[str, Any]]:
-        if not image_data:
-            return None
-
-        if not isinstance(image_data, list):
-            image_data = [image_data]
-
-        mm_data = self.load_mm_data(
+        base_output = self.load_mm_data(
             prompt=input_text,
             multimodal_tokens=MultimodalSpecialTokens(
                 image_token=self._processor.tokenizer.image_token
@@ -61,25 +57,11 @@ class VILAMultimodalProcessor(BaseMultimodalProcessor):
             image_data=image_data,
         )
 
-        inputs = self.process_mm_data(
-            input_text=mm_data.input_text,
-            images=mm_data.images,
-        )
-
-        image_offsets = self.get_mm_items_offset(
-            input_ids=inputs.input_ids[0],
-            mm_token_id=cast(int, self._processor.tokenizer.image_token_id),
-        )
+        mm_items, input_ids = self.process_and_combine_mm_data(base_output)
 
-        mm_items: List[MultimodalDataItem] = [
-            MultimodalDataItem(
-                modality=Modality.IMAGE,
-                image_offsets=image_offsets,
-                pixel_values=inputs.pixel_values,
-            )
-        ]
-
-        return dict(
-            input_ids=inputs.input_ids[0].tolist(),
-            mm_items=mm_items,
-        )
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+            "im_token_id": self.IM_TOKEN_ID,
+            "video_token_id": self.VIDEO_TOKEN_ID,
+        }
sglang/srt/operations_strategy.py

@@ -71,7 +71,9 @@ def _compute_moe_deepseek_layer_operations_strategy_tbo(
     assert layer.is_layer_sparse, "dense layer TBO not yet implemented"
     if forward_mode == ForwardMode.EXTEND:
         return _compute_moe_deepseek_blog_prefill(layer)
-    elif forward_mode == ForwardMode.DECODE:
+    elif (
+        forward_mode == ForwardMode.DECODE or forward_mode == ForwardMode.TARGET_VERIFY
+    ):
         return _compute_moe_deepseek_blog_decode(layer)
     else:
         raise NotImplementedError(f"Unsupported {forward_mode=}")
@@ -146,7 +148,9 @@ def _compute_moe_qwen3_layer_operations_strategy_tbo(
     assert layer.is_layer_sparse, "qwen3 moe only support sparse layers"
     if forward_mode == ForwardMode.EXTEND:
         return _compute_moe_qwen3_prefill(layer)
-    elif forward_mode == ForwardMode.DECODE:
+    elif (
+        forward_mode == ForwardMode.DECODE or forward_mode == ForwardMode.TARGET_VERIFY
+    ):
         return _compute_moe_qwen3_decode(layer)
     else:
         raise NotImplementedError(f"Unsupported {forward_mode=}")
sglang/srt/reasoning_parser.py

@@ -66,6 +66,13 @@ class BaseReasoningFormatDetector:
         self._buffer += new_text
         current_text = self._buffer
 
+        # If the current text is a prefix of the think token, keep buffering
+        if any(
+            token.startswith(current_text) and token != current_text
+            for token in [self.think_start_token, self.think_end_token]
+        ):
+            return StreamingParseResult()
+
         # Strip `<think>` token if present
         if not self.stripped_think_start and self.think_start_token in current_text:
             current_text = current_text.replace(self.think_start_token, "")
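The guard added above keeps a partially streamed tag from leaking out as normal text: as long as the buffered text could still grow into the start or end think token, the detector returns an empty StreamingParseResult and waits for more input. A small standalone illustration of the same prefix test, using the DeepSeek-R1 style tags purely as example values:

think_start_token, think_end_token = "<think>", "</think>"

for current_text in ["<thi", "</thi", "<think>", "hello world"]:
    keep_buffering = any(
        token.startswith(current_text) and token != current_text
        for token in [think_start_token, think_end_token]
    )
    print(repr(current_text), "->", "keep buffering" if keep_buffering else "parse now")
# '<thi' and '</thi' are held back; a complete tag or ordinary text is parsed immediately.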
@@ -150,6 +157,24 @@ class Qwen3Detector(BaseReasoningFormatDetector):
         )
 
 
+class KimiDetector(BaseReasoningFormatDetector):
+    """
+    Detector for Kimi Thinking model.
+    Assumes reasoning format:
+      ◁think▷*(.*)◁/think▷
+    Returns all the text before the ◁/think▷ tag as `reasoning_text`
+    and the rest of the text as `normal_text`.
+    """
+
+    def __init__(self, stream_reasoning: bool = True):
+        super().__init__(
+            "◁think▷",
+            "◁/think▷",
+            force_reasoning=False,
+            stream_reasoning=stream_reasoning,
+        )
+
+
 class ReasoningParser:
     """
     Parser that handles both streaming and non-streaming scenarios for extracting
@@ -164,6 +189,7 @@ class ReasoningParser:
     DetectorMap: Dict[str, Type[BaseReasoningFormatDetector]] = {
         "deepseek-r1": DeepSeekR1Detector,
         "qwen3": Qwen3Detector,
+        "kimi": KimiDetector,
     }
 
     def __init__(self, model_type: Optional[str] = None, stream_reasoning: bool = True):
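With KimiDetector registered in DetectorMap, the "kimi" model type becomes selectable alongside "deepseek-r1" and "qwen3". A hedged usage sketch (assuming ReasoningParser's non-streaming helper keeps its existing behavior of returning the reasoning text and the remaining answer text):

from sglang.srt.reasoning_parser import ReasoningParser

parser = ReasoningParser(model_type="kimi")
reasoning_text, normal_text = parser.parse_non_stream(
    "◁think▷Check the units before answering.◁/think▷The answer is 42 kg."
)
# reasoning_text is expected to hold the text inside the ◁think▷ ... ◁/think▷ tags,
# and normal_text the remainder ("The answer is 42 kg.").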
sglang/srt/sampling/sampling_batch_info.py

@@ -10,7 +10,6 @@ import torch
 import sglang.srt.sampling.penaltylib as penaltylib
 from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor
 from sglang.srt.sampling.sampling_params import TOP_K_ALL
-from sglang.srt.utils import merge_bias_tensor
 
 if TYPE_CHECKING:
     from sglang.srt.managers.schedule_batch import ScheduleBatch
@@ -345,3 +344,42 @@ class SamplingBatchInfo:
         self.logit_bias = merge_bias_tensor(
             self.logit_bias, other.logit_bias, len(self), len(other), self.device, 0.0
         )
+
+
+def merge_bias_tensor(
+    lhs: Optional[torch.Tensor],
+    rhs: Optional[torch.Tensor],
+    bs1: int,
+    bs2: int,
+    device: str,
+    default: float,
+):
+    """Merge two bias tensors for batch merging.
+
+    Args:
+        lhs: Left-hand side tensor
+        rhs: Right-hand side tensor
+        bs1: Batch size of left-hand side tensor
+        bs2: Batch size of right-hand side tensor
+        device: Device to place the merged tensor on
+        default: Default value for missing tensor elements
+
+    Returns:
+        Merged tensor or None if both inputs are None
+    """
+    if lhs is None and rhs is None:
+        return None
+
+    if lhs is not None and rhs is not None:
+        return torch.cat([lhs, rhs])
+    else:
+        if lhs is not None:
+            shape, dtype = lhs.shape[1:], lhs.dtype
+        else:
+            shape, dtype = rhs.shape[1:], rhs.dtype
+
+        if lhs is None:
+            lhs = torch.empty((bs1, *shape), device=device, dtype=dtype).fill_(default)
+        if rhs is None:
+            rhs = torch.empty((bs2, *shape), device=device, dtype=dtype).fill_(default)
+        return torch.cat([lhs, rhs])
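merge_bias_tensor, moved here from sglang.srt.utils, pads whichever side has no bias with the default value so the two batches can be concatenated along the batch dimension. A small illustrative call with made-up shapes and values:

import torch

from sglang.srt.sampling.sampling_batch_info import merge_bias_tensor

# Batch 1 (2 requests) carries a logit bias over a 4-token vocab; batch 2 (3 requests) has none.
lhs = torch.tensor([[0.0, -1.0, 0.0, 2.0],
                    [0.5, 0.0, 0.0, 0.0]])
merged = merge_bias_tensor(lhs, None, bs1=2, bs2=3, device="cpu", default=0.0)
print(merged.shape)  # torch.Size([5, 4]); the three missing rows are filled with 0.0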