sglang 0.4.4.post1__py3-none-any.whl → 0.4.4.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +6 -0
  3. sglang/bench_one_batch.py +1 -1
  4. sglang/bench_one_batch_server.py +1 -1
  5. sglang/bench_serving.py +3 -1
  6. sglang/check_env.py +3 -4
  7. sglang/lang/backend/openai.py +18 -5
  8. sglang/lang/chat_template.py +28 -7
  9. sglang/lang/interpreter.py +7 -3
  10. sglang/lang/ir.py +10 -0
  11. sglang/srt/_custom_ops.py +1 -1
  12. sglang/srt/code_completion_parser.py +174 -0
  13. sglang/srt/configs/__init__.py +2 -6
  14. sglang/srt/configs/deepseekvl2.py +667 -0
  15. sglang/srt/configs/janus_pro.py +3 -4
  16. sglang/srt/configs/load_config.py +1 -0
  17. sglang/srt/configs/model_config.py +63 -11
  18. sglang/srt/configs/utils.py +25 -0
  19. sglang/srt/connector/__init__.py +51 -0
  20. sglang/srt/connector/base_connector.py +112 -0
  21. sglang/srt/connector/redis.py +85 -0
  22. sglang/srt/connector/s3.py +122 -0
  23. sglang/srt/connector/serde/__init__.py +31 -0
  24. sglang/srt/connector/serde/safe_serde.py +29 -0
  25. sglang/srt/connector/serde/serde.py +43 -0
  26. sglang/srt/connector/utils.py +35 -0
  27. sglang/srt/conversation.py +88 -0
  28. sglang/srt/disaggregation/conn.py +81 -0
  29. sglang/srt/disaggregation/decode.py +495 -0
  30. sglang/srt/disaggregation/mini_lb.py +285 -0
  31. sglang/srt/disaggregation/prefill.py +249 -0
  32. sglang/srt/disaggregation/utils.py +44 -0
  33. sglang/srt/distributed/parallel_state.py +10 -3
  34. sglang/srt/entrypoints/engine.py +55 -5
  35. sglang/srt/entrypoints/http_server.py +71 -12
  36. sglang/srt/function_call_parser.py +133 -54
  37. sglang/srt/hf_transformers_utils.py +28 -3
  38. sglang/srt/layers/activation.py +4 -2
  39. sglang/srt/layers/attention/base_attn_backend.py +1 -1
  40. sglang/srt/layers/attention/flashattention_backend.py +295 -0
  41. sglang/srt/layers/attention/flashinfer_backend.py +1 -1
  42. sglang/srt/layers/attention/flashmla_backend.py +284 -0
  43. sglang/srt/layers/attention/triton_backend.py +171 -38
  44. sglang/srt/layers/attention/triton_ops/decode_attention.py +94 -31
  45. sglang/srt/layers/attention/triton_ops/extend_attention.py +14 -5
  46. sglang/srt/layers/attention/utils.py +53 -0
  47. sglang/srt/layers/attention/vision.py +9 -28
  48. sglang/srt/layers/dp_attention.py +32 -21
  49. sglang/srt/layers/layernorm.py +24 -2
  50. sglang/srt/layers/linear.py +17 -5
  51. sglang/srt/layers/logits_processor.py +25 -7
  52. sglang/srt/layers/moe/ep_moe/kernels.py +110 -11
  53. sglang/srt/layers/moe/ep_moe/layer.py +273 -1
  54. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +416 -0
  55. sglang/srt/layers/moe/fused_moe_native.py +2 -1
  56. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +23 -32
  61. sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -2
  62. sglang/srt/layers/moe/topk.py +31 -18
  63. sglang/srt/layers/parameter.py +1 -1
  64. sglang/srt/layers/quantization/__init__.py +184 -126
  65. sglang/srt/layers/quantization/base_config.py +5 -0
  66. sglang/srt/layers/quantization/blockwise_int8.py +1 -1
  67. sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
  68. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +652 -0
  69. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +658 -0
  70. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +9 -0
  71. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +56 -0
  72. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +162 -0
  73. sglang/srt/layers/quantization/compressed_tensors/utils.py +218 -0
  74. sglang/srt/layers/quantization/fp8.py +76 -34
  75. sglang/srt/layers/quantization/fp8_kernel.py +24 -8
  76. sglang/srt/layers/quantization/fp8_utils.py +284 -28
  77. sglang/srt/layers/quantization/gptq.py +36 -9
  78. sglang/srt/layers/quantization/kv_cache.py +98 -0
  79. sglang/srt/layers/quantization/modelopt_quant.py +9 -7
  80. sglang/srt/layers/quantization/utils.py +153 -0
  81. sglang/srt/layers/quantization/w8a8_fp8.py +70 -19
  82. sglang/srt/layers/rotary_embedding.py +66 -87
  83. sglang/srt/layers/sampler.py +1 -1
  84. sglang/srt/lora/layers.py +68 -0
  85. sglang/srt/lora/lora.py +2 -22
  86. sglang/srt/lora/lora_manager.py +47 -23
  87. sglang/srt/lora/mem_pool.py +110 -51
  88. sglang/srt/lora/utils.py +12 -1
  89. sglang/srt/managers/cache_controller.py +2 -5
  90. sglang/srt/managers/data_parallel_controller.py +30 -8
  91. sglang/srt/managers/expert_distribution.py +81 -0
  92. sglang/srt/managers/io_struct.py +39 -3
  93. sglang/srt/managers/mm_utils.py +373 -0
  94. sglang/srt/managers/multimodal_processor.py +68 -0
  95. sglang/srt/managers/multimodal_processors/base_processor.py +275 -0
  96. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +119 -0
  97. sglang/srt/managers/multimodal_processors/gemma3.py +83 -0
  98. sglang/srt/managers/{image_processors → multimodal_processors}/janus_pro.py +20 -15
  99. sglang/srt/managers/{image_processors → multimodal_processors}/llava.py +10 -15
  100. sglang/srt/managers/multimodal_processors/minicpm.py +167 -0
  101. sglang/srt/managers/{image_processors → multimodal_processors}/mlama.py +7 -8
  102. sglang/srt/managers/{image_processors → multimodal_processors}/qwen_vl.py +28 -22
  103. sglang/srt/managers/schedule_batch.py +133 -30
  104. sglang/srt/managers/scheduler.py +273 -20
  105. sglang/srt/managers/session_controller.py +1 -1
  106. sglang/srt/managers/tokenizer_manager.py +59 -23
  107. sglang/srt/managers/tp_worker.py +1 -1
  108. sglang/srt/managers/tp_worker_overlap_thread.py +3 -3
  109. sglang/srt/managers/utils.py +6 -1
  110. sglang/srt/mem_cache/hiradix_cache.py +18 -7
  111. sglang/srt/mem_cache/memory_pool.py +255 -98
  112. sglang/srt/mem_cache/paged_allocator.py +2 -2
  113. sglang/srt/mem_cache/radix_cache.py +4 -4
  114. sglang/srt/model_executor/cuda_graph_runner.py +27 -13
  115. sglang/srt/model_executor/forward_batch_info.py +68 -11
  116. sglang/srt/model_executor/model_runner.py +70 -6
  117. sglang/srt/model_loader/loader.py +160 -2
  118. sglang/srt/model_loader/weight_utils.py +45 -0
  119. sglang/srt/models/deepseek_janus_pro.py +29 -86
  120. sglang/srt/models/deepseek_nextn.py +22 -10
  121. sglang/srt/models/deepseek_v2.py +208 -77
  122. sglang/srt/models/deepseek_vl2.py +358 -0
  123. sglang/srt/models/gemma3_causal.py +684 -0
  124. sglang/srt/models/gemma3_mm.py +462 -0
  125. sglang/srt/models/llama.py +47 -7
  126. sglang/srt/models/llama_eagle.py +1 -0
  127. sglang/srt/models/llama_eagle3.py +196 -0
  128. sglang/srt/models/llava.py +3 -3
  129. sglang/srt/models/llavavid.py +3 -3
  130. sglang/srt/models/minicpmo.py +1995 -0
  131. sglang/srt/models/minicpmv.py +62 -137
  132. sglang/srt/models/mllama.py +4 -4
  133. sglang/srt/models/phi3_small.py +1 -1
  134. sglang/srt/models/qwen2.py +3 -0
  135. sglang/srt/models/qwen2_5_vl.py +68 -146
  136. sglang/srt/models/qwen2_classification.py +75 -0
  137. sglang/srt/models/qwen2_moe.py +9 -1
  138. sglang/srt/models/qwen2_vl.py +25 -63
  139. sglang/srt/openai_api/adapter.py +124 -28
  140. sglang/srt/openai_api/protocol.py +23 -2
  141. sglang/srt/sampling/sampling_batch_info.py +1 -1
  142. sglang/srt/sampling/sampling_params.py +6 -6
  143. sglang/srt/server_args.py +99 -9
  144. sglang/srt/speculative/build_eagle_tree.py +7 -347
  145. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +41 -5
  146. sglang/srt/speculative/eagle_utils.py +208 -252
  147. sglang/srt/speculative/eagle_worker.py +139 -53
  148. sglang/srt/speculative/spec_info.py +6 -1
  149. sglang/srt/torch_memory_saver_adapter.py +22 -0
  150. sglang/srt/utils.py +182 -21
  151. sglang/test/__init__.py +0 -0
  152. sglang/test/attention/__init__.py +0 -0
  153. sglang/test/attention/test_flashattn_backend.py +312 -0
  154. sglang/test/runners.py +2 -0
  155. sglang/test/test_activation.py +2 -1
  156. sglang/test/test_block_fp8.py +5 -4
  157. sglang/test/test_block_fp8_ep.py +2 -1
  158. sglang/test/test_dynamic_grad_mode.py +58 -0
  159. sglang/test/test_layernorm.py +3 -2
  160. sglang/test/test_utils.py +55 -4
  161. sglang/utils.py +31 -0
  162. sglang/version.py +1 -1
  163. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post2.dist-info}/METADATA +12 -8
  164. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post2.dist-info}/RECORD +167 -123
  165. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post2.dist-info}/WHEEL +1 -1
  166. sglang/srt/configs/qwen2_5_vl_config.py +0 -1006
  167. sglang/srt/managers/image_processor.py +0 -55
  168. sglang/srt/managers/image_processors/base_image_processor.py +0 -219
  169. sglang/srt/managers/image_processors/minicpmv.py +0 -86
  170. sglang/srt/managers/multi_modality_padding.py +0 -134
  171. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post2.dist-info/licenses}/LICENSE +0 -0
  172. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post2.dist-info}/top_level.txt +0 -0
sglang/srt/managers/multimodal_processors/minicpm.py (new file)
@@ -0,0 +1,167 @@
+ import asyncio
+ from typing import List, Union
+
+ import torch
+
+ from sglang.srt.managers.multimodal_processors.base_processor import (
+     BaseMultimodalProcessor,
+     MultimodalSpecialTokens,
+     get_global_processor,
+ )
+ from sglang.srt.models.minicpmo import MiniCPMO
+ from sglang.srt.models.minicpmv import MiniCPMV
+
+
+ # Compatible with both 'O' and 'V'
+ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
+     models = [MiniCPMV, MiniCPMO]
+
+     def __init__(self, hf_config, server_args, _processor):
+         super().__init__(hf_config, server_args, _processor)
+         self.image_token = "(<image>./</image>)"
+         self.audio_token = "(<audio>./</audio>)"
+
+     @staticmethod
+     def _process_data_task(input_text, images=None, audios=None):
+
+         if isinstance(images, list) and len(images) == 0:
+             images = None
+         if isinstance(audios, list) and len(audios) == 0:
+             audios = None
+         result = get_global_processor().__call__(
+             text=input_text,
+             images=images,
+             audios=audios,
+             return_tensors="pt",
+             chunk_input=True,
+         )
+         return {
+             "input_ids": result.input_ids,
+             "pixel_values": getattr(result, "pixel_values", None),
+             "tgt_sizes": getattr(result, "tgt_sizes", None),
+             "audio_features": getattr(result, "audio_features", None),
+             "audio_feature_lens": getattr(result, "audio_feature_lens", None),
+             "audio_bounds": getattr(result, "audio_bounds", None),
+         }
+
+     async def _process_data(self, images, input_text, audios=None):
+         if self.executor is not None:
+             loop = asyncio.get_event_loop()
+             multimodal_data_inputs = await loop.run_in_executor(
+                 self.executor,
+                 MiniCPMMultimodalProcessor._process_data_task,
+                 input_text,
+                 images,
+                 audios,
+             )
+         else:
+             multimodal_data_inputs = self._processor(
+                 images=images, text=input_text, audios=audios, return_tensors="pt"
+             )
+
+         return multimodal_data_inputs
+
+     async def process_mm_data_async(
+         self,
+         image_data: List[Union[str, bytes]],
+         input_ids,
+         request_obj,
+         max_req_input_len,
+     ):
+         audio_data = request_obj.audio_data
+         if not image_data and not audio_data:
+             return None
+         if not isinstance(image_data, list):
+             image_data = [image_data]
+         if not isinstance(audio_data, list):
+             audio_data = [audio_data]
+
+         base_output = self.load_mm_data(
+             input_ids=input_ids,
+             max_req_input_len=max_req_input_len,
+             audio_data=audio_data,
+             image_data=image_data,
+             multimodal_tokens=MultimodalSpecialTokens(
+                 image_token=self.image_token, audio_token=self.audio_token
+             ),
+         )
+         if base_output is None:
+             return None
+
+         res = await self._process_data(
+             images=base_output.images,
+             input_text=base_output.input_text,
+             audios=base_output.audios,
+         )
+
+         # Collect special token ids
+         tokenizer = self._processor.tokenizer
+         slice_start_id, slice_end_id, audio_start_id, audio_end_id = (
+             None,
+             None,
+             None,
+             None,
+         )
+         if tokenizer.slice_start_id:
+             slice_start_id = tokenizer.slice_start_id
+             slice_end_id = tokenizer.slice_end_id
+         if hasattr(tokenizer, "audio_start_id"):
+             audio_start_id = tokenizer.audio_start_id
+             audio_end_id = tokenizer.audio_end_id
+
+         im_token_id = tokenizer.unk_token_id
+         pixel_values = res["pixel_values"]
+         tgt_sizes = res["tgt_sizes"]
+
+         if not isinstance(pixel_values, (torch.Tensor, list)):
+             raise ValueError(
+                 "Incorrect type of pixel values. " f"Got type: {type(pixel_values)}"
+             )
+
+         if not isinstance(tgt_sizes, (torch.Tensor, list)):
+             raise ValueError(
+                 "Incorrect type of target sizes. " f"Got type: {type(tgt_sizes)}"
+             )
+
+         if len(pixel_values) != len(tgt_sizes):
+             raise ValueError(
+                 "Inconsistent batch lengths, found: "
+                 f"{len(pixel_values)} vs. {len(tgt_sizes)}"
+             )
+
+         pixel_values_flat: List[torch.Tensor] = []
+         tgt_sizes_flat: List[torch.Tensor] = []
+         for pixel_b, tgt_b in zip(pixel_values, tgt_sizes):
+             # per image
+             if len(pixel_b) != len(tgt_b):
+                 raise ValueError(
+                     "Inconsistent N lengths, found: " f"{len(pixel_b)} vs {len(tgt_b)}"
+                 )
+             for pixel_n, tgt_n in zip(pixel_b, tgt_b):
+                 pixel_values_flat += [pixel_n]
+                 tgt_sizes_flat += [tgt_n]
+
+         pixel_values = pixel_values_flat
+         if len(tgt_sizes_flat) == 0:
+             tgt_sizes = None
+         else:
+             tgt_sizes = torch.stack(tgt_sizes_flat)
+         if not isinstance(res["audio_features"], list):
+             res["audio_features"] = [res["audio_features"]]
+         return {
+             "input_ids": res["input_ids"].flatten().tolist(),
+             "pixel_values": pixel_values,
+             "tgt_sizes": tgt_sizes,
+             "data_hashes": base_output.mm_data_hashes,
+             "modalities": request_obj.modalities or ["image"],
+             "audio_start_id": audio_start_id,
+             "audio_end_id": audio_end_id,
+             "audio_features": res["audio_features"],
+             "audio_bounds": res["audio_bounds"],
+             "audio_feature_lens": res["audio_feature_lens"],
+             "im_token_id": im_token_id,
+             "im_start_id": tokenizer.im_start_id,
+             "im_end_id": tokenizer.im_end_id,
+             "slice_start_id": slice_start_id,
+             "slice_end_id": slice_end_id,
+         }
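Across this release, each processor declares the model classes it serves through a `models` class attribute (as in `MiniCPMMultimodalProcessor` above), replacing the per-module `ImageProcessorMapping` dicts deleted in the hunks below. A minimal sketch of the dispatch this attribute implies; `PROCESSOR_REGISTRY`, `register_processor`, and `get_processor_cls` are hypothetical names, since the real lookup lives in the new `sglang/srt/managers/multimodal_processor.py`, which this view does not expand:

```python
# Hypothetical sketch of registration keyed on the `models` class attribute.
PROCESSOR_REGISTRY: dict = {}

def register_processor(processor_cls):
    # Map every model class the processor declares to the processor itself.
    for model_cls in processor_cls.models:
        PROCESSOR_REGISTRY[model_cls] = processor_cls
    return processor_cls

def get_processor_cls(model_cls):
    # One central lookup replaces the old per-module ImageProcessorMapping dicts.
    return PROCESSOR_REGISTRY[model_cls]
```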
sglang/srt/managers/{image_processors → multimodal_processors}/mlama.py
@@ -1,15 +1,17 @@
  import asyncio
  from typing import List, Union

- from sglang.srt.managers.image_processor import BaseImageProcessor
- from sglang.srt.managers.image_processors.base_image_processor import (
+ from sglang.srt.managers.multimodal_processors.base_processor import (
+     BaseMultimodalProcessor,
      get_global_processor,
  )
  from sglang.srt.models.mllama import MllamaForConditionalGeneration
  from sglang.srt.utils import load_image


- class MllamaImageProcessor(BaseImageProcessor):
+ class MllamaImageProcessor(BaseMultimodalProcessor):
+     models = [MllamaForConditionalGeneration]
+
      def __init__(self, hf_config, server_args, _processor):
          super().__init__(hf_config, server_args, _processor)

@@ -32,7 +34,7 @@ class MllamaImageProcessor(BaseImageProcessor):

          return image_inputs

-     async def process_images_async(
+     async def process_mm_data_async(
          self, image_data: List[Union[str, bytes]], input_text, *args, **kwargs
      ):
          if not image_data:
@@ -51,10 +53,7 @@ class MllamaImageProcessor(BaseImageProcessor):
          images = load_image(image_data[0])[0]

          image_inputs = await self._process_single_image(images, input_text)
-         image_inputs["image_hashes"] = [hash(str(image_data))]
+         image_inputs["data_hashes"] = [hash(str(image_data))]
          image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]

          return image_inputs
-
-
- ImageProcessorMapping = {MllamaForConditionalGeneration: MllamaImageProcessor}
sglang/srt/managers/{image_processors → multimodal_processors}/qwen_vl.py
@@ -1,11 +1,16 @@
  import asyncio
  import math
+ import time
  from typing import List, Union

+ import torch
  from PIL import Image

- from sglang.srt.managers.image_processor import BaseImageProcessor
- from sglang.srt.managers.image_processors.base_image_processor import (
+ from sglang.srt.managers.multimodal_processor import (
+     BaseMultimodalProcessor as SGLangBaseProcessor,
+ )
+ from sglang.srt.managers.multimodal_processors.base_processor import (
+     MultimodalSpecialTokens,
      get_global_processor,
  )
  from sglang.srt.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
@@ -13,7 +18,9 @@ from sglang.srt.models.qwen2_vl import Qwen2VLForConditionalGeneration


  # Compatible with Qwen2VL and Qwen2_5VL
- class Qwen2_5VLImageProcessor(BaseImageProcessor):
+ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
+     models = [Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration]
+
      def __init__(self, hf_config, server_args, _processor):
          super().__init__(hf_config, server_args, _processor)
          self.IMAGE_TOKEN = "<|vision_start|><|image_pad|><|vision_end|>"
@@ -25,7 +32,6 @@ class Qwen2_5VLImageProcessor(BaseImageProcessor):
          self.IMAGE_FACTOR = 28
          self.MIN_PIXELS = 4 * 28 * 28
          self.MAX_PIXELS = 16384 * 28 * 28
-         self.MAX_PIXELS = 16384 * 28 * 28
          self.MAX_RATIO = 200

      @staticmethod
@@ -44,7 +50,7 @@ class Qwen2_5VLImageProcessor(BaseImageProcessor):
              "video_grid_thws": getattr(result, "video_grid_thws", None),
          }

-     async def _process_images(self, images, input_text) -> dict:
+     async def _process_single_image(self, images, input_text) -> dict:
          if self.executor is not None:
              loop = asyncio.get_event_loop()
              return await loop.run_in_executor(
@@ -57,7 +63,7 @@ class Qwen2_5VLImageProcessor(BaseImageProcessor):
          else:
              return self._process_images_task(images, input_text, self.hf_config)

-     async def process_images_async(
+     async def process_mm_data_async(
          self,
          image_data: List[Union[str, bytes]],
          input_ids,
@@ -66,17 +72,18 @@ class Qwen2_5VLImageProcessor(BaseImageProcessor):
          *args,
          **kwargs,
      ):
+         start = time.time()
          if not image_data:
              return None
          if isinstance(image_data, str):
              image_data = [image_data]

          image_token = self.IMAGE_TOKEN
-         base_output = self.load_images(
-             input_ids,
-             image_data,
-             image_token,
-             max_req_input_len,
+         base_output = self.load_mm_data(
+             input_ids=input_ids,
+             image_data=image_data,
+             multimodal_tokens=MultimodalSpecialTokens(image_token=image_token),
+             max_req_input_len=max_req_input_len,
          )

          def smart_resize(
@@ -137,25 +144,24 @@ class Qwen2_5VLImageProcessor(BaseImageProcessor):
              """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
              return math.floor(number / factor) * factor

-         images = [resize_image(image) for image in base_output.all_frames]
+         images = [resize_image(image) for image in base_output.images]

-         ret = await self._process_images(images, base_output.input_text)
+         ret = await self._process_single_image(
+             images=images, input_text=base_output.input_text
+         )
+
+         image_grid_thws = torch.concat([ret["image_grid_thw"]])
+         video_grid_thws = None
          return {
              "input_ids": ret["input_ids"].flatten().tolist(),
              "pixel_values": ret["pixel_values"],
-             "image_hashes": base_output.image_hashes,
+             "data_hashes": base_output.mm_data_hashes,
              "modalities": request_obj.modalities or ["image"],
-             "image_grid_thws": ret["image_grid_thw"],
-             "video_grid_thws": ret["video_grid_thws"],
+             "image_grid_thws": image_grid_thws,
+             "video_grid_thws": video_grid_thws,
              "im_start_id": self.IM_START_TOKEN_ID,
              "im_end_id": self.IM_END_TOKEN_ID,
              "im_token_id": self.image_token_id,
              "video_token_id": self.video_token_id,
              "second_per_grid_ts": ret["second_per_grid_ts"],
          }
-
-
- ImageProcessorMapping = {
-     Qwen2VLForConditionalGeneration: Qwen2_5VLImageProcessor,
-     Qwen2_5_VLForConditionalGeneration: Qwen2_5VLImageProcessor,
- }
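The constants kept above (`IMAGE_FACTOR = 28`, `MIN_PIXELS`, `MAX_PIXELS`, `MAX_RATIO = 200`) feed the nested `smart_resize` helper whose body is collapsed out of this hunk; only the `floor_by_factor` docstring survives. A sketch of the standard Qwen2-VL resizing rule these constants imply, reconstructed from that docstring and the well-known Qwen2-VL preprocessing; treat it as illustrative rather than the exact shipped body:

```python
import math

def round_by_factor(number, factor: int) -> int:
    """Closest integer to 'number' that is divisible by 'factor'."""
    return round(number / factor) * factor

def ceil_by_factor(number, factor: int) -> int:
    """Smallest integer >= 'number' that is divisible by 'factor'."""
    return math.ceil(number / factor) * factor

def floor_by_factor(number, factor: int) -> int:
    """Largest integer <= 'number' that is divisible by 'factor'."""
    return math.floor(number / factor) * factor

def smart_resize(height, width, factor=28, min_pixels=4 * 28 * 28,
                 max_pixels=16384 * 28 * 28, max_ratio=200):
    # Keep both sides divisible by `factor` and total pixels within bounds.
    if max(height, width) / min(height, width) > max_ratio:
        raise ValueError("absolute aspect ratio exceeds max_ratio")
    h = max(factor, round_by_factor(height, factor))
    w = max(factor, round_by_factor(width, factor))
    if h * w > max_pixels:
        # Too large: scale down, rounding each side down to the factor grid.
        beta = math.sqrt((height * width) / max_pixels)
        h = floor_by_factor(height / beta, factor)
        w = floor_by_factor(width / beta, factor)
    elif h * w < min_pixels:
        # Too small: scale up, rounding each side up to the factor grid.
        beta = math.sqrt(min_pixels / (height * width))
        h = ceil_by_factor(height * beta, factor)
        w = ceil_by_factor(width * beta, factor)
    return h, w
```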
sglang/srt/managers/schedule_batch.py
@@ -42,6 +42,8 @@ import triton.language as tl
  from sglang.global_config import global_config
  from sglang.srt.configs.model_config import ModelConfig
  from sglang.srt.constrained.base_grammar_backend import BaseGrammarObject
+ from sglang.srt.disaggregation.conn import KVSender
+ from sglang.srt.disaggregation.decode import ScheduleBatchDisaggregationDecodeMixin
  from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
  from sglang.srt.mem_cache.chunk_cache import ChunkCache
  from sglang.srt.mem_cache.memory_pool import ReqToTokenPool, TokenToKVPoolAllocator
@@ -49,7 +51,7 @@ from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode, Forw
  from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
  from sglang.srt.sampling.sampling_params import SamplingParams
  from sglang.srt.server_args import ServerArgs
- from sglang.srt.utils import get_compiler_backend, next_power_of_2
+ from sglang.srt.utils import get_compiler_backend

  if TYPE_CHECKING:
      from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
@@ -67,12 +69,15 @@ global_server_args_dict = {
      "enable_nan_detection": ServerArgs.enable_nan_detection,
      "enable_dp_attention": ServerArgs.enable_dp_attention,
      "enable_ep_moe": ServerArgs.enable_ep_moe,
+     "enable_deepep_moe": ServerArgs.enable_deepep_moe,
      "device": ServerArgs.device,
      "speculative_accept_threshold_single": ServerArgs.speculative_accept_threshold_single,
      "speculative_accept_threshold_acc": ServerArgs.speculative_accept_threshold_acc,
      "enable_flashinfer_mla": ServerArgs.enable_flashinfer_mla,
+     "enable_flashmla": ServerArgs.enable_flashmla,
      "disable_radix_cache": ServerArgs.disable_radix_cache,
      "flashinfer_mla_disable_ragged": ServerArgs.flashinfer_mla_disable_ragged,
+     "chunked_prefill_size": ServerArgs.chunked_prefill_size,
  }

  logger = logging.getLogger(__name__)
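The three new entries mirror `ServerArgs` fields into `global_server_args_dict`, so scheduler-side code can gate on them with plain dict lookups rather than threading the args object through; the FlashMLA branch added to `get_model_worker_batch` later in this file is one such consumer. A minimal sketch of that lookup pattern:

```python
# Sketch: runtime feature gates are dict lookups on global_server_args_dict,
# mirroring the condition added to get_model_worker_batch below.
use_mla_decode_path = (
    global_server_args_dict["enable_flashinfer_mla"]
    or global_server_args_dict["enable_flashmla"]
)
```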
@@ -139,11 +144,11 @@ class FINISH_ABORT(BaseFinishReason):


  @dataclasses.dataclass
- class ImageInputs:
+ class MultimodalInputs:
      """The image related inputs."""

      pixel_values: Union[torch.Tensor, np.array]
-     image_hashes: Optional[list] = None
+     data_hashes: Optional[list] = None
      image_sizes: Optional[list] = None
      image_offsets: Optional[list] = None
      image_pad_len: Optional[list] = None
@@ -156,34 +161,48 @@ class ImageInputs:
      aspect_ratio_mask: Optional[List[torch.Tensor]] = None

      # QWen2-VL related
-     image_grid_thws: List[Tuple[int, int, int]] = None
+     # [num_of_images, t, h, w]
+     image_grid_thws: torch.Tensor = None
      mrope_position_delta: Optional[torch.Tensor] = None
+     # Qwen2-VL video related
+     video_token_id: Optional[int] = None
+     video_grid_thws: List[Tuple[int, int, int]] = None
+     second_per_grid_ts: Optional[List[torch.Tensor]] = None
+
+     # deepseek vl2 related
+     images_emb_mask: Optional[List[torch.Tensor]] = None
+     image_spatial_crop: Optional[List[torch.Tensor]] = None

      # The id of the single-image placeholder token
      im_token_id: Optional[torch.Tensor] = None
+
      # All the images in the batch should share the same special image
      # bound token ids.
      im_start_id: Optional[int] = None
      im_end_id: Optional[int] = None
      slice_start_id: Optional[int] = None
      slice_end_id: Optional[int] = None
+     # [num_images, 2 (w, h)]
      tgt_sizes: Optional[list] = None

-     # denotes the number of valid image tokens in each image
-     images_emb_mask: Optional[torch.BoolTensor] = None
+     # audio
+     audio_start_id: Optional[torch.Tensor] = None
+     audio_end_id: Optional[torch.Tensor] = None
+     audio_features: Optional[List[torch.Tensor]] = None
+     audio_feature_lens: Optional[List[torch.Tensor]] = None

      @staticmethod
      def from_dict(obj: dict):
-         ret = ImageInputs(
+         ret = MultimodalInputs(
              pixel_values=obj["pixel_values"],
-             image_hashes=obj["image_hashes"],
+             data_hashes=obj["data_hashes"],
          )

          # Use image hash as fake token_ids. We use this as the key for prefix matching in the radix cache.
          # Please note that if the `input_ids` is later used in the model forward,
          # you also need to clamp the values within the range of [0, vocab_size) to avoid out-of-bound
          # errors in cuda kernels. See also llava.py for example.
-         ret.pad_values = [x % (1 << 30) for x in ret.image_hashes]
+         ret.pad_values = [x % (1 << 30) for x in ret.data_hashes]

          optional_args = [
              "image_sizes",
@@ -191,43 +210,104 @@ class ImageInputs:
              "aspect_ratio_ids",
              "aspect_ratio_mask",
              "image_grid_thws",
+             "images_emb_mask",
+             "image_spatial_crop",
              "im_token_id",
              "im_start_id",
              "im_end_id",
              "slice_start_id",
              "slice_end_id",
              "tgt_sizes",
-             "images_emb_mask",
+             "audio_start_id",
+             "audio_end_id",
+             "audio_features",
+             "audio_feature_lens",
          ]
          for arg in optional_args:
              if arg in obj:
                  setattr(ret, arg, obj[arg])

+         # validate
+         assert (
+             isinstance(ret.pixel_values, torch.Tensor)
+             or isinstance(ret.pixel_values, np.ndarray)
+             or isinstance(ret.pixel_values, list)
+         )
+
+         assert ret.audio_features is None or isinstance(ret.audio_features, list)
+
          return ret

-     def merge(self, other):
-         assert self.pixel_values.shape[1:] == other.pixel_values.shape[1:]
-         self.pixel_values = np.concatenate([self.pixel_values, other.pixel_values])
+     def contains_image_inputs(self) -> bool:
+         """ """
+         return self.pixel_values is not None and self.pixel_values != []
+
+     def contains_audio_inputs(self) -> bool:
+         """ """
+         return self.audio_features is not None and self.audio_features != []
+
+     def merge(self, other: MultimodalInputs):
+         """
+         merge image inputs when requests are being merged
+         """
+         if isinstance(self.pixel_values, list):
+             # in some rare cases, pixel values are list of patches with different shapes
+             # e.g. minicpm
+             self.pixel_values += other.pixel_values
+         else:
+             assert (
+                 self.pixel_values.shape[1:] == other.pixel_values.shape[1:]
+             ), f"{self.pixel_values.shape[1:]} vs {other.pixel_values.shape[1:]}"
+             self.pixel_values = np.concatenate([self.pixel_values, other.pixel_values])
+
+         # args would be stacked along first dim
+         # usually these are already tensors
+         stack_args = [
+             # TODO: merge with image_grid_thws, basically the same thing
+             "tgt_sizes",
+             "image_spatial_crop",
+         ]
+         for arg in stack_args:
+             if getattr(self, arg, None) is None:
+                 setattr(self, arg, getattr(other, arg, None))
+             elif getattr(other, arg, None) is not None:
+                 # self and other both not None
+                 setattr(
+                     self,
+                     arg,
+                     torch.cat([getattr(self, arg), getattr(other, arg)], dim=0),
+                 )
+
+         if self.image_grid_thws is None:
+             self.image_grid_thws = other.image_grid_thws
+         elif other.image_grid_thws is not None:
+             self.image_grid_thws = torch.concat(
+                 [self.image_grid_thws, other.image_grid_thws]
+             )

          # Use image hash as fake token_ids. We use this as the key for prefix matching in the radix cache.
          # Please note that if the `input_ids` is later used in the model forward,
          # you also need to clamp the values within the range of [0, vocab_size) to avoid out-of-bound
          # errors in cuda kernels. See also llava.py for example.
-         self.image_hashes += other.image_hashes
-         self.pad_values = [x % (1 << 30) for x in self.image_hashes]
+         self.data_hashes += other.data_hashes
+         self.pad_values = [x % (1 << 30) for x in self.data_hashes]

+         # args needed to be merged
          optional_args = [
+             "audio_features",
              "image_sizes",
              "image_offsets",
              "image_pad_len",
              # "modalities", # modalities should be ["multi-images"] (one entry) even for multiple images
              "aspect_ratio_ids",
              "aspect_ratio_mask",
-             "image_grid_thws",
+             "images_emb_mask",
          ]
          for arg in optional_args:
-             if getattr(self, arg, None) is not None:
-                 setattr(self, arg, getattr(self, arg) + getattr(other, arg))
+             self_arg = getattr(self, arg, None)
+             if self_arg is not None:
+                 setattr(self, arg, self_arg + getattr(other, arg))
+         # other args would be kept intact


  class Req:
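The `pad_values` comments above describe the core caching trick: image content is hashed, and the hash, clamped into token-id range, stands in for real token ids so the radix cache can prefix-match requests containing the same image. A minimal illustration with hypothetical values:

```python
# Hypothetical illustration of the pad_values derivation used by
# MultimodalInputs.from_dict and merge above.
data_hashes = [hash("image-bytes-a"), hash("image-bytes-b")]

# Clamp into [0, 2**30) so the fake token ids stay bounded; as the comments
# note, callers must still clamp to [0, vocab_size) before a model forward.
pad_values = [x % (1 << 30) for x in data_hashes]
```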
@@ -305,7 +385,7 @@ class Req:
          self.decoded_text = ""

          # For multimodal inputs
-         self.image_inputs: Optional[ImageInputs] = None
+         self.multimodal_inputs: Optional[MultimodalInputs] = None

          # Prefix info
          # The indices to kv cache for the shared prefix.
@@ -378,15 +458,33 @@ class Req:
          self.spec_verify_ct = 0
          self.lora_path = lora_path

+         # For disaggregation
+         self.bootstrap_host: str = "0.0.0.0"
+         self.bootstrap_room: Optional[int] = None
+         self.disagg_kv_sender: Optional[KVSender] = None
+
+         # used for warmup because we don't have a pair yet when init
+         self.skip_kv_transfer: bool = False
+         # the start index of the sent kv cache
+         # We want to send it chunk by chunk for chunked prefill.
+         # After every chunk forward, we do the following:
+         # kv_send(req.input_ids[req.start_send_idx:len(req.fill_ids)])
+         # start_send_idx = len(req.fill_ids)
+         self.start_send_idx: int = 0
+
+         self.metadata_buffer_index: int = -1
+         # The first output_id transferred from prefill instance.
+         self.transferred_output_id: Optional[int] = None
+
      @property
      def seqlen(self):
          return len(self.origin_input_ids) + len(self.output_ids)

      def extend_image_inputs(self, image_inputs):
-         if self.image_inputs is None:
-             self.image_inputs = image_inputs
+         if self.multimodal_inputs is None:
+             self.multimodal_inputs = image_inputs
          else:
-             self.image_inputs.merge(image_inputs)
+             self.multimodal_inputs.merge(image_inputs)

      def finished(self) -> bool:
          # Whether request reached finished condition
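The `start_send_idx` comment above already gives the chunked KV transfer as pseudocode; a sketch of the driver loop it implies, with `kv_send` as a hypothetical stand-in for the transfer call on the new `disagg_kv_sender: KVSender` field:

```python
# Sketch of the chunk-by-chunk KV send described in the Req comments above.
def send_kv_chunk(req, kv_send):
    # Send only the tokens filled in since the previous chunk forward...
    kv_send(req.input_ids[req.start_send_idx : len(req.fill_ids)])
    # ...then advance the watermark so the next chunk starts here.
    req.start_send_idx = len(req.fill_ids)
```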
@@ -513,7 +611,7 @@ bid = 0


  @dataclasses.dataclass
- class ScheduleBatch:
+ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
      """Store all information of a batch on the scheduler."""

      # Request, memory pool, and cache
@@ -727,7 +825,7 @@ class ScheduleBatch:
          self.encoder_cached = []

          for req in self.reqs:
-             im = req.image_inputs
+             im = req.multimodal_inputs
              if im is None or im.num_image_tokens is None:
                  # No image input
                  self.encoder_lens_cpu.append(0)
@@ -840,6 +938,8 @@ class ScheduleBatch:
              # If req.input_embeds is already a list, append its content directly
              input_embeds.extend(req.input_embeds)  # Use extend to avoid nesting

+             if req.is_retracted:
+                 req.already_computed = 0
              req.cached_tokens += pre_len - req.already_computed
              req.already_computed = seq_len
              req.is_retracted = False
@@ -1244,14 +1344,14 @@ class ScheduleBatch:
          self.encoder_lens = torch.cat([self.encoder_lens, other.encoder_lens])
          self.encoder_lens_cpu.extend(other.encoder_lens_cpu)

-         self.req_pool_indices = torch.concat(
+         self.req_pool_indices = torch.cat(
              [self.req_pool_indices, other.req_pool_indices]
          )
-         self.seq_lens = torch.concat([self.seq_lens, other.seq_lens])
+         self.seq_lens = torch.cat([self.seq_lens, other.seq_lens])
          self.out_cache_loc = None
          self.seq_lens_sum += other.seq_lens_sum
          if self.output_ids is not None:
-             self.output_ids = torch.concat([self.output_ids, other.output_ids])
+             self.output_ids = torch.cat([self.output_ids, other.output_ids])
          if self.return_logprob and other.return_logprob:
              self.top_logprobs_nums.extend(other.top_logprobs_nums)
              self.token_ids_logprobs.extend(other.token_ids_logprobs)
@@ -1273,7 +1373,10 @@

      def get_model_worker_batch(self) -> ModelWorkerBatch:
          if self.forward_mode.is_decode_or_idle():
-             if global_server_args_dict["enable_flashinfer_mla"]:
+             if (
+                 global_server_args_dict["enable_flashinfer_mla"]
+                 or global_server_args_dict["enable_flashmla"]
+             ):
                  decode_seq_lens = self.seq_lens.cpu()
              else:
                  decode_seq_lens = None
@@ -1311,7 +1414,7 @@
              extend_seq_lens=extend_seq_lens,
              extend_prefix_lens=extend_prefix_lens,
              extend_logprob_start_lens=extend_logprob_start_lens,
-             image_inputs=[r.image_inputs for r in self.reqs],
+             multimodal_inputs=[r.multimodal_inputs for r in self.reqs],
              encoder_cached=self.encoder_cached,
              encoder_lens=self.encoder_lens,
              encoder_lens_cpu=self.encoder_lens_cpu,
@@ -1394,7 +1497,7 @@ class ModelWorkerBatch:
      extend_input_logprob_token_ids: Optional[torch.Tensor]

      # For multimodal
-     image_inputs: Optional[List[ImageInputs]]
+     multimodal_inputs: Optional[List[MultimodalInputs]]

      # For encoder-decoder
      encoder_cached: Optional[List[bool]]
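For code built on top of these structures, the rename in this file is mechanical; a before/after sketch with a hypothetical caller holding a `ScheduleBatch` as `batch`:

```python
# Hypothetical caller-side rename implied by this file's changes:
mm_inputs = [r.multimodal_inputs for r in batch.reqs]  # 0.4.4.post2
# previously: [r.image_inputs for r in batch.reqs]     # 0.4.4.post1
```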