sglang 0.4.5__py3-none-any.whl → 0.4.5.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. sglang/__init__.py +2 -4
  2. sglang/bench_one_batch.py +23 -2
  3. sglang/bench_serving.py +6 -4
  4. sglang/lang/backend/anthropic.py +0 -4
  5. sglang/lang/backend/base_backend.py +1 -1
  6. sglang/lang/backend/openai.py +1 -1
  7. sglang/lang/backend/vertexai.py +0 -1
  8. sglang/lang/compiler.py +1 -7
  9. sglang/lang/tracer.py +3 -7
  10. sglang/srt/_custom_ops.py +0 -2
  11. sglang/srt/configs/model_config.py +37 -5
  12. sglang/srt/constrained/base_grammar_backend.py +26 -5
  13. sglang/srt/constrained/llguidance_backend.py +1 -0
  14. sglang/srt/constrained/outlines_backend.py +1 -0
  15. sglang/srt/constrained/outlines_jump_forward.py +14 -1
  16. sglang/srt/constrained/reasoner_grammar_backend.py +101 -0
  17. sglang/srt/constrained/triton_ops/bitmask_ops.py +141 -0
  18. sglang/srt/constrained/xgrammar_backend.py +27 -4
  19. sglang/srt/custom_op.py +0 -62
  20. sglang/srt/disaggregation/base/__init__.py +8 -0
  21. sglang/srt/disaggregation/base/conn.py +113 -0
  22. sglang/srt/disaggregation/decode.py +80 -11
  23. sglang/srt/disaggregation/mini_lb.py +58 -123
  24. sglang/srt/disaggregation/mooncake/__init__.py +6 -0
  25. sglang/srt/disaggregation/mooncake/conn.py +585 -0
  26. sglang/srt/disaggregation/mooncake/transfer_engine.py +77 -0
  27. sglang/srt/disaggregation/prefill.py +82 -22
  28. sglang/srt/disaggregation/utils.py +46 -0
  29. sglang/srt/entrypoints/EngineBase.py +53 -0
  30. sglang/srt/entrypoints/engine.py +36 -8
  31. sglang/srt/entrypoints/http_server.py +37 -8
  32. sglang/srt/entrypoints/http_server_engine.py +142 -0
  33. sglang/srt/entrypoints/verl_engine.py +42 -13
  34. sglang/srt/hf_transformers_utils.py +4 -0
  35. sglang/srt/layers/activation.py +6 -8
  36. sglang/srt/layers/attention/flashattention_backend.py +430 -257
  37. sglang/srt/layers/attention/flashinfer_backend.py +18 -9
  38. sglang/srt/layers/attention/torch_native_backend.py +6 -1
  39. sglang/srt/layers/attention/triton_backend.py +6 -0
  40. sglang/srt/layers/attention/triton_ops/extend_attention.py +13 -2
  41. sglang/srt/layers/attention/vision.py +1 -1
  42. sglang/srt/layers/dp_attention.py +2 -4
  43. sglang/srt/layers/elementwise.py +15 -2
  44. sglang/srt/layers/layernorm.py +1 -1
  45. sglang/srt/layers/linear.py +18 -3
  46. sglang/srt/layers/moe/ep_moe/layer.py +15 -29
  47. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +145 -118
  48. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  49. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  50. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  51. sglang/srt/layers/moe/fused_moe_triton/configs/{E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +34 -34
  52. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  53. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  54. sglang/srt/layers/moe/fused_moe_triton/configs/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +46 -34
  56. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
  57. sglang/srt/layers/moe/router.py +7 -1
  58. sglang/srt/layers/moe/topk.py +63 -45
  59. sglang/srt/layers/parameter.py +0 -2
  60. sglang/srt/layers/quantization/__init__.py +13 -5
  61. sglang/srt/layers/quantization/blockwise_int8.py +2 -0
  62. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +12 -2
  63. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +72 -77
  64. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +4 -7
  65. sglang/srt/layers/quantization/fp8.py +131 -136
  66. sglang/srt/layers/quantization/fp8_kernel.py +328 -46
  67. sglang/srt/layers/quantization/fp8_utils.py +206 -253
  68. sglang/srt/layers/quantization/kv_cache.py +43 -52
  69. sglang/srt/layers/quantization/modelopt_quant.py +271 -4
  70. sglang/srt/layers/quantization/moe_wna16.py +2 -0
  71. sglang/srt/layers/quantization/utils.py +5 -11
  72. sglang/srt/layers/quantization/w8a8_fp8.py +156 -4
  73. sglang/srt/layers/quantization/w8a8_int8.py +8 -7
  74. sglang/srt/layers/radix_attention.py +28 -1
  75. sglang/srt/layers/rotary_embedding.py +15 -3
  76. sglang/srt/layers/sampler.py +5 -10
  77. sglang/srt/lora/backend/base_backend.py +18 -2
  78. sglang/srt/lora/backend/flashinfer_backend.py +1 -1
  79. sglang/srt/lora/backend/triton_backend.py +1 -1
  80. sglang/srt/lora/layers.py +1 -1
  81. sglang/srt/lora/lora.py +1 -1
  82. sglang/srt/lora/lora_manager.py +1 -1
  83. sglang/srt/managers/detokenizer_manager.py +0 -1
  84. sglang/srt/managers/io_struct.py +255 -97
  85. sglang/srt/managers/mm_utils.py +7 -5
  86. sglang/srt/managers/multimodal_processor.py +0 -2
  87. sglang/srt/managers/multimodal_processors/base_processor.py +117 -79
  88. sglang/srt/managers/multimodal_processors/janus_pro.py +3 -1
  89. sglang/srt/managers/multimodal_processors/mllama4.py +21 -36
  90. sglang/srt/managers/schedule_batch.py +64 -25
  91. sglang/srt/managers/scheduler.py +80 -82
  92. sglang/srt/managers/tokenizer_manager.py +18 -3
  93. sglang/srt/managers/tp_worker.py +1 -0
  94. sglang/srt/mem_cache/hiradix_cache.py +5 -1
  95. sglang/srt/mem_cache/memory_pool.py +21 -3
  96. sglang/srt/metrics/collector.py +9 -0
  97. sglang/srt/model_executor/cuda_graph_runner.py +9 -6
  98. sglang/srt/model_executor/forward_batch_info.py +234 -15
  99. sglang/srt/model_executor/model_runner.py +67 -35
  100. sglang/srt/model_loader/loader.py +31 -4
  101. sglang/srt/model_loader/weight_utils.py +4 -2
  102. sglang/srt/models/baichuan.py +2 -0
  103. sglang/srt/models/bert.py +398 -0
  104. sglang/srt/models/chatglm.py +1 -0
  105. sglang/srt/models/commandr.py +1 -0
  106. sglang/srt/models/dbrx.py +1 -0
  107. sglang/srt/models/deepseek.py +2 -1
  108. sglang/srt/models/deepseek_nextn.py +74 -70
  109. sglang/srt/models/deepseek_v2.py +494 -366
  110. sglang/srt/models/exaone.py +1 -0
  111. sglang/srt/models/gemma.py +1 -0
  112. sglang/srt/models/gemma2.py +1 -0
  113. sglang/srt/models/gemma3_causal.py +1 -0
  114. sglang/srt/models/gpt2.py +1 -0
  115. sglang/srt/models/gpt_bigcode.py +1 -0
  116. sglang/srt/models/granite.py +1 -0
  117. sglang/srt/models/grok.py +1 -0
  118. sglang/srt/models/internlm2.py +1 -0
  119. sglang/srt/models/llama.py +6 -5
  120. sglang/srt/models/llama4.py +101 -34
  121. sglang/srt/models/minicpm.py +1 -0
  122. sglang/srt/models/minicpm3.py +30 -200
  123. sglang/srt/models/mixtral.py +1 -0
  124. sglang/srt/models/mixtral_quant.py +1 -0
  125. sglang/srt/models/mllama.py +51 -8
  126. sglang/srt/models/mllama4.py +102 -29
  127. sglang/srt/models/olmo.py +1 -0
  128. sglang/srt/models/olmo2.py +1 -0
  129. sglang/srt/models/olmoe.py +1 -0
  130. sglang/srt/models/phi3_small.py +1 -0
  131. sglang/srt/models/qwen.py +1 -0
  132. sglang/srt/models/qwen2.py +5 -1
  133. sglang/srt/models/qwen2_5_vl.py +35 -70
  134. sglang/srt/models/qwen2_moe.py +15 -13
  135. sglang/srt/models/qwen2_vl.py +27 -25
  136. sglang/srt/models/qwen3.py +335 -0
  137. sglang/srt/models/qwen3_moe.py +423 -0
  138. sglang/srt/models/stablelm.py +1 -0
  139. sglang/srt/models/xverse.py +1 -0
  140. sglang/srt/models/xverse_moe.py +1 -0
  141. sglang/srt/openai_api/adapter.py +4 -1
  142. sglang/srt/patch_torch.py +11 -0
  143. sglang/srt/reasoning_parser.py +0 -1
  144. sglang/srt/sampling/sampling_batch_info.py +2 -3
  145. sglang/srt/server_args.py +55 -19
  146. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +4 -4
  147. sglang/srt/speculative/eagle_utils.py +1 -11
  148. sglang/srt/speculative/eagle_worker.py +10 -9
  149. sglang/srt/utils.py +136 -10
  150. sglang/test/attention/test_flashattn_backend.py +259 -221
  151. sglang/test/attention/test_flashattn_mla_backend.py +285 -0
  152. sglang/test/attention/test_prefix_chunk_info.py +224 -0
  153. sglang/test/runners.py +5 -1
  154. sglang/test/test_block_fp8.py +224 -0
  155. sglang/test/test_custom_ops.py +1 -1
  156. sglang/test/test_utils.py +19 -8
  157. sglang/version.py +1 -1
  158. {sglang-0.4.5.dist-info → sglang-0.4.5.post2.dist-info}/METADATA +15 -5
  159. {sglang-0.4.5.dist-info → sglang-0.4.5.post2.dist-info}/RECORD +162 -147
  160. {sglang-0.4.5.dist-info → sglang-0.4.5.post2.dist-info}/WHEEL +1 -1
  161. sglang/lang/__init__.py +0 -0
  162. sglang/srt/disaggregation/conn.py +0 -81
  163. sglang/srt/lora/backend/__init__.py +0 -25
  164. sglang/srt/server.py +0 -18
  165. {sglang-0.4.5.dist-info → sglang-0.4.5.post2.dist-info}/licenses/LICENSE +0 -0
  166. {sglang-0.4.5.dist-info → sglang-0.4.5.post2.dist-info}/top_level.txt +0 -0
sglang/srt/managers/multimodal_processors/base_processor.py

@@ -4,14 +4,14 @@ import dataclasses
 import multiprocessing as mp
 import os
 from abc import ABC, abstractmethod
-from typing import Optional
+from typing import List, Optional

 import numpy as np
 import PIL
-from decord import VideoReader, cpu
-from PIL import Image
+from transformers import BaseImageProcessorFast

-from sglang.srt.utils import encode_video, load_audio, load_image, logger
+from sglang.srt.managers.schedule_batch import Modality
+from sglang.srt.utils import encode_video, load_audio, load_image


 @dataclasses.dataclass
@@ -78,6 +78,10 @@ class BaseMultimodalProcessor(ABC):
             kwargs["audios"] = audios

         processor = self._processor
+        if hasattr(processor, "image_processor") and isinstance(
+            processor.image_processor, BaseImageProcessorFast
+        ):
+            kwargs["device"] = "cuda"
         result = processor.__call__(
             text=[input_text],
             padding=True,
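For context, the new branch above only moves preprocessing to the GPU when the underlying image processor is one of transformers' fast (torch-backed) implementations. A minimal sketch of the same check outside SGLang; the model id is a placeholder for any checkpoint that ships a fast image processor, not something taken from this diff:

```python
# Sketch only: detect a fast (torch-backed) image processor and ask it to run on GPU.
from transformers import AutoProcessor, BaseImageProcessorFast

processor = AutoProcessor.from_pretrained("some/model-with-fast-image-processor")

kwargs = {}
if hasattr(processor, "image_processor") and isinstance(
    processor.image_processor, BaseImageProcessorFast
):
    # Fast image processors accept a device argument and can resize/normalize on GPU.
    kwargs["device"] = "cuda"
# kwargs is then forwarded to processor(...) together with text/images.
```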
@@ -96,6 +100,9 @@ class BaseMultimodalProcessor(ABC):
         """
         estimate the total frame count from all visual input
         """
+        # Lazy import because decord is not available on some arm platforms.
+        from decord import VideoReader, cpu
+
         # Before processing inputs
         estimated_frames_list = []
         for image in image_data:
@@ -111,6 +118,84 @@ class BaseMultimodalProcessor(ABC):

         return estimated_frames_list

+    @staticmethod
+    def _load_single_item(
+        data, is_video, is_audio, frame_count_limit=None, discard_alpha_channel=True
+    ):
+        """Static method that can be pickled for multiprocessing"""
+        try:
+            if is_audio:
+                return load_audio(data)
+            elif is_video:
+                path = data[len("video:") :]
+                return encode_video(path, frame_count_limit)
+            else:
+                img, _ = load_image(data)
+                return img.convert("RGB") if discard_alpha_channel else img
+        except Exception as e:
+            raise RuntimeError(f"Error while loading data {data}: {e}")
+
+    def submit_data_loading_tasks(
+        self,
+        text_parts: List[str],
+        multimodal_tokens: MultimodalSpecialTokens,
+        image_data: Optional[list] = None,
+        audio_data: Optional[list] = None,
+        discard_alpha_channel: bool = True,
+    ):
+        """
+        load multimodal data parallelly
+        """
+
+        # TODO(mick): load from server_args, env, or sampling_params
+        MAX_NUM_FRAMES = 30
+        estimated_frames_list = self.get_estimated_frames_list(image_data=image_data)
+        total_frame_count = sum(estimated_frames_list)
+        # a heuristic value, suggesting the maximum fraction of frames to embed from all visual inputs.
+        # e.g., 0.1 suggests that 1 frame out of 10 input frames should be used
+        scaling_factor = min(1.0, MAX_NUM_FRAMES / max(1, total_frame_count))
+
+        assert len(image_data) == len(estimated_frames_list)
+        # Submit all tasks
+        futures = []
+        task_info = []
+        image_index, audio_index = 0, 0
+
+        for text_part in text_parts:
+            if text_part == multimodal_tokens.image_token:
+                data = image_data[image_index]
+                is_video = isinstance(data, str) and data.startswith("video:")
+                estimated_frames = estimated_frames_list[image_index]
+                frame_count_limit = max(1, int(estimated_frames * scaling_factor))
+                futures.append(
+                    self.io_executor.submit(
+                        BaseMultimodalProcessor._load_single_item,
+                        data,
+                        is_video,
+                        False,
+                        frame_count_limit,
+                        discard_alpha_channel,
+                    )
+                )
+                task_info.append((Modality.IMAGE, data, frame_count_limit))
+                image_index += 1
+            elif text_part == multimodal_tokens.audio_token:
+                data = audio_data[audio_index]
+                futures.append(
+                    self.io_executor.submit(
+                        BaseMultimodalProcessor._load_single_item,
+                        data,
+                        False,
+                        True,
+                        None,
+                        discard_alpha_channel,
+                    )
+                )
+                task_info.append((Modality.AUDIO, data, None))
+                audio_index += 1
+
+        return futures, task_info
+
     def load_mm_data(
         self,
         prompt: str,
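The frame budget in the new `submit_data_loading_tasks` is a simple proportional scale-down: with `MAX_NUM_FRAMES = 30`, each input keeps roughly `estimated_frames * 30 / total_frame_count` frames, never fewer than one. A small worked example with made-up numbers, just to illustrate the arithmetic:

```python
# Worked example of the scaling heuristic above (input values are invented).
MAX_NUM_FRAMES = 30
estimated_frames_list = [20, 20, 20]            # three videos, ~20 frames each
total_frame_count = sum(estimated_frames_list)  # 60
scaling_factor = min(1.0, MAX_NUM_FRAMES / max(1, total_frame_count))  # 0.5
frame_limits = [max(1, int(f * scaling_factor)) for f in estimated_frames_list]
print(frame_limits)  # [10, 10, 10] -> 30 frames total, within the budget
```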
@@ -155,84 +240,37 @@ class BaseMultimodalProcessor(ABC):
         # split text into list of normal text and special tokens
         text_parts = re.split(pattern, prompt)

-        # TODO(mick): load from server_args, env, or sampling_params
-        MAX_NUM_FRAMES = 30
-        estimated_frames_list = self.get_estimated_frames_list(image_data=image_data)
-        total_frame_count = sum(estimated_frames_list)
-        # a heuristic value, suggesting the maximum fraction of frames to embed from all visual inputs.
-        # e.g., 0.1 suggests that 1 frame out of 10 input frames should be used
-        scaling_factor = min(1.0, MAX_NUM_FRAMES / max(1, total_frame_count))
-
-        assert len(image_data) == len(estimated_frames_list)
-
-        image_index, audio_index = 0, 0
-        hashes, image_sizes, images, audios = [], [], [], []
+        futures, task_info = self.submit_data_loading_tasks(
+            text_parts=text_parts,
+            multimodal_tokens=multimodal_tokens,
+            image_data=image_data,
+            audio_data=audio_data,
+            discard_alpha_channel=discard_alpha_channel,
+        )
+        # Process results
+        image_sizes, images, audios = [], [], []
         new_text = ""
-        for index, text_part in enumerate(text_parts):
-            try:
-                if text_part == multimodal_tokens.image_token:
-                    # load as image
-                    if len(images) >= MAX_NUM_FRAMES:
-                        frames_to_process = 0
-                    else:
-                        estimated_frames = estimated_frames_list[image_index]
-                        frames_to_process = max(
-                            1, int(estimated_frames * scaling_factor)
-                        )
-
-                    if frames_to_process == 0:
-                        frames = []
-                    else:
-                        image_file = image_data[image_index]
-                        if isinstance(image_file, str) and image_file.startswith(
-                            "video:"
-                        ):
-                            # video
-                            path = image_file[len("video:") :]
-                            frames = encode_video(
-                                path, frame_count_limit=frames_to_process
-                            )
-                        else:
-                            # image
-                            raw_image, _size = load_image(image_file)
-                            if discard_alpha_channel:
-                                raw_image = raw_image.convert("RGB")
-                            frames = [raw_image]
-                        if len(frames) == 0:
-                            continue
-
-                    image_sizes += frames[0].size * len(frames)
-
-                    # Generate a hashable value for the image file
-                    if isinstance(image_file, Image.Image):
-                        # For PIL.Image objects, use the ID as a hashable value
-                        hash_value = hash(id(image_file))
-                    else:
-                        # For other types (strings, etc.), use the regular hash
-                        hash_value = hash(image_file)
-
-                    hashes += [hash_value] * len(frames)
-                    images += frames
-                    image_index += 1
-                    if frames_to_process != 0:
+        task_ptr = 0
+
+        for text_part in text_parts:
+            if text_part in multimodal_tokens.collect():
+                task_type, data, frame_limit = task_info[task_ptr]
+                result = futures[task_ptr].result()
+                task_ptr += 1
+
+                if task_type == Modality.IMAGE:
+                    frames = [result] if not isinstance(result, list) else result
+                    if frames:
+                        image_sizes += frames[0].size * len(frames)
+                        images += frames
                         new_text += multimodal_tokens.image_token * len(frames)
-                        assert frames_to_process == len(frames)
-                elif text_part == multimodal_tokens.audio_token:
-                    # load as audio
-                    audio_file = audio_data[audio_index]
-                    audio = load_audio(audio_file)
-                    hashes += [hash(audio_file)]
-                    audios += [audio]
-                    audio_index += 1
+                elif task_type == Modality.AUDIO:
+                    # audio
+                    audios.append(result)
                     new_text += multimodal_tokens.audio_token
-                else:
-                    # TODO(mick): handle video
-                    # normal text
-                    new_text += text_part
-
-            except Exception as e:
-                logger.error(f"An exception occurred while loading images: {e}")
-                raise RuntimeError(f"An exception occurred while loading images: {e}")
+                # TODO: handle video
+            else:
+                new_text += text_part

         out = BaseMultiModalProcessorOutput(
             images=images,
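In the rewritten loop above, `futures[i]` and `task_info[i]` always describe the same submitted item, so results can be consumed in prompt order with a single pointer. A stripped-down sketch of that pattern, with a hypothetical `load()` helper standing in for `_load_single_item`:

```python
# Sketch of the futures/task_info pairing used above; load() is a placeholder.
from concurrent.futures import ThreadPoolExecutor

def load(data):
    return data.upper()  # stands in for real image/audio decoding

executor = ThreadPoolExecutor(max_workers=4)
items = ["img-a", "img-b", "audio-c"]

futures, task_info = [], []
for item in items:
    futures.append(executor.submit(load, item))
    task_info.append(("image" if item.startswith("img") else "audio", item))

# Consume in submission order; .result() blocks until that item is ready.
for (kind, original), future in zip(task_info, futures):
    print(kind, original, "->", future.result())
```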
sglang/srt/managers/multimodal_processors/janus_pro.py

@@ -33,7 +33,9 @@ class JanusProImageProcessor(BaseMultimodalProcessor):
         base_out = self.load_mm_data(
             prompt=input_ids,
             image_data=image_data,
-            multimodal_tokens=MultimodalSpecialTokens(image_token=processor.image_tag),
+            multimodal_tokens=MultimodalSpecialTokens(
+                image_token=processor.image_token
+            ),
             max_req_input_len=max_req_input_len,
         )

sglang/srt/managers/multimodal_processors/mllama4.py

@@ -1,10 +1,8 @@
-from typing import List, Mapping, Optional, Tuple, Union
+from typing import List, Union

 import torch
-from PIL import Image
-from transformers import Llama4Processor
 from transformers.image_utils import SizeDict
-from transformers.models.llama4.image_processing_llama4 import (
+from transformers.models.llama4.image_processing_llama4_fast import (
     find_supported_resolutions,
     get_best_fit,
 )
@@ -15,7 +13,6 @@ from sglang.srt.managers.multimodal_processors.base_processor import (
 )
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.mllama4 import Llama4ForConditionalGeneration
-from sglang.srt.utils import load_image


 class Mllama4ImageProcessor(BaseMultimodalProcessor):
@@ -25,6 +22,9 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
         super().__init__(hf_config, server_args, _processor)
         self.vision_config = hf_config.vision_config
         self.text_config = hf_config.text_config
+        self.boi_token_index = hf_config.boi_token_index
+        self.eoi_token_index = hf_config.eoi_token_index
+        self.image_token_index = hf_config.image_token_index
         self.multimodal_tokens = MultimodalSpecialTokens(
             image_token=_processor.image_token
         )
@@ -54,19 +54,16 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
         )

         # Process the images using the processor
-        processor = Llama4Processor.from_pretrained(
-            self.server_args.model_path, **kwargs
-        )
+        processor = self._processor

         # Process the prompt and images
-        image_inputs = processor(
-            text=processed_data.input_text,
+        processor_output = self.process_mm_data(
+            input_text=processed_data.input_text,
             images=processed_data.images,
-            return_tensors="pt",
         )

         # Handle image resolutions and aspect ratios
-        if "pixel_values" in image_inputs:
+        if "pixel_values" in processor_output:
             image_processor = processor.image_processor
             tokenizer = self._processor.tokenizer

@@ -100,8 +97,8 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
             ]

             # Add to image_inputs
-            image_inputs["aspect_ratios"] = aspect_ratios
-            image_inputs["patches_per_image"] = torch.tensor(patches_per_image)
+            processor_output["aspect_ratios"] = aspect_ratios
+            processor_output["patches_per_image"] = torch.tensor(patches_per_image)

             # Process embed_is_patch
             vocab = tokenizer.get_vocab()
@@ -109,7 +106,7 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
             image_end_id = vocab.get(processor.end_of_img_token, -1)

             if patch_id != -1 and image_end_id != -1:
-                input_ids = image_inputs["input_ids"].view(-1)
+                input_ids = processor_output["input_ids"].view(-1)

                 # Remove BOS token if present
                 if input_ids.size(0) > 0 and input_ids[0] == tokenizer.bos_token_id:
@@ -129,33 +126,21 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
                     for per_image_input_ids in split_input_ids:
                         embed_is_patch.append(per_image_input_ids == patch_id)

-                    image_inputs["embed_is_patch"] = embed_is_patch
+                    processor_output["embed_is_patch"] = embed_is_patch

             # Convert to the format expected by SGLang
-            image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]
+            processor_output["input_ids"] = processor_output["input_ids"].tolist()[0]
+
+            processor_output["im_start_id"] = self.boi_token_index
+            processor_output["im_end_id"] = self.eoi_token_index
+            processor_output["im_token_id"] = self.image_token_index

             # Add metadata for image processing
-            image_inputs["mm_items"] = [
+            processor_output["mm_items"] = [
                 MultimodalDataItem(
-                    pixel_values=image_inputs["pixel_values"],
+                    pixel_values=processor_output["pixel_values"],
                     modality=Modality.IMAGE,
-                    # Add additional metadata needed for Llama4 vision processing
-                    embed_is_patch=image_inputs.get("embed_is_patch", None),
-                    aspect_ratios=image_inputs.get("aspect_ratios", None),
-                    patches_per_image=image_inputs.get("patches_per_image", None),
                 )
             ]

-        return image_inputs
-
-    def get_patch_per_chunk(self):
-        """Calculate patches per chunk based on vision config"""
-        image_size = self.vision_config.image_size
-        patch_size = self.vision_config.patch_size
-
-        assert (
-            image_size % patch_size == 0
-        ), f"chunk size {image_size} should be multiple of patch_size {patch_size}"
-
-        ds_ratio = int(round(1.0 / (self.vision_config.pixel_shuffle_ratio**2)))
-        return (image_size // patch_size) ** 2 // ds_ratio
+        return processor_output
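The `im_start_id` / `im_end_id` / `im_token_id` values written into `processor_output` above come straight from the HuggingFace Llama 4 config fields captured in `__init__` earlier in this diff. A hedged sketch of the same lookup; the model id is a placeholder, not a reference taken from this release:

```python
# Sketch only: reads the same config fields the diff captures in __init__.
from transformers import AutoConfig

hf_config = AutoConfig.from_pretrained("some-org/llama-4-checkpoint")
boi_token_index = hf_config.boi_token_index      # begin-of-image token id
eoi_token_index = hf_config.eoi_token_index      # end-of-image token id
image_token_index = hf_config.image_token_index  # per-patch image placeholder id
```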
sglang/srt/managers/schedule_batch.py

@@ -1,5 +1,6 @@
 from __future__ import annotations

+import hashlib
 from enum import Enum, auto

 # Copyright 2023-2024 SGLang Team
@@ -44,7 +45,7 @@ import triton.language as tl
 from sglang.global_config import global_config
 from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.constrained.base_grammar_backend import BaseGrammarObject
-from sglang.srt.disaggregation.conn import KVSender
+from sglang.srt.disaggregation.base import BaseKVSender
 from sglang.srt.disaggregation.decode import ScheduleBatchDisaggregationDecodeMixin
 from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
 from sglang.srt.mem_cache.chunk_cache import ChunkCache
@@ -66,7 +67,6 @@ global_server_args_dict = {
     "attention_backend": ServerArgs.attention_backend,
     "sampling_backend": ServerArgs.sampling_backend,
     "triton_attention_reduce_in_fp32": ServerArgs.triton_attention_reduce_in_fp32,
-    "disable_mla": ServerArgs.disable_mla,
     "torchao_config": ServerArgs.torchao_config,
     "enable_nan_detection": ServerArgs.enable_nan_detection,
     "enable_dp_attention": ServerArgs.enable_dp_attention,
@@ -76,12 +76,12 @@ global_server_args_dict = {
     "device": ServerArgs.device,
     "speculative_accept_threshold_single": ServerArgs.speculative_accept_threshold_single,
     "speculative_accept_threshold_acc": ServerArgs.speculative_accept_threshold_acc,
-    "enable_flashmla": ServerArgs.enable_flashmla,
     "disable_radix_cache": ServerArgs.disable_radix_cache,
     "flashinfer_mla_disable_ragged": ServerArgs.flashinfer_mla_disable_ragged,
+    "moe_dense_tp_size": ServerArgs.moe_dense_tp_size,
     "chunked_prefill_size": ServerArgs.chunked_prefill_size,
     "n_share_experts_fusion": ServerArgs.n_share_experts_fusion,
-    "disable_shared_experts_fusion": ServerArgs.disable_shared_experts_fusion,
+    "disable_chunked_prefix_cache": ServerArgs.disable_chunked_prefix_cache,
 }

 logger = logging.getLogger(__name__)
@@ -157,7 +157,7 @@ class Modality(Enum):
 @dataclasses.dataclass
 class MultimodalDataItem:
     """
-    A single multimodal data, from a single image/video/audio or other
+    A single multimodal data, from a single image/video/audio or others
     """

     modality: Modality
@@ -195,17 +195,54 @@ class MultimodalDataItem:

     def set_pad_value(self):
         """
-        Set the pad value after first hashign the data
+        Set the pad value after first hashing the data
         """

+        def data_hash(data) -> int:
+            hash_bytes = hashlib.sha256(data).digest()[:8]
+            return int.from_bytes(hash_bytes, byteorder="big", signed=False)
+
+        def tensor_hash(tensor_list) -> int:
+            """
+            hash a tensor or a tensor list
+            """
+            tensor = tensor_list
+            if isinstance(tensor_list, list):
+                tensor_list = flatten_nested_list(tensor_list)
+                tensor_list = [
+                    x.flatten() if isinstance(x, torch.Tensor) else x
+                    for x in tensor_list
+                ]
+                tensor = torch.concat(tensor_list)
+
+            tensor = tensor.detach().contiguous()
+
+            if tensor.dtype == torch.bfloat16:
+                # memoryview() doesn't support PyTorch's BFloat16 dtype
+                tensor = tensor.float()
+
+            assert isinstance(tensor, torch.Tensor)
+            if tensor.is_cuda:
+                # TODO: improve this
+                tensor_cpu = tensor.cpu()
+            else:
+                tensor_cpu = tensor
+
+            mv = memoryview(tensor_cpu.numpy())
+            return data_hash(mv.tobytes())
+
         def hash_feature(f):
             if isinstance(f, list):
-                return hash(tuple(flatten_nested_list(f)))
+                if isinstance(f[0], torch.Tensor):
+                    return tensor_hash(f)
+                return data_hash(tuple(flatten_nested_list(f)))
             elif isinstance(f, np.ndarray):
                 arr = np.ascontiguousarray(f)
                 arr_bytes = arr.tobytes()
-                return hash(arr_bytes)
-            return hash(f)
+                return data_hash(arr_bytes)
+            elif isinstance(f, torch.Tensor):
+                return tensor_hash([f])
+            return data_hash(f)

         if self.is_audio():
             self.hash = hash_feature(self.audio_features)
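The switch from Python's built-in `hash()` to a truncated SHA-256 digest makes the pad value content-addressed and stable across processes (built-in `hash()` of bytes and strings is randomized per interpreter run). A small standalone check of the same idea, assuming only `torch` and `hashlib`:

```python
import hashlib
import torch

def data_hash(data) -> int:
    # First 8 bytes of SHA-256, interpreted as an unsigned 64-bit int.
    return int.from_bytes(hashlib.sha256(data).digest()[:8], "big", signed=False)

def tensor_hash(t: torch.Tensor) -> int:
    t = t.detach().contiguous()
    if t.dtype == torch.bfloat16:
        t = t.float()  # numpy/memoryview cannot view bfloat16 storage
    return data_hash(memoryview(t.cpu().numpy()).tobytes())

a = torch.arange(16, dtype=torch.bfloat16)
b = torch.arange(16, dtype=torch.bfloat16)
assert tensor_hash(a) == tensor_hash(b)  # equal contents -> equal hash, in any process
```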
@@ -230,6 +267,9 @@ class MultimodalDataItem:
             self.modality == Modality.VIDEO
         ) and not MultimodalDataItem.is_empty_list(self.pixel_values)

+    def is_valid(self) -> bool:
+        return self.is_image() or self.is_video() or self.is_audio()
+
     def validate(self):
         ...
         # TODO
@@ -248,7 +288,7 @@ class MultimodalInputs:
     mrope_position_delta: Optional[torch.Tensor] = None

     # image
-    im_token_id: Optional[torch.Tensor] = None
+    im_token_id: Optional[int] = None
     im_start_id: Optional[int] = None
     im_end_id: Optional[int] = None
     slice_start_id: Optional[int] = None
@@ -268,11 +308,7 @@ class MultimodalInputs:
         )

         assert isinstance(ret.mm_items, list)
-        ret.mm_items = [
-            item
-            for item in ret.mm_items
-            if item.is_audio() or item.is_image() or item.is_video()
-        ]
+        ret.mm_items = [item for item in ret.mm_items if item.is_valid()]

         assert len(ret.mm_items) != 0

@@ -284,7 +320,6 @@ class MultimodalInputs:
             item.set_pad_value()

         optional_args = [
-            "modalities",
             "im_token_id",
             "im_start_id",
             "im_end_id",
@@ -307,8 +342,8 @@ class MultimodalInputs:
         """ """
         return any(item.is_audio() for item in self.mm_items)

-    def collect_image_inputs(self) -> List[torch.Tensor]:
-        return [item.pixel_values for item in self.mm_items if item.is_image()]
+    def contains_mm_input(self) -> bool:
+        return any(True for item in self.mm_items if item.is_valid())

     def merge(self, other: MultimodalInputs):
         """
@@ -322,10 +357,8 @@ class MultimodalInputs:

         # args needed to be merged
         optional_args = [
-            "items",
-            "image_offsets",
+            "mm_items",
             "image_pad_len",
-            # "modalities", # modalities should be ["multi-images"] (one entry) even for multiple images
         ]
         for arg in optional_args:
             self_arg = getattr(self, arg, None)
@@ -354,6 +387,8 @@ class Req:
         custom_logit_processor: Optional[str] = None,
         return_hidden_states: bool = False,
         eos_token_ids: Optional[Set[int]] = None,
+        bootstrap_host: Optional[str] = None,
+        bootstrap_room: Optional[int] = None,
     ):
         # Input and output info
         self.rid = rid
@@ -438,6 +473,10 @@ class Req:
         self.temp_scaled_logprobs = False
         self.top_p_normalized_logprobs = False

+        # Latency Breakdown
+        self.queue_time_start = None
+        self.queue_time_end = None
+
         # Logprobs (return values)
         self.input_token_logprobs_val: Optional[List[float]] = None
         self.input_token_logprobs_idx: Optional[List[int]] = None
@@ -483,9 +522,9 @@ class Req:
         self.lora_path = lora_path

         # For disaggregation
-        self.bootstrap_host: str = "0.0.0.0"
-        self.bootstrap_room: Optional[int] = None
-        self.disagg_kv_sender: Optional[KVSender] = None
+        self.bootstrap_host: str = bootstrap_host
+        self.bootstrap_room: Optional[int] = bootstrap_room
+        self.disagg_kv_sender: Optional[BaseKVSender] = None

         # used for warmup because we don't have a pair yet when init
         self.skip_kv_transfer: bool = False
@@ -1440,7 +1479,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
                 global_server_args_dict["use_mla_backend"]
                 and global_server_args_dict["attention_backend"] == "flashinfer"
             )
-            or global_server_args_dict["enable_flashmla"]
+            or global_server_args_dict["attention_backend"] == "flashmla"
             or global_server_args_dict["attention_backend"] == "fa3"
         ):
             seq_lens_cpu = self.seq_lens.cpu()
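The last hunk reflects a server-args consolidation in this release: FlashMLA is now selected through the `attention_backend` value rather than a separate `enable_flashmla` flag, so backend checks compare strings. A minimal, purely illustrative sketch of the resulting dispatch (the dictionary contents and the variable name are assumptions, not code from the wheel):

```python
# Illustrative only: mirrors the string-based backend check in the hunk above.
global_server_args_dict = {"use_mla_backend": True, "attention_backend": "flashmla"}

needs_seq_lens_cpu = (
    (
        global_server_args_dict["use_mla_backend"]
        and global_server_args_dict["attention_backend"] == "flashinfer"
    )
    or global_server_args_dict["attention_backend"] == "flashmla"
    or global_server_args_dict["attention_backend"] == "fa3"
)
print(needs_seq_lens_cpu)  # True
```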