sglang 0.4.1.post5__py3-none-any.whl → 0.4.1.post7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129)
  1. sglang/__init__.py +21 -23
  2. sglang/api.py +2 -7
  3. sglang/bench_offline_throughput.py +24 -16
  4. sglang/bench_one_batch.py +51 -3
  5. sglang/bench_one_batch_server.py +1 -1
  6. sglang/bench_serving.py +37 -28
  7. sglang/lang/backend/runtime_endpoint.py +183 -4
  8. sglang/lang/chat_template.py +15 -4
  9. sglang/launch_server.py +1 -1
  10. sglang/srt/_custom_ops.py +80 -42
  11. sglang/srt/configs/device_config.py +1 -1
  12. sglang/srt/configs/model_config.py +16 -6
  13. sglang/srt/constrained/base_grammar_backend.py +21 -0
  14. sglang/srt/constrained/xgrammar_backend.py +8 -4
  15. sglang/srt/conversation.py +14 -1
  16. sglang/srt/distributed/__init__.py +3 -3
  17. sglang/srt/distributed/communication_op.py +2 -1
  18. sglang/srt/distributed/device_communicators/cuda_wrapper.py +2 -1
  19. sglang/srt/distributed/device_communicators/custom_all_reduce.py +107 -40
  20. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
  21. sglang/srt/distributed/device_communicators/hpu_communicator.py +2 -1
  22. sglang/srt/distributed/device_communicators/pynccl.py +80 -1
  23. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +112 -2
  24. sglang/srt/distributed/device_communicators/shm_broadcast.py +5 -72
  25. sglang/srt/distributed/device_communicators/xpu_communicator.py +2 -1
  26. sglang/srt/distributed/parallel_state.py +1 -1
  27. sglang/srt/distributed/utils.py +2 -1
  28. sglang/srt/entrypoints/engine.py +449 -0
  29. sglang/srt/entrypoints/http_server.py +579 -0
  30. sglang/srt/layers/activation.py +3 -3
  31. sglang/srt/layers/attention/flashinfer_backend.py +27 -12
  32. sglang/srt/layers/attention/triton_backend.py +4 -6
  33. sglang/srt/layers/attention/vision.py +204 -0
  34. sglang/srt/layers/dp_attention.py +69 -0
  35. sglang/srt/layers/linear.py +76 -102
  36. sglang/srt/layers/logits_processor.py +48 -63
  37. sglang/srt/layers/moe/ep_moe/layer.py +4 -4
  38. sglang/srt/layers/moe/fused_moe_native.py +69 -0
  39. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -6
  40. sglang/srt/layers/moe/fused_moe_triton/layer.py +66 -14
  41. sglang/srt/layers/moe/topk.py +4 -2
  42. sglang/srt/layers/parameter.py +26 -17
  43. sglang/srt/layers/quantization/__init__.py +22 -23
  44. sglang/srt/layers/quantization/fp8.py +112 -55
  45. sglang/srt/layers/quantization/fp8_utils.py +1 -1
  46. sglang/srt/layers/quantization/int8_kernel.py +54 -0
  47. sglang/srt/layers/quantization/modelopt_quant.py +2 -3
  48. sglang/srt/layers/quantization/w8a8_int8.py +117 -0
  49. sglang/srt/layers/radix_attention.py +2 -0
  50. sglang/srt/layers/rotary_embedding.py +1179 -31
  51. sglang/srt/layers/sampler.py +39 -1
  52. sglang/srt/layers/vocab_parallel_embedding.py +17 -4
  53. sglang/srt/lora/lora.py +1 -9
  54. sglang/srt/managers/configure_logging.py +46 -0
  55. sglang/srt/managers/data_parallel_controller.py +79 -72
  56. sglang/srt/managers/detokenizer_manager.py +23 -8
  57. sglang/srt/managers/image_processor.py +158 -2
  58. sglang/srt/managers/io_struct.py +54 -15
  59. sglang/srt/managers/schedule_batch.py +49 -22
  60. sglang/srt/managers/schedule_policy.py +26 -12
  61. sglang/srt/managers/scheduler.py +319 -181
  62. sglang/srt/managers/session_controller.py +1 -0
  63. sglang/srt/managers/tokenizer_manager.py +303 -158
  64. sglang/srt/managers/tp_worker.py +6 -4
  65. sglang/srt/managers/tp_worker_overlap_thread.py +5 -8
  66. sglang/srt/managers/utils.py +44 -0
  67. sglang/srt/mem_cache/memory_pool.py +110 -77
  68. sglang/srt/metrics/collector.py +25 -11
  69. sglang/srt/model_executor/cuda_graph_runner.py +4 -6
  70. sglang/srt/model_executor/model_runner.py +80 -21
  71. sglang/srt/model_loader/loader.py +8 -6
  72. sglang/srt/model_loader/weight_utils.py +55 -2
  73. sglang/srt/models/baichuan.py +6 -6
  74. sglang/srt/models/chatglm.py +2 -2
  75. sglang/srt/models/commandr.py +3 -3
  76. sglang/srt/models/dbrx.py +4 -4
  77. sglang/srt/models/deepseek.py +3 -3
  78. sglang/srt/models/deepseek_v2.py +8 -8
  79. sglang/srt/models/exaone.py +2 -2
  80. sglang/srt/models/gemma.py +2 -2
  81. sglang/srt/models/gemma2.py +6 -24
  82. sglang/srt/models/gpt2.py +3 -5
  83. sglang/srt/models/gpt_bigcode.py +1 -1
  84. sglang/srt/models/granite.py +2 -2
  85. sglang/srt/models/grok.py +3 -3
  86. sglang/srt/models/internlm2.py +2 -2
  87. sglang/srt/models/llama.py +41 -4
  88. sglang/srt/models/minicpm.py +2 -2
  89. sglang/srt/models/minicpm3.py +6 -6
  90. sglang/srt/models/minicpmv.py +1238 -0
  91. sglang/srt/models/mixtral.py +3 -3
  92. sglang/srt/models/mixtral_quant.py +3 -3
  93. sglang/srt/models/mllama.py +2 -2
  94. sglang/srt/models/olmo.py +3 -3
  95. sglang/srt/models/olmo2.py +4 -4
  96. sglang/srt/models/olmoe.py +7 -13
  97. sglang/srt/models/phi3_small.py +2 -2
  98. sglang/srt/models/qwen.py +2 -2
  99. sglang/srt/models/qwen2.py +52 -4
  100. sglang/srt/models/qwen2_eagle.py +131 -0
  101. sglang/srt/models/qwen2_moe.py +3 -3
  102. sglang/srt/models/qwen2_vl.py +22 -122
  103. sglang/srt/models/stablelm.py +2 -2
  104. sglang/srt/models/torch_native_llama.py +3 -3
  105. sglang/srt/models/xverse.py +6 -6
  106. sglang/srt/models/xverse_moe.py +6 -6
  107. sglang/srt/openai_api/protocol.py +2 -0
  108. sglang/srt/sampling/custom_logit_processor.py +38 -0
  109. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +15 -5
  110. sglang/srt/sampling/sampling_batch_info.py +153 -9
  111. sglang/srt/sampling/sampling_params.py +4 -2
  112. sglang/srt/server.py +4 -1037
  113. sglang/srt/server_args.py +84 -32
  114. sglang/srt/speculative/eagle_worker.py +1 -0
  115. sglang/srt/torch_memory_saver_adapter.py +59 -0
  116. sglang/srt/utils.py +130 -63
  117. sglang/test/runners.py +8 -13
  118. sglang/test/test_programs.py +1 -1
  119. sglang/test/test_utils.py +3 -1
  120. sglang/utils.py +12 -2
  121. sglang/version.py +1 -1
  122. {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post7.dist-info}/METADATA +26 -13
  123. {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post7.dist-info}/RECORD +126 -117
  124. sglang/launch_server_llavavid.py +0 -25
  125. sglang/srt/constrained/__init__.py +0 -16
  126. sglang/srt/distributed/device_communicators/__init__.py +0 -0
  127. {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post7.dist-info}/LICENSE +0 -0
  128. {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post7.dist-info}/WHEEL +0 -0
  129. {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post7.dist-info}/top_level.txt +0 -0
sglang/srt/managers/image_processor.py
@@ -9,6 +9,8 @@ from typing import List, Optional, Union
 
 import numpy as np
 import transformers
+from decord import VideoReader, cpu
+from PIL import Image
 
 from sglang.srt.hf_transformers_utils import get_processor
 from sglang.srt.mm_utils import expand2square, process_anyres_image
@@ -36,6 +38,7 @@ class BaseImageProcessor(ABC):
     def __init__(self, hf_config, server_args, _processor):
         self.hf_config = hf_config
         self._processor = _processor
+        self.server_args = server_args
 
         self.executor = concurrent.futures.ProcessPoolExecutor(
             initializer=init_global_processor,
@@ -126,7 +129,12 @@ class LlavaImageProcessor(BaseImageProcessor):
             )
 
     async def process_images_async(
-        self, image_data: List[Union[str, bytes]], input_text, request_obj
+        self,
+        image_data: List[Union[str, bytes]],
+        input_text,
+        request_obj,
+        *args,
+        **kwargs,
     ):
         if not image_data:
             return None
@@ -229,6 +237,147 @@ class MllamaImageProcessor(BaseImageProcessor):
         return image_inputs
 
 
+class MiniCPMVImageProcessor(BaseImageProcessor):
+    def __init__(self, hf_config, server_args, _processor):
+        super().__init__(hf_config, server_args, _processor)
+
+    @staticmethod
+    def _process_images_task(images, input_text):
+        result = global_processor.__call__(
+            text=input_text, images=images, return_tensors="pt"
+        )
+        return {
+            "input_ids": result["input_ids"],
+            "pixel_values": result["pixel_values"],
+            "tgt_sizes": result["tgt_sizes"],
+        }
+
+    async def _process_images(self, images, input_text):
+        if self.executor is not None:
+            loop = asyncio.get_event_loop()
+            image_inputs = await loop.run_in_executor(
+                self.executor,
+                MiniCPMVImageProcessor._process_images_task,
+                images,
+                input_text,
+            )
+        else:
+            image_inputs = self._processor(
+                images=images, text=input_text, return_tensors="pt"
+            )
+
+        return image_inputs
+
+    async def process_images_async(
+        self,
+        image_data: List[Union[str, bytes]],
+        input_text,
+        request_obj,
+        max_req_input_len,
+    ):
+        if not image_data:
+            return None
+
+        if not isinstance(image_data, list):
+            image_data = [image_data]
+
+        image_hashes, image_sizes = [], []
+        raw_images = []
+        IMAGE_TOKEN = "(<image>./</image>)"
+
+        # roughly calculate the max number of frames
+        # TODO: the process should be applied to all the visual inputs
+        def calculate_max_num_frames() -> int:
+            # Model-specific
+            NUM_TOKEN_PER_FRAME = 330
+
+            ret = (max_req_input_len - len(input_text)) // NUM_TOKEN_PER_FRAME
+            return min(ret, 100)
+
+        # if cuda OOM set a smaller number
+        MAX_NUM_FRAMES = calculate_max_num_frames()
+        print(f"MAX_NUM_FRAMES: {MAX_NUM_FRAMES}")
+
+        def encode_video(video_path):
+            if not os.path.exists(video_path):
+                logger.error(f"Video {video_path} does not exist")
+                return []
+
+            if MAX_NUM_FRAMES == 0:
+                return []
+
+            def uniform_sample(l, n):
+                gap = len(l) / n
+                idxs = [int(i * gap + gap / 2) for i in range(n)]
+                return [l[i] for i in idxs]
+
+            vr = VideoReader(video_path, ctx=cpu(0))
+            sample_fps = round(vr.get_avg_fps() / 1)  # FPS
+            frame_idx = [i for i in range(0, len(vr), sample_fps)]
+            if len(frame_idx) > MAX_NUM_FRAMES:
+                frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
+            frames = vr.get_batch(frame_idx).asnumpy()
+            frames = [Image.fromarray(v.astype("uint8")) for v in frames]
+            return frames
+
+        if isinstance(input_text, list):
+            assert len(input_text) and isinstance(input_text[0], int)
+            input_text = self._processor.tokenizer.decode(input_text)
+
+        # MiniCPMV requires each frame of video as a single image token
+        text_parts = input_text.split(IMAGE_TOKEN)
+        new_text_parts = []
+
+        for image_index, image in enumerate(image_data):
+            try:
+                if isinstance(image, str) and image.startswith("video:"):
+                    path = image[len("video:") :]
+                    frames = encode_video(path)
+                else:
+                    raw_image, size = load_image(image)
+                    frames = [raw_image]
+                if len(frames) == 0:
+                    continue
+            except FileNotFoundError as e:
+                print(e)
+                return None
+
+            image_sizes += frames[0].size * len(frames)
+            image_hashes += [hash(image)] * len(frames)
+            raw_images += frames
+            new_text_parts.append(text_parts[image_index])
+            new_text_parts.append(IMAGE_TOKEN * len(frames))
+
+        new_text_parts.append(text_parts[-1])
+        input_text = "".join(new_text_parts)
+        if len(raw_images) == 0:
+            return None
+        res = await self._process_images(images=raw_images, input_text=input_text)
+        pixel_values = res["pixel_values"]
+        tgt_sizes = res["tgt_sizes"]
+        input_ids = res["input_ids"]
+
+        # Collect special token ids
+        tokenizer = self._processor.tokenizer
+        im_start_id = [tokenizer.im_start_id]
+        im_end_id = [tokenizer.im_end_id]
+        if tokenizer.slice_start_id:
+            slice_start_id = [tokenizer.slice_start_id]
+            slice_end_id = [tokenizer.slice_end_id]
+
+        return {
+            "input_ids": input_ids.flatten().tolist(),
+            "pixel_values": pixel_values,
+            "tgt_sizes": tgt_sizes,
+            "image_hashes": image_hashes,
+            "modalities": request_obj.modalities or ["image"],
+            "im_start_id": im_start_id,
+            "im_end_id": im_end_id,
+            "slice_start_id": slice_start_id,
+            "slice_end_id": slice_end_id,
+        }
+
+
 class Qwen2VLImageProcessor(BaseImageProcessor):
     def __init__(self, hf_config, server_args, _image_processor):
         self.hf_config = hf_config
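Note on the new MiniCPMVImageProcessor above: it budgets video frames against the remaining request length (roughly 330 prompt tokens per frame, capped at 100 frames) and thins the decoded frames by uniform sampling. A minimal standalone sketch of that budget-and-sample logic, detached from decord and the HF processor (illustrative only, not the packaged code path):

# Illustrative sketch of the frame budgeting used by MiniCPMVImageProcessor;
# the names mirror the diff, but this is not the sglang implementation itself.
def calculate_max_num_frames(max_req_input_len: int, prompt_len: int,
                             num_token_per_frame: int = 330) -> int:
    # Each frame costs roughly `num_token_per_frame` tokens; never take more than 100.
    return min((max_req_input_len - prompt_len) // num_token_per_frame, 100)

def uniform_sample(items, n):
    # Keep n items spread evenly across the list (midpoint of each bucket).
    gap = len(items) / n
    return [items[int(i * gap + gap / 2)] for i in range(n)]

frame_idx = list(range(0, 300, 6))            # e.g. every 6th frame of a 300-frame clip (50 candidates)
budget = calculate_max_num_frames(4096, 512)  # 10 frames fit in the remaining context
if len(frame_idx) > budget:
    frame_idx = uniform_sample(frame_idx, budget)
print(len(frame_idx))                         # 10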
@@ -289,7 +438,12 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
             return self._process_single_image_task(image_data)
 
     async def process_images_async(
-        self, image_data: List[Union[str, bytes]], input_text, request_obj
+        self,
+        image_data: List[Union[str, bytes]],
+        input_text,
+        request_obj,
+        *args,
+        **kwargs,
     ):
         if not image_data:
             return None
@@ -350,6 +504,8 @@ def get_image_processor(
         return MllamaImageProcessor(hf_config, server_args, processor)
     elif "Qwen2VLForConditionalGeneration" in hf_config.architectures:
         return Qwen2VLImageProcessor(hf_config, server_args, processor.image_processor)
+    elif "MiniCPMV" in hf_config.architectures:
+        return MiniCPMVImageProcessor(hf_config, server_args, processor)
     else:
         return LlavaImageProcessor(hf_config, server_args, processor.image_processor)
 
sglang/srt/managers/io_struct.py
@@ -19,9 +19,7 @@ processes (TokenizerManager, DetokenizerManager, Controller).
 import uuid
 from dataclasses import dataclass
 from enum import Enum
-from typing import Dict, List, Optional, Tuple, Union
-
-import torch
+from typing import Dict, List, Optional, Union
 
 from sglang.srt.managers.schedule_batch import BaseFinishReason
 from sglang.srt.sampling.sampling_params import SamplingParams
@@ -61,6 +59,9 @@ class GenerateReqInput:
     return_text_in_logprobs: bool = False
     # Whether to stream output.
     stream: bool = False
+    # Whether to log metrics for this request (e.g. health_generate calls do not log metrics)
+    log_metrics: bool = True
+
     # The modalities of the image data [image, multi-images, video]
     modalities: Optional[List[str]] = None
     # LoRA related
@@ -68,6 +69,8 @@ class GenerateReqInput:
 
     # Session info for continual prompting
     session_params: Optional[Union[List[Dict], Dict]] = None
+    # Custom logit processor (serialized function)
+    custom_logit_processor: Optional[Union[List[Optional[str]], Optional[str]]] = None
 
     def normalize_batch_and_arguments(self):
         if (
@@ -182,6 +185,13 @@ class GenerateReqInput:
             else:
                 assert self.parallel_sample_num == 1
 
+            if self.custom_logit_processor is None:
+                self.custom_logit_processor = [None] * num
+            elif not isinstance(self.custom_logit_processor, list):
+                self.custom_logit_processor = [self.custom_logit_processor] * num
+            else:
+                assert self.parallel_sample_num == 1
+
     def regenerate_rid(self):
         self.rid = uuid.uuid4().hex
         return self.rid
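Note: `custom_logit_processor` travels through GenerateReqInput and TokenizedGenerateReqInput as a serialized string and is re-attached to each Req, gated by the new `enable_custom_logit_processor` switch visible in the ScheduleBatch and server_args changes (see also `sglang/srt/sampling/custom_logit_processor.py` in the file list). A hedged sketch of the general idea, using plain pickle + base64 as a stand-in wire format rather than sglang's actual serializer or API:

# Illustrative only: shows how a per-request logit processor could be shipped as a
# string and applied to logits before sampling. Not sglang's serializer or API.
import base64
import pickle

import torch

class BanTokenProcessor:
    """Force one token id to never be sampled."""
    def __init__(self, banned_id: int):
        self.banned_id = banned_id

    def __call__(self, logits: torch.Tensor) -> torch.Tensor:
        logits[..., self.banned_id] = float("-inf")
        return logits

serialized = base64.b64encode(pickle.dumps(BanTokenProcessor(42))).decode()  # travels in the request
processor = pickle.loads(base64.b64decode(serialized))                       # restored server-side
logits = torch.zeros(1, 128)
print(processor(logits)[0, 42].item())  # -inf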
@@ -198,8 +208,14 @@ class GenerateReqInput:
             top_logprobs_num=self.top_logprobs_num[i],
             return_text_in_logprobs=self.return_text_in_logprobs,
             stream=self.stream,
+            log_metrics=self.log_metrics,
             modalities=self.modalities[i] if self.modalities else None,
             lora_path=self.lora_path[i] if self.lora_path is not None else None,
+            custom_logit_processor=(
+                self.custom_logit_processor[i]
+                if self.custom_logit_processor is not None
+                else None
+            ),
         )
 
 
@@ -232,6 +248,10 @@ class TokenizedGenerateReqInput:
     # Session info for continual prompting
     session_params: Optional[SessionParams] = None
 
+    # Custom logit processor (serialized function)
+    # TODO (hpguo): Add an example and update doc string here
+    custom_logit_processor: Optional[str] = None
+
 
 @dataclass
 class EmbeddingReqInput:
@@ -245,6 +265,8 @@ class EmbeddingReqInput:
     sampling_params: Union[List[Dict], Dict] = None
     # Dummy input embeds for compatibility
     input_embeds: Optional[Union[List[List[List[float]]], List[List[float]]]] = None
+    # Whether to log metrics for this request (e.g. health_generate calls do not log metrics)
+    log_metrics: bool = True
 
     def normalize_batch_and_arguments(self):
         if (self.text is None and self.input_ids is None) or (
@@ -323,9 +345,7 @@ class BatchTokenIDOut:
     decoded_texts: List[str]
     decode_ids: List[int]
     read_offsets: List[int]
-    # Only used when `--return-token-ids` is set
-    origin_input_ids: Optional[List[int]]
-    # Only used when `--skip-tokenizer-init` or `--return-token-ids` is set
+    # Only used when `--skip-tokenizer-init` is on
     output_ids: Optional[List[int]]
     # Detokenization configs
     skip_special_tokens: List[bool]
@@ -344,7 +364,6 @@ class BatchTokenIDOut:
     input_top_logprobs_idx: List[List]
     output_top_logprobs_val: List[List]
     output_top_logprobs_idx: List[List]
-    normalized_prompt_logprob: List[float]
 
 
 @dataclass
@@ -356,14 +375,7 @@ class BatchStrOut:
     # The output decoded strings
     output_strs: List[str]
 
-    # The token ids
-    origin_input_ids: Optional[List[int]]
-    output_ids: Optional[List[int]]
-
     # Token counts
-    # real input and output tokens can be get from
-    # origin_input_ids and output_ids by enabling --return_token_ids
-    # TODO (Shuai): Rename this to clarify the meaning.
     prompt_tokens: List[int]
     completion_tokens: List[int]
     cached_tokens: List[int]
@@ -377,7 +389,6 @@ class BatchStrOut:
     input_top_logprobs_idx: List[List]
     output_top_logprobs_val: List[List]
     output_top_logprobs_idx: List[List]
-    normalized_prompt_logprob: List[float]
 
 
 @dataclass
@@ -468,6 +479,26 @@ class GetWeightsByNameReqOutput:
     parameter: list
 
 
+@dataclass
+class ReleaseMemoryOccupationReqInput:
+    pass
+
+
+@dataclass
+class ReleaseMemoryOccupationReqOutput:
+    pass
+
+
+@dataclass
+class ResumeMemoryOccupationReqInput:
+    pass
+
+
+@dataclass
+class ResumeMemoryOccupationReqOutput:
+    pass
+
+
 @dataclass
 class AbortReq:
     # The request id
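Note: the empty ReleaseMemoryOccupationReqInput/ResumeMemoryOccupationReqInput pairs follow the same pattern as the other io_struct messages: every control action is a typed request/response dataclass that is forwarded between the tokenizer manager and the scheduler process and matched by type. A minimal sketch of that dispatch pattern (the handler below is hypothetical, not the sglang scheduler API; only the dataclass shapes come from the diff):

# Sketch of the typed request/response message pattern used throughout io_struct.py.
from dataclasses import dataclass

@dataclass
class ReleaseMemoryOccupationReqInput:
    pass

@dataclass
class ReleaseMemoryOccupationReqOutput:
    pass

def handle(msg):
    # Dispatch on the concrete message type; an empty dataclass still carries intent.
    if isinstance(msg, ReleaseMemoryOccupationReqInput):
        # ... release weights / KV-cache memory here ...
        return ReleaseMemoryOccupationReqOutput()
    raise TypeError(f"unhandled message: {type(msg).__name__}")

print(handle(ReleaseMemoryOccupationReqInput()))  # ReleaseMemoryOccupationReqOutput()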
@@ -479,6 +510,14 @@ class ProfileReq(Enum):
     STOP_PROFILE = 2
 
 
+@dataclass
+class ConfigureLoggingReq:
+    log_requests: Optional[bool] = None
+    log_requests_level: Optional[int] = None
+    dump_requests_folder: Optional[str] = None
+    dump_requests_threshold: Optional[int] = None
+
+
 @dataclass
 class OpenSessionReqInput:
     capacity_of_str_len: int
sglang/srt/managers/schedule_batch.py
@@ -52,7 +52,6 @@ from sglang.srt.server_args import ServerArgs
 if TYPE_CHECKING:
     from sglang.srt.speculative.spec_info import SpecInfo, SpeculativeAlgorithm
 
-
 INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5
 
 # Put some global args for easy access
@@ -65,9 +64,9 @@ global_server_args_dict = {
     "enable_nan_detection": ServerArgs.enable_nan_detection,
     "enable_dp_attention": ServerArgs.enable_dp_attention,
     "enable_ep_moe": ServerArgs.enable_ep_moe,
+    "device": ServerArgs.device,
 }
 
-
 logger = logging.getLogger(__name__)
 
 
@@ -116,14 +115,18 @@ class FINISH_LENGTH(BaseFinishReason):
 
 
 class FINISH_ABORT(BaseFinishReason):
-    def __init__(self, message="Unknown error"):
+    def __init__(self, message="Unknown error", status_code=None, err_type=None):
         super().__init__(is_error=True)
         self.message = message
+        self.status_code = status_code
+        self.err_type = err_type
 
     def to_json(self):
         return {
             "type": "abort",
             "message": self.message,
+            "status_code": self.status_code,
+            "err_type": self.err_type,
         }
 
 
@@ -148,6 +151,15 @@ class ImageInputs:
     image_grid_thws: List[Tuple[int, int, int]] = None
     mrope_position_delta: Optional[torch.Tensor] = None
 
+    # MiniCPMV related
+    # All the images in the batch should share the same special image
+    # bound token ids.
+    im_start_id: Optional[torch.Tensor] = None
+    im_end_id: Optional[torch.Tensor] = None
+    slice_start_id: Optional[torch.Tensor] = None
+    slice_end_id: Optional[torch.Tensor] = None
+    tgt_sizes: Optional[list] = None
+
     @staticmethod
     def from_dict(obj: dict):
         ret = ImageInputs(
@@ -167,6 +179,11 @@ class ImageInputs:
             "aspect_ratio_ids",
             "aspect_ratio_mask",
             "image_grid_thws",
+            "im_start_id",
+            "im_end_id",
+            "slice_start_id",
+            "slice_end_id",
+            "tgt_sizes",
         ]
         for arg in optional_args:
             if arg in obj:
@@ -215,6 +232,7 @@ class Req:
         lora_path: Optional[str] = None,
         input_embeds: Optional[List[List[float]]] = None,
         session_id: Optional[str] = None,
+        custom_logit_processor: Optional[str] = None,
         eos_token_ids: Optional[Set[int]] = None,
     ):
         # Input and output info
@@ -226,14 +244,16 @@ class Req:
             else origin_input_ids  # Before image padding
         )
         self.origin_input_ids = origin_input_ids
-        self.output_ids = []  # Each decode stage's output ids
-        self.fill_ids = None  # fill_ids = origin_input_ids + output_ids
+        # Each decode stage's output ids
+        self.output_ids = []
+        # fill_ids = origin_input_ids + output_ids. Updated if chunked.
         self.session_id = session_id
         self.input_embeds = input_embeds
 
         # Sampling info
         self.sampling_params = sampling_params
         self.lora_path = lora_path
+        self.custom_logit_processor = custom_logit_processor
 
         # Memory pool info
         self.req_pool_idx = None
@@ -265,6 +285,7 @@ class Req:
         # Prefix info
         self.prefix_indices = []
         # Tokens to run prefill. input_tokens - shared_prefix_tokens.
+        # Updated if chunked.
         self.extend_input_len = 0
         self.last_node = None
 
@@ -280,11 +301,10 @@ class Req:
         self.top_logprobs_num = top_logprobs_num
 
         # Logprobs (return value)
-        self.normalized_prompt_logprob = None
-        self.input_token_logprobs_val = None
-        self.input_token_logprobs_idx = None
-        self.input_top_logprobs_val = None
-        self.input_top_logprobs_idx = None
+        self.input_token_logprobs_val: Optional[List[float]] = None
+        self.input_token_logprobs_idx: Optional[List[int]] = None
+        self.input_top_logprobs_val: Optional[List[float]] = None
+        self.input_top_logprobs_idx: Optional[List[int]] = None
 
         if return_logprob:
             self.output_token_logprobs_val = []
@@ -344,9 +364,6 @@ class Req:
             max_prefix_len = min(max_prefix_len, input_len - 1)
 
         if self.return_logprob:
-            if self.normalized_prompt_logprob is None:
-                # Need at least two tokens to compute normalized logprob
-                max_prefix_len = min(max_prefix_len, input_len - 2)
             max_prefix_len = min(max_prefix_len, self.logprob_start_len)
 
         max_prefix_len = max(max_prefix_len, 0)
@@ -578,6 +595,9 @@ class ScheduleBatch:
     spec_algorithm: SpeculativeAlgorithm = None
     spec_info: Optional[SpecInfo] = None
 
+    # Enable custom logit processor
+    enable_custom_logit_processor: bool = False
+
     @classmethod
     def init_new(
         cls,
@@ -588,6 +608,7 @@ class ScheduleBatch:
         model_config: ModelConfig,
         enable_overlap: bool,
         spec_algorithm: SpeculativeAlgorithm,
+        enable_custom_logit_processor: bool,
     ):
         return cls(
             reqs=reqs,
@@ -601,6 +622,7 @@ class ScheduleBatch:
             has_grammar=any(req.grammar for req in reqs),
             device=req_to_token_pool.device,
             spec_algorithm=spec_algorithm,
+            enable_custom_logit_processor=enable_custom_logit_processor,
         )
 
     def batch_size(self):
@@ -656,7 +678,7 @@ class ScheduleBatch:
                     or len(req.prefix_indices) >= im.num_image_tokens
                 )
 
-        self.encoder_lens = torch.tensor(self.encoder_lens_cpu, dtype=torch.int32).to(
+        self.encoder_lens = torch.tensor(self.encoder_lens_cpu, dtype=torch.int64).to(
            self.device, non_blocking=True
        )
 
@@ -690,7 +712,7 @@ class ScheduleBatch:
         self.input_ids = torch.tensor(sum(input_ids, []), dtype=torch.int32).to(
             self.device, non_blocking=True
         )
-        self.seq_lens = torch.tensor(seq_lens, dtype=torch.int32).to(
+        self.seq_lens = torch.tensor(seq_lens, dtype=torch.int64).to(
             self.device, non_blocking=True
         )
 
@@ -766,10 +788,10 @@ class ScheduleBatch:
         self.input_ids = torch.tensor(sum(input_ids, []), dtype=torch.int32).to(
             self.device, non_blocking=True
         )
-        self.req_pool_indices = torch.tensor(req_pool_indices, dtype=torch.int32).to(
+        self.req_pool_indices = torch.tensor(req_pool_indices, dtype=torch.int64).to(
             self.device, non_blocking=True
         )
-        self.seq_lens = torch.tensor(seq_lens, dtype=torch.int32).to(
+        self.seq_lens = torch.tensor(seq_lens, dtype=torch.int64).to(
             self.device, non_blocking=True
         )
         self.input_embeds = (
@@ -1002,11 +1024,16 @@ class ScheduleBatch:
     def prepare_for_idle(self):
         self.forward_mode = ForwardMode.IDLE
         self.input_ids = torch.empty(0, dtype=torch.int32, device=self.device)
-        self.seq_lens = torch.empty(0, dtype=torch.int32, device=self.device)
+        self.seq_lens = torch.empty(0, dtype=torch.int64, device=self.device)
         self.out_cache_loc = torch.empty(0, dtype=torch.int32, device=self.device)
-        self.req_pool_indices = torch.empty(0, dtype=torch.int32, device=self.device)
+        self.req_pool_indices = torch.empty(0, dtype=torch.int64, device=self.device)
         self.seq_lens_sum = 0
         self.extend_num_tokens = 0
+        self.sampling_info = SamplingBatchInfo.from_schedule_batch(
+            self,
+            self.model_config.vocab_size,
+            enable_overlap_schedule=self.enable_overlap,
+        )
 
     def prepare_for_decode(self):
         self.forward_mode = ForwardMode.DECODE
@@ -1067,7 +1094,7 @@ class ScheduleBatch:
             self.encoder_lens_cpu = [self.encoder_lens_cpu[i] for i in keep_indices]
 
         self.reqs = [self.reqs[i] for i in keep_indices]
-        new_indices = torch.tensor(keep_indices, dtype=torch.int32).to(
+        new_indices = torch.tensor(keep_indices, dtype=torch.int64).to(
             self.device, non_blocking=True
         )
         self.req_pool_indices = self.req_pool_indices[new_indices]
@@ -1121,7 +1148,7 @@ class ScheduleBatch:
             self.spec_info.merge_batch(other.spec_info)
 
     def get_model_worker_batch(self):
-        if self.forward_mode.is_decode() or self.forward_mode.is_idle():
+        if self.forward_mode.is_decode_or_idle():
             extend_seq_lens = extend_prefix_lens = extend_logprob_start_lens = None
         else:
             extend_seq_lens = self.extend_lens
@@ -1136,7 +1163,6 @@ class ScheduleBatch:
 
         global bid
         bid += 1
-
         return ModelWorkerBatch(
             bid=bid,
             forward_mode=self.forward_mode,
@@ -1180,6 +1206,7 @@ class ScheduleBatch:
             return_logprob=self.return_logprob,
             decoding_reqs=self.decoding_reqs,
             spec_algorithm=self.spec_algorithm,
+            enable_custom_logit_processor=self.enable_custom_logit_processor,
         )
 
     def __str__(self):
24
24
 
25
25
  from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
26
26
  from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
27
+ from sglang.srt.mem_cache.memory_pool import BaseTokenToKVPool
27
28
  from sglang.srt.mem_cache.radix_cache import RadixCache, TreeNode
28
29
 
29
30
  # Clip the estimation of max_new_tokens for the request whose max_new_tokens is very large.
@@ -250,23 +251,24 @@ class PrefillAdder:
250
251
  def __init__(
251
252
  self,
252
253
  tree_cache: BasePrefixCache,
254
+ token_to_kv_pool: BaseTokenToKVPool,
253
255
  running_batch: ScheduleBatch,
254
256
  new_token_ratio: float,
255
- rem_total_tokens: int,
256
257
  rem_input_tokens: int,
257
258
  rem_chunk_tokens: Optional[int],
258
259
  mixed_with_decode_tokens: int = 0,
259
260
  ):
260
261
  self.tree_cache = tree_cache
262
+ self.token_to_kv_pool = token_to_kv_pool
261
263
  self.running_batch = running_batch
262
264
  self.new_token_ratio = new_token_ratio
263
- self.rem_total_tokens = rem_total_tokens - mixed_with_decode_tokens
264
265
  self.rem_input_tokens = rem_input_tokens - mixed_with_decode_tokens
265
266
  self.rem_chunk_tokens = rem_chunk_tokens
266
267
  if self.rem_chunk_tokens is not None:
267
268
  self.rem_chunk_tokens -= mixed_with_decode_tokens
268
269
 
269
- self.cur_rem_tokens = rem_total_tokens - mixed_with_decode_tokens
270
+ self.rem_total_token_offset = mixed_with_decode_tokens
271
+ self.cur_rem_token_offset = mixed_with_decode_tokens
270
272
 
271
273
  self.req_states = None
272
274
  self.can_run_list = []
@@ -275,8 +277,7 @@ class PrefillAdder:
275
277
  self.log_input_tokens = 0
276
278
 
277
279
  if running_batch is not None:
278
- # Pre-remove the tokens which will be occupied by the running requests
279
- self.rem_total_tokens -= sum(
280
+ self.rem_total_token_offset += sum(
280
281
  [
281
282
  min(
282
283
  (r.sampling_params.max_new_tokens - len(r.output_ids)),
@@ -287,6 +288,22 @@ class PrefillAdder:
287
288
  ]
288
289
  )
289
290
 
291
+ @property
292
+ def rem_total_tokens(self):
293
+ return (
294
+ self.token_to_kv_pool.available_size()
295
+ + self.tree_cache.evictable_size()
296
+ - self.rem_total_token_offset
297
+ )
298
+
299
+ @property
300
+ def cur_rem_tokens(self):
301
+ return (
302
+ self.token_to_kv_pool.available_size()
303
+ + self.tree_cache.evictable_size()
304
+ - self.cur_rem_token_offset
305
+ )
306
+
290
307
  def budget_state(self):
291
308
  if self.rem_total_tokens <= 0 or self.cur_rem_tokens <= 0:
292
309
  return AddReqResult.NO_TOKEN
@@ -301,8 +318,8 @@ class PrefillAdder:
301
318
  def _prefill_one_req(
302
319
  self, prefix_len: int, extend_input_len: int, max_new_tokens: int
303
320
  ):
304
- self.rem_total_tokens -= extend_input_len + max_new_tokens
305
- self.cur_rem_tokens -= extend_input_len
321
+ self.rem_total_token_offset += extend_input_len + max_new_tokens
322
+ self.cur_rem_token_offset += extend_input_len
306
323
  self.rem_input_tokens -= extend_input_len
307
324
  if self.rem_chunk_tokens is not None:
308
325
  self.rem_chunk_tokens -= extend_input_len
@@ -332,12 +349,10 @@ class PrefillAdder:
332
349
  @contextmanager
333
350
  def _lock_node(self, last_node: TreeNode):
334
351
  try:
335
- delta = self.tree_cache.inc_lock_ref(last_node)
336
- self.rem_total_tokens += delta
352
+ self.tree_cache.inc_lock_ref(last_node)
337
353
  yield None
338
354
  finally:
339
- delta = self.tree_cache.dec_lock_ref(last_node)
340
- self.rem_total_tokens += delta
355
+ self.tree_cache.dec_lock_ref(last_node)
341
356
 
342
357
  def add_one_req_ignore_eos(self, req: Req):
343
358
  def add_req_state(r, insert_sort=False):
@@ -433,7 +448,6 @@ class PrefillAdder:
433
448
  or input_tokens <= self.rem_chunk_tokens
434
449
  or (
435
450
  req.return_logprob
436
- and req.normalized_prompt_logprob is None
437
451
  and req.logprob_start_len != len(req.origin_input_ids) - 1
438
452
  )
439
453
  ):