sglang 0.4.6.post3__py3-none-any.whl → 0.4.6.post5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +10 -8
- sglang/bench_one_batch.py +7 -6
- sglang/bench_one_batch_server.py +157 -21
- sglang/bench_serving.py +137 -59
- sglang/compile_deep_gemm.py +5 -5
- sglang/eval/loogle_eval.py +157 -0
- sglang/lang/chat_template.py +78 -78
- sglang/lang/tracer.py +1 -1
- sglang/srt/code_completion_parser.py +1 -1
- sglang/srt/configs/deepseekvl2.py +2 -2
- sglang/srt/configs/model_config.py +40 -28
- sglang/srt/constrained/base_grammar_backend.py +55 -72
- sglang/srt/constrained/llguidance_backend.py +25 -21
- sglang/srt/constrained/outlines_backend.py +27 -26
- sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
- sglang/srt/constrained/xgrammar_backend.py +69 -43
- sglang/srt/conversation.py +49 -44
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +129 -135
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +3 -13
- sglang/srt/disaggregation/kv_events.py +357 -0
- sglang/srt/disaggregation/mini_lb.py +57 -24
- sglang/srt/disaggregation/mooncake/conn.py +238 -122
- sglang/srt/disaggregation/mooncake/transfer_engine.py +2 -1
- sglang/srt/disaggregation/nixl/conn.py +10 -19
- sglang/srt/disaggregation/prefill.py +132 -47
- sglang/srt/disaggregation/utils.py +123 -6
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +5 -0
- sglang/srt/entrypoints/engine.py +44 -9
- sglang/srt/entrypoints/http_server.py +23 -6
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +250 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +157 -0
- sglang/srt/function_call/ebnf_composer.py +234 -0
- sglang/srt/function_call/function_call_parser.py +175 -0
- sglang/srt/function_call/llama32_detector.py +74 -0
- sglang/srt/function_call/mistral_detector.py +84 -0
- sglang/srt/function_call/pythonic_detector.py +163 -0
- sglang/srt/function_call/qwen25_detector.py +67 -0
- sglang/srt/function_call/utils.py +35 -0
- sglang/srt/hf_transformers_utils.py +46 -7
- sglang/srt/layers/attention/aiter_backend.py +513 -0
- sglang/srt/layers/attention/flashattention_backend.py +64 -18
- sglang/srt/layers/attention/flashinfer_mla_backend.py +8 -4
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/triton_backend.py +3 -0
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
- sglang/srt/layers/attention/utils.py +6 -4
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +451 -0
- sglang/srt/layers/dp_attention.py +61 -21
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/logits_processor.py +46 -11
- sglang/srt/layers/moe/cutlass_moe.py +207 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +34 -12
- sglang/srt/layers/moe/ep_moe/layer.py +105 -51
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +82 -7
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -1
- sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -0
- sglang/srt/layers/moe/topk.py +67 -10
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +8 -3
- sglang/srt/layers/quantization/blockwise_int8.py +2 -2
- sglang/srt/layers/quantization/deep_gemm.py +77 -74
- sglang/srt/layers/quantization/fp8.py +92 -2
- sglang/srt/layers/quantization/fp8_kernel.py +3 -3
- sglang/srt/layers/quantization/fp8_utils.py +6 -0
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +20 -7
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/layers/sampler.py +0 -4
- sglang/srt/layers/vocab_parallel_embedding.py +18 -7
- sglang/srt/lora/lora_manager.py +2 -4
- sglang/srt/lora/mem_pool.py +4 -4
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/data_parallel_controller.py +3 -3
- sglang/srt/managers/deepseek_eplb.py +278 -0
- sglang/srt/managers/detokenizer_manager.py +21 -8
- sglang/srt/managers/eplb_manager.py +55 -0
- sglang/srt/managers/expert_distribution.py +704 -56
- sglang/srt/managers/expert_location.py +394 -0
- sglang/srt/managers/expert_location_dispatch.py +91 -0
- sglang/srt/managers/io_struct.py +19 -4
- sglang/srt/managers/mm_utils.py +294 -140
- sglang/srt/managers/multimodal_processors/base_processor.py +127 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +31 -6
- sglang/srt/managers/multimodal_processors/internvl.py +14 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +7 -6
- sglang/srt/managers/multimodal_processors/llava.py +46 -0
- sglang/srt/managers/multimodal_processors/minicpm.py +25 -31
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +58 -16
- sglang/srt/managers/schedule_batch.py +122 -42
- sglang/srt/managers/schedule_policy.py +1 -5
- sglang/srt/managers/scheduler.py +205 -138
- sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +232 -58
- sglang/srt/managers/tp_worker.py +12 -9
- sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +76 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +314 -39
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +29 -19
- sglang/srt/model_executor/expert_location_updater.py +422 -0
- sglang/srt/model_executor/forward_batch_info.py +5 -1
- sglang/srt/model_executor/model_runner.py +163 -68
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_janus_pro.py +2 -2
- sglang/srt/models/deepseek_v2.py +308 -351
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_mm.py +70 -33
- sglang/srt/models/llama.py +2 -0
- sglang/srt/models/llama4.py +15 -8
- sglang/srt/models/llava.py +258 -7
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +5 -12
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mixtral.py +98 -34
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/pixtral.py +467 -0
- sglang/srt/models/qwen2.py +95 -26
- sglang/srt/models/qwen2_5_vl.py +8 -0
- sglang/srt/models/qwen2_moe.py +330 -60
- sglang/srt/models/qwen2_vl.py +6 -0
- sglang/srt/models/qwen3.py +52 -10
- sglang/srt/models/qwen3_moe.py +411 -48
- sglang/srt/models/roberta.py +1 -1
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/openai_api/adapter.py +58 -20
- sglang/srt/openai_api/protocol.py +6 -8
- sglang/srt/operations.py +154 -0
- sglang/srt/operations_strategy.py +31 -0
- sglang/srt/reasoning_parser.py +3 -3
- sglang/srt/sampling/custom_logit_processor.py +18 -3
- sglang/srt/sampling/sampling_batch_info.py +4 -56
- sglang/srt/sampling/sampling_params.py +2 -2
- sglang/srt/server_args.py +162 -22
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
- sglang/srt/speculative/eagle_utils.py +138 -7
- sglang/srt/speculative/eagle_worker.py +69 -21
- sglang/srt/utils.py +74 -17
- sglang/test/few_shot_gsm8k.py +2 -2
- sglang/test/few_shot_gsm8k_engine.py +2 -2
- sglang/test/run_eval.py +2 -2
- sglang/test/runners.py +8 -1
- sglang/test/send_one.py +13 -3
- sglang/test/simple_eval_common.py +1 -1
- sglang/test/simple_eval_humaneval.py +1 -1
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_programs.py +5 -5
- sglang/test/test_utils.py +55 -14
- sglang/utils.py +3 -3
- sglang/version.py +1 -1
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/METADATA +23 -13
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/RECORD +178 -149
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/top_level.txt +0 -0
sglang/srt/managers/multimodal_processors/pixtral.py (new file):

```diff
@@ -0,0 +1,127 @@
+import asyncio
+import math
+from typing import List, Union
+
+from transformers.models.pixtral.image_processing_pixtral import (
+    _num_image_tokens as _get_pixtral_hf_num_image_tokens,
+)
+
+from sglang.srt.managers.multimodal_processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.pixtral import PixtralVisionModel
+
+
+class PixtralProcessor(BaseMultimodalProcessor):
+    models = [PixtralVisionModel]
+
+    PAD_TOKEN = "<pad>"
+    IMG_BREAK_TOKEN_ID = 12
+    IMG_END_TOKEN_ID = 13
+
+    def get_patch_grid_size(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> tuple[int, int]:
+        max_width = max_height = self.image_size
+        patch_width = patch_height = self.patch_size
+
+        ratio = max(image_width / max_width, image_height / max_height)
+
+        if ratio > 1:
+            image_width = int(math.floor(image_width / ratio))
+            image_height = int(math.floor(image_height / ratio))
+
+        nrows, ncols = _get_pixtral_hf_num_image_tokens(
+            (image_height, image_width),
+            (patch_height, patch_width),
+        )
+
+        return ncols, nrows
+
+    def __init__(self, hf_config, server_args, _processor):
+        super().__init__(hf_config, server_args, _processor)
+        self.image_token_id = getattr(
+            hf_config, "image_token_index", PixtralVisionModel.DEFAULT_IMAGE_TOKEN_ID
+        )
+        # Instantiate the patcher logic helper using the class defined above
+
+        self.vision_config = hf_config.vision_config
+        self.image_size = self.vision_config.image_size
+        self.patch_size = self.vision_config.patch_size
+        self.multimodal_tokens = MultimodalSpecialTokens(
+            image_token=_processor.image_token
+        )
+        _processor.tokenizer.add_special_tokens(
+            {
+                "pad_token": getattr(hf_config, "pad_token", self.PAD_TOKEN),
+            }
+        )
+
+    async def _resize(self, image):
+        num_w_tokens, num_h_tokens = self.get_patch_grid_size(
+            image_width=image.size[0],
+            image_height=image.size[1],
+        )
+        new_size = (num_w_tokens * self.patch_size, num_h_tokens * self.patch_size)
+        return image.resize(new_size)
+
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes]],
+        input_text,
+        request_obj,
+        *args,
+        **kwargs,
+    ):
+        if not image_data:
+            return None
+
+        if isinstance(image_data, str):
+            image_data = [image_data]
+
+        mm_data = self.load_mm_data(
+            prompt=input_text,
+            multimodal_tokens=self.multimodal_tokens,
+            max_req_input_len=kwargs.get("max_req_input_len", 4096),
+            image_data=image_data,
+            return_text=True,
+        )
+
+        if mm_data.images:
+            resize_tasks = [self._resize(image) for image in mm_data.images]
+            mm_data.images = await asyncio.gather(*resize_tasks)
+
+        processor_output = self.process_mm_data(
+            input_text=mm_data.input_text,
+            images=mm_data.images,
+        )
+
+        if "pixel_values" in processor_output:
+            input_ids = processor_output["input_ids"].view(-1)
+            image_offsets = self.get_mm_items_offset(
+                input_ids=input_ids,
+                mm_token_id=self.image_token_id,
+            )
+            mm_items = [
+                MultimodalDataItem(
+                    pixel_values=processor_output["pixel_values"],
+                    image_sizes=processor_output["image_sizes"],
+                    modality=Modality.IMAGE,
+                    image_offsets=image_offsets,
+                )
+            ]
+
+            input_ids = input_ids.tolist()
+            processor_output.update(
+                input_ids=input_ids,
+                mm_items=mm_items,
+                # there's no im_start_id for pixtral, only im_token and im_end_token
+                im_end_id=self.IMG_END_TOKEN_ID,
+                im_token_id=self.image_token_id,
+            )
+        return processor_output
```
sglang/srt/managers/multimodal_processors/qwen_vl.py:

```diff
@@ -1,6 +1,7 @@
 import asyncio
 import math
-from typing import List, Union
+import re
+from typing import Dict, List, Union
 
 import torch
 from PIL import Image
@@ -23,7 +24,12 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
 
     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
+        # The single, pre-expanded image token.
         self.IMAGE_TOKEN = "<|vision_start|><|image_pad|><|vision_end|>"
+        # The regex that matches expanded image tokens.
+        self.IMAGE_TOKEN_REGEX = re.compile(
+            r"<\|vision_start\|>(?:<\|image_pad\|>)+<\|vision_end\|>"
+        )
         self.IM_START_TOKEN_ID = hf_config.vision_start_token_id
         self.IM_END_TOKEN_ID = hf_config.vision_end_token_id
         self.image_token_id = hf_config.image_token_id
@@ -38,7 +44,7 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
 
     async def process_mm_data_async(
         self,
-        image_data: List[Union[str, bytes]],
+        image_data: List[Union[str, bytes, Dict]],
         input_text,
         request_obj,
         max_req_input_len,
@@ -48,11 +54,13 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         if isinstance(image_data, str):
             image_data = [image_data]
 
-        image_token = self.IMAGE_TOKEN
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
-            multimodal_tokens=MultimodalSpecialTokens(image_token=image_token),
+            multimodal_tokens=MultimodalSpecialTokens(
+                image_token=self.IMAGE_TOKEN,
+                image_token_regex=self.IMAGE_TOKEN_REGEX,
+            ),
             max_req_input_len=max_req_input_len,
         )
 
@@ -117,26 +125,60 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         async def resize_image_async(image):
             return resize_image(image)
 
-        if base_output.images:
+        images_are_preprocessed = self.mm_inputs_are_preprocessed(base_output.images)
+        if base_output.images and not images_are_preprocessed:
             resize_tasks = [resize_image_async(image) for image in base_output.images]
             base_output.images = await asyncio.gather(*resize_tasks)
 
         ret = self.process_mm_data(
             input_text=base_output.input_text,
-            images=base_output.images,
+            images=None if images_are_preprocessed else base_output.images,
         )
-
+        input_ids = ret["input_ids"].flatten().tolist()
+        image_offsets = self.get_mm_items_offset(
+            input_ids=ret["input_ids"].flatten(), mm_token_id=self.image_token_id
+        )
+        image_grid_thw = None
+        video_grid_thw = None  # TODO
         items = []
 
-
-
+        if base_output.images:
+            if images_are_preprocessed:
+                image_grid_thw = torch.concat(
+                    [
+                        torch.as_tensor(item.image_grid_thws)
+                        for item in base_output.images
+                    ]
+                )
+                all_pixel_values = [
+                    item.pixel_values
+                    for item in base_output.images
+                    if item.pixel_values is not None
+                ]
+                all_precomputed_features = [
+                    item.precomputed_features
+                    for item in base_output.images
+                    if item.precomputed_features is not None
+                ]
+                pixel_values = (
+                    torch.concat(all_pixel_values) if all_pixel_values else None
+                )
+                precomputed_features = (
+                    torch.concat(all_precomputed_features)
+                    if all_precomputed_features
+                    else None
+                )
+            else:
+                image_grid_thw = ret["image_grid_thw"]
+                pixel_values = ret["pixel_values"]
+                precomputed_features = None
         items += [
             MultimodalDataItem(
-                pixel_values=
-                image_grid_thws=
-
-
-
+                pixel_values=pixel_values,
+                image_grid_thws=image_grid_thw,
+                video_grid_thws=video_grid_thw,
+                precomputed_features=precomputed_features,
+                image_offsets=image_offsets,
                 modality=Modality.IMAGE,
             )
         ]
@@ -151,8 +193,8 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
                 self.hf_config.vision_config, "tokens_per_second", None
             ),
             input_ids=torch.tensor(input_ids).unsqueeze(0),
-            image_grid_thw=
-            video_grid_thw=
+            image_grid_thw=image_grid_thw,
+            video_grid_thw=video_grid_thw,
             second_per_grid_ts=ret.get("second_per_grid_ts", None),
         )
         mrope_positions = mrope_positions.squeeze(1)
```
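The Qwen2.5-VL change teaches the processor to find image placeholders by regex rather than exact string match, so prompts whose `<|image_pad|>` run was already expanded by an upstream processor still split at the right places. A self-contained sketch of the matching behavior, using the same pattern the diff adds:

```python
import re

IMAGE_TOKEN_REGEX = re.compile(
    r"<\|vision_start\|>(?:<\|image_pad\|>)+<\|vision_end\|>"
)

# Both the un-expanded placeholder and a pre-expanded run of pad tokens
# match, since (?:<\|image_pad\|>)+ accepts one or more pads.
prompts = [
    "Describe <|vision_start|><|image_pad|><|vision_end|> please",
    "Describe <|vision_start|><|image_pad|><|image_pad|><|image_pad|><|vision_end|> please",
]
for p in prompts:
    print([m.span() for m in IMAGE_TOKEN_REGEX.finditer(p)])
```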
sglang/srt/managers/schedule_batch.py:

```diff
@@ -1,8 +1,5 @@
 from __future__ import annotations
 
-import hashlib
-from enum import Enum, auto
-
 # Copyright 2023-2024 SGLang Team
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -30,12 +27,16 @@ ScheduleBatch -> ModelWorkerBatch -> ForwardBatch
 It will be transformed from CPU scheduler to GPU model runner.
 - ForwardBatch is managed by `model_runner.py::ModelRunner`.
 It contains low-level tensor data. Most of the data consists of GPU tensors.
+
+TODO(lmzheng): ModelWorkerBatch seems a bit redundant and we consider removing it in the future.
 """
 
 import copy
 import dataclasses
+import hashlib
 import logging
 import threading
+from enum import Enum, auto
 from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union
 
 import numpy as np
@@ -47,10 +48,14 @@ from sglang.global_config import global_config
 from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.constrained.base_grammar_backend import BaseGrammarObject
 from sglang.srt.disaggregation.base import BaseKVSender
-from sglang.srt.disaggregation.
+from sglang.srt.disaggregation.decode_schedule_batch_mixin import (
+    ScheduleBatchDisaggregationDecodeMixin,
+)
+from sglang.srt.layers.multimodal import gpu_tensor_hash
 from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
 from sglang.srt.mem_cache.chunk_cache import ChunkCache
 from sglang.srt.mem_cache.memory_pool import ReqToTokenPool, TokenToKVPoolAllocator
+from sglang.srt.metrics.collector import TimeStats
 from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode, ForwardMode
 from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
 from sglang.srt.sampling.sampling_params import SamplingParams
```
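schedule_batch.py now imports the decode-side disaggregation methods from the new `decode_schedule_batch_mixin` module (+142 lines in the file list above) and mixes them into `ScheduleBatch`. A minimal sketch of the pattern, with hypothetical class and method names standing in for the real mixin's API:

```python
class DecodeMixin:
    # Decode-specific behavior lives in its own module and is grafted onto
    # the host class via inheritance, keeping the host module smaller and
    # avoiding an import cycle with the decode scheduler.
    def prepare_for_decode_transfer(self):
        return f"prepared {len(self.reqs)} request(s)"


class Batch(DecodeMixin):
    def __init__(self, reqs):
        self.reqs = reqs


print(Batch(["r0", "r1"]).prepare_for_decode_transfer())  # prepared 2 request(s)
```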
```diff
@@ -73,17 +78,21 @@ global_server_args_dict = {
     "disable_radix_cache": ServerArgs.disable_radix_cache,
     "enable_deepep_moe": ServerArgs.enable_deepep_moe,
     "enable_dp_attention": ServerArgs.enable_dp_attention,
+    "enable_dp_lm_head": ServerArgs.enable_dp_lm_head,
     "enable_ep_moe": ServerArgs.enable_ep_moe,
+    "deepep_config": ServerArgs.deepep_config,
     "enable_nan_detection": ServerArgs.enable_nan_detection,
     "flashinfer_mla_disable_ragged": ServerArgs.flashinfer_mla_disable_ragged,
     "max_micro_batch_size": ServerArgs.max_micro_batch_size,
     "moe_dense_tp_size": ServerArgs.moe_dense_tp_size,
+    "ep_dispatch_algorithm": ServerArgs.ep_dispatch_algorithm,
     "n_share_experts_fusion": ServerArgs.n_share_experts_fusion,
     "sampling_backend": ServerArgs.sampling_backend,
     "speculative_accept_threshold_acc": ServerArgs.speculative_accept_threshold_acc,
     "speculative_accept_threshold_single": ServerArgs.speculative_accept_threshold_single,
     "torchao_config": ServerArgs.torchao_config,
     "triton_attention_reduce_in_fp32": ServerArgs.triton_attention_reduce_in_fp32,
+    "ep_num_redundant_experts": ServerArgs.ep_num_redundant_experts,
 }
 
 logger = logging.getLogger(__name__)
```
```diff
@@ -134,9 +143,9 @@ class FINISH_LENGTH(BaseFinishReason):
 
 
 class FINISH_ABORT(BaseFinishReason):
-    def __init__(self, message=
+    def __init__(self, message=None, status_code=None, err_type=None):
         super().__init__(is_error=True)
-        self.message = message
+        self.message = message or "Aborted"
         self.status_code = status_code
         self.err_type = err_type
 
@@ -174,10 +183,10 @@ class MultimodalDataItem:
     image_offsets: Optional[list] = None
 
     # the real data, pixel_values or audio_features
-    # data: Union[List[torch.Tensor], List[np.
-    pixel_values: Union[torch.Tensor, np.
-    image_grid_thws: Union[torch.Tensor, np.
-    video_grid_thws: Union[torch.Tensor, np.
+    # data: Union[List[torch.Tensor], List[np.ndarray]]
+    pixel_values: Union[torch.Tensor, np.ndarray] = None
+    image_grid_thws: Union[torch.Tensor, np.ndarray] = None
+    video_grid_thws: Union[torch.Tensor, np.ndarray] = None
 
     image_emb_mask: Optional[torch.Tensor] = None
     image_spatial_crop: Optional[torch.Tensor] = None
@@ -186,8 +195,11 @@ class MultimodalDataItem:
     # [num_images, (n, w, h)]
     tgt_size: Tuple[int, int] = None
 
-    audio_features: Union[torch.Tensor, np.
+    audio_features: Union[torch.Tensor, np.ndarray] = None
     audio_feature_lens: Optional[List[torch.Tensor]] = None
+    audio_offsets: Optional[List[Tuple[int, int]]] = None
+
+    precomputed_features: Optional[Union[torch.Tensor, np.ndarray]] = None
 
     @staticmethod
     def is_empty_list(l):
@@ -216,7 +228,8 @@ class MultimodalDataItem:
                 for x in tensor_list
             ]
             tensor = torch.concat(tensor_list)
-
+            if tensor.is_cuda:
+                return gpu_tensor_hash(tensor)
             tensor = tensor.detach().contiguous()
 
             if tensor.dtype == torch.bfloat16:
@@ -246,7 +259,9 @@ class MultimodalDataItem:
                 return tensor_hash([f])
             return data_hash(f)
 
-        if self.
+        if self.precomputed_features is not None:
+            self.hash = hash_feature(self.precomputed_features)
+        elif self.is_audio():
             self.hash = hash_feature(self.audio_features)
         else:
             self.hash = hash_feature(self.pixel_values)
@@ -255,19 +270,24 @@ class MultimodalDataItem:
         self.pad_value = self.hash % (1 << 30)
 
     def is_audio(self):
-        return (
-            self.
-
+        return (self.modality == Modality.AUDIO) and (
+            self.precomputed_features is not None
+            or not MultimodalDataItem.is_empty_list(self.audio_features)
+        )
 
     def is_image(self):
         return (
             self.modality == Modality.IMAGE or self.modality == Modality.MULTI_IMAGES
-        ) and
+        ) and (
+            self.precomputed_features is not None
+            or not MultimodalDataItem.is_empty_list(self.pixel_values)
+        )
 
     def is_video(self):
-        return (
-            self.
-
+        return (self.modality == Modality.VIDEO) and (
+            self.precomputed_features is not None
+            or not MultimodalDataItem.is_empty_list(self.pixel_values)
+        )
 
     def is_valid(self) -> bool:
        return self.is_image() or self.is_video() or self.is_audio()
```
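Two things changed in the content-hash path: `precomputed_features` now wins over raw pixels/audio, and CUDA tensors short-circuit to `gpu_tensor_hash` (from the new sglang/srt/layers/multimodal.py) instead of being copied to the host first. A sketch of the dispatch order under those assumptions; the hash bodies below are stand-ins, not sglang's kernels:

```python
import hashlib
import torch

def cpu_tensor_hash(t: torch.Tensor) -> int:
    t = t.detach().contiguous()
    if t.dtype == torch.bfloat16:
        t = t.float()  # bfloat16 has no NumPy view to take bytes from
    digest = hashlib.sha256(t.cpu().numpy().tobytes()).digest()
    return int.from_bytes(digest[:8], "big")

def item_hash(precomputed_features, audio_features, pixel_values, modality) -> int:
    # Dispatch mirrors the diff: precomputed features win; otherwise audio
    # items hash their audio features and everything else hashes pixels.
    if precomputed_features is not None:
        feature = precomputed_features
    elif modality == "AUDIO":
        feature = audio_features
    else:
        feature = pixel_values
    if feature.is_cuda:
        # sglang returns gpu_tensor_hash(feature) here to avoid a
        # device-to-host copy; this sketch just falls through to CPU.
        pass
    return cpu_tensor_hash(feature)

h = item_hash(None, None, torch.randn(3, 16, 16), "IMAGE")
print(h % (1 << 30))  # the same modulus the diff uses to derive pad_value
```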
```diff
@@ -276,6 +296,16 @@ class MultimodalDataItem:
         ...
         # TODO
 
+    @staticmethod
+    def from_dict(obj: dict):
+        kwargs = dict(obj)
+        modality = kwargs.pop("modality")
+        if isinstance(modality, str):
+            modality = Modality[modality]
+        ret = MultimodalDataItem(modality=modality, **kwargs)
+        ret.validate()
+        return ret
+
 
 @dataclasses.dataclass
 class MultimodalInputs:
```
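`from_dict` gives preprocessed multimodal items a plain-dict wire format, with the `Modality` enum round-tripping through its string name. A small usage sketch with toy tensor shapes, assuming an sglang install:

```python
import torch

from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem

# Rebuild an item from a plain dict; "modality" may arrive as the enum's
# string name and is mapped back through Modality[...].
item = MultimodalDataItem.from_dict(
    {
        "modality": "IMAGE",
        "pixel_values": torch.randn(1, 3, 16, 16),
        "image_grid_thws": torch.tensor([[1, 4, 4]]),
    }
)
assert item.modality is Modality.IMAGE
assert item.is_image()
```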
```diff
@@ -301,8 +331,9 @@ class MultimodalInputs:
     video_token_id: Optional[int] = None
 
     # audio
-    audio_start_id: Optional[int] = None
-    audio_end_id: Optional[int] = None
+    audio_token_id: Optional[int] = None
+    audio_start_id: Optional[int] = None
+    audio_end_id: Optional[int] = None
 
     @staticmethod
     def from_dict(obj: dict):
@@ -326,6 +357,7 @@ class MultimodalInputs:
             "slice_end_id",
             "audio_start_id",
             "audio_end_id",
+            "audio_token_id",
         ]
         for arg in optional_args:
             if arg in obj:
@@ -434,6 +466,7 @@ class Req:
         self.sampling_params = sampling_params
         self.custom_logit_processor = custom_logit_processor
         self.return_hidden_states = return_hidden_states
+        self.lora_path = lora_path
 
         # Memory pool info
         self.req_pool_idx: Optional[int] = None
@@ -441,11 +474,13 @@ class Req:
         # Check finish
         self.tokenizer = None
         self.finished_reason = None
+        # Whether this request has finished output
+        self.finished_output = None
         # If we want to abort the request in the middle of the event loop, set this to true
         # Note: We should never set finished_reason in the middle, the req will get filtered and never respond
         self.to_abort = False
         # This carries the error message for `.to_abort` and will be attached to the finished_reason at the end of the event loop
-        self.to_abort_message: str =
+        self.to_abort_message: str = None
         self.stream = stream
         self.eos_token_ids = eos_token_ids
 
@@ -483,6 +518,13 @@ class Req:
         # For retraction
         self.is_retracted = False
 
+        # Incremental streamining
+        self.send_token_offset: int = 0
+        self.send_decode_id_offset: int = 0
+        # TODO (Byron): send_output_token_logprobs_offset and send_decode_id_offset can be different in disaggregation mode
+        # because the decode server does not have the first output token logprobs
+        self.send_output_token_logprobs_offset: int = 0
+
         # Logprobs (arguments)
         self.return_logprob = return_logprob
         # Start index to compute logprob from.
@@ -492,11 +534,9 @@ class Req:
         self.temp_scaled_logprobs = False
         self.top_p_normalized_logprobs = False
 
-        # Latency Breakdown
-        self.queue_time_start = None
-        self.queue_time_end = None
-
         # Logprobs (return values)
+        # True means the input logprob has been already sent to detokenizer.
+        self.input_logprob_sent: bool = False
         self.input_token_logprobs_val: Optional[List[float]] = None
         self.input_token_logprobs_idx: Optional[List[int]] = None
         self.input_top_logprobs_val: Optional[List[float]] = None
@@ -511,8 +551,10 @@ class Req:
         self.temp_input_token_ids_logprobs_idx: Optional[List[int]] = None
 
         if return_logprob:
+            # shape: (bs, 1)
             self.output_token_logprobs_val = []
             self.output_token_logprobs_idx = []
+            # shape: (bs, k)
             self.output_top_logprobs_val = []
             self.output_top_logprobs_idx = []
             self.output_token_ids_logprobs_val = []
@@ -530,6 +572,7 @@ class Req:
 
         # Constrained decoding
         self.grammar: Optional[BaseGrammarObject] = None
+        self.grammar_wait_ct = 0
 
         # The number of cached tokens that were already cached in the KV cache
         self.cached_tokens = 0
@@ -538,7 +581,12 @@ class Req:
         # The number of verification forward passes in the speculative decoding.
         # This is used to compute the average acceptance length per request.
         self.spec_verify_ct = 0
-
+
+        # For metrics
+        self.time_stats: TimeStats = TimeStats()
+        self.has_log_time_stats: bool = False
+        self.queue_time_start = None
+        self.queue_time_end = None
 
         # For disaggregation
         self.bootstrap_host: str = bootstrap_host
@@ -546,8 +594,6 @@ class Req:
         self.bootstrap_room: Optional[int] = bootstrap_room
         self.disagg_kv_sender: Optional[BaseKVSender] = None
 
-        # used for warmup because we don't have a pair yet when init
-        self.skip_kv_transfer: bool = False
         # the start index of the sent kv cache
         # We want to send it chunk by chunk for chunked prefill.
         # After every chunk forward, we do the following:
@@ -555,14 +601,11 @@ class Req:
         # start_send_idx = len(req.fill_ids)
         self.start_send_idx: int = 0
 
-        self.metadata_buffer_index: int = -1
-        # The first output_id transferred from prefill instance.
-        self.transferred_output_id: Optional[int] = None
-
         # For overlap schedule, we delay the kv transfer until `process_batch_result_disagg_prefill` rather than `process_prefill_chunk` in non-overlap
         # This is because kv is not ready in `process_prefill_chunk`.
         # We use `tmp_end_idx` to store the end index of the kv cache to send.
         self.tmp_end_idx: int = -1
+        self.metadata_buffer_index: int = -1
 
     @property
     def seqlen(self):
@@ -653,6 +696,11 @@ class Req:
             )
             return
 
+        if self.grammar is not None:
+            if self.grammar.is_terminated():
+                self.finished_reason = FINISH_MATCHED_TOKEN(matched=self.output_ids[-1])
+                return
+
         last_token_id = self.output_ids[-1]
 
         if not self.sampling_params.ignore_eos:
```
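The new `send_token_offset` / `send_decode_id_offset` / `send_output_token_logprobs_offset` fields make streaming incremental: each flush ships only the tokens past the offset, then advances it. A standalone sketch of that bookkeeping (not the actual scheduler loop):

```python
class StreamState:
    """Track what has already been sent so each flush emits only new tokens."""

    def __init__(self):
        self.output_ids = []
        self.send_token_offset = 0

    def flush(self):
        delta = self.output_ids[self.send_token_offset:]
        self.send_token_offset = len(self.output_ids)
        return delta


s = StreamState()
s.output_ids += [11, 12, 13]
print(s.flush())  # [11, 12, 13]
s.output_ids += [14]
print(s.flush())  # [14] -- only the unsent tail
```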
```diff
@@ -697,13 +745,41 @@ class Req:
         self.req_pool_idx = None
         self.already_computed = 0
 
+    def offload_kv_cache(self, req_to_token_pool, token_to_kv_pool_allocator):
+        token_indices = req_to_token_pool.req_to_token[
+            self.req_pool_idx, : self.seqlen - 1
+        ]
+        self.kv_cache_cpu = token_to_kv_pool_allocator.get_cpu_copy(token_indices)
+
+    def load_kv_cache(self, req_to_token_pool, token_to_kv_pool_allocator):
+        token_indices = req_to_token_pool.req_to_token[
+            self.req_pool_idx, : self.seqlen - 1
+        ]
+        token_to_kv_pool_allocator.load_cpu_copy(self.kv_cache_cpu, token_indices)
+        del self.kv_cache_cpu
+
+    def log_time_stats(self):
+        # If overlap schedule, we schedule one decode batch ahead so this gets called twice.
+        if self.has_log_time_stats is True:
+            return
+
+        if self.bootstrap_room is not None:
+            prefix = f"Req Time Stats(rid={self.rid}, bootstrap_room={self.bootstrap_room}, input len={len(self.origin_input_ids)}, output len={len(self.output_ids)}, type={self.time_stats.get_type().value})"
+        else:
+            prefix = f"Req Time Stats(rid={self.rid}, input len={len(self.origin_input_ids)}, output len={len(self.output_ids)}, type={self.time_stats.get_type().value})"
+        logger.info(f"{prefix}: {self.time_stats}")
+        self.has_log_time_stats = True
+
     def __repr__(self):
         return (
             f"Req(rid={self.rid}, "
-            f"input_ids={self.origin_input_ids}, output_ids={self.output_ids}
+            f"input_ids={self.origin_input_ids}, output_ids={self.output_ids}, "
+            f"{self.grammar=}, "
+            f"{self.sampling_params=})"
         )
 
 
+# Batch id
 bid = 0
```
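`offload_kv_cache` / `load_kv_cache` round-trip a request's KV entries through host memory using the allocator's `get_cpu_copy` / `load_cpu_copy`. A toy allocator showing the same round trip; the class below is a stand-in, not sglang's TokenToKVPoolAllocator:

```python
import torch

class ToyAllocator:
    def __init__(self, num_tokens, dim):
        self.kv = torch.zeros(num_tokens, dim)

    def get_cpu_copy(self, indices):
        return self.kv[indices].clone()  # snapshot the request's KV entries

    def load_cpu_copy(self, cpu_copy, indices):
        self.kv[indices] = cpu_copy      # restore them into the pool


alloc = ToyAllocator(num_tokens=8, dim=4)
idx = torch.tensor([0, 1, 2])
alloc.kv[idx] = torch.randn(3, 4)
snapshot = alloc.get_cpu_copy(idx)       # offload
alloc.kv[idx] = 0                        # pool slots reused elsewhere
alloc.load_cpu_copy(snapshot, idx)       # reload before resuming decode
assert torch.equal(alloc.kv[idx], snapshot)
```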
```diff
@@ -862,7 +938,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         error_msg = (
             f"{phase_str} out of memory. Try to lower your batch size.\n"
             f"Try to allocate {num_tokens} tokens.\n"
-            f"
+            f"Available tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
         )
         logger.error(error_msg)
         if self.tree_cache is not None:
@@ -903,7 +979,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         error_msg = (
             f"Prefill out of memory. Try to lower your batch size.\n"
             f"Try to allocate {extend_num_tokens} tokens.\n"
-            f"
+            f"Available tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
             f"{self.token_to_kv_pool_allocator.available_size()=}\n"
             f"{self.tree_cache.evictable_size()=}\n"
         )
@@ -938,7 +1014,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         error_msg = (
             f"Decode out of memory. Try to lower your batch size.\n"
             f"Try to allocate {len(seq_lens)} tokens.\n"
-            f"
+            f"Available tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
             f"{self.token_to_kv_pool_allocator.available_size()=}\n"
             f"{self.tree_cache.evictable_size()=}\n"
         )
@@ -1019,7 +1095,9 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         else:
             self.encoder_out_cache_loc = torch.cat(encoder_out_cache_loc)
 
-        assert
+        assert (
+            len(self.out_cache_loc) == self.extend_num_tokens
+        ), f"Expected {len(self.out_cache_loc)}, got {self.extend_num_tokens}"
 
     def prepare_for_extend(self):
         self.forward_mode = ForwardMode.EXTEND
```
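All three out-of-memory messages now lead with one headline figure: free slots in the KV pool plus whatever the radix cache could still evict. A sketch of that accounting with stub objects (names illustrative):

```python
class Allocator:
    def available_size(self):
        return 1024  # free KV pool slots

class TreeCache:
    def evictable_size(self):
        return 4096  # cached tokens that could be evicted on demand

# What the new OOM messages report as "Available tokens".
available_tokens = Allocator().available_size() + TreeCache().evictable_size()
print(available_tokens)  # 5120
```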
```diff
@@ -1447,7 +1525,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
             i
             for i in range(len(self.reqs))
             if not self.reqs[i].finished()
-            and
+            and self.reqs[i] not in chunked_req_to_exclude
         ]
 
         if keep_indices is None or len(keep_indices) == 0:
@@ -1468,7 +1546,8 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         self.encoder_lens_cpu = [self.encoder_lens_cpu[i] for i in keep_indices]
 
         self.reqs = [self.reqs[i] for i in keep_indices]
-
+        if self.multimodal_inputs is not None:
+            self.multimodal_inputs = [self.multimodal_inputs[i] for i in keep_indices]
         self.req_pool_indices = self.req_pool_indices[keep_indices_device]
         self.seq_lens = self.seq_lens[keep_indices_device]
         self.out_cache_loc = None
@@ -1517,7 +1596,8 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         self.top_logprobs_nums = [0] * len(self.reqs) + other.top_logprobs_nums
         self.token_ids_logprobs = [None] * len(self.reqs) + other.token_ids_logprobs
         self.reqs.extend(other.reqs)
-        self.multimodal_inputs
+        if self.multimodal_inputs is not None:
+            self.multimodal_inputs.extend(other.multimodal_inputs)
 
         self.return_logprob |= other.return_logprob
         self.has_stream |= other.has_stream
```
sglang/srt/managers/schedule_policy.py:

```diff
@@ -22,11 +22,7 @@ from typing import Dict, List, Optional, Set, Union
 
 import torch
 
-from sglang.srt.managers.schedule_batch import (
-    Req,
-    ScheduleBatch,
-    global_server_args_dict,
-)
+from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
 from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
 from sglang.srt.mem_cache.memory_pool import TokenToKVPoolAllocator
 from sglang.srt.mem_cache.radix_cache import RadixCache, TreeNode
```
|