sglang 0.4.6.post2__py3-none-any.whl → 0.4.6.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. sglang/bench_one_batch.py +1 -11
  2. sglang/bench_serving.py +149 -1
  3. sglang/lang/chat_template.py +44 -0
  4. sglang/srt/configs/deepseekvl2.py +3 -0
  5. sglang/srt/configs/device_config.py +1 -1
  6. sglang/srt/configs/internvl.py +696 -0
  7. sglang/srt/configs/janus_pro.py +3 -0
  8. sglang/srt/configs/model_config.py +17 -0
  9. sglang/srt/constrained/xgrammar_backend.py +11 -19
  10. sglang/srt/conversation.py +30 -3
  11. sglang/srt/disaggregation/decode.py +4 -1
  12. sglang/srt/disaggregation/mini_lb.py +74 -23
  13. sglang/srt/disaggregation/mooncake/conn.py +9 -18
  14. sglang/srt/disaggregation/nixl/conn.py +241 -71
  15. sglang/srt/disaggregation/utils.py +44 -1
  16. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -8
  17. sglang/srt/distributed/device_communicators/npu_communicator.py +39 -0
  18. sglang/srt/distributed/device_communicators/pynccl.py +2 -1
  19. sglang/srt/distributed/device_communicators/shm_broadcast.py +2 -1
  20. sglang/srt/distributed/parallel_state.py +22 -1
  21. sglang/srt/entrypoints/engine.py +14 -2
  22. sglang/srt/entrypoints/http_server.py +28 -1
  23. sglang/srt/entrypoints/verl_engine.py +3 -2
  24. sglang/srt/hf_transformers_utils.py +20 -1
  25. sglang/srt/layers/attention/flashattention_backend.py +146 -50
  26. sglang/srt/layers/attention/flashinfer_backend.py +23 -13
  27. sglang/srt/layers/attention/flashinfer_mla_backend.py +62 -15
  28. sglang/srt/layers/attention/merge_state.py +46 -0
  29. sglang/srt/layers/attention/triton_ops/merge_state.py +96 -0
  30. sglang/srt/layers/attention/vision.py +290 -163
  31. sglang/srt/layers/moe/ep_moe/kernels.py +342 -7
  32. sglang/srt/layers/moe/ep_moe/layer.py +120 -1
  33. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +97 -54
  34. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  35. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  36. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +4 -1
  37. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -4
  38. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +2 -1
  39. sglang/srt/layers/quantization/deep_gemm.py +5 -0
  40. sglang/srt/layers/quantization/fp8.py +108 -95
  41. sglang/srt/layers/quantization/fp8_kernel.py +79 -60
  42. sglang/srt/layers/quantization/fp8_utils.py +71 -23
  43. sglang/srt/layers/quantization/kv_cache.py +3 -10
  44. sglang/srt/layers/quantization/utils.py +0 -5
  45. sglang/srt/layers/quantization/w8a8_fp8.py +8 -10
  46. sglang/srt/lora/lora_manager.py +10 -13
  47. sglang/srt/managers/cache_controller.py +115 -119
  48. sglang/srt/managers/io_struct.py +10 -0
  49. sglang/srt/managers/multimodal_processors/base_processor.py +5 -0
  50. sglang/srt/managers/multimodal_processors/internvl.py +232 -0
  51. sglang/srt/managers/schedule_batch.py +19 -1
  52. sglang/srt/managers/schedule_policy.py +11 -5
  53. sglang/srt/managers/scheduler.py +28 -13
  54. sglang/srt/managers/tokenizer_manager.py +24 -13
  55. sglang/srt/managers/tp_worker.py +9 -12
  56. sglang/srt/mem_cache/chunk_cache.py +2 -0
  57. sglang/srt/mem_cache/memory_pool.py +2 -2
  58. sglang/srt/model_executor/model_runner.py +44 -33
  59. sglang/srt/model_loader/loader.py +18 -11
  60. sglang/srt/models/clip.py +4 -4
  61. sglang/srt/models/deepseek_janus_pro.py +1 -1
  62. sglang/srt/models/deepseek_nextn.py +1 -20
  63. sglang/srt/models/deepseek_v2.py +55 -20
  64. sglang/srt/models/gemma3_mm.py +1 -1
  65. sglang/srt/models/internlm2.py +3 -0
  66. sglang/srt/models/internvl.py +670 -0
  67. sglang/srt/models/llama.py +1 -1
  68. sglang/srt/models/llama4.py +53 -7
  69. sglang/srt/models/minicpmv.py +1 -1
  70. sglang/srt/models/mllama.py +1 -1
  71. sglang/srt/models/phi3_small.py +16 -2
  72. sglang/srt/models/qwen2_5_vl.py +8 -4
  73. sglang/srt/models/qwen2_vl.py +4 -4
  74. sglang/srt/models/xiaomi_mimo.py +171 -0
  75. sglang/srt/openai_api/adapter.py +24 -40
  76. sglang/srt/openai_api/protocol.py +28 -16
  77. sglang/srt/reasoning_parser.py +2 -2
  78. sglang/srt/sampling/sampling_batch_info.py +54 -2
  79. sglang/srt/sampling/sampling_params.py +2 -0
  80. sglang/srt/server_args.py +30 -6
  81. sglang/srt/utils.py +35 -1
  82. sglang/test/test_block_fp8.py +2 -2
  83. sglang/test/test_deepep_utils.py +219 -0
  84. sglang/test/test_utils.py +3 -1
  85. sglang/version.py +1 -1
  86. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/METADATA +14 -6
  87. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/RECORD +90 -80
  88. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/WHEEL +1 -1
  89. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/licenses/LICENSE +0 -0
  90. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/fp8_utils.py
@@ -14,6 +14,9 @@ except ImportError:
 
 from sglang.srt.layers.quantization.deep_gemm import _ENABLE_JIT_DEEPGEMM
 from sglang.srt.layers.quantization.fp8_kernel import (
+    fp8_dtype,
+    fp8_max,
+    is_fp8_fnuz,
     per_token_group_quant_fp8,
     scaled_fp8_quant,
     sglang_per_token_quant_fp8,
@@ -30,8 +33,11 @@ from sglang.srt.utils import (
 
 _is_hip = is_hip()
 _is_cuda = is_cuda()
+_is_fp8_fnuz = is_fp8_fnuz()
 
-if _is_hip and get_bool_env_var("SGLANG_AITER_MOE"):
+use_aiter_moe = get_bool_env_var("SGLANG_AITER_MOE")
+
+if _is_hip and use_aiter_moe:
     from aiter import gemm_a8w8_blockscale
 
 if _is_cuda:
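The SGLANG_AITER_MOE flag is now read once at import time into `use_aiter_moe` and reused by both this import guard and `apply_w8a8_block_fp8_linear` further down. For illustration only, a boolean env-var helper along the lines of `sglang.srt.utils.get_bool_env_var` can be sketched as follows (the exact accepted spellings in the real helper may differ):

    import os

    def get_bool_env_var_sketch(name: str, default: str = "false") -> bool:
        # Illustrative stand-in for sglang.srt.utils.get_bool_env_var.
        return os.getenv(name, default).strip().lower() in ("true", "1")

    # Read once at import time; call sites then branch on the cached boolean.
    use_aiter_moe = get_bool_env_var_sketch("SGLANG_AITER_MOE")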
@@ -43,19 +49,23 @@ use_vllm_cutlass_w8a8_fp8_kernel = get_bool_env_var("USE_VLLM_CUTLASS_W8A8_FP8_K
 # from pytorch 2.5. Allocating a dummy tensor to pass as input_scale
 TORCH_DEVICE_IDENTITY = None
 
-_TORCH_VERSION = torch.__version__.split("+")[0]
-try:
-    _TORCH_VERSION_TUPLE = tuple(map(int, _TORCH_VERSION.split(".")[:3]))
-except ValueError:
-    _TORCH_VERSION_TUPLE = (0, 0, 0)
-
-# The condition to determine if it is on a platform that supports
-# torch._scaled_mm rowwise feature.
-# The condition is determined once as the operations
-# are time consuming.
-USE_ROWWISE_TORCH_SCALED_MM = (
-    _is_hip and get_device_capability() >= (9, 4) and _TORCH_VERSION_TUPLE >= (2, 7, 0)
-)
+
+def use_rowwise_torch_scaled_mm():
+    _TORCH_VERSION = torch.__version__.split("+")[0]
+    try:
+        _TORCH_VERSION_TUPLE = tuple(map(int, _TORCH_VERSION.split(".")[:3]))
+    except ValueError:
+        _TORCH_VERSION_TUPLE = (0, 0, 0)
+    if _is_hip:
+        # The condition to determine if it is on a platform that supports
+        # torch._scaled_mm rowwise feature.
+        # The condition is determined once as the operations
+        # are time consuming.
+        return get_device_capability() >= (9, 4) and _TORCH_VERSION_TUPLE >= (2, 7, 0)
+    return False
+
+
+USE_ROWWISE_TORCH_SCALED_MM = use_rowwise_torch_scaled_mm()
 
 
 def cutlass_fp8_supported():
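The rowwise `torch._scaled_mm` eligibility check is now wrapped in `use_rowwise_torch_scaled_mm()` and still evaluated once at import, but the device is only probed when running on HIP. The version parsing it relies on can be exercised on its own; a minimal standalone sketch:

    import torch

    def torch_version_tuple() -> tuple:
        # "2.7.0+rocm6.3" -> (2, 7, 0); versions that do not parse as integers
        # fall back to (0, 0, 0), which disables the feature.
        version = torch.__version__.split("+")[0]
        try:
            return tuple(map(int, version.split(".")[:3]))
        except ValueError:
            return (0, 0, 0)

    print(torch_version_tuple() >= (2, 7, 0))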
@@ -132,7 +142,7 @@ def apply_w8a8_block_fp8_linear(
         output = fp8_blockwise_scaled_mm(
             q_input, weight.T, x_scale, weight_scale.T, out_dtype=input.dtype
         )
-    elif _is_hip and get_bool_env_var("SGLANG_AITER_MOE"):
+    elif _is_hip and use_aiter_moe:
         q_input, x_scale = per_token_group_quant_fp8(
             input_2d, block_size[1], column_major_scales=False
         )
@@ -164,18 +174,21 @@ def apply_w8a8_block_fp8_linear(
 
 
 def input_to_float8(
-    x: torch.Tensor, dtype: torch.dtype = torch.float8_e4m3fn
+    x: torch.Tensor, dtype: torch.dtype = fp8_dtype
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """This function quantizes input values to float8 values with tensor-wise quantization."""
-    finfo = torch.finfo(dtype)
     min_val, max_val = x.aminmax()
     amax = torch.maximum(min_val.abs(), max_val.abs()).float().clamp(min=1e-12)
-    fp8_max = finfo.max
-    if _is_hip:
-        dtype = torch.float8_e4m3fnuz
-        fp8_max = 224.0
-    scale = fp8_max / amax
-    x_scl_sat = (x.float() * scale).clamp(min=-fp8_max, max=fp8_max)
+
+    if _is_fp8_fnuz:
+        dtype = fp8_dtype
+        fp_max = fp8_max
+    else:
+        finfo = torch.finfo(dtype)
+        fp_max = finfo.max
+
+    scale = fp_max / amax
+    x_scl_sat = (x.float() * scale).clamp(min=-fp_max, max=fp_max)
     return x_scl_sat.to(dtype).contiguous(), scale.float().reciprocal()
 
 
@@ -222,6 +235,41 @@ def block_quant_to_tensor_quant(
     return x_q_tensor, scale
 
 
+def block_quant_dequant(
+    x_q_block: torch.Tensor,
+    x_s: torch.Tensor,
+    block_size: List[int],
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    """This function converts block-wise quantization to unquantized.
+    The inputs are block-wise quantization tensor `x_q_block`, block-wise quantization scale
+    and the block size.
+    The output is an unquantized tensor with dtype.
+    """
+    block_n, block_k = block_size[0], block_size[1]
+    n, k = x_q_block.shape
+    n_tiles = (n + block_n - 1) // block_n
+    k_tiles = (k + block_k - 1) // block_k
+    assert n_tiles == x_s.shape[0]
+    assert k_tiles == x_s.shape[1]
+
+    x_dq_block = torch.empty_like(x_q_block, dtype=dtype)
+
+    for j in range(n_tiles):
+        for i in range(k_tiles):
+            x_q_block_tile = x_q_block[
+                j * block_n : min((j + 1) * block_n, n),
+                i * block_k : min((i + 1) * block_k, k),
+            ]
+            x_dq_block_tile = x_dq_block[
+                j * block_n : min((j + 1) * block_n, n),
+                i * block_k : min((i + 1) * block_k, k),
+            ]
+            x_dq_block_tile[:, :] = x_q_block_tile.to(torch.float32) * x_s[j][i]
+
+    return x_dq_block
+
+
 def channel_quant_to_tensor_quant(
     x_q_channel: torch.Tensor,
     x_s: torch.Tensor,
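The new `block_quant_dequant` helper walks the (block_n, block_k) tile grid and multiplies each tile by its scale. A toy check of that tiling logic on a 4x4 tensor with 2x2 blocks (values chosen so the expected output is obvious):

    import torch

    block_n, block_k = 2, 2
    x_q = torch.ones(4, 4)                              # stand-in for fp8 values
    x_s = torch.tensor([[1.0, 10.0], [100.0, 1000.0]])  # one scale per 2x2 block

    x_dq = torch.empty_like(x_q)
    for j in range(x_s.shape[0]):
        for i in range(x_s.shape[1]):
            x_dq[j * block_n : (j + 1) * block_n, i * block_k : (i + 1) * block_k] = (
                x_q[j * block_n : (j + 1) * block_n, i * block_k : (i + 1) * block_k]
                * x_s[j, i]
            )
    # Rows 0-1 end up scaled by 1 and 10 (per column block), rows 2-3 by 100 and 1000.
    print(x_dq)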
sglang/srt/layers/quantization/kv_cache.py
@@ -8,10 +8,8 @@ from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
 )
+from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.utils import is_hip
-
-_is_hip = is_hip()
 
 logger = logging.getLogger(__name__)
 
@@ -44,11 +42,6 @@ class BaseKVCacheMethod(QuantizeMethodBase):
             torch.tensor(-1.0, dtype=torch.float32), requires_grad=False
         )
 
-    @classmethod
-    def is_fp8_fnuz(cls) -> bool:
-        # only device 0 is checked, this assumes MI300 platforms are homogeneous
-        return "gfx94" in torch.cuda.get_device_properties(0).gcnArchName
-
     def apply(self, layer: torch.nn.Module) -> torch.Tensor:
         raise RuntimeError(f"{self.__class__.__name__}.apply should not be called.")
 
@@ -57,7 +50,7 @@ class BaseKVCacheMethod(QuantizeMethodBase):
             # We prefer to use separate k_scale and v_scale if present
             k_scale = layer.k_scale.to("cpu").tolist()
             v_scale = layer.v_scale.to("cpu").tolist()
-            if _is_hip and self.is_fp8_fnuz():
+            if is_fp8_fnuz():
                 k_scale *= 2
                 v_scale *= 2
         elif layer.k_scale < 0.0 and layer.v_scale < 0.0:
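For context on the doubling: the e4m3fnuz format used on gfx94x hardware has roughly half the representable maximum of the OCP e4m3fn format, which is presumably why k/v scales calibrated against e4m3fn checkpoints are multiplied by 2 on that path. The two maxima can be checked directly:

    import torch

    print(torch.finfo(torch.float8_e4m3fn).max)    # 448.0 (OCP e4m3fn)
    print(torch.finfo(torch.float8_e4m3fnuz).max)  # 240.0 (e4m3fnuz, gfx94x)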
@@ -73,7 +66,7 @@ class BaseKVCacheMethod(QuantizeMethodBase):
             scale_to_duplicate = max(layer.k_scale, layer.v_scale)
             k_scale = scale_to_duplicate.to("cpu").tolist()
             v_scale = scale_to_duplicate.to("cpu").tolist()
-            if _is_hip and self.is_fp8_fnuz():
+            if is_fp8_fnuz():
                 k_scale *= 2
                 v_scale *= 2
 
sglang/srt/layers/quantization/utils.py
@@ -14,11 +14,6 @@ if not _is_cuda:
     from vllm._custom_ops import scaled_fp8_quant
 
 
-def is_fp8_fnuz() -> bool:
-    # only device 0 is checked, this assumes MI300 platforms are homogeneous
-    return "gfx94" in torch.cuda.get_device_properties(0).gcnArchName
-
-
 def is_layer_skipped(
     prefix: str,
     ignored_layers: List[str],
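The duplicate `is_fp8_fnuz` definitions removed here and from kv_cache.py above are replaced by a single helper imported from `sglang.srt.layers.quantization.fp8_kernel`. It is the same gfx94 architecture probe; reproduced below as a standalone sketch under the assumption that the consolidated helper guards on HIP before touching `gcnArchName` (that attribute is only meaningful on ROCm builds of torch):

    import torch
    from sglang.srt.utils import is_hip

    def is_fp8_fnuz_sketch() -> bool:
        # Only device 0 is checked, assuming MI300-class (gfx94x) nodes are homogeneous.
        return is_hip() and "gfx94" in torch.cuda.get_device_properties(0).gcnArchName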
sglang/srt/layers/quantization/w8a8_fp8.py
@@ -9,16 +9,20 @@ from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
 )
-from sglang.srt.layers.quantization.fp8_kernel import per_token_group_quant_fp8
+from sglang.srt.layers.quantization.fp8_kernel import (
+    fp8_dtype,
+    is_fp8_fnuz,
+    per_token_group_quant_fp8,
+)
 from sglang.srt.layers.quantization.fp8_utils import (
     apply_fp8_linear,
     cutlass_fp8_supported,
     input_to_float8,
     normalize_e4m3fn_to_e4m3fnuz,
 )
-from sglang.srt.utils import is_hip, set_weight_attrs
+from sglang.srt.utils import set_weight_attrs
 
-_is_hip = is_hip()
+_is_fp8_fnuz = is_fp8_fnuz()
 
 
 class W8A8Fp8Config(QuantizationConfig):
@@ -97,7 +101,7 @@ class W8A8Fp8LinearMethod(LinearMethodBase):
         if self.quantization_config.is_checkpoint_fp8_serialized:
             weight_scale = layer.weight_scale.detach()
             # If checkpoint offline quantized with w8a8_fp8, load the weight and weight_scale directly.
-            if _is_hip:
+            if _is_fp8_fnuz:
                 weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
                     weight=weight, weight_scale=weight_scale
                 )
@@ -113,14 +117,9 @@ class W8A8Fp8LinearMethod(LinearMethodBase):
                     layer.weight, layer.weight.shape[-1]
                 )
                 weight_scale = weight_scale.t().contiguous()
-                if _is_hip:
-                    weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
-                        weight=weight, weight_scale=weight_scale
-                    )
             else:
                 # if cutlass not supported, we fall back to use torch._scaled_mm
                 # which requires per tensor quantization on weight
-                fp8_dtype = torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn
                 qweight, weight_scale = input_to_float8(layer.weight, dtype=fp8_dtype)
 
             # Update the layer with the new values.
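In the fallback branch above, the weight is re-quantized tensor-wise through `input_to_float8` with the shared `fp8_dtype`. A self-contained sketch of that tensor-wise scheme, mirroring the logic of `input_to_float8` on CPU with plain e4m3fn rather than calling the real helper:

    import torch

    def quantize_tensorwise(x: torch.Tensor, dtype=torch.float8_e4m3fn):
        # One scale for the whole tensor: map max(|x|) onto the fp8 maximum.
        finfo = torch.finfo(dtype)
        amax = x.abs().amax().float().clamp(min=1e-12)
        scale = finfo.max / amax
        x_q = (x.float() * scale).clamp(min=-finfo.max, max=finfo.max).to(dtype)
        return x_q, scale.reciprocal()  # dequantize with x_q.float() * scale_inv

    x = torch.randn(16, 32)
    x_q, scale_inv = quantize_tensorwise(x)
    print((x_q.float() * scale_inv - x).abs().max())  # small quantization error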
@@ -227,7 +226,6 @@ class W8A8FP8MoEMethod:
     ):
         from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
 
-        fp8_dtype = torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn
         # WEIGHTS
         w13_weight = torch.nn.Parameter(
             torch.empty(
sglang/srt/lora/lora_manager.py
@@ -156,18 +156,15 @@ class LoRAManager:
         # set up batch info shared by all lora modules
         bs = forward_batch.batch_size
 
-        if hasattr(self, "max_bs_in_cuda_graph") and bs <= self.max_bs_in_cuda_graph:
-            # Do in-place updates when CUDA graph is enabled. Note that
-            # if CUDA graph is enabled, the batch whose bs <= max_bs_in_cuda_graph
-            # will also use these preallocated buffers, no matter whether
-            # the batch can use CUDA graph or not.
+        if (
+            hasattr(self, "max_bs_in_cuda_graph")
+            and bs <= self.max_bs_in_cuda_graph
+            and forward_batch.forward_mode.is_cuda_graph()
+        ):
+            # Do in-place updates when CUDA graph is enabled and the batch forward mode
+            # could use CUDA graph.
             self.cuda_graph_batch_info.bs = bs
-            if forward_batch.forward_mode.is_extend():
-                self.cuda_graph_batch_info.seg_lens[:bs].copy_(
-                    forward_batch.extend_seq_lens
-                )
-            else:
-                self.cuda_graph_batch_info.seg_lens[:bs].fill_(1)
+            self.cuda_graph_batch_info.seg_lens[:bs].fill_(1)
             torch.cumsum(
                 self.cuda_graph_batch_info.seg_lens[:bs],
                 dim=0,
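With the added `forward_mode.is_cuda_graph()` condition, only batches that can actually replay a CUDA graph take this in-place path, and those are decode-style batches with one new token per sequence, so `seg_lens` is simply filled with ones before the cumulative sum. Illustrative shapes of what that prefix sum produces (buffer names here are for illustration; the real tensors are preallocated members of `cuda_graph_batch_info`):

    import torch

    bs = 3
    seg_lens = torch.ones(bs, dtype=torch.int32)     # one token per sequence
    seg_offsets = torch.zeros(bs + 1, dtype=torch.int64)
    seg_offsets[1:] = torch.cumsum(seg_lens, dim=0)  # segment start offsets
    print(seg_offsets)                               # tensor([0, 1, 2, 3])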
@@ -201,10 +198,10 @@ class LoRAManager:
         max_len = int(torch.max(seg_lens))
         weight_indices = torch.empty((bs,), dtype=torch.int64, device=self.device)
 
-        lora_ranks = torch.empty(
+        lora_ranks = torch.zeros(
             (self.max_loras_per_batch,), dtype=torch.int64, device="cuda"
         )
-        scalings = torch.empty(
+        scalings = torch.zeros(
             (self.max_loras_per_batch,), dtype=torch.float, device="cuda"
         )
         for i, lora_path in enumerate(forward_batch.lora_paths):
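The switch from `torch.empty` to `torch.zeros` for `lora_ranks` and `scalings` matters because only the adapters present in the batch are filled in by the loop that follows; with `empty`, slots for absent adapters would carry whatever bytes happened to be in the allocation. A minimal CPU-only illustration of the difference:

    import torch

    max_loras_per_batch = 4
    lora_ranks = torch.zeros((max_loras_per_batch,), dtype=torch.int64)
    scalings = torch.zeros((max_loras_per_batch,), dtype=torch.float)

    # Suppose only slot 0 is populated by an active adapter in this batch:
    lora_ranks[0], scalings[0] = 16, 2.0
    # Unpopulated slots now deterministically read as rank 0 / scaling 0.0
    # instead of uninitialized memory.
    print(lora_ranks, scalings)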
sglang/srt/managers/cache_controller.py
@@ -268,98 +268,97 @@ class HiCacheController:
         """
         Directly write through KV caches to host memory without buffering.
         """
-        with torch.cuda.stream(self.write_stream):
-            while not self.stop_event.is_set():
-                try:
-                    operation = self.write_queue.get(block=True, timeout=1)
-                    self.mem_pool_host.write_page_all_layers(
-                        operation.host_indices,
-                        operation.device_indices,
-                        self.mem_pool_device,
-                    )
-                    self.write_stream.synchronize()
-                    self.mem_pool_host.complete_io(operation.host_indices)
-                    for node_id in operation.node_ids:
-                        if node_id != 0:
-                            self.ack_write_queue.put(node_id)
-                except Empty:
-                    continue
-                except Exception as e:
-                    logger.error(e)
+        torch.cuda.set_stream(self.write_stream)
+        while not self.stop_event.is_set():
+            try:
+                operation = self.write_queue.get(block=True, timeout=1)
+                self.mem_pool_host.write_page_all_layers(
+                    operation.host_indices,
+                    operation.device_indices,
+                    self.mem_pool_device,
+                )
+                self.write_stream.synchronize()
+                self.mem_pool_host.complete_io(operation.host_indices)
+                for node_id in operation.node_ids:
+                    if node_id != 0:
+                        self.ack_write_queue.put(node_id)
+            except Empty:
+                continue
+            except Exception as e:
+                logger.error(e)
 
     def load_thread_func_direct(self):
         """
         Directly load KV caches from host memory to device memory without buffering.
         """
-        with torch.cuda.stream(self.load_stream):
-            while not self.stop_event.is_set():
-                try:
-                    operation = self.load_queue.get(block=True, timeout=1)
-                    # time.sleep(18e-6 * len(operation.host_indices))
-                    operation.data = self.mem_pool_host.get_flat_data(
-                        operation.host_indices
-                    )
-                    self.mem_pool_device.transfer(
-                        operation.device_indices, operation.data
-                    )
-                    self.mem_pool_host.complete_io(operation.host_indices)
-                    for node_id in operation.node_ids:
-                        if node_id != 0:
-                            self.ack_load_queue.put(node_id)
-                except Empty:
-                    continue
-                except Exception as e:
-                    logger.error(e)
+        torch.cuda.set_stream(self.load_stream)
+        while not self.stop_event.is_set():
+            try:
+                operation = self.load_queue.get(block=True, timeout=1)
+                # time.sleep(18e-6 * len(operation.host_indices))
+                operation.data = self.mem_pool_host.get_flat_data(
+                    operation.host_indices
+                )
+                self.mem_pool_device.transfer(operation.device_indices, operation.data)
+                self.mem_pool_host.complete_io(operation.host_indices)
+                for node_id in operation.node_ids:
+                    if node_id != 0:
+                        self.ack_load_queue.put(node_id)
+            except Empty:
+                continue
+            except Exception as e:
+                logger.error(e)
 
     def load_thread_func_layer_by_layer(self):
         """
         Load KV caches from host memory to device memory layer by layer.
         """
-        with torch.cuda.stream(self.load_stream):
-            while not self.stop_event.is_set():
-                self.load_cache_event.wait(timeout=1)
-                if not self.load_cache_event.is_set():
-                    continue
-                self.load_cache_event.clear()
+        torch.cuda.set_stream(self.load_stream)
+        while not self.stop_event.is_set():
+            self.load_cache_event.wait(timeout=1)
+            if not self.load_cache_event.is_set():
+                continue
+            self.load_cache_event.clear()
 
-                batch_operation = None
-                while self.load_queue.qsize() > 0:
-                    op = self.load_queue.get(block=True)
-                    if batch_operation is None:
-                        batch_operation = op
-                    else:
-                        batch_operation.merge(op)
+            batch_operation = None
+            while self.load_queue.qsize() > 0:
+                op = self.load_queue.get(block=True)
                 if batch_operation is None:
-                    continue
+                    batch_operation = op
+                else:
+                    batch_operation.merge(op)
+            if batch_operation is None:
+                continue
 
-                self.layer_done_counter.reset()
-                for i in range(self.mem_pool_host.layer_num):
-                    if self.page_size == 1:
-                        flat_data = self.mem_pool_host.get_flat_data_by_layer(
-                            batch_operation.host_indices, i
-                        )
-                        self.mem_pool_device.transfer_per_layer(
-                            batch_operation.device_indices, flat_data, i
-                        )
-                    else:
-                        self.mem_pool_host.load_page_per_layer(
-                            batch_operation.host_indices,
-                            batch_operation.device_indices,
-                            self.mem_pool_device,
-                            i,
-                        )
-                    self.load_stream.synchronize()
-                    self.layer_done_counter.increment()
-
-                self.mem_pool_host.complete_io(batch_operation.host_indices)
-                for node_id in batch_operation.node_ids:
-                    if node_id != 0:
-                        self.ack_load_queue.put(node_id)
+            self.layer_done_counter.reset()
+            for i in range(self.mem_pool_host.layer_num):
+                if self.page_size == 1:
+                    flat_data = self.mem_pool_host.get_flat_data_by_layer(
+                        batch_operation.host_indices, i
+                    )
+                    self.mem_pool_device.transfer_per_layer(
+                        batch_operation.device_indices, flat_data, i
+                    )
+                else:
+                    self.mem_pool_host.load_page_per_layer(
+                        batch_operation.host_indices,
+                        batch_operation.device_indices,
+                        self.mem_pool_device,
+                        i,
+                    )
+                self.load_stream.synchronize()
+                self.layer_done_counter.increment()
+
+            self.mem_pool_host.complete_io(batch_operation.host_indices)
+            for node_id in batch_operation.node_ids:
+                if node_id != 0:
+                    self.ack_load_queue.put(node_id)
 
     def write_aux_func(self, no_wait=False):
         """
         Auxiliary function to prepare the buffer for write operations.
         """
+        torch.cuda.set_stream(self.write_stream)
 
         def _to_op(op_):
             assert op_.device_indices.is_cuda, "Device indices should be on GPU"
@@ -370,44 +369,42 @@ class HiCacheController:
             return op_
 
         buffer = None
-        with torch.cuda.stream(self.write_stream):
-            while not self.stop_event.is_set():
-                try:
-                    operation = self.write_queue.get(block=True, timeout=1)
-                    factor = (
-                        len(operation.device_indices)
-                        // self.write_buffer.max_buffer_size
-                    )
+        while not self.stop_event.is_set():
+            try:
+                operation = self.write_queue.get(block=True, timeout=1)
+                factor = (
+                    len(operation.device_indices) // self.write_buffer.max_buffer_size
+                )
 
-                    if factor >= 1:
-                        if buffer is not None:
-                            _to_op(buffer)
-                            buffer = None
-
-                        if factor < 2:
-                            _to_op(operation)
-                        else:
-                            split_ops = operation.split(factor)
-                            for op_ in split_ops:
-                                _to_op(op_)
-                        continue
-
-                    if buffer is None:
-                        buffer = operation
-                    else:
-                        buffer.merge(operation)
-                    if (
-                        no_wait
-                        or len(buffer.host_indices) >= self.write_buffer.max_buffer_size
-                        or self.write_queue.empty()
-                        or self.write_buffer.empty()
-                    ):
+                if factor >= 1:
+                    if buffer is not None:
                         _to_op(buffer)
                         buffer = None
-                except Empty:
+
+                    if factor < 2:
+                        _to_op(operation)
+                    else:
+                        split_ops = operation.split(factor)
+                        for op_ in split_ops:
+                            _to_op(op_)
                     continue
-                except Exception as e:
-                    logger.error(e)
+
+                if buffer is None:
+                    buffer = operation
+                else:
+                    buffer.merge(operation)
+                if (
+                    no_wait
+                    or len(buffer.host_indices) >= self.write_buffer.max_buffer_size
+                    or self.write_queue.empty()
+                    or self.write_buffer.empty()
+                ):
+                    _to_op(buffer)
+                    buffer = None
+            except Empty:
+                continue
+            except Exception as e:
+                logger.error(e)
 
     def load_aux_func(self):
         """
@@ -484,19 +481,18 @@
         aux_thread.join()
 
     def load_thread_func_buffer(self):
+        torch.cuda.set_stream(self.load_stream)
         aux_thread = threading.Thread(target=self.load_aux_func, daemon=True)
         aux_thread.start()
-
-        with torch.cuda.stream(self.load_stream):
-            while not self.stop_event.is_set():
-                operation = self.load_buffer.get()
-                if operation is None:
-                    continue
-                self.mem_pool_device.transfer(operation.device_indices, operation.data)
-                self.mem_pool_host.complete_io(operation.host_indices)
-                for node_id in operation.node_ids:
-                    if node_id != 0:
-                        self.ack_load_queue.put(node_id)
+        while not self.stop_event.is_set():
+            operation = self.load_buffer.get()
+            if operation is None:
+                continue
+            self.mem_pool_device.transfer(operation.device_indices, operation.data)
+            self.mem_pool_host.complete_io(operation.host_indices)
+            for node_id in operation.node_ids:
+                if node_id != 0:
+                    self.ack_load_queue.put(node_id)
         aux_thread.join()
 
     def evict_device(
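All of the HiCacheController worker loops above now call `torch.cuda.set_stream(...)` once when the thread starts instead of wrapping the whole loop in `with torch.cuda.stream(...)`; for a thread whose entire lifetime is that loop the effect is the same (work issued from the thread goes to the side stream) without re-entering a context manager on every iteration. A stripped-down sketch of the pattern, assuming a CUDA device is available:

    import queue
    import threading
    import torch

    def copy_worker(stream: torch.cuda.Stream, jobs: queue.Queue) -> None:
        # Bind this thread's current CUDA stream once, for its whole lifetime.
        torch.cuda.set_stream(stream)
        while True:
            t = jobs.get()
            if t is None:
                break
            t.to("cuda", non_blocking=True)  # issued on `stream`
            stream.synchronize()

    jobs = queue.Queue()
    worker = threading.Thread(
        target=copy_worker, args=(torch.cuda.Stream(), jobs), daemon=True
    )
    worker.start()
    jobs.put(torch.randn(1024, 1024).pin_memory())
    jobs.put(None)
    worker.join()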
sglang/srt/managers/io_struct.py
@@ -790,6 +790,16 @@ class ResumeMemoryOccupationReqOutput:
     pass
 
 
+@dataclass
+class SlowDownReqInput:
+    forward_sleep_time: Optional[float]
+
+
+@dataclass
+class SlowDownReqOutput:
+    pass
+
+
 @dataclass
 class AbortReq:
     # The request id
sglang/srt/managers/multimodal_processors/base_processor.py
@@ -8,6 +8,7 @@ from typing import List, Optional
 
 import numpy as np
 import PIL
+import torch
 from PIL import Image
 from transformers import BaseImageProcessorFast
 
@@ -89,6 +90,10 @@ class BaseMultimodalProcessor(ABC):
             return_tensors="pt",
             **kwargs,
         )
+        if "pixel_values" in result and isinstance(
+            result["pixel_values"], torch.Tensor
+        ):
+            result["pixel_values"] = result["pixel_values"].to("cpu")
         return result
 
     @abstractmethod
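The new guard normalizes `pixel_values` to CPU right after the Hugging Face processor runs, presumably because some fast image processors can return that tensor on an accelerator device while the rest of the pipeline expects host memory. Standalone, the guard behaves like this:

    import torch

    result = {"pixel_values": torch.randn(1, 3, 224, 224)}  # stand-in processor output
    if "pixel_values" in result and isinstance(result["pixel_values"], torch.Tensor):
        result["pixel_values"] = result["pixel_values"].to("cpu")
    print(result["pixel_values"].device)  # cpu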