sglang 0.4.2__py3-none-any.whl → 0.4.2.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. sglang/srt/constrained/outlines_backend.py +9 -1
  2. sglang/srt/custom_op.py +40 -0
  3. sglang/srt/entrypoints/engine.py +2 -2
  4. sglang/srt/layers/activation.py +10 -5
  5. sglang/srt/layers/attention/flashinfer_backend.py +284 -39
  6. sglang/srt/layers/attention/triton_backend.py +71 -7
  7. sglang/srt/layers/attention/triton_ops/decode_attention.py +53 -59
  8. sglang/srt/layers/attention/triton_ops/prefill_attention.py +6 -0
  9. sglang/srt/layers/attention/vision.py +243 -40
  10. sglang/srt/layers/layernorm.py +1 -5
  11. sglang/srt/layers/moe/ep_moe/layer.py +1 -3
  12. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  13. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  14. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +200 -0
  15. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +200 -0
  16. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +200 -0
  17. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +178 -0
  18. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +200 -0
  19. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +175 -0
  20. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +3 -11
  21. sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -3
  22. sglang/srt/layers/moe/topk.py +4 -0
  23. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  24. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  25. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  26. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  27. sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  28. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  29. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  30. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  31. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  32. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  33. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  34. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  35. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  36. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  37. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  38. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  39. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  40. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  41. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  42. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  43. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  44. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  45. sglang/srt/layers/quantization/fp8.py +7 -0
  46. sglang/srt/layers/quantization/fp8_kernel.py +140 -2
  47. sglang/srt/layers/rotary_embedding.py +29 -15
  48. sglang/srt/layers/sampler.py +9 -6
  49. sglang/srt/lora/backend/__init__.py +8 -0
  50. sglang/srt/lora/backend/base_backend.py +95 -0
  51. sglang/srt/lora/backend/flashinfer_backend.py +91 -0
  52. sglang/srt/lora/backend/triton_backend.py +61 -0
  53. sglang/srt/lora/lora.py +127 -112
  54. sglang/srt/lora/lora_manager.py +50 -18
  55. sglang/srt/lora/triton_ops/__init__.py +5 -0
  56. sglang/srt/lora/triton_ops/qkv_lora_b.py +182 -0
  57. sglang/srt/lora/triton_ops/sgemm_lora_a.py +143 -0
  58. sglang/srt/lora/triton_ops/sgemm_lora_b.py +159 -0
  59. sglang/srt/managers/image_processor.py +77 -38
  60. sglang/srt/managers/scheduler.py +17 -3
  61. sglang/srt/mem_cache/base_prefix_cache.py +4 -0
  62. sglang/srt/mem_cache/chunk_cache.py +3 -0
  63. sglang/srt/mem_cache/radix_cache.py +30 -1
  64. sglang/srt/model_executor/cuda_graph_runner.py +77 -80
  65. sglang/srt/model_executor/forward_batch_info.py +58 -59
  66. sglang/srt/model_executor/model_runner.py +2 -2
  67. sglang/srt/models/minicpmv.py +129 -76
  68. sglang/srt/models/mllama.py +16 -56
  69. sglang/srt/models/qwen2.py +4 -1
  70. sglang/srt/models/qwen2_vl.py +19 -9
  71. sglang/srt/server_args.py +19 -2
  72. sglang/srt/speculative/build_eagle_tree.py +4 -2
  73. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +213 -0
  74. sglang/srt/speculative/eagle_utils.py +361 -372
  75. sglang/srt/speculative/eagle_worker.py +177 -45
  76. sglang/srt/utils.py +7 -2
  77. sglang/test/runners.py +2 -0
  78. sglang/utils.py +42 -0
  79. sglang/version.py +1 -1
  80. {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/METADATA +16 -7
  81. {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/RECORD +84 -45
  82. sglang/srt/layers/custom_op_util.py +0 -25
  83. {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/LICENSE +0 -0
  84. {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/WHEEL +0 -0
  85. {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/top_level.txt +0 -0
sglang/srt/lora/triton_ops/qkv_lora_b.py (new file)
@@ -0,0 +1,182 @@
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.lora.lora import LoraBatchInfo
+
+
+@triton.jit
+def _qkv_lora_b_kernel(
+    # Pointers to matrices
+    x,
+    weights,
+    output,
+    # Parameters of size
+    K,  # K = R
+    max_qkv_out_dim,  # max(output_q_dim, output_kv_dim)
+    # Strides
+    x_stride_0,
+    x_stride_1,
+    w_stride_0,
+    w_stride_1,
+    w_stride_2,
+    output_stride_0,
+    output_stride_1,
+    # Information on sequence lengths and weight id
+    seg_lens,
+    seg_indptr,
+    weight_indices,
+    # Offsets of q/k/v slice on output dimension
+    n_offs,
+    # Meta parameters
+    BLOCK_S: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    # For fused output scaling and adding
+    fuse_scaling_add,
+    scaling,
+):
+    # This kernel packs 3 sgemms (q/k/v) into a single kernel.
+
+    # x: (s, 3 * K), s is the sum of sequence lengths, K equals the LoRA rank
+    # weights: (num_lora, N_Q + 2 * N_KV, K)
+    # output: (s, N_Q + 2 * N_KV)
+    # N_Q >> K, N_KV >> K
+
+    # Current block computes sequence with batch_id,
+    # which starts from row seg_start of x with length seg_len.
+    # qkv_id decides which of q, k, v to compute (0: q, 1: k, 2: v)
+    batch_id = tl.program_id(axis=2)
+    qkv_id = tl.program_id(axis=1)
+    pid = tl.program_id(axis=0)
+    seg_len = tl.load(seg_lens + batch_id)
+    w_index = tl.load(weight_indices + batch_id)
+    seg_start = tl.load(seg_indptr + batch_id)
+    n_start = tl.load(n_offs + qkv_id)
+    n_size = tl.load(n_offs + qkv_id + 1) - n_start
+
+    # The tile in output matrix will have (pid_s, pid_n) as id
+    num_pid_n = tl.cdiv(max_qkv_out_dim, BLOCK_N)
+    pid_s = pid // num_pid_n
+    pid_n = pid % num_pid_n
+
+    # Create pointers for the first block of x and weights[batch_id][n_start: n_end][:]
+    # The pointers will be advanced as we move in the K direction
+    # and accumulate
+    s_offset = tl.arange(0, BLOCK_S) + pid_s * BLOCK_S
+    n_offset = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
+    k_offset = tl.arange(0, BLOCK_K)
+
+    x_ptrs = (x + seg_start * x_stride_0 + (qkv_id * K) * x_stride_1) + (
+        s_offset[:, None] * x_stride_0 + k_offset[None, :] * x_stride_1
+    )
+    w_ptrs = (weights + w_index * w_stride_0 + n_start * w_stride_1) + (
+        k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
+    )
+
+    # Iterate to compute the block in output matrix
+    partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
+    for k in range(0, tl.cdiv(K, BLOCK_K)):
+        x_tile = tl.load(
+            x_ptrs,
+            mask=(s_offset[:, None] < seg_len)
+            and (k_offset[None, :] < K - k * BLOCK_K),
+            other=0.0,
+        )
+        w_tile = tl.load(
+            w_ptrs,
+            mask=(k_offset[:, None] < K - k * BLOCK_K) and (n_offset[None, :] < n_size),
+            other=0.0,
+        )
+        partial_sum += tl.dot(x_tile, w_tile)
+
+        x_ptrs += BLOCK_K * x_stride_1
+        w_ptrs += BLOCK_K * w_stride_2
+
+    # Store result to output matrix
+    partial_sum *= scaling
+    partial_sum = partial_sum.to(x.dtype.element_ty)
+    output_ptr = (output + seg_start * output_stride_0 + n_start * output_stride_1) + (
+        s_offset[:, None] * output_stride_0 + n_offset[None, :] * output_stride_1
+    )
+    output_mask = (s_offset[:, None] < seg_len) and (n_offset[None, :] < n_size)
+    if fuse_scaling_add:
+        partial_sum += tl.load(output_ptr, mask=output_mask)
+    tl.store(output_ptr, partial_sum, mask=output_mask)
+
+
+def qkv_lora_b_fwd(
+    x: torch.Tensor,
+    qkv_lora_b: torch.Tensor,
+    batch_info: LoraBatchInfo,
+    output_offset: torch.Tensor,
+    max_qkv_out_dim: int,
+    base_output: torch.Tensor = None,
+    scaling: float = 1.0,
+) -> torch.Tensor:
+
+    # x: (s, 3 * r)
+    # qkv_lora_b: (num_lora, output_dim_q + 2 * output_dim_kv, r)
+    # output_offset = [0, output_dim_q, output_dim_q + output_dim_kv,
+    #                  output_dim_q + 2 * output_dim_kv]
+    # max_qkv_out_dim = max(output_dim_q, output_dim_kv)
+    # output: (s, output_dim_q + 2 * output_dim_kv)
+
+    # Compute lora_output with shape (s, output_dim) as follows:
+    # lora_output[:, :output_dim_q] = sgemm(lora_output_a[:, :r], q_lora_b)
+    # lora_output[:, output_dim_q: output_dim_q + output_dim_kv]
+    #     = sgemm(lora_output_a[:, r: 2 * r], kv_lora_b[0])
+    # lora_output[:, output_dim_q + output_dim_kv:]
+    #     = sgemm(lora_output_a[:, 2 * r: 3 * r], kv_lora_b[1])
+
+    # Get dims
+    s = x.shape[0]
+    input_dim = x.shape[1]
+    r = qkv_lora_b.shape[-1]
+    output_dim = qkv_lora_b.shape[-2]
+    assert input_dim == 3 * r
+    assert output_offset.shape[0] == 4
+
+    BLOCK_S = 16
+    BLOCK_R = 16
+    BLOCK_OUT = 64
+
+    grid_b = (
+        triton.cdiv(batch_info.max_len, BLOCK_S)
+        * triton.cdiv(max_qkv_out_dim, BLOCK_OUT),
+        3,  # this dimension decides whether the current block computes q, k, or v
+        batch_info.bs,
+    )
+
+    if base_output is None:
+        output = torch.empty((s, output_dim), device=x.device, dtype=x.dtype)
+        fuse_scaling_add = False
+    else:
+        output = base_output
+        fuse_scaling_add = True
+
+    _qkv_lora_b_kernel[grid_b](
+        x,
+        qkv_lora_b,
+        output,
+        r,
+        max_qkv_out_dim,
+        x.stride(0),
+        x.stride(1),
+        qkv_lora_b.stride(0),
+        qkv_lora_b.stride(1),
+        qkv_lora_b.stride(2),
+        output.stride(0),
+        output.stride(1),
+        batch_info.seg_lens,
+        batch_info.seg_indptr,
+        batch_info.weight_indices,
+        output_offset,
+        BLOCK_S,
+        BLOCK_OUT,
+        BLOCK_R,
+        fuse_scaling_add,
+        scaling,
+    )
+
+    return output
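
The wrapper above is driven by the new LoRA backends (see sglang/srt/lora/backend/ in the file list). For orientation, a minimal smoke-test sketch follows; it assumes a CUDA device, that LoraBatchInfo is a keyword-constructible dataclass exposing exactly the fields the wrapper reads (bs, seg_lens, seg_indptr, max_len, weight_indices), and that the new triton_ops/__init__.py re-exports qkv_lora_b_fwd — none of which is shown in this hunk.

# Hypothetical smoke test, not part of the diff; names per the assumptions above.
import torch
from sglang.srt.lora.lora import LoraBatchInfo
from sglang.srt.lora.triton_ops import qkv_lora_b_fwd

r, n_q, n_kv, num_lora = 16, 128, 64, 2
batch_info = LoraBatchInfo(
    bs=2,
    seg_lens=torch.tensor([3, 5], dtype=torch.int32, device="cuda"),
    seg_indptr=torch.tensor([0, 3, 8], dtype=torch.int32, device="cuda"),
    max_len=5,
    weight_indices=torch.tensor([0, 1], dtype=torch.int64, device="cuda"),
)
x = torch.randn(8, 3 * r, device="cuda", dtype=torch.float16)  # (s, 3 * r), s = 3 + 5
w = torch.randn(num_lora, n_q + 2 * n_kv, r, device="cuda", dtype=torch.float16)
offset = torch.tensor([0, n_q, n_q + n_kv, n_q + 2 * n_kv], dtype=torch.int32, device="cuda")
out = qkv_lora_b_fwd(x, w, batch_info, offset, max(n_q, n_kv))
print(out.shape)  # torch.Size([8, 256])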
sglang/srt/lora/triton_ops/sgemm_lora_a.py (new file)
@@ -0,0 +1,143 @@
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.lora.lora import LoraBatchInfo
+
+
+@triton.jit
+def _sgemm_lora_a_kernel(
+    # Pointers to matrices
+    x,
+    weights,
+    output,
+    # Matrix dimensions
+    N,  # r
+    K,  # input_dim
+    # Strides
+    x_stride_0,
+    x_stride_1,
+    w_stride_0,
+    w_stride_1,
+    w_stride_2,
+    output_stride_0,
+    output_stride_1,
+    # Information on sequence lengths and weight id
+    seg_lens,
+    seg_indptr,
+    weight_indices,
+    # Meta parameters
+    BLOCK_S: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+):
+
+    # x: (s, K), s is the sum of sequence lengths
+    # weights: (num_lora, N, K)
+    # output: (s, N)
+
+    # Current block computes sequence with batch_id,
+    # which starts from row seg_start of x with length seg_len
+    batch_id = tl.program_id(axis=1)
+    pid = tl.program_id(axis=0)
+    seg_len = tl.load(seg_lens + batch_id)
+    w_index = tl.load(weight_indices + batch_id)
+    seg_start = tl.load(seg_indptr + batch_id)
+
+    # The tile in output matrix will have (pid_s, pid_n) as id
+    num_pid_n = tl.cdiv(N, BLOCK_N)
+    pid_s = pid // num_pid_n
+    pid_n = pid % num_pid_n
+
+    # Create pointers for the first block of x and weights[batch_id]
+    # The pointers will be advanced as we move in the K direction
+    # and accumulate
+    s_offset = tl.arange(0, BLOCK_S) + pid_s * BLOCK_S
+    n_offset = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
+    k_offset = tl.arange(0, BLOCK_K)
+    x_ptrs = (x + seg_start * x_stride_0) + (
+        s_offset[:, None] * x_stride_0 + k_offset[None, :] * x_stride_1
+    )
+    w_ptrs = (weights + w_index * w_stride_0) + (
+        k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
+    )
+
+    # Iterate to compute the block in output matrix
+    partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
+    for k in range(0, tl.cdiv(K, BLOCK_K)):
+        x_tile = tl.load(
+            x_ptrs,
+            mask=(s_offset[:, None] < seg_len)
+            and (k_offset[None, :] < K - k * BLOCK_K),
+            other=0.0,
+        )
+        w_tile = tl.load(
+            w_ptrs,
+            mask=(k_offset[:, None] < K - k * BLOCK_K) and (n_offset[None, :] < N),
+            other=0.0,
+        )
+        partial_sum += tl.dot(x_tile, w_tile)
+
+        x_ptrs += BLOCK_K * x_stride_1
+        w_ptrs += BLOCK_K * w_stride_2
+
+    # Store result to output matrix
+    partial_sum = partial_sum.to(x.dtype.element_ty)
+    output_ptr = (output + seg_start * output_stride_0) + (
+        s_offset[:, None] * output_stride_0 + n_offset[None, :] * output_stride_1
+    )
+    output_mask = (s_offset[:, None] < seg_len) and (n_offset[None, :] < N)
+    tl.store(output_ptr, partial_sum, mask=output_mask)


+def sgemm_lora_a_fwd(
+    x: torch.Tensor, weights: torch.Tensor, batch_info: LoraBatchInfo
+) -> torch.Tensor:
+    # x: (s, input_dim)
+    # weights: (num_lora, r, input_dim)
+    # output: (s, r)
+    # when called by run_qkv_lora, weights.shape[-2] will be 3 * r
+    # input_dim is much larger than r
+
+    assert x.is_contiguous()
+    assert weights.is_contiguous()
+    assert len(x.shape) == 2
+    assert len(weights.shape) == 3
+
+    S = x.shape[0]
+    R = weights.shape[-2]
+    K = weights.shape[-1]
+    assert x.shape[-1] == K
+
+    # Block shapes
+    BLOCK_S = 16
+    BLOCK_K = 256
+    BLOCK_R = 16
+
+    grid = (
+        triton.cdiv(batch_info.max_len, BLOCK_S) * triton.cdiv(R, BLOCK_R),
+        batch_info.bs,
+    )
+
+    output = torch.empty((S, R), device=x.device, dtype=x.dtype)
+    _sgemm_lora_a_kernel[grid](
+        x,
+        weights,
+        output,
+        R,
+        K,
+        x.stride(0),
+        x.stride(1),
+        weights.stride(0),
+        weights.stride(1),
+        weights.stride(2),
+        output.stride(0),
+        output.stride(1),
+        batch_info.seg_lens,
+        batch_info.seg_indptr,
+        batch_info.weight_indices,
+        BLOCK_S,
+        BLOCK_R,
+        BLOCK_K,
+    )
+    return output
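
For readers checking shapes: per segment, the kernel computes an ordinary x @ W.T with a per-sequence choice of adapter. A pure-PyTorch reference sketch (mine, not code from the package), handy for unit-testing the kernel on small inputs:

import torch

def sgemm_lora_a_ref(x, weights, seg_indptr, weight_indices):
    # x: (s, K); weights: (num_lora, N, K); returns (s, N).
    out = torch.empty(x.shape[0], weights.shape[-2], device=x.device, dtype=x.dtype)
    for i in range(len(weight_indices)):
        start, end = seg_indptr[i], seg_indptr[i + 1]
        # Rows of segment i are multiplied by the adapter chosen for sequence i.
        out[start:end] = x[start:end] @ weights[weight_indices[i]].transpose(0, 1)
    return out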
sglang/srt/lora/triton_ops/sgemm_lora_b.py (new file)
@@ -0,0 +1,159 @@
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.lora.lora import LoraBatchInfo
+
+
+@triton.jit
+def _sgemm_lora_b_kernel(
+    # Pointers to matrices
+    x,
+    weights,
+    output,
+    # Matrix dimensions
+    N,  # output_dim
+    K,  # r
+    # Strides
+    x_stride_0,
+    x_stride_1,
+    w_stride_0,
+    w_stride_1,
+    w_stride_2,
+    output_stride_0,
+    output_stride_1,
+    # Information on sequence lengths and weight id
+    seg_lens,
+    seg_indptr,
+    weight_indices,
+    # Meta parameters
+    BLOCK_S: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    # For fused output scaling and adding
+    fuse_scaling_add,
+    scaling,
+):
+    # x: (s, K), s is the sum of sequence lengths
+    # weights: (num_lora, N, K)
+    # output: (s, N)
+
+    # Current block computes sequence with batch_id,
+    # which starts from row seg_start of x with length seg_len
+    batch_id = tl.program_id(axis=1)
+    pid = tl.program_id(axis=0)
+    seg_len = tl.load(seg_lens + batch_id)
+    w_index = tl.load(weight_indices + batch_id)
+    seg_start = tl.load(seg_indptr + batch_id)
+
+    # The tile in output matrix will have (pid_s, pid_n) as id
+    num_pid_n = tl.cdiv(N, BLOCK_N)
+    pid_s = pid // num_pid_n
+    pid_n = pid % num_pid_n
+
+    # Create pointers for the first block of x and weights[batch_id]
+    # The pointers will be advanced as we move in the K direction
+    # and accumulate
+    s_offset = tl.arange(0, BLOCK_S) + pid_s * BLOCK_S
+    n_offset = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
+    k_offset = tl.arange(0, BLOCK_K)
+    x_ptrs = (x + seg_start * x_stride_0) + (
+        s_offset[:, None] * x_stride_0 + k_offset[None, :] * x_stride_1
+    )
+    w_ptrs = (weights + w_index * w_stride_0) + (
+        k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
+    )
+
+    # Iterate to compute the block in output matrix
+    partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
+    for k in range(0, tl.cdiv(K, BLOCK_K)):
+        x_tile = tl.load(
+            x_ptrs,
+            mask=(s_offset[:, None] < seg_len)
+            and (k_offset[None, :] < K - k * BLOCK_K),
+            other=0.0,
+        )
+        w_tile = tl.load(
+            w_ptrs,
+            mask=(k_offset[:, None] < K - k * BLOCK_K),
+            other=0.0,
+        )
+        partial_sum += tl.dot(x_tile, w_tile)
+
+        x_ptrs += BLOCK_K * x_stride_1
+        w_ptrs += BLOCK_K * w_stride_2
+
+    # Store result to output matrix
+    partial_sum *= scaling
+    partial_sum = partial_sum.to(x.dtype.element_ty)
+    output_ptr = (output + seg_start * output_stride_0) + (
+        s_offset[:, None] * output_stride_0 + n_offset[None, :] * output_stride_1
+    )
+    output_mask = s_offset[:, None] < seg_len
+    if fuse_scaling_add:
+        partial_sum += tl.load(output_ptr, mask=output_mask)
+    tl.store(output_ptr, partial_sum, mask=output_mask)


+def sgemm_lora_b_fwd(
+    x: torch.Tensor,
+    weights: torch.Tensor,
+    batch_info: LoraBatchInfo,
+    base_output: torch.Tensor = None,
+    scaling: float = 1.0,
+) -> torch.Tensor:
+    # x: (s, r)
+    # weights: (num_lora, output_dim, r)
+    # output: (s, output_dim)
+    # output_dim is much larger than r
+
+    assert x.is_contiguous()
+    assert weights.is_contiguous()
+    assert len(x.shape) == 2
+    assert len(weights.shape) == 3
+
+    S = x.shape[0]
+    N = weights.shape[-2]
+    R = weights.shape[-1]
+    assert x.shape[-1] == R
+
+    # Block shapes
+    BLOCK_S = 16
+    BLOCK_R = 16
+    BLOCK_N = 256
+
+    grid = (
+        triton.cdiv(batch_info.max_len, BLOCK_S) * triton.cdiv(N, BLOCK_N),
+        batch_info.bs,
+    )
+
+    if base_output is None:
+        output = torch.empty((S, N), device=x.device, dtype=x.dtype)
+        fuse_scaling_add = False
+    else:
+        output = base_output
+        fuse_scaling_add = True
+
+    _sgemm_lora_b_kernel[grid](
+        x,
+        weights,
+        output,
+        N,
+        R,
+        x.stride(0),
+        x.stride(1),
+        weights.stride(0),
+        weights.stride(1),
+        weights.stride(2),
+        output.stride(0),
+        output.stride(1),
+        batch_info.seg_lens,
+        batch_info.seg_indptr,
+        batch_info.weight_indices,
+        BLOCK_S,
+        BLOCK_N,
+        BLOCK_R,
+        fuse_scaling_add,
+        scaling,
+    )
+    return output
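
The only difference from the A-side wrapper is the fused epilogue: when base_output is passed, the kernel scales the gemm result and accumulates it into that tensor in place, saving a separate add pass. Reference semantics as a sketch (my code, same caveats as the reference above):

import torch

def sgemm_lora_b_ref(x, weights, seg_indptr, weight_indices, base_output=None, scaling=1.0):
    # x: (s, R); weights: (num_lora, N, R); returns (s, N).
    out = base_output if base_output is not None else torch.zeros(
        x.shape[0], weights.shape[-2], device=x.device, dtype=x.dtype
    )
    for i in range(len(weight_indices)):
        start, end = seg_indptr[i], seg_indptr[i + 1]
        # out <- out + scaling * x @ W.T; mutates base_output when given,
        # mirroring the kernel's fuse_scaling_add path.
        out[start:end] += scaling * (x[start:end] @ weights[weight_indices[i]].transpose(0, 1))
    return out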
sglang/srt/managers/image_processor.py
@@ -240,6 +240,7 @@ class MllamaImageProcessor(BaseImageProcessor):
 class MiniCPMVImageProcessor(BaseImageProcessor):
     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
+        self.IMAGE_TOKEN = "(<image>./</image>)"
 
     @staticmethod
     def _process_images_task(images, input_text):
@@ -271,7 +272,7 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
     async def process_images_async(
         self,
         image_data: List[Union[str, bytes]],
-        input_text,
+        input_ids,
         request_obj,
         max_req_input_len,
     ):
@@ -282,28 +283,49 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
             image_data = [image_data]
 
         image_hashes, image_sizes = [], []
-        raw_images = []
-        IMAGE_TOKEN = "(<image>./</image>)"
+        all_frames = []
 
-        # roughly calculate the max number of frames
-        # TODO: the process should be applied to all the visual inputs
+        # roughly calculate the max number of frames under the max_req_input_len limit
        def calculate_max_num_frames() -> int:
            # Model-specific
            NUM_TOKEN_PER_FRAME = 330
 
-            ret = (max_req_input_len - len(input_text)) // NUM_TOKEN_PER_FRAME
+            ret = (max_req_input_len - len(input_ids)) // NUM_TOKEN_PER_FRAME
            return min(ret, 100)
 
-        # if cuda OOM set a smaller number
         MAX_NUM_FRAMES = calculate_max_num_frames()
-        print(f"MAX_NUM_FRAMES: {MAX_NUM_FRAMES}")
 
-        def encode_video(video_path):
+        # print(f"MAX_NUM_FRAMES: {MAX_NUM_FRAMES}")
+
+        def get_estimated_frames_list():
+            """
+            estimate the total frame count from all visual input
+            """
+            # Before processing inputs
+            estimated_frames_list = []
+            for image in image_data:
+                if isinstance(image, str) and image.startswith("video:"):
+                    path = image[len("video:") :]
+                    # Estimate frames for the video
+                    vr = VideoReader(path, ctx=cpu(0))
+                    num_frames = len(vr)
+                else:
+                    # For images, each contributes one frame
+                    num_frames = 1
+                estimated_frames_list.append(num_frames)
+
+            return estimated_frames_list
+
+        estimated_frames_list = get_estimated_frames_list()
+        total_frame_count = sum(estimated_frames_list)
+        scaling_factor = min(1.0, MAX_NUM_FRAMES / total_frame_count)
+
+        def encode_video(video_path, frame_count_limit=None):
             if not os.path.exists(video_path):
                 logger.error(f"Video {video_path} does not exist")
                 return []
 
-            if MAX_NUM_FRAMES == 0:
+            if frame_count_limit == 0:
                 return []
 
             def uniform_sample(l, n):
@@ -314,45 +336,63 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
             vr = VideoReader(video_path, ctx=cpu(0))
             sample_fps = round(vr.get_avg_fps() / 1)  # FPS
             frame_idx = [i for i in range(0, len(vr), sample_fps)]
-            if len(frame_idx) > MAX_NUM_FRAMES:
-                frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
+            if frame_count_limit is not None and len(frame_idx) > frame_count_limit:
+                frame_idx = uniform_sample(frame_idx, frame_count_limit)
             frames = vr.get_batch(frame_idx).asnumpy()
             frames = [Image.fromarray(v.astype("uint8")) for v in frames]
             return frames
 
-        if isinstance(input_text, list):
-            assert len(input_text) and isinstance(input_text[0], int)
-            input_text = self._processor.tokenizer.decode(input_text)
-
+        if isinstance(input_ids, list):
+            assert len(input_ids) and isinstance(input_ids[0], int)
+            input_text = self._processor.tokenizer.decode(input_ids)
+        else:
+            input_text = input_ids
         # MiniCPMV requires each frame of video as a single image token
-        text_parts = input_text.split(IMAGE_TOKEN)
+        text_parts = input_text.split(self.IMAGE_TOKEN)
         new_text_parts = []
 
-        for image_index, image in enumerate(image_data):
-            try:
-                if isinstance(image, str) and image.startswith("video:"):
-                    path = image[len("video:") :]
-                    frames = encode_video(path)
-                else:
-                    raw_image, size = load_image(image)
-                    frames = [raw_image]
-                if len(frames) == 0:
-                    continue
-            except FileNotFoundError as e:
-                print(e)
-                return None
-
-            image_sizes += frames[0].size * len(frames)
-            image_hashes += [hash(image)] * len(frames)
-            raw_images += frames
+        # Process each input with allocated frames
+        for image_index, (image, estimated_frames) in enumerate(
+            zip(image_data, estimated_frames_list)
+        ):
+            if len(all_frames) >= MAX_NUM_FRAMES:
+                frames_to_process = 0
+            else:
+                frames_to_process = max(1, int(estimated_frames * scaling_factor))
+
+            if frames_to_process == 0:
+                frames = []
+            else:
+                try:
+                    if isinstance(image, str) and image.startswith("video:"):
+                        path = image[len("video:") :]
+                        frames = encode_video(path, frame_count_limit=frames_to_process)
+                    else:
+                        raw_image, _size = load_image(image)
+                        frames = [raw_image]
+                    if len(frames) == 0:
+                        continue
+                except FileNotFoundError as e:
+                    print(e)
+                    return None
+                image_sizes += frames[0].size * len(frames)
+                image_hashes += [hash(image)] * len(frames)
+                all_frames += frames
+
+            assert frames_to_process == len(frames)
+
             new_text_parts.append(text_parts[image_index])
-            new_text_parts.append(IMAGE_TOKEN * len(frames))
+
+            if frames_to_process != 0:
+                new_text_parts.append(self.IMAGE_TOKEN * len(frames))
 
         new_text_parts.append(text_parts[-1])
+
        input_text = "".join(new_text_parts)
-        if len(raw_images) == 0:
+
+        if len(all_frames) == 0:
             return None
-        res = await self._process_images(images=raw_images, input_text=input_text)
+        res = await self._process_images(images=all_frames, input_text=input_text)
         pixel_values = res["pixel_values"]
         tgt_sizes = res["tgt_sizes"]
         input_ids = res["input_ids"]
@@ -364,7 +404,6 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
         if tokenizer.slice_start_id:
             slice_start_id = [tokenizer.slice_start_id]
             slice_end_id = [tokenizer.slice_end_id]
-
         return {
             "input_ids": input_ids.flatten().tolist(),
             "pixel_values": pixel_values,
sglang/srt/managers/scheduler.py
@@ -149,6 +149,7 @@ class Scheduler:
             if not self.spec_algorithm.is_none()
             else 1
         )
+        self.enable_hierarchical_cache = server_args.enable_hierarchical_cache
 
         # Distributed rank info
         self.dp_size = server_args.dp_size
@@ -831,10 +832,16 @@ class Scheduler:
         available_size = (
             self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size()
         )
-        if available_size != self.max_total_num_tokens:
+        protected_size = self.tree_cache.protected_size()
+        memory_leak = available_size != (
+            self.max_total_num_tokens
+            if not self.enable_hierarchical_cache
+            else self.max_total_num_tokens - protected_size
+        )
+        if memory_leak:
             msg = (
                 "KV cache pool leak detected!"
-                f"{available_size=}, {self.max_total_num_tokens=}\n"
+                f"{available_size=}, {protected_size=}, {self.max_total_num_tokens=}\n"
             )
             warnings.warn(msg)
             if crash_on_warnings():
@@ -949,7 +956,14 @@ class Scheduler:
             res = adder.add_one_req(req)
             if res != AddReqResult.CONTINUE:
                 if res == AddReqResult.NO_TOKEN:
-                    self.batch_is_full = True
+                    if self.enable_hierarchical_cache:
+                        # Set batch_is_full after making sure there are requests that can be served
+                        self.batch_is_full = len(adder.can_run_list) > 0 or (
+                            self.running_batch is not None
+                            and not self.running_batch.is_empty()
+                        )
+                    else:
+                        self.batch_is_full = True
                 break
             if self.server_args.prefill_only_one_req:
                 break
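
The reworked leak check encodes a single invariant: every KV page must be either free, evictable from the radix tree, or (when the hierarchical cache is enabled) protected by it. As a standalone sketch of that invariant (a hypothetical helper, not actual scheduler code):

def kv_pool_leaked(available_size: int, protected_size: int,
                   max_total_num_tokens: int, hierarchical_cache: bool) -> bool:
    # Protected pages are pinned by the hierarchical cache, so they are
    # legitimately neither free nor evictable; subtract them before comparing.
    expected = max_total_num_tokens - (protected_size if hierarchical_cache else 0)
    return available_size != expected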