sglang 0.4.9.post4__py3-none-any.whl → 0.4.9.post5__py3-none-any.whl

This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (84)
  1. sglang/lang/chat_template.py +21 -0
  2. sglang/srt/configs/internvl.py +3 -0
  3. sglang/srt/configs/model_config.py +4 -0
  4. sglang/srt/constrained/base_grammar_backend.py +10 -2
  5. sglang/srt/constrained/xgrammar_backend.py +7 -5
  6. sglang/srt/conversation.py +16 -1
  7. sglang/srt/debug_utils/__init__.py +0 -0
  8. sglang/srt/debug_utils/dump_comparator.py +131 -0
  9. sglang/srt/debug_utils/dumper.py +108 -0
  10. sglang/srt/debug_utils/text_comparator.py +172 -0
  11. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +13 -1
  12. sglang/srt/disaggregation/mooncake/conn.py +16 -0
  13. sglang/srt/disaggregation/prefill.py +13 -1
  14. sglang/srt/entrypoints/engine.py +4 -2
  15. sglang/srt/entrypoints/openai/serving_chat.py +132 -79
  16. sglang/srt/function_call/ebnf_composer.py +10 -3
  17. sglang/srt/function_call/function_call_parser.py +2 -0
  18. sglang/srt/function_call/glm4_moe_detector.py +164 -0
  19. sglang/srt/function_call/qwen3_coder_detector.py +1 -0
  20. sglang/srt/layers/attention/hybrid_attn_backend.py +100 -0
  21. sglang/srt/layers/attention/vision.py +56 -8
  22. sglang/srt/layers/layernorm.py +26 -1
  23. sglang/srt/layers/logits_processor.py +14 -3
  24. sglang/srt/layers/moe/ep_moe/layer.py +172 -206
  25. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  26. sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -48
  27. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +11 -8
  28. sglang/srt/layers/moe/topk.py +84 -22
  29. sglang/srt/layers/multimodal.py +11 -8
  30. sglang/srt/layers/quantization/fp8.py +25 -247
  31. sglang/srt/layers/quantization/fp8_kernel.py +78 -48
  32. sglang/srt/layers/quantization/modelopt_quant.py +25 -10
  33. sglang/srt/layers/quantization/unquant.py +24 -76
  34. sglang/srt/layers/quantization/w4afp8.py +68 -17
  35. sglang/srt/lora/lora_registry.py +93 -29
  36. sglang/srt/managers/cache_controller.py +9 -7
  37. sglang/srt/managers/mm_utils.py +154 -35
  38. sglang/srt/managers/multimodal_processor.py +3 -14
  39. sglang/srt/managers/schedule_batch.py +14 -8
  40. sglang/srt/managers/scheduler.py +35 -1
  41. sglang/srt/managers/tokenizer_manager.py +37 -6
  42. sglang/srt/managers/tp_worker.py +3 -0
  43. sglang/srt/mem_cache/hiradix_cache.py +5 -2
  44. sglang/srt/model_executor/model_runner.py +68 -14
  45. sglang/srt/models/deepseek_v2.py +62 -28
  46. sglang/srt/models/glm4_moe.py +1035 -0
  47. sglang/srt/models/glm4_moe_nextn.py +167 -0
  48. sglang/srt/models/interns1.py +328 -0
  49. sglang/srt/models/internvl.py +143 -47
  50. sglang/srt/models/llava.py +9 -5
  51. sglang/srt/models/minicpmo.py +4 -1
  52. sglang/srt/models/qwen2_moe.py +2 -2
  53. sglang/srt/models/qwen3_moe.py +5 -2
  54. sglang/srt/multimodal/processors/base_processor.py +20 -6
  55. sglang/srt/multimodal/processors/clip.py +2 -2
  56. sglang/srt/multimodal/processors/deepseek_vl_v2.py +2 -2
  57. sglang/srt/multimodal/processors/gemma3.py +2 -2
  58. sglang/srt/multimodal/processors/gemma3n.py +2 -2
  59. sglang/srt/multimodal/processors/internvl.py +21 -8
  60. sglang/srt/multimodal/processors/janus_pro.py +2 -2
  61. sglang/srt/multimodal/processors/kimi_vl.py +2 -2
  62. sglang/srt/multimodal/processors/llava.py +4 -4
  63. sglang/srt/multimodal/processors/minicpm.py +2 -3
  64. sglang/srt/multimodal/processors/mlama.py +2 -2
  65. sglang/srt/multimodal/processors/mllama4.py +18 -111
  66. sglang/srt/multimodal/processors/phi4mm.py +2 -2
  67. sglang/srt/multimodal/processors/pixtral.py +2 -2
  68. sglang/srt/multimodal/processors/qwen_audio.py +2 -2
  69. sglang/srt/multimodal/processors/qwen_vl.py +2 -2
  70. sglang/srt/multimodal/processors/vila.py +3 -1
  71. sglang/srt/reasoning_parser.py +2 -1
  72. sglang/srt/server_args.py +57 -6
  73. sglang/srt/utils.py +96 -1
  74. sglang/srt/weight_sync/utils.py +119 -0
  75. sglang/test/runners.py +4 -0
  76. sglang/test/test_utils.py +65 -5
  77. sglang/utils.py +19 -0
  78. sglang/version.py +1 -1
  79. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/METADATA +4 -4
  80. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/RECORD +83 -73
  81. sglang/srt/debug_utils.py +0 -74
  82. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/WHEEL +0 -0
  83. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/licenses/LICENSE +0 -0
  84. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/top_level.txt +0 -0

sglang/srt/layers/quantization/unquant.py

@@ -24,6 +24,7 @@ from sglang.srt.utils import (
 )

 if TYPE_CHECKING:
+    from sglang.srt.layers.moe.ep_moe.layer import EPMoE
     from sglang.srt.layers.moe.topk import TopKOutput

 has_triton_kernels = importlib.util.find_spec("triton_kernels") is not None
@@ -129,6 +130,14 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         super().__init__()
         self.use_triton_kernels = use_triton_kernels

+        self.triton_kernel_moe_forward = None
+        if torch.cuda.is_available() and has_triton_kernels:
+            from sglang.srt.layers.moe.fused_moe_triton.triton_kernels_moe import (
+                triton_kernel_moe_forward as _tk_forward,
+            )
+
+            self.triton_kernel_moe_forward = _tk_forward
+
     def create_weights(
         self,
         layer: torch.nn.Module,
@@ -194,6 +203,15 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         no_combine: bool = False,
         routed_scaling_factor: Optional[float] = None,
     ) -> torch.Tensor:
+
+        from sglang.srt.layers.moe.ep_moe.layer import EPMoE
+
+        if isinstance(layer, EPMoE):
+            return layer.run_moe(
+                hidden_states=x,
+                topk_output=topk_output,
+            )
+
         return self.forward(
             x=x,
             layer=layer,
@@ -219,16 +237,12 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
     ) -> torch.Tensor:

         if self.use_triton_kernels:
-            # TODO(ch-wan): re-enable the Triton kernel
-            raise NotImplementedError("The Triton kernel is temporarily disabled.")
-            # return triton_kernel_moe_forward(
-            #     hidden_states=x,
-            #     w1=layer.w13_weight,
-            #     w2=layer.w2_weight,
-            #     gating_output=router_logits,
-            #     topk=top_k,
-            #     renormalize=renormalize,
-            # )
+            return self.triton_kernel_moe_forward(
+                hidden_states=x,
+                w1=layer.w13_weight,
+                w2=layer.w2_weight,
+                topk_output=topk_output,
+            )
         else:
             if _use_aiter:
                 assert not no_combine, "unsupported"
@@ -354,69 +368,3 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             raise NotImplementedError("The TPU backend currently does not support MoE.")

     forward_native = forward_cpu
-
-
-class UnquantizedEPMoEMethod(FusedMoEMethodBase, CustomOp):
-
-    def create_weights(
-        self,
-        layer: torch.nn.Module,
-        num_experts_per_partition: int,
-        hidden_size: int,
-        intermediate_size: int,
-        params_dtype: torch.dtype,
-        **extra_weight_attrs,
-    ):
-        # Fused gate_up_proj (column parallel)
-        w13_weight = torch.nn.Parameter(
-            torch.empty(
-                num_experts_per_partition,
-                2 * intermediate_size,
-                hidden_size,
-                dtype=params_dtype,
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w13_weight", w13_weight)
-        set_weight_attrs(w13_weight, extra_weight_attrs)
-
-        # down_proj (row parallel)
-        w2_weight = torch.nn.Parameter(
-            torch.empty(
-                num_experts_per_partition,
-                hidden_size,
-                intermediate_size,
-                dtype=params_dtype,
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w2_weight", w2_weight)
-        set_weight_attrs(w2_weight, extra_weight_attrs)
-
-        # scale
-        layer.register_parameter("w13_input_scale", None)
-        layer.register_parameter("w13_weight_scale", None)
-
-        ones_tensor = torch.ones(num_experts_per_partition, dtype=torch.float32)
-
-        w2_input_scale = torch.nn.Parameter(
-            ones_tensor,
-            requires_grad=False,
-        )
-        layer.register_parameter("w2_input_scale", w2_input_scale)
-        set_weight_attrs(w2_input_scale, extra_weight_attrs)
-
-        w2_weight_scale = torch.nn.Parameter(
-            ones_tensor,
-            requires_grad=False,
-        )
-        layer.register_parameter("w2_weight_scale", w2_weight_scale)
-        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
-
-    def apply(
-        self,
-        layer: torch.nn.Module,
-        hidden_states: torch.Tensor,
-        topk_output: TopKOutput,
-    ) -> torch.Tensor:
-        raise NotImplementedError
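
The net effect of the unquant.py change is that the generic unquantized MoE method no longer carries its own expert-parallel implementation (the removed UnquantizedEPMoEMethod); instead, apply() detects an expert-parallel layer at call time and hands execution back to it. Below is a minimal, self-contained sketch of that dispatch shape; EPMoELike, UnquantizedMoEMethodSketch, and the body of run_moe are illustrative stand-ins, not sglang APIs.

import torch

class EPMoELike(torch.nn.Module):
    # Stand-in for sglang's EPMoE: the layer owns its expert-parallel forward path.
    def run_moe(self, hidden_states: torch.Tensor, topk_output) -> torch.Tensor:
        return hidden_states  # placeholder so the sketch runs end to end

class UnquantizedMoEMethodSketch:
    def apply(self, layer: torch.nn.Module, x: torch.Tensor, topk_output) -> torch.Tensor:
        if isinstance(layer, EPMoELike):
            # Expert-parallel layers execute their own MoE kernel.
            return layer.run_moe(hidden_states=x, topk_output=topk_output)
        # Tensor-parallel layers fall through to the fused kernel path.
        return x

method = UnquantizedMoEMethodSketch()
out = method.apply(EPMoELike(), torch.randn(4, 16), topk_output=None)
print(out.shape)  # torch.Size([4, 16])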

sglang/srt/layers/quantization/w4afp8.py

@@ -1,7 +1,7 @@
 from __future__ import annotations

 import logging
-from typing import Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Optional

 import torch
 from torch.nn import Module
@@ -17,6 +17,9 @@ from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
 from sglang.srt.layers.quantization.utils import is_layer_skipped
 from sglang.srt.utils import set_weight_attrs

+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.ep_moe.layer import EPMoE, TopKOutput
+
 ACTIVATION_SCHEMES = ["static", "dynamic"]

 logger = logging.getLogger(__name__)
@@ -84,13 +87,14 @@ class W4AFp8Config(QuantizationConfig):
         self, layer: torch.nn.Module, prefix: str
     ) -> Optional[QuantizeMethodBase]:
         from sglang.srt.layers.linear import LinearBase
+        from sglang.srt.layers.moe.ep_moe.layer import EPMoE
         from sglang.srt.layers.moe.fused_moe_triton import FusedMoE

         if isinstance(layer, LinearBase):
             if is_layer_skipped(prefix, self.ignored_layers):
                 return UnquantizedLinearMethod()
             return Fp8LinearMethod(self)
-        elif isinstance(layer, FusedMoE):
+        elif isinstance(layer, EPMoE):
             return W4AFp8MoEMethod(self)
         return None

@@ -105,8 +109,8 @@ class W4AFp8MoEMethod(FusedMoEMethodBase):

     def create_weights(
         self,
-        layer: Module,
-        num_experts_per_partition: int,
+        layer: EPMoE,
+        num_experts: int,
         hidden_size: int,
         intermediate_size: int,
         params_dtype: torch.dtype,
@@ -117,7 +121,7 @@ class W4AFp8MoEMethod(FusedMoEMethodBase):
         # Fused gate_up_proj (column parallel)
         w13_weight = torch.nn.Parameter(
             torch.empty(
-                num_experts_per_partition,
+                num_experts,
                 intermediate_size * 2,
                 hidden_size // 2,
                 dtype=torch.int8,
@@ -130,7 +134,7 @@ class W4AFp8MoEMethod(FusedMoEMethodBase):
         # down_proj (row parallel)
         w2_weight = torch.nn.Parameter(
             torch.empty(
-                num_experts_per_partition,
+                num_experts,
                 hidden_size,
                 intermediate_size // 2,
                 dtype=torch.int8,
@@ -142,7 +146,7 @@ class W4AFp8MoEMethod(FusedMoEMethodBase):

         w13_weight_scale = torch.nn.Parameter(
             torch.zeros(
-                num_experts_per_partition,
+                num_experts,
                 2 * intermediate_size,
                 hidden_size // self.quant_config.group_size,
                 dtype=torch.float32,
@@ -154,7 +158,7 @@ class W4AFp8MoEMethod(FusedMoEMethodBase):

         w2_weight_scale = torch.nn.Parameter(
             torch.zeros(
-                num_experts_per_partition,
+                num_experts,
                 hidden_size,
                 intermediate_size // self.quant_config.group_size,
                 dtype=torch.float32,
@@ -166,14 +170,14 @@ class W4AFp8MoEMethod(FusedMoEMethodBase):

         # Input scales
         w13_input_scale = torch.nn.Parameter(
-            torch.ones((num_experts_per_partition, 2), dtype=torch.bfloat16),
+            torch.ones((num_experts, 2), dtype=torch.bfloat16),
             requires_grad=False,
         )
         layer.register_parameter("w13_input_scale", w13_input_scale)
         set_weight_attrs(w13_input_scale, extra_weight_attrs)

         w2_input_scale = torch.nn.Parameter(
-            torch.ones(num_experts_per_partition, dtype=torch.bfloat16),
+            torch.ones(num_experts, dtype=torch.bfloat16),
             requires_grad=False,
         )
         layer.register_parameter("w2_input_scale", w2_input_scale)
@@ -183,25 +187,25 @@ class W4AFp8MoEMethod(FusedMoEMethodBase):
         device = layer.w13_weight.device

         self.a_strides1 = torch.full(
-            (num_experts_per_partition, 3),
+            (num_experts, 3),
             hidden_size,
             device=device,
             dtype=torch.int64,
         )
         self.c_strides1 = torch.full(
-            (num_experts_per_partition, 3),
+            (num_experts, 3),
             2 * intermediate_size,
             device=device,
             dtype=torch.int64,
         )
         self.a_strides2 = torch.full(
-            (num_experts_per_partition, 3),
+            (num_experts, 3),
             intermediate_size,
             device=device,
             dtype=torch.int64,
         )
         self.c_strides2 = torch.full(
-            (num_experts_per_partition, 3),
+            (num_experts, 3),
             hidden_size,
             device=device,
             dtype=torch.int64,
@@ -212,13 +216,13 @@ class W4AFp8MoEMethod(FusedMoEMethodBase):
         self.s_strides2 = self.c_strides2

         self.expert_offsets = torch.empty(
-            (num_experts_per_partition + 1), dtype=torch.int32, device=device
+            (num_experts + 1), dtype=torch.int32, device=device
         )
         self.problem_sizes1 = torch.empty(
-            (num_experts_per_partition, 3), dtype=torch.int32, device=device
+            (num_experts, 3), dtype=torch.int32, device=device
         )
         self.problem_sizes2 = torch.empty(
-            (num_experts_per_partition, 3), dtype=torch.int32, device=device
+            (num_experts, 3), dtype=torch.int32, device=device
         )

         return
@@ -266,3 +270,50 @@ class W4AFp8MoEMethod(FusedMoEMethodBase):
             [w2_input_scale_max], dtype=dtype, device=device
         )
         layer.w2_input_scale = Parameter(new_w2_input_scale, requires_grad=False)
+
+    def apply(
+        self,
+        layer: EPMoE,
+        hidden_states: torch.Tensor,
+        topk_output: TopKOutput,
+    ) -> torch.Tensor:
+
+        # TODO(ch-wan): move it out of this class
+        from sglang.srt.layers.moe.cutlass_w4a8_moe import cutlass_w4a8_moe
+
+        topk_ids, topk_weights, _ = topk_output
+        local_topk_ids = topk_ids
+        if layer.expert_map is not None:
+            "Translate info from expert_map to topk_ids"
+            local_topk_ids = torch.where(
+                layer.expert_map[topk_ids] != layer.num_experts,
+                layer.expert_map[topk_ids],
+                layer.num_experts,
+            )
+
+        return cutlass_w4a8_moe(
+            layer.start_expert_id,
+            layer.end_expert_id,
+            layer.num_experts,
+            hidden_states,
+            layer.w13_weight,
+            layer.w2_weight,
+            layer.w13_weight_scale_inv,
+            layer.w2_weight_scale_inv,
+            topk_weights,
+            topk_ids,
+            local_topk_ids,
+            self.a_strides1,
+            self.b_strides1,
+            self.c_strides1,
+            self.a_strides2,
+            self.b_strides2,
+            self.c_strides2,
+            self.s_strides13,
+            self.s_strides2,
+            self.expert_offsets,
+            self.problem_sizes1,
+            self.problem_sizes2,
+            layer.w13_input_scale,
+            layer.w2_input_scale,
+        )
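
The new W4AFp8MoEMethod.apply remaps the router's global top-k expert ids through layer.expert_map before calling the CUTLASS kernel, so ids of experts hosted on other ranks collapse to a sentinel value. A toy illustration of that torch.where remapping, with made-up shapes, names, and values that mirror the pattern above but are not taken from sglang:

import torch

num_local_experts = 4  # also used here as the "not on this rank" sentinel id
expert_map = torch.tensor([0, 1, 2, 3, 4, 4, 4, 4])  # global expert id -> local id
topk_ids = torch.tensor([[0, 5], [3, 7]])            # global ids picked by the router

local_topk_ids = torch.where(
    expert_map[topk_ids] != num_local_experts,  # expert lives on this rank
    expert_map[topk_ids],                       # keep its local id
    num_local_experts,                          # otherwise the sentinel
)
print(local_topk_ids)  # tensor([[0, 4], [3, 4]])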

sglang/srt/lora/lora_registry.py

@@ -14,12 +14,16 @@


 import asyncio
+from collections import defaultdict
 from dataclasses import dataclass, field, fields
 from typing import Dict, List, Optional, Union
 from uuid import uuid4

+from sglang.srt.aio_rwlock import RWLock
+from sglang.srt.utils import ConcurrentCounter

-@dataclass(frozen=True, slots=True)
+
+@dataclass(frozen=True)
 class LoRARef:
     """
     Reference record for a LoRA model.
@@ -48,10 +52,11 @@ class LoRARef:

 class LoRARegistry:
     """
-    The central registry to keep track of available LoRA adapters.
+    The central registry to keep track of available LoRA adapters and ongoing LoRA requests.

-    TODO (lifuhuang): This registry is intended as the foundation for overlapped lora update. We decided
-    to keep it in a separate PR to keep code review simple and to unblock the radix cache work.
+    The `LoRARegistry` resides in the tokenizer manager process and acts as the single source of truth for all
+    available LoRA adapters. It supports concurrent inference and dynamic adapter updates through a two-phase
+    update / eventual consistency model between the tokenizer manager process and the scheduler processes.
     """

     def __init__(self, lora_paths: Optional[Dict[str, LoRARef]] = None):
@@ -62,8 +67,19 @@ class LoRARegistry:
                 "Please file an issue if you see this error."
             )

+        # A read-write lock to ensure adapters loading / unloading operations are exclusive.
+        # Please note that the counter increment/decrement operations are not synchronized through this
+        # lock, as they are designed to be non-blocking and can be performed concurrently.
+        self._registry_lock = RWLock()
         # A dictionary to hold LoRARef objects, mapping from LoRA name to LoRARef.
-        self._registry: Dict[str, LoRARef] = dict(lora_paths or {})
+        self._registry: Dict[str, LoRARef] = {}
+        # Counters for ongoing requests, mapping from LoRA ID to ConcurrentCounter.
+        self._counters: Dict[str, ConcurrentCounter] = {}
+
+        # Initialize the registry with provided LoRA paths, if present.
+        if lora_paths:
+            for lora_ref in lora_paths.values():
+                self._register_adapter(lora_ref)

     async def register(self, lora_ref: LoRARef):
         """
@@ -72,11 +88,8 @@ class LoRARegistry:
         Args:
             lora_ref (LoRARef): The LoRARef object to register.
         """
-        if lora_ref.lora_name in self._registry:
-            raise ValueError(
-                f"LoRA with name {lora_ref.lora_name} already exists. Loaded LoRAs: {self._registry.keys()}"
-            )
-        self._registry[lora_ref.lora_name] = lora_ref
+        async with self._registry_lock.writer_lock:
+            self._register_adapter(lora_ref)

     async def unregister(self, lora_name: str) -> str:
         """
@@ -85,12 +98,14 @@ class LoRARegistry:
         Args:
             lora_name (str): The name of the LoRA model to unregister.
         """
-        lora_ref = self._registry.get(lora_name, None)
-        if lora_ref is None:
-            raise ValueError(
-                f"LoRA with name {lora_name} does not exist. Loaded LoRAs: {self._registry.keys()}"
-            )
-        del self._registry[lora_name]
+        async with self._registry_lock.writer_lock:
+            lora_ref = self._registry.get(lora_name, None)
+            if lora_ref is None:
+                raise ValueError(
+                    f"LoRA with name {lora_name} does not exist. Loaded LoRAs: {self._registry.keys()}"
+                )
+            del self._registry[lora_name]
+            del self._counters[lora_ref.lora_id]

         return lora_ref.lora_id

@@ -98,27 +113,76 @@ class LoRARegistry:
         """
         Queries registry for LoRA IDs based on LoRA names and start tracking the usage of the corresponding LoRA adapters
         by incrementing its counter.
-
-        TODO (lifuhuang): currently it only queries the registry and does not track the usage of LoRA adapters.
         """

-        async def _acquire_single(name: str) -> str:
+        def _lookup(name: str) -> str:
             lora_ref = self._registry.get(name, None)
             if lora_ref is None:
                 raise ValueError(
                     f"The following requested LoRA adapters are not loaded: {name}\n"
                     f"Loaded adapters: {self._registry.keys()}."
                 )
-            # await self._counters[lora_ref.lora_id].increment()
             return lora_ref.lora_id

-        if isinstance(lora_name, str):
-            lora_id = await _acquire_single(lora_name)
-            return lora_id
-        elif isinstance(lora_name, list):
-            lora_ids = await asyncio.gather(
-                *[_acquire_single(name) for name in lora_name]
+        async with self._registry_lock.reader_lock:
+            if isinstance(lora_name, str):
+                lora_id = _lookup(lora_name)
+                await self._counters[lora_id].increment(notify_all=False)
+                return lora_id
+            elif isinstance(lora_name, list):
+                lora_ids = [_lookup(name) for name in lora_name]
+
+                # Increment the counters only after all IDs are looked up.
+                await asyncio.gather(
+                    *[self._counters[id].increment(notify_all=False) for id in lora_ids]
+                )
+                return lora_ids
+            else:
+                raise TypeError(
+                    "lora_name must be either a string or a list of strings."
+                )
+
+    async def release(self, lora_id: Union[str, List[str]]):
+        """
+        Decrements the usage counter for a LoRA adapter, indicating that it is no longer in use.
+        """
+
+        async with self._registry_lock.reader_lock:
+            if isinstance(lora_id, str):
+                await self._counters[lora_id].decrement()
+            elif isinstance(lora_id, list):
+                await asyncio.gather(
+                    *[self._counters[id].decrement() for id in lora_id]
+                )
+            else:
+                raise TypeError("lora_id must be either a string or a list of strings.")
+
+    async def wait_for_unload(self, lora_id: str):
+        """
+        Waits until the usage counter for a LoRA adapter reaches zero, indicating that it is no longer in use.
+        This is useful for ensuring that a LoRA adapter can be safely unloaded.
+
+        This method itself is not synchronized, which is safe because it should only be called during LoRA unloading,
+        which itself is guaranteed to be sequential.
+        """
+        assert (
+            lora_id not in self._registry
+        ), "wait_for_unload should only be called after the LoRA adapter has been unregistered. "
+        counter = self._counters.get(lora_id)
+        if counter:
+            # Wait until no requests are using this LoRA adapter.
+            await counter.wait_for_zero()
+            del self._counters[lora_id]
+
+    def _register_adapter(self, lora_ref: LoRARef):
+        """
+        Internal helper method to register a LoRA adapter.
+        """
+
+        if lora_ref.lora_name in self._registry:
+            raise ValueError(
+                f"LoRA with name {lora_ref.lora_name} already exists. Loaded LoRAs: {self._registry.keys()}"
            )
-            return lora_ids
-        else:
-            raise TypeError("lora_name must be either a string or a list of strings.")
+        self._registry[lora_ref.lora_name] = lora_ref
+        self._counters[lora_ref.lora_id] = ConcurrentCounter()
+        return lora_ref
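
Taken together, the registry changes give each request an acquire/release lifecycle around its adapters, while unloading waits for in-flight requests to drain. The sketch below shows one plausible caller-side ordering; it assumes the query-and-increment method shown above is named acquire (its signature falls outside this excerpt), and generate_fn / notify_schedulers are hypothetical callables, not sglang APIs.

async def run_request(registry, lora_name: str, generate_fn):
    # Resolve the adapter name to a stable ID and bump its in-flight counter.
    lora_id = await registry.acquire(lora_name)
    try:
        return await generate_fn(lora_id)
    finally:
        # Always release, so a pending unload can observe the counter reach zero.
        await registry.release(lora_id)

async def unload_adapter(registry, lora_name: str, notify_schedulers):
    # Unregister first so no new request can acquire the adapter, then wait for
    # in-flight requests to drain before telling the scheduler processes to drop it.
    lora_id = await registry.unregister(lora_name)
    await registry.wait_for_unload(lora_id)
    await notify_schedulers(lora_id)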

sglang/srt/managers/cache_controller.py

@@ -201,8 +201,9 @@ class PrefetchOperation(StorageOperation):
     def increment(self, num_tokens: int):
         with self._lock:
             if self._done_flag:
-                return
+                return False
             self.completed_tokens += num_tokens
+            return True

     def mark_done(self):
         with self._lock:
@@ -528,12 +529,12 @@ class HiCacheController:
                        f"Prefetch operation {operation.request_id} failed to retrieve page {h}."
                    )
                    break
-                self.mem_pool_host.set_from_flat_data_page(
-                    operation.host_indices[operation.completed_tokens],
-                    page_data,
-                )
-                operation.increment(self.page_size)
-                if operation.is_done():
+                if operation.increment(self.page_size):
+                    self.mem_pool_host.set_from_flat_data_page(
+                        operation.host_indices[operation.completed_tokens],
+                        page_data,
+                    )
+                else:
                     # operation terminated by controller, release pre-allocated memory
                     self.mem_pool_host.free(
                         operation.host_indices[operation.completed_tokens :]
@@ -589,6 +590,7 @@ class HiCacheController:
             if storage_hit_count < self.prefetch_threshold:
                 # not to prefetch if not enough benefits
                 self.prefetch_revoke_queue.put(operation.request_id)
+                self.mem_pool_host.free(operation.host_indices)
                 logger.debug(
                     f"Revoking prefetch for request {operation.request_id} due to insufficient hits ({storage_hit_count})."
                 )
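
The cache_controller.py change turns increment() into a claim: it returns False once the operation has been marked done, so the prefetch worker only writes a fetched page into host memory when its claim succeeds, and otherwise frees the pre-allocated slots. A small self-contained sketch of that pattern (PrefetchProgress is a hypothetical stand-in for PrefetchOperation, not sglang code):

import threading

class PrefetchProgress:
    def __init__(self):
        self._lock = threading.Lock()
        self._done = False
        self.completed_tokens = 0

    def increment(self, num_tokens: int) -> bool:
        # Atomically claim progress; refuse once the operation was terminated.
        with self._lock:
            if self._done:
                return False
            self.completed_tokens += num_tokens
            return True

    def mark_done(self):
        with self._lock:
            self._done = True

progress = PrefetchProgress()
print(progress.increment(64))  # True: safe to copy this page into host memory
progress.mark_done()           # controller terminates the prefetch
print(progress.increment(64))  # False: drop the late page and free its host slots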