sglang 0.5.1.post1__py3-none-any.whl → 0.5.1.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. sglang/bench_one_batch_server.py +79 -53
  2. sglang/bench_serving.py +186 -14
  3. sglang/profiler.py +0 -1
  4. sglang/srt/conversation.py +38 -5
  5. sglang/srt/disaggregation/decode.py +4 -0
  6. sglang/srt/disaggregation/prefill.py +4 -0
  7. sglang/srt/entrypoints/engine.py +2 -2
  8. sglang/srt/entrypoints/openai/protocol.py +27 -24
  9. sglang/srt/entrypoints/openai/serving_chat.py +50 -9
  10. sglang/srt/entrypoints/openai/serving_completions.py +15 -0
  11. sglang/srt/entrypoints/tool.py +7 -7
  12. sglang/srt/function_call/deepseekv31_detector.py +222 -0
  13. sglang/srt/function_call/function_call_parser.py +2 -0
  14. sglang/srt/function_call/gpt_oss_detector.py +144 -256
  15. sglang/srt/harmony_parser.py +588 -0
  16. sglang/srt/hf_transformers_utils.py +16 -7
  17. sglang/srt/layers/attention/ascend_backend.py +218 -111
  18. sglang/srt/layers/attention/flashattention_backend.py +241 -7
  19. sglang/srt/layers/attention/flashinfer_backend.py +5 -2
  20. sglang/srt/layers/attention/flashinfer_mla_backend.py +76 -91
  21. sglang/srt/layers/attention/utils.py +15 -94
  22. sglang/srt/layers/communicator.py +1 -2
  23. sglang/srt/layers/moe/cutlass_moe.py +0 -15
  24. sglang/srt/layers/moe/ep_moe/layer.py +1 -7
  25. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  26. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  27. sglang/srt/layers/moe/topk.py +1 -1
  28. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +133 -235
  29. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -7
  30. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +5 -23
  31. sglang/srt/layers/quantization/fp8.py +2 -1
  32. sglang/srt/layers/quantization/fp8_kernel.py +2 -2
  33. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  34. sglang/srt/layers/quantization/modelopt_quant.py +2 -2
  35. sglang/srt/layers/quantization/mxfp4.py +16 -23
  36. sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
  37. sglang/srt/layers/utils.py +0 -14
  38. sglang/srt/lora/lora_manager.py +29 -12
  39. sglang/srt/managers/cache_controller.py +223 -156
  40. sglang/srt/managers/detokenizer_manager.py +5 -0
  41. sglang/srt/managers/io_struct.py +30 -0
  42. sglang/srt/managers/scheduler.py +58 -7
  43. sglang/srt/managers/scheduler_metrics_mixin.py +15 -0
  44. sglang/srt/managers/tokenizer_manager.py +36 -3
  45. sglang/srt/mem_cache/hicache_storage.py +31 -20
  46. sglang/srt/mem_cache/hiradix_cache.py +12 -3
  47. sglang/srt/mem_cache/memory_pool.py +73 -14
  48. sglang/srt/mem_cache/memory_pool_host.py +3 -2
  49. sglang/srt/mem_cache/radix_cache.py +1 -0
  50. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +5 -13
  51. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +85 -81
  52. sglang/srt/metrics/collector.py +5 -5
  53. sglang/srt/model_executor/cuda_graph_runner.py +2 -2
  54. sglang/srt/model_executor/model_runner.py +1 -1
  55. sglang/srt/models/deepseek_v2.py +12 -3
  56. sglang/srt/models/gpt_oss.py +2 -1
  57. sglang/srt/models/qwen2_5_vl.py +1 -0
  58. sglang/srt/offloader.py +115 -0
  59. sglang/srt/reasoning_parser.py +56 -300
  60. sglang/srt/server_args.py +10 -5
  61. sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
  62. sglang/srt/utils.py +59 -12
  63. sglang/test/test_cutlass_moe.py +33 -28
  64. sglang/version.py +1 -1
  65. {sglang-0.5.1.post1.dist-info → sglang-0.5.1.post3.dist-info}/METADATA +6 -5
  66. {sglang-0.5.1.post1.dist-info → sglang-0.5.1.post3.dist-info}/RECORD +69 -65
  67. {sglang-0.5.1.post1.dist-info → sglang-0.5.1.post3.dist-info}/WHEEL +0 -0
  68. {sglang-0.5.1.post1.dist-info → sglang-0.5.1.post3.dist-info}/licenses/LICENSE +0 -0
  69. {sglang-0.5.1.post1.dist-info → sglang-0.5.1.post3.dist-info}/top_level.txt +0 -0
sglang/srt/layers/utils.py

@@ -34,17 +34,3 @@ class PPMissingLayer(torch.nn.Identity):
         """
         input = args[0] if args else next(iter(kwargs.values()))
         return (input,) if self.return_tuple else input
-
-
-@lru_cache(maxsize=1)
-def is_sm100_supported(device=None) -> bool:
-    return (torch.cuda.get_device_capability(device)[0] == 10) and (
-        torch.version.cuda >= "12.8"
-    )
-
-
-@lru_cache(maxsize=1)
-def is_sm90_supported(device=None) -> bool:
-    return (torch.cuda.get_device_capability(device)[0] == 9) and (
-        torch.version.cuda >= "12.3"
-    )
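
Note: these helpers are removed from layers/utils.py, likely consolidating on the copies in sglang/srt/utils.py (which also changes in this diff, +59 -12 above). A minimal sketch of the gating pattern they implement, assuming a CUDA build of PyTorch:

    from functools import lru_cache

    import torch


    @lru_cache(maxsize=1)
    def is_sm100_supported(device=None) -> bool:
        # SM major version 10 (Blackwell) plus CUDA >= 12.8. The answer cannot
        # change within a process, hence the single-slot cache. Note that the
        # CUDA check is a lexicographic string comparison, not a numeric one.
        return (torch.cuda.get_device_capability(device)[0] == 10) and (
            torch.version.cuda >= "12.8"
        )
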
sglang/srt/lora/lora_manager.py

@@ -420,20 +420,37 @@ class LoRAManager:
     ):
         """Infer LoRA target modules and max_lora_rank from loaded adapters if not provided."""

-        if target_modules is not None:
-            self.target_modules = set(target_modules)
-        else:
-            self.target_modules = set()
-            for config in self.configs.values():
-                if not isinstance(config.target_modules, list):
+        self.target_modules = (
+            get_normalized_target_modules(target_modules) if target_modules else set()
+        )
+
+        for lora_id, config in self.configs.items():
+            if not isinstance(config.target_modules, list):
+                raise ValueError(
+                    f"SGLang currently only supports inferring LoRA target modules when a list of "
+                    "suffixes is provided in `target_modules` field of PEFT config. Please explicitly "
+                    "specify `--lora-target-modules` during server startup. You can specify `all` to "
+                    "enable all support modules types. "
+                )
+
+            adapter_target_modules = get_normalized_target_modules(
+                config.target_modules
+            )
+
+            if target_modules is not None:
+                # When `--lora-target-modules` is provided, validate adapter target modules is a subset of the specified target modules.
+                if not adapter_target_modules.issubset(self.target_modules):
+                    unsupported_modules = adapter_target_modules - self.target_modules
+                    lora_name = self.lora_refs[lora_id].lora_name
                     raise ValueError(
-                        f"SGLang currently only supports inferring LoRA target modules when a list of "
-                        "suffixes is provided in `target_modules` field of PEFT config. Please explicitly "
-                        "specify `--lora-target-modules` during server startup. You can specify `all` to "
-                        "enable all support modules types. "
+                        f"LoRA adapter '{lora_name}' contains target modules {sorted(unsupported_modules)} "
+                        f"that are not included in the specified --lora-target-modules {sorted(self.target_modules)}. "
+                        f"Please update --lora-target-modules to include all required modules: "
+                        f"{sorted(self.target_modules | adapter_target_modules)}, or use 'all' to enable all supported modules."
                     )
-                self.target_modules.update(config.target_modules)
-            self.target_modules = get_normalized_target_modules(self.target_modules)
+            else:
+                # Otherwise, infer target_modules from adapter configs.
+                self.target_modules.update(adapter_target_modules)

         if max_lora_rank is not None:
             self.max_lora_rank = max_lora_rank
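
The new validation reduces to set algebra over normalized module suffixes: normalize both sides, then require the adapter's modules to be a subset of the server's. A standalone sketch, with a hypothetical normalize() standing in for get_normalized_target_modules:

    # Hypothetical stand-in: reduce entries like
    # "model.layers.0.self_attn.q_proj" to their bare suffix.
    def normalize(modules):
        return {m.split(".")[-1] for m in modules}

    def check_adapter(server_modules, adapter_modules):
        unsupported = adapter_modules - server_modules
        if unsupported:
            raise ValueError(
                f"adapter needs {sorted(unsupported)}; extend --lora-target-modules "
                f"to {sorted(server_modules | adapter_modules)} or pass 'all'"
            )

    server = normalize(["q_proj", "k_proj", "v_proj"])
    check_adapter(server, normalize(["model.layers.0.self_attn.q_proj"]))  # passes
    try:
        check_adapter(server, normalize(["o_proj"]))
    except ValueError as e:
        print(e)  # adapter needs ['o_proj']; extend --lora-target-modules ...
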
sglang/srt/managers/cache_controller.py

@@ -22,12 +22,22 @@ from typing import TYPE_CHECKING, List, Optional

 import torch

+from sglang.srt.mem_cache.hicache_storage import HiCacheStorageConfig
+
 if TYPE_CHECKING:
     from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
     from sglang.srt.mem_cache.memory_pool_host import HostKVCache

-from sglang.srt.distributed import get_tensor_model_parallel_rank
-from sglang.srt.mem_cache.memory_pool_host import MLATokenToKVPoolHost
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.mem_cache.memory_pool import MHATokenToKVPool, MLATokenToKVPool

 logger = logging.getLogger(__name__)

@@ -231,6 +241,8 @@ class HiCacheController:
         io_backend: str = "",
         storage_backend: Optional[str] = None,
         prefetch_threshold: int = 256,
+        model_name: Optional[str] = None,
+        storage_backend_extra_config: Optional[str] = None,
     ):
         self.mem_pool_device_allocator = token_to_kv_pool_allocator
         self.mem_pool_device = token_to_kv_pool_allocator.get_kvcache()
@@ -240,28 +252,40 @@ class HiCacheController:
         self.io_backend = io_backend

         self.enable_storage = False
-        self.is_mla = isinstance(self.mem_pool_host, MLATokenToKVPoolHost)
+
         # todo: move backend initialization to storage backend module
         if storage_backend is not None:
             self.storage_backend_type = storage_backend
-            from sglang.srt.mem_cache.hicache_storage import HiCacheFile, get_hash_str
+            from sglang.srt.mem_cache.hicache_storage import get_hash_str
+
+            self.get_hash_str = get_hash_str
+
+            self.storage_config = self._generate_storage_config(
+                model_name, storage_backend_extra_config
+            )
+            # In MLA backend, only one rank needs to backup the KV cache
+            self.backup_skip = (
+                self.storage_config.is_mla_model
+                # todo: for load balancing, decide which rank to backup the KV cache by hash value
+                and self.storage_config.tp_rank != 0
+                # todo: support other storage backends
+                and self.storage_backend_type in ["file", "mooncake"]
+            )

             if storage_backend == "file":
-                self.storage_backend = HiCacheFile(is_mla=self.is_mla)
-                self.get_hash_str = get_hash_str
+                from sglang.srt.mem_cache.hicache_storage import HiCacheFile
+
+                self.storage_backend = HiCacheFile(self.storage_config)
             elif storage_backend == "nixl":
                 from sglang.srt.mem_cache.storage.nixl.hicache_nixl import HiCacheNixl

                 self.storage_backend = HiCacheNixl()
-                self.get_hash_str = get_hash_str
             elif storage_backend == "mooncake":
                 from sglang.srt.mem_cache.storage.mooncake_store.mooncake_store import (
                     MooncakeStore,
-                    get_hash_str_mooncake,
                 )

-                self.storage_backend = MooncakeStore(is_mla=self.is_mla)
-                self.get_hash_str = get_hash_str_mooncake
+                self.storage_backend = MooncakeStore(self.storage_config)
                 self.storage_backend.register_buffer(self.mem_pool_host.kv_buffer)
                 assert self.mem_pool_host.layout == "page_first"
             elif storage_backend == "hf3fs":
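
The new backup_skip flag is computed once at construction rather than on every access (the property removed further down). The rationale, per the diff's own comment: MLA's compressed KV cache is replicated across tensor-parallel ranks rather than sharded, so every rank would write identical pages, and only rank 0 needs to back up; the todo limits the shortcut to the file and mooncake backends for now. A sketch of the condition, assuming four TP ranks:

    from dataclasses import dataclass

    @dataclass
    class Cfg:  # stand-in for HiCacheStorageConfig
        is_mla_model: bool
        tp_rank: int

    def backup_skip(cfg: Cfg, backend: str) -> bool:
        # Mirrors the condition above: non-zero ranks of an MLA model skip
        # the backup, but only for the backends listed in the diff.
        return cfg.is_mla_model and cfg.tp_rank != 0 and backend in ["file", "mooncake"]

    # Only rank 0 performs the backup:
    assert [backup_skip(Cfg(True, r), "file") for r in range(4)] == [False, True, True, True]
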
@@ -279,9 +303,8 @@ class HiCacheController:
                 )
                 dtype = mem_pool_host.dtype
                 self.storage_backend = HiCacheHF3FS.from_env_config(
-                    bytes_per_page, dtype
+                    bytes_per_page, dtype, self.storage_config
                 )
-                self.get_hash_str = get_hash_str
             else:
                 raise NotImplementedError(
                     f"Unsupported storage backend: {storage_backend}"
@@ -361,6 +384,40 @@ class HiCacheController:
             self.prefetch_thread.start()
             self.backup_thread.start()

+    def _generate_storage_config(
+        self,
+        model_name: Optional[str] = None,
+        storage_backend_extra_config: Optional[str] = None,
+    ):
+
+        if is_dp_attention_enabled():
+            self.tp_rank = get_attention_tp_rank()
+            self.tp_size = get_attention_tp_size()
+        else:
+            self.tp_rank = get_tensor_model_parallel_rank()
+            self.tp_size = get_tensor_model_parallel_world_size()
+
+        # Currently, AscendMLAPagedTokenToKVPool is the subclass of MLATokenToKVPool.
+        is_mla_backend = isinstance(self.mem_pool_device, MLATokenToKVPool)
+
+        # Parse extra config JSON if provided
+        extra_config = None
+        if storage_backend_extra_config:
+            try:
+                import json
+
+                extra_config = json.loads(storage_backend_extra_config)
+            except Exception as e:
+                logger.error(f"Invalid backend extra config JSON: {e}")
+
+        return HiCacheStorageConfig(
+            tp_rank=self.tp_rank,
+            tp_size=self.tp_size,
+            is_mla_model=is_mla_backend,
+            model_name=model_name,
+            extra_config=extra_config,
+        )
+
     def reset(self):
         self.stop_event.set()
         self.write_thread.join()
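
_generate_storage_config treats storage_backend_extra_config as an opaque JSON string: parse failures are logged and startup proceeds with extra_config=None rather than aborting. A sketch of that contract (the keys are illustrative, not a documented schema):

    import json
    import logging

    logger = logging.getLogger(__name__)

    def parse_extra_config(raw):
        # Mirrors the parsing above: invalid JSON degrades to None.
        if not raw:
            return None
        try:
            return json.loads(raw)
        except Exception as e:
            logger.error(f"Invalid backend extra config JSON: {e}")
            return None

    assert parse_extra_config('{"prefetch_timeout": 3.0}') == {"prefetch_timeout": 3.0}
    assert parse_extra_config("{not json") is None
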
@@ -400,15 +457,6 @@ class HiCacheController:
             self.prefetch_thread.start()
             self.backup_thread.start()

-    @property
-    def backup_skip(self):
-        return (
-            self.is_mla
-            and get_tensor_model_parallel_rank() != 0
-            # todo: only support file and mooncake
-            and self.storage_backend_type in ["file", "mooncake"]
-        )
-
     def write(
         self,
         device_indices: torch.Tensor,
@@ -570,57 +618,92 @@ class HiCacheController:
         operation.mark_done()
         return operation.completed_tokens, operation.hash_value

-    def zerocopy_page_transfer(self, operation, batch_size=8):
+    # zero copy
+    def _3fs_zero_copy_page_get(self, operation, hash_values, host_indices):
         hashes, dsts = self.mem_pool_host.get_buffer_with_hash(
-            operation.hash_value, operation.host_indices
+            hash_values, host_indices
         )
-        for i in range(0, len(hashes), batch_size):
-            page_hashes = hashes[i : i + batch_size]
-            page_dsts = dsts[i : i + batch_size]
-            page_data = self.storage_backend.batch_get(page_hashes, page_dsts)
-            if page_data is None:
+        page_data = self.storage_backend.batch_get(hashes, dsts)
+        if page_data:
+            operation.increment(self.page_size * len(hashes))
+        else:
+            logger.warning(
+                f"Prefetch operation {operation.request_id} failed to retrieve page {hashes}."
+            )
+
+    # zero copy
+    def _mooncake_page_get(self, operation, hash_values, host_indices):
+        key_strs, buffer_ptrs, buffer_sizes = self.mem_pool_host.get_buffer_meta(
+            hash_values,
+            host_indices,
+        )
+        get_result = self.storage_backend.batch_get(
+            key_strs,
+            target_location=buffer_ptrs,
+            target_sizes=buffer_sizes,
+        )
+        if get_result != len(hash_values):
+            logger.warning(
+                f"Prefetch operation {operation.request_id} failed or partially failed."
+            )
+        if get_result != 0:
+            operation.increment(get_result * self.page_size)
+
+    # non-zero copy
+    def _generic_page_get(self, operation, hash_values, host_indices):
+        # todo: zero copy
+        dummy_page_dst = [self.mem_pool_host.get_dummy_flat_data_page()] * len(
+            hash_values
+        )
+        page_data = self.storage_backend.batch_get(hash_values, dummy_page_dst)
+        if page_data is None:
+            return
+        for i in range(len(hash_values)):
+            if page_data[i] is None:
                 logger.warning(
-                    f"Prefetch operation {operation.request_id} failed to retrieve page {page_hashes}."
+                    f"Prefetch operation {operation.request_id} failed to retrieve page {hash_values[i]}."
                 )
                 break
-            completed_tokens = operation.completed_tokens
-            if operation.increment(self.page_size * len(page_hashes)):
-                for i in range(len(page_hashes)):
-                    completed_tokens += self.page_size
+            if operation.increment(self.page_size):
+                self.mem_pool_host.set_from_flat_data_page(
+                    host_indices[i * self.page_size],
+                    page_data[i],
+                )
             else:
                 break

-    def generic_page_transfer(self, operation, batch_size=8):
+    def _page_transfer(self, operation):
+        # Select the get function and batch size
+        if self.is_mooncake_backend():
+            get_func = self._mooncake_page_get
+            batch_size = 128
+        elif self.storage_backend_type == "hf3fs":
+            if self.mem_pool_host.layout == "page_first":
+                get_func = self._3fs_zero_copy_page_get
+            elif self.mem_pool_host.layout == "layer_first":
+                get_func = self._generic_page_get
+            batch_size = 128
+        else:
+            get_func = self._generic_page_get
+            batch_size = 8
+
+        # Transfer batch by batch
         for i in range(0, len(operation.hash_value), batch_size):
-            page_hashes = operation.hash_value[i : i + batch_size]
-            # todo: zero copy
-            dummy_page_dst = [
-                self.mem_pool_host.get_dummy_flat_data_page()
-                for _ in range(len(page_hashes))
+            batch_hashes = operation.hash_value[i : i + batch_size]
+            batch_host_indices = operation.host_indices[
+                i * self.page_size : (i + len(batch_hashes)) * self.page_size
             ]
-            page_data = self.storage_backend.batch_get(page_hashes, dummy_page_dst)
-            if page_data is None:
-                logger.warning(
-                    f"Prefetch operation {operation.request_id} failed to retrieve page {page_hashes}."
-                )
-                break
-            completed_tokens = operation.completed_tokens
-            if operation.increment(self.page_size * len(page_hashes)):
-                for i in range(len(page_hashes)):
-                    self.mem_pool_host.set_from_flat_data_page(
-                        operation.host_indices[completed_tokens],
-                        page_data[i],
-                    )
-                    completed_tokens += self.page_size
-            else:
-                break
-
-    def mooncake_page_transfer(self, operation):
-        key_strs, buffer_ptrs, buffer_sizes = self.mem_pool_host.get_buffer_meta(
-            operation.hash_value, operation.host_indices
-        )
-        self.storage_backend.batch_get(key_strs, buffer_ptrs, buffer_sizes)
-        operation.increment(len(operation.hash_value) * self.page_size)
+            prev_completed_tokens = operation.completed_tokens
+            # Get one batch token, and update the completed_tokens if succeed
+            get_func(operation, batch_hashes, batch_host_indices)
+            # Check termination
+            if (
+                operation.completed_tokens
+                != prev_completed_tokens + len(batch_hashes) * self.page_size
+            ):
+                break  # Some operations fail or operation terminated by controller
+        # release pre-allocated memory
+        self.mem_pool_host.free(operation.host_indices[operation.completed_tokens :])

     def is_mooncake_backend(self):
         return self.storage_backend_type == "mooncake"
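
This hunk collapses three backend-specific transfer loops into one driver: _page_transfer picks a getter and a batch size, then infers failure by comparing completed_tokens against what a fully successful batch would have produced, finally freeing the unused tail of pre-allocated host memory. A runnable sketch of that driver pattern, with hypothetical stand-ins for the operation object and a backend getter:

    class Op:
        def __init__(self, hashes, page_size):
            self.hash_value = hashes
            self.page_size = page_size
            self.completed_tokens = 0

    def flaky_get(op, batch):
        # Pretend every page before "bad" is fetched successfully.
        for h in batch:
            if h == "bad":
                return
            op.completed_tokens += op.page_size

    def transfer(op, get_func, batch_size):
        for i in range(0, len(op.hash_value), batch_size):
            batch = op.hash_value[i : i + batch_size]
            prev = op.completed_tokens
            get_func(op, batch)
            if op.completed_tokens != prev + len(batch) * op.page_size:
                break  # partial batch: stop; the caller frees the unfilled tail

    op = Op(["a", "b", "bad", "c"], page_size=64)
    transfer(op, flaky_get, batch_size=2)
    assert op.completed_tokens == 128  # "a" and "b" landed; "bad" stopped the loop
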
@@ -632,15 +715,7 @@ class HiCacheController:
         while not self.stop_event.is_set():
            try:
                 operation = self.prefetch_buffer.get(block=True, timeout=1)
-                if self.is_mooncake_backend():
-                    self.mooncake_page_transfer(operation)
-                elif self.storage_backend_type == "hf3fs":
-                    if self.mem_pool_host.layout == "page_first":
-                        self.zerocopy_page_transfer(operation, batch_size=128)
-                    elif self.mem_pool_host.layout == "layer_first":
-                        self.generic_page_transfer(operation, batch_size=128)
-                    else:
-                        self.generic_page_transfer(operation)
+                self._page_transfer(operation)

                 if self.tp_world_size > 1:
                     # to ensure all TP workers release the host memory at the same time
@@ -662,6 +737,27 @@ class HiCacheController:
         # todo: more sophisticated rate limiting based on storage backend performance
         return True

+    def _generic_storage_hit_query(self, operation) -> tuple[list[str], int]:
+        last_hash = operation.last_hash
+        tokens_to_fetch = operation.token_ids
+
+        storage_query_count = 0
+        remaining_tokens = len(tokens_to_fetch)
+        hash_value = []
+        while remaining_tokens >= self.page_size:
+            last_hash = self.get_hash_str(
+                tokens_to_fetch[
+                    storage_query_count : storage_query_count + self.page_size
+                ],
+                last_hash,
+            )
+            hash_value.append(last_hash)
+            storage_query_count += self.page_size
+            remaining_tokens -= self.page_size
+        # deferring to batch exists
+        hit_page_num = self.storage_backend.batch_exists(hash_value)
+        return hash_value[:hit_page_num], hit_page_num * self.page_size
+
     def prefetch_thread_func(self):
         """
         Manage prefetching operations from storage backend to host memory.
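
The new helper replaces two per-backend probing strategies (a per-page exists() for most backends, a deferred batch query for mooncake) with one pass: hash every full page, chaining each page's hash on its predecessor so a key commits to its entire prefix, then issue a single batch_exists and keep only the leading hits. A sketch with a hypothetical chained hash:

    import hashlib

    PAGE_SIZE = 4

    def get_hash_str(tokens, prior):
        # Hypothetical chained hash: folding the previous page's hash into
        # the next one makes each key encode its whole prefix.
        h = hashlib.sha256()
        h.update((prior or "").encode())
        h.update(",".join(map(str, tokens)).encode())
        return h.hexdigest()

    tokens = list(range(10))  # two full pages, two leftover tokens
    last, keys = None, []
    remaining, start = len(tokens), 0
    while remaining >= PAGE_SIZE:
        last = get_hash_str(tokens[start : start + PAGE_SIZE], last)
        keys.append(last)
        start += PAGE_SIZE
        remaining -= PAGE_SIZE

    # batch_exists(keys) reports how many *leading* pages hit; the helper
    # then returns keys[:hit_page_num] and hit_page_num * PAGE_SIZE tokens.
    assert len(keys) == 2
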
@@ -675,38 +771,12 @@ class HiCacheController:
                 if operation is None:
                     continue

-                storage_hit_count = 0
                 if (
                     operation.host_indices is not None
                 ) and self.prefetch_rate_limit_check():
-                    last_hash = operation.last_hash
-                    tokens_to_fetch = operation.token_ids
-
-                    remaining_tokens = len(tokens_to_fetch)
-                    hash_value = []
-                    while remaining_tokens >= self.page_size:
-                        last_hash = self.get_hash_str(
-                            tokens_to_fetch[
-                                storage_hit_count : storage_hit_count + self.page_size
-                            ],
-                            last_hash,
-                        )
-
-                        # todo, more unified interface
-                        if not self.is_mooncake_backend():
-                            if not self.storage_backend.exists(last_hash):
-                                break
-                        hash_value.append(last_hash)
-                        storage_hit_count += self.page_size
-                        remaining_tokens -= self.page_size
-
-                    if self.is_mooncake_backend():
-                        # deferring to batch exists for mooncake store
-                        exist_result = self.storage_backend.exists(hash_value)
-                        storage_hit_count = (
-                            sum(1 for v in exist_result.values() if v != 0)
-                            * self.page_size
-                        )
+                    hash_value, storage_hit_count = self._generic_storage_hit_query(
+                        operation
+                    )

                 if self.tp_world_size > 1:
                     storage_hit_count_tensor = torch.tensor(
@@ -755,59 +825,64 @@ class HiCacheController:
             self.backup_queue.put(operation)
         return operation.id

-    def zerocopy_page_backup(self, operation, batch_size=8):
-        hashes, dsts = self.mem_pool_host.get_buffer_with_hash(
-            operation.hash_value, operation.host_indices
+    # non-zero copy
+    def _generic_page_set(self, hash_values, host_indices) -> bool:
+        data = [
+            self.mem_pool_host.get_flat_data_page(host_indices[i * self.page_size])
+            for i in range(len(hash_values))
+        ]
+        return self.storage_backend.batch_set(hash_values, data)
+
+    # zero copy
+    def _mooncake_page_set(self, hash_values, host_indices) -> bool:
+        key_strs, buffer_ptrs, buffer_sizes = self.mem_pool_host.get_buffer_meta(
+            hash_values,
+            host_indices,
         )
-        for i in range(0, len(hashes), batch_size):
-            page_hashes = hashes[i : i + batch_size]
-            page_data = dsts[i : i + batch_size]
-            success = self.storage_backend.batch_set(page_hashes, page_data)
-            if not success:
-                logger.warning(f"Failed to write page {page_hashes} to storage.")
-                break
-            operation.completed_tokens += self.page_size * len(page_hashes)
+        success = self.storage_backend.batch_set(
+            key_strs,
+            target_location=buffer_ptrs,
+            target_sizes=buffer_sizes,
+        )
+        return success

-    def generic_page_backup(self, operation, batch_size=8):
+    # zero copy
+    def _3fs_zero_copy_page_set(self, hash_values, host_indices) -> bool:
+        hashes, dsts = self.mem_pool_host.get_buffer_with_hash(
+            hash_values, host_indices
+        )
+        return self.storage_backend.batch_set(hashes, dsts)
+
+    # Backup batch by batch
+    def _page_backup(self, operation):
+        # Select the set function and batch size
+        if self.is_mooncake_backend():
+            backup_set_func = self._mooncake_page_set
+            batch_size = 128
+        elif self.storage_backend_type == "hf3fs":
+            if self.mem_pool_host.layout == "page_first":
+                backup_set_func = self._3fs_zero_copy_page_set
+            elif self.mem_pool_host.layout == "layer_first":
+                backup_set_func = self._generic_page_set
+            batch_size = 128
+        else:
+            backup_set_func = self._generic_page_set
+            batch_size = 8
+        # Backup batch by batch
         for i in range(0, len(operation.hash_value), batch_size):
-            page_hashes = operation.hash_value[i : i + batch_size]
-            page_data = [
-                self.mem_pool_host.get_flat_data_page(
-                    operation.host_indices[j * self.page_size]
-                )
-                for j in range(i, i + len(page_hashes))
+            batch_hashes = operation.hash_value[i : i + batch_size]
+            batch_host_indices = operation.host_indices[
+                i * self.page_size : (i + len(batch_hashes)) * self.page_size
             ]
-            success = self.storage_backend.batch_set(page_hashes, page_data)
+            # Set one batch token, and record if success.
+            # todo: allow partial success
+            success = backup_set_func(batch_hashes, batch_host_indices)
             if not success:
-                logger.warning(f"Failed to write page {page_hashes} to storage.")
-                break
-            operation.completed_tokens += self.page_size * len(page_hashes)
-
-    def mooncake_page_backup(self, operation):
-        if len(operation.hash_value):
-            exist_hashvalues = self.storage_backend.exists(operation.hash_value)
-            indices = operation.host_indices.tolist()
-            non_exist_keys = []
-            non_exist_indices = []
-            for i in range(len(operation.hash_value)):
-                if not exist_hashvalues[operation.hash_value[i]]:
-                    non_exist_keys.append(operation.hash_value[i])
-                    non_exist_indices.extend(
-                        indices[i * self.page_size : (i + 1) * self.page_size]
-                    )
-            if len(non_exist_keys) > 0:
-                key_strs, buffer_ptrs, buffer_sizes = (
-                    self.mem_pool_host.get_buffer_meta(
-                        non_exist_keys, non_exist_indices
-                    )
-                )
-                # TODO: check the return value of batch set to see how many tokens are set successfully
-                self.storage_backend.batch_set(
-                    key_strs,
-                    target_location=buffer_ptrs,
-                    target_sizes=buffer_sizes,
+                logger.warning(
+                    f"Write page to storage: {len(batch_hashes)} pages failed."
                 )
-            operation.completed_tokens += len(operation.hash_value) * self.page_size
+                break
+            operation.completed_tokens += self.page_size * len(batch_hashes)

     def backup_thread_func(self):
         """
@@ -820,15 +895,7 @@ class HiCacheController:
                     continue

                 if not self.backup_skip:
-                    if self.is_mooncake_backend():
-                        self.mooncake_page_backup(operation)
-                    elif self.storage_backend_type == "hf3fs":
-                        if self.mem_pool_host.layout == "page_first":
-                            self.zerocopy_page_backup(operation, batch_size=128)
-                        elif self.mem_pool_host.layout == "layer_first":
-                            self.generic_page_backup(operation, batch_size=128)
-                        else:
-                            self.generic_page_backup(operation)
+                    self._page_backup(operation)
                     min_completed_tokens = operation.completed_tokens
                 else:
                     min_completed_tokens = len(operation.token_ids)
sglang/srt/managers/detokenizer_manager.py

@@ -106,6 +106,8 @@ class DetokenizerManager:
             ]
         )

+        self.is_tool_call_parser_gpt_oss = server_args.tool_call_parser == "gpt-oss"
+
     def event_loop(self):
         """The event loop that handles requests"""
         while True:
@@ -133,6 +135,9 @@ class DetokenizerManager:

         # Trim stop token.
         if isinstance(matched, int) and isinstance(output, list):
+            # 200012 <|call|> is the tool call token and one of eos tokens for gpt-oss model
+            if output[-1] == 200012 and self.is_tool_call_parser_gpt_oss:
+                return output
             assert len(output) > 0
             return output[:-1]
         return output
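
Trimming normally strips the matched stop token id from the tail of the output; for gpt-oss, token 200012 (<|call|>) is both an EOS token and the marker the tool-call parser needs, so when that parser is active the token is left in place. A sketch (the helper name is illustrative):

    TOOL_CALL_TOKEN = 200012  # <|call|> in the gpt-oss vocabulary

    def trim_stop_token(output, matched, is_gpt_oss_tool_parser):
        # Illustrative reduction of the logic above.
        if isinstance(matched, int) and isinstance(output, list):
            if output and output[-1] == TOOL_CALL_TOKEN and is_gpt_oss_tool_parser:
                return output  # keep <|call|> for the tool-call parser
            return output[:-1]
        return output

    assert trim_stop_token([1, 2, 200012], 200012, True) == [1, 2, 200012]
    assert trim_stop_token([1, 2, 9], 9, True) == [1, 2]
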
sglang/srt/managers/io_struct.py

@@ -533,6 +533,21 @@ class TokenizedGenerateReqInput:
     dp_balance_id: int = -1


+@dataclass
+class BatchTokenizedGenerateReqInput:
+    # The batch of tokenized requests
+    batch: List[TokenizedGenerateReqInput]
+
+    def __len__(self):
+        return len(self.batch)
+
+    def __getitem__(self, i):
+        return self.batch[i]
+
+    def __iter__(self):
+        return iter(self.batch)
+
+
 @dataclass
 class EmbeddingReqInput:
     # The input prompt. It can be a single prompt or a batch of prompts.
@@ -668,6 +683,21 @@ class TokenizedEmbeddingReqInput:
     dp_balance_id: int = -1


+@dataclass
+class BatchTokenizedEmbeddingReqInput:
+    # The batch of tokenized embedding requests
+    batch: List[TokenizedEmbeddingReqInput]
+
+    def __len__(self):
+        return len(self.batch)
+
+    def __getitem__(self, i):
+        return self.batch[i]
+
+    def __iter__(self):
+        return iter(self.batch)
+
+
 @dataclass
 class BatchTokenIDOut:
     # The request id
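
Both wrappers give a list of tokenized requests plain sequence semantics (len, indexing, iteration) so downstream code can handle a single request and a batch uniformly. A usage sketch with a stub element type:

    from dataclasses import dataclass
    from typing import List

    @dataclass
    class Req:  # stub for TokenizedGenerateReqInput
        rid: str

    @dataclass
    class BatchReq:  # mirrors BatchTokenizedGenerateReqInput
        batch: List[Req]

        def __len__(self):
            return len(self.batch)

        def __getitem__(self, i):
            return self.batch[i]

        def __iter__(self):
            return iter(self.batch)

    reqs = BatchReq([Req("a"), Req("b")])
    assert len(reqs) == 2 and reqs[0].rid == "a"
    assert [r.rid for r in reqs] == ["a", "b"]
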