sglang 0.4.7__py3-none-any.whl → 0.4.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (152)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +7 -0
  3. sglang/bench_one_batch.py +8 -6
  4. sglang/bench_serving.py +1 -1
  5. sglang/lang/interpreter.py +40 -1
  6. sglang/lang/ir.py +27 -0
  7. sglang/math_utils.py +8 -0
  8. sglang/srt/_custom_ops.py +2 -2
  9. sglang/srt/code_completion_parser.py +2 -44
  10. sglang/srt/configs/model_config.py +6 -0
  11. sglang/srt/constants.py +3 -0
  12. sglang/srt/conversation.py +19 -3
  13. sglang/srt/custom_op.py +5 -1
  14. sglang/srt/disaggregation/base/__init__.py +1 -1
  15. sglang/srt/disaggregation/base/conn.py +25 -11
  16. sglang/srt/disaggregation/common/__init__.py +5 -1
  17. sglang/srt/disaggregation/common/utils.py +42 -0
  18. sglang/srt/disaggregation/decode.py +211 -72
  19. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -3
  20. sglang/srt/disaggregation/fake/__init__.py +1 -1
  21. sglang/srt/disaggregation/fake/conn.py +15 -9
  22. sglang/srt/disaggregation/mini_lb.py +34 -4
  23. sglang/srt/disaggregation/mooncake/__init__.py +1 -1
  24. sglang/srt/disaggregation/mooncake/conn.py +30 -29
  25. sglang/srt/disaggregation/nixl/__init__.py +6 -1
  26. sglang/srt/disaggregation/nixl/conn.py +17 -12
  27. sglang/srt/disaggregation/prefill.py +144 -55
  28. sglang/srt/disaggregation/utils.py +155 -123
  29. sglang/srt/distributed/parallel_state.py +12 -4
  30. sglang/srt/entrypoints/engine.py +37 -29
  31. sglang/srt/entrypoints/http_server.py +153 -72
  32. sglang/srt/entrypoints/http_server_engine.py +0 -3
  33. sglang/srt/entrypoints/openai/__init__.py +0 -0
  34. sglang/srt/{openai_api → entrypoints/openai}/protocol.py +84 -10
  35. sglang/srt/entrypoints/openai/serving_base.py +149 -0
  36. sglang/srt/entrypoints/openai/serving_chat.py +921 -0
  37. sglang/srt/entrypoints/openai/serving_completions.py +424 -0
  38. sglang/srt/entrypoints/openai/serving_embedding.py +169 -0
  39. sglang/srt/entrypoints/openai/serving_rerank.py +102 -0
  40. sglang/srt/entrypoints/openai/serving_score.py +61 -0
  41. sglang/srt/entrypoints/openai/usage_processor.py +81 -0
  42. sglang/srt/entrypoints/openai/utils.py +72 -0
  43. sglang/srt/eplb_simulator/__init__.py +1 -0
  44. sglang/srt/eplb_simulator/reader.py +51 -0
  45. sglang/srt/function_call/base_format_detector.py +7 -4
  46. sglang/srt/function_call/deepseekv3_detector.py +1 -1
  47. sglang/srt/function_call/ebnf_composer.py +64 -10
  48. sglang/srt/function_call/function_call_parser.py +6 -6
  49. sglang/srt/function_call/llama32_detector.py +1 -1
  50. sglang/srt/function_call/mistral_detector.py +1 -1
  51. sglang/srt/function_call/pythonic_detector.py +1 -1
  52. sglang/srt/function_call/qwen25_detector.py +1 -1
  53. sglang/srt/{openai_api/utils.py → jinja_template_utils.py} +6 -5
  54. sglang/srt/layers/activation.py +40 -3
  55. sglang/srt/layers/attention/aiter_backend.py +20 -4
  56. sglang/srt/layers/attention/base_attn_backend.py +1 -1
  57. sglang/srt/layers/attention/cutlass_mla_backend.py +39 -15
  58. sglang/srt/layers/attention/flashattention_backend.py +71 -72
  59. sglang/srt/layers/attention/flashinfer_backend.py +10 -8
  60. sglang/srt/layers/attention/flashinfer_mla_backend.py +29 -28
  61. sglang/srt/layers/attention/flashmla_backend.py +7 -12
  62. sglang/srt/layers/attention/tbo_backend.py +3 -3
  63. sglang/srt/layers/attention/triton_backend.py +138 -130
  64. sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
  65. sglang/srt/layers/attention/vision.py +51 -24
  66. sglang/srt/layers/communicator.py +28 -10
  67. sglang/srt/layers/dp_attention.py +11 -2
  68. sglang/srt/layers/layernorm.py +29 -2
  69. sglang/srt/layers/linear.py +0 -4
  70. sglang/srt/layers/logits_processor.py +2 -14
  71. sglang/srt/layers/moe/ep_moe/kernels.py +165 -7
  72. sglang/srt/layers/moe/ep_moe/layer.py +249 -33
  73. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +11 -37
  74. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +7 -4
  76. sglang/srt/layers/moe/fused_moe_triton/layer.py +75 -12
  77. sglang/srt/layers/moe/topk.py +107 -12
  78. sglang/srt/layers/pooler.py +56 -0
  79. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +6 -2
  80. sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
  81. sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +23 -80
  82. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
  83. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
  84. sglang/srt/layers/quantization/fp8.py +25 -17
  85. sglang/srt/layers/quantization/fp8_kernel.py +44 -15
  86. sglang/srt/layers/quantization/fp8_utils.py +87 -22
  87. sglang/srt/layers/quantization/modelopt_quant.py +62 -8
  88. sglang/srt/layers/quantization/utils.py +5 -2
  89. sglang/srt/layers/radix_attention.py +2 -3
  90. sglang/srt/layers/rotary_embedding.py +42 -2
  91. sglang/srt/layers/sampler.py +1 -1
  92. sglang/srt/lora/lora_manager.py +249 -105
  93. sglang/srt/lora/mem_pool.py +53 -50
  94. sglang/srt/lora/utils.py +1 -1
  95. sglang/srt/managers/cache_controller.py +33 -14
  96. sglang/srt/managers/io_struct.py +31 -10
  97. sglang/srt/managers/multimodal_processors/base_processor.py +2 -2
  98. sglang/srt/managers/multimodal_processors/vila.py +85 -0
  99. sglang/srt/managers/schedule_batch.py +79 -37
  100. sglang/srt/managers/schedule_policy.py +70 -56
  101. sglang/srt/managers/scheduler.py +220 -79
  102. sglang/srt/managers/template_manager.py +226 -0
  103. sglang/srt/managers/tokenizer_manager.py +40 -10
  104. sglang/srt/managers/tp_worker.py +12 -2
  105. sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
  106. sglang/srt/mem_cache/{paged_allocator.py → allocator.py} +125 -34
  107. sglang/srt/mem_cache/base_prefix_cache.py +52 -8
  108. sglang/srt/mem_cache/chunk_cache.py +11 -15
  109. sglang/srt/mem_cache/hiradix_cache.py +38 -25
  110. sglang/srt/mem_cache/memory_pool.py +213 -505
  111. sglang/srt/mem_cache/memory_pool_host.py +380 -0
  112. sglang/srt/mem_cache/radix_cache.py +56 -28
  113. sglang/srt/model_executor/cuda_graph_runner.py +198 -100
  114. sglang/srt/model_executor/forward_batch_info.py +32 -10
  115. sglang/srt/model_executor/model_runner.py +28 -12
  116. sglang/srt/model_loader/loader.py +16 -2
  117. sglang/srt/model_loader/weight_utils.py +11 -2
  118. sglang/srt/models/bert.py +113 -13
  119. sglang/srt/models/deepseek_nextn.py +29 -27
  120. sglang/srt/models/deepseek_v2.py +213 -173
  121. sglang/srt/models/glm4.py +312 -0
  122. sglang/srt/models/internvl.py +46 -102
  123. sglang/srt/models/mimo_mtp.py +2 -18
  124. sglang/srt/models/roberta.py +117 -9
  125. sglang/srt/models/vila.py +305 -0
  126. sglang/srt/reasoning_parser.py +21 -11
  127. sglang/srt/sampling/sampling_batch_info.py +24 -0
  128. sglang/srt/sampling/sampling_params.py +2 -0
  129. sglang/srt/server_args.py +351 -238
  130. sglang/srt/speculative/build_eagle_tree.py +1 -1
  131. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +131 -9
  132. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +130 -14
  133. sglang/srt/speculative/eagle_utils.py +468 -116
  134. sglang/srt/speculative/eagle_worker.py +258 -84
  135. sglang/srt/torch_memory_saver_adapter.py +19 -15
  136. sglang/srt/two_batch_overlap.py +4 -2
  137. sglang/srt/utils.py +235 -11
  138. sglang/test/attention/test_prefix_chunk_info.py +2 -0
  139. sglang/test/runners.py +38 -3
  140. sglang/test/test_block_fp8.py +1 -0
  141. sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
  142. sglang/test/test_block_fp8_ep.py +2 -0
  143. sglang/test/test_utils.py +4 -1
  144. sglang/utils.py +9 -0
  145. sglang/version.py +1 -1
  146. {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/METADATA +8 -14
  147. {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/RECORD +150 -128
  148. sglang/srt/entrypoints/verl_engine.py +0 -179
  149. sglang/srt/openai_api/adapter.py +0 -1990
  150. {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/WHEEL +0 -0
  151. {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/licenses/LICENSE +0 -0
  152. {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/top_level.txt +0 -0
sglang/srt/managers/template_manager.py (new file)
@@ -0,0 +1,226 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+Centralized template management for chat templates and completion templates.
+
+This module provides a unified interface for managing both chat conversation templates
+and code completion templates, eliminating global state and improving modularity.
+"""
+
+import json
+import logging
+import os
+from typing import Optional
+
+from sglang.srt.code_completion_parser import (
+    CompletionTemplate,
+    FimPosition,
+    completion_template_exists,
+    register_completion_template,
+)
+from sglang.srt.conversation import (
+    Conversation,
+    SeparatorStyle,
+    chat_template_exists,
+    get_conv_template_by_model_path,
+    register_conv_template,
+)
+from sglang.srt.jinja_template_utils import detect_jinja_template_content_format
+
+logger = logging.getLogger(__name__)
+
+
+class TemplateManager:
+    """
+    Centralized manager for chat and completion templates.
+
+    This class encapsulates all template-related state and operations,
+    eliminating the need for global variables and providing a clean
+    interface for template management.
+    """
+
+    def __init__(self):
+        self._chat_template_name: Optional[str] = None
+        self._completion_template_name: Optional[str] = None
+        self._jinja_template_content_format: Optional[str] = None
+
+    @property
+    def chat_template_name(self) -> Optional[str]:
+        """Get the current chat template name."""
+        return self._chat_template_name
+
+    @property
+    def completion_template_name(self) -> Optional[str]:
+        """Get the current completion template name."""
+        return self._completion_template_name
+
+    @property
+    def jinja_template_content_format(self) -> Optional[str]:
+        """Get the detected template content format ('string' or 'openai' or None)."""
+        return self._jinja_template_content_format
+
+    def load_chat_template(
+        self, tokenizer_manager, chat_template_arg: str, model_path: str
+    ) -> None:
+        """
+        Load a chat template from various sources.
+
+        Args:
+            tokenizer_manager: The tokenizer manager instance
+            chat_template_arg: Template name or file path
+            model_path: Path to the model
+        """
+        logger.info(f"Loading chat template: {chat_template_arg}")
+
+        if not chat_template_exists(chat_template_arg):
+            if not os.path.exists(chat_template_arg):
+                raise RuntimeError(
+                    f"Chat template {chat_template_arg} is not a built-in template name "
+                    "or a valid chat template file path."
+                )
+
+            if chat_template_arg.endswith(".jinja"):
+                self._load_jinja_template(tokenizer_manager, chat_template_arg)
+            else:
+                self._load_json_chat_template(chat_template_arg)
+        else:
+            self._chat_template_name = chat_template_arg
+
+    def guess_chat_template_from_model_path(self, model_path: str) -> None:
+        """
+        Infer chat template name from model path.
+
+        Args:
+            model_path: Path to the model
+        """
+        template_name = get_conv_template_by_model_path(model_path)
+        if template_name is not None:
+            logger.info(f"Inferred chat template from model path: {template_name}")
+            self._chat_template_name = template_name
+
+    def load_completion_template(self, completion_template_arg: str) -> None:
+        """
+        Load completion template for code completion.
+
+        Args:
+            completion_template_arg: Template name or file path
+        """
+        logger.info(f"Loading completion template: {completion_template_arg}")
+
+        if not completion_template_exists(completion_template_arg):
+            if not os.path.exists(completion_template_arg):
+                raise RuntimeError(
+                    f"Completion template {completion_template_arg} is not a built-in template name "
+                    "or a valid completion template file path."
+                )
+
+            self._load_json_completion_template(completion_template_arg)
+        else:
+            self._completion_template_name = completion_template_arg
+
+    def initialize_templates(
+        self,
+        tokenizer_manager,
+        model_path: str,
+        chat_template: Optional[str] = None,
+        completion_template: Optional[str] = None,
+    ) -> None:
+        """
+        Initialize all templates based on provided configuration.
+
+        Args:
+            tokenizer_manager: The tokenizer manager instance
+            model_path: Path to the model
+            chat_template: Optional chat template name/path
+            completion_template: Optional completion template name/path
+        """
+        # Load chat template
+        if chat_template:
+            self.load_chat_template(tokenizer_manager, chat_template, model_path)
+        else:
+            self.guess_chat_template_from_model_path(model_path)
+
+        # Load completion template
+        if completion_template:
+            self.load_completion_template(completion_template)
+
+    def _load_jinja_template(self, tokenizer_manager, template_path: str) -> None:
+        """Load a Jinja template file."""
+        with open(template_path, "r") as f:
+            chat_template = "".join(f.readlines()).strip("\n")
+        tokenizer_manager.tokenizer.chat_template = chat_template.replace("\\n", "\n")
+        self._chat_template_name = None
+        # Detect content format from the loaded template
+        self._jinja_template_content_format = detect_jinja_template_content_format(
+            chat_template
+        )
+        logger.info(
+            f"Detected chat template content format: {self._jinja_template_content_format}"
+        )
+
+    def _load_json_chat_template(self, template_path: str) -> None:
+        """Load a JSON chat template file."""
+        assert template_path.endswith(
+            ".json"
+        ), "unrecognized format of chat template file"
+
+        with open(template_path, "r") as filep:
+            template = json.load(filep)
+            try:
+                sep_style = SeparatorStyle[template["sep_style"]]
+            except KeyError:
+                raise ValueError(
+                    f"Unknown separator style: {template['sep_style']}"
+                ) from None
+
+        register_conv_template(
+            Conversation(
+                name=template["name"],
+                system_template=template["system"] + "\n{system_message}",
+                system_message=template.get("system_message", ""),
+                roles=(template["user"], template["assistant"]),
+                sep_style=sep_style,
+                sep=template.get("sep", "\n"),
+                stop_str=template["stop_str"],
+            ),
+            override=True,
+        )
+        self._chat_template_name = template["name"]
+
+    def _load_json_completion_template(self, template_path: str) -> None:
+        """Load a JSON completion template file."""
+        assert template_path.endswith(
+            ".json"
+        ), "unrecognized format of completion template file"
+
+        with open(template_path, "r") as filep:
+            template = json.load(filep)
+            try:
+                fim_position = FimPosition[template["fim_position"]]
+            except KeyError:
+                raise ValueError(
+                    f"Unknown fim position: {template['fim_position']}"
+                ) from None
+
+        register_completion_template(
+            CompletionTemplate(
+                name=template["name"],
+                fim_begin_token=template["fim_begin_token"],
+                fim_middle_token=template["fim_middle_token"],
+                fim_end_token=template["fim_end_token"],
+                fim_position=fim_position,
+            ),
+            override=True,
+        )
+        self._completion_template_name = template["name"]
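Note (not part of the diff): TemplateManager replaces the module-level template globals. A minimal usage sketch, assuming `tokenizer_manager` is an initialized TokenizerManager and the model path is a placeholder; the .json schema is the one parsed by _load_json_chat_template above:

from sglang.srt.managers.template_manager import TemplateManager

template_manager = TemplateManager()
template_manager.initialize_templates(
    tokenizer_manager,                      # assumed: an initialized TokenizerManager
    model_path="meta-llama/Llama-3.1-8B-Instruct",  # assumed model path
    chat_template="my_chat_template.json",  # built-in name, .jinja path, or .json path
)
print(template_manager.chat_template_name)             # e.g. the "name" field of the JSON
print(template_manager.jinja_template_content_format)  # "string", "openai", or None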
sglang/srt/managers/tokenizer_manager.py
@@ -418,6 +418,20 @@ class TokenizerManager:
 
         obj.normalize_batch_and_arguments()
 
+        if isinstance(obj, GenerateReqInput):
+            return_hidden_states = obj.return_hidden_states
+            has_return_hidden_states = return_hidden_states == True or (
+                isinstance(return_hidden_states, list) and any(return_hidden_states)
+            )
+            if (
+                not self.server_args.enable_return_hidden_states
+                and has_return_hidden_states
+            ):
+                raise ValueError(
+                    "return_hidden_states=True requires the server to be started "
+                    "with --enable-return-hidden-states (ServerArgs.enable_return_hidden_states)."
+                )
+
         if self.log_requests:
             max_length, skip_names, _ = self.log_request_metadata
             logger.info(
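Note (not part of the diff): the new check fails fast at request time instead of deep in the model runner. Client-side, hidden states must now be enabled at launch; a minimal sketch, assuming the offline Engine API forwards return_hidden_states and using a placeholder model:

import sglang as sgl

llm = sgl.Engine(
    model_path="meta-llama/Llama-3.1-8B-Instruct",  # assumed model
    enable_return_hidden_states=True,  # maps to --enable-return-hidden-states
)
out = llm.generate(
    prompt="The capital of France is",
    sampling_params={"max_new_tokens": 8},
    return_hidden_states=True,  # would raise ValueError without the flag above
)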
@@ -445,6 +459,10 @@ class TokenizerManager:
         # Tokenize
         input_embeds = None
         input_text = obj.text
+        token_type_ids = None
+        is_cross_encoder_request = (
+            isinstance(obj, EmbeddingReqInput) and obj.is_cross_encoder_request
+        )
         if obj.input_embeds is not None:
             if not self.server_args.disable_radix_cache:
                 raise ValueError(
@@ -463,7 +481,14 @@ class TokenizerManager:
                     "accept text prompts. Please provide input_ids or re-initialize "
                     "the engine with skip_tokenizer_init=False."
                 )
-            input_ids = self.tokenizer.encode(input_text)
+            encoded = self.tokenizer(
+                input_text, return_token_type_ids=is_cross_encoder_request
+            )
+
+            input_ids = encoded["input_ids"]
+            if is_cross_encoder_request:
+                input_ids = encoded["input_ids"][0]
+                token_type_ids = encoded.get("token_type_ids", [None])[0]
 
         if self.mm_processor and obj.contains_mm_input():
             image_inputs = await self.mm_processor.process_mm_data_async(
@@ -479,7 +504,7 @@ class TokenizerManager:
 
         self._validate_token_len(obj, input_ids)
         return self._create_tokenized_object(
-            obj, input_text, input_ids, input_embeds, image_inputs
+            obj, input_text, input_ids, input_embeds, image_inputs, token_type_ids
        )
 
     def _validate_token_len(
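Note (not part of the diff): cross-encoder (reranker) requests now call the tokenizer directly instead of tokenizer.encode() so that token_type_ids can be threaded through to the model; the [0] indexing reflects that a (query, document) pair arrives as a single batch entry. A sketch of what the underlying HF tokenizer returns in this mode (transformers shown for illustration; the model name is an assumption):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("cross-encoder/ms-marco-MiniLM-L-6-v2")
encoded = tok(
    [("what is a panda?", "The giant panda is a bear species native to China.")],
    return_token_type_ids=True,
)
input_ids = encoded["input_ids"][0]                        # query + document, one sequence
token_type_ids = encoded.get("token_type_ids", [None])[0]  # 0s for query, 1s for document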
@@ -518,6 +543,7 @@ class TokenizerManager:
         input_ids: List[int],
         input_embeds: Optional[Union[List[float], None]] = None,
         image_inputs: Optional[Dict] = None,
+        token_type_ids: Optional[List[int]] = None,
     ) -> Union[TokenizedGenerateReqInput, TokenizedEmbeddingReqInput]:
         """Create a tokenized request object from common parameters."""
 
@@ -578,6 +604,7 @@ class TokenizerManager:
                 input_text,
                 input_ids,
                 image_inputs,
+                token_type_ids,
                 sampling_params,
             )
 
@@ -1031,12 +1058,7 @@ class TokenizerManager:
                     "lora_path",
                 ]
             )
-            out_skip_names = set(
-                [
-                    "text",
-                    "output_ids",
-                ]
-            )
+            out_skip_names = set(["text", "output_ids", "embedding"])
         elif self.log_requests_level == 1:
             max_length = 2048
         elif self.log_requests_level == 2:
@@ -1113,13 +1135,21 @@ class TokenizerManager:
             remain_num_req = len(self.rid_to_state)
 
             if self.health_check_failed:
-                # if health check failed, we should exit immediately
+                # if health check failed, exit immediately
                 logger.error(
                     "Signal SIGTERM received while health check failed. Exiting... remaining number of requests: %d",
                     remain_num_req,
                 )
                 break
 
+            elif get_bool_env_var("SGL_FORCE_SHUTDOWN"):
+                # if force shutdown flag set, exit immediately
+                logger.error(
+                    "Signal SIGTERM received while force shutdown flag set. Force exiting... remaining number of requests: %d",
+                    remain_num_req,
+                )
+                break
+
             logger.info(
                 f"Gracefully exiting... remaining number of requests {remain_num_req}"
             )
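Note (not part of the diff): the new branch gives operators an opt-in fast exit. With SGL_FORCE_SHUTDOWN set, SIGTERM abandons in-flight requests instead of draining them. Sketch of the opt-in (the variable name comes from the diff; accepted truthy values are whatever get_bool_env_var recognizes):

import os

os.environ["SGL_FORCE_SHUTDOWN"] = "true"  # checked via get_bool_env_var on SIGTERM
# launch the server in this environment; SIGTERM now exits without waiting
# for the remaining requests to finish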
@@ -1196,7 +1226,7 @@ class TokenizerManager:
                 state.last_output_offset = len(state.output_ids)
             else:
                 state.output_ids.extend(recv_obj.output_ids[i])
-            output_token_ids = state.output_ids
+            output_token_ids = state.output_ids.copy()
 
             out_dict = {
                 "output_ids": output_token_ids,
sglang/srt/managers/tp_worker.py
@@ -35,7 +35,8 @@ from sglang.srt.managers.io_struct import (
     UpdateWeightsFromTensorReqInput,
 )
 from sglang.srt.managers.schedule_batch import ModelWorkerBatch, global_server_args_dict
-from sglang.srt.mem_cache.memory_pool import ReqToTokenPool, TokenToKVPoolAllocator
+from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
+from sglang.srt.mem_cache.memory_pool import ReqToTokenPool
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
 from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.server_args import ServerArgs
@@ -57,7 +58,7 @@ class TpModelWorker:
         nccl_port: int,
         is_draft_worker: bool = False,
         req_to_token_pool: Optional[ReqToTokenPool] = None,
-        token_to_kv_pool_allocator: Optional[TokenToKVPoolAllocator] = None,
+        token_to_kv_pool_allocator: Optional[BaseTokenToKVPoolAllocator] = None,
     ):
         # Parse args
         self.tp_size = server_args.tp_size
@@ -147,6 +148,15 @@ class TpModelWorker:
         # A reference make this class has the same member as TpModelWorkerClient
         self.worker = self
 
+        self.hicache_layer_transfer_counter = None
+
+    def register_hicache_layer_transfer_counter(self, counter):
+        self.hicache_layer_transfer_counter = counter
+
+    def set_hicache_consumer(self, consumer_index):
+        if self.hicache_layer_transfer_counter is not None:
+            self.hicache_layer_transfer_counter.set_consumer(consumer_index)
+
     def get_worker_info(self):
         return (
             self.max_total_num_tokens,
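Note (not part of the diff): TpModelWorker (and TpModelWorkerClient below) now expose the same two hooks so the HiCache controller can tell a running batch which host-to-device layer transfer to consume. A hedged sketch of the counter contract these hooks assume; the real counter lives in cache_controller.py and is not shown in this diff:

class LayerTransferCounter:  # hypothetical stand-in for the cache_controller object
    def __init__(self):
        self.consumer_index = -1

    def set_consumer(self, consumer_index: int):
        # Record which queued host->device KV transfer the next forward should read.
        self.consumer_index = consumer_index


# `worker` is assumed to be a TpModelWorker / TpModelWorkerClient instance
worker.register_hicache_layer_transfer_counter(LayerTransferCounter())
worker.set_hicache_consumer(0)  # the scheduler passes hicache_consumer_index per batch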
sglang/srt/managers/tp_worker_overlap_thread.py
@@ -88,6 +88,15 @@ class TpModelWorkerClient:
         if self.device == "cpu":
             self.scheduler_stream.synchronize = lambda: None  # No-op for CPU
 
+        self.hicache_layer_transfer_counter = None
+
+    def register_hicache_layer_transfer_counter(self, counter):
+        self.hicache_layer_transfer_counter = counter
+
+    def set_hicache_consumer(self, consumer_index):
+        if self.hicache_layer_transfer_counter is not None:
+            self.hicache_layer_transfer_counter.set_consumer(consumer_index)
+
     def get_worker_info(self):
         return self.worker.get_worker_info()
 
@@ -146,6 +155,8 @@ class TpModelWorkerClient:
         input_ids = model_worker_batch.input_ids
         resolve_future_token_ids(input_ids, self.future_token_ids_map)
 
+        # update the consumer index of hicache to the running batch
+        self.set_hicache_consumer(model_worker_batch.hicache_consumer_index)
         # Run forward
         logits_output, next_token_ids, can_run_cuda_graph = (
             self.worker.forward_batch_generation(
sglang/srt/mem_cache/allocator.py (renamed from sglang/srt/mem_cache/paged_allocator.py)
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 """
 Copyright 2025 SGLang Team
 Licensed under the Apache License, Version 2.0 (the "License");
@@ -17,13 +19,132 @@ limitations under the License.
 Page-aligned memory pool.
 """
 
+import abc
+from typing import TYPE_CHECKING
+
 import torch
 import triton
 import triton.language as tl
 
-from sglang.srt.mem_cache.memory_pool import KVCache
 from sglang.srt.utils import get_bool_env_var, next_power_of_2
 
+if TYPE_CHECKING:
+    from sglang.srt.mem_cache.memory_pool import KVCache
+
+
+class BaseTokenToKVPoolAllocator(abc.ABC):
+    @abc.abstractmethod
+    def __init__(
+        self,
+        size: int,
+        page_size: int,
+        dtype: torch.dtype,
+        device: str,
+        kvcache: KVCache,
+    ):
+        self.size = size
+        self.page_size = page_size
+        self.dtype = dtype
+        self.device = device
+        self._kvcache = kvcache
+
+        self.free_pages = None
+        self.is_not_in_free_group = True
+        self.free_group = []
+
+    def debug_print(self) -> str:
+        return ""
+
+    def available_size(self):
+        return len(self.free_pages) * self.page_size
+
+    def get_kvcache(self):
+        return self._kvcache
+
+    def restore_state(self, free_pages):
+        self.free_pages = free_pages
+
+    def backup_state(self):
+        return self.free_pages
+
+    def free_group_begin(self):
+        self.is_not_in_free_group = False
+        self.free_group = []
+
+    def free_group_end(self):
+        self.is_not_in_free_group = True
+        if self.free_group:
+            self.free(torch.cat(self.free_group))
+
+    def get_cpu_copy(self, *args, **kwargs):
+        # FIXME: reuse the get_cpu_copy after paged allocator is implemented
+        raise NotImplementedError()
+
+    def load_cpu_copy(self, *args, **kwargs):
+        # FIXME: reuse the load_cpu_copy after paged allocator is implemented
+        raise NotImplementedError()
+
+    def alloc_extend(self, *args, **kwargs):
+        raise NotImplementedError("alloc_extend is only for paged allocator")
+
+    def alloc_decode(self, *args, **kwargs):
+        raise NotImplementedError("alloc_decode is only for paged allocator")
+
+    @abc.abstractmethod
+    def clear(self):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def alloc(self, need_size: int):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def free(self, free_index: torch.Tensor):
+        raise NotImplementedError()
+
+
+class TokenToKVPoolAllocator(BaseTokenToKVPoolAllocator):
+    """An allocator managing the indices to kv cache data."""
+
+    def __init__(self, size: int, dtype: torch.dtype, device: str, kvcache: KVCache):
+        super().__init__(size, 1, dtype, device, kvcache)
+        self.clear()
+
+    def clear(self):
+        # The padded slot 0 is used for writing dummy outputs from padded tokens.
+        self.free_pages = torch.arange(
+            1, self.size + 1, dtype=torch.int64, device=self.device
+        )
+        self.is_not_in_free_group = True
+        self.free_group = []
+
+    def available_size(self):
+        # To avoid minor "len(free_pages) * 1" overhead
+        return len(self.free_pages)
+
+    def alloc(self, need_size: int):
+        if need_size > len(self.free_pages):
+            return None
+
+        select_index = self.free_pages[:need_size]
+        self.free_pages = self.free_pages[need_size:]
+        return select_index
+
+    def free(self, free_index: torch.Tensor):
+        if free_index.numel() == 0:
+            return
+
+        if self.is_not_in_free_group:
+            self.free_pages = torch.cat((self.free_pages, free_index))
+        else:
+            self.free_group.append(free_index)
+
+    def get_cpu_copy(self, indices):
+        return self._kvcache.get_cpu_copy(indices)
+
+    def load_cpu_copy(self, kv_cache_cpu, indices):
+        return self._kvcache.load_cpu_copy(kv_cache_cpu, indices)
+
 
 @triton.jit
 def alloc_extend_kernel(
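Note (not part of the diff): a quick sketch of the token-granular allocator's lifecycle, runnable on CPU with the KV cache object stubbed out (kvcache=None is fine here because only get_cpu_copy/load_cpu_copy touch it); the module path follows the rename in file 106 of the list above:

import torch

from sglang.srt.mem_cache.allocator import TokenToKVPoolAllocator

alloc = TokenToKVPoolAllocator(size=8, dtype=torch.float16, device="cpu", kvcache=None)
idx = alloc.alloc(3)       # tensor([1, 2, 3]); slot 0 is reserved for padded tokens
alloc.free_group_begin()   # batch several frees into a single torch.cat
alloc.free(idx[:1])
alloc.free(idx[1:])
alloc.free_group_end()
assert alloc.available_size() == 8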
@@ -154,7 +275,7 @@ def alloc_decode_kernel(
     tl.store(out_indices + pid, page * page_size)
 
 
-class PagedTokenToKVPoolAllocator:
+class PagedTokenToKVPoolAllocator(BaseTokenToKVPoolAllocator):
     """
     An allocator managing the indices to kv cache data.
 
@@ -172,26 +293,11 @@ class PagedTokenToKVPoolAllocator:
         device: str,
         kvcache: KVCache,
     ):
-        self.size = size
-        self.dtype = dtype
-        self.device = device
-        self.page_size = page_size
+        super().__init__(size, page_size, dtype, device, kvcache)
         self.num_pages = size // page_size
-
-        self.free_pages = None
-        self.is_not_in_free_group = True
-        self.free_group = []
-        self.clear()
         self.debug_mode = get_bool_env_var("SGLANG_DEBUG_MEMORY_POOL")
-
-        self._kvcache = kvcache
         self.ret_values = torch.empty((), dtype=torch.int64, device=self.device)
-
-    def available_size(self):
-        return len(self.free_pages) * self.page_size
-
-    def get_kvcache(self):
-        return self._kvcache
+        self.clear()
 
     def alloc(self, need_size: int):
         # page-aligned allocation, returning contiguous indices of pages
@@ -298,21 +404,6 @@ class PagedTokenToKVPoolAllocator:
         if self.debug_mode:
             assert len(torch.unique(self.free_pages)) == len(self.free_pages)
 
-    def free_group_begin(self):
-        self.is_not_in_free_group = False
-        self.free_group = []
-
-    def free_group_end(self):
-        self.is_not_in_free_group = True
-        if self.free_group:
-            self.free(torch.cat(self.free_group))
-
-    def backup_state(self):
-        return self.free_pages
-
-    def restore_state(self, free_pages):
-        self.free_pages = free_pages
-
     def clear(self):
         # The padded slot 0 is used for writing dummy outputs from padded tokens.
         self.free_pages = torch.arange(
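Note (not part of the diff): with the shared bookkeeping hoisted into BaseTokenToKVPoolAllocator, the paged subclass keeps only its page math. The units the base class now fixes for both subclasses are worth spelling out:

# Pages are the allocation unit, tokens the reporting unit (base class):
#   available_size() == len(free_pages) * page_size
# For the token allocator, page_size == 1, so the two coincide.
size, page_size = 16, 4
num_pages = size // page_size  # 4 pages, as computed in __init__ above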
sglang/srt/mem_cache/base_prefix_cache.py
@@ -1,5 +1,31 @@
 from abc import ABC, abstractmethod
-from typing import Any, List, Tuple
+from typing import TYPE_CHECKING, Any, List, NamedTuple, Tuple
+
+import torch
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import Req
+else:
+    Req = Any  # Placeholder for Req type when not type checking
+
+
+class MatchResult(NamedTuple):
+    """Result of a prefix match operation.
+
+    Attributes:
+        device_indices  : Indices of the KV cache on the device matched by common prefix.
+        last_device_node: The last TreeNode on the device that was matched.
+        last_host_node  : The last TreeNode on the host that was matched.
+                          Note that if HiCache is not enabled,
+                          this **must** be the same as `last_device_node`.
+        host_hit_length : Length of the KV cache hit on the host, if applicable.
+                          0 if HiCache is not enabled.
+    """
+
+    device_indices: torch.Tensor
+    last_device_node: Any
+    last_host_node: Any
+    host_hit_length: int = 0
 
 
 class BasePrefixCache(ABC):
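Note (not part of the diff): MatchResult replaces the old bare (indices, last_node) return convention with named fields, making the HiCache-related values explicit. A sketch of the documented invariant for a device-only cache (TreeNode stubbed with a plain object):

import torch

from sglang.srt.mem_cache.base_prefix_cache import MatchResult

node = object()  # stand-in for a radix TreeNode
res = MatchResult(
    device_indices=torch.empty(0, dtype=torch.int64),
    last_device_node=node,
    last_host_node=node,  # must equal last_device_node when HiCache is disabled
)
assert res.host_hit_length == 0  # default: no host-side hit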
@@ -10,19 +36,15 @@ class BasePrefixCache(ABC):
         pass
 
     @abstractmethod
-    def match_prefix(self, **kwargs) -> Tuple[List[int], int]:
+    def match_prefix(self, key: List[int], **kwargs) -> MatchResult:
         pass
 
     @abstractmethod
-    def insert(self, **kwargs):
+    def cache_finished_req(self, req: Req, **kwargs):
         pass
 
     @abstractmethod
-    def cache_finished_req(self, **kwargs):
-        pass
-
-    @abstractmethod
-    def cache_unfinished_req(self, **kwargs):
+    def cache_unfinished_req(self, req: Req, **kwargs):
         pass
 
     @abstractmethod
@@ -49,5 +71,27 @@ class BasePrefixCache(ABC):
     def pretty_print(self):
         raise NotImplementedError()
 
+    def init_load_back(
+        self,
+        last_host_node: Any,
+        host_hit_length: int,
+    ) -> Tuple[torch.Tensor, Any]:
+        """
+        Preparing KV cache loading from host to device.
+        """
+        raise NotImplementedError()
+
+    def ready_to_load_host_cache(self) -> Any:
+        """
+        Notify the cache controller to start the KV cache loading
+        """
+        raise NotImplementedError()
+
+    def check_hicache_events(self) -> Any:
+        """
+        Check HiCache related activities to update radix tree and synchronize across TP workers if needed
+        """
+        raise NotImplementedError()
+
     def take_events(self):
         return []
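Note (not part of the diff): taken together, match_prefix now takes the token key positionally and returns a MatchResult, while the HiCache hooks (init_load_back, ready_to_load_host_cache, check_hicache_events) have default raising/no-op bodies so device-only caches need not override them. A hedged sketch of the scheduler-side flow these hooks imply; the method names are real, the control flow is an assumption, and `cache`/`token_ids` are placeholders:

res = cache.match_prefix(key=token_ids)   # cache: some BasePrefixCache implementation
if res.host_hit_length > 0:               # only nonzero when HiCache is enabled
    device_indices, last_node = cache.init_load_back(
        res.last_host_node, res.host_hit_length
    )
    cache.ready_to_load_host_cache()      # kick off the host->device KV transfer
cache.check_hicache_events()              # later: fold finished loads into the radix tree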