sglang 0.4.5.post1__py3-none-any.whl → 0.4.5.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. sglang/__init__.py +2 -4
  2. sglang/bench_one_batch.py +2 -2
  3. sglang/bench_serving.py +3 -6
  4. sglang/compile_deep_gemm.py +136 -0
  5. sglang/lang/backend/anthropic.py +0 -4
  6. sglang/lang/backend/base_backend.py +1 -1
  7. sglang/lang/backend/openai.py +6 -2
  8. sglang/lang/backend/runtime_endpoint.py +5 -1
  9. sglang/lang/backend/vertexai.py +0 -1
  10. sglang/lang/compiler.py +1 -7
  11. sglang/lang/tracer.py +3 -7
  12. sglang/srt/_custom_ops.py +0 -2
  13. sglang/srt/configs/model_config.py +4 -1
  14. sglang/srt/constrained/outlines_jump_forward.py +14 -1
  15. sglang/srt/constrained/triton_ops/bitmask_ops.py +141 -0
  16. sglang/srt/constrained/xgrammar_backend.py +27 -4
  17. sglang/srt/custom_op.py +0 -62
  18. sglang/srt/disaggregation/decode.py +105 -6
  19. sglang/srt/disaggregation/mini_lb.py +74 -9
  20. sglang/srt/disaggregation/mooncake/conn.py +33 -63
  21. sglang/srt/disaggregation/mooncake/transfer_engine.py +30 -61
  22. sglang/srt/disaggregation/nixl/__init__.py +1 -0
  23. sglang/srt/disaggregation/nixl/conn.py +622 -0
  24. sglang/srt/disaggregation/prefill.py +137 -17
  25. sglang/srt/disaggregation/utils.py +32 -0
  26. sglang/srt/entrypoints/engine.py +4 -0
  27. sglang/srt/entrypoints/http_server.py +3 -7
  28. sglang/srt/entrypoints/verl_engine.py +7 -5
  29. sglang/srt/function_call_parser.py +60 -0
  30. sglang/srt/layers/activation.py +6 -8
  31. sglang/srt/layers/attention/flashattention_backend.py +883 -209
  32. sglang/srt/layers/attention/flashinfer_backend.py +5 -2
  33. sglang/srt/layers/attention/torch_native_backend.py +6 -1
  34. sglang/srt/layers/attention/triton_backend.py +6 -0
  35. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +5 -5
  36. sglang/srt/layers/attention/triton_ops/extend_attention.py +18 -7
  37. sglang/srt/layers/attention/triton_ops/prefill_attention.py +7 -3
  38. sglang/srt/layers/dp_attention.py +1 -1
  39. sglang/srt/layers/layernorm.py +20 -5
  40. sglang/srt/layers/linear.py +17 -3
  41. sglang/srt/layers/moe/ep_moe/layer.py +17 -29
  42. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  43. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +14 -19
  44. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
  45. sglang/srt/layers/moe/topk.py +27 -30
  46. sglang/srt/layers/parameter.py +0 -2
  47. sglang/srt/layers/quantization/__init__.py +1 -0
  48. sglang/srt/layers/quantization/blockwise_int8.py +2 -0
  49. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +9 -2
  50. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +16 -44
  51. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  52. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +153 -0
  53. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +4 -7
  54. sglang/srt/layers/quantization/deep_gemm.py +378 -0
  55. sglang/srt/layers/quantization/fp8.py +115 -132
  56. sglang/srt/layers/quantization/fp8_kernel.py +213 -88
  57. sglang/srt/layers/quantization/fp8_utils.py +189 -264
  58. sglang/srt/layers/quantization/gptq.py +13 -7
  59. sglang/srt/layers/quantization/modelopt_quant.py +2 -2
  60. sglang/srt/layers/quantization/moe_wna16.py +2 -0
  61. sglang/srt/layers/quantization/utils.py +5 -11
  62. sglang/srt/layers/quantization/w8a8_fp8.py +2 -0
  63. sglang/srt/layers/quantization/w8a8_int8.py +7 -7
  64. sglang/srt/layers/radix_attention.py +15 -0
  65. sglang/srt/layers/rotary_embedding.py +9 -8
  66. sglang/srt/layers/sampler.py +7 -12
  67. sglang/srt/lora/backend/base_backend.py +18 -2
  68. sglang/srt/lora/backend/flashinfer_backend.py +1 -1
  69. sglang/srt/lora/backend/triton_backend.py +1 -1
  70. sglang/srt/lora/layers.py +1 -1
  71. sglang/srt/lora/lora.py +1 -1
  72. sglang/srt/lora/lora_manager.py +1 -1
  73. sglang/srt/managers/data_parallel_controller.py +7 -1
  74. sglang/srt/managers/detokenizer_manager.py +0 -1
  75. sglang/srt/managers/io_struct.py +15 -3
  76. sglang/srt/managers/mm_utils.py +4 -3
  77. sglang/srt/managers/multimodal_processor.py +0 -2
  78. sglang/srt/managers/multimodal_processors/base_processor.py +3 -2
  79. sglang/srt/managers/schedule_batch.py +15 -4
  80. sglang/srt/managers/scheduler.py +28 -77
  81. sglang/srt/managers/tokenizer_manager.py +116 -29
  82. sglang/srt/managers/tp_worker.py +1 -0
  83. sglang/srt/mem_cache/hiradix_cache.py +41 -29
  84. sglang/srt/mem_cache/memory_pool.py +38 -15
  85. sglang/srt/model_executor/cuda_graph_runner.py +15 -10
  86. sglang/srt/model_executor/model_runner.py +39 -31
  87. sglang/srt/models/bert.py +398 -0
  88. sglang/srt/models/deepseek.py +1 -1
  89. sglang/srt/models/deepseek_nextn.py +74 -70
  90. sglang/srt/models/deepseek_v2.py +292 -348
  91. sglang/srt/models/llama.py +5 -5
  92. sglang/srt/models/minicpm3.py +31 -203
  93. sglang/srt/models/minicpmo.py +17 -6
  94. sglang/srt/models/qwen2.py +4 -1
  95. sglang/srt/models/qwen2_moe.py +14 -13
  96. sglang/srt/models/qwen3.py +335 -0
  97. sglang/srt/models/qwen3_moe.py +423 -0
  98. sglang/srt/openai_api/adapter.py +71 -4
  99. sglang/srt/openai_api/protocol.py +6 -1
  100. sglang/srt/reasoning_parser.py +0 -1
  101. sglang/srt/sampling/sampling_batch_info.py +2 -3
  102. sglang/srt/server_args.py +86 -72
  103. sglang/srt/speculative/build_eagle_tree.py +2 -2
  104. sglang/srt/speculative/eagle_utils.py +2 -2
  105. sglang/srt/speculative/eagle_worker.py +6 -14
  106. sglang/srt/utils.py +62 -6
  107. sglang/test/runners.py +5 -1
  108. sglang/test/test_block_fp8.py +167 -0
  109. sglang/test/test_custom_ops.py +1 -1
  110. sglang/test/test_utils.py +3 -1
  111. sglang/version.py +1 -1
  112. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/METADATA +5 -5
  113. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/RECORD +116 -110
  114. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/WHEEL +1 -1
  115. sglang/lang/__init__.py +0 -0
  116. sglang/srt/lora/backend/__init__.py +0 -25
  117. sglang/srt/server.py +0 -18
  118. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/licenses/LICENSE +0 -0
  119. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/top_level.txt +0 -0
sglang/srt/managers/scheduler.py

@@ -60,7 +60,8 @@ from sglang.srt.managers.io_struct import (
  CloseSessionReqInput,
  ExpertDistributionReq,
  ExpertDistributionReqOutput,
- FlushCacheReq,
+ FlushCacheReqInput,
+ FlushCacheReqOutput,
  GetInternalStateReq,
  GetInternalStateReqOutput,
  GetWeightsByNameReqInput,
@@ -391,6 +392,7 @@ class Scheduler(
  self.torch_profiler = None
  self.torch_profiler_output_dir: Optional[str] = None
  self.profiler_activities: Optional[List[str]] = None
+ self.profiler_id: Optional[str] = None
  self.profiler_target_forward_ct: Optional[int] = None

  # Init metrics stats
@@ -401,7 +403,7 @@ class Scheduler(
  [
  (TokenizedGenerateReqInput, self.handle_generate_request),
  (TokenizedEmbeddingReqInput, self.handle_embedding_request),
- (FlushCacheReq, self.flush_cache_wrapped),
+ (FlushCacheReqInput, self.flush_cache_wrapped),
  (AbortReq, self.abort_request),
  (OpenSessionReqInput, self.open_session),
  (CloseSessionReqInput, self.close_session),
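The flush-cache path is now a request/response pair instead of the one-way FlushCacheReq. The new classes live in sglang/srt/managers/io_struct.py, which is not shown in this diff; a minimal sketch of what they plausibly look like, inferred from the success field used by flush_cache_wrapped in a later hunk:

```python
# Hypothetical reconstruction, not copied from io_struct.py; the real classes
# may carry additional fields.
from dataclasses import dataclass


@dataclass
class FlushCacheReqInput:
    pass  # flushing needs no payload


@dataclass
class FlushCacheReqOutput:
    success: bool  # whether the scheduler actually flushed its pools
```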
@@ -484,9 +486,11 @@ class Scheduler(
  self.tree_cache = HiRadixCache(
  req_to_token_pool=self.req_to_token_pool,
  token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
- tp_cache_group=self.tp_worker.get_tp_cpu_group(),
+ tp_cache_group=self.tp_cpu_group,
  page_size=self.page_size,
  hicache_ratio=server_args.hicache_ratio,
+ hicache_size=server_args.hicache_size,
+ hicache_write_policy=server_args.hicache_write_policy,
  )
  else:
  self.tree_cache = RadixCache(
@@ -553,7 +557,7 @@ class Scheduler(

  # The decode requests polling kv cache
  self.disagg_decode_transfer_queue = DecodeTransferQueue(
- gloo_group=self.tp_worker.get_attention_tp_cpu_group(),
+ gloo_group=self.attn_tp_cpu_group,
  req_to_metadata_buffer_idx_allocator=req_to_metadata_buffer_idx_allocator,
  metadata_buffers=metadata_buffers,
  )
@@ -568,7 +572,7 @@ class Scheduler(
  scheduler=self,
  transfer_queue=self.disagg_decode_transfer_queue,
  tree_cache=self.tree_cache,
- gloo_group=self.tp_worker.get_attention_tp_cpu_group(),
+ gloo_group=self.attn_tp_cpu_group,
  tp_rank=self.tp_rank,
  tp_size=self.tp_size,
  bootstrap_port=self.server_args.disaggregation_bootstrap_port,
@@ -597,7 +601,7 @@ class Scheduler(
  tp_rank=self.tp_rank,
  tp_size=self.tp_size,
  bootstrap_port=self.server_args.disaggregation_bootstrap_port,
- gloo_group=self.tp_worker.get_attention_tp_cpu_group(),
+ gloo_group=self.attn_tp_cpu_group,
  transfer_backend=self.transfer_backend,
  scheduler=self,
  )
@@ -664,70 +668,6 @@ class Scheduler(

  self.last_batch = batch

- @torch.no_grad()
- def event_loop_normal_disagg_prefill(self):
- """A normal scheduler loop for prefill worker in disaggregation mode."""
-
- while True:
- recv_reqs = self.recv_requests()
- self.process_input_requests(recv_reqs)
- self.waiting_queue.extend(
- self.disagg_prefill_pending_queue.pop_bootstrapped()
- )
- self.process_prefill_chunk()
- batch = self.get_new_batch_prefill()
- self.cur_batch = batch
-
- if batch:
- result = self.run_batch(batch)
- self.process_batch_result_disagg_prefill(batch, result)
-
- if len(self.disagg_prefill_inflight_queue) > 0:
- self.process_disagg_prefill_inflight_queue()
-
- if batch is None and len(self.disagg_prefill_inflight_queue) == 0:
- self.check_memory()
- self.new_token_ratio = self.init_new_token_ratio
-
- self.last_batch = batch
- # HACK (byronhsu): reset the batch_is_full flag because we never enter update_running_batch which resets it
- # Otherwise, it hangs under high concurrency
- self.running_batch.batch_is_full = False
-
- @torch.no_grad()
- def event_loop_normal_disagg_decode(self):
- """A normal scheduler loop for decode worker in disaggregation mode."""
-
- while True:
- recv_reqs = self.recv_requests()
- self.process_input_requests(recv_reqs)
- # polling and allocating kv cache
- self.process_decode_queue()
- batch = self.get_next_disagg_decode_batch_to_run()
- self.cur_batch = batch
-
- if batch:
- # Generate fake extend output.
- if batch.forward_mode.is_extend():
- # Note: Logprobs should be handled on the prefill engine.
- self.stream_output(
- batch.reqs, [False for _ in range(len(batch.reqs))]
- )
- else:
- result = self.run_batch(batch)
- self.process_batch_result(batch, result)
-
- if batch is None and (
- len(self.disagg_decode_transfer_queue.queue)
- + len(self.disagg_decode_prealloc_queue.queue)
- == 0
- ):
- # When the server is idle, do self-check and re-init some states
- self.check_memory()
- self.new_token_ratio = self.init_new_token_ratio
-
- self.last_batch = batch
-
  def recv_requests(self) -> List[Req]:
  """Receive results at tp_rank = 0 and broadcast it to all other TP ranks."""
  if self.attn_tp_rank == 0:
@@ -1659,8 +1599,9 @@ class Scheduler(
  time.sleep(5)
  self.parent_process.send_signal(signal.SIGQUIT)

- def flush_cache_wrapped(self, recv_req: FlushCacheReq):
- self.flush_cache()
+ def flush_cache_wrapped(self, recv_req: FlushCacheReqInput):
+ success = self.flush_cache()
+ return FlushCacheReqOutput(success=success)

  def flush_cache(self):
  """Flush the memory pool and cache."""
@@ -1869,6 +1810,7 @@ class Scheduler(
  recv_req.activities,
  recv_req.with_stack,
  recv_req.record_shapes,
+ recv_req.profile_id,
  )
  else:
  return self.stop_profile()
@@ -1880,6 +1822,7 @@ class Scheduler(
  activities: Optional[List[str]],
  with_stack: Optional[bool],
  record_shapes: Optional[bool],
+ profile_id: Optional[str],
  ) -> None:
  if self.profiler_activities:
  return ProfileReqOutput(
@@ -1894,9 +1837,11 @@ class Scheduler(

  self.torch_profiler_output_dir = output_dir
  self.profiler_activities = activities
+ self.profiler_id = profile_id
  logger.info(
- "Profiling starts. Traces will be saved to: %s",
+ "Profiling starts. Traces will be saved to: %s (with id %s)",
  self.torch_profiler_output_dir,
+ self.profiler_id,
  )

  activity_map = {
@@ -1938,14 +1883,14 @@ class Scheduler(
  self.torch_profiler.export_chrome_trace(
  os.path.join(
  self.torch_profiler_output_dir,
- str(time.time()) + f"-TP-{self.tp_rank}" + ".trace.json.gz",
+ self.profiler_id + f"-TP-{self.tp_rank}" + ".trace.json.gz",
  )
  )

  if "MEM" in self.profiler_activities:
  memory_profile_path = os.path.join(
  self.torch_profiler_output_dir,
- str(time.time()) + f"-TP-{self.tp_rank}-memory" + ".pickle",
+ self.profiler_id + f"-TP-{self.tp_rank}-memory" + ".pickle",
  )
  torch.cuda.memory._dump_snapshot(memory_profile_path)
  torch.cuda.memory._record_memory_history(enabled=None)
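With profiler_id in the file names, every TP rank of one profiling run shares a common prefix (the tokenizer manager sets profile_id to str(time.time()), as shown in a later hunk). A small sketch of the resulting naming scheme; the helper name is illustrative, not part of sglang:

```python
import os


def profile_trace_paths(output_dir: str, profiler_id: str, tp_rank: int) -> "tuple[str, str]":
    # Mirrors the strings built in the hunk above: one chrome trace and one
    # memory snapshot per TP rank, all prefixed by the shared profiler_id.
    chrome_trace = os.path.join(output_dir, f"{profiler_id}-TP-{tp_rank}.trace.json.gz")
    memory_snapshot = os.path.join(output_dir, f"{profiler_id}-TP-{tp_rank}-memory.pickle")
    return chrome_trace, memory_snapshot
```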
@@ -2069,9 +2014,15 @@ def run_scheduler_process(
  else:
  scheduler.event_loop_normal()
  elif disaggregation_mode == DisaggregationMode.PREFILL:
- scheduler.event_loop_normal_disagg_prefill()
+ if scheduler.enable_overlap:
+ scheduler.event_loop_overlap_disagg_prefill()
+ else:
+ scheduler.event_loop_normal_disagg_prefill()
  elif disaggregation_mode == DisaggregationMode.DECODE:
- scheduler.event_loop_normal_disagg_decode()
+ if scheduler.enable_overlap:
+ scheduler.event_loop_overlap_disagg_decode()
+ else:
+ scheduler.event_loop_normal_disagg_decode()

  except Exception:
  traceback = get_exception_traceback()
sglang/srt/managers/tokenizer_manager.py

@@ -66,7 +66,8 @@ from sglang.srt.managers.io_struct import (
  EmbeddingReqInput,
  ExpertDistributionReq,
  ExpertDistributionReqOutput,
- FlushCacheReq,
+ FlushCacheReqInput,
+ FlushCacheReqOutput,
  GenerateReqInput,
  GetInternalStateReq,
  GetInternalStateReqOutput,
@@ -264,6 +265,9 @@ class TokenizerManager:
  self.resume_memory_occupation_communicator = _Communicator(
  self.send_to_scheduler, server_args.dp_size
  )
+ self.flush_cache_communicator = _Communicator(
+ self.send_to_scheduler, server_args.dp_size
+ )
  self.start_profile_communicator = _Communicator(
  self.send_to_scheduler, server_args.dp_size
  )
@@ -314,6 +318,10 @@ class TokenizerManager:
  ResumeMemoryOccupationReqOutput,
  self.resume_memory_occupation_communicator.handle_recv,
  ),
+ (
+ FlushCacheReqOutput,
+ self.flush_cache_communicator.handle_recv,
+ ),
  (
  ProfileReqOutput,
  self.start_profile_communicator.handle_recv,
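Flush-cache now follows the same request/response pattern as the other control operations: a _Communicator broadcasts one request to the schedulers and collects one reply per data-parallel rank. The real _Communicator is defined elsewhere in tokenizer_manager.py and is not shown in this diff; the sketch below is a simplified stand-in to illustrate the fan-out/fan-in idea, not the actual implementation:

```python
import asyncio
from typing import Any, Callable, Generic, List, TypeVar

T = TypeVar("T")


class SimpleCommunicator(Generic[T]):
    """Illustrative stand-in: send one request, then wait for `fan_in` replies."""

    def __init__(self, send_fn: Callable[[Any], None], fan_in: int):
        self._send_fn = send_fn
        self._fan_in = fan_in
        self._results: List[T] = []
        self._done = asyncio.Event()

    async def __call__(self, req: Any) -> List[T]:
        self._results.clear()
        self._done.clear()
        self._send_fn(req)          # broadcast to all schedulers
        await self._done.wait()     # block until every DP rank has answered
        return list(self._results)

    def handle_recv(self, output: T) -> None:
        # Called by the response-dispatch loop for each incoming reply.
        self._results.append(output)
        if len(self._results) == self._fan_in:
            self._done.set()
```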
@@ -415,38 +423,60 @@ class TokenizerManager:
  )
  if image_inputs and "input_ids" in image_inputs:
  input_ids = image_inputs["input_ids"]
- if self.is_generation:
- return_logprob = obj.return_logprob
- logprob_start_len = obj.logprob_start_len
- top_logprobs_num = obj.top_logprobs_num
- token_ids_logprob = obj.token_ids_logprob
- session_params = (
- SessionParams(**obj.session_params) if obj.session_params else None
- )
+
+ self._validate_token_len(obj, input_ids)
+ return self._create_tokenized_object(
+ obj, input_text, input_ids, input_embeds, image_inputs
+ )
+
+ def _validate_token_len(
+ self, obj: Union[GenerateReqInput, EmbeddingReqInput], input_ids: List[int]
+ ) -> None:
+ """Validates that the input token count and the requested token count doesn't exceed the model's context length."""

  input_token_num = len(input_ids) if input_ids is not None else 0
+ # Check if input alone exceeds context length
  if input_token_num >= self.context_len:
  raise ValueError(
  f"The input ({input_token_num} tokens) is longer than the "
  f"model's context length ({self.context_len} tokens)."
  )

+ # Check total tokens (input + max_new_tokens)
+ max_new_tokens = obj.sampling_params.get("max_new_tokens")
  if (
- obj.sampling_params.get("max_new_tokens") is not None
- and obj.sampling_params.get("max_new_tokens") + input_token_num
- >= self.context_len
+ max_new_tokens is not None
+ and (max_new_tokens + input_token_num) >= self.context_len
  ):
- raise ValueError(
+ total_tokens = max_new_tokens + input_token_num
+ error_msg = (
  f"Requested token count exceeds the model's maximum context length "
- f"of {self.context_len} tokens. You requested a total of "
- f"{obj.sampling_params.get('max_new_tokens') + input_token_num} "
+ f"of {self.context_len} tokens. You requested a total of {total_tokens} "
  f"tokens: {input_token_num} tokens from the input messages and "
- f"{obj.sampling_params.get('max_new_tokens')} tokens for the "
- f"completion. Please reduce the number of tokens in the input "
- f"messages or the completion to fit within the limit."
+ f"{max_new_tokens} tokens for the completion. Please reduce the number "
+ f"of tokens in the input messages or the completion to fit within the limit."
+ )
+ raise ValueError(error_msg)
+
+ def _create_tokenized_object(
+ self,
+ obj: Union[GenerateReqInput, EmbeddingReqInput],
+ input_text: str,
+ input_ids: List[int],
+ input_embeds: Optional[Union[List[float], None]] = None,
+ image_inputs: Optional[Dict] = None,
+ ) -> Union[TokenizedGenerateReqInput, TokenizedEmbeddingReqInput]:
+ """Create a tokenized request object from common parameters."""
+
+ if self.is_generation:
+ return_logprob = obj.return_logprob
+ logprob_start_len = obj.logprob_start_len
+ top_logprobs_num = obj.top_logprobs_num
+ token_ids_logprob = obj.token_ids_logprob
+ session_params = (
+ SessionParams(**obj.session_params) if obj.session_params else None
  )

- # Parse sampling parameters
  sampling_params = SamplingParams(**obj.sampling_params)
  sampling_params.normalize(self.tokenizer)
  sampling_params.verify()
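_validate_token_len keeps the two checks together: the prompt alone must fit in the context window, and prompt plus max_new_tokens must fit as well. A standalone restatement with concrete numbers; the function and argument names are illustrative, not the manager's attributes:

```python
from typing import Optional


def check_token_budget(input_tokens: int, max_new_tokens: Optional[int], context_len: int) -> None:
    # Same rule as _validate_token_len above, outside the TokenizerManager.
    if input_tokens >= context_len:
        raise ValueError(
            f"Input ({input_tokens} tokens) exceeds the context length ({context_len})."
        )
    if max_new_tokens is not None and input_tokens + max_new_tokens >= context_len:
        raise ValueError(
            f"{input_tokens} input + {max_new_tokens} new tokens exceed the "
            f"context length of {context_len}."
        )


check_token_budget(7000, 1000, 8192)    # ok: 8000 < 8192
# check_token_budget(7500, 1000, 8192)  # would raise: 8500 >= 8192
```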
@@ -483,6 +513,50 @@ class TokenizerManager:

  return tokenized_obj

+ async def _batch_tokenize_and_process(
+ self, batch_size: int, obj: Union[GenerateReqInput, EmbeddingReqInput]
+ ) -> List[Union[TokenizedGenerateReqInput, TokenizedEmbeddingReqInput]]:
+ """Handle batch tokenization for text inputs only."""
+ logger.debug(f"Starting batch tokenization for {batch_size} text requests")
+
+ # Collect requests and texts
+ requests = [obj[i] for i in range(batch_size)]
+ texts = [req.text for req in requests]
+
+ # Batch tokenize all texts
+ encoded = self.tokenizer(texts)
+ input_ids_list = encoded["input_ids"]
+
+ # Process all requests
+ tokenized_objs = []
+ for i, req in enumerate(requests):
+ self._validate_token_len(obj[i], input_ids_list[i])
+ tokenized_objs.append(
+ self._create_tokenized_object(
+ req, req.text, input_ids_list[i], None, None
+ )
+ )
+ logger.debug(f"Completed batch processing for {batch_size} requests")
+ return tokenized_objs
+
+ def _validate_batch_tokenization_constraints(
+ self, batch_size: int, obj: Union[GenerateReqInput, EmbeddingReqInput]
+ ) -> None:
+ """Validate constraints for batch tokenization processing."""
+ for i in range(batch_size):
+ if self.is_generation and obj[i].image_data:
+ raise ValueError(
+ "For image input processing do not set `enable_tokenizer_batch_encode`."
+ )
+ if obj[i].input_ids is not None:
+ raise ValueError(
+ "Batch tokenization is not needed for pre-tokenized input_ids. Do not set `enable_tokenizer_batch_encode`."
+ )
+ if obj[i].input_embeds is not None:
+ raise ValueError(
+ "Batch tokenization is not needed for input_embeds. Do not set `enable_tokenizer_batch_encode`."
+ )
+
  def _send_one_request(
  self,
  obj: Union[GenerateReqInput, EmbeddingReqInput],
@@ -560,14 +634,27 @@ class TokenizerManager:

  generators = []
  rids = []
+
  if getattr(obj, "parallel_sample_num", 1) == 1:
- # Send all requests
- for i in range(batch_size):
- tmp_obj = obj[i]
- tokenized_obj = await self._tokenize_one_request(tmp_obj)
- self._send_one_request(tmp_obj, tokenized_obj, created_time)
- generators.append(self._wait_one_response(tmp_obj, request))
- rids.append(tmp_obj.rid)
+ if self.server_args.enable_tokenizer_batch_encode:
+ # Validate batch tokenization constraints
+ self._validate_batch_tokenization_constraints(batch_size, obj)
+
+ tokenized_objs = await self._batch_tokenize_and_process(batch_size, obj)
+
+ for i, tokenized_obj in enumerate(tokenized_objs):
+ tmp_obj = obj[i]
+ self._send_one_request(tmp_obj, tokenized_obj, created_time)
+ generators.append(self._wait_one_response(tmp_obj, request))
+ rids.append(tmp_obj.rid)
+ else:
+ # Sequential tokenization and processing
+ for i in range(batch_size):
+ tmp_obj = obj[i]
+ tokenized_obj = await self._tokenize_one_request(tmp_obj)
+ self._send_one_request(tmp_obj, tokenized_obj, created_time)
+ generators.append(self._wait_one_response(tmp_obj, request))
+ rids.append(tmp_obj.rid)
  else:
  # FIXME: When using batch and parallel_sample_num together, the perf is not optimal.
  if batch_size > 128:
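When server_args.enable_tokenizer_batch_encode is set (presumably exposed as a --enable-tokenizer-batch-encode flag), text-only batches go through a single tokenizer call instead of one call per request. A minimal sketch of the underlying batched encode using the HuggingFace tokenizer API; the model name is only an example:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

texts = [
    "What is the capital of France?",
    "Write a haiku about GPUs.",
]
# One batched call returns a list of token-id lists, which is what
# _batch_tokenize_and_process iterates over.
input_ids_list = tokenizer(texts)["input_ids"]

for text, ids in zip(texts, input_ids_list):
    print(f"{len(ids):3d} tokens  <- {text!r}")
```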
@@ -628,9 +715,8 @@ class TokenizerManager:
  except StopAsyncIteration:
  pass

- def flush_cache(self):
- req = FlushCacheReq()
- self.send_to_scheduler.send_pyobj(req)
+ async def flush_cache(self) -> FlushCacheReqOutput:
+ return (await self.flush_cache_communicator(FlushCacheReqInput()))[0]

  def abort_request(self, rid: str):
  if rid not in self.rid_to_state:
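Because flush_cache now round-trips through the scheduler instead of being fire-and-forget, callers can await the result and see whether the flush actually happened (the scheduler typically declines to flush while requests are still in flight). A hedged usage sketch; the handler name and messages are made up:

```python
# Hypothetical caller that already holds a TokenizerManager reference,
# e.g. an HTTP endpoint handler; not taken verbatim from sglang.
async def handle_flush_cache(tokenizer_manager) -> str:
    result = await tokenizer_manager.flush_cache()
    if result.success:
        return "Cache flushed."
    return "Cache not flushed; try again once no requests are running."
```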
@@ -650,6 +736,7 @@ class TokenizerManager:
  output_dir=output_dir,
  num_steps=num_steps,
  activities=activities,
+ profile_id=str(time.time()),
  )
  result = (await self.start_profile_communicator(req))[0]
  if not result.success:
sglang/srt/managers/tp_worker.py

@@ -116,6 +116,7 @@ class TpModelWorker:
  ),
  self.model_runner.req_to_token_pool.size,
  )
+ assert self.max_running_requests > 0, "max_running_request is zero"
  self.max_req_len = min(
  self.model_config.context_len - 1,
  self.max_total_num_tokens - 1,
sglang/srt/mem_cache/hiradix_cache.py

@@ -29,15 +29,17 @@ class HiRadixCache(RadixCache):
  tp_cache_group: torch.distributed.ProcessGroup,
  page_size: int,
  hicache_ratio: float,
+ hicache_size: int,
+ hicache_write_policy: str,
  ):
  self.kv_cache = token_to_kv_pool_allocator.get_kvcache()
  if isinstance(self.kv_cache, MHATokenToKVPool):
  self.token_to_kv_pool_host = MHATokenToKVPoolHost(
- self.kv_cache, hicache_ratio, page_size
+ self.kv_cache, hicache_ratio, hicache_size, page_size
  )
  elif isinstance(self.kv_cache, MLATokenToKVPool):
  self.token_to_kv_pool_host = MLATokenToKVPoolHost(
- self.kv_cache, hicache_ratio, page_size
+ self.kv_cache, hicache_ratio, hicache_size, page_size
  )
  else:
  raise ValueError(f"HiRadixCache only supports MHA and MLA yet")
@@ -50,6 +52,7 @@ class HiRadixCache(RadixCache):
  self.token_to_kv_pool_host,
  page_size,
  load_cache_event=self.load_cache_event,
+ write_policy=hicache_write_policy,
  )

  # record the nodes with ongoing write through
@@ -57,7 +60,9 @@ class HiRadixCache(RadixCache):
  # record the node segments with ongoing load back
  self.ongoing_load_back = {}
  # todo: dynamically adjust the threshold
- self.write_through_threshold = 1
+ self.write_through_threshold = (
+ 1 if hicache_write_policy == "write_through" else 3
+ )
  self.load_back_threshold = 10
  super().__init__(
  req_to_token_pool, token_to_kv_pool_allocator, page_size, disable=False
@@ -76,7 +81,7 @@ class HiRadixCache(RadixCache):
  height += 1
  return height

- def write_backup(self, node: TreeNode):
+ def write_backup(self, node: TreeNode, write_back=False):
  host_indices = self.cache_controller.write(
  device_indices=node.value,
  node_id=node.id,
@@ -90,21 +95,29 @@ class HiRadixCache(RadixCache):
  if host_indices is not None:
  node.host_value = host_indices
  self.ongoing_write_through[node.id] = node
- self.inc_lock_ref(node)
+ if not write_back:
+ # no need to lock nodes if write back
+ self.inc_lock_ref(node)
  else:
- return None
+ return 0

  return len(host_indices)

  def inc_hit_count(self, node: TreeNode):
- if self.cache_controller.write_policy != "write_through_selective":
+ if node.backuped or self.cache_controller.write_policy == "write_back":
  return
  node.hit_count += 1
- if node.host_value is None and node.hit_count > self.write_through_threshold:
+ if node.hit_count >= self.write_through_threshold:
  self.write_backup(node)
  node.hit_count = 0

- def writing_check(self):
+ def writing_check(self, write_back=False):
+ if write_back:
+ # blocking till all write back complete
+ while len(self.ongoing_write_through) > 0:
+ ack_id = self.cache_controller.ack_write_queue.get()
+ del self.ongoing_write_through[ack_id]
+ return
  queue_size = torch.tensor(
  self.cache_controller.ack_write_queue.qsize(), dtype=torch.int
  )
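Write-through is now driven by hit counts rather than policy-specific branches: nodes that already have a host copy (node.backuped) and the write_back policy are skipped, and everything else is backed up once it reaches write_through_threshold hits (1 for write_through, 3 otherwise, per the threshold hunk above). A simplified restatement of that decision, with TreeNode reduced to the two fields that matter here:

```python
from dataclasses import dataclass


@dataclass
class Node:
    hit_count: int = 0
    backuped: bool = False  # stand-in for TreeNode.backuped: a host copy already exists


def threshold_for(policy: str) -> int:
    # Mirrors the earlier hunk: write_through backs up on the first hit, other policies wait.
    return 1 if policy == "write_through" else 3


def should_write_through(node: Node, policy: str) -> bool:
    """Return True when the node should be queued for host write-through now."""
    if node.backuped or policy == "write_back":
        return False  # already on host, or host copies are only made at eviction time
    node.hit_count += 1
    if node.hit_count >= threshold_for(policy):
        node.hit_count = 0
        return True
    return False
```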
@@ -143,28 +156,25 @@ class HiRadixCache(RadixCache):
  heapq.heapify(leaves)

  num_evicted = 0
- pending_nodes = []
+ write_back_nodes = []
  while num_evicted < num_tokens and len(leaves):
  x = heapq.heappop(leaves)

  if x.lock_ref > 0:
  continue

- if x.host_value is None:
+ if not x.backuped:
  if self.cache_controller.write_policy == "write_back":
- num_evicted += self.write_backup(x)
- elif self.cache_controller.write_policy == "write_through_selective":
- num_evicted += self._evict_write_through_selective(x)
+ # write to host if the node is not backuped
+ num_evicted += self.write_backup(x, write_back=True)
+ write_back_nodes.append(x)
  else:
- assert (
- self.cache_controller.write_policy != "write_through"
- ), "write_through should be inclusive"
- raise NotImplementedError
+ num_evicted += self._evict_regular(x)
  else:
- num_evicted += self._evict_write_through(x)
+ num_evicted += self._evict_backuped(x)

  for child in x.parent.children.values():
- if child in pending_nodes:
+ if child in write_back_nodes:
  continue
  if not child.evicted:
  break
@@ -173,12 +183,12 @@ class HiRadixCache(RadixCache):
  heapq.heappush(leaves, x.parent)

  if self.cache_controller.write_policy == "write_back":
- # blocking till all write back complete
- while len(self.ongoing_write_through) > 0:
- self.writing_check()
- time.sleep(0.1)
+ self.writing_check(write_back=True)
+ for node in write_back_nodes:
+ assert node.backuped
+ self._evict_backuped(node)

- def _evict_write_through(self, node: TreeNode):
+ def _evict_backuped(self, node: TreeNode):
  # evict a node already written to host
  num_evicted = self.cache_controller.evict_device(node.value, node.host_value)
  assert num_evicted > 0
@@ -186,7 +196,7 @@ class HiRadixCache(RadixCache):
  node.value = None
  return num_evicted

- def _evict_write_through_selective(self, node: TreeNode):
+ def _evict_regular(self, node: TreeNode):
  # evict a node not initiated write to host
  self.cache_controller.mem_pool_device_allocator.free(node.value)
  num_evicted = len(node.value)
@@ -335,11 +345,13 @@ class HiRadixCache(RadixCache):
  prefix_len = self.key_match_fn(child.key, key)
  if prefix_len < len(child.key):
  new_node = self._split_node(child.key, child, prefix_len)
+ self.inc_hit_count(new_node)
  if not new_node.evicted:
  value.append(new_node.value)
  node = new_node
  break
  else:
+ self.inc_hit_count(child)
  if not child.evicted:
  value.append(child.value)
  node = child
@@ -365,7 +377,7 @@ class HiRadixCache(RadixCache):
  else:
  new_node.value = child.value[:split_len]
  child.value = child.value[split_len:]
- if child.host_value is not None:
+ if child.backuped:
  new_node.host_value = child.host_value[:split_len]
  child.host_value = child.host_value[split_len:]
  child.parent = new_node
@@ -422,8 +434,8 @@ class HiRadixCache(RadixCache):
  node.children[child_key] = new_node
  self.evictable_size_ += len(value)

- if self.cache_controller.write_policy == "write_through":
- self.write_backup(new_node)
+ if self.cache_controller.write_policy != "write_back":
+ self.inc_hit_count(new_node)
  return total_prefix_length

  def _collect_leaves_device(self):