sglang 0.5.0rc0__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130)
  1. sglang/__init__.py +8 -3
  2. sglang/bench_one_batch.py +6 -0
  3. sglang/lang/chat_template.py +18 -0
  4. sglang/srt/bench_utils.py +137 -0
  5. sglang/srt/configs/model_config.py +7 -7
  6. sglang/srt/disaggregation/decode.py +8 -3
  7. sglang/srt/disaggregation/mooncake/conn.py +43 -25
  8. sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
  9. sglang/srt/distributed/parallel_state.py +4 -2
  10. sglang/srt/entrypoints/context.py +3 -20
  11. sglang/srt/entrypoints/engine.py +13 -8
  12. sglang/srt/entrypoints/harmony_utils.py +2 -0
  13. sglang/srt/entrypoints/http_server.py +4 -5
  14. sglang/srt/entrypoints/openai/protocol.py +0 -9
  15. sglang/srt/entrypoints/openai/serving_chat.py +59 -265
  16. sglang/srt/entrypoints/openai/tool_server.py +4 -3
  17. sglang/srt/function_call/ebnf_composer.py +1 -0
  18. sglang/srt/function_call/function_call_parser.py +2 -0
  19. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  20. sglang/srt/function_call/gpt_oss_detector.py +331 -0
  21. sglang/srt/function_call/kimik2_detector.py +3 -3
  22. sglang/srt/function_call/qwen3_coder_detector.py +219 -9
  23. sglang/srt/jinja_template_utils.py +6 -0
  24. sglang/srt/layers/attention/aiter_backend.py +370 -107
  25. sglang/srt/layers/attention/ascend_backend.py +3 -0
  26. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  27. sglang/srt/layers/attention/flashattention_backend.py +18 -0
  28. sglang/srt/layers/attention/flashinfer_backend.py +52 -13
  29. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  30. sglang/srt/layers/attention/trtllm_mla_backend.py +119 -22
  31. sglang/srt/layers/attention/vision.py +9 -1
  32. sglang/srt/layers/attention/wave_backend.py +627 -0
  33. sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
  34. sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
  35. sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
  36. sglang/srt/layers/communicator.py +8 -10
  37. sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
  38. sglang/srt/layers/linear.py +1 -0
  39. sglang/srt/layers/moe/cutlass_moe.py +11 -16
  40. sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
  41. sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
  42. sglang/srt/layers/moe/ep_moe/layer.py +60 -2
  43. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  44. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  45. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -9
  46. sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
  47. sglang/srt/layers/moe/topk.py +4 -1
  48. sglang/srt/layers/quantization/__init__.py +5 -3
  49. sglang/srt/layers/quantization/fp8_kernel.py +277 -0
  50. sglang/srt/layers/quantization/fp8_utils.py +22 -10
  51. sglang/srt/layers/quantization/modelopt_quant.py +6 -11
  52. sglang/srt/layers/quantization/mxfp4.py +4 -1
  53. sglang/srt/layers/quantization/w4afp8.py +20 -11
  54. sglang/srt/layers/quantization/w8a8_int8.py +48 -34
  55. sglang/srt/layers/rotary_embedding.py +281 -2
  56. sglang/srt/lora/backend/base_backend.py +3 -23
  57. sglang/srt/lora/layers.py +60 -114
  58. sglang/srt/lora/lora.py +17 -62
  59. sglang/srt/lora/lora_manager.py +12 -48
  60. sglang/srt/lora/lora_registry.py +20 -9
  61. sglang/srt/lora/mem_pool.py +20 -63
  62. sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
  63. sglang/srt/lora/utils.py +25 -58
  64. sglang/srt/managers/cache_controller.py +21 -29
  65. sglang/srt/managers/detokenizer_manager.py +1 -1
  66. sglang/srt/managers/io_struct.py +6 -6
  67. sglang/srt/managers/mm_utils.py +1 -2
  68. sglang/srt/managers/multimodal_processor.py +1 -1
  69. sglang/srt/managers/schedule_batch.py +35 -20
  70. sglang/srt/managers/schedule_policy.py +6 -6
  71. sglang/srt/managers/scheduler.py +15 -7
  72. sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
  73. sglang/srt/managers/tokenizer_manager.py +25 -26
  74. sglang/srt/mem_cache/allocator.py +61 -87
  75. sglang/srt/mem_cache/hicache_storage.py +1 -1
  76. sglang/srt/mem_cache/hiradix_cache.py +34 -24
  77. sglang/srt/mem_cache/lora_radix_cache.py +421 -0
  78. sglang/srt/mem_cache/memory_pool_host.py +33 -35
  79. sglang/srt/mem_cache/radix_cache.py +2 -5
  80. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
  81. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
  82. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
  83. sglang/srt/model_executor/cuda_graph_runner.py +22 -3
  84. sglang/srt/model_executor/forward_batch_info.py +26 -5
  85. sglang/srt/model_executor/model_runner.py +129 -35
  86. sglang/srt/model_loader/loader.py +18 -6
  87. sglang/srt/models/deepseek_v2.py +74 -35
  88. sglang/srt/models/gemma2.py +0 -34
  89. sglang/srt/models/gemma3n_mm.py +8 -9
  90. sglang/srt/models/glm4.py +6 -0
  91. sglang/srt/models/glm4_moe.py +9 -9
  92. sglang/srt/models/glm4v.py +589 -0
  93. sglang/srt/models/glm4v_moe.py +400 -0
  94. sglang/srt/models/gpt_oss.py +136 -19
  95. sglang/srt/models/granite.py +0 -25
  96. sglang/srt/models/llama.py +0 -25
  97. sglang/srt/models/llama4.py +1 -1
  98. sglang/srt/models/qwen2_5_vl.py +7 -3
  99. sglang/srt/models/qwen2_audio.py +10 -9
  100. sglang/srt/models/qwen3.py +0 -24
  101. sglang/srt/models/registry.py +1 -1
  102. sglang/srt/models/torch_native_llama.py +0 -24
  103. sglang/srt/multimodal/processors/base_processor.py +23 -13
  104. sglang/srt/multimodal/processors/glm4v.py +132 -0
  105. sglang/srt/multimodal/processors/qwen_audio.py +4 -2
  106. sglang/srt/reasoning_parser.py +316 -0
  107. sglang/srt/server_args.py +115 -139
  108. sglang/srt/speculative/eagle_worker.py +16 -0
  109. sglang/srt/two_batch_overlap.py +12 -4
  110. sglang/srt/utils.py +3 -3
  111. sglang/srt/weight_sync/tensor_bucket.py +106 -0
  112. sglang/test/attention/test_trtllm_mla_backend.py +186 -36
  113. sglang/test/doc_patch.py +59 -0
  114. sglang/test/few_shot_gsm8k.py +1 -1
  115. sglang/test/few_shot_gsm8k_engine.py +1 -1
  116. sglang/test/run_eval.py +4 -1
  117. sglang/test/simple_eval_common.py +6 -0
  118. sglang/test/simple_eval_gpqa.py +2 -0
  119. sglang/test/test_fp4_moe.py +118 -36
  120. sglang/utils.py +1 -1
  121. sglang/version.py +1 -1
  122. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/METADATA +26 -30
  123. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/RECORD +127 -115
  124. sglang/lang/backend/__init__.py +0 -0
  125. sglang/srt/function_call/harmony_tool_parser.py +0 -130
  126. sglang/srt/lora/backend/flashinfer_backend.py +0 -131
  127. /sglang/{api.py → lang/api.py} +0 -0
  128. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/WHEEL +0 -0
  129. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
  130. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/top_level.txt +0 -0
@@ -169,12 +169,13 @@ class StorageOperation:
         host_indices: torch.Tensor,
         token_ids: List[int],
         last_hash: Optional[str] = None,
+        hash_value: Optional[List[str]] = None,
     ):
         self.host_indices = host_indices
         self.token_ids = token_ids
         self.last_hash = last_hash
         self.completed_tokens = 0
-        self.hash_value = []
+        self.hash_value = hash_value if hash_value is not None else []

         self.id = StorageOperation.counter
         StorageOperation.counter += 1
@@ -259,6 +260,7 @@ class HiCacheController:
             self.storage_backend = MooncakeStore()
             self.get_hash_str = get_hash_str_mooncake
             self.storage_backend.register_buffer(self.mem_pool_host.kv_buffer)
+            assert self.mem_pool_host.layout == "page_first"
         elif storage_backend == "hf3fs":
             from sglang.srt.distributed import get_tensor_model_parallel_rank
             from sglang.srt.mem_cache.storage.hf3fs.storage_hf3fs import (
@@ -433,7 +435,9 @@ class HiCacheController:
         if self.io_backend == "kernel":
             return host_indices.to(self.mem_pool_device.device), device_indices
         elif self.io_backend == "direct":
-            return host_indices, device_indices.cpu()
+            device_indices = device_indices.cpu()
+            host_indices, idx = host_indices.sort()
+            return host_indices, device_indices.index_select(0, idx)
         else:
             raise ValueError(f"Unsupported io backend")
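Note on the `direct` branch above: sorting `host_indices` and permuting `device_indices` with the same index keeps every host/device pair matched while making host-side accesses ordered. A minimal standalone sketch of that pairing (tensor values are illustrative only, not taken from the real pools):

import torch

host_indices = torch.tensor([7, 2, 9, 4])             # illustrative host slots
device_indices = torch.tensor([100, 101, 102, 103])   # matching device slots

sorted_host, perm = host_indices.sort()
paired_device = device_indices.index_select(0, perm)
# sorted_host   -> tensor([2, 4, 7, 9])
# paired_device -> tensor([101, 103, 100, 102]); host slot 7 still maps to device slot 100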
@@ -570,10 +574,6 @@ class HiCacheController:
                     )
                     completed_tokens += self.page_size
                 else:
-                    # operation terminated by controller, release pre-allocated memory
-                    self.mem_pool_host.free(
-                        operation.host_indices[operation.completed_tokens :]
-                    )
                     break

     def mooncake_page_transfer(self, operation):
@@ -599,6 +599,14 @@ class HiCacheController:
                     self.generic_page_transfer(operation, batch_size=128)
                 else:
                     self.generic_page_transfer(operation)
+
+                if self.tp_world_size > 1:
+                    # to ensure all TP workers release the host memory at the same time
+                    torch.distributed.barrier(group=self.prefetch_tp_group)
+                # operation terminated by controller, release pre-allocated memory
+                self.mem_pool_host.free(
+                    operation.host_indices[operation.completed_tokens :]
+                )
             except Empty:
                 continue

@@ -626,7 +634,9 @@ class HiCacheController:
                 continue

             storage_hit_count = 0
-            if self.prefetch_rate_limit_check():
+            if (
+                operation.host_indices is not None
+            ) and self.prefetch_rate_limit_check():
                 last_hash = operation.last_hash
                 tokens_to_fetch = operation.token_ids

@@ -670,7 +680,8 @@ class HiCacheController:
             if storage_hit_count < self.prefetch_threshold:
                 # not to prefetch if not enough benefits
                 self.prefetch_revoke_queue.put(operation.request_id)
-                self.mem_pool_host.free(operation.host_indices)
+                if operation.host_indices is not None:
+                    self.mem_pool_host.free(operation.host_indices)
                 logger.debug(
                     f"Revoking prefetch for request {operation.request_id} due to insufficient hits ({storage_hit_count})."
                 )
@@ -693,12 +704,12 @@ class HiCacheController:
         self,
         host_indices: torch.Tensor,
         token_ids: List[int],
-        last_hash: Optional[str] = None,
+        hash_value: Optional[List[str]] = None,
     ) -> int:
         """
         Write KV caches from host memory to storage backend.
         """
-        operation = StorageOperation(host_indices, token_ids, last_hash)
+        operation = StorageOperation(host_indices, token_ids, hash_value=hash_value)
         self.backup_queue.put(operation)
         return operation.id

@@ -753,24 +764,6 @@ class HiCacheController:
                 if operation is None:
                     continue

-                last_hash = operation.last_hash
-                tokens_to_backup = operation.token_ids
-
-                backup_hit_count = 0
-                remaining_tokens = len(tokens_to_backup)
-                hash_value = []
-                while remaining_tokens >= self.page_size:
-                    last_hash = self.get_hash_str(
-                        tokens_to_backup[
-                            backup_hit_count : backup_hit_count + self.page_size
-                        ],
-                        last_hash,
-                    )
-                    backup_hit_count += self.page_size
-                    hash_value.append(last_hash)
-                    remaining_tokens -= self.page_size
-                operation.hash_value = hash_value
-
                 if self.is_mooncake_backend():
                     self.mooncake_page_backup(operation)
                 elif self.storage_backend_type == "hf3fs":
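Note: with the per-page hashing loop removed from the backup thread, `write_backup` now expects the caller to pass a precomputed `hash_value` list. A hedged sketch of what that caller-side computation could look like, mirroring the deleted loop (`get_hash_str`, `page_size`, `token_ids`, and `controller` stand in for whatever the real call site provides):

hash_values, last_hash = [], None
num_full_pages = len(token_ids) // page_size
for i in range(num_full_pages):
    page = token_ids[i * page_size : (i + 1) * page_size]
    last_hash = get_hash_str(page, last_hash)   # chain each page hash onto the previous one
    hash_values.append(last_hash)
ack_id = controller.write_backup(host_indices, token_ids, hash_value=hash_values)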
@@ -793,7 +786,6 @@ class HiCacheController:
                 self.ack_backup_queue.put(
                     (
                         operation.id,
-                        operation.hash_value[: min_completed_tokens // self.page_size],
                         min_completed_tokens,
                     )
                 )
@@ -216,7 +216,7 @@ class DetokenizerManager:
             rids=recv_obj.rids,
             finished_reasons=recv_obj.finished_reasons,
             output_strs=output_strs,
-            output_ids=recv_obj.decode_ids,
+            output_ids=recv_obj.output_ids,
             prompt_tokens=recv_obj.prompt_tokens,
             completion_tokens=recv_obj.completion_tokens,
             cached_tokens=recv_obj.cached_tokens,
@@ -99,25 +99,24 @@ class GenerateReqInput:
     stream: bool = False
     # Whether to log metrics for this request (e.g. health_generate calls do not log metrics)
     log_metrics: bool = True
+    # Whether to return hidden states
+    return_hidden_states: Union[List[bool], bool] = False

     # The modalities of the image data [image, multi-images, video]
     modalities: Optional[List[str]] = None
+    # Session info for continual prompting
+    session_params: Optional[Union[List[Dict], Dict]] = None
+
     # The path to the LoRA adaptors
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
     # The uid of LoRA adaptors, should be initialized by tokenizer manager
     lora_id: Optional[Union[List[Optional[str]], Optional[str]]] = None

-    # Session info for continual prompting
-    session_params: Optional[Union[List[Dict], Dict]] = None
-
     # Custom logit processor for advanced sampling control. Must be a serialized instance
     # of `CustomLogitProcessor` in python/sglang/srt/sampling/custom_logit_processor.py
     # Use the processor's `to_str()` method to generate the serialized string.
     custom_logit_processor: Optional[Union[List[Optional[str]], str]] = None

-    # Whether to return hidden states
-    return_hidden_states: Union[List[bool], bool] = False
-
     # For disaggregated inference
     bootstrap_host: Optional[Union[List[str], str]] = None
     bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
@@ -456,6 +455,7 @@ class GenerateReqInput:
             log_metrics=self.log_metrics,
             modalities=self.modalities[i] if self.modalities else None,
             lora_path=self.lora_path[i] if self.lora_path is not None else None,
+            lora_id=self.lora_id[i] if self.lora_id is not None else None,
             custom_logit_processor=(
                 self.custom_logit_processor[i]
                 if self.custom_logit_processor is not None
@@ -614,8 +614,7 @@ def general_mm_embed_routine(
         input_ids: Input token IDs tensor
         forward_batch: Batch information for model forward pass
         language_model: Base language model to use
-        image_data_embedding_func: Function to embed image data
-        audio_data_embedding_func: Function to embed audio data
+        data_embedding_funcs: A dictionary mapping from modality type to the corresponding embedding function.
         placeholder_tokens: Token IDs for multimodal placeholders
         **kwargs: Additional arguments passed to language model
@@ -20,7 +20,7 @@ def import_processors():
         try:
             module = importlib.import_module(name)
         except Exception as e:
-            logger.warning(f"Ignore import error when loading {name}: " f"{e}")
+            logger.warning(f"Ignore import error when loading {name}: {e}")
             continue
         all_members = inspect.getmembers(module, inspect.isclass)
         classes = [
@@ -37,6 +37,7 @@ import logging
 import threading
 from enum import Enum, auto
 from http import HTTPStatus
+from itertools import chain
 from typing import TYPE_CHECKING, Any, List, Optional, Set, Tuple, Union

 import numpy as np
@@ -57,6 +58,7 @@ from sglang.srt.mem_cache.allocator import (
 )
 from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
 from sglang.srt.mem_cache.chunk_cache import ChunkCache, SWAChunkCache
+from sglang.srt.mem_cache.lora_radix_cache import LoRAKey, LoRARadixCache
 from sglang.srt.mem_cache.memory_pool import ReqToTokenPool
 from sglang.srt.mem_cache.swa_radix_cache import SWARadixCache
 from sglang.srt.metrics.collector import TimeStats
@@ -638,14 +640,26 @@ class Req:
     ):
         self.fill_ids = self.origin_input_ids + self.output_ids
         if tree_cache is not None:
-            (
-                self.prefix_indices,
-                self.last_node,
-                self.last_host_node,
-                self.host_hit_length,
-            ) = tree_cache.match_prefix(
-                key=self.adjust_max_prefix_ids(),
-            )
+            if isinstance(tree_cache, LoRARadixCache):
+                (
+                    self.prefix_indices,
+                    self.last_node,
+                    self.last_host_node,
+                    self.host_hit_length,
+                ) = tree_cache.match_prefix_with_lora_id(
+                    key=LoRAKey(
+                        lora_id=self.lora_id, token_ids=self.adjust_max_prefix_ids()
+                    ),
+                )
+            else:
+                (
+                    self.prefix_indices,
+                    self.last_node,
+                    self.last_host_node,
+                    self.host_hit_length,
+                ) = tree_cache.match_prefix(
+                    key=self.adjust_max_prefix_ids(),
+                )
         self.extend_input_len = len(self.fill_ids) - len(self.prefix_indices)

     def adjust_max_prefix_ids(self):
@@ -1145,9 +1159,9 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         req_pool_indices_tensor = torch.tensor(req_pool_indices, dtype=torch.int64).to(
             self.device, non_blocking=True
         )
-        input_ids_tensor = torch.tensor(sum(input_ids, []), dtype=torch.int64).to(
-            self.device, non_blocking=True
-        )
+        input_ids_tensor = torch.tensor(
+            list(chain.from_iterable(input_ids)), dtype=torch.int64
+        ).to(self.device, non_blocking=True)
         seq_lens_tensor = torch.tensor(seq_lens, dtype=torch.int64).to(
             self.device, non_blocking=True
         )
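Note on the change above: `sum(input_ids, [])` copies the accumulated list on every addition, so flattening is quadratic in the total token count, while `chain.from_iterable` does a single linear pass. A small self-contained comparison:

from itertools import chain

input_ids = [[1, 2, 3], [4, 5], [6]]          # per-request token id lists
flat = list(chain.from_iterable(input_ids))   # [1, 2, 3, 4, 5, 6], one linear pass
# sum(input_ids, []) gives the same result but rebuilds the growing list at each step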
@@ -1713,15 +1727,16 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         attention_backend_str = global_server_args_dict["prefill_attention_backend"]
         # Create seq_lens_cpu when needed
         if (
-            attention_backend_str == "fa3"
-            or (
-                global_server_args_dict["use_mla_backend"]
-                and attention_backend_str == "flashinfer"
-            )
-            or attention_backend_str == "flashmla"
-            or attention_backend_str == "cutlass_mla"
-            or attention_backend_str == "ascend"
-            or attention_backend_str == "trtllm_mha"
+            attention_backend_str
+            in [
+                "fa3",
+                "flashinfer",
+                "flashmla",
+                "cutlass_mla",
+                "ascend",
+                "trtllm_mha",
+                "aiter",
+            ]
             or global_server_args_dict["enable_two_batch_overlap"]
         ):
             seq_lens_cpu = (
@@ -36,7 +36,7 @@ if TYPE_CHECKING:
 # This can prevent the server from being too conservative.
 # Note that this only clips the estimation in the scheduler but does not change the stop
 # condition. The request can still generate tokens until it hits the unclipped max_new_tokens.
-CLIP_MAX_NEW_TOKENS_ESTIMATION = int(
+CLIP_MAX_NEW_TOKENS = int(
     os.environ.get("SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION", "4096")
 )

@@ -305,7 +305,7 @@ class PrefillAdder:
             [
                 min(
                     (r.sampling_params.max_new_tokens - len(r.output_ids)),
-                    CLIP_MAX_NEW_TOKENS_ESTIMATION,
+                    CLIP_MAX_NEW_TOKENS,
                 )
                 * self.new_token_ratio
                 for r in running_batch.reqs
@@ -388,7 +388,7 @@ class PrefillAdder:
             0,
             req.extend_input_len,
             (
-                min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS_ESTIMATION)
+                min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS)
                 if not truncated
                 else 0
             ),
@@ -477,7 +477,7 @@ class PrefillAdder:
             self._update_prefill_budget(
                 0,
                 req.extend_input_len,
-                min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS_ESTIMATION),
+                min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS),
             )
         else:
             if self.rem_chunk_tokens == 0:
@@ -499,7 +499,7 @@ class PrefillAdder:
             return self.add_one_req_ignore_eos(req, has_chunked_req)

         total_tokens = req.extend_input_len + min(
-            req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS_ESTIMATION
+            req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS
         )

         # adjusting the input_tokens based on host_hit_length and page_size
@@ -544,7 +544,7 @@ class PrefillAdder:
                 input_tokens,
                 min(
                     req.sampling_params.max_new_tokens,
-                    CLIP_MAX_NEW_TOKENS_ESTIMATION,
+                    CLIP_MAX_NEW_TOKENS,
                 ),
             )
         else:
@@ -130,6 +130,7 @@ from sglang.srt.managers.tp_worker_overlap_thread import TpModelWorkerClient
 from sglang.srt.managers.utils import DPBalanceMeta, validate_input_length
 from sglang.srt.mem_cache.chunk_cache import ChunkCache, SWAChunkCache
 from sglang.srt.mem_cache.hiradix_cache import HiRadixCache
+from sglang.srt.mem_cache.lora_radix_cache import LoRARadixCache
 from sglang.srt.mem_cache.radix_cache import RadixCache
 from sglang.srt.mem_cache.swa_radix_cache import SWARadixCache
 from sglang.srt.model_executor.forward_batch_info import ForwardMode, PPProxyTensors
@@ -611,12 +612,7 @@ class Scheduler(
                 hicache_ratio=server_args.hicache_ratio,
                 hicache_size=server_args.hicache_size,
                 hicache_write_policy=server_args.hicache_write_policy,
-                hicache_io_backend=(
-                    "direct"
-                    if server_args.attention_backend
-                    == "fa3"  # hot fix for incompatibility
-                    else server_args.hicache_io_backend
-                ),
+                hicache_io_backend=server_args.hicache_io_backend,
                 hicache_mem_layout=server_args.hicache_mem_layout,
                 hicache_storage_backend=server_args.hicache_storage_backend,
                 hicache_storage_prefetch_policy=server_args.hicache_storage_prefetch_policy,
@@ -635,7 +631,19 @@ class Scheduler(
                 page_size=self.page_size,
                 disable=server_args.disable_radix_cache,
             )
-
+        elif self.enable_lora:
+            assert (
+                not self.enable_hierarchical_cache
+            ), "LoRA radix cache doesn't support hierarchical cache"
+            assert (
+                self.schedule_policy == "fcfs"
+            ), "LoRA radix cache only supports FCFS policy"
+            self.tree_cache = LoRARadixCache(
+                req_to_token_pool=self.req_to_token_pool,
+                token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
+                page_size=self.page_size,
+                disable=server_args.disable_radix_cache,
+            )
         else:
             self.tree_cache = RadixCache(
                 req_to_token_pool=self.req_to_token_pool,
@@ -8,6 +8,18 @@ import torch

 from sglang.srt.managers.io_struct import ProfileReq, ProfileReqOutput, ProfileReqType
 from sglang.srt.model_executor.forward_batch_info import ForwardMode
+from sglang.srt.utils import is_npu
+
+_is_npu = is_npu()
+if _is_npu:
+    import torch_npu
+
+    patches = [
+        ["profiler.profile", torch_npu.profiler.profile],
+        ["profiler.ProfilerActivity.CUDA", torch_npu.profiler.ProfilerActivity.NPU],
+        ["profiler.ProfilerActivity.CPU", torch_npu.profiler.ProfilerActivity.CPU],
+    ]
+    torch_npu._apply_patches(patches)

 logger = logging.getLogger(__name__)

@@ -136,6 +148,13 @@ class SchedulerProfilerMixin:
                 activities=torchprof_activities,
                 with_stack=with_stack if with_stack is not None else True,
                 record_shapes=record_shapes if record_shapes is not None else False,
+                on_trace_ready=(
+                    None
+                    if not _is_npu
+                    else torch_npu.profiler.tensorboard_trace_handler(
+                        self.torch_profiler_output_dir
+                    )
+                ),
             )
             self.torch_profiler.start()
             self.profile_in_progress = True
@@ -166,15 +185,16 @@ class SchedulerProfilerMixin:
         logger.info("Stop profiling" + stage_suffix + "...")
         if self.torch_profiler is not None:
             self.torch_profiler.stop()
-            self.torch_profiler.export_chrome_trace(
-                os.path.join(
-                    self.torch_profiler_output_dir,
-                    self.profile_id
-                    + f"-TP-{self.tp_rank}"
-                    + stage_suffix
-                    + ".trace.json.gz",
+            if not _is_npu:
+                self.torch_profiler.export_chrome_trace(
+                    os.path.join(
+                        self.torch_profiler_output_dir,
+                        self.profile_id
+                        + f"-TP-{self.tp_rank}"
+                        + stage_suffix
+                        + ".trace.json.gz",
+                    )
                 )
-            )
             torch.distributed.barrier(self.tp_cpu_group)

         if self.rpd_profiler is not None:
@@ -269,10 +269,9 @@ class TokenizerManager:
         self.asyncio_tasks = set()

         # Health check
-        self.health_check_failed = False
+        self.server_status = ServerStatus.Starting
         self.gracefully_exit = False
         self.last_receive_tstamp = 0
-        self.server_status = ServerStatus.Starting

         # Dumping
         self.dump_requests_folder = ""  # By default do not dump
@@ -291,8 +290,8 @@ class TokenizerManager:
         self.model_update_result: Optional[Awaitable[UpdateWeightFromDiskReqOutput]] = (
             None
         )
-        self._is_updating = False
-        self._is_updating_cond = asyncio.Condition()
+        self.is_pause = False
+        self.is_pause_cond = asyncio.Condition()

         # LoRA
         # Initialize the `LoRARegistry` with initial LoRA adapter paths provided in `server_args`.
@@ -476,16 +475,20 @@ class TokenizerManager:
         self.auto_create_handle_loop()
         obj.normalize_batch_and_arguments()

-        async with self._is_updating_cond:
-            await self._is_updating_cond.wait_for(lambda: not self._is_updating)
-
         if self.log_requests:
             max_length, skip_names, _ = self.log_request_metadata
             logger.info(
                 f"Receive: obj={dataclass_to_string_truncated(obj, max_length, skip_names=skip_names)}"
             )

+        async with self.is_pause_cond:
+            await self.is_pause_cond.wait_for(lambda: not self.is_pause)
+
         async with self.model_update_lock.reader_lock:
+            if self.server_args.enable_lora and obj.lora_path:
+                # Look up the LoRA ID from the registry and start tracking ongoing LoRA requests.
+                obj.lora_id = await self.lora_registry.acquire(obj.lora_path)
+
             if obj.is_single:
                 tokenized_obj = await self._tokenize_one_request(obj)
                 state = self._send_one_request(obj, tokenized_obj, created_time)
@@ -553,11 +556,6 @@ class TokenizerManager:
         else:
             mm_inputs = None

-        if self.server_args.enable_lora and obj.lora_path:
-            # Start tracking ongoing requests for LoRA adapters and replace the user-friendly LoRA names in
-            # `lora_path` with their corresponding unique LoRA IDs, as required for internal processing.
-            obj.lora_id = await self.lora_registry.acquire(obj.lora_path)
-
         self._validate_one_request(obj, input_ids)
         return self._create_tokenized_object(
             obj, input_text, input_ids, input_embeds, mm_inputs, token_type_ids
@@ -775,10 +773,6 @@ class TokenizerManager:
             msg = f"Finish: obj={dataclass_to_string_truncated(obj, max_length, skip_names=skip_names)}, out={dataclass_to_string_truncated(out, max_length, skip_names=out_skip_names)}"
             logger.info(msg)

-        # Mark ongoing LoRA request as finished.
-        if self.server_args.enable_lora and obj.lora_path:
-            await self.lora_registry.release(obj.lora_id)
-
         # Check if this was an abort/error created by scheduler
         if isinstance(out["meta_info"].get("finish_reason"), dict):
             finish_reason = out["meta_info"]["finish_reason"]
@@ -797,6 +791,11 @@ class TokenizerManager:
                 # Delete the key to prevent resending abort request to the scheduler and
                 # to ensure aborted request state is cleaned up.
                 del self.rid_to_state[state.obj.rid]
+
+                # Mark ongoing LoRA request as finished.
+                if self.server_args.enable_lora and state.obj.lora_path:
+                    await self.lora_registry.release(state.obj.lora_id)
+
                 raise fastapi.HTTPException(
                     status_code=finish_reason["status_code"],
                     detail=finish_reason["message"],
@@ -982,14 +981,14 @@ class TokenizerManager:
         await self.expert_distribution_communicator(ExpertDistributionReq.DUMP_RECORD)

     async def pause_generation(self):
-        async with self._is_updating_cond:
-            self._is_updating = True
+        async with self.is_pause_cond:
+            self.is_pause = True
             self.abort_request(abort_all=True)

     async def continue_generation(self):
-        async with self._is_updating_cond:
-            self._is_updating = False
-            self._is_updating_cond.notify_all()
+        async with self.is_pause_cond:
+            self.is_pause = False
+            self.is_pause_cond.notify_all()

     async def update_weights_from_disk(
         self,
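Note: the renamed `is_pause` / `is_pause_cond` pair acts as an asyncio gate: incoming requests wait on the condition while generation is paused, and `continue_generation` wakes all waiters. A minimal sketch of the same pattern outside the class (module-level names here are illustrative only):

import asyncio

is_pause = False
is_pause_cond = asyncio.Condition()

async def handle_request():
    async with is_pause_cond:
        # block until generation is resumed
        await is_pause_cond.wait_for(lambda: not is_pause)
    # ... proceed with tokenization and dispatch

async def pause_generation():
    global is_pause
    async with is_pause_cond:
        is_pause = True

async def continue_generation():
    global is_pause
    async with is_pause_cond:
        is_pause = False
        is_pause_cond.notify_all()  # wake every request waiting in wait_for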
@@ -1474,7 +1473,7 @@ class TokenizerManager:
         while True:
             remain_num_req = len(self.rid_to_state)

-            if self.health_check_failed:
+            if self.server_status == ServerStatus.UnHealthy:
                 # if health check failed, we should exit immediately
                 logger.error(
                     "Signal SIGTERM received while health check failed. Exiting... remaining number of requests: %d",
@@ -1600,6 +1599,10 @@ class TokenizerManager:
                 meta_info["e2e_latency"] = state.finished_time - state.created_time
                 del self.rid_to_state[rid]

+                # Mark ongoing LoRA request as finished.
+                if self.server_args.enable_lora and state.obj.lora_path:
+                    asyncio.create_task(self.lora_registry.release(state.obj.lora_id))
+
             state.out_list.append(out_dict)
             state.event.set()
@@ -1965,10 +1968,6 @@ class ServerStatus(Enum):
     Up = "Up"
     Starting = "Starting"
     UnHealthy = "UnHealthy"
-    Crashed = "Crashed"
-
-    def is_healthy(self) -> bool:
-        return self == ServerStatus.Up


 def _determine_tensor_transport_mode(server_args: ServerArgs) -> TensorTransportMode: