sglang 0.4.8.post1__py3-none-any.whl → 0.4.9.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch_server.py +17 -2
- sglang/bench_serving.py +170 -24
- sglang/srt/configs/internvl.py +4 -2
- sglang/srt/configs/janus_pro.py +1 -1
- sglang/srt/configs/model_config.py +60 -1
- sglang/srt/configs/update_config.py +119 -0
- sglang/srt/conversation.py +69 -1
- sglang/srt/disaggregation/decode.py +21 -5
- sglang/srt/disaggregation/mooncake/conn.py +35 -4
- sglang/srt/disaggregation/nixl/conn.py +6 -6
- sglang/srt/disaggregation/prefill.py +2 -2
- sglang/srt/disaggregation/utils.py +1 -1
- sglang/srt/distributed/parallel_state.py +44 -17
- sglang/srt/entrypoints/EngineBase.py +8 -0
- sglang/srt/entrypoints/engine.py +40 -6
- sglang/srt/entrypoints/http_server.py +111 -24
- sglang/srt/entrypoints/http_server_engine.py +1 -1
- sglang/srt/entrypoints/openai/protocol.py +4 -2
- sglang/srt/eplb/__init__.py +0 -0
- sglang/srt/{managers → eplb}/eplb_algorithms/__init__.py +1 -1
- sglang/srt/{managers → eplb}/eplb_manager.py +2 -4
- sglang/srt/{eplb_simulator → eplb/eplb_simulator}/reader.py +1 -1
- sglang/srt/{managers → eplb}/expert_distribution.py +1 -5
- sglang/srt/{managers → eplb}/expert_location.py +1 -1
- sglang/srt/{managers → eplb}/expert_location_dispatch.py +1 -1
- sglang/srt/{model_executor → eplb}/expert_location_updater.py +17 -1
- sglang/srt/hf_transformers_utils.py +2 -1
- sglang/srt/layers/activation.py +2 -2
- sglang/srt/layers/amx_utils.py +86 -0
- sglang/srt/layers/attention/ascend_backend.py +219 -0
- sglang/srt/layers/attention/flashattention_backend.py +32 -9
- sglang/srt/layers/attention/tbo_backend.py +37 -9
- sglang/srt/layers/communicator.py +20 -2
- sglang/srt/layers/dp_attention.py +9 -3
- sglang/srt/layers/elementwise.py +76 -12
- sglang/srt/layers/flashinfer_comm_fusion.py +202 -0
- sglang/srt/layers/layernorm.py +26 -0
- sglang/srt/layers/linear.py +84 -14
- sglang/srt/layers/logits_processor.py +4 -4
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +215 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +81 -8
- sglang/srt/layers/moe/ep_moe/layer.py +176 -15
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +23 -17
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +3 -2
- sglang/srt/layers/moe/fused_moe_triton/layer.py +211 -74
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +176 -0
- sglang/srt/layers/moe/router.py +60 -22
- sglang/srt/layers/moe/topk.py +10 -28
- sglang/srt/layers/parameter.py +67 -7
- sglang/srt/layers/quantization/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +1 -1
- sglang/srt/layers/quantization/fp8.py +72 -7
- sglang/srt/layers/quantization/fp8_kernel.py +1 -1
- sglang/srt/layers/quantization/fp8_utils.py +1 -2
- sglang/srt/layers/quantization/gptq.py +5 -1
- sglang/srt/layers/quantization/modelopt_quant.py +244 -1
- sglang/srt/layers/quantization/moe_wna16.py +1 -1
- sglang/srt/layers/quantization/quant_utils.py +166 -0
- sglang/srt/layers/quantization/w4afp8.py +264 -0
- sglang/srt/layers/quantization/w8a8_int8.py +52 -1
- sglang/srt/layers/rotary_embedding.py +2 -2
- sglang/srt/layers/vocab_parallel_embedding.py +20 -10
- sglang/srt/lora/lora.py +4 -5
- sglang/srt/lora/lora_manager.py +73 -20
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +30 -19
- sglang/srt/lora/triton_ops/qkv_lora_b.py +30 -19
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +27 -11
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +27 -15
- sglang/srt/managers/cache_controller.py +41 -195
- sglang/srt/managers/configure_logging.py +1 -1
- sglang/srt/managers/io_struct.py +58 -14
- sglang/srt/managers/mm_utils.py +77 -61
- sglang/srt/managers/multimodal_processor.py +2 -6
- sglang/srt/managers/multimodal_processors/qwen_audio.py +94 -0
- sglang/srt/managers/schedule_batch.py +78 -85
- sglang/srt/managers/scheduler.py +130 -64
- sglang/srt/managers/scheduler_output_processor_mixin.py +8 -2
- sglang/srt/managers/session_controller.py +12 -3
- sglang/srt/managers/tokenizer_manager.py +314 -103
- sglang/srt/managers/tp_worker.py +13 -1
- sglang/srt/managers/tp_worker_overlap_thread.py +8 -0
- sglang/srt/mem_cache/allocator.py +290 -0
- sglang/srt/mem_cache/chunk_cache.py +34 -2
- sglang/srt/mem_cache/hiradix_cache.py +2 -0
- sglang/srt/mem_cache/memory_pool.py +402 -66
- sglang/srt/mem_cache/memory_pool_host.py +6 -109
- sglang/srt/mem_cache/multimodal_cache.py +3 -0
- sglang/srt/mem_cache/radix_cache.py +8 -4
- sglang/srt/model_executor/cuda_graph_runner.py +2 -1
- sglang/srt/model_executor/forward_batch_info.py +17 -4
- sglang/srt/model_executor/model_runner.py +297 -56
- sglang/srt/model_loader/loader.py +41 -0
- sglang/srt/model_loader/weight_utils.py +72 -4
- sglang/srt/models/deepseek_nextn.py +1 -3
- sglang/srt/models/deepseek_v2.py +195 -45
- sglang/srt/models/deepseek_vl2.py +3 -5
- sglang/srt/models/gemma3_causal.py +1 -2
- sglang/srt/models/gemma3n_causal.py +4 -3
- sglang/srt/models/gemma3n_mm.py +4 -20
- sglang/srt/models/hunyuan.py +1 -1
- sglang/srt/models/kimi_vl.py +1 -2
- sglang/srt/models/llama.py +10 -4
- sglang/srt/models/llama4.py +32 -45
- sglang/srt/models/llama_eagle3.py +61 -11
- sglang/srt/models/llava.py +5 -5
- sglang/srt/models/minicpmo.py +2 -2
- sglang/srt/models/mistral.py +1 -1
- sglang/srt/models/mllama4.py +402 -89
- sglang/srt/models/phi4mm.py +1 -3
- sglang/srt/models/pixtral.py +3 -7
- sglang/srt/models/qwen2.py +31 -3
- sglang/srt/models/qwen2_5_vl.py +1 -3
- sglang/srt/models/qwen2_audio.py +200 -0
- sglang/srt/models/qwen2_moe.py +32 -6
- sglang/srt/models/qwen2_vl.py +1 -4
- sglang/srt/models/qwen3.py +94 -25
- sglang/srt/models/qwen3_moe.py +68 -21
- sglang/srt/models/vila.py +3 -8
- sglang/srt/{mm_utils.py → multimodal/mm_utils.py} +2 -2
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/base_processor.py +140 -158
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/clip.py +2 -13
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/deepseek_vl_v2.py +4 -11
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3.py +3 -10
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3n.py +5 -20
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/internvl.py +3 -10
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/janus_pro.py +3 -9
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/kimi_vl.py +6 -13
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/llava.py +2 -10
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/minicpm.py +5 -12
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/mlama.py +2 -14
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/mllama4.py +65 -66
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/phi4mm.py +4 -14
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/pixtral.py +3 -9
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/qwen_vl.py +8 -14
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/vila.py +13 -31
- sglang/srt/operations_strategy.py +6 -2
- sglang/srt/reasoning_parser.py +26 -0
- sglang/srt/sampling/sampling_batch_info.py +39 -1
- sglang/srt/server_args.py +84 -22
- sglang/srt/speculative/build_eagle_tree.py +57 -18
- sglang/srt/speculative/eagle_worker.py +6 -4
- sglang/srt/two_batch_overlap.py +203 -27
- sglang/srt/utils.py +343 -163
- sglang/srt/warmup.py +12 -3
- sglang/test/runners.py +10 -1
- sglang/test/test_cutlass_w4a8_moe.py +281 -0
- sglang/test/test_utils.py +15 -3
- sglang/utils.py +5 -5
- sglang/version.py +1 -1
- {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/METADATA +12 -8
- {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/RECORD +157 -146
- sglang/math_utils.py +0 -8
- /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek.py +0 -0
- /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek_vec.py +0 -0
- /sglang/srt/{eplb_simulator → eplb/eplb_simulator}/__init__.py +0 -0
- {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/top_level.txt +0 -0
sglang/srt/managers/scheduler.py
CHANGED
@@ -13,6 +13,7 @@
 # ==============================================================================
 """A scheduler that manages a tensor parallel GPU worker."""
 
+import datetime
 import faulthandler
 import logging
 import os
@@ -58,6 +59,7 @@ from sglang.srt.disaggregation.utils import (
     prepare_abort,
 )
 from sglang.srt.distributed import get_pp_group, get_world_group
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
 from sglang.srt.hf_transformers_utils import (
     get_processor,
     get_tokenizer,
@@ -65,9 +67,6 @@ from sglang.srt.hf_transformers_utils import (
 )
 from sglang.srt.layers.dp_attention import compute_dp_attention_world_info
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
-from sglang.srt.managers.expert_distribution import (
-    get_global_expert_distribution_recorder,
-)
 from sglang.srt.managers.io_struct import (
     AbortReq,
     CloseSessionReqInput,
@@ -82,6 +81,8 @@ from sglang.srt.managers.io_struct import (
     HealthCheckOutput,
     InitWeightsUpdateGroupReqInput,
     InitWeightsUpdateGroupReqOutput,
+    LoadLoRAAdapterReqInput,
+    LoadLoRAAdapterReqOutput,
     OpenSessionReqInput,
     OpenSessionReqOutput,
     ProfileReq,
@@ -99,6 +100,8 @@ from sglang.srt.managers.io_struct import (
     SlowDownReqOutput,
     TokenizedEmbeddingReqInput,
     TokenizedGenerateReqInput,
+    UnloadLoRAAdapterReqInput,
+    UnloadLoRAAdapterReqOutput,
     UpdateWeightFromDiskReqInput,
     UpdateWeightFromDiskReqOutput,
     UpdateWeightsFromDistributedReqInput,
@@ -126,7 +129,8 @@ from sglang.srt.managers.session_controller import Session
 from sglang.srt.managers.tp_worker import TpModelWorker
 from sglang.srt.managers.tp_worker_overlap_thread import TpModelWorkerClient
 from sglang.srt.managers.utils import validate_input_length
-from sglang.srt.mem_cache.
+from sglang.srt.mem_cache.allocator import SWATokenToKVPoolAllocator
+from sglang.srt.mem_cache.chunk_cache import ChunkCache, SWAChunkCache
 from sglang.srt.mem_cache.hiradix_cache import HiRadixCache
 from sglang.srt.mem_cache.radix_cache import RadixCache
 from sglang.srt.metrics.collector import SchedulerMetricsCollector, SchedulerStats
@@ -146,6 +150,7 @@ from sglang.srt.utils import (
     get_available_gpu_memory,
     get_bool_env_var,
     get_zmq_socket,
+    is_cpu,
     kill_itself_when_parent_died,
     point_to_point_pyobj,
     pyspy_dump_schedulers,
@@ -164,6 +169,8 @@ TEST_RETRACT = get_bool_env_var("SGLANG_TEST_RETRACT")
 RECORD_STEP_TIME = get_bool_env_var("SGLANG_RECORD_STEP_TIME")
 GRAMMAR_TIMEOUT = float(os.environ.get("SGLANG_GRAMMAR_TIMEOUT", 300))
 
+_is_cpu = is_cpu()
+
 
 @dataclass
 class GenerationBatchResult:
@@ -415,14 +422,16 @@ class Scheduler(
         self.last_decode_stats_tic = time.perf_counter()
         self.last_prefill_stats_tic = time.perf_counter()
         self.return_health_check_ct = 0
+        self.num_retracted_reqs: int = 0
+        self.num_paused_reqs: int = 0
+        self.kv_transfer_speed_gb_s: float = 0.0
+        self.kv_transfer_latency_ms: float = 0.0
+        self.sessions: Dict[str, Session] = {}
         self.current_stream = torch.get_device_module(self.device).current_stream()
         if self.device == "cpu":
             self.current_stream.synchronize = lambda: None  # No-op for CPU
         self.forward_sleep_time = None
 
-        # Init session info
-        self.sessions: Dict[str, Session] = {}
-
         # Init chunked prefill
         self.chunked_prefill_size = server_args.chunked_prefill_size
         if self.chunked_prefill_size <= 0:  # -1 means disable
@@ -470,26 +479,12 @@ class Scheduler(
             t = threading.Thread(target=self.watchdog_thread, daemon=True)
             t.start()
         self.parent_process = psutil.Process().parent()
+
+        # Init memory saver, profiler and metric stats
         self.memory_saver_adapter = TorchMemorySaverAdapter.create(
             enable=server_args.enable_memory_saver
         )
-
-        # Init profiler
-        self.torch_profiler = None
-        self.torch_profiler_output_dir: Optional[str] = None
-        self.profiler_activities: Optional[List[str]] = None
-        self.profile_id: Optional[str] = None
-        self.profiler_target_forward_ct: Optional[int] = None
-        self.profiler_target_prefill_ct: Optional[int] = None
-        self.profiler_target_decode_ct: Optional[int] = None
-        self.profiler_prefill_ct: Optional[int] = None
-        self.profiler_decode_ct: Optional[int] = None
-        self.profile_by_stage: bool = False
-        self.profile_steps: Optional[int] = None
-        self.profile_in_progress: bool = False
-        self.rpd_profiler = None
-
-        # Init metrics stats
+        self.init_profier()
         self.init_metrics()
         self.init_kv_events(server_args.kv_events_config)
 
@@ -518,9 +513,12 @@ class Scheduler(
                 (SetInternalStateReq, self.set_internal_state),
                 (RpcReqInput, self.handle_rpc_request),
                 (ExpertDistributionReq, self.expert_distribution_handle),
+                (LoadLoRAAdapterReqInput, self.load_lora_adapter),
+                (UnloadLoRAAdapterReqInput, self.unload_lora_adapter),
             ]
         )
 
+        # Init disaggregation
         self.disaggregation_mode = DisaggregationMode(
             self.server_args.disaggregation_mode
         )
@@ -570,7 +568,11 @@ class Scheduler(
             server_args.chunked_prefill_size is not None
             and server_args.disable_radix_cache
         ):
-            self.
+            if self.model_config.is_hybrid:
+                ChunkCacheClass = SWAChunkCache
+            else:
+                ChunkCacheClass = ChunkCache
+            self.tree_cache = ChunkCacheClass(
                 req_to_token_pool=self.req_to_token_pool,
                 token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
                 page_size=self.page_size,
@@ -589,6 +591,12 @@ class Scheduler(
                 hicache_ratio=server_args.hicache_ratio,
                 hicache_size=server_args.hicache_size,
                 hicache_write_policy=server_args.hicache_write_policy,
+                hicache_io_backend=(
+                    "direct"
+                    if server_args.attention_backend
+                    == "fa3"  # hot fix for incompatibility
+                    else server_args.hicache_io_backend
+                ),
             )
             self.tp_worker.register_hicache_layer_transfer_counter(
                 self.tree_cache.cache_controller.layer_done_counter
@@ -615,6 +623,21 @@ class Scheduler(
             )
         )
 
+    def init_profier(self):
+        self.torch_profiler = None
+        self.torch_profiler_output_dir: Optional[str] = None
+        self.profiler_activities: Optional[List[str]] = None
+        self.profile_id: Optional[str] = None
+        self.profiler_target_forward_ct: Optional[int] = None
+        self.profiler_target_prefill_ct: Optional[int] = None
+        self.profiler_target_decode_ct: Optional[int] = None
+        self.profiler_prefill_ct: Optional[int] = None
+        self.profiler_decode_ct: Optional[int] = None
+        self.profile_by_stage: bool = False
+        self.profile_steps: Optional[int] = None
+        self.profile_in_progress: bool = False
+        self.rpd_profiler = None
+
     def init_metrics(self):
         self.last_gen_throughput: float = 0.0
         self.last_input_throughput: float = 0.0
@@ -694,9 +717,6 @@ class Scheduler(
                 transfer_backend=self.transfer_backend,
             )
 
-            # Metric for pre-allocation
-            self.num_tokens_pre_allocated = 0
-
         elif self.disaggregation_mode == DisaggregationMode.PREFILL:
             # *2 for the headroom.
             buffer_size = self.max_running_requests * 2
@@ -915,7 +935,7 @@ class Scheduler(
                 point_to_point_pyobj(
                     recv_reqs,
                     self.pp_rank * self.tp_size + dp_offset,
-                    self.world_group.
+                    self.world_group.device_group,
                     self.pp_rank * self.tp_size + dp_offset,
                     (self.pp_rank + 1) * self.tp_size + dp_offset,
                 )
@@ -962,7 +982,7 @@ class Scheduler(
                 recv_reqs = point_to_point_pyobj(
                     [],
                     self.pp_rank * self.tp_size + dp_offset,
-                    self.world_group.
+                    self.world_group.device_group,
                     (self.pp_rank - 1) * self.tp_size + dp_offset,
                     self.pp_rank * self.tp_size + dp_offset,
                 )
@@ -1087,7 +1107,7 @@ class Scheduler(
             recv_req.session_params is not None
             and recv_req.session_params.id is not None
         ):
-            req.
+            req.set_finish_with_abort(
                 f"Invalid request: session id {recv_req.session_params.id} does not exist"
             )
             self._add_request_to_queue(req)
@@ -1283,9 +1303,8 @@ class Scheduler(
         self.last_input_throughput = self.last_prefill_tokens / gap_latency
         self.last_prefill_tokens = adder.log_input_tokens
 
-        num_used = self.
-        self.
-            + self.tree_cache.evictable_size()
+        usage_msg, num_used = self.token_to_kv_pool_allocator.log_usage(
+            self.tree_cache.evictable_size()
         )
 
         num_new_seq = len(can_run_list)
@@ -1294,17 +1313,19 @@ class Scheduler(
             f"#new-seq: {num_new_seq}, "
             f"#new-token: {adder.log_input_tokens}, "
            f"#cached-token: {adder.log_hit_tokens}, "
-            f"
+            f"{usage_msg}"
         )
 
         if self.disaggregation_mode == DisaggregationMode.PREFILL:
             f += f"#unbootstrapped-req: {len(self.disagg_prefill_bootstrap_queue.queue)}, "
             f += f"#queue-req: {len(self.waiting_queue)}, "
             f += f"#transferring-req: {len(self.disagg_prefill_inflight_queue)}, "
-            f += f"input throughput (token/s): {self.last_input_throughput:.2f} "
+            f += f"input throughput (token/s): {self.last_input_throughput:.2f}, "
         else:
             f += f"#running-req: {running_bs}, "
-            f += f"#queue-req: {len(self.waiting_queue)}"
+            f += f"#queue-req: {len(self.waiting_queue)}, "
+
+        f += f"timestamp: {datetime.datetime.now().isoformat()}"
 
         logger.info(f)
@@ -1337,9 +1358,8 @@ class Scheduler(
         self.last_gen_throughput = self.num_generated_tokens / gap_latency
         self.num_generated_tokens = 0
         num_running_reqs = len(batch.reqs)
-        num_used = self.
-        self.
-            + self.tree_cache.evictable_size()
+        usage_msg, num_used = self.token_to_kv_pool_allocator.log_usage(
+            self.tree_cache.evictable_size()
        )
 
         if RECORD_STEP_TIME:
@@ -1347,12 +1367,7 @@ class Scheduler(
             gap_latency / self.server_args.decode_log_interval
         )
 
-        msg =
-            f"Decode batch. "
-            f"#running-req: {num_running_reqs}, "
-            f"#token: {num_used}, "
-            f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
-        )
+        msg = f"Decode batch. " f"#running-req: {num_running_reqs}, " f"{usage_msg}"
 
         if self.spec_algorithm.is_none():
             spec_accept_length = 0
@@ -1366,13 +1381,14 @@ class Scheduler(
             msg += f"accept len: {spec_accept_length:.2f}, "
 
         if self.disaggregation_mode == DisaggregationMode.DECODE:
-            msg += f"pre-allocated usage: {self.num_tokens_pre_allocated / self.max_total_num_tokens:.2f}, "
+            msg += f"pre-allocated usage: {self.disagg_decode_prealloc_queue.num_tokens_pre_allocated / self.max_total_num_tokens:.2f}, "
             msg += f"#retracted-req: {len(self.disagg_decode_prealloc_queue.retracted_queue)}, "
 
         msg += (
             f"cuda graph: {can_run_cuda_graph}, "
             f"gen throughput (token/s): {self.last_gen_throughput:.2f}, "
-            f"#queue-req: {len(self.waiting_queue)}"
+            f"#queue-req: {len(self.waiting_queue)}, "
+            f"timestamp: {datetime.datetime.now().isoformat()}"
         )
 
         logger.info(msg)
@@ -1390,10 +1406,11 @@ class Scheduler(
         self._publish_kv_events()
 
     def check_memory(self):
-
-        self.token_to_kv_pool_allocator.
-
-
+        if isinstance(self.token_to_kv_pool_allocator, SWATokenToKVPoolAllocator):
+            available_token_size = self.token_to_kv_pool_allocator.full_available_size()
+        else:
+            available_token_size = self.token_to_kv_pool_allocator.available_size()
+        available_size = available_token_size + self.tree_cache.evictable_size()
         protected_size = self.tree_cache.protected_size()
         memory_leak = available_size != (
             self.max_total_num_tokens
@@ -1404,7 +1421,7 @@ class Scheduler(
             msg = (
                 "token_to_kv_pool_allocator memory leak detected! "
                 f"{available_size=}, {protected_size=}, {self.max_total_num_tokens=}\n"
-                f"{
+                f"{available_token_size=}\n"
                 f"{self.tree_cache.evictable_size()=}\n"
             )
             raise ValueError(msg)
@@ -1483,7 +1500,7 @@ class Scheduler(
         if need_dp_attn_preparation and not self.spec_algorithm.is_none():
             # In speculative decoding, prefill batches and decode batches cannot be processed in the same DP attention group.
             # We prepare idle batches in advance to skip preparing decode batches when there are prefill batches in the group.
-            new_batch
+            new_batch = self.prepare_mlp_sync_batch(new_batch)
             need_dp_attn_preparation = new_batch is None
 
         if new_batch is not None:
@@ -1499,7 +1516,7 @@ class Scheduler(
 
         # Handle DP attention
         if need_dp_attn_preparation:
-            ret
+            ret = self.prepare_mlp_sync_batch(ret)
 
         return ret
 
@@ -1916,8 +1933,7 @@ class Scheduler(
         if not disable_cuda_graph:
             local_batch.can_run_dp_cuda_graph = can_cuda_graph
 
-
-        return local_batch, any(is_extend_in_batch)
+        return local_batch
 
     def get_idle_batch(self):
         idle_batch = ScheduleBatch.init_new(
@@ -2104,6 +2120,21 @@ class Scheduler(
     def get_internal_state(self, recv_req: GetInternalStateReq):
         ret = dict(global_server_args_dict)
         ret["last_gen_throughput"] = self.last_gen_throughput
+        ret["memory_usage"] = {
+            "weight": round(
+                self.tp_worker.worker.model_runner.weight_load_mem_usage, 2
+            ),
+            "kvcache": round(
+                self.token_to_kv_pool_allocator.get_kvcache().mem_usage, 2
+            ),
+            "token_capacity": int(self.max_total_num_tokens),
+        }
+
+        if not _is_cpu:
+            ret["memory_usage"]["cuda_graph"] = round(
+                self.tp_worker.worker.model_runner.cuda_graph_mem_usage, 2
+            )
+
         if not self.spec_algorithm.is_none() and self.cum_spec_accept_count > 0:
             ret["avg_spec_accept_length"] = (
                 self.cum_spec_accept_length / self.cum_spec_accept_count
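Note: the hunk above extends `get_internal_state` with a `memory_usage` block. As a rough, hedged illustration (not output from a real run), the returned dictionary could look like the sketch below; the exact unit of the rounded values depends on how `weight_load_mem_usage` and `mem_usage` are tracked by the model runner.

```python
# Illustrative sketch only: shape of the internal state after this change.
# The numbers are made up; "cuda_graph" is reported only when not running on CPU.
internal_state = {
    "last_gen_throughput": 4231.7,
    "memory_usage": {
        "weight": 13.42,           # weight_load_mem_usage, rounded to 2 decimals
        "kvcache": 52.1,           # KV cache pool mem_usage, rounded to 2 decimals
        "token_capacity": 420000,  # int(max_total_num_tokens)
        "cuda_graph": 1.25,        # cuda_graph_mem_usage, skipped when _is_cpu is True
    },
}
```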
@@ -2192,7 +2223,7 @@ class Scheduler(
         # Delete requests in the waiting queue
         to_del = []
         for i, req in enumerate(self.waiting_queue):
-            if req.rid.startswith(recv_req.rid):
+            if recv_req.abort_all or req.rid.startswith(recv_req.rid):
                 to_del.append(i)
 
         # Sort in reverse order to avoid index issues when deleting
@@ -2209,7 +2240,7 @@ class Scheduler(
             # Abort method 2: call `set_finish_with_abort`
             # The request will still run one prefill forward pass.
             # In this case, we change the input_ids to be only one token to make this prefill cheap.
-            if req.rid.startswith(recv_req.rid):
+            if recv_req.abort_all or req.rid.startswith(recv_req.rid):
                 logger.debug(f"Abort grammar queue request. {req.rid=}")
                 if req.grammar:
                     req.grammar.cancel()
@@ -2222,7 +2253,9 @@ class Scheduler(
             reqs = self.running_batch.reqs + self.cur_batch.reqs
 
         for req in reqs:
-            if req.
+            if not req.finished() and (
+                recv_req.abort_all or req.rid.startswith(recv_req.rid)
+            ):
                 # Abort method 3: set `to_abort=True`
                 # The request will still run one decode forward pass.
                 # Then we reuse all existing code to clean up the KV cache allocation.
@@ -2242,6 +2275,36 @@ class Scheduler(
             logger.error(message)
         return UpdateWeightFromDiskReqOutput(success, message, 0)
 
+    def load_lora_adapter(
+        self, recv_req: LoadLoRAAdapterReqInput
+    ) -> LoadLoRAAdapterReqOutput:
+        """In-place loading a new lora adapter from disk or huggingface."""
+
+        result = self.tp_worker.load_lora_adapter(recv_req)
+
+        if result.success:
+            flush_cache_success = self.flush_cache()
+            assert flush_cache_success, "Cache flush failed after loading lora adapter."
+        else:
+            logger.error(result.error_message)
+        return result
+
+    def unload_lora_adapter(
+        self, recv_req: UnloadLoRAAdapterReqInput
+    ) -> UnloadLoRAAdapterReqOutput:
+        """Unload the lora adapter."""
+
+        result = self.tp_worker.unload_lora_adapter(recv_req)
+
+        if result.success:
+            flush_cache_success = self.flush_cache()
+            assert (
+                flush_cache_success
+            ), "Cache flush failed after unloading LoRA weights"
+        else:
+            logger.error(result.error_message)
+        return result
+
     def init_weights_update_group(self, recv_req: InitWeightsUpdateGroupReqInput):
         """Initialize the online model parameter update group."""
         success, message = self.tp_worker.init_weights_update_group(recv_req)
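Note: the two handlers above route through the TP worker and flush the cache on success, so KV entries computed with a different adapter set are not reused. They are registered with the scheduler's type-based request dispatcher (see the `(LoadLoRAAdapterReqInput, self.load_lora_adapter)` entries added earlier in this file). A minimal sketch of that dispatch pattern, using hypothetical request types rather than sglang's real dataclasses:

```python
from dataclasses import dataclass
from typing import Callable, Dict, List, Tuple, Type


@dataclass
class LoadAdapterReq:
    """Hypothetical stand-in for LoadLoRAAdapterReqInput."""
    name: str
    path: str


class TypeBasedDispatcher:
    """Routes a request object to the handler registered for its exact type."""

    def __init__(self, mapping: List[Tuple[Type, Callable]]):
        self._mapping: Dict[Type, Callable] = dict(mapping)

    def __call__(self, req):
        handler = self._mapping.get(type(req))
        if handler is None:
            raise ValueError(f"no handler for {type(req).__name__}")
        return handler(req)


# Register (request type, handler) pairs once, then route each incoming request.
dispatcher = TypeBasedDispatcher(
    [(LoadAdapterReq, lambda r: f"load {r.name} from {r.path}, then flush cache")]
)
print(dispatcher(LoadAdapterReq(name="my-adapter", path="/tmp/adapter")))
```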
@@ -2254,8 +2317,9 @@ class Scheduler(
         """Update the online model parameter."""
         success, message = self.tp_worker.update_weights_from_distributed(recv_req)
         if success:
-
-
+            if recv_req.flush_cache:
+                flush_cache_success = self.flush_cache()
+                assert flush_cache_success, "Cache flush failed after updating weights"
         else:
             logger.error(message)
         return UpdateWeightsFromDistributedReqOutput(success, message)
@@ -2279,9 +2343,8 @@ class Scheduler(
 
     def release_memory_occupation(self, recv_req: ReleaseMemoryOccupationReqInput):
         tags = recv_req.tags
-        import subprocess
 
-        if tags is None:
+        if tags is None or len(tags) == 0:
             tags = [GPU_MEMORY_TYPE_WEIGHTS, GPU_MEMORY_TYPE_KV_CACHE]
 
         if GPU_MEMORY_TYPE_KV_CACHE in tags:
@@ -2292,17 +2355,20 @@ class Scheduler(
             self.stashed_model_static_state = _export_static_state(
                 self.tp_worker.worker.model_runner.model
             )
+            torch.distributed.barrier(self.tp_cpu_group)
             self.memory_saver_adapter.pause(GPU_MEMORY_TYPE_WEIGHTS)
 
         return ReleaseMemoryOccupationReqOutput()
 
     def resume_memory_occupation(self, recv_req: ResumeMemoryOccupationReqInput):
         tags = recv_req.tags
+
         if tags is None or len(tags) == 0:
             tags = [GPU_MEMORY_TYPE_WEIGHTS, GPU_MEMORY_TYPE_KV_CACHE]
 
         if GPU_MEMORY_TYPE_WEIGHTS in tags:
             self.memory_saver_adapter.resume(GPU_MEMORY_TYPE_WEIGHTS)
+            torch.distributed.barrier(self.tp_cpu_group)
             _import_static_state(
                 self.tp_worker.worker.model_runner.model,
                 self.stashed_model_static_state,
sglang/srt/managers/scheduler_output_processor_mixin.py
CHANGED
@@ -521,11 +521,17 @@ class SchedulerOutputProcessorMixin:
                     stream_interval = (
                         req.sampling_params.stream_interval or self.stream_interval
                     )
-                    should_output =
+                    should_output = (
+                        len(req.output_ids) % stream_interval == 1
+                        if not self.model_config.is_multimodal_gen
+                        and stream_interval > 1
+                        else len(req.output_ids) % stream_interval == 0
+                    )
                 else:
                     should_output = (
                         len(req.output_ids) % DEFAULT_FORCE_STREAM_INTERVAL == 0
-
+                        if not self.model_config.is_multimodal_gen
+                        else False
                     )
 
                 if should_output:
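Note: for non-multimodal models with `stream_interval > 1`, the change above switches the modulus test from `== 0` to `== 1`, so the first generated token is flushed to the client immediately instead of after a full interval. A quick worked example, assuming `stream_interval = 4`:

```python
# Effect of the changed streaming condition, illustrative only.
stream_interval = 4
old_flush_points = [n for n in range(1, 13) if n % stream_interval == 0]  # [4, 8, 12]
new_flush_points = [n for n in range(1, 13) if n % stream_interval == 1]  # [1, 5, 9]
print(old_flush_points, new_flush_points)
```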
sglang/srt/managers/session_controller.py
CHANGED
@@ -54,7 +54,7 @@ class SessionReqNode:
            prefix += " -- " + self.childs[0].req.rid
            ret = self.childs[0]._str_helper(prefix)
        for child in self.childs[1:]:
-            prefix = " " * len(origin_prefix) +
+            prefix = " " * len(origin_prefix) + " \- " + child.req.rid
            ret += child._str_helper(prefix)
        return ret
 
@@ -106,14 +106,22 @@ class Session:
                 last_req.origin_input_ids
                 + last_req.output_ids[: last_req.sampling_params.max_new_tokens]
             )
+
+            if session_params.drop_previous_output:
+                input_ids = last_req.origin_input_ids[:]
+
             if session_params.offset and session_params.offset != 0:
                 input_ids = input_ids[: session_params.offset] + req.input_ids
             else:
                 input_ids += req.input_ids
+
             input_ids_unpadded = (
                 last_req.origin_input_ids_unpadded
                 + last_req.output_ids[: last_req.sampling_params.max_new_tokens]
             )
+            if session_params.drop_previous_output:
+                input_ids_unpadded = last_req.origin_input_ids_unpadded[:]
+
             if session_params.offset and session_params.offset != 0:
                 input_ids_unpadded = (
                     input_ids_unpadded[: session_params.offset] + req.input_ids
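Note: a rough walk-through of the new `drop_previous_output` flag with made-up token ids (no `offset` set): when the flag is on, the previous request's generated tokens are dropped before the new turn is appended.

```python
# Hypothetical token ids, for illustration only.
last_origin_input_ids = [1, 2, 3]   # last_req.origin_input_ids
last_output_ids = [7, 8]            # last_req.output_ids (within max_new_tokens)
new_turn_input_ids = [9]            # req.input_ids

# Default behaviour: keep the previous output, then append the new turn.
input_ids = last_origin_input_ids + last_output_ids + new_turn_input_ids  # [1, 2, 3, 7, 8, 9]

# With session_params.drop_previous_output: restart from the original input only.
input_ids = last_origin_input_ids[:] + new_turn_input_ids                 # [1, 2, 3, 9]
```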
@@ -138,10 +146,11 @@ class Session:
                 token_ids_logprob=req.token_ids_logprob,
             )
             if last_req is not None:
-                new_req.multimodal_inputs = last_req.
+                new_req.multimodal_inputs = last_req.multimodal_inputs
                 new_req.tokenizer = tokenizer
+
             if abort:
-                new_req.
+                new_req.set_finish_with_abort("Invalid request session id")
             else:
                 new_req_node = SessionReqNode(new_req, last_req_node)
                 self.req_nodes[req.rid] = new_req_node