sglang 0.5.3rc0__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
- sglang/bench_one_batch.py +7 -9
- sglang/bench_one_batch_server.py +321 -31
- sglang/bench_serving.py +10 -3
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/launch_server.py +14 -0
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/falcon_h1.py +360 -0
- sglang/srt/configs/load_config.py +8 -0
- sglang/srt/configs/model_config.py +160 -105
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/constrained/base_grammar_backend.py +1 -0
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +6 -4
- sglang/srt/debug_utils/dumper.py +10 -3
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/common/conn.py +266 -98
- sglang/srt/disaggregation/decode.py +50 -9
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
- sglang/srt/disaggregation/mooncake/conn.py +51 -541
- sglang/srt/disaggregation/nixl/conn.py +148 -39
- sglang/srt/disaggregation/prefill.py +31 -14
- sglang/srt/disaggregation/utils.py +36 -5
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +135 -80
- sglang/srt/entrypoints/engine.py +23 -3
- sglang/srt/entrypoints/grpc_request_manager.py +330 -55
- sglang/srt/entrypoints/grpc_server.py +232 -102
- sglang/srt/entrypoints/http_server.py +49 -9
- sglang/srt/entrypoints/openai/protocol.py +110 -5
- sglang/srt/entrypoints/openai/serving_base.py +25 -6
- sglang/srt/entrypoints/openai/serving_chat.py +178 -49
- sglang/srt/entrypoints/openai/serving_completions.py +5 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
- sglang/srt/entrypoints/openai/serving_responses.py +42 -0
- sglang/srt/environ.py +285 -0
- sglang/srt/eplb/expert_location.py +30 -5
- sglang/srt/function_call/function_call_parser.py +3 -2
- sglang/srt/function_call/glm4_moe_detector.py +3 -3
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +73 -68
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +60 -53
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +3 -0
- sglang/srt/layers/activation.py +7 -6
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +108 -9
- sglang/srt/layers/attention/attention_registry.py +206 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
- sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
- sglang/srt/layers/attention/flashattention_backend.py +41 -8
- sglang/srt/layers/attention/flashinfer_backend.py +112 -194
- sglang/srt/layers/attention/flashinfer_mla_backend.py +11 -15
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +11 -3
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +72 -72
- sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +15 -98
- sglang/srt/layers/attention/mamba/mamba.py +566 -1
- sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/triton_backend.py +42 -9
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +178 -34
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +11 -1
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +2 -0
- sglang/srt/layers/linear.py +21 -4
- sglang/srt/layers/logits_processor.py +15 -2
- sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
- sglang/srt/layers/moe/ep_moe/layer.py +147 -74
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +6 -2
- sglang/srt/layers/moe/fused_moe_triton/layer.py +11 -12
- sglang/srt/layers/moe/token_dispatcher/deepep.py +77 -19
- sglang/srt/layers/moe/utils.py +10 -0
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/fp8.py +2 -2
- sglang/srt/layers/quantization/fp8_utils.py +1 -1
- sglang/srt/layers/quantization/modelopt_quant.py +44 -9
- sglang/srt/layers/quantization/mxfp4.py +12 -4
- sglang/srt/layers/quantization/quark/quark_moe.py +16 -3
- sglang/srt/layers/quantization/w4afp8.py +0 -4
- sglang/srt/layers/quantization/w8a8_int8.py +15 -3
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +52 -4
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +3 -3
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +10 -4
- sglang/srt/lora/lora.py +7 -5
- sglang/srt/lora/lora_manager.py +17 -6
- sglang/srt/lora/mem_pool.py +1 -1
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +7 -5
- sglang/srt/managers/cache_controller.py +42 -142
- sglang/srt/managers/data_parallel_controller.py +11 -46
- sglang/srt/managers/detokenizer_manager.py +11 -11
- sglang/srt/managers/io_struct.py +162 -118
- sglang/srt/managers/mm_utils.py +43 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +17 -17
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +53 -0
- sglang/srt/managers/schedule_batch.py +167 -86
- sglang/srt/managers/schedule_policy.py +143 -16
- sglang/srt/managers/scheduler.py +359 -214
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +98 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +21 -12
- sglang/srt/managers/scheduler_profiler_mixin.py +5 -5
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +111 -5
- sglang/srt/managers/tokenizer_manager.py +84 -136
- sglang/srt/managers/tp_worker.py +39 -29
- sglang/srt/managers/tp_worker_overlap_thread.py +33 -41
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +14 -20
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -1
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +40 -1
- sglang/srt/mem_cache/hiradix_cache.py +119 -32
- sglang/srt/mem_cache/memory_pool.py +188 -10
- sglang/srt/mem_cache/memory_pool_host.py +134 -182
- sglang/srt/mem_cache/radix_cache.py +222 -71
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +173 -58
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +10 -6
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +117 -10
- sglang/srt/mem_cache/swa_radix_cache.py +25 -34
- sglang/srt/metrics/collector.py +82 -120
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +2 -2
- sglang/srt/model_executor/cuda_graph_runner.py +39 -32
- sglang/srt/model_executor/forward_batch_info.py +23 -38
- sglang/srt/model_executor/model_runner.py +131 -183
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/loader.py +14 -10
- sglang/srt/model_loader/weight_utils.py +156 -2
- sglang/srt/models/bailing_moe.py +27 -4
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +536 -153
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/falcon_h1.py +576 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +1 -1
- sglang/srt/models/glm4_moe.py +3 -3
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +1 -1
- sglang/srt/models/glm4v_moe.py +1 -1
- sglang/srt/models/gpt_oss.py +7 -30
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/longcat_flash.py +1 -1
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mllama4.py +15 -4
- sglang/srt/models/qwen2.py +0 -7
- sglang/srt/models/qwen2_5_vl.py +2 -2
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +64 -1
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +31 -3
- sglang/srt/models/qwen3_next.py +36 -9
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +51 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +2 -3
- sglang/srt/multimodal/processors/internvl.py +20 -8
- sglang/srt/multimodal/processors/qwen_vl.py +8 -1
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +20 -2
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +753 -295
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +2 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +3 -1
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -755
- sglang/srt/speculative/eagle_worker.py +57 -25
- sglang/srt/speculative/ngram_utils.py +428 -0
- sglang/srt/speculative/ngram_worker.py +245 -0
- sglang/srt/speculative/spec_info.py +47 -0
- sglang/srt/speculative/spec_utils.py +606 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +32 -6
- sglang/srt/two_batch_overlap.py +8 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +399 -74
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +49 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/run_eval.py +79 -11
- sglang/test/runners.py +1 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_deterministic.py +297 -0
- sglang/test/test_disaggregation_utils.py +12 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +355 -4
- sglang/utils.py +10 -1
- sglang/version.py +1 -1
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +34 -25
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +281 -210
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0
sglang/srt/managers/schedule_batch.py

@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+import enum
+
 # Copyright 2023-2024 SGLang Team
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -35,10 +37,11 @@ import copy
 import dataclasses
 import logging
 import threading
+import time
 from enum import Enum, auto
 from http import HTTPStatus
 from itertools import chain
-from typing import TYPE_CHECKING, Any,
+from typing import TYPE_CHECKING, Any, List, Optional, Set, Tuple, Union
 
 import numpy as np
 import torch
@@ -51,6 +54,7 @@ from sglang.srt.disaggregation.base import BaseKVSender
 from sglang.srt.disaggregation.decode_schedule_batch_mixin import (
     ScheduleBatchDisaggregationDecodeMixin,
 )
+from sglang.srt.disaggregation.utils import DisaggregationMode
 from sglang.srt.distributed.parallel_state import get_tensor_model_parallel_rank
 from sglang.srt.mem_cache.allocator import (
     BaseTokenToKVPoolAllocator,
@@ -58,10 +62,10 @@ from sglang.srt.mem_cache.allocator import (
 )
 from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
 from sglang.srt.mem_cache.chunk_cache import ChunkCache, SWAChunkCache
-from sglang.srt.mem_cache.lora_radix_cache import LoRAKey, LoRARadixCache
 from sglang.srt.mem_cache.memory_pool import HybridReqToTokenPool, ReqToTokenPool
+from sglang.srt.mem_cache.radix_cache import RadixKey
 from sglang.srt.mem_cache.swa_radix_cache import SWARadixCache
-from sglang.srt.metrics.collector import TimeStats
+from sglang.srt.metrics.collector import SchedulerMetricsCollector, TimeStats
 from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode, ForwardMode
 from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
 from sglang.srt.sampling.sampling_params import SamplingParams
@@ -70,8 +74,7 @@ from sglang.srt.utils import flatten_nested_list, support_triton
 
 if TYPE_CHECKING:
     from sglang.srt.configs.model_config import ModelConfig
-    from sglang.srt.speculative.
-    from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
+    from sglang.srt.speculative.spec_info import SpecInput, SpeculativeAlgorithm
 
 INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5
 
@@ -86,6 +89,7 @@ GLOBAL_SERVER_ARGS_KEYS = [
     "disable_flashinfer_cutlass_moe_fp4_allgather",
     "disable_radix_cache",
     "enable_dp_lm_head",
+    "enable_fp32_lm_head",
     "flashinfer_mxfp4_moe_precision",
     "enable_flashinfer_allreduce_fusion",
     "moe_dense_tp_size",
@@ -107,6 +111,9 @@ GLOBAL_SERVER_ARGS_KEYS = [
     "enable_symm_mem",
     "enable_custom_logit_processor",
     "disaggregation_mode",
+    "enable_deterministic_inference",
+    "nsa_prefill",
+    "nsa_decode",
 ]
 
 # Put some global args for easy access
@@ -407,6 +414,23 @@ class MultimodalInputs:
     # other args would be kept intact
 
 
+class RequestStage(str, enum.Enum):
+    # prefill
+    PREFILL_WAITING = "prefill_waiting"
+
+    # disaggregation prefill
+    PREFILL_PREPARE = "prefill_prepare"
+    PREFILL_BOOTSTRAP = "prefill_bootstrap"
+    PREFILL_FORWARD = "prefill_forward"
+    PREFILL_TRANSFER_KV_CACHE = "prefill_transfer_kv_cache"
+
+    # disaggregation decode
+    DECODE_PREPARE = "decode_prepare"
+    DECODE_BOOTSTRAP = "decode_bootstrap"
+    DECODE_WAITING = "decode_waiting"
+    DECODE_TRANSFERRED = "decode_transferred"
+
+
 class Req:
     """The input and output status of a request."""
 
@@ -431,8 +455,12 @@ class Req:
         bootstrap_host: Optional[str] = None,
         bootstrap_port: Optional[int] = None,
         bootstrap_room: Optional[int] = None,
+        disagg_mode: Optional[DisaggregationMode] = None,
         data_parallel_rank: Optional[int] = None,
         vocab_size: Optional[int] = None,
+        priority: Optional[int] = None,
+        metrics_collector: Optional[SchedulerMetricsCollector] = None,
+        extra_key: Optional[str] = None,
     ):
         # Input and output info
         self.rid = rid
@@ -465,6 +493,14 @@ class Req:
         self.sampling_params = sampling_params
         self.custom_logit_processor = custom_logit_processor
         self.return_hidden_states = return_hidden_states
+
+        # extra key for classifying the request (e.g. cache_salt)
+        if lora_id is not None:
+            extra_key = (
+                extra_key or ""
+            ) + lora_id  # lora_id is concatenated to the extra key
+
+        self.extra_key = extra_key
         self.lora_id = lora_id
 
         # Memory pool info
@@ -483,6 +519,7 @@ class Req:
         self.stream = stream
         self.eos_token_ids = eos_token_ids
         self.vocab_size = vocab_size
+        self.priority = priority
 
         # For incremental decoding
         # ----- | --------- read_ids -------|
@@ -512,6 +549,8 @@ class Req:
         self.host_hit_length = 0
         # The node to lock until for swa radix tree lock ref
         self.swa_uuid_for_lock: Optional[int] = None
+        # The prefix length of the last prefix matching
+        self.last_matched_prefix_len: int = 0
 
         # Whether or not if it is chunked. It increments whenever
         # it is chunked, and decrement whenever chunked request is
@@ -573,6 +612,8 @@ class Req:
         ) = None
         self.hidden_states: List[List[float]] = []
         self.hidden_states_tensor = None  # Note: use tensor instead of list to transfer hidden_states when PD + MTP
+        self.output_topk_p = None
+        self.output_topk_index = None
 
         # Embedding (return values)
         self.embedding = None
@@ -590,10 +631,10 @@ class Req:
         self.spec_verify_ct = 0
 
         # For metrics
-        self.
+        self.metrics_collector = metrics_collector
+        self.time_stats: TimeStats = TimeStats(disagg_mode=disagg_mode)
         self.has_log_time_stats: bool = False
-        self.
-        self.queue_time_end = None
+        self.last_tic = time.monotonic()
 
         # For disaggregation
         self.bootstrap_host: str = bootstrap_host
@@ -624,7 +665,21 @@ class Req:
     @property
    def is_prefill_only(self) -> bool:
         """Check if this request is prefill-only (no token generation needed)."""
-
+        # NOTE: when spec is enabled, prefill_only optimizations are disabled
+        return (
+            self.sampling_params.max_new_tokens == 0
+            and global_server_args_dict["speculative_algorithm"] is None
+        )
+
+    def add_latency(self, stage: RequestStage):
+        if self.metrics_collector is None:
+            return
+
+        now = time.monotonic()
+        self.metrics_collector.observe_per_stage_req_latency(
+            stage.value, now - self.last_tic
+        )
+        self.last_tic = now
 
     def extend_image_inputs(self, image_inputs):
         if self.multimodal_inputs is None:
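Note: the `RequestStage` enum and `Req.add_latency` added above implement a simple stopwatch for per-stage request latency: each call attributes the time elapsed since the previous call (or since construction) to the stage that just finished, then resets `last_tic`. A minimal, self-contained sketch of the pattern; `FakeCollector` is a stand-in for `SchedulerMetricsCollector`, and only the method name `observe_per_stage_req_latency` is taken from this diff:

    import time
    from collections import defaultdict

    class FakeCollector:
        """Stand-in for SchedulerMetricsCollector (assumption, not the real class)."""
        def __init__(self):
            self.per_stage = defaultdict(list)

        def observe_per_stage_req_latency(self, stage: str, latency: float):
            self.per_stage[stage].append(latency)

    class MiniReq:
        def __init__(self, collector):
            self.metrics_collector = collector
            self.last_tic = time.monotonic()  # stopwatch starts at construction

        def add_latency(self, stage: str):
            if self.metrics_collector is None:
                return  # metrics disabled: keep the hot path cheap
            now = time.monotonic()
            self.metrics_collector.observe_per_stage_req_latency(stage, now - self.last_tic)
            self.last_tic = now

    collector = FakeCollector()
    req = MiniReq(collector)
    time.sleep(0.01)                    # pretend the request waited in the queue
    req.add_latency("prefill_waiting")  # interval since construction -> waiting stage
    time.sleep(0.02)                    # pretend the prefill forward pass ran
    req.add_latency("prefill_forward")  # interval since last call -> forward stage
    print(dict(collector.per_stage))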
@@ -642,26 +697,17 @@ class Req:
     ):
         self.fill_ids = self.origin_input_ids + self.output_ids
         if tree_cache is not None:
-
-
-
-
-
-
-
-
-
-
-
-            else:
-                (
-                    self.prefix_indices,
-                    self.last_node,
-                    self.last_host_node,
-                    self.host_hit_length,
-                ) = tree_cache.match_prefix(
-                    key=self.adjust_max_prefix_ids(),
-                )
+            (
+                self.prefix_indices,
+                self.last_node,
+                self.last_host_node,
+                self.host_hit_length,
+            ) = tree_cache.match_prefix(
+                key=RadixKey(
+                    token_ids=self.adjust_max_prefix_ids(), extra_key=self.extra_key
+                ),
+            )
+            self.last_matched_prefix_len = len(self.prefix_indices)
         self.extend_input_len = len(self.fill_ids) - len(self.prefix_indices)
 
     def adjust_max_prefix_ids(self):
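Note: `match_prefix` now takes a `RadixKey(token_ids, extra_key)` instead of raw token ids, which is what lets this release delete `LoRARadixCache` (see the removed `lora_radix_cache.py` in the file list): requests with different `extra_key` values (a cache_salt, or the `lora_id` concatenated in the constructor above) simply live in disjoint cache namespaces. A rough dict-based illustration of that partitioning; the real structure is a radix tree, and everything below is a sketch, not sglang API:

    from typing import Optional

    class ToyPrefixCache:
        def __init__(self):
            self.store = {}  # (extra_key, token prefix) -> cached value

        def insert(self, token_ids, extra_key: Optional[str], value):
            self.store[(extra_key or "", tuple(token_ids))] = value

        def match_prefix(self, token_ids, extra_key: Optional[str]):
            # Longest stored prefix under the same extra_key; other keys are invisible.
            for end in range(len(token_ids), 0, -1):
                hit = self.store.get((extra_key or "", tuple(token_ids[:end])))
                if hit is not None:
                    return token_ids[:end], hit
            return [], None

    cache = ToyPrefixCache()
    cache.insert([1, 2, 3], extra_key=None, value="base-model KV")
    cache.insert([1, 2, 3], extra_key="lora-A", value="lora-A KV")

    print(cache.match_prefix([1, 2, 3, 4], extra_key=None)[1])      # base-model KV
    print(cache.match_prefix([1, 2, 3, 4], extra_key="lora-A")[1])  # lora-A KV
    print(cache.match_prefix([1, 2, 3, 4], extra_key="lora-B")[1])  # None: no sharing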
@@ -794,10 +840,10 @@ class Req:
             return
 
         if self.bootstrap_room is not None:
-            prefix = f"Req Time Stats(rid={self.rid}, bootstrap_room={self.bootstrap_room}, input len={len(self.origin_input_ids)}, output len={len(self.output_ids)}, type={self.time_stats.
+            prefix = f"Req Time Stats(rid={self.rid}, bootstrap_room={self.bootstrap_room}, input len={len(self.origin_input_ids)}, output len={len(self.output_ids)}, type={self.time_stats.disagg_mode_str()})"
         else:
-            prefix = f"Req Time Stats(rid={self.rid}, input len={len(self.origin_input_ids)}, output len={len(self.output_ids)}, type={self.time_stats.
-        logger.info(f"{prefix}: {self.time_stats}")
+            prefix = f"Req Time Stats(rid={self.rid}, input len={len(self.origin_input_ids)}, output len={len(self.output_ids)}, type={self.time_stats.disagg_mode_str()})"
+        logger.info(f"{prefix}: {self.time_stats.convert_to_duration()}")
         self.has_log_time_stats = True
 
     def set_finish_with_abort(self, error_msg: str):
@@ -820,10 +866,6 @@ class Req:
         )
 
 
-# Batch id
-bid = 0
-
-
 @dataclasses.dataclass
 class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
     """Store all information of a batch on the scheduler."""
@@ -860,6 +902,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
     token_type_ids: torch.Tensor = None  # shape: [b], int64
     req_pool_indices: torch.Tensor = None  # shape: [b], int64
     seq_lens: torch.Tensor = None  # shape: [b], int64
+    seq_lens_cpu: torch.Tensor = None  # shape: [b], int64
     # The output locations of the KV cache
     out_cache_loc: torch.Tensor = None  # shape: [b], int64
     output_ids: torch.Tensor = None  # shape: [b], int64
@@ -915,7 +958,8 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
 
     # Speculative decoding
     spec_algorithm: SpeculativeAlgorithm = None
-    spec_info: Optional[
+    # spec_info: Optional[SpecInput] = None
+    spec_info: Optional[SpecInput] = None
 
     # Whether to return hidden states
     return_hidden_states: bool = False
@@ -1014,7 +1058,9 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
     def alloc_paged_token_slots_extend(
         self,
         prefix_lens: torch.Tensor,
+        prefix_lens_cpu: torch.Tensor,
         seq_lens: torch.Tensor,
+        seq_lens_cpu: torch.Tensor,
         last_loc: torch.Tensor,
         extend_num_tokens: int,
         backup_state: bool = False,
@@ -1022,7 +1068,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         # Over estimate the number of tokens: assume each request needs a new page.
         num_tokens = (
             extend_num_tokens
-            + len(
+            + len(seq_lens_cpu) * self.token_to_kv_pool_allocator.page_size
         )
         self._evict_tree_cache_if_needed(num_tokens)
 
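Note: the over-estimate reserves one full page per request on top of the exact token count, since in the worst case every request's prefix ends mid-page and its extension has to open a fresh page. A worked example with illustrative numbers:

    # Eviction budget computed by alloc_paged_token_slots_extend (illustrative numbers).
    page_size = 16
    extend_lens = [5, 30, 1]              # new tokens per request in the batch
    extend_num_tokens = sum(extend_lens)  # 36 tokens actually needed

    # Worst case: each of the 3 requests opens a partially filled new page,
    # so ask the evictor for one extra page per request.
    num_tokens = extend_num_tokens + len(extend_lens) * page_size
    print(num_tokens)  # 36 + 3 * 16 = 84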
@@ -1030,7 +1076,12 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
             state = self.token_to_kv_pool_allocator.backup_state()
 
         out_cache_loc = self.token_to_kv_pool_allocator.alloc_extend(
-            prefix_lens,
+            prefix_lens,
+            prefix_lens_cpu,
+            seq_lens,
+            seq_lens_cpu,
+            last_loc,
+            extend_num_tokens,
         )
         if out_cache_loc is None:
             error_msg = (
@@ -1049,6 +1100,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
     def alloc_paged_token_slots_decode(
         self,
         seq_lens: torch.Tensor,
+        seq_lens_cpu: torch.Tensor,
         last_loc: torch.Tensor,
         backup_state: bool = False,
     ):
@@ -1059,7 +1111,9 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         if backup_state:
             state = self.token_to_kv_pool_allocator.backup_state()
 
-        out_cache_loc = self.token_to_kv_pool_allocator.alloc_decode(
+        out_cache_loc = self.token_to_kv_pool_allocator.alloc_decode(
+            seq_lens, seq_lens_cpu, last_loc
+        )
         if out_cache_loc is None:
             error_msg = (
                 f"Decode out of memory. Try to lower your batch size.\n"
@@ -1128,6 +1182,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         self.seq_lens = torch.tensor(seq_lens, dtype=torch.int64).to(
             self.device, non_blocking=True
         )
+        self.seq_lens_cpu = torch.tensor(seq_lens, dtype=torch.int64)
 
         if not decoder_out_cache_loc:
             self.out_cache_loc = torch.zeros(0, dtype=torch.int64).to(
@@ -1176,12 +1231,14 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         seq_lens_tensor = torch.tensor(seq_lens, dtype=torch.int64).to(
             self.device, non_blocking=True
         )
+        seq_lens_cpu = torch.tensor(seq_lens, dtype=torch.int64)
         orig_seq_lens_tensor = torch.tensor(orig_seq_lens, dtype=torch.int32).to(
             self.device, non_blocking=True
         )
         prefix_lens_tensor = torch.tensor(
             prefix_lens, dtype=torch.int64, device=self.device
         )
+        prefix_lens_cpu_tensor = torch.tensor(prefix_lens, dtype=torch.int64)
 
         token_type_ids_tensor = None
         if len(token_type_ids) > 0:
@@ -1308,13 +1365,19 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
             prefix_lens_tensor,
         )
         out_cache_loc = self.alloc_paged_token_slots_extend(
-            prefix_lens_tensor,
+            prefix_lens_tensor,
+            prefix_lens_cpu_tensor,
+            seq_lens_tensor,
+            seq_lens_cpu,
+            last_loc,
+            extend_num_tokens,
         )
 
         # Set fields
         self.input_ids = input_ids_tensor
         self.req_pool_indices = req_pool_indices_tensor
         self.seq_lens = seq_lens_tensor
+        self.seq_lens_cpu = seq_lens_cpu
         self.orig_seq_lens = orig_seq_lens_tensor
         self.out_cache_loc = out_cache_loc
         self.input_embeds = (
@@ -1457,7 +1520,6 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         )
 
         retracted_reqs = []
-        seq_lens_cpu = self.seq_lens.cpu().numpy()
         first_iter = True
         while first_iter or (
             not self.check_decode_mem(selected_indices=sorted_indices)
@@ -1484,37 +1546,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
             idx = sorted_indices.pop()
             req = self.reqs[idx]
             retracted_reqs.append(req)
-
-            if server_args.disaggregation_mode == "decode":
-                req.offload_kv_cache(
-                    self.req_to_token_pool, self.token_to_kv_pool_allocator
-                )
-
-            if isinstance(self.tree_cache, ChunkCache):
-                # ChunkCache does not have eviction
-                token_indices = self.req_to_token_pool.req_to_token[
-                    req.req_pool_idx, : seq_lens_cpu[idx]
-                ]
-                self.token_to_kv_pool_allocator.free(token_indices)
-                self.req_to_token_pool.free(req.req_pool_idx)
-            else:
-                # TODO: apply more fine-grained retraction
-                last_uncached_pos = (
-                    len(req.prefix_indices) // server_args.page_size
-                ) * server_args.page_size
-                token_indices = self.req_to_token_pool.req_to_token[
-                    req.req_pool_idx, last_uncached_pos : seq_lens_cpu[idx]
-                ]
-                self.token_to_kv_pool_allocator.free(token_indices)
-                self.req_to_token_pool.free(req.req_pool_idx)
-
-            # release the last node
-            if self.is_hybrid:
-                self.tree_cache.dec_lock_ref(req.last_node, req.swa_uuid_for_lock)
-            else:
-                self.tree_cache.dec_lock_ref(req.last_node)
-
-            req.reset_for_retract()
+            self.release_req(idx, len(sorted_indices), server_args)
 
         if len(retracted_reqs) == 0:
             # Corner case: only one request left
@@ -1533,7 +1565,45 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         ) / total_max_new_tokens
         new_estimate_ratio = min(1.0, new_estimate_ratio)
 
-        return retracted_reqs, new_estimate_ratio
+        return retracted_reqs, new_estimate_ratio, []
+
+    def release_req(self, idx: int, remaing_req_count: int, server_args: ServerArgs):
+        req = self.reqs[idx]
+        seq_lens_cpu = self.seq_lens_cpu.numpy()
+
+        if server_args.disaggregation_mode == "decode":
+            req.offload_kv_cache(
+                self.req_to_token_pool, self.token_to_kv_pool_allocator
+            )
+        if isinstance(self.tree_cache, ChunkCache):
+            # ChunkCache does not have eviction
+            token_indices = self.req_to_token_pool.req_to_token[
+                req.req_pool_idx, : seq_lens_cpu[idx]
+            ]
+            self.token_to_kv_pool_allocator.free(token_indices)
+            self.req_to_token_pool.free(req.req_pool_idx)
+        else:
+            # TODO: apply more fine-grained retraction
+            last_uncached_pos = (
+                len(req.prefix_indices) // server_args.page_size
+            ) * server_args.page_size
+            token_indices = self.req_to_token_pool.req_to_token[
+                req.req_pool_idx, last_uncached_pos : seq_lens_cpu[idx]
+            ]
+            self.token_to_kv_pool_allocator.free(token_indices)
+            self.req_to_token_pool.free(req.req_pool_idx)
+
+        # release the last node
+        if self.is_hybrid:
+            self.tree_cache.dec_lock_ref(req.last_node, req.swa_uuid_for_lock)
+        else:
+            self.tree_cache.dec_lock_ref(req.last_node)
+
+        # NOTE(lsyin): we should use the newly evictable memory instantly.
+        num_tokens = remaing_req_count * global_config.retract_decode_steps
+        self._evict_tree_cache_if_needed(num_tokens)
+
+        req.reset_for_retract()
 
     def prepare_encoder_info_decode(self):
         # Reset the encoder cached status
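Note: in the radix-cache branch of `release_req`, `last_uncached_pos` floors the matched prefix length to a page boundary, so pages that still hold cached prefix tokens stay resident and only the strictly uncached tail is freed. Illustrative arithmetic:

    # last_uncached_pos arithmetic from release_req (illustrative numbers).
    page_size = 16
    prefix_len = 37  # tokens of this request found in the radix cache
    seq_len = 90     # tokens currently held for the request

    last_uncached_pos = (prefix_len // page_size) * page_size  # 32, floored to a page
    print(last_uncached_pos, seq_len - last_uncached_pos)      # slots [32, 90) freed: 58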
@@ -1543,6 +1613,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         self.forward_mode = ForwardMode.IDLE
         self.input_ids = torch.empty(0, dtype=torch.int64, device=self.device)
         self.seq_lens = torch.empty(0, dtype=torch.int64, device=self.device)
+        self.seq_lens_cpu = torch.empty(0, dtype=torch.int64)
         self.orig_seq_lens = torch.empty(0, dtype=torch.int32, device=self.device)
         self.out_cache_loc = torch.empty(0, dtype=torch.int64, device=self.device)
         self.req_pool_indices = torch.empty(0, dtype=torch.int32, device=self.device)
@@ -1557,7 +1628,11 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         self.forward_mode = ForwardMode.DECODE
         bs = len(self.reqs)
 
-        if
+        if (
+            self.spec_algorithm.is_eagle()
+            or self.spec_algorithm.is_standalone()
+            or self.spec_algorithm.is_ngram()
+        ):
             # if spec decoding is used, the decode batch is prepared inside
             # `forward_batch_speculative_generation` after running draft models.
             return
@@ -1598,10 +1673,12 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         if self.enable_overlap:
             # Do not use in-place operations in the overlap mode
             self.seq_lens = self.seq_lens + 1
+            self.seq_lens_cpu = self.seq_lens_cpu + 1
             self.orig_seq_lens = self.orig_seq_lens + 1
         else:
             # A faster in-place version
             self.seq_lens.add_(1)
+            self.seq_lens_cpu.add_(1)
             self.orig_seq_lens.add_(1)
         self.seq_lens_sum += bs
 
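Note: the `seq_lens_cpu` additions threaded through these hunks all follow one discipline: every mutation of the device tensor `seq_lens` is mirrored on a host copy, so readers (e.g. `get_model_worker_batch` below, which previously called `self.seq_lens.cpu()`) never block on a device-to-host sync. A small sketch of the pattern, assuming CUDA-style async semantics:

    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"
    seq_lens_cpu = torch.tensor([7, 12, 3], dtype=torch.int64)
    seq_lens = seq_lens_cpu.to(device, non_blocking=True)

    # Decode step: advance both copies (in-place variant from the non-overlap path).
    seq_lens.add_(1)
    seq_lens_cpu.add_(1)

    # Filtering: index both copies with the same kept indices.
    keep = torch.tensor([0, 2])
    seq_lens = seq_lens[keep.to(device)]
    seq_lens_cpu = seq_lens_cpu[keep]

    # Length reads stay on the host; no .cpu() round-trip needed.
    print(seq_lens_cpu.tolist())  # [8, 4]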
@@ -1620,7 +1697,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
             self.req_pool_indices, self.seq_lens - 2
         ]
         self.out_cache_loc = self.alloc_paged_token_slots_decode(
-            self.seq_lens, last_loc
+            self.seq_lens, self.seq_lens_cpu, last_loc
         )
 
         self.req_to_token_pool.write(
@@ -1666,6 +1743,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         self.multimodal_inputs = [self.multimodal_inputs[i] for i in keep_indices]
         self.req_pool_indices = self.req_pool_indices[keep_indices_device]
         self.seq_lens = self.seq_lens[keep_indices_device]
+        self.seq_lens_cpu = self.seq_lens_cpu[keep_indices]
         self.orig_seq_lens = self.orig_seq_lens[keep_indices_device]
         self.out_cache_loc = None
         self.seq_lens_sum = self.seq_lens.sum().item()
@@ -1683,7 +1761,14 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
 
         self.sampling_info.filter_batch(keep_indices, keep_indices_device)
         if self.spec_info:
-
+            if chunked_req_to_exclude is not None and len(chunked_req_to_exclude) > 0:
+                has_been_filtered = False
+            else:
+                has_been_filtered = True
+            self.spec_info.filter_batch(
+                new_indices=keep_indices_device,
+                has_been_filtered=has_been_filtered,
+            )
 
     def merge_batch(self, other: "ScheduleBatch"):
         # Penalizer orchestrator must be merged before Batch.reqs is merged. This is because
@@ -1699,6 +1784,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
             [self.req_pool_indices, other.req_pool_indices]
         )
         self.seq_lens = torch.cat([self.seq_lens, other.seq_lens])
+        self.seq_lens_cpu = torch.cat([self.seq_lens_cpu, other.seq_lens_cpu])
         self.orig_seq_lens = torch.cat([self.orig_seq_lens, other.orig_seq_lens])
         self.out_cache_loc = None
         self.seq_lens_sum += other.seq_lens_sum
@@ -1742,15 +1828,10 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         self.sampling_info.grammars = None
 
         seq_lens_cpu = (
-            seq_lens_cpu_cache
-            if seq_lens_cpu_cache is not None
-            else self.seq_lens.cpu()
+            seq_lens_cpu_cache if seq_lens_cpu_cache is not None else self.seq_lens_cpu
         )
 
-        global bid
-        bid += 1
         return ModelWorkerBatch(
-            bid=bid,
             forward_mode=self.forward_mode,
             input_ids=self.input_ids,
             req_pool_indices=self.req_pool_indices,
@@ -1870,8 +1951,6 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
 
 @dataclasses.dataclass
 class ModelWorkerBatch:
-    # The batch id
-    bid: int
     # The forward mode
     forward_mode: ForwardMode
     # The input ids
@@ -1932,7 +2011,9 @@ class ModelWorkerBatch:
 
     # Speculative decoding
     spec_algorithm: SpeculativeAlgorithm = None
-
+
+    spec_info: Optional[SpecInput] = None
+
     # If set, the output of the batch contains the hidden states of the run.
     capture_hidden_mode: CaptureHiddenMode = None
     hicache_consumer_index: int = -1