sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. sglang/bench_one_batch.py +54 -37
  2. sglang/bench_one_batch_server.py +340 -34
  3. sglang/bench_serving.py +340 -159
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/backend/runtime_endpoint.py +1 -1
  9. sglang/lang/interpreter.py +1 -0
  10. sglang/lang/ir.py +13 -0
  11. sglang/launch_server.py +9 -2
  12. sglang/profiler.py +20 -3
  13. sglang/srt/_custom_ops.py +1 -1
  14. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  15. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
  16. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  17. sglang/srt/compilation/backend.py +437 -0
  18. sglang/srt/compilation/compilation_config.py +20 -0
  19. sglang/srt/compilation/compilation_counter.py +47 -0
  20. sglang/srt/compilation/compile.py +210 -0
  21. sglang/srt/compilation/compiler_interface.py +503 -0
  22. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  23. sglang/srt/compilation/fix_functionalization.py +134 -0
  24. sglang/srt/compilation/fx_utils.py +83 -0
  25. sglang/srt/compilation/inductor_pass.py +140 -0
  26. sglang/srt/compilation/pass_manager.py +66 -0
  27. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  28. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  29. sglang/srt/configs/__init__.py +8 -0
  30. sglang/srt/configs/deepseek_ocr.py +262 -0
  31. sglang/srt/configs/deepseekvl2.py +194 -96
  32. sglang/srt/configs/dots_ocr.py +64 -0
  33. sglang/srt/configs/dots_vlm.py +2 -7
  34. sglang/srt/configs/falcon_h1.py +309 -0
  35. sglang/srt/configs/load_config.py +33 -2
  36. sglang/srt/configs/mamba_utils.py +117 -0
  37. sglang/srt/configs/model_config.py +284 -118
  38. sglang/srt/configs/modelopt_config.py +30 -0
  39. sglang/srt/configs/nemotron_h.py +286 -0
  40. sglang/srt/configs/olmo3.py +105 -0
  41. sglang/srt/configs/points_v15_chat.py +29 -0
  42. sglang/srt/configs/qwen3_next.py +11 -47
  43. sglang/srt/configs/qwen3_omni.py +613 -0
  44. sglang/srt/configs/qwen3_vl.py +576 -0
  45. sglang/srt/connector/remote_instance.py +1 -1
  46. sglang/srt/constrained/base_grammar_backend.py +6 -1
  47. sglang/srt/constrained/llguidance_backend.py +5 -0
  48. sglang/srt/constrained/outlines_backend.py +1 -1
  49. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  50. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  51. sglang/srt/constrained/utils.py +12 -0
  52. sglang/srt/constrained/xgrammar_backend.py +26 -15
  53. sglang/srt/debug_utils/dumper.py +10 -3
  54. sglang/srt/disaggregation/ascend/conn.py +2 -2
  55. sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
  56. sglang/srt/disaggregation/base/conn.py +17 -4
  57. sglang/srt/disaggregation/common/conn.py +268 -98
  58. sglang/srt/disaggregation/decode.py +172 -39
  59. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  60. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  61. sglang/srt/disaggregation/fake/conn.py +11 -3
  62. sglang/srt/disaggregation/mooncake/conn.py +203 -555
  63. sglang/srt/disaggregation/nixl/conn.py +217 -63
  64. sglang/srt/disaggregation/prefill.py +113 -270
  65. sglang/srt/disaggregation/utils.py +36 -5
  66. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  67. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  68. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  69. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  70. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  71. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  72. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  73. sglang/srt/distributed/naive_distributed.py +5 -4
  74. sglang/srt/distributed/parallel_state.py +203 -97
  75. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  76. sglang/srt/entrypoints/context.py +3 -2
  77. sglang/srt/entrypoints/engine.py +85 -65
  78. sglang/srt/entrypoints/grpc_server.py +632 -305
  79. sglang/srt/entrypoints/harmony_utils.py +2 -2
  80. sglang/srt/entrypoints/http_server.py +169 -17
  81. sglang/srt/entrypoints/http_server_engine.py +1 -7
  82. sglang/srt/entrypoints/openai/protocol.py +327 -34
  83. sglang/srt/entrypoints/openai/serving_base.py +74 -8
  84. sglang/srt/entrypoints/openai/serving_chat.py +202 -118
  85. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  86. sglang/srt/entrypoints/openai/serving_completions.py +20 -4
  87. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  88. sglang/srt/entrypoints/openai/serving_responses.py +47 -2
  89. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  90. sglang/srt/environ.py +323 -0
  91. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  92. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  93. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  94. sglang/srt/eplb/expert_distribution.py +3 -4
  95. sglang/srt/eplb/expert_location.py +30 -5
  96. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  97. sglang/srt/eplb/expert_location_updater.py +2 -2
  98. sglang/srt/function_call/base_format_detector.py +17 -18
  99. sglang/srt/function_call/function_call_parser.py +21 -16
  100. sglang/srt/function_call/glm4_moe_detector.py +4 -8
  101. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  102. sglang/srt/function_call/json_array_parser.py +61 -0
  103. sglang/srt/function_call/kimik2_detector.py +17 -4
  104. sglang/srt/function_call/utils.py +98 -7
  105. sglang/srt/grpc/compile_proto.py +245 -0
  106. sglang/srt/grpc/grpc_request_manager.py +915 -0
  107. sglang/srt/grpc/health_servicer.py +189 -0
  108. sglang/srt/grpc/scheduler_launcher.py +181 -0
  109. sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
  110. sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
  111. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
  112. sglang/srt/layers/activation.py +11 -7
  113. sglang/srt/layers/attention/aiter_backend.py +17 -18
  114. sglang/srt/layers/attention/ascend_backend.py +125 -10
  115. sglang/srt/layers/attention/attention_registry.py +226 -0
  116. sglang/srt/layers/attention/base_attn_backend.py +32 -4
  117. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  118. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  119. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  120. sglang/srt/layers/attention/fla/chunk.py +0 -1
  121. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  122. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  123. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  124. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  125. sglang/srt/layers/attention/fla/index.py +0 -2
  126. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  127. sglang/srt/layers/attention/fla/utils.py +0 -3
  128. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  129. sglang/srt/layers/attention/flashattention_backend.py +52 -15
  130. sglang/srt/layers/attention/flashinfer_backend.py +357 -212
  131. sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
  132. sglang/srt/layers/attention/flashmla_backend.py +9 -7
  133. sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
  134. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
  135. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  136. sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
  137. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
  138. sglang/srt/layers/attention/mamba/mamba.py +514 -1
  139. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  140. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  141. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  142. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  143. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  144. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  145. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  146. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  147. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
  148. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  149. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  150. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  151. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  152. sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
  153. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  154. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  155. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  156. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  157. sglang/srt/layers/attention/nsa/utils.py +23 -0
  158. sglang/srt/layers/attention/nsa_backend.py +1201 -0
  159. sglang/srt/layers/attention/tbo_backend.py +6 -6
  160. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  161. sglang/srt/layers/attention/triton_backend.py +249 -42
  162. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  163. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  164. sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
  165. sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
  166. sglang/srt/layers/attention/utils.py +11 -7
  167. sglang/srt/layers/attention/vision.py +61 -3
  168. sglang/srt/layers/attention/wave_backend.py +4 -4
  169. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  170. sglang/srt/layers/communicator.py +19 -7
  171. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  172. sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
  173. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  174. sglang/srt/layers/dp_attention.py +28 -1
  175. sglang/srt/layers/elementwise.py +3 -1
  176. sglang/srt/layers/layernorm.py +47 -15
  177. sglang/srt/layers/linear.py +30 -5
  178. sglang/srt/layers/logits_processor.py +161 -18
  179. sglang/srt/layers/modelopt_utils.py +11 -0
  180. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  181. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  182. sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
  183. sglang/srt/layers/moe/ep_moe/layer.py +243 -448
  184. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  185. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  186. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  187. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  188. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  189. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  190. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
  191. sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
  192. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  193. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  194. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  195. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  196. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  197. sglang/srt/layers/moe/router.py +51 -15
  198. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  199. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  200. sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
  201. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  202. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  203. sglang/srt/layers/moe/topk.py +3 -2
  204. sglang/srt/layers/moe/utils.py +27 -1
  205. sglang/srt/layers/parameter.py +23 -6
  206. sglang/srt/layers/quantization/__init__.py +2 -53
  207. sglang/srt/layers/quantization/awq.py +183 -6
  208. sglang/srt/layers/quantization/awq_triton.py +29 -0
  209. sglang/srt/layers/quantization/base_config.py +20 -1
  210. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  211. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
  212. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  213. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
  214. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  215. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  216. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  217. sglang/srt/layers/quantization/fp8.py +86 -20
  218. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  219. sglang/srt/layers/quantization/fp8_utils.py +43 -15
  220. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  221. sglang/srt/layers/quantization/gptq.py +0 -1
  222. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  223. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  224. sglang/srt/layers/quantization/modelopt_quant.py +141 -81
  225. sglang/srt/layers/quantization/mxfp4.py +17 -34
  226. sglang/srt/layers/quantization/petit.py +1 -1
  227. sglang/srt/layers/quantization/quark/quark.py +3 -1
  228. sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
  229. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  230. sglang/srt/layers/quantization/unquant.py +1 -4
  231. sglang/srt/layers/quantization/utils.py +0 -1
  232. sglang/srt/layers/quantization/w4afp8.py +51 -24
  233. sglang/srt/layers/quantization/w8a8_int8.py +45 -27
  234. sglang/srt/layers/radix_attention.py +59 -9
  235. sglang/srt/layers/rotary_embedding.py +750 -46
  236. sglang/srt/layers/sampler.py +84 -16
  237. sglang/srt/layers/sparse_pooler.py +98 -0
  238. sglang/srt/layers/utils.py +23 -1
  239. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  240. sglang/srt/lora/backend/base_backend.py +3 -3
  241. sglang/srt/lora/backend/chunked_backend.py +348 -0
  242. sglang/srt/lora/backend/triton_backend.py +9 -4
  243. sglang/srt/lora/eviction_policy.py +139 -0
  244. sglang/srt/lora/lora.py +7 -5
  245. sglang/srt/lora/lora_manager.py +33 -7
  246. sglang/srt/lora/lora_registry.py +1 -1
  247. sglang/srt/lora/mem_pool.py +41 -17
  248. sglang/srt/lora/triton_ops/__init__.py +4 -0
  249. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  250. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
  251. sglang/srt/lora/utils.py +7 -5
  252. sglang/srt/managers/cache_controller.py +83 -152
  253. sglang/srt/managers/data_parallel_controller.py +156 -87
  254. sglang/srt/managers/detokenizer_manager.py +51 -24
  255. sglang/srt/managers/io_struct.py +223 -129
  256. sglang/srt/managers/mm_utils.py +49 -10
  257. sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
  258. sglang/srt/managers/multimodal_processor.py +1 -2
  259. sglang/srt/managers/overlap_utils.py +130 -0
  260. sglang/srt/managers/schedule_batch.py +340 -529
  261. sglang/srt/managers/schedule_policy.py +158 -18
  262. sglang/srt/managers/scheduler.py +665 -620
  263. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  264. sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
  265. sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
  266. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  267. sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
  268. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  269. sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
  270. sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
  271. sglang/srt/managers/tokenizer_manager.py +462 -226
  272. sglang/srt/managers/tp_worker.py +217 -156
  273. sglang/srt/managers/utils.py +79 -47
  274. sglang/srt/mem_cache/allocator.py +21 -22
  275. sglang/srt/mem_cache/allocator_ascend.py +42 -28
  276. sglang/srt/mem_cache/base_prefix_cache.py +3 -3
  277. sglang/srt/mem_cache/chunk_cache.py +20 -2
  278. sglang/srt/mem_cache/common.py +480 -0
  279. sglang/srt/mem_cache/evict_policy.py +38 -0
  280. sglang/srt/mem_cache/hicache_storage.py +44 -2
  281. sglang/srt/mem_cache/hiradix_cache.py +134 -34
  282. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  283. sglang/srt/mem_cache/memory_pool.py +602 -208
  284. sglang/srt/mem_cache/memory_pool_host.py +134 -183
  285. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  286. sglang/srt/mem_cache/radix_cache.py +263 -78
  287. sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
  288. sglang/srt/mem_cache/storage/__init__.py +10 -0
  289. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
  290. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
  291. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  292. sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
  293. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  294. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  295. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
  296. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
  297. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
  298. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  299. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  300. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  301. sglang/srt/mem_cache/swa_radix_cache.py +115 -58
  302. sglang/srt/metrics/collector.py +113 -120
  303. sglang/srt/metrics/func_timer.py +3 -8
  304. sglang/srt/metrics/utils.py +8 -1
  305. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  306. sglang/srt/model_executor/cuda_graph_runner.py +81 -36
  307. sglang/srt/model_executor/forward_batch_info.py +40 -50
  308. sglang/srt/model_executor/model_runner.py +507 -319
  309. sglang/srt/model_executor/npu_graph_runner.py +11 -5
  310. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  311. sglang/srt/model_loader/__init__.py +1 -1
  312. sglang/srt/model_loader/loader.py +438 -37
  313. sglang/srt/model_loader/utils.py +0 -1
  314. sglang/srt/model_loader/weight_utils.py +200 -27
  315. sglang/srt/models/apertus.py +2 -3
  316. sglang/srt/models/arcee.py +2 -2
  317. sglang/srt/models/bailing_moe.py +40 -56
  318. sglang/srt/models/bailing_moe_nextn.py +3 -4
  319. sglang/srt/models/bert.py +1 -1
  320. sglang/srt/models/deepseek_nextn.py +25 -4
  321. sglang/srt/models/deepseek_ocr.py +1516 -0
  322. sglang/srt/models/deepseek_v2.py +793 -235
  323. sglang/srt/models/dots_ocr.py +171 -0
  324. sglang/srt/models/dots_vlm.py +0 -1
  325. sglang/srt/models/dots_vlm_vit.py +1 -1
  326. sglang/srt/models/falcon_h1.py +570 -0
  327. sglang/srt/models/gemma3_causal.py +0 -2
  328. sglang/srt/models/gemma3_mm.py +17 -1
  329. sglang/srt/models/gemma3n_mm.py +2 -3
  330. sglang/srt/models/glm4_moe.py +17 -40
  331. sglang/srt/models/glm4_moe_nextn.py +4 -4
  332. sglang/srt/models/glm4v.py +3 -2
  333. sglang/srt/models/glm4v_moe.py +6 -6
  334. sglang/srt/models/gpt_oss.py +12 -35
  335. sglang/srt/models/grok.py +10 -23
  336. sglang/srt/models/hunyuan.py +2 -7
  337. sglang/srt/models/interns1.py +0 -1
  338. sglang/srt/models/kimi_vl.py +1 -7
  339. sglang/srt/models/kimi_vl_moonvit.py +4 -2
  340. sglang/srt/models/llama.py +6 -2
  341. sglang/srt/models/llama_eagle3.py +1 -1
  342. sglang/srt/models/longcat_flash.py +6 -23
  343. sglang/srt/models/longcat_flash_nextn.py +4 -15
  344. sglang/srt/models/mimo.py +2 -13
  345. sglang/srt/models/mimo_mtp.py +1 -2
  346. sglang/srt/models/minicpmo.py +7 -5
  347. sglang/srt/models/mixtral.py +1 -4
  348. sglang/srt/models/mllama.py +1 -1
  349. sglang/srt/models/mllama4.py +27 -6
  350. sglang/srt/models/nemotron_h.py +511 -0
  351. sglang/srt/models/olmo2.py +31 -4
  352. sglang/srt/models/opt.py +5 -5
  353. sglang/srt/models/phi.py +1 -1
  354. sglang/srt/models/phi4mm.py +1 -1
  355. sglang/srt/models/phimoe.py +0 -1
  356. sglang/srt/models/pixtral.py +0 -3
  357. sglang/srt/models/points_v15_chat.py +186 -0
  358. sglang/srt/models/qwen.py +0 -1
  359. sglang/srt/models/qwen2.py +0 -7
  360. sglang/srt/models/qwen2_5_vl.py +5 -5
  361. sglang/srt/models/qwen2_audio.py +2 -15
  362. sglang/srt/models/qwen2_moe.py +70 -4
  363. sglang/srt/models/qwen2_vl.py +6 -3
  364. sglang/srt/models/qwen3.py +18 -3
  365. sglang/srt/models/qwen3_moe.py +50 -38
  366. sglang/srt/models/qwen3_next.py +43 -21
  367. sglang/srt/models/qwen3_next_mtp.py +3 -4
  368. sglang/srt/models/qwen3_omni_moe.py +661 -0
  369. sglang/srt/models/qwen3_vl.py +791 -0
  370. sglang/srt/models/qwen3_vl_moe.py +343 -0
  371. sglang/srt/models/registry.py +15 -3
  372. sglang/srt/models/roberta.py +55 -3
  373. sglang/srt/models/sarashina2_vision.py +268 -0
  374. sglang/srt/models/solar.py +505 -0
  375. sglang/srt/models/starcoder2.py +357 -0
  376. sglang/srt/models/step3_vl.py +3 -5
  377. sglang/srt/models/torch_native_llama.py +9 -2
  378. sglang/srt/models/utils.py +61 -0
  379. sglang/srt/multimodal/processors/base_processor.py +21 -9
  380. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  381. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  382. sglang/srt/multimodal/processors/dots_vlm.py +2 -4
  383. sglang/srt/multimodal/processors/glm4v.py +1 -5
  384. sglang/srt/multimodal/processors/internvl.py +20 -10
  385. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  386. sglang/srt/multimodal/processors/mllama4.py +0 -8
  387. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  388. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  389. sglang/srt/multimodal/processors/qwen_vl.py +83 -17
  390. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  391. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  392. sglang/srt/parser/conversation.py +41 -0
  393. sglang/srt/parser/jinja_template_utils.py +6 -0
  394. sglang/srt/parser/reasoning_parser.py +0 -1
  395. sglang/srt/sampling/custom_logit_processor.py +77 -2
  396. sglang/srt/sampling/sampling_batch_info.py +36 -23
  397. sglang/srt/sampling/sampling_params.py +75 -0
  398. sglang/srt/server_args.py +1300 -338
  399. sglang/srt/server_args_config_parser.py +146 -0
  400. sglang/srt/single_batch_overlap.py +161 -0
  401. sglang/srt/speculative/base_spec_worker.py +34 -0
  402. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  403. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  404. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  405. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  406. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  407. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  408. sglang/srt/speculative/draft_utils.py +226 -0
  409. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
  410. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
  411. sglang/srt/speculative/eagle_info.py +786 -0
  412. sglang/srt/speculative/eagle_info_v2.py +458 -0
  413. sglang/srt/speculative/eagle_utils.py +113 -1270
  414. sglang/srt/speculative/eagle_worker.py +120 -285
  415. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  416. sglang/srt/speculative/ngram_info.py +433 -0
  417. sglang/srt/speculative/ngram_worker.py +246 -0
  418. sglang/srt/speculative/spec_info.py +49 -0
  419. sglang/srt/speculative/spec_utils.py +641 -0
  420. sglang/srt/speculative/standalone_worker.py +4 -14
  421. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  422. sglang/srt/tracing/trace.py +32 -6
  423. sglang/srt/two_batch_overlap.py +35 -18
  424. sglang/srt/utils/__init__.py +2 -0
  425. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  426. sglang/srt/{utils.py → utils/common.py} +583 -113
  427. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
  428. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  429. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  430. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  431. sglang/srt/utils/profile_merger.py +199 -0
  432. sglang/srt/utils/rpd_utils.py +452 -0
  433. sglang/srt/utils/slow_rank_detector.py +71 -0
  434. sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
  435. sglang/srt/warmup.py +8 -4
  436. sglang/srt/weight_sync/utils.py +1 -1
  437. sglang/test/attention/test_flashattn_backend.py +1 -1
  438. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  439. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  440. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  441. sglang/test/few_shot_gsm8k_engine.py +2 -4
  442. sglang/test/get_logits_ut.py +57 -0
  443. sglang/test/kit_matched_stop.py +157 -0
  444. sglang/test/longbench_v2/__init__.py +1 -0
  445. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  446. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  447. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  448. sglang/test/run_eval.py +120 -11
  449. sglang/test/runners.py +3 -1
  450. sglang/test/send_one.py +42 -7
  451. sglang/test/simple_eval_common.py +8 -2
  452. sglang/test/simple_eval_gpqa.py +0 -1
  453. sglang/test/simple_eval_humaneval.py +0 -3
  454. sglang/test/simple_eval_longbench_v2.py +344 -0
  455. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  456. sglang/test/test_block_fp8.py +3 -4
  457. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  458. sglang/test/test_cutlass_moe.py +1 -2
  459. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  460. sglang/test/test_deterministic.py +430 -0
  461. sglang/test/test_deterministic_utils.py +73 -0
  462. sglang/test/test_disaggregation_utils.py +93 -1
  463. sglang/test/test_marlin_moe.py +0 -1
  464. sglang/test/test_programs.py +1 -1
  465. sglang/test/test_utils.py +432 -16
  466. sglang/utils.py +10 -1
  467. sglang/version.py +1 -1
  468. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
  469. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
  470. sglang/srt/entrypoints/grpc_request_manager.py +0 -580
  471. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
  472. sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
  473. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  474. sglang/srt/speculative/build_eagle_tree.py +0 -427
  475. sglang/test/test_block_fp8_ep.py +0 -358
  476. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  477. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  478. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  479. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  480. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  481. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  482. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
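The largest new file in this release is sglang/srt/mem_cache/mamba_radix_cache.py (+993 lines), whose diff follows. Its LRUList class tracks eviction order with a doubly linked list anchored by dummy head and tail sentinel nodes rather than by scanning timestamps. The short standalone sketch below only illustrates that sentinel-based LRU pattern; the Node and LRU names and the touch/evict_lru methods are hypothetical and are not part of the sglang code shown in the diff.

# Illustrative sketch (hypothetical names, not sglang code): an LRU list built
# from a doubly linked list with dummy head/tail sentinels, the same pattern
# LRUList below uses via prev/next (or mamba_prev/mamba_next) attributes.
class Node:
    def __init__(self, key):
        self.key = key
        self.prev = None
        self.next = None

class LRU:
    def __init__(self):
        self.head = Node(None)  # most recently used side
        self.tail = Node(None)  # least recently used side
        self.head.next = self.tail
        self.tail.prev = self.head
        self.nodes = {}

    def _unlink(self, node):
        node.prev.next = node.next
        node.next.prev = node.prev

    def _link_after_head(self, node):
        node.prev = self.head
        node.next = self.head.next
        self.head.next.prev = node
        self.head.next = node

    def touch(self, key):
        # Move an entry to the MRU position, inserting it if it is new.
        node = self.nodes.get(key)
        if node is None:
            node = Node(key)
            self.nodes[key] = node
        else:
            self._unlink(node)
        self._link_after_head(node)

    def evict_lru(self):
        # Remove and return the least recently used key, or None if empty.
        node = self.tail.prev
        if node is self.head:
            return None
        self._unlink(node)
        del self.nodes[node.key]
        return node.key

lru = LRU()
for k in ("a", "b", "c"):
    lru.touch(k)
lru.touch("a")                 # "a" becomes most recently used
assert lru.evict_lru() == "b"  # "b" is now the least recently used

The sentinels avoid special cases when inserting at the MRU end or evicting from the LRU end, which appears to be the same reason LRUList keeps dummy head and tail TreeNodes in the code below.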
sglang/srt/mem_cache/mamba_radix_cache.py
@@ -0,0 +1,993 @@
1
+ from __future__ import annotations
2
+
3
+ """
4
+ Copyright 2023-2024 SGLang Team
5
+ Licensed under the Apache License, Version 2.0 (the "License");
6
+ you may not use this file except in compliance with the License.
7
+ You may obtain a copy of the License at
8
+
9
+ http://www.apache.org/licenses/LICENSE-2.0
10
+
11
+ Unless required by applicable law or agreed to in writing, software
12
+ distributed under the License is distributed on an "AS IS" BASIS,
13
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ See the License for the specific language governing permissions and
15
+ limitations under the License.
16
+ """
17
+
18
+ """
19
+ The radix tree data structure for managing the hybrid (full and Mamba) KV cache.
20
+ """
21
+
22
+ import heapq
23
+ import time
24
+ from collections import defaultdict
25
+ from typing import TYPE_CHECKING, List, Optional, Tuple
26
+
27
+ import torch
28
+
29
+ from sglang.srt.mem_cache.allocator import TokenToKVPoolAllocator
30
+ from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache, MatchResult
31
+ from sglang.srt.mem_cache.memory_pool import HybridReqToTokenPool
32
+ from sglang.srt.mem_cache.radix_cache import (
33
+ RadixKey,
34
+ _key_match_page_size1,
35
+ get_child_key,
36
+ )
37
+
38
+ if TYPE_CHECKING:
39
+ from sglang.srt.managers.schedule_batch import Req
40
+
41
+ import logging
42
+
43
+ logger = logging.getLogger(__name__)
44
+
45
+
46
+ class TreeNode:
47
+
48
+ counter = 0
49
+
50
+ def __init__(self, id: Optional[int] = None):
51
+ self.children = defaultdict(TreeNode)
52
+ self.parent: TreeNode = None
53
+ self.key: RadixKey = None
54
+ self.value: Optional[torch.Tensor] = None
55
+ self.mamba_value: Optional[torch.Tensor] = None
56
+ # invariant: for any node, if mamba_lock_ref is locked, full_lock_ref must be locked;
57
+ # if full_lock_ref is locked, mamba_lock_ref doesn't need to be locked. So,
58
+ # full_lock_ref is always >= mamba_lock_ref.
59
+ # for full_lock, once it is locked, its parent must be locked as well
60
+ # for mamba_lock, it only need lock node itself
61
+ self.full_lock_ref = 0
62
+ self.mamba_lock_ref = 0
63
+ # last access time is only used for sanity check. LRU is maintained by the lru list.
64
+ self.last_access_time = time.monotonic()
65
+
66
+ self.hit_count = 0
67
+ # store the host indices of KV cache
68
+ self.host_value = None
69
+
70
+ # for lru list, invariant:
71
+ # 1. prev has greater last_access_time
72
+ # 2. next has smaller last_access_time
73
+ self.prev = None
74
+ self.next = None
75
+ self.mamba_prev = None
76
+ self.mamba_next = None
77
+
78
+ self.id = TreeNode.counter if id is None else id
79
+ TreeNode.counter += 1
80
+
81
+ @property
82
+ def evicted(self):
83
+ return self.value is None
84
+
85
+ @property
86
+ def backuped(self):
87
+ return self.host_value is not None
88
+
89
+ def __lt__(self, other: "TreeNode"):
90
+ return self.last_access_time < other.last_access_time
91
+
92
+
93
+ class LRUList:
94
+ def __init__(self, mamba: bool = False):
95
+ self.mamba = mamba
96
+ if self.mamba:
97
+ self.prv = "mamba_prev"
98
+ self.nxt = "mamba_next"
99
+ self.lock_ref = "mamba_lock_ref"
100
+ else:
101
+ self.prv = "prev"
102
+ self.nxt = "next"
103
+ self.lock_ref = "full_lock_ref"
104
+ # Initialize dummy head and tail nodes
105
+ self.head = TreeNode() # Most recently used side
106
+ self.tail = TreeNode() # Least recently used side
107
+ setattr(self.head, self.nxt, self.tail) # self.head.next = self.tail
108
+ setattr(self.tail, self.prv, self.head) # self.tail.prev = self.head
109
+ self.cache = {}
110
+
111
+ def _add_node(self, node):
112
+ """Helper to add node right after head (most recently used)"""
113
+ self._add_node_after(self.head, node)
114
+
115
+ def _add_node_after(self, old_node, new_node):
116
+ """Helper to add node right after old_node"""
117
+ setattr(new_node, self.prv, old_node) # new_node.prev = old_node
118
+ setattr(
119
+ new_node, self.nxt, getattr(old_node, self.nxt)
120
+ ) # new_node.next = old_node.next
121
+ setattr(
122
+ getattr(old_node, self.nxt), self.prv, new_node
123
+ ) # old_node.next.prev = new_node
124
+ setattr(old_node, self.nxt, new_node) # old_node.next = new_node
125
+
126
+ def _remove_node(self, node):
127
+ """Helper to remove node from linked list"""
128
+ setattr(
129
+ getattr(node, self.prv), self.nxt, getattr(node, self.nxt)
130
+ ) # node.prev.next = node.next
131
+ setattr(
132
+ getattr(node, self.nxt), self.prv, getattr(node, self.prv)
133
+ ) # node.next.prev = node.prev
134
+
135
+ def _get_lru(self) -> Optional[TreeNode]:
136
+ """
137
+ Get the least recently used node
138
+ """
139
+ if len(self.cache) == 0:
140
+ return None
141
+ return getattr(self.tail, self.prv)
142
+
143
+ def reset_node_mru(self, node):
144
+ """
145
+ Move a (existing) node to most recently used position
146
+ """
147
+ assert node.id in self.cache, f"Resetting node {node.id=} not in lru list"
148
+ assert (
149
+ not self.mamba or node.mamba_value is not None
150
+ ), f"Resetting mamba tombstone node in mamba lru list: {node.id=}"
151
+ self._remove_node(node)
152
+ self._add_node(node)
153
+
154
+ def reset_node_and_parents_mru(self, node, root_node):
155
+ """
156
+ Move an (existing) node and its parents to most recently used position. Child node is
157
+ more recently used than parent node.
158
+ """
159
+ prev_node = self.head
160
+ while node != root_node:
161
+ if not self.mamba or node.mamba_value is not None:
162
+ assert (
163
+ node.id in self.cache
164
+ ), f"Resetting node {node.id=} not in lru list when resetting node and parents mru"
165
+ self._remove_node(node)
166
+ self._add_node_after(prev_node, node)
167
+ prev_node = node
168
+ node = node.parent
169
+
170
+ def insert_mru(self, node):
171
+ """
172
+ Insert a (new) node as most recently used
173
+ """
174
+ assert (
175
+ not self.mamba or node.mamba_value is not None
176
+ ), f"Inserting mamba tombstone node in mamba lru list: {node.id=}"
177
+ assert (
178
+ node.id not in self.cache
179
+ ), f"Inserting node {node.id=} already in lru list, existing node: {self.cache[node.id].id=}"
180
+ self.cache[node.id] = node
181
+ self._add_node(node)
182
+
183
+ def remove_node(self, node: TreeNode):
184
+ """
185
+ Remove node from lru list
186
+ """
187
+ assert node.id in self.cache, f"Removing node {node.id=} not in lru list"
188
+ assert (
189
+ not self.mamba or node.mamba_value is not None
190
+ ), f"Removing mamba tombstone node from mamba lru list: {node.id=}"
191
+ del self.cache[node.id]
192
+ self._remove_node(node)
193
+
194
+ def get_lru_no_lock(self) -> Optional[TreeNode]:
195
+ """
196
+ Get the least recently used node that is not locked
197
+ """
198
+ return self.get_prev_no_lock(self.tail, check_id=False)
199
+
200
+ def get_leaf_lru_no_lock(self) -> Optional[TreeNode]:
201
+ """
202
+ Get the least recently used leaf node that is not locked
203
+ """
204
+ return self.get_prev_leaf_no_lock(self.tail, check_id=False)
205
+
206
+ def get_prev_no_lock(
207
+ self, node: TreeNode, check_id: bool = True
208
+ ) -> Optional[TreeNode]:
209
+ """
210
+ Get the previous (i.e. more recently used) node that is not locked
211
+ """
212
+ if check_id:
213
+ assert (
214
+ node.id in self.cache
215
+ ), f"Getting prev of node {node.id=} not in lru list"
216
+ x = getattr(node, self.prv) # x = node.prev
217
+ while getattr(x, self.lock_ref) > 0:
218
+ x = getattr(x, self.prv) # x = x.prev
219
+ # if x is the head, it means there is no node in the lru list without lock
220
+ if x == self.head:
221
+ return None
222
+ return x
223
+
224
+ def get_prev_leaf_no_lock(self, node: TreeNode, check_id: bool = True):
225
+ """
226
+ Get the previous (i.e. more recently used) leaf node that is not locked
227
+ """
228
+ if check_id:
229
+ assert (
230
+ node.id in self.cache
231
+ ), f"Getting prev of node {node.id=} not in lru list"
232
+ x = getattr(node, self.prv) # x = node.prev
233
+ while getattr(x, self.lock_ref) > 0 or len(x.children) > 0:
234
+ x = getattr(x, self.prv) # x = x.prev
235
+ # if x is the head, it means there is no leaf node in the lru list without lock
236
+ if x == self.head:
237
+ return None
238
+ return x
239
+
240
+ def in_list(self, node: Optional[TreeNode]):
241
+ """
242
+ Check if the node is in the lru list
243
+ """
244
+ if not node:
245
+ return False
246
+ return node.id in self.cache
247
+
248
+ # Note: this is expensive, only use for debug
249
+ def sanity_check_evictable_size(self):
250
+ """
251
+ Check the evictable size (i.e. the size of the nodes that are not locked)
252
+ """
253
+ node = self.get_lru_no_lock()
254
+ evictable_size = 0
255
+ while self.in_list(node):
256
+ evictable_size += (
257
+ len(node.value) if not self.mamba else len(node.mamba_value)
258
+ )
259
+ node = self.get_prev_no_lock(node)
260
+ return evictable_size
261
+
262
+ # Note: this is expensive, only use for debug or idle check
263
+ def sanity_check(self, tree_cache: "MambaRadixCache"):
264
+ """
265
+ Check if the lru list is valid by rebuilding the lru list from the tree, heapifying it, and
266
+ checking if the lru list is valid.
267
+ """
268
+ try:
269
+ if self.mamba:
270
+ nodes = tree_cache._collect_nontombstone_nodes()
271
+ else:
272
+ nodes = tree_cache._collect_all_nodes()
273
+ total_nodes = len(nodes)
274
+ total_lru = len(self.cache)
275
+ # heapify based on last_access_time
276
+ heapq.heapify(nodes)
277
+ # the root node is not in the lru list
278
+ assert len(nodes) == (
279
+ total_lru + (0 if self.mamba else 1)
280
+ ), f"len(nodes): {len(nodes)}, total_lru: {total_lru}"
281
+
282
+ x_lru = self._get_lru()
283
+ while len(nodes):
284
+ x = heapq.heappop(nodes)
285
+ if x == tree_cache.root_node:
286
+ # root node is not in the lru list
287
+ continue
288
+ assert (
289
+ x == x_lru
290
+ ), f"Incorrect LRU list, {self.mamba=}, x: {x.id=} != x_lru: {x_lru.id=}"
291
+ assert (
292
+ x_lru.full_lock_ref == 0
293
+ ), f"x_lru should not be locked when idle, {x_lru.full_lock_ref=}, {x_lru.id=}"
294
+ assert (
295
+ x_lru.mamba_lock_ref == 0
296
+ ), f"x_lru should not be locked when idle, {x_lru.mamba_lock_ref=}, {x_lru.id=}"
297
+ x_lru = getattr(x, self.prv)
298
+
299
+ if self.mamba:
300
+ evictable_size = tree_cache.mamba_evictable_size()
301
+ lru_list_evictable_size = tree_cache.mamba_lru_list_evictable_size()
302
+ else:
303
+ evictable_size = tree_cache.full_evictable_size()
304
+ lru_list_evictable_size = tree_cache.full_lru_list_evictable_size()
305
+
306
+ assert (
307
+ evictable_size == lru_list_evictable_size
308
+ ), f"{self.mamba=}, total nodes: {total_nodes}, total lru: {total_lru}, evictable size: {evictable_size} != lru list evictable size: {lru_list_evictable_size}"
309
+ except Exception as e:
310
+ msg = f"Mamba Radix tree sanity check failed, ping @yizhang2077: {e}"
311
+ logger.error(msg)
312
+ raise Exception(msg)
313
+
314
+
315
+ class MambaRadixCache(BasePrefixCache):
316
+ def __init__(
317
+ self,
318
+ req_to_token_pool: HybridReqToTokenPool,
319
+ token_to_kv_pool_allocator: TokenToKVPoolAllocator,
320
+ page_size: int,
321
+ disable: bool = False,
322
+ ):
323
+ assert isinstance(token_to_kv_pool_allocator, TokenToKVPoolAllocator)
324
+ self.req_to_token_pool = req_to_token_pool
325
+ self.token_to_kv_pool_allocator = token_to_kv_pool_allocator
326
+
327
+ assert page_size == 1, "Only support page_size=1 in mamba radix cache now."
328
+ self.page_size = page_size
329
+ self.disable = disable
330
+
331
+ if self.token_to_kv_pool_allocator:
332
+ self.device = self.token_to_kv_pool_allocator.device
333
+ else:
334
+ self.device = torch.device("cpu")
335
+
336
+ self.key_match_fn = _key_match_page_size1
337
+ self.get_child_key_fn = get_child_key
338
+ self.reset()
339
+
340
+ ##### Public API #####
341
+
342
+ def reset(self) -> None:
343
+ self.root_node = TreeNode()
344
+ self.root_node.key = []
345
+ self.root_node.value = []
346
+ self.root_node.full_lock_ref = 1
347
+ self.root_node.mamba_lock_ref = 1
348
+ self.full_evictable_size_ = 0
349
+ self.mamba_evictable_size_ = 0
350
+ self.full_protected_size_ = 0
351
+ self.mamba_protected_size_ = 0
352
+ # LRU lists are used to maintain the order of eviction of the nodes in the tree
353
+ self.full_lru_list = LRUList(mamba=False)
354
+ self.mamba_lru_list = LRUList(mamba=True)
355
+
356
+ def match_prefix(self, key: RadixKey, **kwargs) -> MatchResult:
357
+ """Find the matching prefix from the radix tree.
358
+ Args:
359
+ key: A RadixKey contains token IDs to find a matching prefix.
360
+ Returns:
361
+ A tuple of a tensor of matching prefix token IDs and
362
+ the last node that contains the prefix values. Note that
363
+ this API can modify the internal state of the Radix tree.
364
+ The last node create a new child if the prefix is shorter
365
+ than the last node's value.
366
+ """
367
+ cow_mamba: bool = kwargs.get("cow_mamba", False)
368
+ req: Req = kwargs.get("req", None)
369
+
370
+ if self.disable or len(key) == 0:
371
+ return MatchResult(
372
+ device_indices=torch.empty(
373
+ (0,),
374
+ dtype=torch.int64,
375
+ device=self.device,
376
+ ),
377
+ last_device_node=self.root_node,
378
+ last_host_node=self.root_node,
379
+ )
380
+
381
+ value, last_node = self._match_prefix_helper(key)
382
+
383
+ # copy mamba state to req local space if cow is true
384
+ if cow_mamba and last_node.mamba_value is not None:
385
+ assert req.req_pool_idx is None # req_pool_idx is uninitialed
386
+
387
+ # for reqs without mamba cache
388
+ if req.mamba_pool_idx is None:
389
+ dst_index = self.req_to_token_pool.mamba_pool.alloc(1)
390
+ # try to alloc again, protect last_node from eviction
391
+ if dst_index is None:
392
+ self.inc_lock_ref(last_node)
393
+ self.evict_mamba(1)
394
+ dst_index = self.req_to_token_pool.mamba_pool.alloc(1)
395
+ self.dec_lock_ref(last_node)
396
+ assert dst_index is not None, "Can not alloc mamba cache"
397
+ src_index = last_node.mamba_value
398
+ self.req_to_token_pool.mamba_pool.copy_from(src_index, dst_index)
399
+ req.mamba_pool_idx = dst_index[0]
400
+ else:
401
+ src_index = last_node.mamba_value
402
+ dst_index = req.mamba_pool_idx.unsqueeze(0)
403
+ self.req_to_token_pool.mamba_pool.copy_from(src_index, dst_index)
404
+
405
+ if value:
406
+ value = torch.cat(value)
407
+ else:
408
+ value = torch.empty((0,), dtype=torch.int64, device=self.device)
409
+
410
+ return MatchResult(
411
+ device_indices=value,
412
+ last_device_node=last_node,
413
+ last_host_node=last_node,
414
+ )
415
+
416
+ def insert(self, key: RadixKey, value=None, mamba_value=None) -> Tuple[int, bool]:
417
+ if self.disable:
418
+ return 0
419
+
420
+ if value is None:
421
+ value = torch.tensor([x for x in key.token_ids], dtype=torch.int64)
422
+ return self._insert_helper(self.root_node, key, value, mamba_value)
423
+
424
+ def cache_finished_req(self, req: Req) -> None:
425
+ """Cache request when it finishes."""
426
+ if self.disable:
427
+ kv_indices = self.req_to_token_pool.req_to_token[
428
+ req.req_pool_idx,
429
+ : len(req.origin_input_ids) + max(len(req.output_ids) - 1, 0),
430
+ ]
431
+ self.token_to_kv_pool_allocator.free(kv_indices)
432
+ self.req_to_token_pool.free(req.req_pool_idx)
433
+ return
434
+
435
+ token_ids = (req.origin_input_ids + req.output_ids)[:-1]
436
+ kv_indices = self.req_to_token_pool.req_to_token[
437
+ req.req_pool_idx, : len(token_ids)
438
+ ]
439
+
440
+ page_aligned_len = len(kv_indices)
441
+ page_aligned_kv_indices = kv_indices.to(dtype=torch.int64, copy=True)
442
+
443
+ # Radix Cache takes one ref in memory pool
444
+ # insert the token_ids and kv_indices into the radix tree
445
+ # Note: the insert function already frees the overlapped kv_indices
446
+ mamba_value = (
447
+ self.req_to_token_pool.get_mamba_indices(req.req_pool_idx)
448
+ .unsqueeze(-1)
449
+ .clone()
450
+ )
451
+
452
+ new_prefix_len, mamba_exist = self.insert(
453
+ RadixKey(token_ids[:page_aligned_len], req.extra_key),
454
+ page_aligned_kv_indices,
455
+ mamba_value,
456
+ )
457
+
458
+ self.token_to_kv_pool_allocator.free(
459
+ kv_indices[len(req.prefix_indices) : new_prefix_len]
460
+ )
461
+
462
+ self.req_to_token_pool.free(req.req_pool_idx, free_mamba_cache=mamba_exist)
463
+ self.dec_lock_ref(req.last_node)
464
+
465
+ def cache_unfinished_req(self, req: Req, chunked=False) -> None:
466
+ """Cache request when it is unfinished."""
467
+ if self.disable:
468
+ kv_indices = self.req_to_token_pool.req_to_token[
469
+ req.req_pool_idx, : len(req.fill_ids)
470
+ ]
471
+ # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later
472
+ req.prefix_indices = kv_indices
473
+ return
474
+
475
+ token_ids = req.fill_ids
476
+ kv_indices = self.req_to_token_pool.req_to_token[
477
+ req.req_pool_idx, : len(token_ids)
478
+ ]
479
+ page_aligned_len = len(kv_indices)
480
+ page_aligned_kv_indices = kv_indices.to(dtype=torch.int64, copy=True)
481
+ page_aligned_token_ids = token_ids[:page_aligned_len]
482
+
483
+ mamba_value = self.req_to_token_pool.get_mamba_indices(
484
+ req.req_pool_idx
485
+ ).unsqueeze(-1)
486
+ # radix tree mamba value is forked from req space
487
+ mamba_value_forked = self.req_to_token_pool.mamba_pool.fork_from(mamba_value)
488
+
489
+ # if alloc mamba cache failed, do evict and alloc again
490
+ if mamba_value_forked is None:
491
+ self.evict_mamba(1)
492
+ mamba_value_forked = self.req_to_token_pool.mamba_pool.fork_from(
493
+ mamba_value
494
+ )
495
+ assert mamba_value_forked is not None, "Can not alloc mamba cache"
496
+ new_prefix_len, mamba_exist = self.insert(
497
+ RadixKey(page_aligned_token_ids, req.extra_key),
498
+ page_aligned_kv_indices,
499
+ mamba_value_forked,
500
+ )
501
+ self.token_to_kv_pool_allocator.free(
502
+ kv_indices[len(req.prefix_indices) : new_prefix_len]
503
+ )
504
+ # there is a mamba cache in radix cache, release it
505
+ if mamba_exist:
506
+ self.req_to_token_pool.mamba_pool.free(mamba_value_forked)
507
+
508
+ # The prefix indices could be updated, reuse it
509
+ new_indices, new_last_node, _, _ = self.match_prefix(
510
+ RadixKey(page_aligned_token_ids, req.extra_key)
511
+ )
512
+
513
+ if not mamba_exist:
514
+ assert torch.equal(new_last_node.mamba_value, mamba_value_forked)
515
+
516
+ assert len(req.prefix_indices) <= len(
517
+ new_indices
518
+ ), f"{req.prefix_indices=}, {new_indices=}"
519
+ assert new_prefix_len <= len(new_indices), f"{new_prefix_len=}, {new_indices=}"
520
+
521
+ self.req_to_token_pool.write(
522
+ (req.req_pool_idx, slice(len(req.prefix_indices), len(new_indices))),
523
+ new_indices[len(req.prefix_indices) :],
524
+ )
525
+
526
+ self.dec_lock_ref(req.last_node)
527
+ self.inc_lock_ref(new_last_node)
528
+
529
+ # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later
530
+ req.prefix_indices = new_indices
531
+ req.last_node = new_last_node
532
+
533
+ def pretty_print(self) -> None:
534
+ self._print_helper(self.root_node, 0)
535
+ total_size, total_mamba_size = self._total_size_helper()
536
+ print(f"#full_tokens: {total_size}, #mamba_num: {total_mamba_size}")
537
+
538
+ def total_size(self) -> Tuple[int, int]:
539
+ return self._total_size_helper()
540
+
541
+ def _evict_leaf_node(
542
+ self, x: TreeNode, is_evict_mamba: bool
543
+ ) -> Tuple[int, int, TreeNode, TreeNode]:
544
+ assert (
545
+ x.full_lock_ref == 0 and x.mamba_lock_ref == 0
546
+ ), f"evict leaf node invalid with {x.id=} {x.full_lock_ref=} {x.mamba_lock_ref=}"
547
+
548
+ assert x.mamba_value is not None, f"leaf node mamba value is not None, {x.id=}"
549
+ # 1. a leaf node, free full tokens and mamba
550
+ self.token_to_kv_pool_allocator.free(x.value)
551
+ full_num_evicted = len(x.value)
552
+ self.req_to_token_pool.mamba_pool.free(x.mamba_value)
553
+ mamba_num_evicted = len(x.mamba_value)
554
+
555
+ # 2. get the next node, update the lru lists
556
+ if is_evict_mamba:
557
+ x_next = self.mamba_lru_list.get_prev_no_lock(x)
558
+ else:
559
+ x_next = self.full_lru_list.get_prev_leaf_no_lock(x)
560
+ self.full_lru_list.remove_node(x)
561
+ self.mamba_lru_list.remove_node(x)
562
+
563
+ # 3. delete the leaf node
564
+ self._delete_leaf(x)
565
+
566
+ # 4. Iteratively delete tombstone leaves to maintain invariant that leaf nodes are not tombstone
567
+ x, leaf_full_num_evicted = self._iteratively_delete_tombstone_leaf(x)
568
+ full_num_evicted += leaf_full_num_evicted
569
+ return full_num_evicted, mamba_num_evicted, x, x_next
570
+
571
+ def evict_mamba(self, mamba_num: int) -> None:
572
+ if self.disable or mamba_num <= 0:
573
+ return
574
+ # get the least recently used node that is not locked, doesn't have to be a leaf
575
+ x = self.mamba_lru_list.get_lru_no_lock()
576
+ mamba_num_evicted = 0
577
+ # evict lru leaf nodes until mamba_num_tokens is reached
578
+ while mamba_num_evicted < mamba_num and (self.mamba_lru_list.in_list(x)):
579
+ assert x.mamba_value is not None, f"node has no mamba value, {x.id=}"
580
+ assert (
581
+ len(x.mamba_value) == 1
582
+ ), f"node has abnormal mamba length, {x.id=}, {len(x.mamba_value)=}"
583
+ assert x != self.root_node, f"root node is not evictable, {x.id=}"
584
+ assert x.mamba_lock_ref == 0, f"node is in use by mamba kv indices, {x.id=}"
585
+
586
+ if len(x.children) > 0:
587
+ # 1. an internal node, free mamba tokens.
588
+ self.req_to_token_pool.mamba_pool.free(x.mamba_value)
589
+ mamba_num_evicted += len(x.mamba_value)
590
+
591
+ # 2. get the next node, update the lru lists
592
+ x_next = self.mamba_lru_list.get_prev_no_lock(x)
593
+ self.mamba_lru_list.remove_node(x)
594
+
595
+ # 3. tombstone the node
596
+ self._tombstone_internal_node(x)
597
+ else:
598
+ _, mamba_evicted_delta, _, x_next = self._evict_leaf_node(x, True)
599
+ mamba_num_evicted += mamba_evicted_delta
600
+
601
+ x = x_next
602
+
603
+ def evict(self, full_num_tokens: int) -> None:
604
+ if self.disable or full_num_tokens <= 0:
605
+ return
606
+
607
+ full_num_evicted = 0
608
+ # get the least recently used leaf node that is not locked
609
+ x = self.full_lru_list.get_leaf_lru_no_lock()
610
+
611
+ while full_num_evicted < full_num_tokens and self.full_lru_list.in_list(x):
612
+ assert (
613
+ x != self.root_node
614
+ ), f"root node should not exist in full lru list, {x.id=}"
615
+ full_num_evicted_delta, _, x, x_next = self._evict_leaf_node(x, False)
616
+ full_num_evicted += full_num_evicted_delta
617
+
618
+ # if parent has no more children, it is a leaf. It is possible that this node is lru, so
619
+ # we need to get the first leaf node in the lru list
620
+ if len(x.parent.children) == 0:
621
+ x_next = self.full_lru_list.get_leaf_lru_no_lock()
622
+
623
+ x = x_next
624
+
625
+ def inc_lock_ref(self, node: TreeNode) -> Optional[int]:
626
+ """
627
+ Increment the lock reference count for the node.
628
+ It locks the full_lock_ref for nodes between the [last node, root), exclusive.
629
+ It locks the mamba_lock_ref for current node if its mamba_value exists.
630
+ """
631
+ if self.disable:
632
+ return None
633
+
634
+ # protect mamba value in current node if it exists
635
+ if node.mamba_value is not None:
636
+ if node.mamba_lock_ref == 0:
637
+ self.mamba_evictable_size_ -= len(node.mamba_value)
638
+ self.mamba_protected_size_ += len(node.mamba_value)
639
+ node.mamba_lock_ref += 1
640
+
641
+ while node != self.root_node:
642
+ # lock full from node to root
643
+ assert (
644
+ node.full_lock_ref >= 0
645
+ ), f"inc_lock_ref on node with {node.full_lock_ref=}, {node.id=}"
646
+ if node.full_lock_ref == 0:
647
+ self.full_evictable_size_ -= len(node.value)
648
+ self.full_protected_size_ += len(node.value)
649
+ node.full_lock_ref += 1
650
+ node = node.parent
651
+ return None
652
+
653
+ def dec_lock_ref(self, node: TreeNode):
654
+ """
655
+ Decrement the lock reference count for the node.
656
+ It unlocks the full_lock_ref for nodes between the [last node, root), exclusive.
657
+ It unlocks the mamba_lock_ref for current node if its mamba_value exists.
658
+ """
659
+ if self.disable:
660
+ return
661
+
662
+ if node.mamba_value is not None:
663
+ assert (
664
+ node.mamba_lock_ref > 0
665
+ ), f"dec_lock_ref on node with {node.mamba_lock_ref=}, {node.id=}"
666
+ if node.mamba_lock_ref == 1:
667
+ self.mamba_evictable_size_ += len(node.mamba_value)
668
+ self.mamba_protected_size_ -= len(node.mamba_value)
669
+ node.mamba_lock_ref -= 1
670
+
671
+ while node != self.root_node:
672
+ assert (
673
+ node.full_lock_ref > 0
674
+ ), f"dec_lock_ref on node with {node.full_lock_ref=}, {node.id=}"
675
+ if node.full_lock_ref == 1:
676
+ self.full_evictable_size_ += len(node.value)
677
+ self.full_protected_size_ -= len(node.value)
678
+ node.full_lock_ref -= 1
679
+ node = node.parent
680
+
681
+ def sanity_check(self):
682
+ self.full_lru_list.sanity_check(self)
683
+ self.mamba_lru_list.sanity_check(self)
684
+
685
+ def evictable_size(self) -> Tuple[int, int]:
686
+ # Note: use full_evictable_size() and mamba_evictable_size() instead.
687
+ raise NotImplementedError
688
+
689
+ def full_evictable_size(self) -> int:
690
+ return self.full_evictable_size_
691
+
692
+ def mamba_evictable_size(self) -> int:
693
+ return self.mamba_evictable_size_
694
+
695
+ # Note: this is expensive, only use for debug
696
+ def full_lru_list_evictable_size(self) -> int:
697
+ return self.full_lru_list.sanity_check_evictable_size()
698
+
699
+ # Note: this is expensive, only use for debug
700
+ def mamba_lru_list_evictable_size(self) -> int:
701
+ return self.mamba_lru_list.sanity_check_evictable_size()
702
+
703
+ def protected_size(self) -> Tuple[int, int]:
704
+ # Note: use full_protected_size() and mamba_protected_size() instead.
705
+ raise NotImplementedError
706
+
707
+ def full_protected_size(self) -> int:
708
+ # protected size refers to the size of the full cache that is locked
709
+ return self.full_protected_size_
710
+
711
+ def mamba_protected_size(self) -> int:
712
+ # protected size refers to the size of the mamba cache that is locked
713
+ return self.mamba_protected_size_
714
+
715
+ def all_values_flatten(self) -> torch.Tensor:
716
+ values = []
717
+
718
+ def _dfs_helper(node: TreeNode):
719
+ for _, child in node.children.items():
720
+ values.append(child.value)
721
+ _dfs_helper(child)
722
+
723
+ _dfs_helper(self.root_node)
724
+ return torch.cat(values)
725
+
726
+ ##### Internal Helper Functions #####
727
+
728
+ def _match_prefix_helper(
729
+ self, key: RadixKey
730
+ ) -> Tuple[List[torch.Tensor], TreeNode]:
731
+ """
732
+ Mamba prefix matching helper. It factors in the sliding window size such that
733
+ the matched node is guaranteed to either 1. connected to root without mamba tombstone,
734
+ or 2. the number of matching tokens from the matched node to the last mamba tombstone
735
+ node is greater than or equal to the sliding window size.
736
+ """
737
+ node = self.root_node
738
+ child_key = self.get_child_key_fn(key)
739
+
740
+ value = []
741
+ best_value_len = 0
742
+ best_last_node = node
743
+ while len(key) > 0 and child_key in node.children.keys():
744
+ child = node.children[child_key]
745
+ # update best_value_len and best_last_node if needed
746
+ if node.mamba_value is not None:
747
+ best_value_len = len(value)
748
+ best_last_node = node
749
+
750
+ prefix_len = self.key_match_fn(child.key, key)
751
+ if prefix_len < len(child.key):
752
+ new_node = self._split_node(child.key, child, prefix_len)
753
+ value.append(new_node.value)
754
+ node = new_node
755
+ break
756
+ else:
757
+ value.append(child.value)
758
+ node = child
759
+ key = key[prefix_len:]
760
+
761
+ if len(key):
762
+ child_key = self.get_child_key_fn(key)
763
+ # handle best_value_len and best_last_node, for the case that last node is fully matched
764
+ if node.mamba_value is not None:
765
+ best_value_len = len(value)
766
+ best_last_node = node
767
+
768
+ # update time for matched nodes, and make nodes closer to root to be least recently used
769
+ # this allows mamba to evict nodes closer to root first
770
+ self.full_lru_list.reset_node_and_parents_mru(best_last_node, self.root_node)
771
+ self.mamba_lru_list.reset_node_and_parents_mru(best_last_node, self.root_node)
772
+
773
+ # This last_access_time is for sanity check, can be deleted after validation in production
774
+ cur_time = time.monotonic()
775
+ while node:
776
+ node.last_access_time = cur_time
777
+ cur_time -= 0.0001
778
+ node = node.parent
779
+
780
+ return value[:best_value_len], best_last_node
781
+
+    def _split_node(self, key: RadixKey, child: TreeNode, split_len: int) -> TreeNode:
+        # new_node -> child
+        new_node = TreeNode()
+        new_node.children = {self.get_child_key_fn(key[split_len:]): child}
+        new_node.parent = child.parent
+        new_node.mamba_value = None  # mamba cache cannot be split
+        new_node.full_lock_ref = child.full_lock_ref
+        new_node.mamba_lock_ref = 0
+        new_node.key = child.key[:split_len]
+        new_node.value = child.value[:split_len]
+
+        # child time should be later than parent's time for mamba tombstone
+        child.last_access_time = time.monotonic()
+
+        self.full_lru_list.remove_node(child)
+        if child.mamba_value is not None:
+            self.mamba_lru_list.remove_node(child)
+        child.parent = new_node
+        child.key = child.key[split_len:]
+        child.value = child.value[split_len:]
+        new_node.parent.children[self.get_child_key_fn(key)] = new_node
+
+        # insert the new node and child into the lru lists, insert
+        # parent first so that parent is after child in the lru list
+        self.full_lru_list.insert_mru(new_node)
+        self.full_lru_list.insert_mru(child)
+        if child.mamba_value is not None:
+            self.mamba_lru_list.insert_mru(child)
+        return new_node
+
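A minimal sketch of the split itself with plain Python lists (hypothetical structures, not the method above): the first split_len tokens move into a new parent node, the remainder stays in the child, and the new parent starts as a tombstone because a recorded mamba state is only valid at the original node's full length.

# Toy sketch of splitting a radix node at `split_len` (hypothetical ToyNode).
class ToyNode:
    def __init__(self, key, value, mamba_state=None):
        self.key = key                # token ids owned by this node
        self.value = value            # matching KV-cache slot indices
        self.mamba_state = mamba_state
        self.children = {}

def split(child, split_len):
    parent = ToyNode(child.key[:split_len], child.value[:split_len])
    parent.mamba_state = None                      # a mamba state cannot be split
    parent.children[child.key[split_len]] = child  # re-attach the shortened child
    child.key = child.key[split_len:]
    child.value = child.value[split_len:]
    return parent

node = ToyNode([7, 8, 9, 10], [100, 101, 102, 103], mamba_state="s")
top = split(node, 2)
print(top.key, node.key)   # [7, 8] [9, 10]; only `node` keeps its mamba state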
+    def _insert_helper(
+        self,
+        node: TreeNode,
+        key: RadixKey,
+        value,
+        mamba_value,
+    ) -> Tuple[int, bool]:
+        # Update the last access time from root to leaf, so that
+        # mamba will tombstone the node closer to root first
+        assert mamba_value is not None, "Mamba value should not be None here."
+        node.last_access_time = time.monotonic()
+        if node != self.root_node:
+            self.full_lru_list.reset_node_mru(node)
+            if node.mamba_value is not None:
+                self.mamba_lru_list.reset_node_mru(node)
+        if len(key) == 0:
+            return 0, True
+
+        child_key = self.get_child_key_fn(key)
+
+        total_prefix_length = 0
+        while len(key) > 0 and child_key in node.children.keys():
+            node = node.children[child_key]
+            node.last_access_time = time.monotonic()
+            self.full_lru_list.reset_node_mru(node)
+            if node.mamba_value is not None:
+                self.mamba_lru_list.reset_node_mru(node)
+            prefix_len = self.key_match_fn(node.key, key)
+            total_prefix_length += prefix_len
+            key = key[prefix_len:]
+            value = value[prefix_len:]
+
+            if prefix_len < len(node.key):
+                new_node = self._split_node(node.key, node, prefix_len)
+                node = new_node
+
+            if len(key):
+                child_key = self.get_child_key_fn(key)
+
+        mamba_value_exist = False
+        if len(key):
+            new_node = TreeNode()
+            new_node.parent = node
+            new_node.key = key
+            new_node.value = value
+            new_node.mamba_value = mamba_value
+            self.full_lru_list.insert_mru(new_node)
+            self.full_evictable_size_ += len(value)
+            self.mamba_evictable_size_ += len(mamba_value)
+            self.mamba_lru_list.insert_mru(new_node)
+            node.children[child_key] = new_node
+        elif node.mamba_value is None:  # add for mamba tombstone
+            node.mamba_value = mamba_value
+            self.mamba_evictable_size_ += len(mamba_value)
+            self.mamba_lru_list.insert_mru(node)
+        else:
+            mamba_value_exist = True
+            self.mamba_lru_list.reset_node_mru(node)
+
+        return total_prefix_length, mamba_value_exist
+
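A condensed sketch of the three outcomes the insert can reach once the walk stops, using a toy dict node (hypothetical): leftover key tokens create a new leaf, a fully matched tombstone gets its mamba state filled back in, and a fully matched node that already has a state is only marked most recently used.

# Toy sketch of the three terminal cases of an insert (hypothetical structures).
def finish_insert(node, leftover_key, leftover_value, mamba_value, children):
    """Return a label describing which branch a real insert would take."""
    if leftover_key:                      # 1) unmatched suffix -> new leaf node
        children[leftover_key[0]] = {"key": leftover_key, "value": leftover_value,
                                     "mamba": mamba_value}
        return "new_leaf"
    if node.get("mamba") is None:         # 2) fully matched tombstone -> revive state
        node["mamba"] = mamba_value
        return "tombstone_revived"
    return "already_cached"               # 3) state already present -> only bump MRU

node = {"key": [1, 2], "value": [10, 11], "mamba": None}
print(finish_insert(node, [], [], "state", {}))   # tombstone_revived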
+    def _iteratively_delete_tombstone_leaf(
+        self, node: TreeNode
+    ) -> Tuple[TreeNode, int]:
+        full_num_evicted = 0
+        while node.parent.mamba_value is None and len(node.parent.children) == 0:
+            # root node is not evictable
+            if node.parent == self.root_node:
+                break
+            # if locked, the node is in use, skip
+            if node.parent.full_lock_ref > 0:
+                break
+            assert (
+                node.parent.mamba_lock_ref == 0
+            ), f"tombstone mamba_lock_ref should always be 0, {node.parent.full_lock_ref=}, {node.parent.mamba_lock_ref=}, {node.parent.id=}"
+            # deleting a tombstone node evicts its full tokens
+            self.token_to_kv_pool_allocator.free(node.parent.value)
+            full_num_evicted += len(node.parent.value)
+            self.full_lru_list.remove_node(node.parent)
+            self._delete_tombstone_leaf(node.parent)
+            node = node.parent
+
+        return node, full_num_evicted
+
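A minimal sketch of this cascading cleanup with toy fields (hypothetical, not the classes in this diff): after a leaf is removed, childless, unlocked tombstone ancestors are freed one by one, walking toward the root.

# Toy sketch: free childless, unlocked tombstone ancestors after deleting a leaf.
class ToyNode:
    def __init__(self, value, parent=None, mamba_state=None, lock_ref=0):
        self.value = value                # KV-cache slots held by this node
        self.parent = parent
        self.mamba_state = mamba_state    # None -> tombstone
        self.lock_ref = lock_ref
        self.children = {}
        if parent is not None:
            parent.children[id(self)] = self

def cleanup(node, root):
    freed = 0
    while (node.parent is not root
           and node.parent.mamba_state is None
           and not node.parent.children
           and node.parent.lock_ref == 0):
        freed += len(node.parent.value)
        node = node.parent
        node.parent.children.pop(id(node), None)   # detach the freed node
    return freed

root = ToyNode([], mamba_state="root")
a = ToyNode([0, 1], parent=root)                   # tombstone ancestor
leaf = ToyNode([2, 3], parent=a, mamba_state="s")
a.children.clear()                                 # pretend the leaf was just deleted
print(cleanup(leaf, root))                         # 2: node `a` is freed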
+    def _delete_leaf(self, node: TreeNode) -> None:
+        assert (
+            node.mamba_value is not None
+        ), f"Invariant violated: leaf node is a tombstone, {node.id=}"
+        assert len(node.children) == 0, f"leaf node has children, {node.id=}"
+        for k, v in node.parent.children.items():
+            if v == node:
+                break
+        del node.parent.children[k]
+        self.full_evictable_size_ -= len(node.key)
+        self.mamba_evictable_size_ -= len(node.mamba_value)
+
+    def _tombstone_internal_node(self, node: TreeNode) -> None:
+        assert len(node.children) != 0, f"Cannot tombstone a leaf node, {node.id=}"
+        self.mamba_evictable_size_ -= len(node.mamba_value)
+        node.mamba_value = None
+
+    def _delete_tombstone_leaf(self, node: TreeNode) -> None:
+        assert (
+            node.mamba_value is None
+        ), f"Deleting an unexpected non-tombstone leaf node, {node.id=}"
+        assert len(node.children) == 0, f"leaf node has children, {node.id=}"
+        for k, v in node.parent.children.items():
+            if v == node:
+                break
+        del node.parent.children[k]
+        self.full_evictable_size_ -= len(node.key)
+
+    def _collect_leaves(self) -> List[TreeNode]:
+        ret_list = []
+        stack = [self.root_node]
+
+        while stack:
+            cur_node = stack.pop()
+            if len(cur_node.children) == 0:
+                ret_list.append(cur_node)
+            else:
+                stack.extend(cur_node.children.values())
+
+        return ret_list
+
+    def _collect_nontombstone_nodes(self) -> List[TreeNode]:
+        ret_list = []
+        stack = [self.root_node]
+
+        while stack:
+            cur_node = stack.pop()
+            if cur_node.mamba_value is not None:
+                ret_list.append(cur_node)
+            stack.extend(cur_node.children.values())
+
+        return ret_list
+
+    def _collect_all_nodes(self) -> List[TreeNode]:
+        ret_list = []
+        stack = [self.root_node]
+        while stack:
+            cur_node = stack.pop()
+            ret_list.append(cur_node)
+            stack.extend(cur_node.children.values())
+        return ret_list
+
+    def _print_helper(self, node: TreeNode, indent: int) -> None:
+        """Prints the radix tree in a human-readable format."""
+        stack = [(node, indent)]
+        while stack:
+            current_node, current_indent = stack.pop()
+            print(
+                " " * current_indent,
+                f"[{current_node.id}]",
+                len(current_node.key),
+                f"fr={current_node.full_lock_ref}",
+                f"mr={current_node.mamba_lock_ref}",
+                f"fll={self.full_lru_list.in_list(current_node)}",
+                f"mll={self.mamba_lru_list.in_list(current_node)}",
+                f"mv={current_node.mamba_value}",
+            )
+            for key, child in current_node.children.items():
+                stack.append((child, current_indent + 2))
+
+                assert key == self.get_child_key_fn(
+                    child.key
+                ), f"{key=}, {self.get_child_key_fn(child.key)=}"
+
+    def _total_size_helper(self) -> Tuple[int, int]:
+        total_size = 0
+        total_mamba_size = 0
+        stack = [self.root_node]
+        while stack:
+            current_node = stack.pop()
+            total_size += len(current_node.value)
+            if current_node.mamba_value is not None:
+                total_mamba_size += len(current_node.mamba_value)
+            for child in current_node.children.values():
+                if child.evicted:
+                    continue
+                stack.append(child)
+        return total_size, total_mamba_size