PyPI - sglang - Versions diffs - 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl - Mend

sglang 0.5.3rc0-py3-none-any.whl → 0.5.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (482) hide show

sglang/bench_one_batch.py +54 -37
sglang/bench_one_batch_server.py +340 -34
sglang/bench_serving.py +340 -159
sglang/check_env.py +1 -1
sglang/compile_deep_gemm.py +6 -2
sglang/global_config.py +1 -25
sglang/lang/api.py +6 -0
sglang/lang/backend/runtime_endpoint.py +1 -1
sglang/lang/interpreter.py +1 -0
sglang/lang/ir.py +13 -0
sglang/launch_server.py +9 -2
sglang/profiler.py +20 -3
sglang/srt/_custom_ops.py +1 -1
sglang/srt/batch_invariant_ops/__init__.py +27 -0
sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
sglang/srt/compilation/backend.py +437 -0
sglang/srt/compilation/compilation_config.py +20 -0
sglang/srt/compilation/compilation_counter.py +47 -0
sglang/srt/compilation/compile.py +210 -0
sglang/srt/compilation/compiler_interface.py +503 -0
sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
sglang/srt/compilation/fix_functionalization.py +134 -0
sglang/srt/compilation/fx_utils.py +83 -0
sglang/srt/compilation/inductor_pass.py +140 -0
sglang/srt/compilation/pass_manager.py +66 -0
sglang/srt/compilation/piecewise_context_manager.py +40 -0
sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
sglang/srt/configs/__init__.py +8 -0
sglang/srt/configs/deepseek_ocr.py +262 -0
sglang/srt/configs/deepseekvl2.py +194 -96
sglang/srt/configs/dots_ocr.py +64 -0
sglang/srt/configs/dots_vlm.py +2 -7
sglang/srt/configs/falcon_h1.py +309 -0
sglang/srt/configs/load_config.py +33 -2
sglang/srt/configs/mamba_utils.py +117 -0
sglang/srt/configs/model_config.py +284 -118
sglang/srt/configs/modelopt_config.py +30 -0
sglang/srt/configs/nemotron_h.py +286 -0
sglang/srt/configs/olmo3.py +105 -0
sglang/srt/configs/points_v15_chat.py +29 -0
sglang/srt/configs/qwen3_next.py +11 -47
sglang/srt/configs/qwen3_omni.py +613 -0
sglang/srt/configs/qwen3_vl.py +576 -0
sglang/srt/connector/remote_instance.py +1 -1
sglang/srt/constrained/base_grammar_backend.py +6 -1
sglang/srt/constrained/llguidance_backend.py +5 -0
sglang/srt/constrained/outlines_backend.py +1 -1
sglang/srt/constrained/outlines_jump_forward.py +1 -1
sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
sglang/srt/constrained/utils.py +12 -0
sglang/srt/constrained/xgrammar_backend.py +26 -15
sglang/srt/debug_utils/dumper.py +10 -3
sglang/srt/disaggregation/ascend/conn.py +2 -2
sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
sglang/srt/disaggregation/base/conn.py +17 -4
sglang/srt/disaggregation/common/conn.py +268 -98
sglang/srt/disaggregation/decode.py +172 -39
sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
sglang/srt/disaggregation/fake/conn.py +11 -3
sglang/srt/disaggregation/mooncake/conn.py +203 -555
sglang/srt/disaggregation/nixl/conn.py +217 -63
sglang/srt/disaggregation/prefill.py +113 -270
sglang/srt/disaggregation/utils.py +36 -5
sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
sglang/srt/distributed/device_communicators/pynccl.py +24 -12
sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
sglang/srt/distributed/naive_distributed.py +5 -4
sglang/srt/distributed/parallel_state.py +203 -97
sglang/srt/elastic_ep/elastic_ep.py +74 -0
sglang/srt/entrypoints/context.py +3 -2
sglang/srt/entrypoints/engine.py +85 -65
sglang/srt/entrypoints/grpc_server.py +632 -305
sglang/srt/entrypoints/harmony_utils.py +2 -2
sglang/srt/entrypoints/http_server.py +169 -17
sglang/srt/entrypoints/http_server_engine.py +1 -7
sglang/srt/entrypoints/openai/protocol.py +327 -34
sglang/srt/entrypoints/openai/serving_base.py +74 -8
sglang/srt/entrypoints/openai/serving_chat.py +202 -118
sglang/srt/entrypoints/openai/serving_classify.py +204 -0
sglang/srt/entrypoints/openai/serving_completions.py +20 -4
sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
sglang/srt/entrypoints/openai/serving_responses.py +47 -2
sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
sglang/srt/environ.py +323 -0
sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
sglang/srt/eplb/expert_distribution.py +3 -4
sglang/srt/eplb/expert_location.py +30 -5
sglang/srt/eplb/expert_location_dispatch.py +2 -2
sglang/srt/eplb/expert_location_updater.py +2 -2
sglang/srt/function_call/base_format_detector.py +17 -18
sglang/srt/function_call/function_call_parser.py +21 -16
sglang/srt/function_call/glm4_moe_detector.py +4 -8
sglang/srt/function_call/gpt_oss_detector.py +24 -1
sglang/srt/function_call/json_array_parser.py +61 -0
sglang/srt/function_call/kimik2_detector.py +17 -4
sglang/srt/function_call/utils.py +98 -7
sglang/srt/grpc/compile_proto.py +245 -0
sglang/srt/grpc/grpc_request_manager.py +915 -0
sglang/srt/grpc/health_servicer.py +189 -0
sglang/srt/grpc/scheduler_launcher.py +181 -0
sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
sglang/srt/layers/activation.py +11 -7
sglang/srt/layers/attention/aiter_backend.py +17 -18
sglang/srt/layers/attention/ascend_backend.py +125 -10
sglang/srt/layers/attention/attention_registry.py +226 -0
sglang/srt/layers/attention/base_attn_backend.py +32 -4
sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
sglang/srt/layers/attention/fla/chunk.py +0 -1
sglang/srt/layers/attention/fla/chunk_o.py +1 -1
sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
sglang/srt/layers/attention/fla/index.py +0 -2
sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
sglang/srt/layers/attention/fla/utils.py +0 -3
sglang/srt/layers/attention/fla/wy_fast.py +0 -2
sglang/srt/layers/attention/flashattention_backend.py +52 -15
sglang/srt/layers/attention/flashinfer_backend.py +357 -212
sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
sglang/srt/layers/attention/flashmla_backend.py +9 -7
sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
sglang/srt/layers/attention/intel_amx_backend.py +1 -1
sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
sglang/srt/layers/attention/mamba/mamba.py +514 -1
sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
sglang/srt/layers/attention/nsa/transform_index.py +144 -0
sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
sglang/srt/layers/attention/nsa/utils.py +23 -0
sglang/srt/layers/attention/nsa_backend.py +1201 -0
sglang/srt/layers/attention/tbo_backend.py +6 -6
sglang/srt/layers/attention/torch_flex_backend.py +325 -0
sglang/srt/layers/attention/triton_backend.py +249 -42
sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
sglang/srt/layers/attention/utils.py +11 -7
sglang/srt/layers/attention/vision.py +61 -3
sglang/srt/layers/attention/wave_backend.py +4 -4
sglang/srt/layers/attention/xpu_backend.py +1028 -0
sglang/srt/layers/communicator.py +19 -7
sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
sglang/srt/layers/dp_attention.py +28 -1
sglang/srt/layers/elementwise.py +3 -1
sglang/srt/layers/layernorm.py +47 -15
sglang/srt/layers/linear.py +30 -5
sglang/srt/layers/logits_processor.py +161 -18
sglang/srt/layers/modelopt_utils.py +11 -0
sglang/srt/layers/moe/cutlass_moe.py +0 -2
sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
sglang/srt/layers/moe/ep_moe/layer.py +243 -448
sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
sglang/srt/layers/moe/moe_runner/runner.py +3 -0
sglang/srt/layers/moe/moe_runner/triton.py +3 -1
sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
sglang/srt/layers/moe/router.py +51 -15
sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
sglang/srt/layers/moe/topk.py +3 -2
sglang/srt/layers/moe/utils.py +27 -1
sglang/srt/layers/parameter.py +23 -6
sglang/srt/layers/quantization/__init__.py +2 -53
sglang/srt/layers/quantization/awq.py +183 -6
sglang/srt/layers/quantization/awq_triton.py +29 -0
sglang/srt/layers/quantization/base_config.py +20 -1
sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
sglang/srt/layers/quantization/fp8.py +86 -20
sglang/srt/layers/quantization/fp8_kernel.py +55 -10
sglang/srt/layers/quantization/fp8_utils.py +43 -15
sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
sglang/srt/layers/quantization/gptq.py +0 -1
sglang/srt/layers/quantization/int8_kernel.py +18 -2
sglang/srt/layers/quantization/marlin_utils.py +12 -0
sglang/srt/layers/quantization/modelopt_quant.py +141 -81
sglang/srt/layers/quantization/mxfp4.py +17 -34
sglang/srt/layers/quantization/petit.py +1 -1
sglang/srt/layers/quantization/quark/quark.py +3 -1
sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
sglang/srt/layers/quantization/unquant.py +1 -4
sglang/srt/layers/quantization/utils.py +0 -1
sglang/srt/layers/quantization/w4afp8.py +51 -24
sglang/srt/layers/quantization/w8a8_int8.py +45 -27
sglang/srt/layers/radix_attention.py +59 -9
sglang/srt/layers/rotary_embedding.py +750 -46
sglang/srt/layers/sampler.py +84 -16
sglang/srt/layers/sparse_pooler.py +98 -0
sglang/srt/layers/utils.py +23 -1
sglang/srt/layers/vocab_parallel_embedding.py +4 -1
sglang/srt/lora/backend/base_backend.py +3 -3
sglang/srt/lora/backend/chunked_backend.py +348 -0
sglang/srt/lora/backend/triton_backend.py +9 -4
sglang/srt/lora/eviction_policy.py +139 -0
sglang/srt/lora/lora.py +7 -5
sglang/srt/lora/lora_manager.py +33 -7
sglang/srt/lora/lora_registry.py +1 -1
sglang/srt/lora/mem_pool.py +41 -17
sglang/srt/lora/triton_ops/__init__.py +4 -0
sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
sglang/srt/lora/utils.py +7 -5
sglang/srt/managers/cache_controller.py +83 -152
sglang/srt/managers/data_parallel_controller.py +156 -87
sglang/srt/managers/detokenizer_manager.py +51 -24
sglang/srt/managers/io_struct.py +223 -129
sglang/srt/managers/mm_utils.py +49 -10
sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
sglang/srt/managers/multimodal_processor.py +1 -2
sglang/srt/managers/overlap_utils.py +130 -0
sglang/srt/managers/schedule_batch.py +340 -529
sglang/srt/managers/schedule_policy.py +158 -18
sglang/srt/managers/scheduler.py +665 -620
sglang/srt/managers/scheduler_input_blocker.py +1 -1
sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
sglang/srt/managers/scheduler_pp_mixin.py +341 -0
sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
sglang/srt/managers/tokenizer_manager.py +462 -226
sglang/srt/managers/tp_worker.py +217 -156
sglang/srt/managers/utils.py +79 -47
sglang/srt/mem_cache/allocator.py +21 -22
sglang/srt/mem_cache/allocator_ascend.py +42 -28
sglang/srt/mem_cache/base_prefix_cache.py +3 -3
sglang/srt/mem_cache/chunk_cache.py +20 -2
sglang/srt/mem_cache/common.py +480 -0
sglang/srt/mem_cache/evict_policy.py +38 -0
sglang/srt/mem_cache/hicache_storage.py +44 -2
sglang/srt/mem_cache/hiradix_cache.py +134 -34
sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
sglang/srt/mem_cache/memory_pool.py +602 -208
sglang/srt/mem_cache/memory_pool_host.py +134 -183
sglang/srt/mem_cache/multimodal_cache.py +0 -1
sglang/srt/mem_cache/radix_cache.py +263 -78
sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
sglang/srt/mem_cache/storage/__init__.py +10 -0
sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
sglang/srt/mem_cache/storage/backend_factory.py +223 -0
sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
sglang/srt/mem_cache/swa_radix_cache.py +115 -58
sglang/srt/metrics/collector.py +113 -120
sglang/srt/metrics/func_timer.py +3 -8
sglang/srt/metrics/utils.py +8 -1
sglang/srt/model_executor/cpu_graph_runner.py +2 -2
sglang/srt/model_executor/cuda_graph_runner.py +81 -36
sglang/srt/model_executor/forward_batch_info.py +40 -50
sglang/srt/model_executor/model_runner.py +507 -319
sglang/srt/model_executor/npu_graph_runner.py +11 -5
sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
sglang/srt/model_loader/__init__.py +1 -1
sglang/srt/model_loader/loader.py +438 -37
sglang/srt/model_loader/utils.py +0 -1
sglang/srt/model_loader/weight_utils.py +200 -27
sglang/srt/models/apertus.py +2 -3
sglang/srt/models/arcee.py +2 -2
sglang/srt/models/bailing_moe.py +40 -56
sglang/srt/models/bailing_moe_nextn.py +3 -4
sglang/srt/models/bert.py +1 -1
sglang/srt/models/deepseek_nextn.py +25 -4
sglang/srt/models/deepseek_ocr.py +1516 -0
sglang/srt/models/deepseek_v2.py +793 -235
sglang/srt/models/dots_ocr.py +171 -0
sglang/srt/models/dots_vlm.py +0 -1
sglang/srt/models/dots_vlm_vit.py +1 -1
sglang/srt/models/falcon_h1.py +570 -0
sglang/srt/models/gemma3_causal.py +0 -2
sglang/srt/models/gemma3_mm.py +17 -1
sglang/srt/models/gemma3n_mm.py +2 -3
sglang/srt/models/glm4_moe.py +17 -40
sglang/srt/models/glm4_moe_nextn.py +4 -4
sglang/srt/models/glm4v.py +3 -2
sglang/srt/models/glm4v_moe.py +6 -6
sglang/srt/models/gpt_oss.py +12 -35
sglang/srt/models/grok.py +10 -23
sglang/srt/models/hunyuan.py +2 -7
sglang/srt/models/interns1.py +0 -1
sglang/srt/models/kimi_vl.py +1 -7
sglang/srt/models/kimi_vl_moonvit.py +4 -2
sglang/srt/models/llama.py +6 -2
sglang/srt/models/llama_eagle3.py +1 -1
sglang/srt/models/longcat_flash.py +6 -23
sglang/srt/models/longcat_flash_nextn.py +4 -15
sglang/srt/models/mimo.py +2 -13
sglang/srt/models/mimo_mtp.py +1 -2
sglang/srt/models/minicpmo.py +7 -5
sglang/srt/models/mixtral.py +1 -4
sglang/srt/models/mllama.py +1 -1
sglang/srt/models/mllama4.py +27 -6
sglang/srt/models/nemotron_h.py +511 -0
sglang/srt/models/olmo2.py +31 -4
sglang/srt/models/opt.py +5 -5
sglang/srt/models/phi.py +1 -1
sglang/srt/models/phi4mm.py +1 -1
sglang/srt/models/phimoe.py +0 -1
sglang/srt/models/pixtral.py +0 -3
sglang/srt/models/points_v15_chat.py +186 -0
sglang/srt/models/qwen.py +0 -1
sglang/srt/models/qwen2.py +0 -7
sglang/srt/models/qwen2_5_vl.py +5 -5
sglang/srt/models/qwen2_audio.py +2 -15
sglang/srt/models/qwen2_moe.py +70 -4
sglang/srt/models/qwen2_vl.py +6 -3
sglang/srt/models/qwen3.py +18 -3
sglang/srt/models/qwen3_moe.py +50 -38
sglang/srt/models/qwen3_next.py +43 -21
sglang/srt/models/qwen3_next_mtp.py +3 -4
sglang/srt/models/qwen3_omni_moe.py +661 -0
sglang/srt/models/qwen3_vl.py +791 -0
sglang/srt/models/qwen3_vl_moe.py +343 -0
sglang/srt/models/registry.py +15 -3
sglang/srt/models/roberta.py +55 -3
sglang/srt/models/sarashina2_vision.py +268 -0
sglang/srt/models/solar.py +505 -0
sglang/srt/models/starcoder2.py +357 -0
sglang/srt/models/step3_vl.py +3 -5
sglang/srt/models/torch_native_llama.py +9 -2
sglang/srt/models/utils.py +61 -0
sglang/srt/multimodal/processors/base_processor.py +21 -9
sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
sglang/srt/multimodal/processors/dots_vlm.py +2 -4
sglang/srt/multimodal/processors/glm4v.py +1 -5
sglang/srt/multimodal/processors/internvl.py +20 -10
sglang/srt/multimodal/processors/janus_pro.py +0 -1
sglang/srt/multimodal/processors/mllama4.py +0 -8
sglang/srt/multimodal/processors/phi4mm.py +0 -1
sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
sglang/srt/multimodal/processors/qwen_vl.py +83 -17
sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
sglang/srt/multimodal/processors/step3_vl.py +1 -1
sglang/srt/parser/conversation.py +41 -0
sglang/srt/parser/jinja_template_utils.py +6 -0
sglang/srt/parser/reasoning_parser.py +0 -1
sglang/srt/sampling/custom_logit_processor.py +77 -2
sglang/srt/sampling/sampling_batch_info.py +36 -23
sglang/srt/sampling/sampling_params.py +75 -0
sglang/srt/server_args.py +1300 -338
sglang/srt/server_args_config_parser.py +146 -0
sglang/srt/single_batch_overlap.py +161 -0
sglang/srt/speculative/base_spec_worker.py +34 -0
sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
sglang/srt/speculative/cpp_ngram/param.h +125 -0
sglang/srt/speculative/cpp_ngram/queue.h +71 -0
sglang/srt/speculative/draft_utils.py +226 -0
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
sglang/srt/speculative/eagle_info.py +786 -0
sglang/srt/speculative/eagle_info_v2.py +458 -0
sglang/srt/speculative/eagle_utils.py +113 -1270
sglang/srt/speculative/eagle_worker.py +120 -285
sglang/srt/speculative/eagle_worker_v2.py +702 -0
sglang/srt/speculative/ngram_info.py +433 -0
sglang/srt/speculative/ngram_worker.py +246 -0
sglang/srt/speculative/spec_info.py +49 -0
sglang/srt/speculative/spec_utils.py +641 -0
sglang/srt/speculative/standalone_worker.py +4 -14
sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
sglang/srt/tracing/trace.py +32 -6
sglang/srt/two_batch_overlap.py +35 -18
sglang/srt/utils/__init__.py +2 -0
sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
sglang/srt/{utils.py → utils/common.py} +583 -113
sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
sglang/srt/{offloader.py → utils/offloader.py} +4 -4
sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
sglang/srt/utils/profile_merger.py +199 -0
sglang/srt/utils/rpd_utils.py +452 -0
sglang/srt/utils/slow_rank_detector.py +71 -0
sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
sglang/srt/warmup.py +8 -4
sglang/srt/weight_sync/utils.py +1 -1
sglang/test/attention/test_flashattn_backend.py +1 -1
sglang/test/attention/test_flashattn_mla_backend.py +0 -1
sglang/test/attention/test_prefix_chunk_info.py +0 -2
sglang/test/attention/test_trtllm_mla_backend.py +221 -53
sglang/test/few_shot_gsm8k_engine.py +2 -4
sglang/test/get_logits_ut.py +57 -0
sglang/test/kit_matched_stop.py +157 -0
sglang/test/longbench_v2/__init__.py +1 -0
sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
sglang/test/run_eval.py +120 -11
sglang/test/runners.py +3 -1
sglang/test/send_one.py +42 -7
sglang/test/simple_eval_common.py +8 -2
sglang/test/simple_eval_gpqa.py +0 -1
sglang/test/simple_eval_humaneval.py +0 -3
sglang/test/simple_eval_longbench_v2.py +344 -0
sglang/test/simple_eval_mmmu_vlm.py +441 -0
sglang/test/test_block_fp8.py +3 -4
sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
sglang/test/test_cutlass_moe.py +1 -2
sglang/test/test_cutlass_w4a8_moe.py +10 -20
sglang/test/test_deterministic.py +430 -0
sglang/test/test_deterministic_utils.py +73 -0
sglang/test/test_disaggregation_utils.py +93 -1
sglang/test/test_marlin_moe.py +0 -1
sglang/test/test_programs.py +1 -1
sglang/test/test_utils.py +432 -16
sglang/utils.py +10 -1
sglang/version.py +1 -1
{sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
{sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
sglang/srt/entrypoints/grpc_request_manager.py +0 -580
sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
sglang/srt/mem_cache/lora_radix_cache.py +0 -421
sglang/srt/speculative/build_eagle_tree.py +0 -427
sglang/test/test_block_fp8_ep.py +0 -358
/sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
/sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
/sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
/sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
{sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
{sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
{sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0

sglang/srt/lora/backend/chunked_backend.py ADDED Viewed

@@ -0,0 +1,348 @@
+from typing import Optional
+import torch
+from sglang.srt.lora.backend.base_backend import BaseLoRABackend
+from sglang.srt.lora.triton_ops import (
+    chunked_sgmv_lora_expand_forward,
+    chunked_sgmv_lora_shrink_forward,
+)
+from sglang.srt.lora.utils import LoRABatchInfo
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.server_args import ServerArgs
+# Lower bound on the chunk size: `_determine_chunk_size` returns this value
+# whenever the configured `max_lora_chunk_size` does not exceed it.
+MIN_CHUNK_SIZE = 16
+class ChunkedSgmvLoRABackend(BaseLoRABackend):
+    """
+    Chunked LoRA backend using segmented matrix-vector multiplication.
+    This backend is largely based on the SGMV (Segmented Gather Matrix-Vector multiplication) algorithm
+    introduced in the Punica paper (https://arxiv.org/pdf/2310.18547). One main variation made here is to
+    segment the input sequences into fixed-size chunks, which reduces excessive kernel launches especially
+    when the LoRA distribution is skewed.
+    """
+    name = "csgmv"
+    def __init__(
+        self,
+        max_loras_per_batch: int,
+        device: torch.device,
+        server_args: ServerArgs,
+    ):
+        super().__init__(max_loras_per_batch, device)
+        self.max_chunk_size = server_args.max_lora_chunk_size
+    def run_lora_a_sgemm(
+        self, x: torch.Tensor, weights: torch.Tensor, *args, **kwargs
+    ) -> torch.Tensor:
+        return chunked_sgmv_lora_shrink_forward(
+            x=x,
+            weights=weights,
+            batch_info=self.batch_info,
+            num_slices=1,
+        )
+    def run_lora_b_sgemm(
+        self,
+        x: torch.Tensor,
+        weights: torch.Tensor,
+        output_offset: torch.Tensor,
+        base_output: torch.Tensor = None,
+        *args,
+        **kwargs
+    ) -> torch.Tensor:
+        # For simple lora B, we use slice offsets [0, output_dim]
+        output_dim = weights.shape[-2]
+        max_slice_size = output_dim
+        return chunked_sgmv_lora_expand_forward(
+            x=x,
+            weights=weights,
+            batch_info=self.batch_info,
+            slice_offsets=output_offset,
+            max_slice_size=max_slice_size,
+            base_output=base_output,
+        )
+    def run_qkv_lora(
+        self,
+        x: torch.Tensor,
+        qkv_lora_a: torch.Tensor,
+        qkv_lora_b: torch.Tensor,
+        output_offset: torch.Tensor,
+        max_qkv_out_dim: int,
+        base_output: torch.Tensor = None,
+        *args,
+        **kwargs
+    ) -> torch.Tensor:
+        # x: (s, input_dim)
+        # qkv_lora_a: (num_lora, 3 * r, input_dim)
+        # qkv_lora_b: (num_lora, output_dim_q + 2 * output_dim_kv, r)
+        assert isinstance(qkv_lora_b, torch.Tensor)
+        lora_a_output = chunked_sgmv_lora_shrink_forward(
+            x=x,
+            weights=qkv_lora_a,
+            batch_info=self.batch_info,
+            num_slices=3,
+        )
+        lora_output = chunked_sgmv_lora_expand_forward(
+            x=lora_a_output,
+            weights=qkv_lora_b,
+            batch_info=self.batch_info,
+            slice_offsets=output_offset,
+            max_slice_size=max_qkv_out_dim,
+            base_output=base_output,
+        )
+        return lora_output
+    def run_gate_up_lora(
+        self,
+        x: torch.Tensor,
+        gate_up_lora_a: torch.Tensor,
+        gate_up_lora_b: torch.Tensor,
+        output_offset: torch.Tensor,
+        base_output: torch.Tensor = None,
+        *args,
+        **kwargs
+    ) -> torch.Tensor:
+        # x: (s, input_dim)
+        # gate_up_lora_a: (num_lora, 2 * r, input_dim)
+        # gate_up_lora_b: (num_lora, 2 * output_dim, r)
+        assert isinstance(gate_up_lora_b, torch.Tensor)
+        output_dim = gate_up_lora_b.shape[-2] // 2
+        # lora_a_output: (s, 2 * r)
+        lora_a_output = chunked_sgmv_lora_shrink_forward(
+            x=x,
+            weights=gate_up_lora_a,
+            batch_info=self.batch_info,
+            num_slices=2,
+        )
+        lora_output = chunked_sgmv_lora_expand_forward(
+            x=lora_a_output,
+            weights=gate_up_lora_b,
+            batch_info=self.batch_info,
+            slice_offsets=output_offset,
+            max_slice_size=output_dim,
+            base_output=base_output,
+        )
+        return lora_output
+    def _determine_chunk_size(self, forward_batch: ForwardBatch) -> int:
+        """
+        Heuristically determine the chunk size based on token token number in a batch.
+        Args:
+            forward_batch (ForwardBatch): The batch information containing sequence lengths.
+        Returns:
+            The determined chunk size
+        """
+        if self.max_chunk_size <= MIN_CHUNK_SIZE:
+            return MIN_CHUNK_SIZE
+        num_tokens = (
+            forward_batch.extend_num_tokens
+            if forward_batch.forward_mode.is_extend()
+            else forward_batch.batch_size
+        )
+        if num_tokens >= 256:
+            chunk_size = 128
+        elif num_tokens >= 64:
+            chunk_size = 32
+        else:  # num_tokens < 64
+            chunk_size = 16
+        return min(self.max_chunk_size, chunk_size)
+    def prepare_lora_batch(
+        self,
+        forward_batch: ForwardBatch,
+        weight_indices: list[int],
+        lora_ranks: list[int],
+        scalings: list[float],
+        batch_info: Optional[LoRABatchInfo] = None,
+    ):
+        """
+        Build (or refresh) the LoRA batch metadata and store it on `self.batch_info`.
+        The chunk size comes from `_determine_chunk_size`, the token permutation and
+        reordered adapter indices from `_get_permutation`, and the per-segment adapter
+        indices and segment pointers from `_get_segments_info`. The results are copied
+        into the tensors held by the batch info with non-blocking copies.
+        Args:
+            forward_batch (ForwardBatch): Batch being prepared; its `batch_size` is
+                recorded as `bs`.
+            weight_indices (list[int]): Adapter index per sequence, forwarded to
+                `_get_permutation` as `seq_weight_indices`.
+            lora_ranks (list[int]): Copied into `batch_info.lora_ranks`.
+            scalings (list[float]): Copied into `batch_info.scalings`.
+            batch_info (Optional[LoRABatchInfo]): Existing metadata object to update
+                in place. When None, a new one is allocated on `self.device`.
+        """
+        chunk_size = self._determine_chunk_size(forward_batch)
+        permutation, weight_indices_reordered = ChunkedSgmvLoRABackend._get_permutation(
+            seq_weight_indices=weight_indices,
+            forward_batch=forward_batch,
+        )
+        seg_weight_indices, seg_indptr = self._get_segments_info(
+            weights_reordered=weight_indices_reordered,
+            chunk_size=chunk_size,
+        )
+        num_segments = len(seg_weight_indices)
+        # Staged in pinned CPU memory before the non-blocking copies below.
+        lora_ranks_tensor = torch.tensor(
+            lora_ranks, dtype=torch.int32, pin_memory=True, device="cpu"
+        )
+        scalings_tensor = torch.tensor(
+            scalings, dtype=torch.float, pin_memory=True, device="cpu"
+        )
+        if batch_info is None:
+            # No metadata object supplied: allocate buffers sized exactly for this batch.
+            batch_info = LoRABatchInfo(
+                bs=forward_batch.batch_size,
+                num_segments=num_segments,
+                max_len=chunk_size,
+                use_cuda_graph=False,
+                seg_indptr=torch.empty(
+                    (num_segments + 1,), dtype=torch.int32, device=self.device
+                ),
+                weight_indices=torch.empty(
+                    (num_segments,), dtype=torch.int32, device=self.device
+                ),
+                lora_ranks=torch.empty(
+                    (self.max_loras_per_batch,), dtype=torch.int32, device=self.device
+                ),
+                scalings=torch.empty(
+                    (self.max_loras_per_batch,), dtype=torch.float, device=self.device
+                ),
+                permutation=torch.empty(
+                    (len(permutation),), dtype=torch.int32, device=self.device
+                ),
+                # Not used in chunked kernels
+                seg_lens=None,
+            )
+        else:
+            # Supplied object: only the scalar fields are updated here; its tensors are
+            # reused as-is (presumably preallocated large enough -- verify against caller).
+            batch_info.bs = forward_batch.batch_size
+            batch_info.num_segments = num_segments
+            batch_info.max_len = chunk_size
+        # Copy to device asynchronously
+        batch_info.lora_ranks[: self.max_loras_per_batch].copy_(
+            lora_ranks_tensor, non_blocking=True
+        )
+        batch_info.scalings[: self.max_loras_per_batch].copy_(
+            scalings_tensor, non_blocking=True
+        )
+        # Only the leading slices are written, so a reused buffer may be longer than needed.
+        batch_info.weight_indices[:num_segments].copy_(
+            seg_weight_indices, non_blocking=True
+        )
+        batch_info.seg_indptr[: num_segments + 1].copy_(seg_indptr, non_blocking=True)
+        batch_info.permutation[: len(permutation)].copy_(permutation, non_blocking=True)
+        # The run_* methods read this attribute when launching the kernels.
+        self.batch_info = batch_info
+    @staticmethod
+    def _get_permutation(seq_weight_indices, forward_batch: ForwardBatch):
+        """
+        Computes permutation indices for reordering tokens by their LoRA adapter assignments.
+        This function implements the "gather" step in Chunked Segmented Gather Matrix Vector
+        multiplication by creating a permutation that groups tokens by their LoRA adapter.
+        Tokens using the same LoRA adapter are placed together to enable efficient batched
+        computation. The sort is stable, so tokens sharing an adapter keep their original
+        relative order.
+        Example:
+            seq_weight_indices = [0, 1, 0]  # 3 sequences using adapters [0, 1, 0]
+            extend_seq_lens = [2, 1, 3]     # sequence lengths [2, 1, 3 tokens]
+            # Creates row_weight_indices: [0, 0, 1, 0, 0, 0] (6 tokens total)
+            # Returns permutation: [0, 1, 3, 4, 5, 2] (groups adapter 0 tokens together)
+            # weights_reordered: [0, 0, 0, 0, 0, 1] (sorted by adapter)
+        Args:
+            seq_weight_indices: List of LoRA adapter indices for each sequence
+            forward_batch (ForwardBatch): Batch information containing sequence lengths
+        Returns:
+            tuple: (permutation, weights_reordered) where:
+                - permutation: Token reordering indices to group by adapter
+                  (torch.long, allocated in pinned memory)
+                - weights_reordered: Sorted adapter indices for each token (torch.int32)
+        """
+        # All tensors created in this block default to the CPU device.
+        with torch.device("cpu"):
+            seq_weight_indices = torch.tensor(seq_weight_indices, dtype=torch.int32)
+            # Tokens per sequence: the extend lengths in extend mode, otherwise
+            # exactly one token per sequence.
+            seg_lens_cpu = (
+                torch.tensor(
+                    forward_batch.extend_seq_lens_cpu,
+                    dtype=torch.int32,
+                )
+                if forward_batch.forward_mode.is_extend()
+                else torch.ones(forward_batch.batch_size, dtype=torch.int32)
+            )
+            # Expand the per-sequence adapter index to one entry per token.
+            row_weight_indices = torch.repeat_interleave(
+                seq_weight_indices, seg_lens_cpu
+            )
+            # argsort writes directly into a pinned buffer so the caller can issue
+            # a non-blocking copy of the result.
+            permutation = torch.empty(
+                (len(row_weight_indices),), dtype=torch.long, pin_memory=True
+            )
+            torch.argsort(row_weight_indices, stable=True, out=permutation)
+            weights_reordered = row_weight_indices[permutation]
+            return permutation, weights_reordered
+    def _get_segments_info(self, weights_reordered: torch.Tensor, chunk_size: int):
+        """
+        Computes segment information for chunked SGMV operations.
+        This function takes the reordered weight indices and creates segments of at most
+        chunk_size tokens for efficient kernel execution. Each segment contains tokens
+        that use the same LoRA adapter, enabling vectorized computation.
+        The segmentation is necessary because:
+        1. GPU kernels work efficiently on fixed-size blocks
+        2. Large groups of tokens using the same adapter are split into manageable chunks
+        3. Each segment can be processed independently in parallel
+        Example:
+            weights_reordered = [0, 0, 0, 0, 0, 1]  # 5 tokens with adapter 0, 1 with adapter 1
+            chunk_size = 3
+            # Creates segments:
+            # Segment 0: tokens 0-2 (adapter 0), length=3
+            # Segment 1: tokens 3-4 (adapter 0), length=2
+            # Segment 2: token 5 (adapter 1), length=1
+            # Returns:
+            # weight_indices_list: [0, 0, 1] (adapter for each segment)
+            # seg_indptr: [0, 3, 5, 6] (cumulative segment boundaries)
+        Args:
+            weights_reordered (torch.Tensor): Sorted adapter indices for each token
+            chunk_size (int): Maximum number of tokens in each segment; only the last
+                segment of an adapter group may be shorter
+        Returns:
+            tuple: (weight_indices_list, seg_indptr) where:
+                - weight_indices_list: LoRA adapter index for each segment
+                  (torch.int32, pinned memory)
+                - seg_indptr: Cumulative segment boundaries (CSR-style indptr)
+                  (torch.int32, pinned memory, one more entry than there are segments)
+        """
+        with torch.device("cpu"):
+            # One (adapter, token count) pair per run of identical adapter indices.
+            unique_weights, counts = torch.unique_consecutive(
+                weights_reordered, return_counts=True
+            )
+            weight_indices_list = []
+            seg_lens_list = []
+            for weight_idx, group_len in zip(unique_weights, counts):
+                group_len = group_len.item()
+                # Ceiling division: number of segments needed to cover the group.
+                num_segs = (group_len + chunk_size - 1) // chunk_size
+                weight_indices_list.extend([weight_idx.item()] * num_segs)
+                # All segments but the last are full; the last takes the remainder.
+                seg_lens_list.extend([chunk_size] * (num_segs - 1))
+                seg_lens_list.append(group_len - (num_segs - 1) * chunk_size)
+            seg_lens = torch.tensor(seg_lens_list, dtype=torch.int32)
+            weight_indices_list = torch.tensor(
+                weight_indices_list, dtype=torch.int32, pin_memory=True
+            )
+            seg_indptr = torch.empty(
+                (len(seg_lens) + 1,), dtype=torch.int32, pin_memory=True
+            )
+            # Prefix sums of the segment lengths, with a leading 0.
+            seg_indptr[0] = 0
+            seg_indptr[1:] = torch.cumsum(seg_lens, dim=0)
+            return weight_indices_list, seg_indptr

sglang/srt/lora/backend/triton_backend.py CHANGED Viewed

@@ -16,7 +16,12 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 class TritonLoRABackend(BaseLoRABackend):
     name = "triton"
-    def __init__(self, max_loras_per_batch: int, device: torch.device):
+    def __init__(
+        self,
+        max_loras_per_batch: int,
+        device: torch.device,
+        **kwargs,
+    ):
         super().__init__(max_loras_per_batch, device)
     def run_lora_a_sgemm(
@@ -30,7 +35,7 @@ class TritonLoRABackend(BaseLoRABackend):
         weights: torch.Tensor,
         base_output: torch.Tensor = None,
         *args,
-        **kwargs
+        **kwargs,
     ) -> torch.Tensor:
         return sgemm_lora_b_fwd(x, weights, self.batch_info, base_output)
@@ -43,7 +48,7 @@ class TritonLoRABackend(BaseLoRABackend):
         max_qkv_out_dim: int,
         base_output: torch.Tensor = None,
         *args,
-        **kwargs
+        **kwargs,
     ) -> torch.Tensor:
         # x: (s, input_dim)
@@ -69,7 +74,7 @@ class TritonLoRABackend(BaseLoRABackend):
         gate_up_lora_b: torch.Tensor,
         base_output: torch.Tensor = None,
         *args,
-        **kwargs
+        **kwargs,
     ) -> torch.Tensor:
         # x: (s, input_dim)

sglang/srt/lora/eviction_policy.py ADDED Viewed

@@ -0,0 +1,139 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+Eviction policies for LoRA adapter memory management.
+"""
+import logging
+import time
+from abc import ABC, abstractmethod
+from collections import OrderedDict
+from typing import Optional, Set
+logger = logging.getLogger(__name__)
+class EvictionPolicy(ABC):
+    """Abstract base class for LoRA adapter eviction policies.
+    Subclasses must implement mark_used, select_victim and remove. Adapters are
+    identified by a uid; the concrete policies in this module treat a uid of None
+    as the base model.
+    """
+    @abstractmethod
+    def mark_used(self, uid: Optional[str]) -> None:
+        """Marks an adapter as used."""
+        pass
+    @abstractmethod
+    def select_victim(self, candidates: Set[Optional[str]]) -> Optional[str]:
+        """Selects an adapter to evict from candidates."""
+        pass
+    @abstractmethod
+    def remove(self, uid: Optional[str]) -> None:
+        """Removes an adapter from the policy's tracking."""
+        pass
+class LRUEvictionPolicy(EvictionPolicy):
+    """LRU eviction policy - evicts the least recently used adapter.
+    Recency is tracked by position in an OrderedDict: the first key is the least
+    recently used adapter and the last key is the most recently used one.
+    """
+    def __init__(self):
+        self.access_order = OrderedDict()  # key=uid, value=last_access_time
+        # Counters incremented by mark_used / select_victim; not read within this class.
+        self.total_accesses = 0
+        self.eviction_count = 0
+    def mark_used(self, uid: Optional[str]) -> None:
+        """Records an access to uid, making it the most recently used adapter.
+        A uid of None is ignored and is not tracked.
+        """
+        if uid is not None:
+            current_time = time.monotonic()
+            # Remove and re-add to move to end (most recent)
+            self.access_order.pop(uid, None)
+            self.access_order[uid] = current_time
+            self.total_accesses += 1
+            logger.debug(f"LoRA {uid} marked as used at {current_time}")
+    def select_victim(self, candidates: Set[Optional[str]]) -> Optional[str]:
+        """Select the least recently used adapter from candidates.
+        Returns None (the base model) whenever None is among the candidates; otherwise
+        returns the candidate that appears earliest in access_order. The chosen uid is
+        not removed from tracking here; call remove for that.
+        """
+        # Base model (currently None, will be replaced with special UID in future)
+        # always has lowest priority - evict it first if available
+        BASE_MODEL_UID = None  # TODO: Replace with special UID constant
+        if BASE_MODEL_UID in candidates:
+            logger.debug(f"Selected base model for eviction (LRU)")
+            self.eviction_count += 1
+            return BASE_MODEL_UID
+        # Iterate through access_order (oldest first) to find LRU victim
+        for uid in list(self.access_order.keys()):
+            if uid in candidates:
+                logger.debug(f"Selected LoRA {uid} for eviction (LRU)")
+                self.eviction_count += 1
+                return uid
+        # Should never reach here if candidates is non-empty
+        # NOTE(review): this line is also reached when no candidate was ever passed to
+        # mark_used. Under `python -O` the assert is stripped and the method then
+        # returns None, which is indistinguishable from BASE_MODEL_UID.
+        assert False, f"Failed to select LRU victim from candidates: {candidates}"
+    def remove(self, uid: Optional[str]) -> None:
+        """Stops tracking uid; a uid of None or an untracked uid is a no-op."""
+        if uid is not None:
+            self.access_order.pop(uid, None)
+            logger.debug(f"Removed LoRA {uid} from LRU tracking")
+class FIFOEvictionPolicy(EvictionPolicy):
+    """FIFO eviction policy - for backward compatibility.
+    Adapters are evicted in the order in which they were first marked as used;
+    later accesses do not change an adapter's position.
+    """
+    def __init__(self):
+        self.insertion_order = (
+            OrderedDict()
+        )  # key=uid, OrderedDict maintains insertion order
+        # Counter incremented by select_victim; not read within this class.
+        self.eviction_count = 0
+    def mark_used(self, uid: Optional[str]) -> None:
+        """For FIFO, we only track insertion order (not access time)."""
+        # Only the first sighting of a uid is recorded; None is never tracked.
+        if uid is not None and uid not in self.insertion_order:
+            self.insertion_order[uid] = (
+                True  # Value unused, OrderedDict tracks insertion order
+            )
+    def select_victim(self, candidates: Set[Optional[str]]) -> Optional[str]:
+        """Select the first inserted adapter from candidates.
+        Returns None (the base model) whenever None is among the candidates. The chosen
+        uid is not removed from tracking here; call remove for that.
+        """
+        # Base model (currently None, will be replaced with special UID in future)
+        # always has lowest priority - evict it first if available
+        BASE_MODEL_UID = None  # TODO: Replace with special UID constant
+        if BASE_MODEL_UID in candidates:
+            logger.debug(f"Selected base model for eviction (FIFO)")
+            self.eviction_count += 1
+            return BASE_MODEL_UID
+        # Iterate through insertion_order (oldest first) to find FIFO victim
+        for uid in list(self.insertion_order.keys()):
+            if uid in candidates:
+                logger.debug(f"Selected LoRA {uid} for eviction (FIFO)")
+                self.eviction_count += 1
+                return uid
+        # Should never reach here if candidates is non-empty
+        # NOTE(review): this line is also reached when no candidate was ever passed to
+        # mark_used. Under `python -O` the assert is stripped and the method then
+        # returns None, which is indistinguishable from BASE_MODEL_UID.
+        assert False, f"Failed to select FIFO victim from candidates: {candidates}"
+    def remove(self, uid: Optional[str]) -> None:
+        """Stops tracking uid; a uid of None or an untracked uid is a no-op."""
+        if uid is not None:
+            self.insertion_order.pop(uid, None)
+def get_eviction_policy(policy_name: str) -> EvictionPolicy:
+    """Factory function to create eviction policy instances.
+    Args:
+        policy_name: Either "fifo" or "lru"; the lookup is an exact, case-sensitive match.
+    Returns:
+        A newly constructed policy instance of the requested kind.
+    Raises:
+        ValueError: If policy_name is not one of the known policy names.
+    """
+    # Maps each supported name to its policy class; a fresh instance is built per call.
+    policies = {
+        "fifo": FIFOEvictionPolicy,
+        "lru": LRUEvictionPolicy,
+    }
+    if policy_name not in policies:
+        raise ValueError(f"Unknown eviction policy: {policy_name}")
+    return policies[policy_name]()

sglang/srt/lora/lora.py CHANGED Viewed

@@ -26,16 +26,17 @@ import torch
 from torch import nn
 from sglang.srt.configs.load_config import LoadConfig
-from sglang.srt.hf_transformers_utils import AutoConfig
 from sglang.srt.lora.backend.base_backend import BaseLoRABackend
-# from sglang.srt.lora.backend.chunked_backend import ChunkedSgmvLoRABackend
+from sglang.srt.lora.backend.chunked_backend import ChunkedSgmvLoRABackend
 from sglang.srt.lora.backend.triton_backend import TritonLoRABackend
 from sglang.srt.lora.lora_config import LoRAConfig
 from sglang.srt.model_loader.loader import DefaultModelLoader
+from sglang.srt.utils.hf_transformers_utils import AutoConfig
 logger = logging.getLogger(__name__)
+SUPPORTED_BACKENDS = (TritonLoRABackend, ChunkedSgmvLoRABackend)
 class LoRALayer(nn.Module):
     def __init__(self, config: LoRAConfig, base_hf_config: AutoConfig):
@@ -48,6 +49,7 @@ class LoRALayer(nn.Module):
 class LoRAAdapter(nn.Module):
     def __init__(
         self,
         uid: str,
@@ -159,8 +161,8 @@ class LoRAAdapter(nn.Module):
                 gate_up_name = weight_name.replace("gate_proj", "gate_up_proj")
                 if up_name not in weights:
                     weights[up_name] = torch.zeros_like(weights[weight_name])
-                    assert isinstance(self.lora_backend, TritonLoRABackend), (
-                        f"LoRA weight initialization currently only supported for 'triton' backend. "
+                    assert isinstance(self.lora_backend, SUPPORTED_BACKENDS), (
+                        f"LoRA weight initialization currently only supported for LoRA backends: {', '.join(b.name for b in SUPPORTED_BACKENDS)}"
                         f"Received backend: {self.lora_backend.name}. Please verify your backend configuration "
                         f"or consider implementing custom initialization logic for other backends."
                     )

sglang/srt/lora/lora_manager.py CHANGED Viewed

@@ -16,12 +16,11 @@
 # and "Punica: Multi-Tenant LoRA Serving"
 import logging
-from typing import Dict, Iterable, List, Optional, Set, Tuple
+from typing import Dict, Iterable, List, Optional
 import torch
 from sglang.srt.configs.load_config import LoadConfig
-from sglang.srt.hf_transformers_utils import AutoConfig
 from sglang.srt.lora.backend.base_backend import BaseLoRABackend, get_backend_from_name
 from sglang.srt.lora.layers import BaseLayerWithLoRA, get_lora_layer
 from sglang.srt.lora.lora import LoRAAdapter
@@ -35,9 +34,11 @@ from sglang.srt.lora.utils import (
     get_normalized_target_modules,
     get_target_module_name,
 )
-from sglang.srt.managers.io_struct import LoRAUpdateResult
+from sglang.srt.managers.io_struct import LoRAUpdateOutput
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import replace_submodule
+from sglang.srt.utils.hf_transformers_utils import AutoConfig
 logger = logging.getLogger(__name__)
@@ -56,6 +57,7 @@ class LoRAManager:
         max_lora_rank: Optional[int] = None,
         target_modules: Optional[Iterable[str]] = None,
         lora_paths: Optional[List[LoRARef]] = None,
+        server_args: Optional[ServerArgs] = None,
     ):
         self.base_model: torch.nn.Module = base_model
         self.base_hf_config: AutoConfig = base_hf_config
@@ -66,12 +68,16 @@ class LoRAManager:
         self.tp_size: int = tp_size
         self.tp_rank: int = tp_rank
+        # Store eviction policy from server args
+        self.eviction_policy = server_args.lora_eviction_policy
         # LoRA backend for running sgemm kernels
         logger.info(f"Using {lora_backend} as backend of LoRA kernels.")
         backend_type = get_backend_from_name(lora_backend)
         self.lora_backend: BaseLoRABackend = backend_type(
             max_loras_per_batch=max_loras_per_batch,
             device=self.device,
+            server_args=server_args,
         )
         # Initialize mutable internal state of the LoRAManager.
@@ -104,8 +110,8 @@ class LoRAManager:
     def create_lora_update_result(
         self, success: bool, error_message: str = ""
-    ) -> LoRAUpdateResult:
-        return LoRAUpdateResult(
+    ) -> LoRAUpdateOutput:
+        return LoRAUpdateOutput(
             success=success,
             error_message=error_message,
             loaded_adapters={
@@ -114,7 +120,7 @@ class LoRAManager:
             },
         )
-    def load_lora_adapter(self, lora_ref: LoRARef) -> LoRAUpdateResult:
+    def load_lora_adapter(self, lora_ref: LoRARef) -> LoRAUpdateOutput:
         """
         Load a single LoRA adapter from the specified path.
@@ -128,6 +134,16 @@ class LoRAManager:
             lora_ref.lora_id not in self.loras
         ), f"LoRA adapter with ID {lora_ref.lora_id} is already loaded. This should have been verified before request is sent to the backend."
+        if lora_ref.pinned and self.num_pinned_loras >= self.max_loras_per_batch - 1:
+            return self.create_lora_update_result(
+                success=False,
+                error_message=(
+                    f"Already have {self.num_pinned_loras} pinned adapters, "
+                    f"max allowed is {self.max_loras_per_batch - 1} (reserving 1 slot for dynamic use). "
+                    f"Please unpin some adapters or increase max_loras_per_batch."
+                ),
+            )
         try:
             # load configs
             new_adapter = LoRAConfig(lora_ref.lora_path)
@@ -153,6 +169,15 @@ class LoRAManager:
         Validate if an adapter can be loaded into the current LoRA memory pool and generate error if it is incompatible.
         """
+        # Check if this LoRA adapter is already loaded
+        if any(
+            lora_ref.lora_name == existing_lora_ref.lora_name
+            for existing_lora_ref in self.lora_refs.values()
+        ):
+            raise ValueError(
+                f"Failed to load LoRA adapter {lora_ref.lora_name} because it is already loaded"
+            )
         # Check if the LoRA adapter shape is compatible with the current LoRA memory pool configuration.
         memory_pool = getattr(self, "memory_pool", None)
         incompatible = memory_pool and not memory_pool.can_support(lora_config)
@@ -171,7 +196,7 @@ class LoRAManager:
                 "`--max-loras-per-batch` or load it as unpinned LoRA adapters."
             )
-    def unload_lora_adapter(self, lora_ref: LoRARef) -> LoRAUpdateResult:
+    def unload_lora_adapter(self, lora_ref: LoRARef) -> LoRAUpdateOutput:
         """
         Unload LoRA adapters by their names. This will remove the adapters from the memory pool and
         delete the corresponding LoRA modules.
@@ -408,6 +433,7 @@ class LoRAManager:
             max_lora_rank=self.max_lora_rank,
             target_modules=self.target_modules,
             base_model=self.base_model,
+            eviction_policy=self.eviction_policy,
         )
     def set_lora_module(self, module_name, module):

sglang/srt/lora/lora_registry.py CHANGED Viewed

@@ -18,8 +18,8 @@ from dataclasses import dataclass, field, fields
 from typing import Dict, List, Optional, Union
 from uuid import uuid4
-from sglang.srt.aio_rwlock import RWLock
 from sglang.srt.utils import ConcurrentCounter
+from sglang.srt.utils.aio_rwlock import RWLock
 @dataclass(frozen=True)

sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

sglang 0.5.3rc0py3-none-any.whl → 0.5.4py3-none-any.whl