sglang 0.5.2rc2__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -11
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +474 -142
- sglang/compile_deep_gemm.py +3 -0
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +10 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +314 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/mamba_utils.py +117 -0
- sglang/srt/configs/model_config.py +228 -92
- sglang/srt/configs/nemotron_h.py +286 -0
- sglang/srt/configs/qwen3_next.py +294 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +78 -37
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +373 -68
- sglang/srt/disaggregation/prefill.py +53 -49
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +156 -80
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +842 -0
- sglang/srt/entrypoints/grpc_server.py +950 -0
- sglang/srt/entrypoints/http_server.py +179 -60
- sglang/srt/entrypoints/openai/protocol.py +265 -29
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +213 -122
- sglang/srt/entrypoints/openai/serving_completions.py +14 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +48 -3
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
- sglang/srt/environ.py +289 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +17 -8
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +119 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +492 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +327 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +215 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +343 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +40 -8
- sglang/srt/layers/attention/flashinfer_backend.py +341 -204
- sglang/srt/layers/attention/flashinfer_mla_backend.py +28 -28
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +708 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +974 -0
- sglang/srt/layers/attention/mamba/mamba.py +577 -0
- sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
- sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +180 -18
- sglang/srt/layers/modelopt_utils.py +11 -0
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +21 -24
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -454
- sglang/srt/layers/moe/ep_moe/layer.py +248 -333
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +68 -72
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
- sglang/srt/layers/moe/moe_runner/runner.py +83 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +29 -7
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/__init__.py +1 -1
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +155 -60
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +191 -56
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +74 -42
- sglang/srt/layers/quantization/quark/quark.py +3 -1
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +28 -33
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +44 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +199 -301
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +55 -0
- sglang/srt/managers/schedule_batch.py +343 -212
- sglang/srt/managers/schedule_policy.py +145 -18
- sglang/srt/managers/scheduler.py +653 -273
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +255 -108
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +579 -674
- sglang/srt/managers/tp_worker.py +96 -26
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +21 -22
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +9 -2
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +43 -24
- sglang/srt/mem_cache/hiradix_cache.py +222 -75
- sglang/srt/mem_cache/memory_pool.py +651 -80
- sglang/srt/mem_cache/memory_pool_host.py +239 -228
- sglang/srt/mem_cache/radix_cache.py +227 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
- sglang/srt/mem_cache/swa_radix_cache.py +93 -48
- sglang/srt/metrics/collector.py +511 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +74 -46
- sglang/srt/model_executor/model_runner.py +455 -176
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +10 -4
- sglang/srt/model_loader/loader.py +319 -10
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +161 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +607 -130
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +578 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +17 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +4 -4
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/grok.py +5 -13
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mixtral.py +1 -3
- sglang/srt/models/mllama4.py +50 -4
- sglang/srt/models/nemotron_h.py +514 -0
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +120 -13
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +32 -4
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +55 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +49 -26
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +1051 -285
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +98 -29
- sglang/srt/speculative/ngram_info.py +428 -0
- sglang/srt/speculative/ngram_worker.py +246 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +605 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +9 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +451 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +55 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/longbench_v2/__init__.py +1 -0
- sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
- sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
- sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
- sglang/test/run_eval.py +119 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_longbench_v2.py +332 -0
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_cutlass_w4a8_moe.py +9 -19
- sglang/test/test_deterministic.py +313 -0
- sglang/test/test_deterministic_utils.py +81 -0
- sglang/test/test_disaggregation_utils.py +140 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +407 -8
- sglang/utils.py +21 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +392 -251
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/managers/tp_worker_overlap_thread.py +0 -296
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- sglang/test/test_block_fp8_ep.py +0 -358
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
@@ -30,7 +30,7 @@ from sglang.srt.layers.attention.flashinfer_backend import (
|
|
30
30
|
from sglang.srt.layers.dp_attention import get_attention_tp_size
|
31
31
|
from sglang.srt.managers.schedule_batch import global_server_args_dict
|
32
32
|
from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
|
33
|
-
from sglang.srt.speculative.
|
33
|
+
from sglang.srt.speculative.spec_info import SpecInput
|
34
34
|
from sglang.srt.utils import (
|
35
35
|
is_flashinfer_available,
|
36
36
|
is_sm100_supported,
|
@@ -40,7 +40,7 @@ from sglang.srt.utils import (
|
|
40
40
|
if TYPE_CHECKING:
|
41
41
|
from sglang.srt.layers.radix_attention import RadixAttention
|
42
42
|
from sglang.srt.model_executor.model_runner import ModelRunner
|
43
|
-
from sglang.srt.speculative.spec_info import
|
43
|
+
from sglang.srt.speculative.spec_info import SpecInput
|
44
44
|
|
45
45
|
if is_flashinfer_available():
|
46
46
|
from flashinfer import (
|
@@ -96,6 +96,7 @@ class FlashInferMhaChunkKVRunner:
|
|
96
96
|
def update_wrapper(
|
97
97
|
self,
|
98
98
|
forward_batch: ForwardBatch,
|
99
|
+
disable_flashinfer_ragged: bool = False,
|
99
100
|
):
|
100
101
|
assert forward_batch.num_prefix_chunks is not None
|
101
102
|
num_prefix_chunks = forward_batch.num_prefix_chunks
|
@@ -128,16 +129,17 @@ class FlashInferMhaChunkKVRunner:
|
|
128
129
|
causal=False,
|
129
130
|
)
|
130
131
|
# ragged prefill
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
132
|
+
if not disable_flashinfer_ragged:
|
133
|
+
self.ragged_wrapper.begin_forward(
|
134
|
+
qo_indptr=qo_indptr,
|
135
|
+
kv_indptr=qo_indptr,
|
136
|
+
num_qo_heads=self.num_local_heads,
|
137
|
+
num_kv_heads=self.num_local_heads,
|
138
|
+
head_dim_qk=self.qk_nope_head_dim + self.qk_rope_head_dim,
|
139
|
+
head_dim_vo=self.v_head_dim,
|
140
|
+
q_data_type=self.q_data_type,
|
141
|
+
causal=True,
|
142
|
+
)
|
141
143
|
|
142
144
|
def forward(
|
143
145
|
self,
|
@@ -359,7 +361,7 @@ class FlashInferMLAAttnBackend(AttentionBackend):
|
|
359
361
|
seq_lens: torch.Tensor,
|
360
362
|
encoder_lens: Optional[torch.Tensor],
|
361
363
|
forward_mode: ForwardMode,
|
362
|
-
spec_info: Optional[
|
364
|
+
spec_info: Optional[SpecInput],
|
363
365
|
):
|
364
366
|
if forward_mode.is_decode_or_idle():
|
365
367
|
decode_wrapper = BatchMLAPagedAttentionWrapper(
|
@@ -439,7 +441,7 @@ class FlashInferMLAAttnBackend(AttentionBackend):
|
|
439
441
|
seq_lens_sum: int,
|
440
442
|
encoder_lens: Optional[torch.Tensor],
|
441
443
|
forward_mode: ForwardMode,
|
442
|
-
spec_info: Optional[
|
444
|
+
spec_info: Optional[SpecInput],
|
443
445
|
seq_lens_cpu: Optional[torch.Tensor],
|
444
446
|
):
|
445
447
|
if forward_mode.is_decode_or_idle():
|
@@ -491,9 +493,11 @@ class FlashInferMLAAttnBackend(AttentionBackend):
|
|
491
493
|
def get_cuda_graph_seq_len_fill_value(self):
|
492
494
|
return 1
|
493
495
|
|
494
|
-
def init_mha_chunk_metadata(
|
496
|
+
def init_mha_chunk_metadata(
|
497
|
+
self, forward_batch: ForwardBatch, disable_flashinfer_ragged: bool = False
|
498
|
+
):
|
495
499
|
"""Init the metadata for a forward pass."""
|
496
|
-
self.mha_chunk_kv_cache.update_wrapper(forward_batch)
|
500
|
+
self.mha_chunk_kv_cache.update_wrapper(forward_batch, disable_flashinfer_ragged)
|
497
501
|
|
498
502
|
def forward_extend(
|
499
503
|
self,
|
@@ -659,7 +663,7 @@ class FlashInferMLAIndicesUpdaterDecode:
|
|
659
663
|
seq_lens_sum: int,
|
660
664
|
decode_wrapper: BatchMLAPagedAttentionWrapper,
|
661
665
|
init_metadata_replay: bool = False,
|
662
|
-
spec_info: Optional[
|
666
|
+
spec_info: Optional[SpecInput] = None,
|
663
667
|
**fast_decode_kwargs,
|
664
668
|
):
|
665
669
|
decode_wrapper = decode_wrapper or self.decode_wrapper
|
@@ -684,7 +688,7 @@ class FlashInferMLAIndicesUpdaterDecode:
|
|
684
688
|
q_indptr: torch.Tensor,
|
685
689
|
kv_indptr: torch.Tensor,
|
686
690
|
init_metadata_replay: bool = False,
|
687
|
-
spec_info: Optional[
|
691
|
+
spec_info: Optional[SpecInput] = None,
|
688
692
|
**fast_decode_kwargs,
|
689
693
|
):
|
690
694
|
bs = len(req_pool_indices)
|
@@ -772,7 +776,7 @@ class FlashInferMLAIndicesUpdaterPrefill:
|
|
772
776
|
prefix_lens: torch.Tensor,
|
773
777
|
prefill_wrapper_paged: BatchMLAPagedAttentionWrapper,
|
774
778
|
use_ragged: bool,
|
775
|
-
spec_info: Optional[
|
779
|
+
spec_info: Optional[SpecInput] = None,
|
776
780
|
):
|
777
781
|
if use_ragged:
|
778
782
|
paged_kernel_lens = prefix_lens
|
@@ -807,7 +811,7 @@ class FlashInferMLAIndicesUpdaterPrefill:
|
|
807
811
|
kv_indptr: torch.Tensor,
|
808
812
|
qo_indptr: torch.Tensor,
|
809
813
|
use_ragged: bool,
|
810
|
-
spec_info: Optional[
|
814
|
+
spec_info: Optional[SpecInput] = None,
|
811
815
|
):
|
812
816
|
bs = len(seq_lens)
|
813
817
|
sm_scale = self.scaling
|
@@ -834,9 +838,7 @@ class FlashInferMLAIndicesUpdaterPrefill:
|
|
834
838
|
qo_indptr = qo_indptr[: bs + 1]
|
835
839
|
custom_mask = None
|
836
840
|
else:
|
837
|
-
assert isinstance(spec_info,
|
838
|
-
spec_info, EagleVerifyInput
|
839
|
-
)
|
841
|
+
assert isinstance(spec_info, SpecInput)
|
840
842
|
# TODO: Support topk > 1 with custom mask
|
841
843
|
kv_indices, kv_indptr, qo_indptr, custom_mask = (
|
842
844
|
spec_info.generate_attn_arg_prefill(
|
@@ -890,7 +892,7 @@ class FlashInferMLAMultiStepDraftBackend:
|
|
890
892
|
topk: int,
|
891
893
|
speculative_num_steps: int,
|
892
894
|
):
|
893
|
-
from sglang.srt.speculative.
|
895
|
+
from sglang.srt.speculative.spec_utils import generate_draft_decode_kv_indices
|
894
896
|
|
895
897
|
if topk > 1:
|
896
898
|
raise ValueError(
|
@@ -959,7 +961,7 @@ class FlashInferMLAMultiStepDraftBackend:
|
|
959
961
|
)
|
960
962
|
|
961
963
|
assert forward_batch.spec_info is not None
|
962
|
-
assert
|
964
|
+
assert forward_batch.spec_info.is_draft_input()
|
963
965
|
|
964
966
|
for i in range(self.speculative_num_steps - 1):
|
965
967
|
forward_batch.spec_info.kv_indptr = self.kv_indptr[i, : bs + 1]
|
@@ -979,8 +981,6 @@ class FlashInferMLAMultiStepDraftBackend:
|
|
979
981
|
)
|
980
982
|
|
981
983
|
def call_fn(i, forward_batch):
|
982
|
-
assert forward_batch.spec_info is not None
|
983
|
-
assert isinstance(forward_batch.spec_info, EagleDraftInput)
|
984
984
|
forward_batch.spec_info.kv_indptr = (
|
985
985
|
forward_batch.spec_info.kv_indptr.clone()
|
986
986
|
)
|
@@ -1060,7 +1060,7 @@ def fast_mla_decode_plan(
|
|
1060
1060
|
|
1061
1061
|
try:
|
1062
1062
|
# Standard version with just the required arguments (no use_profiler)
|
1063
|
-
self._cached_module.plan
|
1063
|
+
self._cached_module.plan(
|
1064
1064
|
self._float_workspace_buffer,
|
1065
1065
|
self._int_workspace_buffer,
|
1066
1066
|
self._pin_memory_int_workspace_buffer,
|
@@ -19,7 +19,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMo
|
|
19
19
|
if TYPE_CHECKING:
|
20
20
|
from sglang.srt.layers.radix_attention import RadixAttention
|
21
21
|
from sglang.srt.model_executor.model_runner import ModelRunner
|
22
|
-
from sglang.srt.speculative.spec_info import
|
22
|
+
from sglang.srt.speculative.spec_info import SpecInput
|
23
23
|
|
24
24
|
|
25
25
|
# FlashMLA only supports pagesize=64
|
@@ -187,7 +187,7 @@ class FlashMLABackend(FlashInferMLAAttnBackend):
|
|
187
187
|
seq_lens: torch.Tensor,
|
188
188
|
encoder_lens: Optional[torch.Tensor],
|
189
189
|
forward_mode: ForwardMode,
|
190
|
-
spec_info: Optional[
|
190
|
+
spec_info: Optional[SpecInput],
|
191
191
|
):
|
192
192
|
if forward_mode.is_decode_or_idle():
|
193
193
|
max_seqlen_pad = triton.cdiv(seq_lens.max().item(), PAGE_SIZE)
|
@@ -201,9 +201,10 @@ class FlashMLABackend(FlashInferMLAAttnBackend):
|
|
201
201
|
self.req_to_token.stride(0),
|
202
202
|
self.cuda_graph_kv_indices.stride(0),
|
203
203
|
)
|
204
|
+
num_q_heads = self.num_q_heads * (self.num_draft_tokens or 1)
|
204
205
|
mla_metadata, num_splits = get_mla_metadata(
|
205
206
|
seq_lens.to(torch.int32),
|
206
|
-
|
207
|
+
num_q_heads,
|
207
208
|
1,
|
208
209
|
)
|
209
210
|
self.cuda_graph_mla_metadata.copy_(mla_metadata)
|
@@ -257,7 +258,7 @@ class FlashMLABackend(FlashInferMLAAttnBackend):
|
|
257
258
|
seq_lens_sum: int,
|
258
259
|
encoder_lens: Optional[torch.Tensor],
|
259
260
|
forward_mode: ForwardMode,
|
260
|
-
spec_info: Optional[
|
261
|
+
spec_info: Optional[SpecInput],
|
261
262
|
seq_lens_cpu: Optional[torch.Tensor],
|
262
263
|
):
|
263
264
|
|
@@ -275,9 +276,10 @@ class FlashMLABackend(FlashInferMLAAttnBackend):
|
|
275
276
|
self.req_to_token.stride(0),
|
276
277
|
self.cuda_graph_kv_indices.stride(0),
|
277
278
|
)
|
279
|
+
num_q_heads = self.num_q_heads * (self.num_draft_tokens or 1)
|
278
280
|
mla_metadata, num_splits = get_mla_metadata(
|
279
281
|
seq_lens.to(torch.int32),
|
280
|
-
|
282
|
+
num_q_heads,
|
281
283
|
1,
|
282
284
|
)
|
283
285
|
self.cuda_graph_mla_metadata.copy_(mla_metadata)
|
@@ -3,10 +3,11 @@ from typing import Optional, Union
|
|
3
3
|
import torch
|
4
4
|
|
5
5
|
from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
|
6
|
+
from sglang.srt.layers.attention.nsa.nsa_indexer import BaseIndexerMetadata
|
6
7
|
from sglang.srt.layers.radix_attention import RadixAttention
|
7
8
|
from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
|
8
9
|
from sglang.srt.model_executor.model_runner import ModelRunner
|
9
|
-
from sglang.srt.speculative.
|
10
|
+
from sglang.srt.speculative.spec_info import SpecInput
|
10
11
|
|
11
12
|
|
12
13
|
class HybridAttnBackend(AttentionBackend):
|
@@ -21,18 +22,46 @@ class HybridAttnBackend(AttentionBackend):
|
|
21
22
|
self.model_runner = model_runner
|
22
23
|
self.prefill_backend = prefill_backend
|
23
24
|
self.decode_backend = decode_backend
|
25
|
+
self.data_type = model_runner.kv_cache_dtype
|
24
26
|
|
25
|
-
def
|
26
|
-
|
27
|
-
|
27
|
+
def _select_backend(self, forward_mode: ForwardMode) -> AttentionBackend:
|
28
|
+
"""
|
29
|
+
Select the appropriate attention backend based on the forward mode.
|
30
|
+
|
31
|
+
Args:
|
32
|
+
forward_mode: The current forward mode indicating the operation type
|
33
|
+
|
34
|
+
Returns:
|
35
|
+
The selected attention backend (prefill or decode)
|
36
|
+
|
37
|
+
Note:
|
38
|
+
- decode_or_idle: Always uses decode backend
|
39
|
+
- target_verify or draft_extend: Uses decode backend if speculative_attention_mode is "decode", otherwise prefill backend
|
40
|
+
- prefill: Always uses prefill backend
|
41
|
+
"""
|
42
|
+
if forward_mode.is_decode_or_idle():
|
43
|
+
return self.decode_backend
|
44
|
+
elif forward_mode.is_target_verify() or forward_mode.is_draft_extend():
|
45
|
+
return (
|
46
|
+
self.decode_backend
|
47
|
+
if self.model_runner.server_args.speculative_attention_mode == "decode"
|
48
|
+
else self.prefill_backend
|
49
|
+
)
|
28
50
|
else:
|
29
|
-
self.prefill_backend
|
51
|
+
return self.prefill_backend
|
52
|
+
|
53
|
+
def init_forward_metadata(self, forward_batch: ForwardBatch):
|
54
|
+
backend = self._select_backend(forward_batch.forward_mode)
|
55
|
+
backend.init_forward_metadata(forward_batch)
|
30
56
|
|
31
57
|
def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
|
32
58
|
self.decode_backend.init_cuda_graph_state(max_bs, max_num_tokens)
|
33
|
-
if
|
34
|
-
|
35
|
-
|
59
|
+
if (
|
60
|
+
self.model_runner.server_args.speculative_algorithm is not None
|
61
|
+
and self.model_runner.server_args.speculative_attention_mode == "prefill"
|
62
|
+
):
|
63
|
+
# When speculative decoding is enabled, we need to initialize the backend
|
64
|
+
# that will be used for target_verify.
|
36
65
|
self.prefill_backend.init_cuda_graph_state(max_bs, max_num_tokens)
|
37
66
|
|
38
67
|
def init_forward_metadata_capture_cuda_graph(
|
@@ -43,28 +72,18 @@ class HybridAttnBackend(AttentionBackend):
|
|
43
72
|
seq_lens: torch.Tensor,
|
44
73
|
encoder_lens: Optional[torch.Tensor],
|
45
74
|
forward_mode: ForwardMode,
|
46
|
-
spec_info: Optional[
|
75
|
+
spec_info: Optional[SpecInput],
|
47
76
|
):
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
else:
|
59
|
-
self.prefill_backend.init_forward_metadata_capture_cuda_graph(
|
60
|
-
bs,
|
61
|
-
num_tokens,
|
62
|
-
req_pool_indices,
|
63
|
-
seq_lens,
|
64
|
-
encoder_lens,
|
65
|
-
forward_mode,
|
66
|
-
spec_info,
|
67
|
-
)
|
77
|
+
backend = self._select_backend(forward_mode)
|
78
|
+
backend.init_forward_metadata_capture_cuda_graph(
|
79
|
+
bs,
|
80
|
+
num_tokens,
|
81
|
+
req_pool_indices,
|
82
|
+
seq_lens,
|
83
|
+
encoder_lens,
|
84
|
+
forward_mode,
|
85
|
+
spec_info,
|
86
|
+
)
|
68
87
|
|
69
88
|
def init_forward_metadata_replay_cuda_graph(
|
70
89
|
self,
|
@@ -74,31 +93,20 @@ class HybridAttnBackend(AttentionBackend):
|
|
74
93
|
seq_lens_sum: int,
|
75
94
|
encoder_lens: Optional[torch.Tensor],
|
76
95
|
forward_mode: ForwardMode,
|
77
|
-
spec_info: Optional[
|
96
|
+
spec_info: Optional[SpecInput],
|
78
97
|
seq_lens_cpu: Optional[torch.Tensor],
|
79
98
|
):
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
else:
|
92
|
-
self.prefill_backend.init_forward_metadata_replay_cuda_graph(
|
93
|
-
bs,
|
94
|
-
req_pool_indices,
|
95
|
-
seq_lens,
|
96
|
-
seq_lens_sum,
|
97
|
-
encoder_lens,
|
98
|
-
forward_mode,
|
99
|
-
spec_info,
|
100
|
-
seq_lens_cpu,
|
101
|
-
)
|
99
|
+
backend = self._select_backend(forward_mode)
|
100
|
+
backend.init_forward_metadata_replay_cuda_graph(
|
101
|
+
bs,
|
102
|
+
req_pool_indices,
|
103
|
+
seq_lens,
|
104
|
+
seq_lens_sum,
|
105
|
+
encoder_lens,
|
106
|
+
forward_mode,
|
107
|
+
spec_info,
|
108
|
+
seq_lens_cpu,
|
109
|
+
)
|
102
110
|
|
103
111
|
def get_cuda_graph_seq_len_fill_value(self):
|
104
112
|
return self.decode_backend.get_cuda_graph_seq_len_fill_value()
|
@@ -127,6 +135,13 @@ class HybridAttnBackend(AttentionBackend):
|
|
127
135
|
save_kv_cache: bool = True,
|
128
136
|
**kwargs,
|
129
137
|
):
|
130
|
-
|
138
|
+
backend = self._select_backend(forward_batch.forward_mode)
|
139
|
+
return backend.forward_extend(
|
131
140
|
q, k, v, layer, forward_batch, save_kv_cache, **kwargs
|
132
141
|
)
|
142
|
+
|
143
|
+
def get_indexer_metadata(
|
144
|
+
self, layer_id: int, forward_batch: ForwardBatch
|
145
|
+
) -> Optional[BaseIndexerMetadata]:
|
146
|
+
backend = self._select_backend(forward_batch.forward_mode)
|
147
|
+
return backend.get_indexer_metadata(layer_id, forward_batch)
|