sglang-0.5.2rc2-py3-none-any.whl → sglang-0.5.3rc2-py3-none-any.whl
This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -9
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +267 -32
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/launch_server.py +14 -0
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +360 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/model_config.py +181 -82
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +71 -19
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +326 -53
- sglang/srt/disaggregation/prefill.py +36 -17
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +156 -80
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +855 -0
- sglang/srt/entrypoints/grpc_server.py +810 -0
- sglang/srt/entrypoints/http_server.py +130 -59
- sglang/srt/entrypoints/openai/protocol.py +112 -4
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +204 -55
- sglang/srt/entrypoints/openai/serving_completions.py +14 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +48 -3
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/environ.py +285 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +9 -2
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +206 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +41 -8
- sglang/srt/layers/attention/flashinfer_backend.py +118 -198
- sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
- sglang/srt/layers/attention/mamba/mamba.py +629 -0
- sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +44 -12
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
- sglang/srt/layers/moe/ep_moe/layer.py +256 -63
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +22 -6
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +78 -49
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +190 -55
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +74 -42
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +26 -17
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +52 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +199 -301
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +53 -0
- sglang/srt/managers/schedule_batch.py +240 -138
- sglang/srt/managers/schedule_policy.py +144 -17
- sglang/srt/managers/scheduler.py +502 -209
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +320 -632
- sglang/srt/managers/tp_worker.py +81 -22
- sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +14 -20
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -1
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +43 -24
- sglang/srt/mem_cache/hiradix_cache.py +222 -75
- sglang/srt/mem_cache/memory_pool.py +535 -58
- sglang/srt/mem_cache/memory_pool_host.py +239 -228
- sglang/srt/mem_cache/radix_cache.py +222 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
- sglang/srt/mem_cache/swa_radix_cache.py +25 -36
- sglang/srt/metrics/collector.py +511 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +82 -40
- sglang/srt/model_executor/model_runner.py +432 -157
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +133 -5
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +158 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +607 -130
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +576 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +4 -4
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mllama4.py +40 -4
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +120 -13
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +32 -4
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +51 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +38 -17
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +966 -267
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +99 -28
- sglang/srt/speculative/ngram_utils.py +428 -0
- sglang/srt/speculative/ngram_worker.py +245 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +606 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +8 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +433 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/run_eval.py +79 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_deterministic.py +297 -0
- sglang/test/test_disaggregation_utils.py +77 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +383 -5
- sglang/utils.py +21 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +375 -245
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0
sglang/srt/layers/attention/flashattention_backend.py

@@ -11,9 +11,8 @@ import triton.language as tl
 from sglang.srt.configs.model_config import AttentionArch
 from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
 from sglang.srt.managers.schedule_batch import global_server_args_dict
-from sglang.srt.mem_cache.memory_pool import SWAKVPool
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
-from sglang.srt.speculative.
+from sglang.srt.speculative.spec_info import SpecInput
 
 if TYPE_CHECKING:
     from sglang.srt.layers.radix_attention import RadixAttention
@@ -305,6 +304,7 @@ class FlashAttentionBackend(AttentionBackend):
         speculative_step_id=0,
         topk=0,
         speculative_num_steps=0,
+        fa_impl_ver=3,
     ):
         super().__init__()
 
@@ -338,6 +338,8 @@ class FlashAttentionBackend(AttentionBackend):
         )
         self.speculative_step_id = speculative_step_id
 
+        self.fa_impl_ver = fa_impl_ver
+
         # Local attention settings
         self.attention_chunk_size = (
             model_runner.attention_chunk_size
@@ -352,6 +354,13 @@ class FlashAttentionBackend(AttentionBackend):
             self.sliding_window_size is not None and self.sliding_window_size > -1
         )
 
+        # If num_splits == 0, we use a heuristic to automatically determine the number of splits.
+        # We set nums splits to 1 if deterministic inference is enabled.
+        # See https://thinkingmachines.ai/blog/defeating-nondeterminism-in-llm-inference/ for more details.
+        self.num_splits = (
+            1 if model_runner.server_args.enable_deterministic_inference else 0
+        )
+
     def init_forward_metadata(self, forward_batch: ForwardBatch):
         """Initialize forward metadata hence all layers in the forward pass can reuse it."""
        metadata = FlashAttentionMetadata()
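The `num_splits` setting above is what makes this path reproducible: with the heuristic value of 0 the kernel may pick a different split-KV factor per batch, which changes the floating-point reduction order, while pinning it to 1 keeps that order fixed. A self-contained toy sketch of the effect (an ordinary tensor reduction, not the FA3 kernel itself):

```python
# Toy illustration of why the split count affects bit-exact determinism:
# summing the same fp16 values with a different chunking changes the
# floating-point accumulation order, so results can drift between runs
# unless the split count is pinned.
import torch

torch.manual_seed(0)
scores = torch.randn(1 << 16, dtype=torch.float16)

def reduce_with_splits(x: torch.Tensor, num_splits: int) -> torch.Tensor:
    # Sum each chunk separately, then merge the partial sums,
    # mimicking a split-KV reduction followed by a combine step.
    partials = [chunk.sum() for chunk in x.chunk(num_splits)]
    return torch.stack(partials).sum()

print(reduce_with_splits(scores, 1).item())  # single split: fixed order
print(reduce_with_splits(scores, 8).item())  # more splits: order changes, value may differ
```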
@@ -682,8 +691,13 @@ class FlashAttentionBackend(AttentionBackend):
         k_descale, v_descale = None, None
         # only use kv scaling if: 1) fp8 kv is explicitly enabled, 2) RadixAttention
         # has corresponding quantization method so that layer.k_scale is not None,
-        # 3) layer.head_dim <= 256 since fa3 kernel require fp16 and bf16 data type in this case
-        if self.kv_cache_dtype_str != "auto" and layer.head_dim <= 256:
+        # 3) layer.head_dim <= 256 since fa3 kernel require fp16 and bf16 data type in this case,
+        # 4) fa_impl_ver != 4 since fa4 does not currently support fp8 queries and keys.
+        if (
+            self.kv_cache_dtype_str != "auto"
+            and layer.head_dim <= 256
+            and self.fa_impl_ver != 4
+        ):
             if layer.k_scale is not None:
                 descale_shape = (forward_batch.batch_size, layer.tp_k_head_num)
                 k_descale = layer.k_scale.expand(descale_shape)
@@ -712,6 +726,8 @@ class FlashAttentionBackend(AttentionBackend):
 
         # For fa3 interface version compatibility, we put new fields into conditional keyword args
         kwargs = {}
+        if self.fa_impl_ver != 3:
+            kwargs["ver"] = self.fa_impl_ver
         if sinks is not None:
             kwargs["sinks"] = sinks
 
@@ -738,6 +754,7 @@ class FlashAttentionBackend(AttentionBackend):
 
         # Use Flash Attention for prefill
         if not self.use_mla:
+            assert self.fa_impl_ver in [3], "Only FA3 support here"
             # Do multi-head attention
             key_cache, value_cache = forward_batch.token_to_kv_pool.get_kv_buffer(
                 layer.layer_id
@@ -770,6 +787,7 @@ class FlashAttentionBackend(AttentionBackend):
                     k_descale=k_descale,
                     v_descale=v_descale,
                     return_softmax_lse=use_cascade_attn,
+                    num_splits=self.num_splits,
                     **kwargs,
                 )
 
@@ -791,6 +809,7 @@ class FlashAttentionBackend(AttentionBackend):
                         k_descale=k_descale,
                         v_descale=v_descale,
                         return_softmax_lse=True,
+                        num_splits=self.num_splits,
                         **kwargs,
                     )
                 o, _ = merge_state_v2_wrapper(
@@ -830,6 +849,7 @@ class FlashAttentionBackend(AttentionBackend):
                     softmax_scale=layer.scaling,
                     causal=False,
                     return_softmax_lse=True,
+                    **kwargs,
                 )
             else:
                 # MHA for extend part of sequence without attending prefix kv cache
@@ -844,6 +864,7 @@ class FlashAttentionBackend(AttentionBackend):
                     softmax_scale=layer.scaling,
                     causal=True,
                     return_softmax_lse=forward_batch.mha_return_lse,
+                    **kwargs,
                 )
                 if forward_batch.mha_return_lse:
                     output, lse, *rest = output
@@ -851,6 +872,7 @@ class FlashAttentionBackend(AttentionBackend):
                     return output, lse
                 return output
         else:
+            assert self.fa_impl_ver in [3], "Only FA3 support here"
             # Do absorbed multi-latent attention
             kv_cache = forward_batch.token_to_kv_pool.get_key_buffer(
                 layer.layer_id
@@ -892,6 +914,7 @@ class FlashAttentionBackend(AttentionBackend):
                 k_descale=k_descale,
                 v_descale=v_descale,
                 return_softmax_lse=use_cascade_attn,
+                num_splits=self.num_splits,
             )
             if use_cascade_attn:
                 o, softmax_lse, *rest = result
@@ -913,6 +936,7 @@ class FlashAttentionBackend(AttentionBackend):
                         k_descale=k_descale,
                         v_descale=v_descale,
                         return_softmax_lse=True,
+                        num_splits=self.num_splits,
                     )
                 )
                 o, _ = merge_state_v2_wrapper(
@@ -939,6 +963,7 @@ class FlashAttentionBackend(AttentionBackend):
         k_rope: Optional[torch.Tensor] = None,
         sinks: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
+        assert self.fa_impl_ver in [3], "Only FA3 support decoding"
         if k is not None:
             assert v is not None
             if save_kv_cache:
@@ -985,6 +1010,8 @@ class FlashAttentionBackend(AttentionBackend):
 
         # For fa3 interface version compatibility, we put new fields into conditional keyword args
         kwargs = {}
+        if self.fa_impl_ver != 3:
+            kwargs["ver"] = self.fa_impl_ver
         if sinks is not None:
             kwargs["sinks"] = sinks
 
@@ -1030,6 +1057,7 @@ class FlashAttentionBackend(AttentionBackend):
                     softcap=layer.logit_cap,
                     k_descale=k_descale,
                     v_descale=v_descale,
+                    num_splits=self.num_splits,
                     **kwargs,
                 )
             elif use_local_attn:
@@ -1049,6 +1077,7 @@ class FlashAttentionBackend(AttentionBackend):
                     softcap=layer.logit_cap,
                     k_descale=k_descale,
                     v_descale=v_descale,
+                    num_splits=self.num_splits,
                     **kwargs,
                 )
             else:
@@ -1077,6 +1106,7 @@ class FlashAttentionBackend(AttentionBackend):
                     k_descale=k_descale,
                     v_descale=v_descale,
                     return_softmax_lse=use_cascade_attn,
+                    num_splits=self.num_splits,
                     **kwargs,
                 )
                 if use_cascade_attn:
@@ -1098,6 +1128,7 @@ class FlashAttentionBackend(AttentionBackend):
                         k_descale=k_descale,
                         v_descale=v_descale,
                         return_softmax_lse=True,
+                        num_splits=self.num_splits,
                         **kwargs,
                     )
                 )
@@ -1153,6 +1184,7 @@ class FlashAttentionBackend(AttentionBackend):
                 k_descale=k_descale,
                 v_descale=v_descale,
                 return_softmax_lse=use_cascade_attn,  # softmax_lse is needed for merge states
+                num_splits=self.num_splits,
             )
             if use_cascade_attn:
                 o, softmax_lse, *rest = result
@@ -1173,6 +1205,7 @@ class FlashAttentionBackend(AttentionBackend):
                         k_descale=k_descale,
                         v_descale=v_descale,
                         return_softmax_lse=True,
+                        num_splits=self.num_splits,
                     )
                 o, _ = merge_state_v2(
                     o,
@@ -1453,7 +1486,7 @@ class FlashAttentionBackend(AttentionBackend):
         seq_lens: torch.Tensor,
         encoder_lens: Optional[torch.Tensor],
         forward_mode: ForwardMode,
-        spec_info: Optional[
+        spec_info: Optional[SpecInput],
     ):
         """Initialize forward metadata for capturing CUDA graph."""
         metadata = FlashAttentionMetadata()
@@ -1688,7 +1721,7 @@ class FlashAttentionBackend(AttentionBackend):
         seq_lens_sum: int,
         encoder_lens: Optional[torch.Tensor],
         forward_mode: ForwardMode,
-        spec_info: Optional[
+        spec_info: Optional[SpecInput],
         seq_lens_cpu: Optional[torch.Tensor],
         out_cache_loc: Optional[torch.Tensor] = None,
     ):
@@ -2306,7 +2339,7 @@ class FlashAttentionMultiStepBackend:
         forward_batch: ForwardBatch,
     ):
         assert forward_batch.spec_info is not None
-        assert
+        assert forward_batch.spec_info.is_draft_input()
 
         for i in range(self.speculative_num_steps - 1):
             self.attn_backends[i].init_forward_metadata_capture_cuda_graph(
@@ -2323,7 +2356,7 @@ class FlashAttentionMultiStepBackend:
         self, forward_batch: ForwardBatch, bs: int
     ):
         assert forward_batch.spec_info is not None
-        assert
+        assert forward_batch.spec_info.is_draft_input()
 
         for i in range(self.speculative_num_steps - 1):
            # TODO: incrementally update the metadata for the later steps,
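The last two hunks replace a concrete type check with a capability query on the new `SpecInput` interface (added in `sglang/srt/speculative/spec_info.py`, see the file list above). A rough, purely illustrative sketch of what such an interface could look like; the actual sglang class almost certainly differs:

```python
# Hypothetical sketch of a SpecInput-style interface: draft and verify inputs
# share one base class and callers ask is_draft_input() instead of testing
# concrete classes. Names below are illustrative, not the real sglang API.
from abc import ABC, abstractmethod

class SpecInput(ABC):
    @abstractmethod
    def is_draft_input(self) -> bool: ...

class DraftInput(SpecInput):
    def is_draft_input(self) -> bool:
        return True

class VerifyInput(SpecInput):
    def is_draft_input(self) -> bool:
        return False

spec_info: SpecInput = DraftInput()
assert spec_info.is_draft_input()
```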
|