sglang 0.5.2rc2__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -11
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +474 -142
- sglang/compile_deep_gemm.py +3 -0
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +10 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +314 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/mamba_utils.py +117 -0
- sglang/srt/configs/model_config.py +228 -92
- sglang/srt/configs/nemotron_h.py +286 -0
- sglang/srt/configs/qwen3_next.py +294 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +78 -37
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +373 -68
- sglang/srt/disaggregation/prefill.py +53 -49
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +156 -80
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +842 -0
- sglang/srt/entrypoints/grpc_server.py +950 -0
- sglang/srt/entrypoints/http_server.py +179 -60
- sglang/srt/entrypoints/openai/protocol.py +265 -29
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +213 -122
- sglang/srt/entrypoints/openai/serving_completions.py +14 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +48 -3
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
- sglang/srt/environ.py +289 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +17 -8
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +119 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +492 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +327 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +215 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +343 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +40 -8
- sglang/srt/layers/attention/flashinfer_backend.py +341 -204
- sglang/srt/layers/attention/flashinfer_mla_backend.py +28 -28
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +708 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +974 -0
- sglang/srt/layers/attention/mamba/mamba.py +577 -0
- sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
- sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +180 -18
- sglang/srt/layers/modelopt_utils.py +11 -0
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +21 -24
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -454
- sglang/srt/layers/moe/ep_moe/layer.py +248 -333
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +68 -72
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
- sglang/srt/layers/moe/moe_runner/runner.py +83 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +29 -7
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/__init__.py +1 -1
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +155 -60
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +191 -56
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +74 -42
- sglang/srt/layers/quantization/quark/quark.py +3 -1
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +28 -33
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +44 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +199 -301
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +55 -0
- sglang/srt/managers/schedule_batch.py +343 -212
- sglang/srt/managers/schedule_policy.py +145 -18
- sglang/srt/managers/scheduler.py +653 -273
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +255 -108
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +579 -674
- sglang/srt/managers/tp_worker.py +96 -26
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +21 -22
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +9 -2
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +43 -24
- sglang/srt/mem_cache/hiradix_cache.py +222 -75
- sglang/srt/mem_cache/memory_pool.py +651 -80
- sglang/srt/mem_cache/memory_pool_host.py +239 -228
- sglang/srt/mem_cache/radix_cache.py +227 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
- sglang/srt/mem_cache/swa_radix_cache.py +93 -48
- sglang/srt/metrics/collector.py +511 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +74 -46
- sglang/srt/model_executor/model_runner.py +455 -176
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +10 -4
- sglang/srt/model_loader/loader.py +319 -10
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +161 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +607 -130
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +578 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +17 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +4 -4
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/grok.py +5 -13
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mixtral.py +1 -3
- sglang/srt/models/mllama4.py +50 -4
- sglang/srt/models/nemotron_h.py +514 -0
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +120 -13
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +32 -4
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +55 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +49 -26
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +1051 -285
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +98 -29
- sglang/srt/speculative/ngram_info.py +428 -0
- sglang/srt/speculative/ngram_worker.py +246 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +605 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +9 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +451 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +55 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/longbench_v2/__init__.py +1 -0
- sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
- sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
- sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
- sglang/test/run_eval.py +119 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_longbench_v2.py +332 -0
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_cutlass_w4a8_moe.py +9 -19
- sglang/test/test_deterministic.py +313 -0
- sglang/test/test_deterministic_utils.py +81 -0
- sglang/test/test_disaggregation_utils.py +140 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +407 -8
- sglang/utils.py +21 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +392 -251
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/managers/tp_worker_overlap_thread.py +0 -296
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- sglang/test/test_block_fp8_ep.py +0 -358
- sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
@@ -19,6 +19,7 @@ from sglang.srt.managers.schedule_batch import (
|
|
19
19
|
get_last_loc,
|
20
20
|
global_server_args_dict,
|
21
21
|
)
|
22
|
+
from sglang.srt.managers.scheduler import GenerationBatchResult
|
22
23
|
from sglang.srt.managers.tp_worker import TpModelWorker
|
23
24
|
from sglang.srt.model_executor.forward_batch_info import (
|
24
25
|
CaptureHiddenMode,
|
@@ -33,20 +34,23 @@ from sglang.srt.speculative.eagle_draft_cuda_graph_runner import (
|
|
33
34
|
from sglang.srt.speculative.eagle_draft_extend_cuda_graph_runner import (
|
34
35
|
EAGLEDraftExtendCudaGraphRunner,
|
35
36
|
)
|
36
|
-
from sglang.srt.speculative.eagle_utils import (
|
37
|
+
from sglang.srt.speculative.eagle_info import (
|
37
38
|
EagleDraftInput,
|
38
39
|
EagleVerifyInput,
|
39
40
|
EagleVerifyOutput,
|
41
|
+
)
|
42
|
+
from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
|
43
|
+
from sglang.srt.speculative.spec_utils import (
|
40
44
|
assign_draft_cache_locs,
|
41
45
|
fast_topk,
|
42
46
|
generate_token_bitmask,
|
43
47
|
select_top_k_tokens,
|
44
48
|
)
|
45
|
-
from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
|
46
49
|
from sglang.srt.utils import (
|
47
50
|
empty_context,
|
48
51
|
get_available_gpu_memory,
|
49
52
|
get_bool_env_var,
|
53
|
+
is_blackwell,
|
50
54
|
is_cuda,
|
51
55
|
next_power_of_2,
|
52
56
|
)
|
@@ -190,7 +194,7 @@ class EAGLEWorker(TpModelWorker):
|
|
190
194
|
# Initialize decode attention backend
|
191
195
|
self.draft_attn_backend = self._create_decode_backend()
|
192
196
|
|
193
|
-
# Initialize
|
197
|
+
# Initialize draft extend attention backend (respects speculative_attention_mode setting)
|
194
198
|
self.draft_extend_attn_backend = self._create_draft_extend_backend()
|
195
199
|
|
196
200
|
self.draft_model_runner.draft_attn_backend = self.draft_attn_backend
|
@@ -213,6 +217,11 @@ class EAGLEWorker(TpModelWorker):
|
|
213
217
|
"triton": self._create_triton_decode_backend,
|
214
218
|
"aiter": self._create_aiter_decode_backend,
|
215
219
|
"fa3": self._create_fa3_decode_backend,
|
220
|
+
"hybrid_linear_attn": (
|
221
|
+
self._create_fa3_decode_backend
|
222
|
+
if not is_blackwell()
|
223
|
+
else self._create_triton_decode_backend
|
224
|
+
),
|
216
225
|
"flashmla": self._create_flashmla_decode_backend,
|
217
226
|
"trtllm_mha": self._create_trtllm_mha_decode_backend,
|
218
227
|
"trtllm_mla": self._create_trtllm_mla_decode_backend,
|
@@ -230,14 +239,24 @@ class EAGLEWorker(TpModelWorker):
|
|
230
239
|
"triton": self._create_triton_prefill_backend,
|
231
240
|
"aiter": self._create_aiter_prefill_backend,
|
232
241
|
"fa3": self._create_fa3_prefill_backend,
|
242
|
+
"hybrid_linear_attn": (
|
243
|
+
self._create_fa3_prefill_backend
|
244
|
+
if not is_blackwell()
|
245
|
+
else self._create_triton_prefill_backend
|
246
|
+
),
|
247
|
+
"flashmla": self._create_flashmla_prefill_backend,
|
233
248
|
"trtllm_mha": self._create_trtllm_mha_prefill_backend,
|
234
249
|
"trtllm_mla": self._create_trtllm_mla_prefill_backend,
|
235
250
|
}
|
236
|
-
|
251
|
+
backend_name = (
|
252
|
+
"decode_attention_backend"
|
253
|
+
if self.server_args.speculative_attention_mode == "decode"
|
254
|
+
else "prefill_attention_backend"
|
255
|
+
)
|
237
256
|
return self._create_backend(
|
238
|
-
|
257
|
+
backend_name,
|
239
258
|
backend_map,
|
240
|
-
"EAGLE is not supported in
|
259
|
+
"EAGLE is not supported in attention backend {backend_type}",
|
241
260
|
)
|
242
261
|
|
243
262
|
def _create_flashinfer_decode_backend(self):
|
@@ -365,6 +384,12 @@ class EAGLEWorker(TpModelWorker):
|
|
365
384
|
|
366
385
|
return TRTLLMMLABackend(self.draft_model_runner, skip_prefill=False)
|
367
386
|
|
387
|
+
def _create_flashmla_prefill_backend(self):
|
388
|
+
logger.warning(
|
389
|
+
"flashmla prefill backend is not yet supported for draft extend."
|
390
|
+
)
|
391
|
+
return None
|
392
|
+
|
368
393
|
def init_cuda_graphs(self):
|
369
394
|
"""Capture cuda graphs."""
|
370
395
|
self.cuda_graph_runner = None
|
@@ -404,9 +429,7 @@ class EAGLEWorker(TpModelWorker):
|
|
404
429
|
def draft_model_runner(self):
|
405
430
|
return self.model_runner
|
406
431
|
|
407
|
-
def
|
408
|
-
self, batch: ScheduleBatch
|
409
|
-
) -> Tuple[LogitsProcessorOutput, torch.Tensor, int, int, bool]:
|
432
|
+
def forward_batch_generation(self, batch: ScheduleBatch) -> GenerationBatchResult:
|
410
433
|
"""Run speculative decoding forward.
|
411
434
|
|
412
435
|
NOTE: Many states of batch is modified as you go through. It is not guaranteed that
|
@@ -419,14 +442,19 @@ class EAGLEWorker(TpModelWorker):
|
|
419
442
|
the batch id (used for overlap schedule), and number of accepted tokens.
|
420
443
|
"""
|
421
444
|
if batch.forward_mode.is_extend() or batch.is_extend_in_batch:
|
422
|
-
logits_output, next_token_ids,
|
423
|
-
|
445
|
+
logits_output, next_token_ids, seq_lens_cpu = self.forward_target_extend(
|
446
|
+
batch
|
424
447
|
)
|
425
448
|
with self.draft_tp_context(self.draft_model_runner.tp_group):
|
426
449
|
self.forward_draft_extend(
|
427
450
|
batch, logits_output.hidden_states, next_token_ids, seq_lens_cpu
|
428
451
|
)
|
429
|
-
return
|
452
|
+
return GenerationBatchResult(
|
453
|
+
logits_output=logits_output,
|
454
|
+
next_token_ids=next_token_ids,
|
455
|
+
num_accepted_tokens=0,
|
456
|
+
can_run_cuda_graph=False,
|
457
|
+
)
|
430
458
|
else:
|
431
459
|
with self.draft_tp_context(self.draft_model_runner.tp_group):
|
432
460
|
spec_info = self.draft(batch)
|
@@ -444,12 +472,11 @@ class EAGLEWorker(TpModelWorker):
|
|
444
472
|
# decode is not finished
|
445
473
|
self.forward_draft_extend_after_decode(batch)
|
446
474
|
|
447
|
-
return (
|
448
|
-
logits_output,
|
449
|
-
verify_output.verified_id,
|
450
|
-
|
451
|
-
|
452
|
-
can_run_cuda_graph,
|
475
|
+
return GenerationBatchResult(
|
476
|
+
logits_output=logits_output,
|
477
|
+
next_token_ids=verify_output.verified_id,
|
478
|
+
num_accepted_tokens=sum(verify_output.accept_length_per_req_cpu),
|
479
|
+
can_run_cuda_graph=can_run_cuda_graph,
|
453
480
|
)
|
454
481
|
|
455
482
|
def check_forward_draft_extend_after_decode(self, batch: ScheduleBatch):
|
@@ -481,19 +508,19 @@ class EAGLEWorker(TpModelWorker):
|
|
481
508
|
Returns:
|
482
509
|
logits_output: The output of logits. It will contain the full hidden states.
|
483
510
|
next_token_ids: Next token ids generated.
|
484
|
-
bid: The model batch ID. Used for overlap schedule.
|
485
511
|
"""
|
486
512
|
# Forward with the target model and get hidden states.
|
487
513
|
# We need the full hidden states to prefill the KV cache of the draft model.
|
488
514
|
model_worker_batch = batch.get_model_worker_batch()
|
489
515
|
model_worker_batch.capture_hidden_mode = CaptureHiddenMode.FULL
|
490
|
-
|
491
|
-
|
516
|
+
batch_result = self.target_worker.forward_batch_generation(model_worker_batch)
|
517
|
+
logits_output, next_token_ids = (
|
518
|
+
batch_result.logits_output,
|
519
|
+
batch_result.next_token_ids,
|
492
520
|
)
|
493
521
|
return (
|
494
522
|
logits_output,
|
495
523
|
next_token_ids,
|
496
|
-
model_worker_batch.bid,
|
497
524
|
model_worker_batch.seq_lens_cpu,
|
498
525
|
)
|
499
526
|
|
@@ -525,6 +552,8 @@ class EAGLEWorker(TpModelWorker):
|
|
525
552
|
batch.seq_lens,
|
526
553
|
self.speculative_num_steps,
|
527
554
|
)
|
555
|
+
prefix_lens_cpu = batch.seq_lens_cpu
|
556
|
+
seq_lens_cpu = batch.seq_lens_cpu + self.speculative_num_steps
|
528
557
|
extend_num_tokens = num_seqs * self.speculative_num_steps
|
529
558
|
else:
|
530
559
|
# In this case, the last partial page needs to be duplicated.
|
@@ -560,14 +589,23 @@ class EAGLEWorker(TpModelWorker):
|
|
560
589
|
self.topk,
|
561
590
|
self.page_size,
|
562
591
|
)
|
563
|
-
|
564
|
-
|
565
|
-
|
592
|
+
prefix_lens_cpu = batch.seq_lens_cpu
|
593
|
+
last_page_lens = prefix_lens_cpu % self.page_size
|
594
|
+
num_new_pages_per_topk = (
|
595
|
+
last_page_lens + self.speculative_num_steps + self.page_size - 1
|
596
|
+
) // self.page_size
|
597
|
+
seq_lens_cpu = (
|
598
|
+
prefix_lens_cpu // self.page_size * self.page_size
|
599
|
+
+ num_new_pages_per_topk * (self.page_size * self.topk)
|
600
|
+
)
|
601
|
+
extend_num_tokens = torch.sum((seq_lens_cpu - prefix_lens_cpu)).item()
|
566
602
|
|
567
603
|
out_cache_loc, token_to_kv_pool_state_backup = (
|
568
604
|
batch.alloc_paged_token_slots_extend(
|
569
605
|
prefix_lens,
|
606
|
+
prefix_lens_cpu,
|
570
607
|
seq_lens,
|
608
|
+
seq_lens_cpu,
|
571
609
|
last_loc,
|
572
610
|
extend_num_tokens,
|
573
611
|
backup_state=True,
|
@@ -729,6 +767,14 @@ class EAGLEWorker(TpModelWorker):
|
|
729
767
|
|
730
768
|
# Set inputs
|
731
769
|
forward_batch.input_ids = input_ids
|
770
|
+
# This is a temporary fix for the case that the user is using standalone
|
771
|
+
# speculative decoding and the draft model architecture is gpt-oss. gpt-oss
|
772
|
+
# rope kernel needs cache_loc to be contiguous.
|
773
|
+
if (
|
774
|
+
self.server_args.speculative_algorithm == "STANDALONE"
|
775
|
+
and self.model_config.hf_config.architectures[0] == "GptOssForCausalLM"
|
776
|
+
):
|
777
|
+
out_cache_loc = out_cache_loc.contiguous()
|
732
778
|
forward_batch.out_cache_loc = out_cache_loc[i]
|
733
779
|
forward_batch.positions.add_(1)
|
734
780
|
forward_batch.attn_backend = self.draft_attn_backend.attn_backends[i]
|
@@ -747,6 +793,10 @@ class EAGLEWorker(TpModelWorker):
|
|
747
793
|
|
748
794
|
return score_list, token_list, parents_list
|
749
795
|
|
796
|
+
def clear_cache_pool(self):
|
797
|
+
self.model_runner.req_to_token_pool.clear()
|
798
|
+
self.model_runner.token_to_kv_pool_allocator.clear()
|
799
|
+
|
750
800
|
def verify(self, batch: ScheduleBatch, spec_info: EagleVerifyInput):
|
751
801
|
spec_info.prepare_for_verify(batch, self.page_size)
|
752
802
|
batch.return_hidden_states = False
|
@@ -770,10 +820,12 @@ class EAGLEWorker(TpModelWorker):
|
|
770
820
|
).cpu()
|
771
821
|
|
772
822
|
# Forward
|
773
|
-
|
774
|
-
|
775
|
-
|
776
|
-
|
823
|
+
batch_result = self.target_worker.forward_batch_generation(
|
824
|
+
model_worker_batch, is_verify=True
|
825
|
+
)
|
826
|
+
logits_output, can_run_cuda_graph = (
|
827
|
+
batch_result.logits_output,
|
828
|
+
batch_result.can_run_cuda_graph,
|
777
829
|
)
|
778
830
|
|
779
831
|
vocab_mask = None
|
@@ -813,6 +865,21 @@ class EAGLEWorker(TpModelWorker):
|
|
813
865
|
]
|
814
866
|
logits_output.hidden_states = logits_output.hidden_states[res.accepted_indices]
|
815
867
|
|
868
|
+
# QQ: can be optimized
|
869
|
+
if self.target_worker.model_runner.hybrid_gdn_config is not None:
|
870
|
+
# res.draft_input.accept_length is on GPU but may be empty for last verify?
|
871
|
+
accepted_length = (
|
872
|
+
torch.tensor(
|
873
|
+
res.accept_length_per_req_cpu,
|
874
|
+
device=logits_output.hidden_states.device,
|
875
|
+
dtype=torch.int32,
|
876
|
+
)
|
877
|
+
+ 1
|
878
|
+
)
|
879
|
+
self.target_worker.model_runner.attn_backend.update_mamba_state_after_mtp_verify(
|
880
|
+
accepted_length, self.target_worker.model_runner.model
|
881
|
+
)
|
882
|
+
|
816
883
|
if batch.return_logprob:
|
817
884
|
self.add_logprob_values(batch, res, logits_output)
|
818
885
|
|
@@ -958,6 +1025,7 @@ class EAGLEWorker(TpModelWorker):
|
|
958
1025
|
assert isinstance(batch.spec_info, EagleDraftInput)
|
959
1026
|
# Backup fields that will be modified in-place
|
960
1027
|
seq_lens_backup = batch.seq_lens.clone()
|
1028
|
+
seq_lens_cpu_backup = batch.seq_lens_cpu.clone()
|
961
1029
|
req_pool_indices_backup = batch.req_pool_indices
|
962
1030
|
accept_length_backup = batch.spec_info.accept_length
|
963
1031
|
return_logprob_backup = batch.return_logprob
|
@@ -1036,6 +1104,7 @@ class EAGLEWorker(TpModelWorker):
|
|
1036
1104
|
ForwardMode.DECODE if not input_is_idle else ForwardMode.IDLE
|
1037
1105
|
)
|
1038
1106
|
batch.seq_lens = seq_lens_backup
|
1107
|
+
batch.seq_lens_cpu = seq_lens_cpu_backup
|
1039
1108
|
batch.req_pool_indices = req_pool_indices_backup
|
1040
1109
|
batch.spec_info.accept_length = accept_length_backup
|
1041
1110
|
batch.return_logprob = return_logprob_backup
|