sglang 0.5.2rc2__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -11
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +474 -142
- sglang/compile_deep_gemm.py +3 -0
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +10 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +314 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/mamba_utils.py +117 -0
- sglang/srt/configs/model_config.py +228 -92
- sglang/srt/configs/nemotron_h.py +286 -0
- sglang/srt/configs/qwen3_next.py +294 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +78 -37
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +373 -68
- sglang/srt/disaggregation/prefill.py +53 -49
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +156 -80
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +842 -0
- sglang/srt/entrypoints/grpc_server.py +950 -0
- sglang/srt/entrypoints/http_server.py +179 -60
- sglang/srt/entrypoints/openai/protocol.py +265 -29
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +213 -122
- sglang/srt/entrypoints/openai/serving_completions.py +14 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +48 -3
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
- sglang/srt/environ.py +289 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +17 -8
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +119 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +492 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +327 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +215 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +343 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +40 -8
- sglang/srt/layers/attention/flashinfer_backend.py +341 -204
- sglang/srt/layers/attention/flashinfer_mla_backend.py +28 -28
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +708 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +974 -0
- sglang/srt/layers/attention/mamba/mamba.py +577 -0
- sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
- sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +180 -18
- sglang/srt/layers/modelopt_utils.py +11 -0
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +21 -24
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -454
- sglang/srt/layers/moe/ep_moe/layer.py +248 -333
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +68 -72
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
- sglang/srt/layers/moe/moe_runner/runner.py +83 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +29 -7
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/__init__.py +1 -1
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +155 -60
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +191 -56
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +74 -42
- sglang/srt/layers/quantization/quark/quark.py +3 -1
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +28 -33
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +44 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +199 -301
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +55 -0
- sglang/srt/managers/schedule_batch.py +343 -212
- sglang/srt/managers/schedule_policy.py +145 -18
- sglang/srt/managers/scheduler.py +653 -273
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +255 -108
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +579 -674
- sglang/srt/managers/tp_worker.py +96 -26
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +21 -22
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +9 -2
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +43 -24
- sglang/srt/mem_cache/hiradix_cache.py +222 -75
- sglang/srt/mem_cache/memory_pool.py +651 -80
- sglang/srt/mem_cache/memory_pool_host.py +239 -228
- sglang/srt/mem_cache/radix_cache.py +227 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
- sglang/srt/mem_cache/swa_radix_cache.py +93 -48
- sglang/srt/metrics/collector.py +511 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +74 -46
- sglang/srt/model_executor/model_runner.py +455 -176
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +10 -4
- sglang/srt/model_loader/loader.py +319 -10
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +161 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +607 -130
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +578 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +17 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +4 -4
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/grok.py +5 -13
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mixtral.py +1 -3
- sglang/srt/models/mllama4.py +50 -4
- sglang/srt/models/nemotron_h.py +514 -0
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +120 -13
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +32 -4
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +55 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +49 -26
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +1051 -285
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +98 -29
- sglang/srt/speculative/ngram_info.py +428 -0
- sglang/srt/speculative/ngram_worker.py +246 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +605 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +9 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +451 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +55 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/longbench_v2/__init__.py +1 -0
- sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
- sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
- sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
- sglang/test/run_eval.py +119 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_longbench_v2.py +332 -0
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_cutlass_w4a8_moe.py +9 -19
- sglang/test/test_deterministic.py +313 -0
- sglang/test/test_deterministic_utils.py +81 -0
- sglang/test/test_disaggregation_utils.py +140 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +407 -8
- sglang/utils.py +21 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +392 -251
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/managers/tp_worker_overlap_thread.py +0 -296
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- sglang/test/test_block_fp8_ep.py +0 -358
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
@@ -30,6 +30,13 @@ import torch
|
|
30
30
|
from sglang.srt.mem_cache.allocator import SWATokenToKVPoolAllocator
|
31
31
|
from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache, MatchResult
|
32
32
|
from sglang.srt.mem_cache.memory_pool import ReqToTokenPool
|
33
|
+
from sglang.srt.mem_cache.radix_cache import (
|
34
|
+
RadixKey,
|
35
|
+
_convert_to_bigram_key,
|
36
|
+
_key_match_page_size1,
|
37
|
+
_key_match_paged,
|
38
|
+
get_child_key,
|
39
|
+
)
|
33
40
|
|
34
41
|
if TYPE_CHECKING:
|
35
42
|
from sglang.srt.managers.schedule_batch import Req
|
@@ -47,7 +54,7 @@ class TreeNode:
|
|
47
54
|
def __init__(self, id: Optional[int] = None):
|
48
55
|
self.children = defaultdict(TreeNode)
|
49
56
|
self.parent: TreeNode = None
|
50
|
-
self.key:
|
57
|
+
self.key: RadixKey = None
|
51
58
|
self.value: Optional[torch.Tensor] = None
|
52
59
|
# swa_tombstone is used to indicate the kv indices have been freed for swa layers
|
53
60
|
self.swa_tombstone = False
|
@@ -60,8 +67,6 @@ class TreeNode:
|
|
60
67
|
self.last_access_time = time.monotonic()
|
61
68
|
|
62
69
|
self.hit_count = 0
|
63
|
-
# indicating the node is loading KV cache from host
|
64
|
-
self.loading = False
|
65
70
|
# store the host indices of KV cache
|
66
71
|
self.host_value = None
|
67
72
|
|
@@ -89,27 +94,6 @@ class TreeNode:
|
|
89
94
|
return self.last_access_time < other.last_access_time
|
90
95
|
|
91
96
|
|
92
|
-
def _key_match_page_size1(key0: List, key1: List):
|
93
|
-
i = 0
|
94
|
-
for k0, k1 in zip(key0, key1):
|
95
|
-
if k0 != k1:
|
96
|
-
break
|
97
|
-
i += 1
|
98
|
-
return i
|
99
|
-
|
100
|
-
|
101
|
-
def _key_match_paged(key0: List, key1: List, page_size: int):
|
102
|
-
min_len = min(len(key0), len(key1))
|
103
|
-
|
104
|
-
i = 0
|
105
|
-
while i < min_len:
|
106
|
-
if key0[i : i + page_size] != key1[i : i + page_size]:
|
107
|
-
break
|
108
|
-
i += page_size
|
109
|
-
|
110
|
-
return i
|
111
|
-
|
112
|
-
|
113
97
|
def gen_swa_uuid() -> int:
|
114
98
|
TreeNode.swa_uuid_counter += 1
|
115
99
|
return TreeNode.swa_uuid_counter
|
@@ -344,12 +328,14 @@ class SWARadixCache(BasePrefixCache):
|
|
344
328
|
sliding_window_size: int,
|
345
329
|
page_size: int,
|
346
330
|
disable: bool = False,
|
331
|
+
is_eagle: bool = False,
|
347
332
|
):
|
348
333
|
assert isinstance(token_to_kv_pool_allocator, SWATokenToKVPoolAllocator)
|
349
334
|
self.req_to_token_pool = req_to_token_pool
|
350
335
|
self.token_to_kv_pool_allocator = token_to_kv_pool_allocator
|
351
336
|
self.page_size = page_size
|
352
337
|
self.disable = disable
|
338
|
+
self.is_eagle = is_eagle
|
353
339
|
|
354
340
|
if self.token_to_kv_pool_allocator:
|
355
341
|
self.device = self.token_to_kv_pool_allocator.device
|
@@ -358,10 +344,15 @@ class SWARadixCache(BasePrefixCache):
|
|
358
344
|
|
359
345
|
if self.page_size == 1:
|
360
346
|
self.key_match_fn = _key_match_page_size1
|
361
|
-
self.get_child_key_fn =
|
347
|
+
self.get_child_key_fn = get_child_key
|
362
348
|
else:
|
363
349
|
self.key_match_fn = partial(_key_match_paged, page_size=page_size)
|
364
|
-
self.get_child_key_fn =
|
350
|
+
self.get_child_key_fn = partial(get_child_key, page_size=page_size)
|
351
|
+
|
352
|
+
if is_eagle:
|
353
|
+
self.key_convert_fn = _convert_to_bigram_key
|
354
|
+
else:
|
355
|
+
self.key_convert_fn = lambda key: key
|
365
356
|
|
366
357
|
self.sliding_window_size = sliding_window_size
|
367
358
|
self.reset()
|
@@ -382,10 +373,10 @@ class SWARadixCache(BasePrefixCache):
|
|
382
373
|
self.full_lru_list = LRUList(swa=False)
|
383
374
|
self.swa_lru_list = LRUList(swa=True)
|
384
375
|
|
385
|
-
def match_prefix(self, key:
|
376
|
+
def match_prefix(self, key: RadixKey, **kwargs) -> MatchResult:
|
386
377
|
"""Find the matching prefix from the radix tree.
|
387
378
|
Args:
|
388
|
-
key: A
|
379
|
+
key: A RadixKey contains token IDs to find a matching prefix.
|
389
380
|
Returns:
|
390
381
|
A tuple of a tensor of matching prefix token IDs and
|
391
382
|
the last node that contains the prefix values. Note that
|
@@ -393,6 +384,8 @@ class SWARadixCache(BasePrefixCache):
|
|
393
384
|
The last node create a new child if the prefix is shorter
|
394
385
|
than the last node's value.
|
395
386
|
"""
|
387
|
+
key.token_ids = self.key_convert_fn(key.token_ids)
|
388
|
+
|
396
389
|
if self.disable or len(key) == 0:
|
397
390
|
return MatchResult(
|
398
391
|
device_indices=torch.empty(
|
@@ -419,12 +412,19 @@ class SWARadixCache(BasePrefixCache):
|
|
419
412
|
last_host_node=last_node,
|
420
413
|
)
|
421
414
|
|
422
|
-
def insert(self, key:
|
415
|
+
def insert(self, key: RadixKey, value=None, prev_prefix_len: int = 0) -> int:
|
423
416
|
if self.disable:
|
424
417
|
return 0
|
425
418
|
|
419
|
+
key.token_ids = self.key_convert_fn(key.token_ids)
|
420
|
+
|
426
421
|
if value is None:
|
427
|
-
value = [x for x in key]
|
422
|
+
value = torch.tensor([x for x in key.token_ids], dtype=torch.int64)
|
423
|
+
|
424
|
+
if self.is_eagle:
|
425
|
+
# Make sure the value len equal to the EAGLE bigram key len
|
426
|
+
value = value[: len(key)]
|
427
|
+
|
428
428
|
return self._insert_helper(self.root_node, key, value, prev_prefix_len)
|
429
429
|
|
430
430
|
def cache_finished_req(self, req: Req) -> None:
|
@@ -439,25 +439,41 @@ class SWARadixCache(BasePrefixCache):
|
|
439
439
|
return
|
440
440
|
|
441
441
|
token_ids = (req.origin_input_ids + req.output_ids)[:-1]
|
442
|
+
all_token_len = len(token_ids)
|
443
|
+
# For EAGLE radix cache, we will convert the key to bigram key, e.g. [1,2,3,4] -> [(1,2), (2,3), (3,4)], the length will -1. ((len([(1,2), (2,3), (3,4)]) = len([1,2,3,4]) - 1))
|
444
|
+
# So for the corresponding kv length should also -1. Then we get the actual_kv_len, and use it to do later calculation and slicing.
|
445
|
+
actual_kv_len = all_token_len - 1 if self.is_eagle else all_token_len
|
442
446
|
kv_indices = self.req_to_token_pool.req_to_token[
|
443
|
-
req.req_pool_idx, :
|
447
|
+
req.req_pool_idx, :all_token_len
|
444
448
|
]
|
445
449
|
|
446
450
|
if self.page_size != 1:
|
447
|
-
page_aligned_len =
|
451
|
+
page_aligned_len = actual_kv_len // self.page_size * self.page_size
|
448
452
|
page_aligned_kv_indices = kv_indices[:page_aligned_len].clone()
|
449
453
|
self.token_to_kv_pool_allocator.free(kv_indices[page_aligned_len:])
|
450
454
|
else:
|
451
|
-
page_aligned_len =
|
455
|
+
page_aligned_len = actual_kv_len
|
452
456
|
page_aligned_kv_indices = kv_indices.clone()
|
457
|
+
if self.is_eagle:
|
458
|
+
self.token_to_kv_pool_allocator.free(kv_indices[page_aligned_len:])
|
459
|
+
|
460
|
+
page_aligned_token_len = (
|
461
|
+
page_aligned_len + 1 if self.is_eagle else page_aligned_len
|
462
|
+
)
|
463
|
+
|
464
|
+
old_prefix_len = len(req.prefix_indices)
|
465
|
+
if self.is_eagle and old_prefix_len > req.last_matched_prefix_len:
|
466
|
+
# In EAGLE chunked prefill case, the prefix_indices included one unmatched token (kv_indices[actual_kv_len:])
|
467
|
+
# Here we -1 to make sure the kv of the unmatched token can be freed correctly to avoid memory leak
|
468
|
+
old_prefix_len -= 1
|
453
469
|
|
454
470
|
# Radix Cache takes one ref in memory pool
|
455
471
|
# insert the token_ids and kv_indices into the radix tree
|
456
472
|
# Note: the insert function already frees the overlapped kv_indices
|
457
473
|
new_prefix_len = self.insert(
|
458
|
-
token_ids[:
|
474
|
+
RadixKey(token_ids[:page_aligned_token_len], req.extra_key),
|
459
475
|
page_aligned_kv_indices,
|
460
|
-
|
476
|
+
old_prefix_len,
|
461
477
|
)
|
462
478
|
|
463
479
|
# Remove req slot release the cache lock
|
@@ -476,35 +492,56 @@ class SWARadixCache(BasePrefixCache):
|
|
476
492
|
return
|
477
493
|
|
478
494
|
token_ids = req.fill_ids
|
495
|
+
all_token_len = len(token_ids)
|
496
|
+
# For EAGLE radix cache, we will convert the key to bigram key, e.g. [1,2,3,4] -> [(1,2), (2,3), (3,4)], the length will -1. ((len([(1,2), (2,3), (3,4)]) = len([1,2,3,4]) - 1))
|
497
|
+
# So for the corresponding kv length should also -1. Then we get the actual_kv_len, and use it to do later calculation and slicing.
|
498
|
+
actual_kv_len = all_token_len - 1 if self.is_eagle else all_token_len
|
479
499
|
kv_indices = self.req_to_token_pool.req_to_token[
|
480
|
-
req.req_pool_idx, :
|
500
|
+
req.req_pool_idx, :all_token_len
|
481
501
|
]
|
482
502
|
|
483
503
|
if self.page_size != 1:
|
484
|
-
page_aligned_len =
|
504
|
+
page_aligned_len = actual_kv_len // self.page_size * self.page_size
|
485
505
|
page_aligned_kv_indices = kv_indices[:page_aligned_len].clone()
|
486
506
|
else:
|
487
|
-
page_aligned_len =
|
507
|
+
page_aligned_len = actual_kv_len
|
488
508
|
page_aligned_kv_indices = kv_indices.clone()
|
489
|
-
|
509
|
+
|
510
|
+
# For EAGLE, the page_aligned_len is for the bigram key, the normal key len should +1
|
511
|
+
page_aligned_token_len = (
|
512
|
+
page_aligned_len + 1 if self.is_eagle else page_aligned_len
|
513
|
+
)
|
514
|
+
page_aligned_token_ids = token_ids[:page_aligned_token_len]
|
515
|
+
|
516
|
+
old_prefix_len = len(req.prefix_indices)
|
517
|
+
if self.is_eagle and old_prefix_len > req.last_matched_prefix_len:
|
518
|
+
# In EAGLE chunked prefill case, the prefix_indices included one unmatched token (kv_indices[actual_kv_len:])
|
519
|
+
# Here we -1 to make sure the kv of the unmatched token can be freed correctly to avoid memory leak
|
520
|
+
old_prefix_len -= 1
|
490
521
|
|
491
522
|
# Radix Cache takes one ref in memory pool
|
492
523
|
# Note: the insert function already frees the overlapped kv_indices
|
493
524
|
new_prefix_len = self.insert(
|
494
|
-
page_aligned_token_ids,
|
525
|
+
RadixKey(page_aligned_token_ids, req.extra_key),
|
526
|
+
page_aligned_kv_indices,
|
527
|
+
old_prefix_len,
|
495
528
|
)
|
496
529
|
|
497
530
|
# The prefix indices could be updated, reuse it
|
498
|
-
new_indices, new_last_node, _, _ = self.match_prefix(
|
499
|
-
|
531
|
+
new_indices, new_last_node, _, _ = self.match_prefix(
|
532
|
+
RadixKey(page_aligned_token_ids, req.extra_key)
|
533
|
+
)
|
534
|
+
assert old_prefix_len <= len(
|
500
535
|
new_indices
|
501
536
|
), f"{req.prefix_indices=}, {new_indices=}"
|
502
537
|
assert new_prefix_len <= len(new_indices), f"{new_prefix_len=}, {new_indices=}"
|
503
538
|
self.req_to_token_pool.write(
|
504
|
-
(req.req_pool_idx, slice(
|
505
|
-
new_indices[
|
539
|
+
(req.req_pool_idx, slice(old_prefix_len, len(new_indices))),
|
540
|
+
new_indices[old_prefix_len:],
|
506
541
|
)
|
507
542
|
|
543
|
+
req.last_matched_prefix_len = len(new_indices)
|
544
|
+
|
508
545
|
self.dec_lock_ref(req.last_node, req.swa_uuid_for_lock)
|
509
546
|
swa_uuid_for_lock = self.inc_lock_ref(new_last_node)
|
510
547
|
|
@@ -514,7 +551,13 @@ class SWARadixCache(BasePrefixCache):
|
|
514
551
|
[new_indices, kv_indices[len(new_indices) :]]
|
515
552
|
)
|
516
553
|
else:
|
517
|
-
|
554
|
+
if self.is_eagle:
|
555
|
+
# Attach the kv index of the last token for EAGLE, it can be used in chunked prefill
|
556
|
+
req.prefix_indices = torch.cat(
|
557
|
+
[new_indices, kv_indices[actual_kv_len:]]
|
558
|
+
)
|
559
|
+
else:
|
560
|
+
req.prefix_indices = new_indices
|
518
561
|
req.last_node = new_last_node
|
519
562
|
req.swa_uuid_for_lock = swa_uuid_for_lock
|
520
563
|
|
@@ -734,7 +777,9 @@ class SWARadixCache(BasePrefixCache):
|
|
734
777
|
|
735
778
|
##### Internal Helper Functions #####
|
736
779
|
|
737
|
-
def _match_prefix_helper(
|
780
|
+
def _match_prefix_helper(
|
781
|
+
self, key: RadixKey
|
782
|
+
) -> Tuple[List[torch.Tensor], TreeNode]:
|
738
783
|
"""
|
739
784
|
SWA prefix matching helper. It factors in the sliding window size such that
|
740
785
|
the matched node is guaranteed to either 1. connected to root without swa tombstone,
|
@@ -798,7 +843,7 @@ class SWARadixCache(BasePrefixCache):
|
|
798
843
|
|
799
844
|
return value[:best_value_len], best_last_node
|
800
845
|
|
801
|
-
def _split_node(self, key:
|
846
|
+
def _split_node(self, key: RadixKey, child: TreeNode, split_len: int) -> TreeNode:
|
802
847
|
# new_node -> child
|
803
848
|
new_node = TreeNode()
|
804
849
|
new_node.children = {self.get_child_key_fn(key[split_len:]): child}
|
@@ -833,7 +878,7 @@ class SWARadixCache(BasePrefixCache):
|
|
833
878
|
return new_node
|
834
879
|
|
835
880
|
def _insert_helper(
|
836
|
-
self, node: TreeNode, key:
|
881
|
+
self, node: TreeNode, key: RadixKey, value, update_kv_after_len: int
|
837
882
|
) -> int:
|
838
883
|
# Update the last access time from root to leaf, so that
|
839
884
|
# swa will tombstone the node closer to root first
|