PyPI - sglang - Versions diffs - 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl - Mend

sglang 0.5.3rc2-py3-none-any.whl → 0.5.4.post1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (419)

sglang/bench_one_batch.py +47 -28
sglang/bench_one_batch_server.py +41 -25
sglang/bench_serving.py +378 -160
sglang/check_env.py +1 -1
sglang/compile_deep_gemm.py +6 -2
sglang/global_config.py +1 -25
sglang/lang/api.py +6 -0
sglang/lang/interpreter.py +1 -0
sglang/lang/ir.py +13 -0
sglang/launch_server.py +10 -15
sglang/profiler.py +18 -1
sglang/srt/_custom_ops.py +1 -1
sglang/srt/batch_invariant_ops/batch_invariant_ops.py +105 -10
sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
sglang/srt/compilation/backend.py +437 -0
sglang/srt/compilation/compilation_config.py +20 -0
sglang/srt/compilation/compilation_counter.py +47 -0
sglang/srt/compilation/compile.py +210 -0
sglang/srt/compilation/compiler_interface.py +503 -0
sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
sglang/srt/compilation/fix_functionalization.py +134 -0
sglang/srt/compilation/fx_utils.py +83 -0
sglang/srt/compilation/inductor_pass.py +140 -0
sglang/srt/compilation/pass_manager.py +66 -0
sglang/srt/compilation/piecewise_context_manager.py +40 -0
sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
sglang/srt/configs/__init__.py +4 -0
sglang/srt/configs/deepseek_ocr.py +262 -0
sglang/srt/configs/deepseekvl2.py +194 -96
sglang/srt/configs/dots_vlm.py +2 -7
sglang/srt/configs/falcon_h1.py +13 -64
sglang/srt/configs/load_config.py +25 -2
sglang/srt/configs/mamba_utils.py +117 -0
sglang/srt/configs/model_config.py +136 -25
sglang/srt/configs/modelopt_config.py +30 -0
sglang/srt/configs/nemotron_h.py +286 -0
sglang/srt/configs/olmo3.py +105 -0
sglang/srt/configs/points_v15_chat.py +29 -0
sglang/srt/configs/qwen3_next.py +11 -47
sglang/srt/configs/qwen3_omni.py +613 -0
sglang/srt/configs/qwen3_vl.py +0 -10
sglang/srt/connector/remote_instance.py +1 -1
sglang/srt/constrained/base_grammar_backend.py +5 -1
sglang/srt/constrained/llguidance_backend.py +5 -0
sglang/srt/constrained/outlines_backend.py +1 -1
sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
sglang/srt/constrained/utils.py +12 -0
sglang/srt/constrained/xgrammar_backend.py +20 -11
sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
sglang/srt/disaggregation/base/conn.py +17 -4
sglang/srt/disaggregation/common/conn.py +4 -2
sglang/srt/disaggregation/decode.py +123 -31
sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
sglang/srt/disaggregation/fake/conn.py +11 -3
sglang/srt/disaggregation/mooncake/conn.py +157 -19
sglang/srt/disaggregation/nixl/conn.py +69 -24
sglang/srt/disaggregation/prefill.py +96 -270
sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
sglang/srt/distributed/device_communicators/pynccl.py +24 -12
sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
sglang/srt/distributed/naive_distributed.py +5 -4
sglang/srt/distributed/parallel_state.py +63 -19
sglang/srt/elastic_ep/elastic_ep.py +74 -0
sglang/srt/entrypoints/context.py +3 -2
sglang/srt/entrypoints/engine.py +83 -80
sglang/srt/entrypoints/grpc_server.py +430 -234
sglang/srt/entrypoints/harmony_utils.py +2 -2
sglang/srt/entrypoints/http_server.py +195 -102
sglang/srt/entrypoints/http_server_engine.py +1 -7
sglang/srt/entrypoints/openai/protocol.py +225 -37
sglang/srt/entrypoints/openai/serving_base.py +49 -2
sglang/srt/entrypoints/openai/serving_chat.py +29 -74
sglang/srt/entrypoints/openai/serving_classify.py +204 -0
sglang/srt/entrypoints/openai/serving_completions.py +15 -1
sglang/srt/entrypoints/openai/serving_responses.py +5 -2
sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
sglang/srt/environ.py +58 -6
sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
sglang/srt/eplb/expert_distribution.py +33 -4
sglang/srt/eplb/expert_location_dispatch.py +2 -2
sglang/srt/eplb/expert_location_updater.py +2 -2
sglang/srt/function_call/base_format_detector.py +17 -18
sglang/srt/function_call/function_call_parser.py +20 -14
sglang/srt/function_call/glm4_moe_detector.py +1 -5
sglang/srt/function_call/gpt_oss_detector.py +1 -1
sglang/srt/function_call/json_array_parser.py +0 -2
sglang/srt/function_call/minimax_m2.py +367 -0
sglang/srt/function_call/utils.py +2 -2
sglang/srt/grpc/compile_proto.py +3 -3
sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
sglang/srt/grpc/health_servicer.py +189 -0
sglang/srt/grpc/scheduler_launcher.py +181 -0
sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
sglang/srt/layers/activation.py +10 -1
sglang/srt/layers/attention/aiter_backend.py +3 -3
sglang/srt/layers/attention/ascend_backend.py +17 -1
sglang/srt/layers/attention/attention_registry.py +43 -23
sglang/srt/layers/attention/base_attn_backend.py +20 -1
sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
sglang/srt/layers/attention/fla/chunk.py +0 -1
sglang/srt/layers/attention/fla/chunk_o.py +1 -1
sglang/srt/layers/attention/fla/index.py +0 -2
sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
sglang/srt/layers/attention/fla/utils.py +0 -3
sglang/srt/layers/attention/fla/wy_fast.py +0 -2
sglang/srt/layers/attention/flashattention_backend.py +24 -10
sglang/srt/layers/attention/flashinfer_backend.py +258 -22
sglang/srt/layers/attention/flashinfer_mla_backend.py +38 -28
sglang/srt/layers/attention/flashmla_backend.py +2 -2
sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
sglang/srt/layers/attention/intel_amx_backend.py +1 -1
sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
sglang/srt/layers/attention/mamba/mamba.py +189 -241
sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
sglang/srt/layers/attention/nsa/utils.py +0 -1
sglang/srt/layers/attention/nsa_backend.py +404 -90
sglang/srt/layers/attention/triton_backend.py +208 -34
sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
sglang/srt/layers/attention/trtllm_mla_backend.py +362 -43
sglang/srt/layers/attention/utils.py +89 -7
sglang/srt/layers/attention/vision.py +3 -3
sglang/srt/layers/attention/xpu_backend.py +1028 -0
sglang/srt/layers/communicator.py +12 -7
sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +5 -9
sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
sglang/srt/layers/dp_attention.py +17 -0
sglang/srt/layers/layernorm.py +64 -19
sglang/srt/layers/linear.py +9 -1
sglang/srt/layers/logits_processor.py +152 -17
sglang/srt/layers/modelopt_utils.py +11 -0
sglang/srt/layers/moe/cutlass_moe.py +0 -2
sglang/srt/layers/moe/cutlass_w4a8_moe.py +351 -21
sglang/srt/layers/moe/ep_moe/kernels.py +229 -457
sglang/srt/layers/moe/ep_moe/layer.py +154 -625
sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
sglang/srt/layers/moe/fused_moe_triton/layer.py +79 -73
sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +25 -46
sglang/srt/layers/moe/moe_runner/deep_gemm.py +569 -0
sglang/srt/layers/moe/moe_runner/runner.py +6 -0
sglang/srt/layers/moe/moe_runner/triton.py +3 -1
sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
sglang/srt/layers/moe/router.py +51 -15
sglang/srt/layers/moe/token_dispatcher/__init__.py +14 -4
sglang/srt/layers/moe/token_dispatcher/base.py +12 -6
sglang/srt/layers/moe/token_dispatcher/deepep.py +127 -110
sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
sglang/srt/layers/moe/topk.py +7 -6
sglang/srt/layers/moe/utils.py +20 -5
sglang/srt/layers/quantization/__init__.py +5 -58
sglang/srt/layers/quantization/awq.py +183 -9
sglang/srt/layers/quantization/awq_triton.py +29 -0
sglang/srt/layers/quantization/base_config.py +27 -1
sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
sglang/srt/layers/quantization/fp8.py +152 -81
sglang/srt/layers/quantization/fp8_kernel.py +55 -10
sglang/srt/layers/quantization/fp8_utils.py +42 -14
sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
sglang/srt/layers/quantization/gguf.py +566 -0
sglang/srt/layers/quantization/gptq.py +0 -1
sglang/srt/layers/quantization/int8_kernel.py +18 -2
sglang/srt/layers/quantization/marlin_utils.py +12 -0
sglang/srt/layers/quantization/modelopt_quant.py +125 -100
sglang/srt/layers/quantization/mxfp4.py +35 -68
sglang/srt/layers/quantization/petit.py +1 -1
sglang/srt/layers/quantization/quark/quark.py +3 -1
sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
sglang/srt/layers/quantization/unquant.py +23 -48
sglang/srt/layers/quantization/utils.py +0 -1
sglang/srt/layers/quantization/w4afp8.py +87 -20
sglang/srt/layers/quantization/w8a8_int8.py +30 -24
sglang/srt/layers/radix_attention.py +62 -9
sglang/srt/layers/rotary_embedding.py +686 -17
sglang/srt/layers/sampler.py +47 -16
sglang/srt/layers/sparse_pooler.py +98 -0
sglang/srt/layers/utils.py +0 -1
sglang/srt/layers/vocab_parallel_embedding.py +4 -1
sglang/srt/lora/backend/triton_backend.py +0 -1
sglang/srt/lora/eviction_policy.py +139 -0
sglang/srt/lora/lora_manager.py +24 -9
sglang/srt/lora/lora_registry.py +1 -1
sglang/srt/lora/mem_pool.py +40 -16
sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
sglang/srt/managers/cache_controller.py +48 -17
sglang/srt/managers/data_parallel_controller.py +146 -42
sglang/srt/managers/detokenizer_manager.py +40 -13
sglang/srt/managers/io_struct.py +69 -16
sglang/srt/managers/mm_utils.py +20 -18
sglang/srt/managers/multi_tokenizer_mixin.py +83 -82
sglang/srt/managers/overlap_utils.py +96 -19
sglang/srt/managers/schedule_batch.py +241 -511
sglang/srt/managers/schedule_policy.py +15 -2
sglang/srt/managers/scheduler.py +420 -514
sglang/srt/managers/scheduler_metrics_mixin.py +73 -18
sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
sglang/srt/managers/scheduler_pp_mixin.py +341 -0
sglang/srt/managers/scheduler_profiler_mixin.py +60 -14
sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
sglang/srt/managers/tokenizer_manager.py +375 -95
sglang/srt/managers/tp_worker.py +212 -161
sglang/srt/managers/utils.py +78 -2
sglang/srt/mem_cache/allocator.py +7 -2
sglang/srt/mem_cache/allocator_ascend.py +2 -2
sglang/srt/mem_cache/base_prefix_cache.py +2 -2
sglang/srt/mem_cache/chunk_cache.py +13 -2
sglang/srt/mem_cache/common.py +480 -0
sglang/srt/mem_cache/evict_policy.py +16 -1
sglang/srt/mem_cache/hicache_storage.py +11 -2
sglang/srt/mem_cache/hiradix_cache.py +16 -3
sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
sglang/srt/mem_cache/memory_pool.py +517 -219
sglang/srt/mem_cache/memory_pool_host.py +0 -1
sglang/srt/mem_cache/multimodal_cache.py +0 -1
sglang/srt/mem_cache/radix_cache.py +53 -19
sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
sglang/srt/mem_cache/storage/backend_factory.py +2 -2
sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
sglang/srt/mem_cache/swa_radix_cache.py +92 -26
sglang/srt/metrics/collector.py +31 -0
sglang/srt/metrics/func_timer.py +1 -1
sglang/srt/model_executor/cuda_graph_runner.py +43 -5
sglang/srt/model_executor/forward_batch_info.py +71 -25
sglang/srt/model_executor/model_runner.py +362 -270
sglang/srt/model_executor/npu_graph_runner.py +2 -3
sglang/srt/model_executor/piecewise_cuda_graph_runner.py +549 -0
sglang/srt/model_loader/__init__.py +1 -1
sglang/srt/model_loader/loader.py +424 -27
sglang/srt/model_loader/utils.py +0 -1
sglang/srt/model_loader/weight_utils.py +47 -28
sglang/srt/models/apertus.py +2 -3
sglang/srt/models/arcee.py +2 -2
sglang/srt/models/bailing_moe.py +13 -52
sglang/srt/models/bailing_moe_nextn.py +3 -4
sglang/srt/models/bert.py +1 -1
sglang/srt/models/deepseek_nextn.py +19 -3
sglang/srt/models/deepseek_ocr.py +1516 -0
sglang/srt/models/deepseek_v2.py +418 -140
sglang/srt/models/dots_ocr.py +0 -2
sglang/srt/models/dots_vlm.py +0 -1
sglang/srt/models/dots_vlm_vit.py +1 -1
sglang/srt/models/falcon_h1.py +13 -19
sglang/srt/models/gemma3_mm.py +16 -0
sglang/srt/models/gemma3n_mm.py +1 -2
sglang/srt/models/glm4_moe.py +327 -382
sglang/srt/models/glm4_moe_nextn.py +6 -16
sglang/srt/models/glm4v.py +2 -1
sglang/srt/models/glm4v_moe.py +32 -199
sglang/srt/models/gpt_oss.py +5 -5
sglang/srt/models/grok.py +10 -23
sglang/srt/models/hunyuan.py +2 -7
sglang/srt/models/interns1.py +0 -1
sglang/srt/models/kimi_vl.py +1 -7
sglang/srt/models/kimi_vl_moonvit.py +3 -1
sglang/srt/models/llama.py +2 -2
sglang/srt/models/llama_eagle3.py +1 -1
sglang/srt/models/longcat_flash.py +5 -22
sglang/srt/models/longcat_flash_nextn.py +3 -14
sglang/srt/models/mimo.py +2 -13
sglang/srt/models/mimo_mtp.py +1 -2
sglang/srt/models/minicpmo.py +7 -5
sglang/srt/models/minimax_m2.py +922 -0
sglang/srt/models/mixtral.py +1 -4
sglang/srt/models/mllama.py +1 -1
sglang/srt/models/mllama4.py +13 -3
sglang/srt/models/nemotron_h.py +511 -0
sglang/srt/models/nvila.py +355 -0
sglang/srt/models/nvila_lite.py +184 -0
sglang/srt/models/olmo2.py +31 -4
sglang/srt/models/opt.py +5 -5
sglang/srt/models/phi.py +1 -1
sglang/srt/models/phi4mm.py +1 -1
sglang/srt/models/phimoe.py +0 -1
sglang/srt/models/pixtral.py +0 -3
sglang/srt/models/points_v15_chat.py +186 -0
sglang/srt/models/qwen.py +0 -1
sglang/srt/models/qwen2.py +22 -1
sglang/srt/models/qwen2_5_vl.py +3 -3
sglang/srt/models/qwen2_audio.py +2 -15
sglang/srt/models/qwen2_moe.py +15 -12
sglang/srt/models/qwen2_vl.py +5 -2
sglang/srt/models/qwen3.py +34 -4
sglang/srt/models/qwen3_moe.py +19 -37
sglang/srt/models/qwen3_next.py +7 -12
sglang/srt/models/qwen3_next_mtp.py +3 -4
sglang/srt/models/qwen3_omni_moe.py +661 -0
sglang/srt/models/qwen3_vl.py +37 -33
sglang/srt/models/qwen3_vl_moe.py +57 -185
sglang/srt/models/roberta.py +55 -3
sglang/srt/models/sarashina2_vision.py +0 -1
sglang/srt/models/step3_vl.py +3 -5
sglang/srt/models/utils.py +11 -1
sglang/srt/multimodal/processors/base_processor.py +7 -2
sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
sglang/srt/multimodal/processors/dots_vlm.py +0 -1
sglang/srt/multimodal/processors/glm4v.py +2 -6
sglang/srt/multimodal/processors/internvl.py +0 -2
sglang/srt/multimodal/processors/janus_pro.py +0 -1
sglang/srt/multimodal/processors/mllama4.py +0 -8
sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
sglang/srt/multimodal/processors/phi4mm.py +0 -1
sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
sglang/srt/multimodal/processors/qwen_vl.py +75 -16
sglang/srt/multimodal/processors/step3_vl.py +1 -1
sglang/srt/parser/conversation.py +41 -0
sglang/srt/parser/reasoning_parser.py +28 -2
sglang/srt/sampling/custom_logit_processor.py +77 -2
sglang/srt/sampling/sampling_batch_info.py +17 -22
sglang/srt/sampling/sampling_params.py +70 -2
sglang/srt/server_args.py +846 -163
sglang/srt/server_args_config_parser.py +1 -1
sglang/srt/single_batch_overlap.py +36 -31
sglang/srt/speculative/base_spec_worker.py +34 -0
sglang/srt/speculative/draft_utils.py +226 -0
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
sglang/srt/speculative/eagle_info.py +57 -18
sglang/srt/speculative/eagle_info_v2.py +458 -0
sglang/srt/speculative/eagle_utils.py +138 -0
sglang/srt/speculative/eagle_worker.py +83 -280
sglang/srt/speculative/eagle_worker_v2.py +702 -0
sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
sglang/srt/speculative/ngram_worker.py +12 -11
sglang/srt/speculative/spec_info.py +2 -0
sglang/srt/speculative/spec_utils.py +38 -3
sglang/srt/speculative/standalone_worker.py +4 -14
sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
sglang/srt/two_batch_overlap.py +28 -14
sglang/srt/utils/__init__.py +1 -1
sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
sglang/srt/utils/common.py +272 -82
sglang/srt/utils/hf_transformers_utils.py +44 -17
sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
sglang/srt/{offloader.py → utils/offloader.py} +4 -4
sglang/srt/utils/profile_merger.py +199 -0
sglang/test/attention/test_flashattn_backend.py +1 -1
sglang/test/attention/test_flashattn_mla_backend.py +0 -1
sglang/test/attention/test_prefix_chunk_info.py +0 -2
sglang/test/attention/test_trtllm_mla_backend.py +221 -53
sglang/test/few_shot_gsm8k_engine.py +2 -4
sglang/test/kit_matched_stop.py +157 -0
sglang/test/longbench_v2/__init__.py +1 -0
sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
sglang/test/run_eval.py +41 -0
sglang/test/runners.py +2 -0
sglang/test/send_one.py +42 -7
sglang/test/simple_eval_common.py +3 -0
sglang/test/simple_eval_gpqa.py +0 -1
sglang/test/simple_eval_humaneval.py +0 -3
sglang/test/simple_eval_longbench_v2.py +344 -0
sglang/test/test_block_fp8.py +1 -2
sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
sglang/test/test_cutlass_moe.py +1 -2
sglang/test/test_cutlass_w4a8_moe.py +10 -20
sglang/test/test_deterministic.py +463 -107
sglang/test/test_deterministic_utils.py +74 -0
sglang/test/test_disaggregation_utils.py +81 -0
sglang/test/test_marlin_moe.py +0 -1
sglang/test/test_utils.py +85 -20
sglang/version.py +1 -1
{sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +48 -35
{sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +414 -350
sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
sglang/srt/models/vila.py +0 -306
sglang/srt/speculative/build_eagle_tree.py +0 -427
sglang/test/test_block_fp8_ep.py +0 -358
/sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
/sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
/sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
{sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
{sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
{sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0

sglang/srt/disaggregation/prefill.py CHANGED Viewed

@@ -20,7 +20,6 @@ Life cycle of a request in the prefill server
 from __future__ import annotations
 import logging
-import threading
 import time
 from collections import deque
 from http import HTTPStatus
@@ -49,13 +48,12 @@ from sglang.srt.managers.schedule_batch import (
     RequestStage,
     ScheduleBatch,
 )
-from sglang.srt.model_executor.forward_batch_info import ForwardMode, PPProxyTensors
-from sglang.srt.utils import (
-    DynamicGradMode,
-    broadcast_pyobj,
-    point_to_point_pyobj,
-    require_mlp_sync,
+from sglang.srt.mem_cache.memory_pool import (
+    HybridLinearKVPool,
+    NSATokenToKVPool,
+    SWAKVPool,
 )
+from sglang.srt.utils import broadcast_pyobj, point_to_point_pyobj, require_mlp_sync
 if TYPE_CHECKING:
     from torch.distributed import ProcessGroup
@@ -146,6 +144,28 @@ class PrefillBootstrapQueue:
         kv_args.ib_device = self.scheduler.server_args.disaggregation_ib_device
         kv_args.gpu_id = self.scheduler.gpu_id
+        if hasattr(self.token_to_kv_pool, "get_state_buf_infos"):
+            state_data_ptrs, state_data_lens, state_item_lens = (
+                self.token_to_kv_pool.get_state_buf_infos()
+            )
+            kv_args.state_data_ptrs = state_data_ptrs
+            kv_args.state_data_lens = state_data_lens
+            kv_args.state_item_lens = state_item_lens
+            if isinstance(self.token_to_kv_pool, SWAKVPool):
+                kv_args.state_type = "swa"
+            elif isinstance(self.token_to_kv_pool, HybridLinearKVPool):
+                kv_args.state_type = "mamba"
+            elif isinstance(self.token_to_kv_pool, NSATokenToKVPool):
+                kv_args.state_type = "nsa"
+            else:
+                kv_args.state_type = "none"
+        else:
+            kv_args.state_data_ptrs = []
+            kv_args.state_data_lens = []
+            kv_args.state_item_lens = []
+            kv_args.state_type = "none"
         kv_manager_class: Type[BaseKVManager] = get_kv_class(
             self.transfer_backend, KVClassType.MANAGER
         )
@@ -332,30 +352,21 @@ class SchedulerDisaggregationPrefillMixin:
             if require_mlp_sync(self.server_args):
                 batch = self.prepare_mlp_sync_batch(batch)
             self.cur_batch = batch
+            batch_result = None
             if batch:
-                result = self.run_batch(batch)
-                self.result_queue.append((batch.copy(), result))
-                if self.last_batch is None:
-                    # Create a dummy first batch to start the pipeline for overlap schedule.
-                    # It is now used for triggering the sampling_info_done event.
-                    tmp_batch = ScheduleBatch(
-                        reqs=None,
-                        forward_mode=ForwardMode.DUMMY_FIRST,
-                        next_batch_sampling_info=self.tp_worker.cur_sampling_info,
-                    )
-                    self.set_next_batch_sampling_info_done(tmp_batch)
+                batch_result = self.run_batch(batch)
+                self.result_queue.append((batch.copy(), batch_result))
             if self.last_batch:
                 tmp_batch, tmp_result = self.result_queue.popleft()
-                tmp_batch.next_batch_sampling_info = (
-                    self.tp_worker.cur_sampling_info if batch else None
-                )
                 self.process_batch_result_disagg_prefill(tmp_batch, tmp_result)
             if len(self.disagg_prefill_inflight_queue) > 0:
                 self.process_disagg_prefill_inflight_queue()
+            self.launch_batch_sample_if_needed(batch_result)
             if batch is None and len(self.disagg_prefill_inflight_queue) == 0:
                 self.self_check_during_idle()
@@ -368,7 +379,6 @@ class SchedulerDisaggregationPrefillMixin:
         self: Scheduler,
         batch: ScheduleBatch,
         result: GenerationBatchResult,
-        launch_done: Optional[threading.Event] = None,
     ) -> None:
         """
         Transfer kv for prefill completed requests and add it into disagg_prefill_inflight_queue
@@ -379,31 +389,30 @@ class SchedulerDisaggregationPrefillMixin:
             next_token_ids,
             extend_input_len_per_req,
             extend_logprob_start_len_per_req,
+            copy_done,
         ) = (
             result.logits_output,
             result.next_token_ids,
             result.extend_input_len_per_req,
             result.extend_logprob_start_len_per_req,
+            result.copy_done,
         )
+        if copy_done is not None:
+            copy_done.synchronize()
         logprob_pt = 0
         # Transfer kv for prefill completed requests and add it into disagg_prefill_inflight_queue
-        if self.enable_overlap:
-            # wait
-            logits_output, next_token_ids, _ = self.tp_worker.resolve_last_batch_result(
-                launch_done
-            )
-        else:
-            next_token_ids = result.next_token_ids.tolist()
-            if batch.return_logprob:
-                if logits_output.next_token_logprobs is not None:
-                    logits_output.next_token_logprobs = (
-                        logits_output.next_token_logprobs.tolist()
-                    )
-                if logits_output.input_token_logprobs is not None:
-                    logits_output.input_token_logprobs = tuple(
-                        logits_output.input_token_logprobs.tolist()
-                    )
+        next_token_ids = result.next_token_ids.tolist()
+        if batch.return_logprob:
+            if logits_output.next_token_logprobs is not None:
+                logits_output.next_token_logprobs = (
+                    logits_output.next_token_logprobs.tolist()
+                )
+            if logits_output.input_token_logprobs is not None:
+                logits_output.input_token_logprobs = tuple(
+                    logits_output.input_token_logprobs.tolist()
+                )
         hidden_state_offset = 0
         for i, (req, next_token_id) in enumerate(
@@ -415,24 +424,12 @@ class SchedulerDisaggregationPrefillMixin:
                 self.tree_cache.cache_unfinished_req(req)  # update the tree and lock
                 req.add_latency(RequestStage.PREFILL_FORWARD)
                 self.disagg_prefill_inflight_queue.append(req)
-                if (
-                    logits_output is not None
-                    and logits_output.hidden_states is not None
-                ):
-                    last_hidden_index = (
-                        hidden_state_offset + extend_input_len_per_req[i] - 1
-                    )
+                if self.spec_algorithm.is_eagle() and batch.spec_info is not None:
                     req.output_topk_p = batch.spec_info.topk_p[i]
                     req.output_topk_index = batch.spec_info.topk_index[i]
-                    if self.spec_algorithm.is_eagle3():
-                        req.hidden_states_tensor = (
-                            batch.spec_info.hidden_states[i].cpu().clone()
-                        )
-                    else:
-                        req.hidden_states_tensor = (
-                            logits_output.hidden_states[last_hidden_index].cpu().clone()
-                        )
-                    hidden_state_offset += extend_input_len_per_req[i]
+                    req.hidden_states_tensor = (
+                        batch.spec_info.hidden_states[i].cpu().clone()
+                    )
                 else:
                     req.hidden_states_tensor = None
                 if req.return_logprob:
@@ -491,8 +488,6 @@ class SchedulerDisaggregationPrefillMixin:
                 if self.enable_overlap:
                     self.send_kv_chunk(req, last_chunk=False, end_idx=req.tmp_end_idx)
-        # We need to remove the sync in the following function for overlap schedule.
-        self.set_next_batch_sampling_info_done(batch)
         self.maybe_send_health_check_signal()
     def process_disagg_prefill_inflight_queue(
@@ -631,227 +626,58 @@ class SchedulerDisaggregationPrefillMixin:
             .numpy()
         )
         req.start_send_idx = end_idx
+        state_indices = None
         if last_chunk:
             self.disagg_metadata_buffers.set_buf(req)
+            # Prepare extra pool indices for hybrid models
+            if isinstance(
+                self.token_to_kv_pool_allocator.get_kvcache(), HybridLinearKVPool
+            ):
+                # Mamba hybrid model: send single mamba state index
+                state_indices = [
+                    self.req_to_token_pool.req_index_to_mamba_index_mapping[
+                        req.req_pool_idx
+                    ]
+                    .cpu()
+                    .numpy()
+                ]
+            elif isinstance(self.token_to_kv_pool_allocator.get_kvcache(), SWAKVPool):
+                # SWA hybrid model: send last window KV indices
+                seq_len = len(req.fill_ids)
+                window_size = self.sliding_window_size
+                window_start = max(0, seq_len - window_size)
+                window_start = (window_start // page_size) * page_size
+                window_kv_indices_full = self.req_to_token_pool.req_to_token[
+                    req.req_pool_idx, window_start:seq_len
+                ]
+                # Translate to SWA pool indices
+                window_kv_indices_swa = (
+                    self.token_to_kv_pool_allocator.translate_loc_from_full_to_swa(
+                        window_kv_indices_full
+                    )
+                )
+                state_indices = window_kv_indices_swa.cpu().numpy()
+                state_indices = kv_to_page_indices(state_indices, page_size)
+            elif isinstance(
+                self.token_to_kv_pool_allocator.get_kvcache(), NSATokenToKVPool
+            ):
+                seq_len = len(req.fill_ids)
+                kv_indices_full = self.req_to_token_pool.req_to_token[
+                    req.req_pool_idx, :seq_len
+                ]
+                state_indices = kv_indices_full.cpu().numpy()
+                state_indices = kv_to_page_indices(state_indices, page_size)
         page_indices = kv_to_page_indices(kv_indices, page_size)
         if len(page_indices) == 0:
             logger.info(
                 f"Skip sending kv chunk for request {req.rid=} {req.bootstrap_room=} because page_indices is empty"
             )
             return
-        req.disagg_kv_sender.send(page_indices)
-    # PP
-    @DynamicGradMode()
-    def event_loop_pp_disagg_prefill(self: Scheduler):
-        """
-        An event loop for the prefill server in pipeline parallelism.
-        Rules:
-        1. Each stage runs in the same order and is notified by the previous stage.
-        2. Each send/recv operation is blocking and matched by the neighboring stage.
-        Regular Schedule:
-        ====================================================================
-        Stage i                   | Stage i+1
-        send ith req              | recv ith req
-        send ith proxy            | recv ith proxy
-        send prev (i+1)th carry   | recv prev (i+1)th carry
-        ====================================================================
-        Prefill Server Schedule:
-        ====================================================================
-        Stage i                        | Stage i+1
-        send ith req                   | recv ith req
-        send ith bootstrap req         | recv ith bootstrap req
-        send ith transferred req       | recv ith transferred req
-        send ith proxy                 | recv ith proxy
-        send prev (i+1)th carry        | recv prev (i+1)th carry
-        send prev (i+1)th release req  | recv prev (i+1)th release req
-        ====================================================================
-        There are two additional elements compared to the regular schedule:
-        1. Bootstrap Requests:
-            a. Instead of polling the status on the current workers, we should wait for the previous stage to notify to avoid desynchronization.
-            b. The first stage polls the status and propagates the bootstrapped requests down to all other stages.
-            c. If the first stage polls successfully, by nature, other ranks are also successful because they performed a handshake together.
-        2. Transferred Requests + Release Requests:
-            a. The first stage polls the transfer finished requests, performs an intersection with the next stage's finished requests, and propagates down to the last stage.
-            b. The last stage receives the requests that have finished transfer on all stages (consensus), then sends them to the first stage to release the memory.
-            c. The first stage receives the release requests, releases the memory, and then propagates the release requests down to the last stage.
-        """
-        from sglang.srt.managers.scheduler import GenerationBatchResult
-        mbs = [None] * self.pp_size
-        last_mbs = [None] * self.pp_size
-        self.running_mbs = [
-            ScheduleBatch(reqs=[], batch_is_full=False) for _ in range(self.pp_size)
-        ]
-        pp_outputs: Optional[PPProxyTensors] = None
-        # Either success or failed
-        bootstrapped_rids: List[str] = []
-        transferred_rids: List[str] = []
-        release_rids: Optional[List[str]] = None
-        # transferred microbatch
-        tmbs = [None] * self.pp_size
-        ENABLE_RELEASE = True  # For debug
-        while True:
-            server_is_idle = True
-            for mb_id in range(self.pp_size):
-                self.running_batch = self.running_mbs[mb_id]
-                self.last_batch = last_mbs[mb_id]
-                recv_reqs = self.recv_requests()
-                self.process_input_requests(recv_reqs)
-                if self.pp_group.is_first_rank:
-                    # First rank, pop the bootstrap reqs from the bootstrap queue
-                    bootstrapped_reqs, failed_reqs = (
-                        self.disagg_prefill_bootstrap_queue.pop_bootstrapped(
-                            return_failed_reqs=True
-                        )
-                    )
-                    bootstrapped_rids = [req.rid for req in bootstrapped_reqs] + [
-                        req.rid for req in failed_reqs
-                    ]
-                    self.waiting_queue.extend(bootstrapped_reqs)
-                else:
-                    # Other ranks, receive the bootstrap reqs info from the previous rank and ensure the consensus
-                    bootstrapped_rids = self.recv_pyobj_from_prev_stage()
-                    bootstrapped_reqs = (
-                        self.disagg_prefill_bootstrap_queue.pop_bootstrapped(
-                            rids_to_check=bootstrapped_rids
-                        )
-                    )
-                    self.waiting_queue.extend(bootstrapped_reqs)
-                if self.pp_group.is_first_rank:
-                    transferred_rids = self.get_transferred_rids()
-                # if other ranks,
-                else:
-                    # 1. recv previous stage's transferred reqs info
-                    prev_transferred_rids = self.recv_pyobj_from_prev_stage()
-                    # 2. get the current stage's transferred reqs info
-                    curr_transferred_rids = self.get_transferred_rids()
-                    # 3. new consensus rids = intersection(previous consensus rids, transfer finished rids)
-                    transferred_rids = list(
-                        set(prev_transferred_rids) & set(curr_transferred_rids)
-                    )
-                tmbs[mb_id] = transferred_rids
-                self.process_prefill_chunk()
-                mbs[mb_id] = self.get_new_batch_prefill()
-                self.running_mbs[mb_id] = self.running_batch
-                self.cur_batch = mbs[mb_id]
-                if self.cur_batch:
-                    server_is_idle = False
-                    result = self.run_batch(self.cur_batch)
-                # send the outputs to the next step
-                if self.pp_group.is_last_rank:
-                    if self.cur_batch:
-                        next_token_ids = result.next_token_ids
-                        pp_outputs = PPProxyTensors(
-                            {
-                                "next_token_ids": next_token_ids,
-                            }
-                        )
-                        # send the output from the last round to let the next stage worker run post processing
-                        self.pp_group.send_tensor_dict(
-                            pp_outputs.tensors,
-                            all_gather_group=self.attn_tp_group,
-                        )
-                if ENABLE_RELEASE:
-                    if self.pp_group.is_last_rank:
-                        # At the last stage, all stages has reached the consensus to release memory for transferred_rids
-                        release_rids = transferred_rids
-                        # send to the first rank
-                        self.send_pyobj_to_next_stage(release_rids)
-                # receive outputs and post-process (filter finished reqs) the coming microbatch
-                next_mb_id = (mb_id + 1) % self.pp_size
-                next_pp_outputs = None
-                next_release_rids = None
-                if mbs[next_mb_id] is not None:
-                    next_pp_outputs: Optional[PPProxyTensors] = PPProxyTensors(
-                        self.pp_group.recv_tensor_dict(
-                            all_gather_group=self.attn_tp_group
-                        )
-                    )
-                    mbs[next_mb_id].output_ids = next_pp_outputs["next_token_ids"]
-                    output_result = GenerationBatchResult(
-                        logits_output=None,
-                        pp_hidden_states_proxy_tensors=None,
-                        next_token_ids=next_pp_outputs["next_token_ids"],
-                        extend_input_len_per_req=None,
-                        extend_logprob_start_len_per_req=None,
-                        can_run_cuda_graph=result.can_run_cuda_graph,
-                    )
-                    self.process_batch_result_disagg_prefill(
-                        mbs[next_mb_id], output_result
-                    )
-                    last_mbs[next_mb_id] = mbs[next_mb_id]
-                if ENABLE_RELEASE:
-                    if tmbs[next_mb_id] is not None:
-                        # recv consensus rids from the previous rank
-                        next_release_rids = self.recv_pyobj_from_prev_stage()
-                        self.process_disagg_prefill_inflight_queue(next_release_rids)
-                # carry the outputs to the next stage
-                if not self.pp_group.is_last_rank:
-                    if pp_outputs:
-                        # send the outputs from the last round to let the next stage worker run post processing
-                        self.pp_group.send_tensor_dict(
-                            pp_outputs.tensors,
-                            all_gather_group=self.attn_tp_group,
-                        )
-                    if ENABLE_RELEASE:
-                        if release_rids is not None:
-                            self.send_pyobj_to_next_stage(release_rids)
-                if not self.pp_group.is_last_rank:
-                    # send out reqs to the next stage
-                    self.send_pyobj_to_next_stage(recv_reqs)
-                    self.send_pyobj_to_next_stage(bootstrapped_rids)
-                    self.send_pyobj_to_next_stage(transferred_rids)
-                    # send out proxy tensors to the next stage
-                    if self.cur_batch:
-                        # FIXME(lsyin): remove this assert
-                        assert result.pp_hidden_states_proxy_tensors.tensors is not None
-                        self.pp_group.send_tensor_dict(
-                            result.pp_hidden_states_proxy_tensors.tensors,
-                            all_gather_group=self.attn_tp_group,
-                        )
-                pp_outputs = next_pp_outputs
-                release_rids = next_release_rids
-                self.running_batch.batch_is_full = False
-            if not ENABLE_RELEASE:
-                if len(self.disagg_prefill_inflight_queue) > 0:
-                    self.process_disagg_prefill_inflight_queue()
-            # When the server is idle, self-check and re-init some states
-            if server_is_idle and len(self.disagg_prefill_inflight_queue) == 0:
-                self.check_memory()
-                self.check_tree_cache()
-                self.new_token_ratio = self.init_new_token_ratio
+        req.disagg_kv_sender.send(page_indices, state_indices)
     def send_pyobj_to_next_stage(self, data):
         if self.attn_tp_rank == 0:

sglang/srt/distributed/device_communicators/all_reduce_utils.py CHANGED Viewed

@@ -3,13 +3,13 @@ MiB = 1024 * 1024
 SYMM_MEM_ALL_REDUCE_MAX_SIZES = {
     9: {
         2: 64 * MiB,  # 64 MB
-        4: 32 * MiB,  # 32 MB
-        6: 64 * MiB,  # 64 MB
-        8: 64 * MiB,  # 64 MB
+        4: 64 * MiB,  # 64 MB
+        6: 128 * MiB,  # 128 MB
+        8: 128 * MiB,  # 128 MB
     },
     10: {
         2: 64 * MiB,  # 64 MB
-        4: 32 * MiB,  # 32 MB
+        4: 64 * MiB,  # 64 MB
         6: 128 * MiB,  # 128 MB
         8: 128 * MiB,  # 128 MB
     },

sglang/srt/distributed/device_communicators/custom_all_reduce.py CHANGED Viewed

@@ -18,7 +18,7 @@ from sglang.srt.distributed.device_communicators.custom_all_reduce_utils import
     is_weak_contiguous,
 )
 from sglang.srt.distributed.parallel_state import in_the_same_node_as
-from sglang.srt.utils import is_cuda, is_hip
+from sglang.srt.utils import is_cuda, is_hip, log_info_on_rank0
 logger = logging.getLogger(__name__)
@@ -32,7 +32,7 @@ try:
         ops.meta_size()
     else:
         # Use custom allreduce from sgl kernel (ROCM and TRT-LLM)
-        import sgl_kernel
+        import sgl_kernel  # noqa: F401
     custom_ar = True
 except Exception:
     # For CPUs
@@ -185,7 +185,7 @@ class CustomAllreduce:
             # is enough for 131072 such tuples. The largest model I've seen only
             # needs less than 10000 of registered tuples.
             self.rank_data = torch.empty(
-                8 * 1024 * 1024, dtype=torch.uint8, device=self.device
+                max_size, dtype=torch.uint8, device=self.device
             )
             self._ptr = ops.init_custom_ar(
                 self.meta_ptrs, self.rank_data, rank, self.full_nvlink
@@ -202,7 +202,7 @@ class CustomAllreduce:
             )
             handles, offsets = self._gather_ipc_meta(shard_data)
             self.rank_data = torch.empty(
-                8 * 1024 * 1024, dtype=torch.uint8, device=self.device
+                max_size, dtype=torch.uint8, device=self.device
             )
             self._ptr = ops.init_custom_ar(
                 self.meta, self.rank_data, handles, offsets, rank, self.full_nvlink
@@ -301,11 +301,11 @@ class CustomAllreduce:
         if _is_hip:
             handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr)
             handles, offsets = self._gather_ipc_meta((bytes(handle), offset))
-            logger.info("Registering %d cuda graph addresses", len(offset))
+            log_info_on_rank0(logger, f"Registering {len(offset)} cuda graph addresses")
             ops.register_graph_buffers(self._ptr, handles, offsets)
         else:
             handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr)
-            logger.info("Registering %d cuda graph addresses", len(offset))
+            log_info_on_rank0(logger, f"Registering {len(offset)} cuda graph addresses")
             # We cannot directly use `dist.all_gather_object` here
             # because it is incompatible with `gloo` backend under inference mode.
             # see https://github.com/pytorch/pytorch/issues/126032 for details.

sglang/srt/distributed/device_communicators/pymscclpp.py CHANGED Viewed

@@ -4,7 +4,7 @@ import math
 import os
 from contextlib import contextmanager
 from enum import IntEnum
-from typing import Any, Callable, List, Optional, TypeVar, Union
+from typing import Optional, Union
 import torch
 import torch.distributed as dist
@@ -24,7 +24,7 @@ if _is_hip:
     mscclpp_is_available = False
 if _is_cuda:
     try:
-        import sgl_kernel
+        import sgl_kernel  # noqa: F401
         mscclpp_is_available = True
     except:

sglang/srt/distributed/device_communicators/pynccl.py CHANGED Viewed

@@ -30,6 +30,7 @@ class PyNcclCommunicator:
         group: Union[ProcessGroup, StatelessProcessGroup],
         device: Union[int, str, torch.device],
         library_path: Optional[str] = None,
+        use_current_stream: bool = False,
     ):
         """
         Args:
@@ -74,6 +75,7 @@ class PyNcclCommunicator:
         self.available = True
         self.disabled = False
+        self.use_current_stream = use_current_stream
         self.nccl_version = self.nccl.ncclGetRawVersion()
         if self.rank == 0:
@@ -123,6 +125,21 @@ class PyNcclCommunicator:
         # when we are using CUDA graph.
         self.disabled = True
+    def _resolve_stream(self, stream: Optional[torch.cuda.Stream]):
+        """Return the stream to use for NCCL calls.
+        Behavior mirrors the previous inline logic:
+        - if an explicit stream is provided, return it
+        - if stream is None and self.use_current_stream is True, return
+          torch.cuda.current_stream()
+        - otherwise return the communicator's default stream (self.stream)
+        """
+        if stream is not None:
+            return stream
+        if self.use_current_stream:
+            return torch.cuda.current_stream()
+        return self.stream
     def all_reduce(
         self, tensor: torch.Tensor, op: ReduceOp = ReduceOp.SUM, stream=None
     ):
@@ -135,8 +152,7 @@ class PyNcclCommunicator:
             f"this nccl communicator is created to work on {self.device}, "
             f"but the input tensor is on {tensor.device}"
         )
-        if stream is None:
-            stream = self.stream
+        stream = self._resolve_stream(stream)
         self.nccl.ncclAllReduce(
             buffer_type(tensor.data_ptr()),
             buffer_type(tensor.data_ptr()),
@@ -163,8 +179,7 @@ class PyNcclCommunicator:
             f"this nccl communicator is created to work on {self.device}, "
             f"but the input tensor is on {input_tensor.device}"
         )
-        if stream is None:
-            stream = self.stream
+        stream = self._resolve_stream(stream)
         if sizes is not None:
             split_offset = 0
@@ -210,8 +225,7 @@ class PyNcclCommunicator:
             f"this nccl communicator is created to work on {self.device}, "
             f"but the input tensor is on {input_tensor.device}"
         )
-        if stream is None:
-            stream = self.stream
+        stream = self._resolve_stream(stream)
         if sizes is not None:
             split_offset = 0
@@ -249,8 +263,7 @@ class PyNcclCommunicator:
             f"this nccl communicator is created to work on {self.device}, "
             f"but the input tensor is on {tensor.device}"
         )
-        if stream is None:
-            stream = self.stream
+        stream = self._resolve_stream(stream)
         self.nccl.ncclSend(
             buffer_type(tensor.data_ptr()),
             tensor.numel(),
@@ -267,8 +280,7 @@ class PyNcclCommunicator:
             f"this nccl communicator is created to work on {self.device}, "
             f"but the input tensor is on {tensor.device}"
         )
-        if stream is None:
-            stream = self.stream
+        stream = self._resolve_stream(stream)
         self.nccl.ncclRecv(
             buffer_type(tensor.data_ptr()),
             tensor.numel(),
@@ -285,8 +297,8 @@ class PyNcclCommunicator:
             f"this nccl communicator is created to work on {self.device}, "
             f"but the input tensor is on {tensor.device}"
         )
-        if stream is None:
-            stream = self.stream
+        stream = self._resolve_stream(stream)
         if src == self.rank:
             sendbuff = buffer_type(tensor.data_ptr())
             # NCCL requires the sender also to have a receive buffer

sglang/srt/distributed/device_communicators/pynccl_allocator.py CHANGED Viewed

@@ -5,7 +5,7 @@ from packaging import version
 from torch.cuda.memory import CUDAPluggableAllocator
 from sglang.srt.distributed.parallel_state import GroupCoordinator
-from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.server_args import get_global_server_args
 nccl_allocator_source = """
 #include <nccl.h>
@@ -32,7 +32,7 @@ _graph_pool_id = None
 def is_symmetric_memory_enabled():
-    return global_server_args_dict["enable_symm_mem"]
+    return get_global_server_args().enable_symm_mem
 def set_graph_pool_id(graph_pool_id):

sglang/srt/distributed/device_communicators/symm_mem.py CHANGED Viewed

@@ -9,7 +9,7 @@ from torch.distributed import ProcessGroup
 from sglang.srt.distributed.device_communicators.all_reduce_utils import (
     SYMM_MEM_ALL_REDUCE_MAX_SIZES,
 )
-from sglang.srt.utils import get_device_capability, is_cuda, is_hip
+from sglang.srt.utils import is_cuda, is_hip
 try:
     import torch.distributed._symmetric_memory as torch_symm_mem

sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl