sglang 0.5.2rc2__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -9
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +267 -32
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/launch_server.py +14 -0
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +360 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/model_config.py +181 -82
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +71 -19
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +326 -53
- sglang/srt/disaggregation/prefill.py +36 -17
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +156 -80
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +855 -0
- sglang/srt/entrypoints/grpc_server.py +810 -0
- sglang/srt/entrypoints/http_server.py +130 -59
- sglang/srt/entrypoints/openai/protocol.py +112 -4
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +204 -55
- sglang/srt/entrypoints/openai/serving_completions.py +14 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +48 -3
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/environ.py +285 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +9 -2
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +206 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +41 -8
- sglang/srt/layers/attention/flashinfer_backend.py +118 -198
- sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
- sglang/srt/layers/attention/mamba/mamba.py +629 -0
- sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +44 -12
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
- sglang/srt/layers/moe/ep_moe/layer.py +256 -63
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +35 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +22 -6
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +78 -49
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +190 -55
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +74 -42
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +26 -17
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +52 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +199 -301
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +53 -0
- sglang/srt/managers/schedule_batch.py +240 -138
- sglang/srt/managers/schedule_policy.py +144 -17
- sglang/srt/managers/scheduler.py +502 -209
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +320 -632
- sglang/srt/managers/tp_worker.py +81 -22
- sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +14 -20
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -1
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +43 -24
- sglang/srt/mem_cache/hiradix_cache.py +222 -75
- sglang/srt/mem_cache/memory_pool.py +535 -58
- sglang/srt/mem_cache/memory_pool_host.py +239 -228
- sglang/srt/mem_cache/radix_cache.py +222 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
- sglang/srt/mem_cache/swa_radix_cache.py +25 -36
- sglang/srt/metrics/collector.py +511 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +82 -40
- sglang/srt/model_executor/model_runner.py +432 -157
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +133 -5
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +158 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +607 -130
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +576 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +4 -4
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mllama4.py +40 -4
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +120 -13
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +32 -4
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +51 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +38 -17
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +966 -267
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +99 -28
- sglang/srt/speculative/ngram_utils.py +428 -0
- sglang/srt/speculative/ngram_worker.py +245 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +606 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +8 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +433 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/run_eval.py +79 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_deterministic.py +297 -0
- sglang/test/test_disaggregation_utils.py +77 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +383 -5
- sglang/utils.py +21 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +375 -245
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0
sglang/srt/distributed/parallel_state.py
CHANGED
@@ -4,7 +4,7 @@
 # Adapted from
 # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
 # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-"""
+"""Distributed state.
 It takes over the control of the distributed environment from PyTorch.
 The typical workflow is:
 
@@ -53,16 +53,26 @@ from sglang.srt.utils import (
 
 _is_npu = is_npu()
 _is_cpu = is_cpu()
+_supports_custom_op = supports_custom_op()
 
 IS_ONE_DEVICE_PER_PROCESS = get_bool_env_var("SGLANG_ONE_DEVICE_PER_PROCESS")
 
 
+TensorMetadata = namedtuple("TensorMetadata", ["device", "dtype", "size"])
+
+# use int value instead of ReduceOp.SUM to support torch compile
+REDUCE_OP_SUM = int(torch.distributed.ReduceOp.SUM)
+
+
 @dataclass
 class GraphCaptureContext:
     stream: torch.cuda.Stream if not _is_npu else torch.npu.Stream
 
 
-
+@dataclass
+class P2PWork:
+    work: Optional[torch.distributed.Work]
+    payload: Optional[torch.Tensor]
 
 
 def _split_tensor_dict(
@@ -114,7 +124,7 @@ def _register_group(group: "GroupCoordinator") -> None:
     _groups[group.unique_name] = weakref.ref(group)
 
 
-if supports_custom_op():
+if _supports_custom_op:
 
     def inplace_all_reduce(tensor: torch.Tensor, group_name: str) -> None:
         assert group_name in _groups, f"Group {group_name} is not found."
@@ -205,12 +215,14 @@ class GroupCoordinator:
     use_pynccl: bool  # a hint of whether to use PyNccl
     use_pymscclpp: bool  # a hint of whether to use PyMsccl
    use_custom_allreduce: bool  # a hint of whether to use CustomAllreduce
+    use_torch_symm_mem: bool  # a hint of whether to use SymmMemAllReduce
    use_message_queue_broadcaster: (
        bool  # a hint of whether to use message queue broadcaster
    )
    # communicators are only created for world size > 1
    pynccl_comm: Optional[Any]  # PyNccl communicator
    ca_comm: Optional[Any]  # Custom allreduce communicator
+    symm_mem_comm: Optional[Any]  # Symm mem communicator
    mq_broadcaster: Optional[Any]  # shared memory broadcaster
 
    def __init__(
@@ -221,6 +233,7 @@ class GroupCoordinator:
         use_pynccl: bool,
         use_pymscclpp: bool,
         use_custom_allreduce: bool,
+        use_torch_symm_mem: bool,
         use_hpu_communicator: bool,
         use_xpu_communicator: bool,
         use_npu_communicator: bool,
@@ -269,12 +282,13 @@ class GroupCoordinator:
         self.use_pynccl = use_pynccl
         self.use_pymscclpp = use_pymscclpp
         self.use_custom_allreduce = use_custom_allreduce
+        self.use_torch_symm_mem = use_torch_symm_mem
         self.use_hpu_communicator = use_hpu_communicator
         self.use_xpu_communicator = use_xpu_communicator
         self.use_npu_communicator = use_npu_communicator
         self.use_message_queue_broadcaster = use_message_queue_broadcaster
 
-        # lazy import to avoid documentation build error
+        # Lazy import to avoid documentation build error
         from sglang.srt.distributed.device_communicators.custom_all_reduce import (
             CustomAllreduce,
         )
@@ -284,6 +298,9 @@ class GroupCoordinator:
         from sglang.srt.distributed.device_communicators.pynccl import (
             PyNcclCommunicator,
         )
+        from sglang.srt.distributed.device_communicators.symm_mem import (
+            SymmMemCommunicator,
+        )
 
         if is_hip():
             from sglang.srt.distributed.device_communicators.quick_all_reduce import (
@@ -332,6 +349,13 @@ class GroupCoordinator:
             except Exception as e:
                 logger.warning(f"Failed to initialize QuickAllReduce: {e}")
 
+        self.symm_mem_comm: Optional[SymmMemCommunicator] = None
+        if self.use_torch_symm_mem and self.world_size > 1:
+            self.symm_mem_comm = SymmMemCommunicator(
+                group=self.cpu_group,
+                device=self.device,
+            )
+
         # Create communicator for other hardware backends
         from sglang.srt.distributed.device_communicators.hpu_communicator import (
             HpuCommunicator,
@@ -436,6 +460,7 @@ class GroupCoordinator:
         # custom allreduce  | enabled  | enabled |
         # PyNccl            | disabled | enabled |
         # PyMscclpp         | disabled | enabled |
+        # TorchSymmMem      | disabled | enabled |
         # torch.distributed | enabled  | disabled|
         #
         # Note: When custom quick allreduce is enabled, a runtime check
@@ -489,14 +514,12 @@ class GroupCoordinator:
 
         if input_.is_cpu:
             if is_shm_available(input_.dtype, self.world_size, self.local_size):
-                torch.ops.sgl_kernel.shm_allreduce(
-                    input_, torch.distributed.ReduceOp.SUM
-                )
+                torch.ops.sgl_kernel.shm_allreduce(input_, REDUCE_OP_SUM)
             else:
                 torch.distributed.all_reduce(input_, group=self.device_group)
             return input_
 
-        if not supports_custom_op():
+        if not _supports_custom_op:
             self._all_reduce_in_place(input_)
             return input_
 
@@ -522,23 +545,29 @@ class GroupCoordinator:
 
         outplace_all_reduce_method = None
         if (
-            self.qr_comm is not None
-            and not self.qr_comm.disabled
-            and self.qr_comm.should_quick_allreduce(input_)
-        ):
-            outplace_all_reduce_method = "qr"
-        elif (
             self.ca_comm is not None
             and not self.ca_comm.disabled
             and self.ca_comm.should_custom_ar(input_)
         ):
             outplace_all_reduce_method = "ca"
+        elif (
+            self.qr_comm is not None
+            and not self.qr_comm.disabled
+            and self.qr_comm.should_quick_allreduce(input_)
+        ):
+            outplace_all_reduce_method = "qr"
         elif (
             self.pymscclpp_comm is not None
             and not self.pymscclpp_comm.disabled
             and self.pymscclpp_comm.should_mscclpp_allreduce(input_)
         ):
             outplace_all_reduce_method = "pymscclpp"
+        elif (
+            self.symm_mem_comm is not None
+            and not self.symm_mem_comm.disabled
+            and self.symm_mem_comm.should_symm_mem_allreduce(input_)
+        ):
+            outplace_all_reduce_method = "symm_mem"
         if outplace_all_reduce_method is not None:
             return torch.ops.sglang.outplace_all_reduce(
                 input_,
@@ -552,16 +581,20 @@ class GroupCoordinator:
     def _all_reduce_out_place(
         self, input_: torch.Tensor, outplace_all_reduce_method: str
     ) -> torch.Tensor:
-        qr_comm = self.qr_comm
         ca_comm = self.ca_comm
+        qr_comm = self.qr_comm
         pymscclpp_comm = self.pymscclpp_comm
+        symm_mem_comm = self.symm_mem_comm
         assert any([qr_comm, ca_comm, pymscclpp_comm])
-        if outplace_all_reduce_method == "qr":
-            assert not qr_comm.disabled
-            out = qr_comm.quick_all_reduce(input_)
-        elif outplace_all_reduce_method == "ca":
+        if outplace_all_reduce_method == "ca":
             assert not ca_comm.disabled
             out = ca_comm.custom_all_reduce(input_)
+        elif outplace_all_reduce_method == "qr":
+            assert not qr_comm.disabled
+            out = qr_comm.quick_all_reduce(input_)
+        elif outplace_all_reduce_method == "symm_mem":
+            assert not symm_mem_comm.disabled
+            out = symm_mem_comm.all_reduce(input_)
         else:
            assert not pymscclpp_comm.disabled
            out = pymscclpp_comm.all_reduce(input_)
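
The hunks above add torch symmetric-memory all-reduce as a fourth out-of-place backend and reorder the dispatch so custom allreduce is tried before quick allreduce. The following is a minimal, self-contained sketch of that selection order; the names `_Comm` and `pick_allreduce_backend` are illustrative and not sglang APIs:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class _Comm:
    """Stand-in for the ca/qr/pymscclpp/symm-mem communicators: each can be
    disabled and only accepts inputs it knows how to handle."""
    disabled: bool
    max_bytes: int

    def should_handle(self, nbytes: int) -> bool:
        return not self.disabled and nbytes <= self.max_bytes


def pick_allreduce_backend(
    ca: Optional[_Comm],
    qr: Optional[_Comm],
    pymscclpp: Optional[_Comm],
    symm_mem: Optional[_Comm],
    nbytes: int,
) -> Optional[str]:
    # Priority after this change: custom allreduce, then quick allreduce,
    # then MSCCL++, then torch symmetric memory. None falls back to the
    # plain torch.distributed.all_reduce path.
    for name, comm in (("ca", ca), ("qr", qr), ("pymscclpp", pymscclpp), ("symm_mem", symm_mem)):
        if comm is not None and comm.should_handle(nbytes):
            return name
    return None


if __name__ == "__main__":
    ca = _Comm(disabled=True, max_bytes=1 << 20)
    symm_mem = _Comm(disabled=False, max_bytes=8 << 20)
    print(pick_allreduce_backend(ca, None, None, symm_mem, nbytes=4 << 20))  # -> "symm_mem"
```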
@@ -636,7 +669,7 @@ class GroupCoordinator:
         )
 
     def all_gather_into_tensor(self, output: torch.Tensor, input: torch.Tensor):
-        if _is_npu or not supports_custom_op():
+        if _is_npu or not _supports_custom_op:
             self._all_gather_into_tensor(output, input)
         else:
             torch.ops.sglang.reg_all_gather_into_tensor(
@@ -696,15 +729,13 @@ class GroupCoordinator:
         )
 
         # All-gather.
-        if input_.is_cpu and is_shm_available(
-            input_.dtype, self.world_size, self.local_size
-        ):
-            return torch.ops.sgl_kernel.shm_allgather(input_, dim)
-
         if input_.is_cpu:
-            torch.distributed.all_gather_into_tensor(
-                output_tensor, input_, group=self.device_group
-            )
+            if is_shm_available(input_.dtype, self.world_size, self.local_size):
+                return torch.ops.sgl_kernel.shm_allgather(input_, dim)
+            else:
+                torch.distributed.all_gather_into_tensor(
+                    output_tensor, input_, group=self.device_group
+                )
         else:
             self.all_gather_into_tensor(output_tensor, input_)
 
@@ -860,45 +891,63 @@ class GroupCoordinator:
         torch.distributed.all_gather_object(objs, obj, group=self.cpu_group)
         return objs
 
-    def send_object(self, obj: Any, dst: int) -> None:
-        """Send the input object list to the destination rank."""
-        """NOTE: `dst` is the local rank of the destination rank."""
+    def send_object(
+        self,
+        obj: Any,
+        dst: int,
+        async_send: bool = False,
+    ) -> List[P2PWork]:
+        """
+        Send the input object list to the destination rank.
+        This function uses the CPU group for all communications.
 
-        assert dst < self.world_size, f"Invalid dst rank ({dst})"
+        TODO: If you want to use GPU communication, please add a new argument (e.g., data_group, group),
+        use other functions (e.g., send), or implement a new function (e.g., send_object_device).
+
+        NOTE: `dst` is the local rank of the destination rank.
+        """
 
+        assert dst < self.world_size, f"Invalid dst rank ({dst})"
         assert dst != self.rank_in_group, (
             "Invalid destination rank. Destination rank is the same "
             "as the current rank."
         )
+        send_func = torch.distributed.isend if async_send else torch.distributed.send
 
         # Serialize object to tensor and get the size as well
-        object_tensor = torch.frombuffer(pickle.dumps(obj), dtype=torch.uint8)
-            device=torch.cuda.current_device()
-        )
-
+        object_tensor = torch.frombuffer(pickle.dumps(obj), dtype=torch.uint8)
         size_tensor = torch.tensor(
-            [object_tensor.numel()],
-            dtype=torch.long,
-            device="cpu",
+            [object_tensor.numel()], dtype=torch.long, device="cpu"
         )
+
         # Send object size
-
+        p2p_work = []
+        size_work = send_func(
+            size_tensor,
+            self.ranks[dst],
+            group=self.cpu_group,
+        )
+        if async_send:
+            p2p_work.append(P2PWork(size_work, size_tensor))
 
-
-        torch.distributed.send(
+        object_work = send_func(
             object_tensor,
-            dst=self.ranks[dst],
-            group=self.device_group,
+            self.ranks[dst],
+            group=self.cpu_group,
         )
+        if async_send:
+            p2p_work.append(P2PWork(object_work, object_tensor))
 
-        return
+        return p2p_work
 
-    def recv_object(self, src: int) -> Any:
+    def recv_object(
+        self,
+        src: int,
+    ) -> Any:
         """Receive the input object list from the source rank."""
         """NOTE: `src` is the local rank of the source rank."""
 
         assert src < self.world_size, f"Invalid src rank ({src})"
-
         assert (
             src != self.rank_in_group
         ), "Invalid source rank. Source rank is the same as the current rank."
@@ -906,27 +955,25 @@ class GroupCoordinator:
         size_tensor = torch.empty(1, dtype=torch.long, device="cpu")
 
         # Receive object size
-        rank_size = torch.distributed.recv(
+        # We have to use irecv here to make it work for both isend and send.
+        work = torch.distributed.irecv(
             size_tensor, src=self.ranks[src], group=self.cpu_group
         )
+        work.wait()
 
         # Tensor to receive serialized objects into.
-        object_tensor = torch.empty(  # type: ignore[call-overload]
+        object_tensor: Any = torch.empty(  # type: ignore[call-overload]
             size_tensor.item(),  # type: ignore[arg-type]
             dtype=torch.uint8,
-            device=torch.cuda.current_device(),
+            device="cpu",
         )
 
-        rank_object = torch.distributed.recv(
-            object_tensor, src=self.ranks[src], group=self.device_group
+        work = torch.distributed.irecv(
+            object_tensor, src=self.ranks[src], group=self.cpu_group
         )
+        work.wait()
 
-        assert (
-            rank_object == rank_size
-        ), "Received object sender rank does not match the size sender rank."
-
-        obj = pickle.loads(object_tensor.cpu().numpy())
-
+        obj = pickle.loads(object_tensor.numpy())
         return obj
 
     def broadcast_tensor_dict(
@@ -1016,12 +1063,13 @@ class GroupCoordinator:
         tensor_dict: Dict[str, Union[torch.Tensor, Any]],
         dst: Optional[int] = None,
         all_gather_group: Optional["GroupCoordinator"] = None,
-
+        async_send: bool = False,
+    ) -> Optional[List[P2PWork]]:
         """Send the input tensor dictionary.
         NOTE: `dst` is the local rank of the source rank.
         """
         # Bypass the function if we are using only 1 GPU.
-        if
+        if self.world_size == 1:
             return tensor_dict
 
         all_gather_size = 1 if all_gather_group is None else all_gather_group.world_size
@@ -1046,7 +1094,10 @@ class GroupCoordinator:
         # 1. Superior D2D transfer bandwidth
         # 2. Ability to overlap send and recv operations
         # Thus the net performance gain justifies this approach.
-        self.send_object(metadata_list, dst=dst)
+
+        send_func = torch.distributed.isend if async_send else torch.distributed.send
+        p2p_works = self.send_object(metadata_list, dst=dst, async_send=async_send)
+
         for tensor in tensor_list:
             if tensor.numel() == 0:
                 # Skip sending empty tensors.
@@ -1056,15 +1107,11 @@ class GroupCoordinator:
             if all_gather_group is not None and tensor.numel() % all_gather_size == 0:
                 tensor = tensor.reshape(all_gather_size, -1)[all_gather_rank]
 
-            if tensor.is_cpu:
-
-
-
-
-            else:
-                # use group for GPU tensors
-                torch.distributed.send(tensor, dst=self.ranks[dst], group=group)
-        return None
+            comm_group = metadata_group if tensor.is_cpu else group
+            work = send_func(tensor, self.ranks[dst], group=comm_group)
+            if async_send:
+                p2p_works.append(P2PWork(work, tensor))
+        return p2p_works
 
     def recv_tensor_dict(
         self,
@@ -1110,17 +1157,15 @@ class GroupCoordinator:
                 orig_shape = tensor.shape
                 tensor = tensor.reshape(all_gather_size, -1)[all_gather_rank]
 
-
-
-
-
-
-
-
-                torch.distributed.recv(tensor, src=self.ranks[src], group=group)
+            # We have to use irecv here to make it work for both isend and send.
+            comm_group = metadata_group if tensor.is_cpu else group
+            work = torch.distributed.irecv(
+                tensor, src=self.ranks[src], group=comm_group
+            )
+            work.wait()
+
             if use_all_gather:
-
-                tensor = all_gather_group.all_gather(tensor, dim=0)  # type: ignore
+                tensor = all_gather_group.all_gather(tensor, dim=0)
                 tensor = tensor.reshape(orig_shape)
 
             tensor_dict[key] = tensor
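
The send/recv rewrites above switch the object and tensor-dict paths to `torch.distributed.isend`/`irecv` when `async_send=True` and return `P2PWork(work, payload)` objects so the caller can keep each payload tensor alive until the transfer completes. A hedged, self-contained sketch of that pairing using plain torch.distributed on a two-rank gloo group (not sglang's `GroupCoordinator`; the file name in the comment is illustrative):

```python
# Run with: torchrun --nproc_per_node=2 p2p_sketch.py
from dataclasses import dataclass
from typing import Optional

import torch
import torch.distributed as dist


@dataclass
class P2PWork:  # same shape as the dataclass added in the diff above
    work: Optional[torch.distributed.Work]
    payload: Optional[torch.Tensor]


def main() -> None:
    dist.init_process_group(backend="gloo")
    if dist.get_rank() == 0:
        payload = torch.arange(8, dtype=torch.long)
        pending = P2PWork(dist.isend(payload, dst=1), payload)
        # ... overlap other work here; `payload` must stay referenced ...
        pending.work.wait()
    else:
        buf = torch.empty(8, dtype=torch.long)
        # irecv pairs with both send and isend, which is why the new code uses it.
        dist.irecv(buf, src=0).wait()
        print("rank 1 received:", buf.tolist())
    dist.destroy_process_group()


if __name__ == "__main__":
    main()
```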
@@ -1198,6 +1243,7 @@ def init_world_group(
         use_pynccl=False,
         use_pymscclpp=False,
         use_custom_allreduce=False,
+        use_torch_symm_mem=False,
         use_hpu_communicator=False,
         use_xpu_communicator=False,
         use_npu_communicator=False,
@@ -1213,11 +1259,14 @@ def init_model_parallel_group(
     use_message_queue_broadcaster: bool = False,
     group_name: Optional[str] = None,
     use_mscclpp_allreduce: Optional[bool] = None,
+    use_symm_mem_allreduce: Optional[bool] = None,
 ) -> GroupCoordinator:
     if use_custom_allreduce is None:
         use_custom_allreduce = _ENABLE_CUSTOM_ALL_REDUCE
     if use_mscclpp_allreduce is None:
         use_mscclpp_allreduce = _ENABLE_MSCCLPP_ALL_REDUCE
+    if use_symm_mem_allreduce is None:
+        use_symm_mem_allreduce = _ENABLE_SYMM_MEM_ALL_REDUCE
     return GroupCoordinator(
         group_ranks=group_ranks,
         local_rank=local_rank,
@@ -1225,6 +1274,7 @@ def init_model_parallel_group(
         use_pynccl=not _is_npu,
         use_pymscclpp=use_mscclpp_allreduce,
         use_custom_allreduce=use_custom_allreduce,
+        use_torch_symm_mem=use_symm_mem_allreduce,
         use_hpu_communicator=True,
         use_xpu_communicator=True,
         use_npu_communicator=True,
@@ -1310,6 +1360,7 @@ logger = logging.getLogger(__name__)
 
 _ENABLE_CUSTOM_ALL_REDUCE = True
 _ENABLE_MSCCLPP_ALL_REDUCE = False
+_ENABLE_SYMM_MEM_ALL_REDUCE = False
 
 
 def set_custom_all_reduce(enable: bool):
@@ -1322,6 +1373,11 @@ def set_mscclpp_all_reduce(enable: bool):
     _ENABLE_MSCCLPP_ALL_REDUCE = enable
 
 
+def set_symm_mem_all_reduce(enable: bool):
+    global _ENABLE_SYMM_MEM_ALL_REDUCE
+    _ENABLE_SYMM_MEM_ALL_REDUCE = enable
+
+
 def init_distributed_environment(
     world_size: int = -1,
     rank: int = -1,
@@ -1586,6 +1642,16 @@ def patch_tensor_parallel_group(tp_group: GroupCoordinator):
         _TP = old_tp_group
 
 
+def get_world_size():
+    """Return world size for the world group."""
+    return get_world_group().world_size
+
+
+def get_world_rank():
+    """Return my rank for the world group."""
+    return get_world_group().rank_in_group
+
+
 def get_tensor_model_parallel_world_size():
     """Return world size for the tensor model parallel group."""
     return get_tp_group().world_size
@@ -1596,6 +1662,16 @@ def get_tensor_model_parallel_rank():
     return get_tp_group().rank_in_group
 
 
+def get_pipeline_model_parallel_world_size():
+    """Return world size for the pipeline model parallel group."""
+    return get_pp_group().world_size
+
+
+def get_pipeline_model_parallel_rank():
+    """Return my rank for the pipeline model parallel group."""
+    return get_pp_group().rank_in_group
+
+
 def get_moe_expert_parallel_world_size():
     """Return world size for the moe expert parallel group."""
     return get_moe_ep_group().world_size
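
The new `set_symm_mem_all_reduce` toggle works like the existing `set_custom_all_reduce`/`set_mscclpp_all_reduce` switches: it changes the module-level default that `init_model_parallel_group` falls back to when `use_symm_mem_allreduce` is left as `None`. A hedged sketch of that "explicit argument wins, otherwise module default" pattern (generic Python, not the sglang module itself):

```python
from typing import Optional

_ENABLE_SYMM_MEM_ALL_REDUCE = False  # module-level default, as in the diff


def set_symm_mem_all_reduce(enable: bool) -> None:
    global _ENABLE_SYMM_MEM_ALL_REDUCE
    _ENABLE_SYMM_MEM_ALL_REDUCE = enable


def resolve_symm_mem(use_symm_mem_allreduce: Optional[bool]) -> bool:
    # init_model_parallel_group only consults the global when the caller
    # did not pass an explicit value.
    if use_symm_mem_allreduce is None:
        return _ENABLE_SYMM_MEM_ALL_REDUCE
    return use_symm_mem_allreduce


if __name__ == "__main__":
    set_symm_mem_all_reduce(True)
    print(resolve_symm_mem(None))   # True: falls back to the module default
    print(resolve_symm_mem(False))  # False: explicit argument wins
```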
sglang/srt/entrypoints/engine.py
CHANGED
@@ -33,6 +33,8 @@ import zmq
 import zmq.asyncio
 from PIL.Image import Image
 
+from sglang.srt.tracing.trace import process_tracing_init, trace_set_thread_info
+
 # Fix a bug of Python threading
 setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
 
@@ -45,6 +47,7 @@ from sglang.srt.managers.data_parallel_controller import (
 )
 from sglang.srt.managers.detokenizer_manager import run_detokenizer_process
 from sglang.srt.managers.io_struct import (
+    DestroyWeightsUpdateGroupReqInput,
     EmbeddingReqInput,
     GenerateReqInput,
     GetWeightsByNameReqInput,
@@ -138,6 +141,12 @@ class Engine(EngineBase):
             context, zmq.DEALER, self.port_args.rpc_ipc_name, True
         )
 
+        if server_args.enable_trace:
+            process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
+            if server_args.disaggregation_mode == "null":
+                thread_label = "Tokenizer"
+                trace_set_thread_info(thread_label)
+
     def generate(
         self,
         # The input prompt. It can be a single prompt or a batch of prompts.
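
With these hooks, the engine initializes an OTLP trace exporter at startup when tracing is enabled. A hedged usage sketch, assuming `Engine` forwards these keyword arguments to `ServerArgs` and that an OTLP collector is listening on the given endpoint (model path and endpoint are illustrative):

```python
import sglang as sgl

if __name__ == "__main__":
    engine = sgl.Engine(
        model_path="Qwen/Qwen2.5-0.5B-Instruct",        # any local or HF model path
        enable_trace=True,                              # ServerArgs field used in the diff above
        oltp_traces_endpoint="http://127.0.0.1:4317",   # OTLP gRPC collector (assumption)
    )
    print(engine.generate("Hello", {"max_new_tokens": 8}))
    engine.shutdown()
```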
@@ -364,9 +373,9 @@ class Engine(EngineBase):
         loop = asyncio.get_event_loop()
         return loop.run_until_complete(self.tokenizer_manager.flush_cache())
 
-    def start_profile(self):
+    def start_profile(self, **kwargs):
         loop = asyncio.get_event_loop()
-        loop.run_until_complete(self.tokenizer_manager.start_profile())
+        loop.run_until_complete(self.tokenizer_manager.start_profile(**kwargs))
 
     def stop_profile(self):
         loop = asyncio.get_event_loop()
@@ -425,6 +434,19 @@ class Engine(EngineBase):
             self.tokenizer_manager.init_weights_update_group(obj, None)
         )
 
+    def destroy_weights_update_group(
+        self,
+        group_name: str,
+    ):
+        """Destroy parameter update group."""
+        obj = DestroyWeightsUpdateGroupReqInput(
+            group_name=group_name,
+        )
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(
+            self.tokenizer_manager.destroy_weights_update_group(obj, None)
+        )
+
     def update_weights_from_distributed(
         self,
         names: list[str],
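
The new `destroy_weights_update_group` entry point mirrors `init_weights_update_group`, letting RLHF-style trainers tear down the process group they created for weight broadcasts. A hedged usage sketch (the group name and model path are illustrative, and the teardown call only makes sense after a matching init):

```python
import sglang as sgl

if __name__ == "__main__":
    engine = sgl.Engine(model_path="Qwen/Qwen2.5-0.5B-Instruct")
    # ... earlier: engine.init_weights_update_group(...) and repeated
    # engine.update_weights_from_distributed(...) calls during training ...
    engine.destroy_weights_update_group(group_name="rl_weight_sync")
    engine.shutdown()
```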
@@ -655,7 +677,15 @@ def _set_envs_and_config(server_args: ServerArgs):
     os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4"
     os.environ["CUDA_MODULE_LOADING"] = "AUTO"
     # flashinfer uses this environment variable for various kernels from MoE to quant kernels
-    os.environ["TRTLLM_ENABLE_PDL"] = "1"
+    if os.environ.get("TRTLLM_ENABLE_PDL", "1") != "0":
+        os.environ["TRTLLM_ENABLE_PDL"] = "1"
+
+    if os.environ.get("CUTE_DSL_LOG_LEVEL") is None:
+        # Default to warning level, to avoid too many logs
+        os.environ["CUTE_DSL_LOG_LEVEL"] = "30"
+    if os.environ.get("CUTE_DSL_LOG_TO_CONSOLE") is None:
+        # Need to set log to console, otherwise the log level won't take effect
+        os.environ["CUTE_DSL_LOG_TO_CONSOLE"] = "1"
 
     # Can also be passed as argument
     os.environ["SGLANG_RUN_ID"] = (
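
The block above now respects explicit user overrides: `TRTLLM_ENABLE_PDL` is only forced to `"1"` when the user has not set it to `"0"`, and the CUTE DSL logging variables are only set when absent. A small sketch of the same guard pattern (generic Python, not sglang code):

```python
import os


def set_default_env(name: str, value: str) -> None:
    # Only set the variable when the user has not already chosen a value.
    if os.environ.get(name) is None:
        os.environ[name] = value


if __name__ == "__main__":
    os.environ["CUTE_DSL_LOG_LEVEL"] = "10"          # pretend the user overrode it
    set_default_env("CUTE_DSL_LOG_LEVEL", "30")      # respected: stays "10"
    set_default_env("CUTE_DSL_LOG_TO_CONSOLE", "1")  # absent: defaults to "1"
    print(os.environ["CUTE_DSL_LOG_LEVEL"], os.environ["CUTE_DSL_LOG_TO_CONSOLE"])
```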
@@ -673,7 +703,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.
+            "0.4.0rc3",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
@@ -681,7 +711,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
         assert_pkg_version(
             "sgl-kernel",
-            "0.3.
+            "0.3.14",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )
 
@@ -703,6 +733,24 @@ def _set_envs_and_config(server_args: ServerArgs):
     mp.set_start_method("spawn", force=True)
 
 
+def _init_tokenizer_manager(
+    server_args: ServerArgs, port_args: PortArgs
+) -> TokenizerManager:
+    # Launch tokenizer process
+    tokenizer_manager = TokenizerManager(server_args, port_args)
+
+    # Initialize templates
+    template_manager = TemplateManager()
+    template_manager.initialize_templates(
+        tokenizer_manager=tokenizer_manager,
+        model_path=server_args.model_path,
+        chat_template=server_args.chat_template,
+        completion_template=server_args.completion_template,
+    )
+
+    return tokenizer_manager, template_manager
+
+
 def _launch_subprocesses(
     server_args: ServerArgs, port_args: Optional[PortArgs] = None
 ) -> Tuple[TokenizerManager, TemplateManager, Dict]:
@@ -764,7 +812,6 @@ def _launch_subprocesses(
                 pp_rank,
                 None,
                 writer,
-                None,
             ),
         )
 
@@ -815,23 +862,15 @@ def _launch_subprocesses(
         ),
     )
     detoken_proc.start()
+
+    # Init tokenizer manager first, as the bootstrap server is initialized here
    if server_args.tokenizer_worker_num > 1:
        # Launch multi-tokenizer router
        tokenizer_manager = MultiTokenizerRouter(server_args, port_args)
-
-        # Initialize templates
        template_manager = None
    else:
-        # Launch tokenizer process
-        tokenizer_manager = TokenizerManager(server_args, port_args)
-
-        # Initialize templates
-        template_manager = TemplateManager()
-        template_manager.initialize_templates(
-            tokenizer_manager=tokenizer_manager,
-            model_path=server_args.model_path,
-            chat_template=server_args.chat_template,
-            completion_template=server_args.completion_template,
+        tokenizer_manager, template_manager = _init_tokenizer_manager(
+            server_args, port_args
        )
 
    # Wait for the model to finish loading
@@ -855,5 +894,7 @@ def _launch_subprocesses(
 
    # Assume all schedulers have the same scheduler_info
    scheduler_info = scheduler_infos[0]
+
    tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]
+
    return tokenizer_manager, template_manager, scheduler_info