sglang 0.5.3rc0__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl
This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -9
- sglang/bench_one_batch_server.py +321 -31
- sglang/bench_serving.py +10 -3
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/launch_server.py +14 -0
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/falcon_h1.py +360 -0
- sglang/srt/configs/load_config.py +8 -0
- sglang/srt/configs/model_config.py +160 -105
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/constrained/base_grammar_backend.py +1 -0
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +6 -4
- sglang/srt/debug_utils/dumper.py +10 -3
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/common/conn.py +266 -98
- sglang/srt/disaggregation/decode.py +50 -9
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
- sglang/srt/disaggregation/mooncake/conn.py +51 -541
- sglang/srt/disaggregation/nixl/conn.py +148 -39
- sglang/srt/disaggregation/prefill.py +31 -14
- sglang/srt/disaggregation/utils.py +36 -5
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +135 -80
- sglang/srt/entrypoints/engine.py +23 -3
- sglang/srt/entrypoints/grpc_request_manager.py +330 -55
- sglang/srt/entrypoints/grpc_server.py +232 -102
- sglang/srt/entrypoints/http_server.py +49 -9
- sglang/srt/entrypoints/openai/protocol.py +110 -5
- sglang/srt/entrypoints/openai/serving_base.py +25 -6
- sglang/srt/entrypoints/openai/serving_chat.py +178 -49
- sglang/srt/entrypoints/openai/serving_completions.py +5 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
- sglang/srt/entrypoints/openai/serving_responses.py +42 -0
- sglang/srt/environ.py +285 -0
- sglang/srt/eplb/expert_location.py +30 -5
- sglang/srt/function_call/function_call_parser.py +3 -2
- sglang/srt/function_call/glm4_moe_detector.py +3 -3
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +73 -68
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +60 -53
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +3 -0
- sglang/srt/layers/activation.py +7 -6
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +108 -9
- sglang/srt/layers/attention/attention_registry.py +206 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
- sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
- sglang/srt/layers/attention/flashattention_backend.py +41 -8
- sglang/srt/layers/attention/flashinfer_backend.py +112 -194
- sglang/srt/layers/attention/flashinfer_mla_backend.py +11 -15
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +11 -3
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +72 -72
- sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +15 -98
- sglang/srt/layers/attention/mamba/mamba.py +566 -1
- sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/triton_backend.py +42 -9
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +178 -34
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +11 -1
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +2 -0
- sglang/srt/layers/linear.py +21 -4
- sglang/srt/layers/logits_processor.py +15 -2
- sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
- sglang/srt/layers/moe/ep_moe/layer.py +147 -74
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +6 -2
- sglang/srt/layers/moe/fused_moe_triton/layer.py +11 -12
- sglang/srt/layers/moe/token_dispatcher/deepep.py +77 -19
- sglang/srt/layers/moe/utils.py +10 -0
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/fp8.py +2 -2
- sglang/srt/layers/quantization/fp8_utils.py +1 -1
- sglang/srt/layers/quantization/modelopt_quant.py +44 -9
- sglang/srt/layers/quantization/mxfp4.py +12 -4
- sglang/srt/layers/quantization/quark/quark_moe.py +16 -3
- sglang/srt/layers/quantization/w4afp8.py +0 -4
- sglang/srt/layers/quantization/w8a8_int8.py +15 -3
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +52 -4
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +3 -3
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +10 -4
- sglang/srt/lora/lora.py +7 -5
- sglang/srt/lora/lora_manager.py +17 -6
- sglang/srt/lora/mem_pool.py +1 -1
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +7 -5
- sglang/srt/managers/cache_controller.py +42 -142
- sglang/srt/managers/data_parallel_controller.py +11 -46
- sglang/srt/managers/detokenizer_manager.py +11 -11
- sglang/srt/managers/io_struct.py +162 -118
- sglang/srt/managers/mm_utils.py +43 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +17 -17
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +53 -0
- sglang/srt/managers/schedule_batch.py +167 -86
- sglang/srt/managers/schedule_policy.py +143 -16
- sglang/srt/managers/scheduler.py +359 -214
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +98 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +21 -12
- sglang/srt/managers/scheduler_profiler_mixin.py +5 -5
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +111 -5
- sglang/srt/managers/tokenizer_manager.py +84 -136
- sglang/srt/managers/tp_worker.py +39 -29
- sglang/srt/managers/tp_worker_overlap_thread.py +33 -41
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +14 -20
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -1
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +40 -1
- sglang/srt/mem_cache/hiradix_cache.py +119 -32
- sglang/srt/mem_cache/memory_pool.py +188 -10
- sglang/srt/mem_cache/memory_pool_host.py +134 -182
- sglang/srt/mem_cache/radix_cache.py +222 -71
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +173 -58
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +10 -6
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +117 -10
- sglang/srt/mem_cache/swa_radix_cache.py +25 -34
- sglang/srt/metrics/collector.py +82 -120
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +2 -2
- sglang/srt/model_executor/cuda_graph_runner.py +39 -32
- sglang/srt/model_executor/forward_batch_info.py +23 -38
- sglang/srt/model_executor/model_runner.py +131 -183
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/loader.py +14 -10
- sglang/srt/model_loader/weight_utils.py +156 -2
- sglang/srt/models/bailing_moe.py +27 -4
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +536 -153
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/falcon_h1.py +576 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +1 -1
- sglang/srt/models/glm4_moe.py +3 -3
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +1 -1
- sglang/srt/models/glm4v_moe.py +1 -1
- sglang/srt/models/gpt_oss.py +7 -30
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/longcat_flash.py +1 -1
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mllama4.py +15 -4
- sglang/srt/models/qwen2.py +0 -7
- sglang/srt/models/qwen2_5_vl.py +2 -2
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +64 -1
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +31 -3
- sglang/srt/models/qwen3_next.py +36 -9
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +51 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +2 -3
- sglang/srt/multimodal/processors/internvl.py +20 -8
- sglang/srt/multimodal/processors/qwen_vl.py +8 -1
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +20 -2
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +753 -295
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +2 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +3 -1
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -755
- sglang/srt/speculative/eagle_worker.py +57 -25
- sglang/srt/speculative/ngram_utils.py +428 -0
- sglang/srt/speculative/ngram_worker.py +245 -0
- sglang/srt/speculative/spec_info.py +47 -0
- sglang/srt/speculative/spec_utils.py +606 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +32 -6
- sglang/srt/two_batch_overlap.py +8 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +399 -74
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +49 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/run_eval.py +79 -11
- sglang/test/runners.py +1 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_deterministic.py +297 -0
- sglang/test/test_disaggregation_utils.py +12 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +355 -4
- sglang/utils.py +10 -1
- sglang/version.py +1 -1
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +34 -25
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +281 -210
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0
sglang/srt/distributed/device_communicators/symm_mem.py
ADDED
@@ -0,0 +1,164 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/bf214ca22625e311a2c4c0dfbf7af19128f4919c/vllm/distributed/device_communicators/symm_mem.py
+import logging
+from typing import Optional, Union
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+from sglang.srt.distributed.device_communicators.all_reduce_utils import (
+    SYMM_MEM_ALL_REDUCE_MAX_SIZES,
+)
+from sglang.srt.utils import get_device_capability, is_cuda, is_hip
+
+try:
+    import torch.distributed._symmetric_memory as torch_symm_mem
+
+    symm_mem_available = True
+except ImportError:
+    symm_mem_available = False
+
+
+logger = logging.getLogger(__name__)
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()
+
+symm_mem_is_available = False
+if _is_hip:
+    symm_mem_is_available = False
+if _is_cuda:
+    symm_mem_is_available = True
+
+
+class SymmMemCommunicator:
+    """
+    Thin wrapper around symmetric-memory collectives.
+
+    This communicator:
+      - Validates device capability and world size.
+      - Allocates a shared symmetric buffer.
+      - Chooses between 'multimem' and 'two-shot' all-reduce kernels.
+      - Exposes a fast-path all_reduce() compatible with bfloat16 inputs.
+
+    If any prerequisite is not met, the instance remains disabled and will
+    decline to perform symmetric-memory all-reduce.
+    """
+
+    # Mapping: compute capability major -> supported world sizes for multimem
+    # If the current (cc_major, world_size) is not listed, we fall back
+    # to the two-shot path.
+    _WORLD_SIZES_MULTIMEM = {
+        9: [4, 6, 8],
+        10: [6, 8],
+    }
+
+    def __init__(self, group: ProcessGroup, device: Union[int, str, torch.device]):
+        """
+        Args:
+            group: Torch process group used for rendezvous and naming.
+            device: Target CUDA device (index, 'cuda:X', or torch.device).
+        """
+
+        self.disabled = True
+
+        if not symm_mem_available:
+            return
+
+        if isinstance(device, int):
+            device = torch.device(f"cuda:{device}")
+        elif isinstance(device, str):
+            device = torch.device(device)
+        torch.cuda.set_device(device)
+        self.dtype = torch.bfloat16
+        self.device = device
+        self.group = group
+        self.world_size = dist.get_world_size(self.group)
+        self.device_capability = torch.cuda.get_device_capability(device)[0]
+        if self.device_capability < 9:
+            logger.warning(
+                "SymmMemCommunicator: Device capability %s not supported, "
+                "communicator is not available.",
+                self.device_capability,
+            )
+            return
+        if self.world_size not in SYMM_MEM_ALL_REDUCE_MAX_SIZES[self.device_capability]:
+            logger.warning(
+                "SymmMemCommunicator: World size %d not supported, "
+                "communicator is not available.",
+                self.world_size,
+            )
+            return
+        self.max_size = SYMM_MEM_ALL_REDUCE_MAX_SIZES[self.device_capability][
+            self.world_size
+        ]
+        self.buffer = torch_symm_mem.empty(
+            self.max_size // self.dtype.itemsize,
+            device=self.device,
+            dtype=self.dtype,
+        )
+        handle = torch_symm_mem.rendezvous(self.buffer, self.group.group_name)
+        if handle.multicast_ptr == 0:
+            logger.warning(
+                "SymmMemCommunicator: symmetric memory "
+                "multicast operations are not supported."
+            )
+            self.buffer = None
+            self.disabled = True
+            return
+        self.disabled = False
+
+    def should_symm_mem_allreduce(self, inp: torch.Tensor):
+        """
+        Fast-path eligibility check for a given tensor.
+
+        Conditions:
+          - Communicator must be enabled.
+          - dtype must be bfloat16 (matches kernel + buffer dtype).
+          - Total byte size must be 4-byte aligned (hardware requirement).
+          - Payload must be smaller than the symmetric-memory max size.
+
+        Returns:
+            True if the symmetric-memory path can handle this tensor.
+        """
+        if self.disabled:
+            return False
+        if inp.dtype != self.dtype:
+            return False
+        inp_size = inp.numel() * inp.element_size()
+        # enforce 4-byte alignment
+        if inp_size % 4 != 0:
+            return False
+        return inp_size < self.max_size
+
+    def all_reduce(
+        self, inp: torch.Tensor, *, out: Optional[torch.Tensor] = None
+    ) -> Optional[torch.Tensor]:
+        """
+        Perform an in-place sum all-reduce via symmetric memory.
+
+        Args:
+            inp: Input tensor on the target CUDA device (bfloat16).
+            out: Optional output tensor; if omitted, a new tensor is allocated.
+
+        Returns:
+            The reduced tensor (same shape as inp), or None if disabled.
+
+        Implementation details:
+          - Stages 'inp' into the symmetric buffer.
+          - Selects 'multimem' or 'two_shot' kernel based on topology.
+          - Writes the result into 'out' and returns it.
+        """
+        if out is None:
+            out = torch.empty_like(inp)
+        self.buffer[: inp.numel()].copy_(inp.view(-1))
+        if self.world_size in self._WORLD_SIZES_MULTIMEM[self.device_capability]:
+            torch.ops.symm_mem.multimem_all_reduce_(
+                self.buffer[: inp.numel()], "sum", self.group.group_name
+            )
+        else:
+            torch.ops.symm_mem.two_shot_all_reduce_(
+                self.buffer[: inp.numel()], "sum", self.group.group_name
+            )
+        out.copy_(self.buffer[: inp.numel()].view(out.shape))
+        return out
sglang/srt/distributed/parallel_state.py
CHANGED
@@ -4,7 +4,7 @@
 # Adapted from
 # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
 # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-"""
+"""Distributed state.
 It takes over the control of the distributed environment from PyTorch.
 The typical workflow is:

@@ -53,19 +53,26 @@ from sglang.srt.utils import (

 _is_npu = is_npu()
 _is_cpu = is_cpu()
+_supports_custom_op = supports_custom_op()

 IS_ONE_DEVICE_PER_PROCESS = get_bool_env_var("SGLANG_ONE_DEVICE_PER_PROCESS")


+TensorMetadata = namedtuple("TensorMetadata", ["device", "dtype", "size"])
+
+# use int value instead of ReduceOp.SUM to support torch compile
+REDUCE_OP_SUM = int(torch.distributed.ReduceOp.SUM)
+
+
 @dataclass
 class GraphCaptureContext:
     stream: torch.cuda.Stream if not _is_npu else torch.npu.Stream


-
-
-
-
+@dataclass
+class P2PWork:
+    work: Optional[torch.distributed.Work]
+    payload: Optional[torch.Tensor]


 def _split_tensor_dict(
@@ -117,7 +124,7 @@ def _register_group(group: "GroupCoordinator") -> None:
     _groups[group.unique_name] = weakref.ref(group)


-if
+if _supports_custom_op:

     def inplace_all_reduce(tensor: torch.Tensor, group_name: str) -> None:
         assert group_name in _groups, f"Group {group_name} is not found."
@@ -208,12 +215,14 @@ class GroupCoordinator:
     use_pynccl: bool  # a hint of whether to use PyNccl
     use_pymscclpp: bool  # a hint of whether to use PyMsccl
     use_custom_allreduce: bool  # a hint of whether to use CustomAllreduce
+    use_torch_symm_mem: bool  # a hint of whether to use SymmMemAllReduce
     use_message_queue_broadcaster: (
         bool  # a hint of whether to use message queue broadcaster
     )
     # communicators are only created for world size > 1
     pynccl_comm: Optional[Any]  # PyNccl communicator
     ca_comm: Optional[Any]  # Custom allreduce communicator
+    symm_mem_comm: Optional[Any]  # Symm mem communicator
     mq_broadcaster: Optional[Any]  # shared memory broadcaster

     def __init__(
@@ -224,6 +233,7 @@ class GroupCoordinator:
         use_pynccl: bool,
         use_pymscclpp: bool,
         use_custom_allreduce: bool,
+        use_torch_symm_mem: bool,
         use_hpu_communicator: bool,
         use_xpu_communicator: bool,
         use_npu_communicator: bool,
@@ -272,12 +282,13 @@ class GroupCoordinator:
         self.use_pynccl = use_pynccl
         self.use_pymscclpp = use_pymscclpp
         self.use_custom_allreduce = use_custom_allreduce
+        self.use_torch_symm_mem = use_torch_symm_mem
         self.use_hpu_communicator = use_hpu_communicator
         self.use_xpu_communicator = use_xpu_communicator
         self.use_npu_communicator = use_npu_communicator
         self.use_message_queue_broadcaster = use_message_queue_broadcaster

-        #
+        # Lazy import to avoid documentation build error
         from sglang.srt.distributed.device_communicators.custom_all_reduce import (
             CustomAllreduce,
         )
@@ -287,6 +298,9 @@ class GroupCoordinator:
         from sglang.srt.distributed.device_communicators.pynccl import (
             PyNcclCommunicator,
         )
+        from sglang.srt.distributed.device_communicators.symm_mem import (
+            SymmMemCommunicator,
+        )

         if is_hip():
             from sglang.srt.distributed.device_communicators.quick_all_reduce import (
@@ -335,6 +349,13 @@ class GroupCoordinator:
         except Exception as e:
             logger.warning(f"Failed to initialize QuickAllReduce: {e}")

+        self.symm_mem_comm: Optional[SymmMemCommunicator] = None
+        if self.use_torch_symm_mem and self.world_size > 1:
+            self.symm_mem_comm = SymmMemCommunicator(
+                group=self.cpu_group,
+                device=self.device,
+            )
+
         # Create communicator for other hardware backends
         from sglang.srt.distributed.device_communicators.hpu_communicator import (
             HpuCommunicator,
@@ -439,6 +460,7 @@ class GroupCoordinator:
         # custom allreduce  | enabled  | enabled |
         # PyNccl            | disabled | enabled |
         # PyMscclpp         | disabled | enabled |
+        # TorchSymmMem      | disabled | enabled |
         # torch.distributed | enabled  | disabled|
         #
         # Note: When custom quick allreduce is enabled, a runtime check
@@ -497,7 +519,7 @@ class GroupCoordinator:
             torch.distributed.all_reduce(input_, group=self.device_group)
             return input_

-        if not
+        if not _supports_custom_op:
             self._all_reduce_in_place(input_)
             return input_

@@ -523,23 +545,29 @@ class GroupCoordinator:

         outplace_all_reduce_method = None
         if (
-            self.qr_comm is not None
-            and not self.qr_comm.disabled
-            and self.qr_comm.should_quick_allreduce(input_)
-        ):
-            outplace_all_reduce_method = "qr"
-        elif (
             self.ca_comm is not None
             and not self.ca_comm.disabled
             and self.ca_comm.should_custom_ar(input_)
         ):
             outplace_all_reduce_method = "ca"
+        elif (
+            self.qr_comm is not None
+            and not self.qr_comm.disabled
+            and self.qr_comm.should_quick_allreduce(input_)
+        ):
+            outplace_all_reduce_method = "qr"
         elif (
             self.pymscclpp_comm is not None
             and not self.pymscclpp_comm.disabled
             and self.pymscclpp_comm.should_mscclpp_allreduce(input_)
         ):
             outplace_all_reduce_method = "pymscclpp"
+        elif (
+            self.symm_mem_comm is not None
+            and not self.symm_mem_comm.disabled
+            and self.symm_mem_comm.should_symm_mem_allreduce(input_)
+        ):
+            outplace_all_reduce_method = "symm_mem"
         if outplace_all_reduce_method is not None:
             return torch.ops.sglang.outplace_all_reduce(
                 input_,
@@ -553,16 +581,20 @@ class GroupCoordinator:
     def _all_reduce_out_place(
         self, input_: torch.Tensor, outplace_all_reduce_method: str
     ) -> torch.Tensor:
-        qr_comm = self.qr_comm
         ca_comm = self.ca_comm
+        qr_comm = self.qr_comm
         pymscclpp_comm = self.pymscclpp_comm
+        symm_mem_comm = self.symm_mem_comm
         assert any([qr_comm, ca_comm, pymscclpp_comm])
-        if outplace_all_reduce_method == "
-            assert not qr_comm.disabled
-            out = qr_comm.quick_all_reduce(input_)
-        elif outplace_all_reduce_method == "ca":
+        if outplace_all_reduce_method == "ca":
             assert not ca_comm.disabled
             out = ca_comm.custom_all_reduce(input_)
+        elif outplace_all_reduce_method == "qr":
+            assert not qr_comm.disabled
+            out = qr_comm.quick_all_reduce(input_)
+        elif outplace_all_reduce_method == "symm_mem":
+            assert not symm_mem_comm.disabled
+            out = symm_mem_comm.all_reduce(input_)
         else:
             assert not pymscclpp_comm.disabled
             out = pymscclpp_comm.all_reduce(input_)
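
For readers skimming the two hunks above: after this change the out-of-place all-reduce path is tried in the order custom allreduce ("ca"), quick allreduce ("qr"), MSCCL++ ("pymscclpp"), and finally torch symmetric memory ("symm_mem"), with each backend consulted only if it is present, enabled, and willing to take the tensor; otherwise the code falls back to plain torch.distributed. A minimal standalone sketch of that first-eligible-wins selection (the Backend helper and the size thresholds are illustrative, not sglang APIs). The remaining parallel_state.py hunks continue below.

from dataclasses import dataclass
from typing import Callable, Optional


@dataclass
class Backend:
    # Illustrative stand-in for the ca/qr/pymscclpp/symm_mem communicators.
    name: str
    disabled: bool
    should_use: Callable[[int], bool]  # takes payload size in bytes


def pick_all_reduce_method(backends: list, nbytes: int) -> Optional[str]:
    # Same shape as GroupCoordinator.all_reduce: first eligible backend wins,
    # otherwise fall back to the plain torch.distributed path (None here).
    for b in backends:
        if b is not None and not b.disabled and b.should_use(nbytes):
            return b.name
    return None


backends = [
    Backend("ca", disabled=False, should_use=lambda n: n <= 512 * 1024),
    Backend("qr", disabled=True, should_use=lambda n: True),
    Backend("pymscclpp", disabled=False, should_use=lambda n: n <= 1 << 20),
    Backend("symm_mem", disabled=False, should_use=lambda n: n <= 8 << 20),
]
print(pick_all_reduce_method(backends, 256 * 1024))  # "ca"
print(pick_all_reduce_method(backends, 4 << 20))     # "symm_mem"
print(pick_all_reduce_method(backends, 64 << 20))    # None -> torch.distributed
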
@@ -637,7 +669,7 @@ class GroupCoordinator:
         )

     def all_gather_into_tensor(self, output: torch.Tensor, input: torch.Tensor):
-        if _is_npu or not
+        if _is_npu or not _supports_custom_op:
             self._all_gather_into_tensor(output, input)
         else:
             torch.ops.sglang.reg_all_gather_into_tensor(
@@ -697,15 +729,13 @@ class GroupCoordinator:
         )

         # All-gather.
-        if input_.is_cpu and is_shm_available(
-            input_.dtype, self.world_size, self.local_size
-        ):
-            return torch.ops.sgl_kernel.shm_allgather(input_, dim)
-
         if input_.is_cpu:
-
-
-
+            if is_shm_available(input_.dtype, self.world_size, self.local_size):
+                return torch.ops.sgl_kernel.shm_allgather(input_, dim)
+            else:
+                torch.distributed.all_gather_into_tensor(
+                    output_tensor, input_, group=self.device_group
+                )
         else:
             self.all_gather_into_tensor(output_tensor, input_)

@@ -861,45 +891,63 @@ class GroupCoordinator:
         torch.distributed.all_gather_object(objs, obj, group=self.cpu_group)
         return objs

-    def send_object(
-
-
+    def send_object(
+        self,
+        obj: Any,
+        dst: int,
+        async_send: bool = False,
+    ) -> List[P2PWork]:
+        """
+        Send the input object list to the destination rank.
+        This function uses the CPU group for all communications.

-
+        TODO: If you want to use GPU communication, please add a new argument (e.g., data_group, group),
+        use other functions (e.g., send), or implement a new function (e.g., send_object_device).
+
+        NOTE: `dst` is the local rank of the destination rank.
+        """

+        assert dst < self.world_size, f"Invalid dst rank ({dst})"
         assert dst != self.rank_in_group, (
             "Invalid destination rank. Destination rank is the same "
             "as the current rank."
         )
+        send_func = torch.distributed.isend if async_send else torch.distributed.send

         # Serialize object to tensor and get the size as well
-        object_tensor = torch.frombuffer(pickle.dumps(obj), dtype=torch.uint8)
-            device=torch.cuda.current_device()
-        )
-
+        object_tensor = torch.frombuffer(pickle.dumps(obj), dtype=torch.uint8)
         size_tensor = torch.tensor(
-            [object_tensor.numel()],
-            dtype=torch.long,
-            device="cpu",
+            [object_tensor.numel()], dtype=torch.long, device="cpu"
         )
+
         # Send object size
-
+        p2p_work = []
+        size_work = send_func(
+            size_tensor,
+            self.ranks[dst],
+            group=self.cpu_group,
+        )
+        if async_send:
+            p2p_work.append(P2PWork(size_work, size_tensor))

-
-        torch.distributed.send(
+        object_work = send_func(
             object_tensor,
-
-            group=self.
+            self.ranks[dst],
+            group=self.cpu_group,
         )
+        if async_send:
+            p2p_work.append(P2PWork(object_work, object_tensor))

-        return
+        return p2p_work

-    def recv_object(
+    def recv_object(
+        self,
+        src: int,
+    ) -> Any:
         """Receive the input object list from the source rank."""
         """NOTE: `src` is the local rank of the source rank."""

         assert src < self.world_size, f"Invalid src rank ({src})"
-
         assert (
             src != self.rank_in_group
         ), "Invalid source rank. Source rank is the same as the current rank."
@@ -907,27 +955,25 @@ class GroupCoordinator:
         size_tensor = torch.empty(1, dtype=torch.long, device="cpu")

         # Receive object size
-
+        # We have to use irecv here to make it work for both isend and send.
+        work = torch.distributed.irecv(
             size_tensor, src=self.ranks[src], group=self.cpu_group
         )
+        work.wait()

         # Tensor to receive serialized objects into.
-        object_tensor = torch.empty(  # type: ignore[call-overload]
+        object_tensor: Any = torch.empty(  # type: ignore[call-overload]
             size_tensor.item(),  # type: ignore[arg-type]
             dtype=torch.uint8,
-            device=
+            device="cpu",
         )

-
-            object_tensor, src=self.ranks[src], group=self.
+        work = torch.distributed.irecv(
+            object_tensor, src=self.ranks[src], group=self.cpu_group
         )
+        work.wait()

-
-            rank_object == rank_size
-        ), "Received object sender rank does not match the size sender rank."
-
-        obj = pickle.loads(object_tensor.cpu().numpy())
-
+        obj = pickle.loads(object_tensor.numpy())
         return obj

     def broadcast_tensor_dict(
@@ -1017,12 +1063,13 @@ class GroupCoordinator:
         tensor_dict: Dict[str, Union[torch.Tensor, Any]],
         dst: Optional[int] = None,
         all_gather_group: Optional["GroupCoordinator"] = None,
-
+        async_send: bool = False,
+    ) -> Optional[List[P2PWork]]:
         """Send the input tensor dictionary.
         NOTE: `dst` is the local rank of the source rank.
         """
         # Bypass the function if we are using only 1 GPU.
-        if
+        if self.world_size == 1:
             return tensor_dict

         all_gather_size = 1 if all_gather_group is None else all_gather_group.world_size
@@ -1047,7 +1094,10 @@
         # 1. Superior D2D transfer bandwidth
         # 2. Ability to overlap send and recv operations
         # Thus the net performance gain justifies this approach.
-
+
+        send_func = torch.distributed.isend if async_send else torch.distributed.send
+        p2p_works = self.send_object(metadata_list, dst=dst, async_send=async_send)
+
         for tensor in tensor_list:
             if tensor.numel() == 0:
                 # Skip sending empty tensors.
@@ -1057,15 +1107,11 @@
             if all_gather_group is not None and tensor.numel() % all_gather_size == 0:
                 tensor = tensor.reshape(all_gather_size, -1)[all_gather_rank]

-            if tensor.is_cpu
-
-
-
-
-            else:
-                # use group for GPU tensors
-                torch.distributed.send(tensor, dst=self.ranks[dst], group=group)
-        return None
+            comm_group = metadata_group if tensor.is_cpu else group
+            work = send_func(tensor, self.ranks[dst], group=comm_group)
+            if async_send:
+                p2p_works.append(P2PWork(work, tensor))
+        return p2p_works

     def recv_tensor_dict(
         self,
@@ -1111,17 +1157,15 @@
                 orig_shape = tensor.shape
                 tensor = tensor.reshape(all_gather_size, -1)[all_gather_rank]

-
-
-
-
-
-
-
-            torch.distributed.recv(tensor, src=self.ranks[src], group=group)
+            # We have to use irecv here to make it work for both isend and send.
+            comm_group = metadata_group if tensor.is_cpu else group
+            work = torch.distributed.irecv(
+                tensor, src=self.ranks[src], group=comm_group
+            )
+            work.wait()
+
             if use_all_gather:
-
-                tensor = all_gather_group.all_gather(tensor, dim=0)  # type: ignore
+                tensor = all_gather_group.all_gather(tensor, dim=0)
                 tensor = tensor.reshape(orig_shape)

             tensor_dict[key] = tensor
@@ -1199,6 +1243,7 @@ def init_world_group(
         use_pynccl=False,
         use_pymscclpp=False,
         use_custom_allreduce=False,
+        use_torch_symm_mem=False,
         use_hpu_communicator=False,
         use_xpu_communicator=False,
         use_npu_communicator=False,
@@ -1214,11 +1259,14 @@ def init_model_parallel_group(
     use_message_queue_broadcaster: bool = False,
     group_name: Optional[str] = None,
    use_mscclpp_allreduce: Optional[bool] = None,
+    use_symm_mem_allreduce: Optional[bool] = None,
 ) -> GroupCoordinator:
     if use_custom_allreduce is None:
         use_custom_allreduce = _ENABLE_CUSTOM_ALL_REDUCE
     if use_mscclpp_allreduce is None:
         use_mscclpp_allreduce = _ENABLE_MSCCLPP_ALL_REDUCE
+    if use_symm_mem_allreduce is None:
+        use_symm_mem_allreduce = _ENABLE_SYMM_MEM_ALL_REDUCE
     return GroupCoordinator(
         group_ranks=group_ranks,
         local_rank=local_rank,
@@ -1226,6 +1274,7 @@ def init_model_parallel_group(
         use_pynccl=not _is_npu,
         use_pymscclpp=use_mscclpp_allreduce,
         use_custom_allreduce=use_custom_allreduce,
+        use_torch_symm_mem=use_symm_mem_allreduce,
         use_hpu_communicator=True,
         use_xpu_communicator=True,
         use_npu_communicator=True,
@@ -1311,6 +1360,7 @@ logger = logging.getLogger(__name__)

 _ENABLE_CUSTOM_ALL_REDUCE = True
 _ENABLE_MSCCLPP_ALL_REDUCE = False
+_ENABLE_SYMM_MEM_ALL_REDUCE = False


 def set_custom_all_reduce(enable: bool):
@@ -1323,6 +1373,11 @@ def set_mscclpp_all_reduce(enable: bool):
     _ENABLE_MSCCLPP_ALL_REDUCE = enable


+def set_symm_mem_all_reduce(enable: bool):
+    global _ENABLE_SYMM_MEM_ALL_REDUCE
+    _ENABLE_SYMM_MEM_ALL_REDUCE = enable
+
+
 def init_distributed_environment(
     world_size: int = -1,
     rank: int = -1,
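
The send_object / send_tensor_dict hunks above add an async_send mode: when it is enabled, sends go through torch.distributed.isend and the pending Work handles (together with the tensors that must stay alive until completion) are returned as P2PWork entries instead of blocking, while the receiver always uses irecv plus wait() so it works against either send path. A minimal, self-contained sketch of that pattern on the CPU "gloo" backend, with a pickled object sent as a uint8 tensor preceded by its size (names here are illustrative, not sglang APIs):

import os
import pickle

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def run(rank: int, world_size: int) -> None:
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29511")
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

    if rank == 0:
        payload = {"step": 1, "note": "hello"}
        obj = torch.frombuffer(bytearray(pickle.dumps(payload)), dtype=torch.uint8)
        size = torch.tensor([obj.numel()], dtype=torch.long)
        # Async path: keep (work, tensor) pairs alive until wait(), like P2PWork.
        pending = [(dist.isend(size, dst=1), size), (dist.isend(obj, dst=1), obj)]
        for work, _tensor in pending:
            work.wait()
    else:
        size = torch.empty(1, dtype=torch.long)
        dist.irecv(size, src=0).wait()
        obj = torch.empty(int(size.item()), dtype=torch.uint8)
        dist.irecv(obj, src=0).wait()
        print("rank 1 received:", pickle.loads(obj.numpy().tobytes()))

    dist.destroy_process_group()


if __name__ == "__main__":
    mp.spawn(run, args=(2,), nprocs=2)
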
sglang/srt/entrypoints/engine.py
CHANGED
@@ -47,6 +47,7 @@ from sglang.srt.managers.data_parallel_controller import (
 )
 from sglang.srt.managers.detokenizer_manager import run_detokenizer_process
 from sglang.srt.managers.io_struct import (
+    DestroyWeightsUpdateGroupReqInput,
     EmbeddingReqInput,
     GenerateReqInput,
     GetWeightsByNameReqInput,
@@ -433,6 +434,19 @@ class Engine(EngineBase):
             self.tokenizer_manager.init_weights_update_group(obj, None)
         )

+    def destroy_weights_update_group(
+        self,
+        group_name: str,
+    ):
+        """Destroy parameter update group."""
+        obj = DestroyWeightsUpdateGroupReqInput(
+            group_name=group_name,
+        )
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(
+            self.tokenizer_manager.destroy_weights_update_group(obj, None)
+        )
+
     def update_weights_from_distributed(
         self,
         names: list[str],
@@ -666,6 +680,13 @@ def _set_envs_and_config(server_args: ServerArgs):
     if os.environ.get("TRTLLM_ENABLE_PDL", "1") != "0":
         os.environ["TRTLLM_ENABLE_PDL"] = "1"

+    if os.environ.get("CUTE_DSL_LOG_LEVEL") is None:
+        # Default to warning level, to avoid too many logs
+        os.environ["CUTE_DSL_LOG_LEVEL"] = "30"
+    if os.environ.get("CUTE_DSL_LOG_TO_CONSOLE") is None:
+        # Need to set log to console, otherwise the log level won't take effect
+        os.environ["CUTE_DSL_LOG_TO_CONSOLE"] = "1"
+
     # Can also be passed as argument
     os.environ["SGLANG_RUN_ID"] = (
         f"sglang-run-{time.time()}-{random.randint(0, 100000000)}"
@@ -682,7 +703,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.
+            "0.4.0rc3",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
@@ -690,7 +711,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
         assert_pkg_version(
             "sgl-kernel",
-            "0.3.
+            "0.3.14",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )

@@ -791,7 +812,6 @@ def _launch_subprocesses(
                 pp_rank,
                 None,
                 writer,
-                None,
             ),
         )

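
The _set_envs_and_config hunks above only fill in defaults when the user has not already set the variables (CUTE_DSL_LOG_LEVEL=30, i.e. warning level, and CUTE_DSL_LOG_TO_CONSOLE=1), and bump the minimum flashinfer_python and sgl-kernel versions to 0.4.0rc3 and 0.3.14. A small sketch of the same set-if-unset pattern, handy when reproducing the environment outside the engine (the variable names come from the diff; the helper itself is illustrative, not an sglang API):

import os


def apply_default_env(defaults: dict) -> None:
    # Only fill in values the user has not already set, mirroring
    # _set_envs_and_config in sglang/srt/entrypoints/engine.py.
    for key, value in defaults.items():
        if os.environ.get(key) is None:
            os.environ[key] = value


apply_default_env(
    {
        "CUTE_DSL_LOG_LEVEL": "30",      # warning level, to avoid too many logs
        "CUTE_DSL_LOG_TO_CONSOLE": "1",  # required for the log level to take effect
    }
)
print(os.environ["CUTE_DSL_LOG_LEVEL"], os.environ["CUTE_DSL_LOG_TO_CONSOLE"])
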