sglang 0.4.1.post6__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +21 -23
- sglang/api.py +2 -7
- sglang/bench_offline_throughput.py +41 -27
- sglang/bench_one_batch.py +60 -4
- sglang/bench_one_batch_server.py +1 -1
- sglang/bench_serving.py +83 -71
- sglang/lang/backend/runtime_endpoint.py +183 -4
- sglang/lang/chat_template.py +46 -4
- sglang/launch_server.py +1 -1
- sglang/srt/_custom_ops.py +80 -42
- sglang/srt/configs/device_config.py +1 -1
- sglang/srt/configs/load_config.py +1 -0
- sglang/srt/configs/model_config.py +1 -0
- sglang/srt/constrained/base_grammar_backend.py +21 -0
- sglang/srt/constrained/xgrammar_backend.py +8 -4
- sglang/srt/conversation.py +14 -1
- sglang/srt/distributed/__init__.py +3 -3
- sglang/srt/distributed/communication_op.py +2 -1
- sglang/srt/distributed/device_communicators/cuda_wrapper.py +2 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +112 -42
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
- sglang/srt/distributed/device_communicators/hpu_communicator.py +2 -1
- sglang/srt/distributed/device_communicators/pynccl.py +80 -1
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +112 -2
- sglang/srt/distributed/device_communicators/shm_broadcast.py +5 -72
- sglang/srt/distributed/device_communicators/xpu_communicator.py +2 -1
- sglang/srt/distributed/parallel_state.py +1 -1
- sglang/srt/distributed/utils.py +2 -1
- sglang/srt/entrypoints/engine.py +452 -0
- sglang/srt/entrypoints/http_server.py +603 -0
- sglang/srt/function_call_parser.py +494 -0
- sglang/srt/layers/activation.py +8 -8
- sglang/srt/layers/attention/flashinfer_backend.py +10 -9
- sglang/srt/layers/attention/triton_backend.py +4 -6
- sglang/srt/layers/attention/vision.py +204 -0
- sglang/srt/layers/dp_attention.py +71 -0
- sglang/srt/layers/layernorm.py +5 -5
- sglang/srt/layers/linear.py +65 -14
- sglang/srt/layers/logits_processor.py +49 -64
- sglang/srt/layers/moe/ep_moe/layer.py +24 -16
- sglang/srt/layers/moe/fused_moe_native.py +84 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +27 -7
- sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -5
- sglang/srt/layers/parameter.py +18 -8
- sglang/srt/layers/quantization/__init__.py +20 -23
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/fp8.py +10 -4
- sglang/srt/layers/quantization/modelopt_quant.py +1 -2
- sglang/srt/layers/quantization/w8a8_int8.py +1 -1
- sglang/srt/layers/radix_attention.py +2 -2
- sglang/srt/layers/rotary_embedding.py +1184 -31
- sglang/srt/layers/sampler.py +64 -6
- sglang/srt/layers/torchao_utils.py +12 -6
- sglang/srt/layers/vocab_parallel_embedding.py +2 -2
- sglang/srt/lora/lora.py +1 -9
- sglang/srt/managers/configure_logging.py +3 -0
- sglang/srt/managers/data_parallel_controller.py +79 -72
- sglang/srt/managers/detokenizer_manager.py +24 -6
- sglang/srt/managers/image_processor.py +158 -2
- sglang/srt/managers/io_struct.py +57 -3
- sglang/srt/managers/schedule_batch.py +78 -45
- sglang/srt/managers/schedule_policy.py +26 -12
- sglang/srt/managers/scheduler.py +326 -201
- sglang/srt/managers/session_controller.py +1 -0
- sglang/srt/managers/tokenizer_manager.py +210 -121
- sglang/srt/managers/tp_worker.py +6 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +5 -8
- sglang/srt/managers/utils.py +44 -0
- sglang/srt/mem_cache/memory_pool.py +10 -32
- sglang/srt/metrics/collector.py +15 -6
- sglang/srt/model_executor/cuda_graph_runner.py +26 -30
- sglang/srt/model_executor/forward_batch_info.py +5 -7
- sglang/srt/model_executor/model_runner.py +44 -19
- sglang/srt/model_loader/loader.py +83 -6
- sglang/srt/model_loader/weight_utils.py +145 -6
- sglang/srt/models/baichuan.py +6 -6
- sglang/srt/models/chatglm.py +2 -2
- sglang/srt/models/commandr.py +17 -5
- sglang/srt/models/dbrx.py +13 -5
- sglang/srt/models/deepseek.py +3 -3
- sglang/srt/models/deepseek_v2.py +11 -11
- sglang/srt/models/exaone.py +2 -2
- sglang/srt/models/gemma.py +2 -2
- sglang/srt/models/gemma2.py +15 -25
- sglang/srt/models/gpt2.py +3 -5
- sglang/srt/models/gpt_bigcode.py +1 -1
- sglang/srt/models/granite.py +2 -2
- sglang/srt/models/grok.py +4 -3
- sglang/srt/models/internlm2.py +2 -2
- sglang/srt/models/llama.py +7 -5
- sglang/srt/models/minicpm.py +2 -2
- sglang/srt/models/minicpm3.py +9 -9
- sglang/srt/models/minicpmv.py +1238 -0
- sglang/srt/models/mixtral.py +3 -3
- sglang/srt/models/mixtral_quant.py +3 -3
- sglang/srt/models/mllama.py +2 -2
- sglang/srt/models/olmo.py +3 -3
- sglang/srt/models/olmo2.py +4 -4
- sglang/srt/models/olmoe.py +7 -13
- sglang/srt/models/phi3_small.py +2 -2
- sglang/srt/models/qwen.py +2 -2
- sglang/srt/models/qwen2.py +41 -4
- sglang/srt/models/qwen2_moe.py +3 -3
- sglang/srt/models/qwen2_vl.py +22 -122
- sglang/srt/models/stablelm.py +2 -2
- sglang/srt/models/torch_native_llama.py +20 -7
- sglang/srt/models/xverse.py +6 -6
- sglang/srt/models/xverse_moe.py +6 -6
- sglang/srt/openai_api/adapter.py +139 -37
- sglang/srt/openai_api/protocol.py +7 -4
- sglang/srt/sampling/custom_logit_processor.py +38 -0
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +11 -14
- sglang/srt/sampling/sampling_batch_info.py +143 -18
- sglang/srt/sampling/sampling_params.py +3 -1
- sglang/srt/server.py +4 -1090
- sglang/srt/server_args.py +77 -15
- sglang/srt/speculative/eagle_utils.py +37 -15
- sglang/srt/speculative/eagle_worker.py +11 -13
- sglang/srt/utils.py +164 -129
- sglang/test/runners.py +8 -13
- sglang/test/test_programs.py +2 -1
- sglang/test/test_utils.py +83 -22
- sglang/utils.py +12 -2
- sglang/version.py +1 -1
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/METADATA +21 -10
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/RECORD +138 -123
- sglang/launch_server_llavavid.py +0 -25
- sglang/srt/constrained/__init__.py +0 -16
- sglang/srt/distributed/device_communicators/__init__.py +0 -0
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/LICENSE +0 -0
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/WHEEL +0 -0
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/top_level.txt +0 -0
sglang/srt/_custom_ops.py
CHANGED
@@ -1,8 +1,9 @@
-# Adapted from https://github.com/vllm-project/vllm/blob/
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/_custom_ops.py
 import contextlib
 import functools
 import importlib
 import logging
+import os
 from typing import TYPE_CHECKING, List, Optional, Tuple, Union
 
 import torch
@@ -11,12 +12,19 @@ import torch.library
 from sglang.srt.utils import is_hpu
 
 logger = logging.getLogger(__name__)
+use_vllm_custom_allreduce = os.environ.get("USE_VLLM_CUSTOM_ALLREDUCE", default=True)
 
 if not is_hpu():
-
-
-
-
+    if use_vllm_custom_allreduce:
+        try:
+            import vllm._C
+        except ImportError as e:
+            logger.warning("Failed to import from vllm._C with %r", e)
+    else:
+        try:
+            import sgl_kernel
+        except ImportError as e:
+            logger.warning("Failed to import from custom_ar with %r", e)
 
 
 def hint_on_error(fn):
@@ -48,48 +56,78 @@ def hint_on_error(fn):
     return wrapper
 
 
-
-
-
-
-
-
-
-
-
-
-
-
-def all_reduce(
-    fa: int,
-    inp: torch.Tensor,
-    out: torch.Tensor,
-    reg_buffer: int,
-    reg_buffer_sz_bytes: int,
-) -> None:
-    torch.ops._C_vllm_ar.all_reduce(fa, inp, out, reg_buffer, reg_buffer_sz_bytes)
-
-
-def dispose(fa: int) -> None:
-    torch.ops._C_vllm_ar.dispose(fa)
-
-
-def meta_size() -> int:
-    return torch.ops._C_vllm_ar.meta_size()
-
+if use_vllm_custom_allreduce:
+    # custom ar
+    def init_custom_ar(
+        ipc_tensors: List[torch.Tensor],
+        rank_data: torch.Tensor,
+        rank: int,
+        full_nvlink: bool,
+    ) -> int:
+        return torch.ops._C_custom_ar.init_custom_ar(
+            ipc_tensors, rank_data, rank, full_nvlink
+        )
 
-def
-
+    def all_reduce(
+        fa: int,
+        inp: torch.Tensor,
+        out: torch.Tensor,
+        reg_buffer: int,
+        reg_buffer_sz_bytes: int,
+    ) -> None:
+        torch.ops._C_custom_ar.all_reduce(fa, inp, out, reg_buffer, reg_buffer_sz_bytes)
+
+    def dispose(fa: int) -> None:
+        torch.ops._C_custom_ar.dispose(fa)
+
+    def meta_size() -> int:
+        return torch.ops._C_custom_ar.meta_size()
+
+    def register_buffer(fa: int, ipc_tensors: List[int]) -> None:
+        return torch.ops._C_custom_ar.register_buffer(fa, ipc_tensors)
+
+    def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[int], List[int]]:
+        return torch.ops._C_custom_ar.get_graph_buffer_ipc_meta(fa)
+
+    def register_graph_buffers(
+        fa: int, handles: List[List[int]], offsets: List[List[int]]
+    ) -> None:
+        torch.ops._C_custom_ar.register_graph_buffers(fa, handles, offsets)
+
+else:
+    # custom ar
+    def init_custom_ar(
+        rank_id: int,
+        world_size: int,
+        rank_data_base: torch.Tensor,
+        buffers: List[int],
+        tmp_result_buffers: List[int],
+        barrier_in: List[int],
+        barrier_out: List[int],
+    ) -> int:
+        return sgl_kernel.ops.init_custom_reduce(
+            rank_id,
+            world_size,
+            rank_data_base,
+            buffers,
+            tmp_result_buffers,
+            barrier_in,
+            barrier_out,
+        )
 
+    def all_reduce(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
+        sgl_kernel.ops.custom_reduce(fa, inp, out)
 
-def
-
+    def dispose(fa: int) -> None:
+        sgl_kernel.ops.custom_dispose(fa)
 
+    def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[int], List[int]]:
+        return sgl_kernel.ops.get_graph_buffer_ipc_meta(fa)
 
-def register_graph_buffers(
-
-) -> None:
-
+    def register_graph_buffers(
+        fa: int, handles: List[List[int]], offsets: List[List[int]]
+    ) -> None:
+        sgl_kernel.ops.register_graph_buffers(fa, handles, offsets)
 
 
 # temporary fix for https://github.com/vllm-project/vllm/issues/5456
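Note on the hunks above: every wrapper in `_custom_ops.py` now dispatches either to vllm's `torch.ops._C_custom_ar` kernels or to `sgl_kernel.ops`, chosen once at import time by the `USE_VLLM_CUSTOM_ALLREDUCE` environment variable. A minimal sketch of flipping that switch follows (assumes sglang 0.4.2 and sgl_kernel are installed); note that the diff reads the flag with `os.environ.get(..., default=True)`, so any non-empty string, including "0", stays truthy.

```python
# Sketch only, not part of the package: pick the allreduce backend before
# sglang.srt._custom_ops is imported anywhere in the process.
import os

# An empty string is the one set-value that evaluates falsy under the parsing
# shown in the diff, which selects the sgl_kernel branch; leaving the variable
# unset keeps the default vllm torch.ops._C_custom_ar path.
os.environ["USE_VLLM_CUSTOM_ALLREDUCE"] = ""

from sglang.srt import _custom_ops as ops  # noqa: E402

print(ops.use_vllm_custom_allreduce)  # "" -> falsy -> sgl_kernel.ops wrappers are bound
```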
sglang/srt/configs/device_config.py
CHANGED
@@ -10,7 +10,7 @@ class DeviceConfig:
     device: Optional[torch.device]
 
     def __init__(self, device: str = "cuda") -> None:
-        if device in ["cuda", "xpu", "hpu"]:
+        if device in ["cuda", "xpu", "hpu", "cpu"]:
            self.device_type = device
        else:
            raise RuntimeError(f"Not supported device type: {device}")
sglang/srt/configs/model_config.py
CHANGED
@@ -402,6 +402,7 @@ def is_multimodal_model(model_architectures: List[str]):
         or "LlavaVidForCausalLM" in model_architectures
         or "MllamaForConditionalGeneration" in model_architectures
         or "Qwen2VLForConditionalGeneration" in model_architectures
+        or "MiniCPMV" in model_architectures
     ):
         return True
     else:
sglang/srt/constrained/base_grammar_backend.py
CHANGED
@@ -18,6 +18,8 @@ from dataclasses import dataclass
 from threading import Event, Lock
 from typing import Any, Optional, Tuple
 
+from sglang.srt.server_args import ServerArgs
+
 
 @dataclass
 class CacheEntry:
@@ -69,3 +71,22 @@ class BaseGrammarBackend:
     def reset(self):
         with self.cache_lock:
             self.cache.clear()
+
+
+def create_grammar_backend(server_args: ServerArgs, tokenizer, vocab_size):
+    if server_args.grammar_backend == "outlines":
+        from sglang.srt.constrained.outlines_backend import OutlinesGrammarBackend
+
+        grammar_backend = OutlinesGrammarBackend(
+            tokenizer,
+            whitespace_pattern=server_args.constrained_json_whitespace_pattern,
+            allow_jump_forward=not server_args.disable_jump_forward,
+        )
+    elif server_args.grammar_backend == "xgrammar":
+        from sglang.srt.constrained.xgrammar_backend import XGrammarGrammarBackend
+
+        grammar_backend = XGrammarGrammarBackend(tokenizer, vocab_size=vocab_size)
+    else:
+        raise ValueError(f"Invalid grammar backend: {server_args.grammar_backend}")
+
+    return grammar_backend
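For orientation, the new `create_grammar_backend` helper gives callers a single entry point for constrained-decoding backends. A hypothetical call site is sketched below; only `grammar_backend`, `constrained_json_whitespace_pattern`, and `disable_jump_forward` are confirmed by the diff, while the `ServerArgs(model_path=...)` construction, the example model name, and using the tokenizer's `vocab_size` are assumptions.

```python
# Illustrative wiring only; the real caller lives in sglang's serving code.
from transformers import AutoTokenizer

from sglang.srt.constrained.base_grammar_backend import create_grammar_backend
from sglang.srt.server_args import ServerArgs

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")  # example model
server_args = ServerArgs(
    model_path="Qwen/Qwen2.5-7B-Instruct",
    grammar_backend="xgrammar",  # or "outlines"
)
backend = create_grammar_backend(server_args, tokenizer, vocab_size=tokenizer.vocab_size)
```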
sglang/srt/constrained/xgrammar_backend.py
CHANGED
@@ -19,6 +19,7 @@ from typing import List, Tuple
 import torch
 from xgrammar import (
     CompiledGrammar,
+    Grammar,
     GrammarCompiler,
     GrammarMatcher,
     TokenizerInfo,
@@ -133,10 +134,13 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
                 logging.warning(f"Skip invalid ebnf: ebnf={key_string}, {e=}")
                 return None
         elif key_type == "regex":
-
-
-
-
+            try:
+                ctx = self.grammar_compiler.compile_grammar(
+                    Grammar.from_regex(key_string)
+                )
+            except RuntimeError as e:
+                logging.warning(f"Skip invalid regex: regex={key_string}, {e=}")
+                return None
         else:
             raise ValueError(f"Invalid key_type: {key_type}")
 
sglang/srt/conversation.py
CHANGED
@@ -452,7 +452,6 @@ def generate_chat_conv(
 
     # Add a blank message for the assistant.
     conv.append_message(conv.roles[1], None)
-
     return conv
 
 
@@ -555,3 +554,17 @@ register_conv_template(
         image_token="<|vision_start|><|image_pad|><|vision_end|>",
     )
 )
+
+# Reference: https://huggingface.co/openbmb/MiniCPM-V-2_6#usage
+register_conv_template(
+    Conversation(
+        name="minicpmv",
+        system_message="You are a helpful assistant",
+        system_template="<|im_start|>system\n{system_message}.",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="<|im_end|>\n",
+        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
+        stop_str=("<|im_end|>", "<|endoftext|>"),
+        image_token="(<image>./</image>)",
+    )
+)
sglang/srt/distributed/__init__.py
CHANGED
@@ -1,3 +1,3 @@
-from .communication_op import *
-from .parallel_state import *
-from .utils import *
+from sglang.srt.distributed.communication_op import *
+from sglang.srt.distributed.parallel_state import *
+from sglang.srt.distributed.utils import *
sglang/srt/distributed/communication_op.py
CHANGED
@@ -1,4 +1,5 @@
-# Adapted from https://github.com/vllm-project/vllm/blob/
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/communication_op.py
+
 from typing import Any, Dict, Optional, Union
 
 import torch
sglang/srt/distributed/device_communicators/cuda_wrapper.py
CHANGED
@@ -1,4 +1,5 @@
-# Adapted from https://github.com/vllm-project/vllm/blob/
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/device_communicators/cuda_wrapper.py
+
 """This file is a pure Python wrapper for the cudart library.
 It avoids the need to compile a separate shared library, and is
 convenient for use when we just need to call a few functions.
sglang/srt/distributed/device_communicators/custom_all_reduce.py
CHANGED
@@ -1,4 +1,5 @@
-# Adapted from https://github.com/vllm-project/vllm/blob/
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/device_communicators/custom_all_reduce.py
+
 import ctypes
 import logging
 import os
@@ -6,7 +7,6 @@ from contextlib import contextmanager
 from functools import wraps
 from typing import Callable, List, Optional, TypeVar, Union
 
-import pynvml
 import torch
 import torch.distributed as dist
 from torch.distributed import ProcessGroup
@@ -20,8 +20,19 @@ from sglang.srt.distributed.device_communicators.custom_all_reduce_utils import
 from sglang.srt.distributed.parallel_state import in_the_same_node_as
 from sglang.srt.utils import cuda_device_count_stateless, is_cuda
 
+logger = logging.getLogger(__name__)
+
+if is_cuda():
+    try:
+        import pynvml
+    except ImportError as e:
+        logger.warning("Failed to import pynvml with %r", e)
+
 try:
-    ops.
+    if ops.use_vllm_custom_allreduce:
+        ops.meta_size()
+    else:
+        import sgl_kernel
     custom_ar = True
 except Exception:
     # For AMD GPUs and CPUs
@@ -29,7 +40,6 @@ except Exception:
 
 logger = logging.getLogger(__name__)
 
-
 _P = ParamSpec("_P")
 _R = TypeVar("_R")
 
@@ -47,7 +57,7 @@ def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]:
 
 
 @with_nvml_context
-def is_full_nvlink(
+def is_full_nvlink(physical_device_ids: List[int]) -> bool:
     """
     query if the set of gpus are fully connected by nvlink (1 hop)
     """
@@ -175,9 +185,12 @@ class CustomAllreduce:
         # test nvlink first, this will filter out most of the cases
         # where custom allreduce is not supported
        # this checks hardware and driver support for NVLink
-
+        if is_cuda():
+            assert is_cuda()
 
-
+            full_nvlink = is_full_nvlink(physical_device_ids)
+        else:
+            full_nvlink = False
         if world_size > 2 and not full_nvlink:
             logger.warning(
                 "Custom allreduce is disabled because it's not supported on"
@@ -196,32 +209,64 @@ class CustomAllreduce:
             )
             return
 
-        self.disabled = False
-        # Buffers memory are owned by this Python class and passed to C++.
-        # Meta data composes of two parts: meta data for synchronization and a
-        # temporary buffer for storing intermediate allreduce results.
-        self.meta_ptrs = self.create_shared_buffer(
-            ops.meta_size() + max_size, group=group
-        )
-        # This is a pre-registered IPC buffer. In eager mode, input tensors
-        # are first copied into this buffer before allreduce is performed
-        self.buffer_ptrs = self.create_shared_buffer(max_size, group=group)
-        # This is a buffer for storing the tuples of pointers pointing to
-        # IPC buffers from all ranks. Each registered tuple has size of
-        # 8*world_size bytes where world_size is at most 8. Allocating 8MB
-        # is enough for 131072 such tuples. The largest model I've seen only
-        # needs less than 10000 of registered tuples.
-        self.rank_data = torch.empty(
-            8 * 1024 * 1024, dtype=torch.uint8, device=self.device
-        )
         self.max_size = max_size
         self.rank = rank
         self.world_size = world_size
         self.full_nvlink = full_nvlink
-
-
-
-
+
+        if ops.use_vllm_custom_allreduce:
+            # Buffers memory are owned by this Python class and passed to C++.
+            # Meta data composes of two parts: meta data for synchronization and a
+            # temporary buffer for storing intermediate allreduce results.
+            self.meta_ptrs = self.create_shared_buffer(
+                ops.meta_size() + max_size, group=group
+            )
+            # This is a pre-registered IPC buffer. In eager mode, input tensors
+            # are first copied into this buffer before allreduce is performed
+            self.buffer_ptrs = self.create_shared_buffer(max_size, group=group)
+            # This is a buffer for storing the tuples of pointers pointing to
+            # IPC buffers from all ranks. Each registered tuple has size of
+            # 8*world_size bytes where world_size is at most 8. Allocating 8MB
+            # is enough for 131072 such tuples. The largest model I've seen only
+            # needs less than 10000 of registered tuples.
+            self.rank_data = torch.empty(
+                8 * 1024 * 1024, dtype=torch.uint8, device=self.device
+            )
+            self._ptr = ops.init_custom_ar(
+                self.meta_ptrs, self.rank_data, rank, self.full_nvlink
+            )
+            ops.register_buffer(self._ptr, self.buffer_ptrs)
+        else:
+            # From TensorRT-LLM getMaxRequiredWorkspaceSize
+            self.max_required_workspace_size = [16 * 1024 * 1024, 8 * 1024 * 1024]
+
+            # sizeof(uint32_t) * (MAX_ALL_REDUCE_BLOCKS + 2) * MAX_RANKS_PER_NODE;
+            self.barrier_max_size = 8 * (36 + 2) * 8
+
+            self.buffer_ptrs = self.create_shared_buffer(max_size, group=group)
+            self.tmp_result_buffer_ptrs = self.create_shared_buffer(
+                max_size, group=group
+            )
+            self.rank_data_base = torch.empty(
+                8 * 1024 * 1024, dtype=torch.uint8, device=self.device
+            )
+            self.barrier_in_ptrs = self.create_shared_buffer(
+                self.barrier_max_size, group=group
+            )
+            self.barrier_out_ptrs = self.create_shared_buffer(
+                self.barrier_max_size, group=group
+            )
+
+            self._ptr = ops.init_custom_ar(
+                rank,
+                world_size,
+                self.rank_data_base,
+                self.buffer_ptrs,
+                self.tmp_result_buffer_ptrs,
+                self.barrier_in_ptrs,
+                self.barrier_out_ptrs,
+            )
+        self.disabled = False
 
     @staticmethod
     def create_shared_buffer(
@@ -300,12 +345,31 @@ class CustomAllreduce:
             return False
         # for 4 or more non NVLink-capable GPUs, custom allreduce provides
         # little performance improvement over NCCL.
-        if
-
+        if ops.use_vllm_custom_allreduce:
+            if self.world_size == 2 or self.full_nvlink:
+                return inp_size < self.max_size
+            return False
+
+        if self.world_size == 2:
+            return (
+                inp_size < self.max_size
+                and inp_size < self.max_required_workspace_size[0]
+            )
+
+        if self.full_nvlink:
+            return (
+                inp_size < self.max_size
+                and inp_size < self.max_required_workspace_size[1]
+            )
+
         return False
 
     def all_reduce(
-        self,
+        self,
+        inp: torch.Tensor,
+        *,
+        out: torch.Tensor = None,
+        registered: bool = False,
     ):
         """Performs an out-of-place all reduce.
 
@@ -315,12 +379,15 @@ class CustomAllreduce:
         """
         if out is None:
             out = torch.empty_like(inp)
-        if
-
+        if ops.use_vllm_custom_allreduce:
+            if registered:
+                ops.all_reduce(self._ptr, inp, out, 0, 0)
+            else:
+                ops.all_reduce(
+                    self._ptr, inp, out, self.buffer_ptrs[self.rank], self.max_size
+                )
         else:
-            ops.all_reduce(
-                self._ptr, inp, out, self.buffer_ptrs[self.rank], self.max_size
-            )
+            ops.all_reduce(self._ptr, inp, out)
         return out
 
     def custom_all_reduce(self, input: torch.Tensor) -> Optional[torch.Tensor]:
@@ -336,17 +403,20 @@ class CustomAllreduce:
             # allreduce is out-of-place.
             return torch.empty_like(input)
         else:
-            # Note: outside of cuda graph context, custom allreduce incurs a
-            # cost of cudaMemcpy, which should be small (<=1% of overall
-            # latency) compared to the performance gain of using custom kernels
             return self.all_reduce(input, registered=False)
 
     def close(self):
         if not self.disabled and self._ptr:
             ops.dispose(self._ptr)
+            if ops.use_vllm_custom_allreduce:
+                self.free_shared_buffer(self.meta_ptrs)
+                self.free_shared_buffer(self.buffer_ptrs)
+            else:
+                self.free_shared_buffer(self.buffer_ptrs)
+                self.free_shared_buffer(self.tmp_result_buffer_ptrs)
+                self.free_shared_buffer(self.barrier_in_ptrs)
+                self.free_shared_buffer(self.barrier_out_ptrs)
             self._ptr = 0
-            self.free_shared_buffer(self.meta_ptrs)
-            self.free_shared_buffer(self.buffer_ptrs)
 
     def __del__(self):
         self.close()
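The sgl_kernel branch above also tightens when custom allreduce is attempted: besides fitting in the registered buffer, the message must stay under TensorRT-LLM-derived workspace limits (16 MiB for 2 ranks, 8 MiB with full NVLink), and each barrier buffer comes out to 8 * (36 + 2) * 8 = 2432 bytes. Below is a standalone restatement of that size check; the 8 MiB `max_size` default is an assumption rather than something shown in the diff.

```python
def sgl_kernel_allreduce_eligible(
    inp_size: int,
    world_size: int,
    full_nvlink: bool,
    max_size: int = 8 * 1024 * 1024,  # assumed registration-buffer size, not from the diff
) -> bool:
    """Mirror of the non-vllm size check in the hunks above (illustrative only)."""
    # From TensorRT-LLM getMaxRequiredWorkspaceSize, as quoted in the diff:
    # 16 MiB for 2 ranks, 8 MiB when every rank is connected by full NVLink.
    max_required_workspace_size = [16 * 1024 * 1024, 8 * 1024 * 1024]
    if world_size == 2:
        return inp_size < max_size and inp_size < max_required_workspace_size[0]
    if full_nvlink:
        return inp_size < max_size and inp_size < max_required_workspace_size[1]
    return False
```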
sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py
CHANGED
@@ -1,4 +1,5 @@
-# Adapted from https://github.com/vllm-project/vllm/blob/
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+
 import ctypes
 import json
 import logging
@@ -7,7 +8,6 @@ import pickle
 import subprocess
 import sys
 import tempfile
-from functools import lru_cache
 from itertools import product
 from typing import Dict, List, Optional, Sequence
 
sglang/srt/distributed/device_communicators/hpu_communicator.py
CHANGED
@@ -1,4 +1,5 @@
-# Adapted from https://github.com/vllm-project/vllm/blob/
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/device_communicators/hpu_communicator.py
+
 import torch
 import torch.distributed as dist
 from torch.distributed import ProcessGroup
sglang/srt/distributed/device_communicators/pynccl.py
CHANGED
@@ -1,8 +1,10 @@
-# Adapted from https://github.com/vllm-project/vllm/blob/
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/device_communicators/pynccl.py
+
 import logging
 from contextlib import contextmanager
 from typing import Optional, Union
 
+# ===================== import region =====================
 import torch
 import torch.distributed as dist
 from torch.distributed import ProcessGroup, ReduceOp
@@ -143,6 +145,57 @@ class PyNcclCommunicator:
             cudaStream_t(stream.cuda_stream),
         )
 
+    def all_gather(
+        self, output_tensor: torch.Tensor, input_tensor: torch.Tensor, stream=None
+    ):
+        if self.disabled:
+            return
+        # nccl communicator created on a specific device
+        # will only work on tensors on the same device
+        # otherwise it will cause "illegal memory access"
+        assert input_tensor.device == self.device, (
+            f"this nccl communicator is created to work on {self.device}, "
+            f"but the input tensor is on {input_tensor.device}"
+        )
+        if stream is None:
+            stream = self.stream
+        self.nccl.ncclAllGather(
+            buffer_type(input_tensor.data_ptr()),
+            buffer_type(output_tensor.data_ptr()),
+            input_tensor.numel(),
+            ncclDataTypeEnum.from_torch(input_tensor.dtype),
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
+
+    def reduce_scatter(
+        self,
+        output_tensor: torch.Tensor,
+        input_tensor: torch.Tensor,
+        op: ReduceOp = ReduceOp.SUM,
+        stream=None,
+    ):
+        if self.disabled:
+            return
+        # nccl communicator created on a specific device
+        # will only work on tensors on the same device
+        # otherwise it will cause "illegal memory access"
+        assert input_tensor.device == self.device, (
+            f"this nccl communicator is created to work on {self.device}, "
+            f"but the input tensor is on {input_tensor.device}"
+        )
+        if stream is None:
+            stream = self.stream
+        self.nccl.ncclReduceScatter(
+            buffer_type(input_tensor.data_ptr()),
+            buffer_type(output_tensor.data_ptr()),
+            output_tensor.numel(),
+            ncclDataTypeEnum.from_torch(input_tensor.dtype),
+            ncclRedOpTypeEnum.from_torch(op),
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
+
     def send(self, tensor: torch.Tensor, dst: int, stream=None):
         if self.disabled:
             return
@@ -179,6 +232,32 @@ class PyNcclCommunicator:
             cudaStream_t(stream.cuda_stream),
         )
 
+    def broadcast(self, tensor: torch.Tensor, src: int, stream=None):
+        if self.disabled:
+            return
+        assert tensor.device == self.device, (
+            f"this nccl communicator is created to work on {self.device}, "
+            f"but the input tensor is on {tensor.device}"
+        )
+        if stream is None:
+            stream = self.stream
+        if src == self.rank:
+            sendbuff = buffer_type(tensor.data_ptr())
+            # NCCL requires the sender also to have a receive buffer
+            recvbuff = buffer_type(tensor.data_ptr())
+        else:
+            sendbuff = buffer_type()
+            recvbuff = buffer_type(tensor.data_ptr())
+        self.nccl.ncclBroadcast(
+            sendbuff,
+            recvbuff,
+            tensor.numel(),
+            ncclDataTypeEnum.from_torch(tensor.dtype),
+            src,
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
+
     @contextmanager
     def change_state(
         self, enable: Optional[bool] = None, stream: Optional[torch.cuda.Stream] = None