sglang 0.4.0__py3-none-any.whl → 0.4.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +1 -1
- sglang/srt/constrained/outlines_backend.py +5 -0
- sglang/srt/constrained/xgrammar_backend.py +5 -5
- sglang/srt/layers/attention/__init__.py +5 -2
- sglang/srt/layers/attention/double_sparsity_backend.py +22 -8
- sglang/srt/layers/attention/flashinfer_backend.py +20 -5
- sglang/srt/layers/attention/torch_native_backend.py +22 -8
- sglang/srt/layers/attention/triton_backend.py +22 -8
- sglang/srt/layers/attention/triton_ops/extend_attention.py +3 -0
- sglang/srt/layers/ep_moe/__init__.py +0 -0
- sglang/srt/layers/ep_moe/kernels.py +349 -0
- sglang/srt/layers/ep_moe/layer.py +661 -0
- sglang/srt/layers/quantization/__init__.py +2 -2
- sglang/srt/layers/quantization/fp8.py +559 -0
- sglang/srt/layers/quantization/fp8_utils.py +27 -0
- sglang/srt/layers/radix_attention.py +4 -2
- sglang/srt/layers/sampler.py +2 -0
- sglang/srt/layers/torchao_utils.py +23 -45
- sglang/srt/managers/schedule_batch.py +1 -0
- sglang/srt/managers/scheduler.py +69 -65
- sglang/srt/managers/tp_worker_overlap_thread.py +7 -5
- sglang/srt/mem_cache/memory_pool.py +5 -1
- sglang/srt/model_executor/cuda_graph_runner.py +15 -1
- sglang/srt/model_executor/model_runner.py +11 -4
- sglang/srt/model_parallel.py +1 -5
- sglang/srt/models/commandr.py +2 -2
- sglang/srt/models/deepseek_v2.py +87 -7
- sglang/srt/models/grok.py +0 -5
- sglang/srt/models/llama.py +0 -5
- sglang/srt/models/mixtral.py +12 -9
- sglang/srt/models/phi3_small.py +0 -5
- sglang/srt/models/qwen2_moe.py +0 -5
- sglang/srt/models/torch_native_llama.py +0 -5
- sglang/srt/sampling/sampling_batch_info.py +9 -8
- sglang/srt/server.py +3 -3
- sglang/srt/server_args.py +43 -4
- sglang/srt/utils.py +50 -0
- sglang/version.py +1 -1
- {sglang-0.4.0.dist-info → sglang-0.4.0.post1.dist-info}/METADATA +5 -4
- {sglang-0.4.0.dist-info → sglang-0.4.0.post1.dist-info}/RECORD +43 -38
- {sglang-0.4.0.dist-info → sglang-0.4.0.post1.dist-info}/LICENSE +0 -0
- {sglang-0.4.0.dist-info → sglang-0.4.0.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.0.dist-info → sglang-0.4.0.post1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/torchao_utils.py
CHANGED
@@ -2,23 +2,24 @@
 Common utilities for torchao.
 """
 
-from typing import Dict, Set
-
 import torch
 
 
-def
-
+def apply_torchao_config_to_model(
+    model: torch.nn.Module, torchao_config: str, filter_fn=None
+):
+    """Quantize a modelwith torchao quantization specified by torchao_config
 
     Args:
-       `
-       `torchao_config
-       quantize the
+       `model`: a model to be quantized based on torchao_config
+       `torchao_config` (str): type of quantization and their arguments we want to use to
+       quantize the model, e.g. int4wo-128 means int4 weight only quantization with group_size
        128
     """
     # Lazy import to suppress some warnings
     from torchao.quantization import (
         float8_dynamic_activation_float8_weight,
+        float8_weight_only,
         int4_weight_only,
         int8_dynamic_activation_int8_weight,
         int8_weight_only,
@@ -26,12 +27,17 @@ def torchao_quantize_param_data(param: torch.Tensor, torchao_config: str):
     )
     from torchao.quantization.observer import PerRow, PerTensor
 
-
-
-
-
+    if filter_fn is None:
+
+        def filter_fn(module, fqn):
+            return "proj" in fqn
+
+    if torchao_config == "" or torchao_config is None:
+        return model
+    elif "int8wo" in torchao_config:
+        quantize_(model, int8_weight_only(), filter_fn=filter_fn)
     elif "int8dq" in torchao_config:
-        quantize_(
+        quantize_(model, int8_dynamic_activation_int8_weight(), filter_fn=filter_fn)
     elif "int4wo" in torchao_config:
         group_size = int(torchao_config.split("-")[-1])
         assert group_size in [
@@ -40,13 +46,11 @@ def torchao_quantize_param_data(param: torch.Tensor, torchao_config: str):
             128,
             256,
         ], f"int4wo groupsize needs to be one of [32, 64, 128, 256] but got {group_size}"
-        quantize_(
+        quantize_(model, int4_weight_only(group_size=group_size), filter_fn=filter_fn)
     elif "fp8wo" in torchao_config:
-        from torchao.quantization import float8_weight_only
-
         # this requires newer hardware
         # [rank0]: AssertionError: fp8e4nv data type is not supported on CUDA arch < 89
-        quantize_(
+        quantize_(model, float8_weight_only(), filter_fn=filter_fn)
     elif "fp8dq" in torchao_config:
         granularity = torchao_config.split("-")[-1]
         GRANULARITY_MAP = {
@@ -57,39 +61,13 @@ def torchao_quantize_param_data(param: torch.Tensor, torchao_config: str):
             granularity in GRANULARITY_MAP
         ), f"Supported granularity are: {GRANULARITY_MAP.keys()}, got {granularity}"
         quantize_(
-
+            model,
             float8_dynamic_activation_float8_weight(
                 granularity=GRANULARITY_MAP[granularity]
             ),
+            filter_fn=filter_fn,
         )
     else:
         raise ValueError(f"Unexpected config: {torchao_config}")
 
-    return
-
-
-def apply_torchao_config_(
-    self: torch.nn.Module,
-    params_dict: Dict[str, torch.Tensor],
-    param_suffixes: Set[str],
-) -> None:
-    """A util function used for quantizing the weight parameters after they are loaded if
-    self.torchao_config is specified
-
-    Args:
-      `self`: the model we want to quantize
-      `params_dict`: dictionary mapping from param_name to the parameter Tensor
-      `param_suffixes`: a set of suffixes, we'll quantize the Tensor matching these suffixes
-
-    Returns:
-      None, the `params_dict` is modified inplace and the weights of `self` model are quantized
-    """
-    if self.torchao_config:
-        for param_suffix in param_suffixes:
-            for name in params_dict:
-                param = params_dict[name]
-                if param_suffix in name and param.ndim == 2:
-                    params_dict[name] = torchao_quantize_param_data(
-                        param, self.torchao_config
-                    )
-        self.load_state_dict(params_dict, assign=True)
+    return model
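
Note: a minimal usage sketch of the new apply_torchao_config_to_model API shown above, assuming a toy module whose parameter path contains "proj" so the default filter matches; the module and the "int8wo" config value here are illustrative, not taken from this diff.

    import torch
    from sglang.srt.layers.torchao_utils import apply_torchao_config_to_model

    # Toy stand-in for a real model; the child name "up_proj" matches the
    # default filter_fn shown in the diff ("proj" in fqn).
    model = torch.nn.Sequential()
    model.add_module("up_proj", torch.nn.Linear(64, 128, bias=False))

    # "int8wo" = int8 weight-only quantization; per the docstring, "int4wo-128"
    # would mean int4 weight-only with group_size 128, and an empty string is a no-op.
    model = apply_torchao_config_to_model(model, "int8wo")
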
sglang/srt/managers/scheduler.py
CHANGED
@@ -114,9 +114,6 @@ class Scheduler:
         self.skip_tokenizer_init = server_args.skip_tokenizer_init
         self.enable_metrics = server_args.enable_metrics
 
-        # Session info
-        self.sessions = {}
-
         # Init inter-process communication
         context = zmq.Context(2)
 
@@ -259,6 +256,10 @@ class Scheduler:
         self.num_generated_tokens = 0
         self.last_decode_stats_tic = time.time()
         self.stream_interval = server_args.stream_interval
+        self.current_stream = torch.get_device_module(self.device).current_stream()
+
+        # Session info
+        self.sessions = {}
 
         # Init chunked prefill
         self.chunked_prefill_size = server_args.chunked_prefill_size
@@ -356,6 +357,7 @@ class Scheduler:
         )
 
     def watchdog_thread(self):
+        """A watch dog thread that will try to kill the server itself if one batch takes too long."""
         self.watchdog_last_forward_ct = 0
         self.watchdog_last_time = time.time()
 
@@ -433,61 +435,6 @@ class Scheduler:
 
         self.last_batch = batch
 
-    def prepare_dp_attn_batch(self, local_batch: ScheduleBatch):
-        # Check if other DP workers have running batches
-        if local_batch is None:
-            num_tokens = 0
-        elif local_batch.forward_mode.is_decode():
-            num_tokens = local_batch.batch_size()
-        else:
-            num_tokens = local_batch.extend_num_tokens
-
-        local_num_tokens = torch.tensor([num_tokens], dtype=torch.int64)
-        global_num_tokens = torch.empty(self.tp_size, dtype=torch.int64)
-        torch.distributed.all_gather_into_tensor(
-            global_num_tokens,
-            local_num_tokens,
-            group=self.tp_cpu_group,
-        )
-
-        if local_batch is None and global_num_tokens.max().item() > 0:
-            local_batch = self.get_idle_batch()
-
-        if local_batch is not None:
-            local_batch.global_num_tokens = global_num_tokens.tolist()
-
-            # Check forward mode for cuda graph
-            if not self.server_args.disable_cuda_graph:
-                forward_mode_state = torch.tensor(
-                    (
-                        1
-                        if local_batch.forward_mode.is_decode()
-                        or local_batch.forward_mode.is_idle()
-                        else 0
-                    ),
-                    dtype=torch.int32,
-                )
-                torch.distributed.all_reduce(
-                    forward_mode_state,
-                    op=torch.distributed.ReduceOp.MIN,
-                    group=self.tp_cpu_group,
-                )
-                local_batch.can_run_dp_cuda_graph = forward_mode_state.item() == 1
-
-        return local_batch
-
-    def get_idle_batch(self):
-        idle_batch = ScheduleBatch.init_new(
-            [],
-            self.req_to_token_pool,
-            self.token_to_kv_pool,
-            self.tree_cache,
-            self.model_config,
-            self.enable_overlap,
-        )
-        idle_batch.prepare_for_idle()
-        return idle_batch
-
     def recv_requests(self):
         if self.tp_rank == 0 or self.server_args.enable_dp_attention:
             recv_reqs = []
@@ -993,7 +940,7 @@ class Scheduler:
             self.process_batch_result_prefill(batch, result)
         elif batch.forward_mode.is_dummy_first():
             batch.next_batch_sampling_info.update_regex_vocab_mask()
-
+            self.current_stream.synchronize()
             batch.next_batch_sampling_info.sampling_info_done.set()
 
     def process_batch_result_prefill(self, batch: ScheduleBatch, result):
@@ -1049,13 +996,14 @@ class Scheduler:
 
                 if req.grammar is not None:
                     req.grammar.accept_token(next_token_id)
+                    req.grammar.finished = req.finished()
             else:
                 # being chunked reqs' prefill is not finished
                 req.is_being_chunked -= 1
 
         if batch.next_batch_sampling_info:
             batch.next_batch_sampling_info.update_regex_vocab_mask()
-
+            self.current_stream.synchronize()
             batch.next_batch_sampling_info.sampling_info_done.set()
 
         else:  # embedding or reward model
@@ -1127,10 +1075,11 @@ class Scheduler:
 
             if req.grammar is not None:
                 req.grammar.accept_token(next_token_id)
+                req.grammar.finished = req.finished()
 
         if batch.next_batch_sampling_info:
             batch.next_batch_sampling_info.update_regex_vocab_mask()
-
+            self.current_stream.synchronize()
             batch.next_batch_sampling_info.sampling_info_done.set()
 
         self.stream_output(batch.reqs)
@@ -1328,6 +1277,61 @@ class Scheduler:
             )
         )
 
+    def prepare_dp_attn_batch(self, local_batch: ScheduleBatch):
+        # Check if other DP workers have running batches
+        if local_batch is None:
+            num_tokens = 0
+        elif local_batch.forward_mode.is_decode():
+            num_tokens = local_batch.batch_size()
+        else:
+            num_tokens = local_batch.extend_num_tokens
+
+        local_num_tokens = torch.tensor([num_tokens], dtype=torch.int64)
+        global_num_tokens = torch.empty(self.tp_size, dtype=torch.int64)
+        torch.distributed.all_gather_into_tensor(
+            global_num_tokens,
+            local_num_tokens,
+            group=self.tp_cpu_group,
+        )
+
+        if local_batch is None and global_num_tokens.max().item() > 0:
+            local_batch = self.get_idle_batch()
+
+        if local_batch is not None:
+            local_batch.global_num_tokens = global_num_tokens.tolist()
+
+            # Check forward mode for cuda graph
+            if not self.server_args.disable_cuda_graph:
+                forward_mode_state = torch.tensor(
+                    (
+                        1
+                        if local_batch.forward_mode.is_decode()
+                        or local_batch.forward_mode.is_idle()
+                        else 0
+                    ),
+                    dtype=torch.int32,
+                )
+                torch.distributed.all_reduce(
+                    forward_mode_state,
+                    op=torch.distributed.ReduceOp.MIN,
+                    group=self.tp_cpu_group,
+                )
+                local_batch.can_run_dp_cuda_graph = forward_mode_state.item() == 1
+
+        return local_batch
+
+    def get_idle_batch(self):
+        idle_batch = ScheduleBatch.init_new(
+            [],
+            self.req_to_token_pool,
+            self.token_to_kv_pool,
+            self.tree_cache,
+            self.model_config,
+            self.enable_overlap,
+        )
+        idle_batch.prepare_for_idle()
+        return idle_batch
+
     def move_ready_grammar_requests(self):
         """Move requests whose grammar objects are ready from grammar_queue to waiting_queue."""
         num_ready_reqs = 0
@@ -1469,10 +1473,6 @@ def run_scheduler_process(
     dp_rank: Optional[int],
     pipe_writer,
 ):
-    # set cpu affinity to this gpu process
-    if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
-        set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, gpu_id)
-
     # [For Router] if env var "SGLANG_DP_RANK" exist, set dp_rank to the value of the env var
     if dp_rank is None and "SGLANG_DP_RANK" in os.environ:
         dp_rank = int(os.environ["SGLANG_DP_RANK"])
@@ -1482,6 +1482,10 @@ def run_scheduler_process(
     else:
         configure_logger(server_args, prefix=f" DP{dp_rank} TP{tp_rank}")
 
+    # set cpu affinity to this gpu process
+    if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
+        set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, gpu_id)
+
     suppress_other_loggers()
     parent_process = psutil.Process().parent()
 
sglang/srt/managers/tp_worker_overlap_thread.py
CHANGED
@@ -32,12 +32,13 @@ from sglang.srt.managers.io_struct import (
 from sglang.srt.managers.schedule_batch import ModelWorkerBatch
 from sglang.srt.managers.tp_worker import TpModelWorker
 from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import get_compiler_backend
 from sglang.utils import get_exception_traceback
 
 logger = logging.getLogger(__name__)
 
 
-@torch.compile(dynamic=True)
+@torch.compile(dynamic=True, backend=get_compiler_backend())
 def resolve_future_token_ids(input_ids, future_token_ids_map):
     input_ids[:] = torch.where(
         input_ids < 0,
@@ -73,12 +74,13 @@ class TpModelWorkerClient:
         # Launch threads
         self.input_queue = Queue()
         self.output_queue = Queue()
-        self.forward_stream = torch.
+        self.forward_stream = torch.get_device_module(self.device).Stream()
         self.forward_thread = threading.Thread(
             target=self.forward_thread_func,
         )
         self.forward_thread.start()
         self.parent_process = psutil.Process().parent()
+        self.scheduler_stream = torch.get_device_module(self.device).current_stream()
 
     def get_worker_info(self):
         return self.worker.get_worker_info()
@@ -97,7 +99,7 @@ class TpModelWorkerClient:
 
     def forward_thread_func(self):
         try:
-            with torch.
+            with torch.get_device_module(self.device).stream(self.forward_stream):
                 self.forward_thread_func_()
         except Exception:
             traceback = get_exception_traceback()
@@ -122,7 +124,7 @@ class TpModelWorkerClient:
 
             # Create event
             self.launch_done = threading.Event()
-            copy_done = torch.
+            copy_done = torch.get_device_module(self.device).Event()
 
             # Resolve future tokens in the input
             input_ids = model_worker_batch.input_ids
@@ -190,7 +192,7 @@ class TpModelWorkerClient:
         )
 
         # A cuda stream sync here to avoid the cuda illegal memory access error.
-
+        self.scheduler_stream.synchronize()
 
         # Push a new batch to the queue
         self.input_queue.put((model_worker_batch, self.future_token_ids_ct))
sglang/srt/mem_cache/memory_pool.py
CHANGED
@@ -27,6 +27,7 @@ from typing import List, Tuple, Union
 import torch
 
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.utils import get_compiler_backend
 
 logger = logging.getLogger(__name__)
 
@@ -129,6 +130,9 @@ class BaseTokenToKVPool:
         return select_index.to(self.device, non_blocking=True)
 
     def free(self, free_index: torch.Tensor):
+        if free_index.numel() == 0:
+            return
+
         if self.is_not_in_free_group:
             self.free_slots = torch.concat((self.free_slots, free_index.cpu()))
         else:
@@ -234,7 +238,7 @@ class MHATokenToKVPool(BaseTokenToKVPool):
 
 # This compiled version is slower in the unit test
 # python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size
-@torch.compile(dynamic=True)
+@torch.compile(dynamic=True, backend=get_compiler_backend())
 def copy_two_array(loc, dst_1, src_1, dst_2, src_2, dtype, store_dtype):
     dst_1[loc] = src_1.to(dtype).view(store_dtype)
     dst_2[loc] = src_2.to(dtype).view(store_dtype)
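
Note: the backend=get_compiler_backend() argument that now appears on these torch.compile decorators points at a helper imported from sglang/srt/utils.py (listed above with +50 lines), whose body is not shown in this diff. A plausible sketch of the device-dispatch idea follows; the non-CUDA branch and its backend name are assumptions, not taken from this diff.

    import torch

    def get_compiler_backend() -> str:
        # Hypothetical sketch: pick a torch.compile backend per device.
        # "inductor" is torch.compile's default backend on CUDA/CPU; the HPU
        # branch and the "hpu_backend" name are assumptions for illustration.
        if hasattr(torch, "hpu") and torch.hpu.is_available():
            return "hpu_backend"
        return "inductor"
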
sglang/srt/model_executor/cuda_graph_runner.py
CHANGED
@@ -47,7 +47,7 @@ def _to_torch(model: torch.nn.Module, reverse: bool, batch_size: int):
                 if "FusedMoE" in sub.__class__.__name__:
                     if batch_size == 1:
                         # The performance of torch.compile on this layer is not always good when bs > 1,
-                        # so we decide to
+                        # so we decide to only use torch.compile when bs =1
                         sub._forward_method = fused_moe_forward_native
                     else:
                         sub._forward_method = sub.forward_native
@@ -130,6 +130,20 @@ class CudaGraphRunner:
             self.capture_bs = list(range(1, 32)) + [64, 128]
         else:
             self.capture_bs = [1, 2, 4] + [i * 8 for i in range(1, 21)]
+
+        if max(self.capture_bs) > model_runner.req_to_token_pool.size:
+            # In some case (e.g., with a small GPU or --max-running-requests), the #max-running-requests
+            # is very samll. We add more values here to make sure we capture the maximum bs.
+            self.capture_bs = list(
+                sorted(
+                    set(
+                        self.capture_bs
+                        + [model_runner.req_to_token_pool.size - 1]
+                        + [model_runner.req_to_token_pool.size]
+                    )
+                )
+            )
+
         self.capture_bs = [
             bs
             for bs in self.capture_bs
sglang/srt/model_executor/model_runner.py
CHANGED
@@ -27,7 +27,6 @@ from vllm.distributed import (
     initialize_model_parallel,
     set_custom_all_reduce,
 )
-from vllm.distributed.parallel_state import in_the_same_node_as
 
 from sglang.srt.configs.device_config import DeviceConfig
 from sglang.srt.configs.load_config import LoadConfig
@@ -38,6 +37,7 @@ from sglang.srt.layers.attention.torch_native_backend import TorchNativeAttnBack
 from sglang.srt.layers.attention.triton_backend import TritonAttnBackend
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.layers.sampler import Sampler
+from sglang.srt.layers.torchao_utils import apply_torchao_config_to_model
 from sglang.srt.lora.lora_manager import LoRAManager
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.mem_cache.memory_pool import (
|
|
111
111
|
)
|
112
112
|
|
113
113
|
if self.is_multimodal:
|
114
|
-
logger.info(
|
115
|
-
"Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models."
|
116
|
-
)
|
117
114
|
server_args.chunked_prefill_size = -1
|
118
115
|
self.mem_fraction_static *= 0.95
|
116
|
+
logger.info(
|
117
|
+
f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static} "
|
118
|
+
f"and turn off chunked prefill "
|
119
|
+
f"because this is a multimodal model."
|
120
|
+
)
|
119
121
|
# TODO: qwen2-vl does not support radix cache now, set disable_radix_cache=True automatically
|
120
122
|
if self.model_config.hf_config.architectures == [
|
121
123
|
"Qwen2VLForConditionalGeneration"
|
@@ -139,6 +141,7 @@ class ModelRunner:
                 "torchao_config": server_args.torchao_config,
                 "enable_nan_detection": server_args.enable_nan_detection,
                 "enable_dp_attention": server_args.enable_dp_attention,
+                "enable_ep_moe": server_args.enable_ep_moe,
             }
         )
 
@@ -159,6 +162,10 @@ class ModelRunner:
         else:
             self.torch_tp_applied = False
 
+        apply_torchao_config_to_model(
+            self.model, global_server_args_dict["torchao_config"]
+        )
+
         # Init memory pool and attention backends
         if server_args.lora_paths is not None:
             self.init_lora_manager()
sglang/srt/model_parallel.py
CHANGED
@@ -54,11 +54,7 @@ class RowwiseParallelMaybeWait(RowwiseParallel):
         )._prepare_output_fn(
             output_layouts, use_local_output, mod, outputs, device_mesh
         )
-
-        if isinstance(outputs, AsyncCollectiveTensor):
-            return outputs.wait()
-        else:
-            return outputs
+        return torch.distributed._functional_collectives.wait_tensor(outputs)
 
 
 def tensor_parallel(
sglang/srt/models/commandr.py
CHANGED
@@ -62,10 +62,10 @@ from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
-from sglang.srt.utils import set_weight_attrs
+from sglang.srt.utils import get_compiler_backend, set_weight_attrs
 
 
-@torch.compile
+@torch.compile(backend=get_compiler_backend())
 def layer_norm_func(hidden_states, weight, variance_epsilon):
     input_dtype = hidden_states.dtype
     hidden_states = hidden_states.to(torch.float32)
|